{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999398785546805, "eval_steps": 500, "global_step": 8316, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "auxiliary_loss_clip": 0.02876494, "auxiliary_loss_mlp": 0.02753524, "balance_loss_clip": 1.90675533, "balance_loss_mlp": 1.76437306, "epoch": 0.00012024289063909097, "flos": 24932495567880.0, "grad_norm": 40.19589106810381, "language_loss": 2.58310151, "learning_rate": 0.0, "loss": 1.89365149, "num_input_tokens_seen": 20375, "step": 1, "time_per_iteration": 14.530412673950195 }, { "auxiliary_loss_clip": 0.01913581, "auxiliary_loss_mlp": 0.01822398, "balance_loss_clip": 1.26713109, "balance_loss_mlp": 1.16474426, "epoch": 0.00024048578127818193, "flos": 30664637622720.0, "grad_norm": 55.028380992636706, "language_loss": 1.8877939, "learning_rate": 5.021476677069823e-07, "loss": 1.92515373, "num_input_tokens_seen": 39035, "step": 2, "time_per_iteration": 2.683375358581543 }, { "auxiliary_loss_clip": 0.01913365, "auxiliary_loss_mlp": 0.01823251, "balance_loss_clip": 1.26620162, "balance_loss_mlp": 1.16979313, "epoch": 0.0003607286719172729, "flos": 19026238109760.0, "grad_norm": 40.39216804819264, "language_loss": 1.61627936, "learning_rate": 7.958852231401551e-07, "loss": 1.65364552, "num_input_tokens_seen": 57600, "step": 3, "time_per_iteration": 2.5078179836273193 }, { "auxiliary_loss_clip": 0.01913397, "auxiliary_loss_mlp": 0.01824731, "balance_loss_clip": 1.26729369, "balance_loss_mlp": 1.16288126, "epoch": 0.00048097156255636386, "flos": 19316323923840.0, "grad_norm": 36.76567279963711, "language_loss": 1.643852, "learning_rate": 1.0042953354139647e-06, "loss": 1.68123341, "num_input_tokens_seen": 76465, "step": 4, "time_per_iteration": 2.5810678005218506 }, { "auxiliary_loss_clip": 0.01913834, "auxiliary_loss_mlp": 0.0182358, "balance_loss_clip": 1.26768827, "balance_loss_mlp": 1.16592622, "epoch": 0.0006012144531954548, "flos": 13991273642880.0, "grad_norm": 55.00368990276992, "language_loss": 1.93559706, "learning_rate": 1.1659507774310057e-06, "loss": 1.97297144, "num_input_tokens_seen": 94350, "step": 5, "time_per_iteration": 2.779829740524292 }, { "auxiliary_loss_clip": 0.01915468, "auxiliary_loss_mlp": 0.01826741, "balance_loss_clip": 1.26864362, "balance_loss_mlp": 1.17519116, "epoch": 0.0007214573438345458, "flos": 23148999485280.0, "grad_norm": 45.193034789160514, "language_loss": 1.61045694, "learning_rate": 1.2980328908471373e-06, "loss": 1.64787889, "num_input_tokens_seen": 114595, "step": 6, "time_per_iteration": 2.8837621212005615 }, { "auxiliary_loss_clip": 0.01908055, "auxiliary_loss_mlp": 0.01972557, "balance_loss_clip": 1.25232267, "balance_loss_mlp": 1.31948113, "epoch": 0.0008417002344736367, "flos": 67663280593440.0, "grad_norm": 4.58124950160881, "language_loss": 0.81440473, "learning_rate": 1.4097067265369432e-06, "loss": 0.85321081, "num_input_tokens_seen": 179590, "step": 7, "time_per_iteration": 3.2490129470825195 }, { "auxiliary_loss_clip": 0.01913659, "auxiliary_loss_mlp": 0.01821647, "balance_loss_clip": 1.2681067, "balance_loss_mlp": 1.15560079, "epoch": 0.0009619431251127277, "flos": 21281390324640.0, "grad_norm": 41.046006585151616, "language_loss": 1.58352232, "learning_rate": 1.506443003120947e-06, "loss": 1.62087536, "num_input_tokens_seen": 195090, "step": 8, "time_per_iteration": 2.800227165222168 }, { "auxiliary_loss_clip": 0.01913126, "auxiliary_loss_mlp": 0.01805744, "balance_loss_clip": 1.26664066, "balance_loss_mlp": 1.14313078, "epoch": 0.0010821860157518186, "flos": 23331353886720.0, "grad_norm": 17.248544872589406, "language_loss": 1.47684979, "learning_rate": 1.5917704462803102e-06, "loss": 1.51403856, "num_input_tokens_seen": 211635, "step": 9, "time_per_iteration": 2.8654096126556396 }, { "auxiliary_loss_clip": 0.01913826, "auxiliary_loss_mlp": 0.01804525, "balance_loss_clip": 1.26649296, "balance_loss_mlp": 1.14496398, "epoch": 0.0012024289063909096, "flos": 17010173292480.0, "grad_norm": 14.224943867783896, "language_loss": 1.53005338, "learning_rate": 1.6680984451379884e-06, "loss": 1.5672369, "num_input_tokens_seen": 224705, "step": 10, "time_per_iteration": 2.847200632095337 }, { "auxiliary_loss_clip": 0.01912821, "auxiliary_loss_mlp": 0.01820528, "balance_loss_clip": 1.26723337, "balance_loss_mlp": 1.1602037, "epoch": 0.0013226717970300007, "flos": 21288143976480.0, "grad_norm": 13.449899406619638, "language_loss": 1.32521486, "learning_rate": 1.7371455188905097e-06, "loss": 1.36254835, "num_input_tokens_seen": 244635, "step": 11, "time_per_iteration": 2.803267240524292 }, { "auxiliary_loss_clip": 0.0191355, "auxiliary_loss_mlp": 0.01809725, "balance_loss_clip": 1.26741147, "balance_loss_mlp": 1.15321529, "epoch": 0.0014429146876690916, "flos": 27237898039680.0, "grad_norm": 10.5438474879481, "language_loss": 1.25444412, "learning_rate": 1.8001805585541196e-06, "loss": 1.29167688, "num_input_tokens_seen": 265765, "step": 12, "time_per_iteration": 2.8875436782836914 }, { "auxiliary_loss_clip": 0.01915017, "auxiliary_loss_mlp": 0.01820564, "balance_loss_clip": 1.26874328, "balance_loss_mlp": 1.16291046, "epoch": 0.0015631575783081825, "flos": 19062184504320.0, "grad_norm": 6.668204109408272, "language_loss": 1.28979516, "learning_rate": 1.8581671739548328e-06, "loss": 1.32715106, "num_input_tokens_seen": 283500, "step": 13, "time_per_iteration": 2.8347740173339844 }, { "auxiliary_loss_clip": 0.01912794, "auxiliary_loss_mlp": 0.01803477, "balance_loss_clip": 1.2664355, "balance_loss_mlp": 1.15307069, "epoch": 0.0016834004689472734, "flos": 48139491615840.0, "grad_norm": 6.418410000892865, "language_loss": 1.13529301, "learning_rate": 1.9118543942439254e-06, "loss": 1.17245579, "num_input_tokens_seen": 305685, "step": 14, "time_per_iteration": 3.9609487056732178 }, { "auxiliary_loss_clip": 0.01912943, "auxiliary_loss_mlp": 0.01810538, "balance_loss_clip": 1.266698, "balance_loss_mlp": 1.15097725, "epoch": 0.0018036433595863645, "flos": 34970040789120.0, "grad_norm": 5.689455779540412, "language_loss": 1.12768364, "learning_rate": 1.961836000571161e-06, "loss": 1.16491842, "num_input_tokens_seen": 327340, "step": 15, "time_per_iteration": 3.919022560119629 }, { "auxiliary_loss_clip": 0.0190542, "auxiliary_loss_mlp": 0.01908204, "balance_loss_clip": 1.24991369, "balance_loss_mlp": 1.25818002, "epoch": 0.0019238862502254555, "flos": 59768312715360.0, "grad_norm": 3.7737816097582306, "language_loss": 0.64635861, "learning_rate": 2.0085906708279293e-06, "loss": 0.68449485, "num_input_tokens_seen": 382710, "step": 16, "time_per_iteration": 3.2321674823760986 }, { "auxiliary_loss_clip": 0.0191112, "auxiliary_loss_mlp": 0.01802735, "balance_loss_clip": 1.26552486, "balance_loss_mlp": 1.15232944, "epoch": 0.0020441291408645466, "flos": 20814554203200.0, "grad_norm": 4.260065705250067, "language_loss": 1.15976572, "learning_rate": 2.0525099325728135e-06, "loss": 1.1969043, "num_input_tokens_seen": 400890, "step": 17, "time_per_iteration": 2.866849899291992 }, { "auxiliary_loss_clip": 0.01904605, "auxiliary_loss_mlp": 0.0188715, "balance_loss_clip": 1.24898529, "balance_loss_mlp": 1.24017775, "epoch": 0.0021643720315036373, "flos": 63857031651360.0, "grad_norm": 3.5094211484368407, "language_loss": 0.72190106, "learning_rate": 2.0939181139872922e-06, "loss": 0.75981861, "num_input_tokens_seen": 462605, "step": 18, "time_per_iteration": 3.1980948448181152 }, { "auxiliary_loss_clip": 0.01910796, "auxiliary_loss_mlp": 0.01809802, "balance_loss_clip": 1.26439512, "balance_loss_mlp": 1.1548183, "epoch": 0.0022846149221427284, "flos": 31284994639680.0, "grad_norm": 4.688932215385755, "language_loss": 1.016397, "learning_rate": 2.1330868934640175e-06, "loss": 1.05360305, "num_input_tokens_seen": 483280, "step": 19, "time_per_iteration": 2.9459121227264404 }, { "auxiliary_loss_clip": 0.01903953, "auxiliary_loss_mlp": 0.01860505, "balance_loss_clip": 1.24827743, "balance_loss_mlp": 1.21658385, "epoch": 0.002404857812781819, "flos": 51083673714720.0, "grad_norm": 3.5365821166912528, "language_loss": 0.76401877, "learning_rate": 2.170246112844971e-06, "loss": 0.80166334, "num_input_tokens_seen": 537620, "step": 20, "time_per_iteration": 3.043731212615967 }, { "auxiliary_loss_clip": 0.01910372, "auxiliary_loss_mlp": 0.01795697, "balance_loss_clip": 1.2628479, "balance_loss_mlp": 1.13613617, "epoch": 0.0025251007034209102, "flos": 15815356512480.0, "grad_norm": 6.936085225838243, "language_loss": 1.01343846, "learning_rate": 2.2055919496770983e-06, "loss": 1.0504992, "num_input_tokens_seen": 555760, "step": 21, "time_per_iteration": 2.7800581455230713 }, { "auxiliary_loss_clip": 0.01906113, "auxiliary_loss_mlp": 0.01775912, "balance_loss_clip": 1.25926721, "balance_loss_mlp": 1.12169123, "epoch": 0.0026453435940600014, "flos": 37851872588640.0, "grad_norm": 4.516189336888185, "language_loss": 0.89587462, "learning_rate": 2.2392931865974923e-06, "loss": 0.93269485, "num_input_tokens_seen": 578450, "step": 22, "time_per_iteration": 2.9526803493499756 }, { "auxiliary_loss_clip": 0.01906065, "auxiliary_loss_mlp": 0.01760158, "balance_loss_clip": 1.25926888, "balance_loss_mlp": 1.11509252, "epoch": 0.002765586484699092, "flos": 21141987435360.0, "grad_norm": 4.535435927603178, "language_loss": 1.02059984, "learning_rate": 2.271496085962064e-06, "loss": 1.05726206, "num_input_tokens_seen": 596145, "step": 23, "time_per_iteration": 2.809063673019409 }, { "auxiliary_loss_clip": 0.01902946, "auxiliary_loss_mlp": 0.01769696, "balance_loss_clip": 1.25643909, "balance_loss_mlp": 1.11280549, "epoch": 0.002885829375338183, "flos": 20667391799040.0, "grad_norm": 3.4363388932691525, "language_loss": 1.02596307, "learning_rate": 2.3023282262611022e-06, "loss": 1.06268954, "num_input_tokens_seen": 614920, "step": 24, "time_per_iteration": 2.8678436279296875 }, { "auxiliary_loss_clip": 0.01904247, "auxiliary_loss_mlp": 0.01753276, "balance_loss_clip": 1.25724578, "balance_loss_mlp": 1.10401464, "epoch": 0.003006072265977274, "flos": 34823884248000.0, "grad_norm": 4.088223620159423, "language_loss": 0.92539859, "learning_rate": 2.3319015548620114e-06, "loss": 0.96197385, "num_input_tokens_seen": 636060, "step": 25, "time_per_iteration": 2.8813014030456543 }, { "auxiliary_loss_clip": 0.0190439, "auxiliary_loss_mlp": 0.01747438, "balance_loss_clip": 1.25748813, "balance_loss_mlp": 1.09474301, "epoch": 0.003126315156616365, "flos": 24422031622080.0, "grad_norm": 2.5042671551774576, "language_loss": 0.92990601, "learning_rate": 2.3603148416618152e-06, "loss": 0.96642423, "num_input_tokens_seen": 655575, "step": 26, "time_per_iteration": 2.8056204319000244 }, { "auxiliary_loss_clip": 0.01903544, "auxiliary_loss_mlp": 0.01755411, "balance_loss_clip": 1.25692487, "balance_loss_mlp": 1.09661269, "epoch": 0.003246558047255456, "flos": 23622337792800.0, "grad_norm": 3.123807284878479, "language_loss": 1.00853992, "learning_rate": 2.3876556694204647e-06, "loss": 1.04512954, "num_input_tokens_seen": 675730, "step": 27, "time_per_iteration": 2.818707227706909 }, { "auxiliary_loss_clip": 0.01901446, "auxiliary_loss_mlp": 0.01757282, "balance_loss_clip": 1.25491166, "balance_loss_mlp": 1.1015358, "epoch": 0.003366800937894547, "flos": 17820284988960.0, "grad_norm": 3.082938301351975, "language_loss": 0.9081012, "learning_rate": 2.414002061950908e-06, "loss": 0.94468844, "num_input_tokens_seen": 694605, "step": 28, "time_per_iteration": 2.812631130218506 }, { "auxiliary_loss_clip": 0.01901257, "auxiliary_loss_mlp": 0.01743532, "balance_loss_clip": 1.25467324, "balance_loss_mlp": 1.09312642, "epoch": 0.003487043828533638, "flos": 24426126921600.0, "grad_norm": 2.3450446750975473, "language_loss": 0.99958158, "learning_rate": 2.4394238264681557e-06, "loss": 1.03602946, "num_input_tokens_seen": 714340, "step": 29, "time_per_iteration": 2.8571937084198 }, { "auxiliary_loss_clip": 0.01899735, "auxiliary_loss_mlp": 0.01736216, "balance_loss_clip": 1.25363779, "balance_loss_mlp": 1.08771706, "epoch": 0.003607286719172729, "flos": 26140322957760.0, "grad_norm": 1.9762633002265189, "language_loss": 0.99662769, "learning_rate": 2.4639836682781433e-06, "loss": 1.03298724, "num_input_tokens_seen": 734470, "step": 30, "time_per_iteration": 2.9623312950134277 }, { "auxiliary_loss_clip": 0.01899923, "auxiliary_loss_mlp": 0.01733189, "balance_loss_clip": 1.25352287, "balance_loss_mlp": 1.0881238, "epoch": 0.00372752960981182, "flos": 20593092123360.0, "grad_norm": 4.605881729437877, "language_loss": 1.00494242, "learning_rate": 2.487738122623307e-06, "loss": 1.04127359, "num_input_tokens_seen": 753380, "step": 31, "time_per_iteration": 2.822446346282959 }, { "auxiliary_loss_clip": 0.01893611, "auxiliary_loss_mlp": 0.0171681, "balance_loss_clip": 1.24629164, "balance_loss_mlp": 1.0774672, "epoch": 0.003847772500450911, "flos": 22674619391040.0, "grad_norm": 3.075534077002605, "language_loss": 0.99127471, "learning_rate": 2.510738338534912e-06, "loss": 1.02737892, "num_input_tokens_seen": 772105, "step": 32, "time_per_iteration": 2.819568157196045 }, { "auxiliary_loss_clip": 0.01884305, "auxiliary_loss_mlp": 0.01732703, "balance_loss_clip": 1.23700702, "balance_loss_mlp": 1.0834415, "epoch": 0.003968015391090002, "flos": 17967806629920.0, "grad_norm": 3.0219313990783294, "language_loss": 1.02670908, "learning_rate": 2.5330307420306648e-06, "loss": 1.06287909, "num_input_tokens_seen": 788955, "step": 33, "time_per_iteration": 2.9025988578796387 }, { "auxiliary_loss_clip": 0.01883698, "auxiliary_loss_mlp": 0.01738397, "balance_loss_clip": 1.23578572, "balance_loss_mlp": 1.09066176, "epoch": 0.004088258281729093, "flos": 27304114887360.0, "grad_norm": 2.1634211208644802, "language_loss": 0.88349634, "learning_rate": 2.554657600279796e-06, "loss": 0.91971731, "num_input_tokens_seen": 810230, "step": 34, "time_per_iteration": 2.87416934967041 }, { "auxiliary_loss_clip": 0.01878625, "auxiliary_loss_mlp": 0.01716675, "balance_loss_clip": 1.23250723, "balance_loss_mlp": 1.07122815, "epoch": 0.004208501172368184, "flos": 23258598929280.0, "grad_norm": 3.0906839931830734, "language_loss": 1.03485715, "learning_rate": 2.5756575039679493e-06, "loss": 1.07081008, "num_input_tokens_seen": 829780, "step": 35, "time_per_iteration": 2.8309967517852783 }, { "auxiliary_loss_clip": 0.01877425, "auxiliary_loss_mlp": 0.01713273, "balance_loss_clip": 1.23038125, "balance_loss_mlp": 1.06935251, "epoch": 0.0043287440630072746, "flos": 17312113920960.0, "grad_norm": 1.8954246029866677, "language_loss": 0.95380127, "learning_rate": 2.5960657816942747e-06, "loss": 0.98970819, "num_input_tokens_seen": 848695, "step": 36, "time_per_iteration": 2.826158046722412 }, { "auxiliary_loss_clip": 0.01874032, "auxiliary_loss_mlp": 0.01668197, "balance_loss_clip": 1.2190907, "balance_loss_mlp": 1.05174148, "epoch": 0.004448986953646365, "flos": 53092517872320.0, "grad_norm": 1.3602840293813474, "language_loss": 0.60957944, "learning_rate": 2.6159148575788668e-06, "loss": 0.64500165, "num_input_tokens_seen": 906730, "step": 37, "time_per_iteration": 3.2797603607177734 }, { "auxiliary_loss_clip": 0.01875177, "auxiliary_loss_mlp": 0.01715978, "balance_loss_clip": 1.22841585, "balance_loss_mlp": 1.07968688, "epoch": 0.004569229844285457, "flos": 13444174514880.0, "grad_norm": 2.5055858046999027, "language_loss": 0.98889101, "learning_rate": 2.635234561171e-06, "loss": 1.02480257, "num_input_tokens_seen": 925125, "step": 38, "time_per_iteration": 2.8703057765960693 }, { "auxiliary_loss_clip": 0.0187455, "auxiliary_loss_mlp": 0.01700641, "balance_loss_clip": 1.22754943, "balance_loss_mlp": 1.06663823, "epoch": 0.0046894727349245475, "flos": 16209617294880.0, "grad_norm": 2.375736412601621, "language_loss": 0.94195771, "learning_rate": 2.6540523970949877e-06, "loss": 0.97770965, "num_input_tokens_seen": 939970, "step": 39, "time_per_iteration": 2.768829822540283 }, { "auxiliary_loss_clip": 0.01875505, "auxiliary_loss_mlp": 0.0169663, "balance_loss_clip": 1.22817194, "balance_loss_mlp": 1.06720507, "epoch": 0.004809715625563638, "flos": 23914255714560.0, "grad_norm": 2.86911294537135, "language_loss": 0.92634547, "learning_rate": 2.6723937805519533e-06, "loss": 0.96206689, "num_input_tokens_seen": 957470, "step": 40, "time_per_iteration": 3.830716609954834 }, { "auxiliary_loss_clip": 0.01872646, "auxiliary_loss_mlp": 0.01677445, "balance_loss_clip": 1.22657824, "balance_loss_mlp": 1.05297947, "epoch": 0.00492995851620273, "flos": 20773039638240.0, "grad_norm": 2.2493231465491927, "language_loss": 0.93179548, "learning_rate": 2.690282243737839e-06, "loss": 0.96729636, "num_input_tokens_seen": 976405, "step": 41, "time_per_iteration": 3.7958920001983643 }, { "auxiliary_loss_clip": 0.01867601, "auxiliary_loss_mlp": 0.01694351, "balance_loss_clip": 1.22115111, "balance_loss_mlp": 1.06683373, "epoch": 0.0050502014068418205, "flos": 20338665314400.0, "grad_norm": 4.320554601964803, "language_loss": 0.99503583, "learning_rate": 2.7077396173840807e-06, "loss": 1.03065538, "num_input_tokens_seen": 994690, "step": 42, "time_per_iteration": 2.829970359802246 }, { "auxiliary_loss_clip": 0.01868855, "auxiliary_loss_mlp": 0.01681412, "balance_loss_clip": 1.22277236, "balance_loss_mlp": 1.05847216, "epoch": 0.005170444297480911, "flos": 25994884890240.0, "grad_norm": 2.9108812054383084, "language_loss": 0.92959118, "learning_rate": 2.7247861909342594e-06, "loss": 0.96509385, "num_input_tokens_seen": 1015615, "step": 43, "time_per_iteration": 2.8508596420288086 }, { "auxiliary_loss_clip": 0.01868147, "auxiliary_loss_mlp": 0.01671352, "balance_loss_clip": 1.22135115, "balance_loss_mlp": 1.05146408, "epoch": 0.005290687188120003, "flos": 20954064863520.0, "grad_norm": 2.6945337312491615, "language_loss": 0.83075422, "learning_rate": 2.7414408543044743e-06, "loss": 0.86614919, "num_input_tokens_seen": 1031255, "step": 44, "time_per_iteration": 2.9302732944488525 }, { "auxiliary_loss_clip": 0.01864763, "auxiliary_loss_mlp": 0.01665761, "balance_loss_clip": 1.21935177, "balance_loss_mlp": 1.04549098, "epoch": 0.005410930078759093, "flos": 15851410678080.0, "grad_norm": 4.578661551868265, "language_loss": 0.79416311, "learning_rate": 2.7577212237113157e-06, "loss": 0.82946837, "num_input_tokens_seen": 1048295, "step": 45, "time_per_iteration": 2.7564139366149902 }, { "auxiliary_loss_clip": 0.01863882, "auxiliary_loss_mlp": 0.01662768, "balance_loss_clip": 1.21878839, "balance_loss_mlp": 1.04936492, "epoch": 0.005531172969398184, "flos": 21104999254080.0, "grad_norm": 2.0352472881512376, "language_loss": 1.04452825, "learning_rate": 2.7736437536690466e-06, "loss": 1.07979465, "num_input_tokens_seen": 1067925, "step": 46, "time_per_iteration": 2.8121159076690674 }, { "auxiliary_loss_clip": 0.01862809, "auxiliary_loss_mlp": 0.01645477, "balance_loss_clip": 1.21769452, "balance_loss_mlp": 1.04008412, "epoch": 0.005651415860037276, "flos": 20844896503680.0, "grad_norm": 2.0954680258083886, "language_loss": 1.07884026, "learning_rate": 2.789223836941131e-06, "loss": 1.11392307, "num_input_tokens_seen": 1088060, "step": 47, "time_per_iteration": 2.9159305095672607 }, { "auxiliary_loss_clip": 0.01859326, "auxiliary_loss_mlp": 0.01656416, "balance_loss_clip": 1.21486163, "balance_loss_mlp": 1.04873455, "epoch": 0.005771658750676366, "flos": 13260203547840.0, "grad_norm": 2.237500986782841, "language_loss": 1.08837438, "learning_rate": 2.8044758939680847e-06, "loss": 1.12353182, "num_input_tokens_seen": 1104130, "step": 48, "time_per_iteration": 2.8832688331604004 }, { "auxiliary_loss_clip": 0.01857131, "auxiliary_loss_mlp": 0.01649199, "balance_loss_clip": 1.21263826, "balance_loss_mlp": 1.04571378, "epoch": 0.005891901641315457, "flos": 24425408448000.0, "grad_norm": 2.5454416030890026, "language_loss": 1.02119792, "learning_rate": 2.8194134530738863e-06, "loss": 1.0562613, "num_input_tokens_seen": 1122900, "step": 49, "time_per_iteration": 2.8506345748901367 }, { "auxiliary_loss_clip": 0.01858187, "auxiliary_loss_mlp": 0.01651574, "balance_loss_clip": 1.21365213, "balance_loss_mlp": 1.04427457, "epoch": 0.006012144531954548, "flos": 23076208604160.0, "grad_norm": 2.971016339847077, "language_loss": 0.90385991, "learning_rate": 2.834049222568994e-06, "loss": 0.93895751, "num_input_tokens_seen": 1140250, "step": 50, "time_per_iteration": 2.842764377593994 }, { "auxiliary_loss_clip": 0.01859716, "auxiliary_loss_mlp": 0.01635654, "balance_loss_clip": 1.21538496, "balance_loss_mlp": 1.03064311, "epoch": 0.006132387422593639, "flos": 22528786163040.0, "grad_norm": 1.7744676608733116, "language_loss": 0.92559886, "learning_rate": 2.848395155712969e-06, "loss": 0.96055251, "num_input_tokens_seen": 1160470, "step": 51, "time_per_iteration": 2.927891969680786 }, { "auxiliary_loss_clip": 0.01857825, "auxiliary_loss_mlp": 0.01645265, "balance_loss_clip": 1.21345937, "balance_loss_mlp": 1.04063559, "epoch": 0.00625263031323273, "flos": 27628350912000.0, "grad_norm": 2.2425210272620237, "language_loss": 0.97735584, "learning_rate": 2.8624625093687977e-06, "loss": 1.01238656, "num_input_tokens_seen": 1177605, "step": 52, "time_per_iteration": 2.8583261966705322 }, { "auxiliary_loss_clip": 0.01859185, "auxiliary_loss_mlp": 0.01629499, "balance_loss_clip": 1.21525037, "balance_loss_mlp": 1.03402507, "epoch": 0.006372873203871821, "flos": 23110682127840.0, "grad_norm": 7.372312994718682, "language_loss": 0.88955331, "learning_rate": 2.876261897070029e-06, "loss": 0.92444015, "num_input_tokens_seen": 1197735, "step": 53, "time_per_iteration": 2.865415334701538 }, { "auxiliary_loss_clip": 0.01858055, "auxiliary_loss_mlp": 0.01632262, "balance_loss_clip": 1.21384442, "balance_loss_mlp": 1.03526235, "epoch": 0.006493116094510912, "flos": 22856039776800.0, "grad_norm": 2.2157635942230747, "language_loss": 0.92424357, "learning_rate": 2.889803337127447e-06, "loss": 0.95914674, "num_input_tokens_seen": 1216335, "step": 54, "time_per_iteration": 2.8443069458007812 }, { "auxiliary_loss_clip": 0.0185614, "auxiliary_loss_mlp": 0.01638192, "balance_loss_clip": 1.21223366, "balance_loss_mlp": 1.03890347, "epoch": 0.006613358985150003, "flos": 23071718144160.0, "grad_norm": 2.991903007707229, "language_loss": 0.84767199, "learning_rate": 2.903096296321516e-06, "loss": 0.88261533, "num_input_tokens_seen": 1234480, "step": 55, "time_per_iteration": 2.8121676445007324 }, { "auxiliary_loss_clip": 0.01853548, "auxiliary_loss_mlp": 0.01621898, "balance_loss_clip": 1.21026278, "balance_loss_mlp": 1.03786767, "epoch": 0.006733601875789094, "flos": 26537673176640.0, "grad_norm": 2.1028539858408632, "language_loss": 0.91609704, "learning_rate": 2.9161497296578907e-06, "loss": 0.95085144, "num_input_tokens_seen": 1253870, "step": 56, "time_per_iteration": 2.8688464164733887 }, { "auxiliary_loss_clip": 0.01853014, "auxiliary_loss_mlp": 0.01617671, "balance_loss_clip": 1.20949602, "balance_loss_mlp": 1.03325987, "epoch": 0.006853844766428185, "flos": 15523187124960.0, "grad_norm": 2.2778609600532627, "language_loss": 0.85870171, "learning_rate": 2.928972116604173e-06, "loss": 0.89340854, "num_input_tokens_seen": 1270145, "step": 57, "time_per_iteration": 2.8649866580963135 }, { "auxiliary_loss_clip": 0.01853351, "auxiliary_loss_mlp": 0.01621806, "balance_loss_clip": 1.21009362, "balance_loss_mlp": 1.03281689, "epoch": 0.006974087657067276, "flos": 24243772520160.0, "grad_norm": 2.033260208132056, "language_loss": 1.02029133, "learning_rate": 2.9415714941751377e-06, "loss": 1.05504286, "num_input_tokens_seen": 1291365, "step": 58, "time_per_iteration": 2.8256542682647705 }, { "auxiliary_loss_clip": 0.01854737, "auxiliary_loss_mlp": 0.01612355, "balance_loss_clip": 1.21194613, "balance_loss_mlp": 1.03137684, "epoch": 0.007094330547706367, "flos": 25772524718400.0, "grad_norm": 2.001416177463473, "language_loss": 0.9364711, "learning_rate": 2.9539554871897396e-06, "loss": 0.97114199, "num_input_tokens_seen": 1311535, "step": 59, "time_per_iteration": 2.8636696338653564 }, { "auxiliary_loss_clip": 0.01847664, "auxiliary_loss_mlp": 0.01617486, "balance_loss_clip": 1.20566404, "balance_loss_mlp": 1.03574514, "epoch": 0.007214573438345458, "flos": 21319025132160.0, "grad_norm": 1.9642343714095178, "language_loss": 0.97512543, "learning_rate": 2.9661313359851253e-06, "loss": 1.00977695, "num_input_tokens_seen": 1329420, "step": 60, "time_per_iteration": 2.8107309341430664 }, { "auxiliary_loss_clip": 0.01849051, "auxiliary_loss_mlp": 0.01614846, "balance_loss_clip": 1.20686829, "balance_loss_mlp": 1.03920782, "epoch": 0.007334816328984549, "flos": 24937100036640.0, "grad_norm": 2.4542671145534274, "language_loss": 0.93862718, "learning_rate": 2.978105921839922e-06, "loss": 0.97326612, "num_input_tokens_seen": 1349965, "step": 61, "time_per_iteration": 2.934342622756958 }, { "auxiliary_loss_clip": 0.01846949, "auxiliary_loss_mlp": 0.01622165, "balance_loss_clip": 1.20523179, "balance_loss_mlp": 1.0430938, "epoch": 0.00745505921962364, "flos": 18510594916320.0, "grad_norm": 3.667415567701444, "language_loss": 0.72064412, "learning_rate": 2.9898857903302893e-06, "loss": 0.75533521, "num_input_tokens_seen": 1368915, "step": 62, "time_per_iteration": 2.887375593185425 }, { "auxiliary_loss_clip": 0.01847639, "auxiliary_loss_mlp": 0.01614657, "balance_loss_clip": 1.20592189, "balance_loss_mlp": 1.04474115, "epoch": 0.007575302110262731, "flos": 18477666110880.0, "grad_norm": 2.5695423600580125, "language_loss": 0.88018095, "learning_rate": 3.001477172817253e-06, "loss": 0.91480386, "num_input_tokens_seen": 1386805, "step": 63, "time_per_iteration": 2.7826550006866455 }, { "auxiliary_loss_clip": 0.01845758, "auxiliary_loss_mlp": 0.01611003, "balance_loss_clip": 1.2047199, "balance_loss_mlp": 1.04032397, "epoch": 0.007695545000901822, "flos": 24973190125920.0, "grad_norm": 3.0975792674994573, "language_loss": 0.96112031, "learning_rate": 3.012886006241894e-06, "loss": 0.9956879, "num_input_tokens_seen": 1406190, "step": 64, "time_per_iteration": 2.9311914443969727 }, { "auxiliary_loss_clip": 0.01842645, "auxiliary_loss_mlp": 0.0159531, "balance_loss_clip": 1.20126021, "balance_loss_mlp": 1.02577591, "epoch": 0.007815787891540913, "flos": 21324234065760.0, "grad_norm": 2.817880224079115, "language_loss": 0.88332474, "learning_rate": 3.0241179513858383e-06, "loss": 0.91770434, "num_input_tokens_seen": 1425500, "step": 65, "time_per_iteration": 2.8279612064361572 }, { "auxiliary_loss_clip": 0.01839116, "auxiliary_loss_mlp": 0.01585995, "balance_loss_clip": 1.19833136, "balance_loss_mlp": 1.02637911, "epoch": 0.007936030782180003, "flos": 21575786980320.0, "grad_norm": 2.2635020140590965, "language_loss": 0.87652445, "learning_rate": 3.035178409737647e-06, "loss": 0.91077554, "num_input_tokens_seen": 1442950, "step": 66, "time_per_iteration": 3.810821294784546 }, { "auxiliary_loss_clip": 0.01836923, "auxiliary_loss_mlp": 0.01596943, "balance_loss_clip": 1.19541132, "balance_loss_mlp": 1.03313041, "epoch": 0.008056273672819095, "flos": 20120795602560.0, "grad_norm": 2.3812051968654018, "language_loss": 0.88824558, "learning_rate": 3.046072539090907e-06, "loss": 0.9225843, "num_input_tokens_seen": 1460915, "step": 67, "time_per_iteration": 3.7806077003479004 }, { "auxiliary_loss_clip": 0.01831399, "auxiliary_loss_mlp": 0.01589964, "balance_loss_clip": 1.19192803, "balance_loss_mlp": 1.03301764, "epoch": 0.008176516563458186, "flos": 18333126135360.0, "grad_norm": 2.413935447398502, "language_loss": 1.04680252, "learning_rate": 3.056805267986779e-06, "loss": 1.08101618, "num_input_tokens_seen": 1478385, "step": 68, "time_per_iteration": 2.825183391571045 }, { "auxiliary_loss_clip": 0.01830978, "auxiliary_loss_mlp": 0.0159141, "balance_loss_clip": 1.18984056, "balance_loss_mlp": 1.03446388, "epoch": 0.008296759454097276, "flos": 21872087591040.0, "grad_norm": 2.5786612151771506, "language_loss": 0.95346534, "learning_rate": 3.0673813091022194e-06, "loss": 0.98768926, "num_input_tokens_seen": 1497605, "step": 69, "time_per_iteration": 2.8213882446289062 }, { "auxiliary_loss_clip": 0.01841816, "auxiliary_loss_mlp": 0.01572226, "balance_loss_clip": 1.1971035, "balance_loss_mlp": 1.04732406, "epoch": 0.008417002344736368, "flos": 63408265140960.0, "grad_norm": 1.2707139996908012, "language_loss": 0.62090373, "learning_rate": 3.0778051716749317e-06, "loss": 0.65504408, "num_input_tokens_seen": 1561150, "step": 70, "time_per_iteration": 3.348858118057251 }, { "auxiliary_loss_clip": 0.01821615, "auxiliary_loss_mlp": 0.01586217, "balance_loss_clip": 1.18326759, "balance_loss_mlp": 1.0307976, "epoch": 0.008537245235375458, "flos": 22966465465440.0, "grad_norm": 2.488468309246532, "language_loss": 0.90406239, "learning_rate": 3.0880811730470094e-06, "loss": 0.93814069, "num_input_tokens_seen": 1580605, "step": 71, "time_per_iteration": 2.8543038368225098 }, { "auxiliary_loss_clip": 0.01837191, "auxiliary_loss_mlp": 0.01559267, "balance_loss_clip": 1.19366574, "balance_loss_mlp": 1.04352045, "epoch": 0.008657488126014549, "flos": 61984083071520.0, "grad_norm": 1.131484678002912, "language_loss": 0.58560121, "learning_rate": 3.098213449401257e-06, "loss": 0.61956584, "num_input_tokens_seen": 1647535, "step": 72, "time_per_iteration": 3.306842803955078 }, { "auxiliary_loss_clip": 0.0182346, "auxiliary_loss_mlp": 0.01570326, "balance_loss_clip": 1.18472385, "balance_loss_mlp": 1.02863884, "epoch": 0.00877773101665364, "flos": 30296803459680.0, "grad_norm": 2.752276881015818, "language_loss": 0.98843467, "learning_rate": 3.1082059657570015e-06, "loss": 1.0223726, "num_input_tokens_seen": 1666770, "step": 73, "time_per_iteration": 3.0367298126220703 }, { "auxiliary_loss_clip": 0.01819031, "auxiliary_loss_mlp": 0.01579388, "balance_loss_clip": 1.18121362, "balance_loss_mlp": 1.0331229, "epoch": 0.00889797390729273, "flos": 23514067524960.0, "grad_norm": 3.654137203187419, "language_loss": 0.96573687, "learning_rate": 3.1180625252858496e-06, "loss": 0.99972105, "num_input_tokens_seen": 1685200, "step": 74, "time_per_iteration": 2.7576794624328613 }, { "auxiliary_loss_clip": 0.01818364, "auxiliary_loss_mlp": 0.01570839, "balance_loss_clip": 1.18008018, "balance_loss_mlp": 1.03487396, "epoch": 0.009018216797931822, "flos": 23075849367360.0, "grad_norm": 2.7514647567073185, "language_loss": 0.80069435, "learning_rate": 3.1277867780021663e-06, "loss": 0.83458638, "num_input_tokens_seen": 1701835, "step": 75, "time_per_iteration": 2.792656660079956 }, { "auxiliary_loss_clip": 0.01816846, "auxiliary_loss_mlp": 0.01564066, "balance_loss_clip": 1.17858315, "balance_loss_mlp": 1.02771974, "epoch": 0.009138459688570914, "flos": 15918884854560.0, "grad_norm": 1.962966257071131, "language_loss": 0.95576298, "learning_rate": 3.1373822288779824e-06, "loss": 0.98957211, "num_input_tokens_seen": 1718415, "step": 76, "time_per_iteration": 2.76306414604187 }, { "auxiliary_loss_clip": 0.01814961, "auxiliary_loss_mlp": 0.01577042, "balance_loss_clip": 1.17716455, "balance_loss_mlp": 1.0418396, "epoch": 0.009258702579210003, "flos": 27016543730880.0, "grad_norm": 5.995354430069101, "language_loss": 0.79688466, "learning_rate": 3.1468522454274533e-06, "loss": 0.83080471, "num_input_tokens_seen": 1738770, "step": 77, "time_per_iteration": 2.9220314025878906 }, { "auxiliary_loss_clip": 0.01813443, "auxiliary_loss_mlp": 0.01571437, "balance_loss_clip": 1.17539275, "balance_loss_mlp": 1.03890562, "epoch": 0.009378945469849095, "flos": 26903208224160.0, "grad_norm": 1.8659484793661725, "language_loss": 0.91540557, "learning_rate": 3.15620006480197e-06, "loss": 0.94925439, "num_input_tokens_seen": 1758040, "step": 78, "time_per_iteration": 2.8597190380096436 }, { "auxiliary_loss_clip": 0.0181187, "auxiliary_loss_mlp": 0.01561885, "balance_loss_clip": 1.17523217, "balance_loss_mlp": 1.03354955, "epoch": 0.009499188360488187, "flos": 35694249461280.0, "grad_norm": 3.1948484667623607, "language_loss": 0.74948859, "learning_rate": 3.1654288004333087e-06, "loss": 0.78322613, "num_input_tokens_seen": 1776705, "step": 79, "time_per_iteration": 2.9959628582000732 }, { "auxiliary_loss_clip": 0.01811816, "auxiliary_loss_mlp": 0.01553878, "balance_loss_clip": 1.17517853, "balance_loss_mlp": 1.02859378, "epoch": 0.009619431251127276, "flos": 21503211641280.0, "grad_norm": 3.6200263070464898, "language_loss": 0.76207316, "learning_rate": 3.1745414482589353e-06, "loss": 0.79573011, "num_input_tokens_seen": 1795915, "step": 80, "time_per_iteration": 2.861114025115967 }, { "auxiliary_loss_clip": 0.01808016, "auxiliary_loss_mlp": 0.01553737, "balance_loss_clip": 1.17207778, "balance_loss_mlp": 1.03531957, "epoch": 0.009739674141766368, "flos": 17421066738720.0, "grad_norm": 2.3193693946856384, "language_loss": 0.87201357, "learning_rate": 3.1835408925606204e-06, "loss": 0.90563107, "num_input_tokens_seen": 1814055, "step": 81, "time_per_iteration": 2.782130241394043 }, { "auxiliary_loss_clip": 0.01810024, "auxiliary_loss_mlp": 0.01544896, "balance_loss_clip": 1.17448139, "balance_loss_mlp": 1.02457142, "epoch": 0.00985991703240546, "flos": 27527121685440.0, "grad_norm": 3.250468245538281, "language_loss": 0.89285916, "learning_rate": 3.1924299114448214e-06, "loss": 0.92640829, "num_input_tokens_seen": 1834535, "step": 82, "time_per_iteration": 2.877828598022461 }, { "auxiliary_loss_clip": 0.0180731, "auxiliary_loss_mlp": 0.01560164, "balance_loss_clip": 1.17146373, "balance_loss_mlp": 1.0386945, "epoch": 0.00998015992304455, "flos": 13808092996800.0, "grad_norm": 2.7764573008948896, "language_loss": 0.83432901, "learning_rate": 3.2012111819909055e-06, "loss": 0.86800373, "num_input_tokens_seen": 1851865, "step": 83, "time_per_iteration": 2.7228121757507324 }, { "auxiliary_loss_clip": 0.01807498, "auxiliary_loss_mlp": 0.01546216, "balance_loss_clip": 1.17196941, "balance_loss_mlp": 1.03123188, "epoch": 0.010100402813683641, "flos": 20191395139200.0, "grad_norm": 2.249394161883813, "language_loss": 0.94957131, "learning_rate": 3.2098872850910627e-06, "loss": 0.98310846, "num_input_tokens_seen": 1868540, "step": 84, "time_per_iteration": 2.865494966506958 }, { "auxiliary_loss_clip": 0.01808019, "auxiliary_loss_mlp": 0.01546675, "balance_loss_clip": 1.17227817, "balance_loss_mlp": 1.03436124, "epoch": 0.010220645704322733, "flos": 17201652308640.0, "grad_norm": 1.931176934051606, "language_loss": 0.89326215, "learning_rate": 3.2184607100038194e-06, "loss": 0.92680907, "num_input_tokens_seen": 1887180, "step": 85, "time_per_iteration": 2.795515298843384 }, { "auxiliary_loss_clip": 0.01804326, "auxiliary_loss_mlp": 0.01540078, "balance_loss_clip": 1.16933441, "balance_loss_mlp": 1.02967179, "epoch": 0.010340888594961822, "flos": 21470426530560.0, "grad_norm": 2.140806015256881, "language_loss": 0.93246639, "learning_rate": 3.2269338586412414e-06, "loss": 0.96591049, "num_input_tokens_seen": 1904765, "step": 86, "time_per_iteration": 3.0042662620544434 }, { "auxiliary_loss_clip": 0.01806912, "auxiliary_loss_mlp": 0.01539959, "balance_loss_clip": 1.17240036, "balance_loss_mlp": 1.02917147, "epoch": 0.010461131485600914, "flos": 23002842944160.0, "grad_norm": 8.802160573055417, "language_loss": 0.96569514, "learning_rate": 3.2353090496083106e-06, "loss": 0.99916387, "num_input_tokens_seen": 1922600, "step": 87, "time_per_iteration": 3.046142816543579 }, { "auxiliary_loss_clip": 0.01804528, "auxiliary_loss_mlp": 0.0154201, "balance_loss_clip": 1.17068076, "balance_loss_mlp": 1.03656292, "epoch": 0.010581374376240005, "flos": 33546864582720.0, "grad_norm": 3.5127375446411175, "language_loss": 0.81251335, "learning_rate": 3.2435885220114572e-06, "loss": 0.84597874, "num_input_tokens_seen": 1943950, "step": 88, "time_per_iteration": 2.9076414108276367 }, { "auxiliary_loss_clip": 0.01802749, "auxiliary_loss_mlp": 0.01541108, "balance_loss_clip": 1.16988361, "balance_loss_mlp": 1.03718674, "epoch": 0.010701617266879095, "flos": 21763098849600.0, "grad_norm": 2.006442421997554, "language_loss": 0.93869728, "learning_rate": 3.2517744390519113e-06, "loss": 0.9721359, "num_input_tokens_seen": 1962815, "step": 89, "time_per_iteration": 2.8527448177337646 }, { "auxiliary_loss_clip": 0.0179918, "auxiliary_loss_mlp": 0.01531969, "balance_loss_clip": 1.16550267, "balance_loss_mlp": 1.02728426, "epoch": 0.010821860157518187, "flos": 19060208701920.0, "grad_norm": 2.039543475119128, "language_loss": 0.75146317, "learning_rate": 3.259868891418298e-06, "loss": 0.78477466, "num_input_tokens_seen": 1980580, "step": 90, "time_per_iteration": 2.9262335300445557 }, { "auxiliary_loss_clip": 0.01800793, "auxiliary_loss_mlp": 0.0152213, "balance_loss_clip": 1.16769493, "balance_loss_mlp": 1.02660131, "epoch": 0.010942103048157278, "flos": 25447390601760.0, "grad_norm": 1.9799588235662395, "language_loss": 0.85138702, "learning_rate": 3.2678739004917757e-06, "loss": 0.88461626, "num_input_tokens_seen": 2000315, "step": 91, "time_per_iteration": 2.836886167526245 }, { "auxiliary_loss_clip": 0.01799841, "auxiliary_loss_mlp": 0.01529053, "balance_loss_clip": 1.16607141, "balance_loss_mlp": 1.02970934, "epoch": 0.011062345938796368, "flos": 27493941414240.0, "grad_norm": 1.6388094001807132, "language_loss": 0.92303765, "learning_rate": 3.275791421376029e-06, "loss": 0.9563266, "num_input_tokens_seen": 2023760, "step": 92, "time_per_iteration": 3.9523708820343018 }, { "auxiliary_loss_clip": 0.01799618, "auxiliary_loss_mlp": 0.01527022, "balance_loss_clip": 1.16663301, "balance_loss_mlp": 1.02653337, "epoch": 0.01118258882943546, "flos": 16071220268640.0, "grad_norm": 2.4037783347935435, "language_loss": 0.96006465, "learning_rate": 3.2836233457634622e-06, "loss": 0.99333107, "num_input_tokens_seen": 2041895, "step": 93, "time_per_iteration": 3.8381381034851074 }, { "auxiliary_loss_clip": 0.01795169, "auxiliary_loss_mlp": 0.01538125, "balance_loss_clip": 1.16315889, "balance_loss_mlp": 1.04488468, "epoch": 0.011302831720074551, "flos": 20668613204160.0, "grad_norm": 1.8654137662220924, "language_loss": 0.85278249, "learning_rate": 3.2913715046481135e-06, "loss": 0.88611549, "num_input_tokens_seen": 2061640, "step": 94, "time_per_iteration": 3.7328107357025146 }, { "auxiliary_loss_clip": 0.0179633, "auxiliary_loss_mlp": 0.01526818, "balance_loss_clip": 1.16422772, "balance_loss_mlp": 1.02938116, "epoch": 0.011423074610713641, "flos": 13072245052320.0, "grad_norm": 2.694210856959129, "language_loss": 0.88883495, "learning_rate": 3.299037670895023e-06, "loss": 0.92206633, "num_input_tokens_seen": 2078255, "step": 95, "time_per_iteration": 2.760805368423462 }, { "auxiliary_loss_clip": 0.01795766, "auxiliary_loss_mlp": 0.01517318, "balance_loss_clip": 1.16391802, "balance_loss_mlp": 1.03018141, "epoch": 0.011543317501352733, "flos": 30335659672320.0, "grad_norm": 2.214903964902809, "language_loss": 0.80383611, "learning_rate": 3.3066235616750667e-06, "loss": 0.83696699, "num_input_tokens_seen": 2099490, "step": 96, "time_per_iteration": 2.8941457271575928 }, { "auxiliary_loss_clip": 0.01791673, "auxiliary_loss_mlp": 0.01522344, "balance_loss_clip": 1.1600225, "balance_loss_mlp": 1.03787756, "epoch": 0.011663560391991824, "flos": 15522971582880.0, "grad_norm": 2.2370748834191225, "language_loss": 0.92423671, "learning_rate": 3.3141308407736276e-06, "loss": 0.95737696, "num_input_tokens_seen": 2116125, "step": 97, "time_per_iteration": 2.7695276737213135 }, { "auxiliary_loss_clip": 0.01787055, "auxiliary_loss_mlp": 0.01518994, "balance_loss_clip": 1.15661263, "balance_loss_mlp": 1.02995026, "epoch": 0.011783803282630914, "flos": 19902135569760.0, "grad_norm": 3.592293303501833, "language_loss": 0.86687285, "learning_rate": 3.321561120780869e-06, "loss": 0.89993334, "num_input_tokens_seen": 2134835, "step": 98, "time_per_iteration": 2.800607204437256 }, { "auxiliary_loss_clip": 0.01786852, "auxiliary_loss_mlp": 0.01506246, "balance_loss_clip": 1.15521228, "balance_loss_mlp": 1.02597535, "epoch": 0.011904046173270006, "flos": 22340683972800.0, "grad_norm": 2.8665294092974762, "language_loss": 1.01405597, "learning_rate": 3.3289159651708192e-06, "loss": 1.04698682, "num_input_tokens_seen": 2152410, "step": 99, "time_per_iteration": 2.8571319580078125 }, { "auxiliary_loss_clip": 0.01785114, "auxiliary_loss_mlp": 0.01507632, "balance_loss_clip": 1.15472102, "balance_loss_mlp": 1.02659893, "epoch": 0.012024289063909096, "flos": 19100070777600.0, "grad_norm": 1.8694887965137927, "language_loss": 0.97433436, "learning_rate": 3.3361968902759768e-06, "loss": 1.00726175, "num_input_tokens_seen": 2172090, "step": 100, "time_per_iteration": 2.8155312538146973 }, { "auxiliary_loss_clip": 0.01781736, "auxiliary_loss_mlp": 0.01509873, "balance_loss_clip": 1.1503787, "balance_loss_mlp": 1.03379929, "epoch": 0.012144531954548187, "flos": 15012214009920.0, "grad_norm": 2.709253713015997, "language_loss": 0.93845528, "learning_rate": 3.343405367163663e-06, "loss": 0.97137141, "num_input_tokens_seen": 2189020, "step": 101, "time_per_iteration": 2.796427011489868 }, { "auxiliary_loss_clip": 0.01782988, "auxiliary_loss_mlp": 0.01504569, "balance_loss_clip": 1.15166223, "balance_loss_mlp": 1.03383493, "epoch": 0.012264774845187279, "flos": 15122028996000.0, "grad_norm": 2.8814603050567418, "language_loss": 0.81322432, "learning_rate": 3.350542823419951e-06, "loss": 0.84609985, "num_input_tokens_seen": 2205620, "step": 102, "time_per_iteration": 2.8613686561584473 }, { "auxiliary_loss_clip": 0.01778568, "auxiliary_loss_mlp": 0.01498592, "balance_loss_clip": 1.14975369, "balance_loss_mlp": 1.02785897, "epoch": 0.012385017735826368, "flos": 13949256146400.0, "grad_norm": 4.458968782600725, "language_loss": 0.8705824, "learning_rate": 3.3576106448465615e-06, "loss": 0.90335405, "num_input_tokens_seen": 2219000, "step": 103, "time_per_iteration": 2.7763988971710205 }, { "auxiliary_loss_clip": 0.01776472, "auxiliary_loss_mlp": 0.01501909, "balance_loss_clip": 1.1477747, "balance_loss_mlp": 1.03231978, "epoch": 0.01250526062646546, "flos": 23623271808480.0, "grad_norm": 2.153711730583216, "language_loss": 0.88035727, "learning_rate": 3.3646101770757797e-06, "loss": 0.91314113, "num_input_tokens_seen": 2237790, "step": 104, "time_per_iteration": 2.848287343978882 }, { "auxiliary_loss_clip": 0.01773456, "auxiliary_loss_mlp": 0.014987, "balance_loss_clip": 1.14548934, "balance_loss_mlp": 1.03254378, "epoch": 0.012625503517104552, "flos": 34640092899360.0, "grad_norm": 31.522011490379654, "language_loss": 0.85738742, "learning_rate": 3.371542727108104e-06, "loss": 0.89010894, "num_input_tokens_seen": 2259965, "step": 105, "time_per_iteration": 3.0282950401306152 }, { "auxiliary_loss_clip": 0.01773186, "auxiliary_loss_mlp": 0.01495731, "balance_loss_clip": 1.14581275, "balance_loss_mlp": 1.03453374, "epoch": 0.012745746407743641, "flos": 17821901554560.0, "grad_norm": 2.5260408451685783, "language_loss": 0.89888644, "learning_rate": 3.3784095647770114e-06, "loss": 0.9315756, "num_input_tokens_seen": 2278610, "step": 106, "time_per_iteration": 2.7656233310699463 }, { "auxiliary_loss_clip": 0.0177014, "auxiliary_loss_mlp": 0.01491445, "balance_loss_clip": 1.14358664, "balance_loss_mlp": 1.02490759, "epoch": 0.012865989298382733, "flos": 20595067925760.0, "grad_norm": 3.090905907608011, "language_loss": 0.88822317, "learning_rate": 3.3852119241449547e-06, "loss": 0.92083907, "num_input_tokens_seen": 2297730, "step": 107, "time_per_iteration": 2.839017391204834 }, { "auxiliary_loss_clip": 0.017694, "auxiliary_loss_mlp": 0.01486445, "balance_loss_clip": 1.14271951, "balance_loss_mlp": 1.02944434, "epoch": 0.012986232189021825, "flos": 23948980704000.0, "grad_norm": 2.518919943369509, "language_loss": 0.9643724, "learning_rate": 3.3919510048344295e-06, "loss": 0.9969309, "num_input_tokens_seen": 2315740, "step": 108, "time_per_iteration": 2.8577563762664795 }, { "auxiliary_loss_clip": 0.01767048, "auxiliary_loss_mlp": 0.01489272, "balance_loss_clip": 1.140113, "balance_loss_mlp": 1.03341532, "epoch": 0.013106475079660914, "flos": 23725435050720.0, "grad_norm": 1.993965187530803, "language_loss": 0.86760056, "learning_rate": 3.3986279732976907e-06, "loss": 0.90016377, "num_input_tokens_seen": 2334215, "step": 109, "time_per_iteration": 2.8234150409698486 }, { "auxiliary_loss_clip": 0.01769845, "auxiliary_loss_mlp": 0.01486209, "balance_loss_clip": 1.14331961, "balance_loss_mlp": 1.03493071, "epoch": 0.013226717970300006, "flos": 21102448672800.0, "grad_norm": 2.1136376021679646, "language_loss": 0.95659506, "learning_rate": 3.4052439640284983e-06, "loss": 0.98915553, "num_input_tokens_seen": 2353130, "step": 110, "time_per_iteration": 2.7776148319244385 }, { "auxiliary_loss_clip": 0.01764913, "auxiliary_loss_mlp": 0.01474536, "balance_loss_clip": 1.14024544, "balance_loss_mlp": 1.02859783, "epoch": 0.013346960860939098, "flos": 24863913995040.0, "grad_norm": 1.8736547395155867, "language_loss": 0.81145513, "learning_rate": 3.4118000807190217e-06, "loss": 0.84384954, "num_input_tokens_seen": 2374010, "step": 111, "time_per_iteration": 2.849896192550659 }, { "auxiliary_loss_clip": 0.01763664, "auxiliary_loss_mlp": 0.014804, "balance_loss_clip": 1.13845289, "balance_loss_mlp": 1.02873969, "epoch": 0.013467203751578187, "flos": 28181952226080.0, "grad_norm": 1.8156242161679488, "language_loss": 0.76345527, "learning_rate": 3.4182973973648723e-06, "loss": 0.79589593, "num_input_tokens_seen": 2395220, "step": 112, "time_per_iteration": 2.840073347091675 }, { "auxiliary_loss_clip": 0.01763365, "auxiliary_loss_mlp": 0.0148161, "balance_loss_clip": 1.13887453, "balance_loss_mlp": 1.036816, "epoch": 0.013587446642217279, "flos": 18916243505280.0, "grad_norm": 2.8409409364302975, "language_loss": 0.94993162, "learning_rate": 3.424736959321014e-06, "loss": 0.98238128, "num_input_tokens_seen": 2413025, "step": 113, "time_per_iteration": 3.047708749771118 }, { "auxiliary_loss_clip": 0.01763237, "auxiliary_loss_mlp": 0.01469178, "balance_loss_clip": 1.13908577, "balance_loss_mlp": 1.02629161, "epoch": 0.01370768953285637, "flos": 23988627237600.0, "grad_norm": 1.8942894655489984, "language_loss": 0.88378859, "learning_rate": 3.431119784311155e-06, "loss": 0.91611278, "num_input_tokens_seen": 2432700, "step": 114, "time_per_iteration": 2.871973991394043 }, { "auxiliary_loss_clip": 0.01761922, "auxiliary_loss_mlp": 0.01470445, "balance_loss_clip": 1.13724422, "balance_loss_mlp": 1.03366268, "epoch": 0.01382793242349546, "flos": 39202581227040.0, "grad_norm": 1.667719246533415, "language_loss": 0.77545607, "learning_rate": 3.43744686339307e-06, "loss": 0.80777979, "num_input_tokens_seen": 2455020, "step": 115, "time_per_iteration": 2.9213972091674805 }, { "auxiliary_loss_clip": 0.01759132, "auxiliary_loss_mlp": 0.01460305, "balance_loss_clip": 1.13522422, "balance_loss_mlp": 1.02275944, "epoch": 0.013948175314134552, "flos": 41353522549920.0, "grad_norm": 2.458112866927895, "language_loss": 0.90759903, "learning_rate": 3.44371916188212e-06, "loss": 0.93979341, "num_input_tokens_seen": 2475775, "step": 116, "time_per_iteration": 3.0759854316711426 }, { "auxiliary_loss_clip": 0.01757389, "auxiliary_loss_mlp": 0.0146637, "balance_loss_clip": 1.13466048, "balance_loss_mlp": 1.02577233, "epoch": 0.014068418204773643, "flos": 22453552471680.0, "grad_norm": 2.4191498954028443, "language_loss": 0.86339617, "learning_rate": 3.449937620235143e-06, "loss": 0.89563382, "num_input_tokens_seen": 2496370, "step": 117, "time_per_iteration": 2.826017379760742 }, { "auxiliary_loss_clip": 0.01758532, "auxiliary_loss_mlp": 0.01465758, "balance_loss_clip": 1.13484645, "balance_loss_mlp": 1.02935684, "epoch": 0.014188661095412733, "flos": 23805159202080.0, "grad_norm": 1.6858627699896764, "language_loss": 0.89308298, "learning_rate": 3.456103154896722e-06, "loss": 0.92532599, "num_input_tokens_seen": 2517645, "step": 118, "time_per_iteration": 2.7552788257598877 }, { "auxiliary_loss_clip": 0.01755116, "auxiliary_loss_mlp": 0.01457545, "balance_loss_clip": 1.13247228, "balance_loss_mlp": 1.02648413, "epoch": 0.014308903986051825, "flos": 23660259989760.0, "grad_norm": 2.0645189696266963, "language_loss": 0.92478371, "learning_rate": 3.462216659109757e-06, "loss": 0.95691031, "num_input_tokens_seen": 2537825, "step": 119, "time_per_iteration": 2.7457666397094727 }, { "auxiliary_loss_clip": 0.0175493, "auxiliary_loss_mlp": 0.01458939, "balance_loss_clip": 1.13245618, "balance_loss_mlp": 1.0324558, "epoch": 0.014429146876690916, "flos": 20668002501600.0, "grad_norm": 3.0649668990015226, "language_loss": 0.85075176, "learning_rate": 3.4682790036921077e-06, "loss": 0.88289046, "num_input_tokens_seen": 2556485, "step": 120, "time_per_iteration": 3.605393171310425 }, { "auxiliary_loss_clip": 0.01753953, "auxiliary_loss_mlp": 0.01457952, "balance_loss_clip": 1.13109803, "balance_loss_mlp": 1.02689147, "epoch": 0.014549389767330006, "flos": 20229209565120.0, "grad_norm": 2.1010897991822652, "language_loss": 0.83002776, "learning_rate": 3.4742910377810193e-06, "loss": 0.86214685, "num_input_tokens_seen": 2573945, "step": 121, "time_per_iteration": 2.8110275268554688 }, { "auxiliary_loss_clip": 0.0175195, "auxiliary_loss_mlp": 0.01447125, "balance_loss_clip": 1.12995362, "balance_loss_mlp": 1.02598238, "epoch": 0.014669632657969098, "flos": 18004184108640.0, "grad_norm": 1.986683004859329, "language_loss": 0.889503, "learning_rate": 3.4802535895469042e-06, "loss": 0.92149377, "num_input_tokens_seen": 2592695, "step": 122, "time_per_iteration": 2.746486186981201 }, { "auxiliary_loss_clip": 0.01751651, "auxiliary_loss_mlp": 0.01445486, "balance_loss_clip": 1.12894416, "balance_loss_mlp": 1.0247252, "epoch": 0.01478987554860819, "flos": 22741806178080.0, "grad_norm": 2.275199830542661, "language_loss": 0.89569855, "learning_rate": 3.4861674668779934e-06, "loss": 0.92766988, "num_input_tokens_seen": 2610925, "step": 123, "time_per_iteration": 2.7910995483398438 }, { "auxiliary_loss_clip": 0.01749158, "auxiliary_loss_mlp": 0.01447655, "balance_loss_clip": 1.12757826, "balance_loss_mlp": 1.02842033, "epoch": 0.01491011843924728, "flos": 17198598795840.0, "grad_norm": 2.1058741920963757, "language_loss": 0.84182227, "learning_rate": 3.492033458037272e-06, "loss": 0.87379038, "num_input_tokens_seen": 2629495, "step": 124, "time_per_iteration": 2.7381997108459473 }, { "auxiliary_loss_clip": 0.01744609, "auxiliary_loss_mlp": 0.01443435, "balance_loss_clip": 1.12370312, "balance_loss_mlp": 1.02381837, "epoch": 0.01503036132988637, "flos": 17673876982080.0, "grad_norm": 3.4091207863964486, "language_loss": 0.86730933, "learning_rate": 3.497852332293018e-06, "loss": 0.89918983, "num_input_tokens_seen": 2645070, "step": 125, "time_per_iteration": 2.7474989891052246 }, { "auxiliary_loss_clip": 0.01745961, "auxiliary_loss_mlp": 0.01444295, "balance_loss_clip": 1.12609076, "balance_loss_mlp": 1.022771, "epoch": 0.015150604220525462, "flos": 18878249460960.0, "grad_norm": 3.7050683101545854, "language_loss": 0.96559578, "learning_rate": 3.5036248405242356e-06, "loss": 0.99749839, "num_input_tokens_seen": 2663825, "step": 126, "time_per_iteration": 2.8122098445892334 }, { "auxiliary_loss_clip": 0.01742337, "auxiliary_loss_mlp": 0.01439338, "balance_loss_clip": 1.12228608, "balance_loss_mlp": 1.02582443, "epoch": 0.015270847111164552, "flos": 39420199473120.0, "grad_norm": 1.9897767689119052, "language_loss": 0.82566589, "learning_rate": 3.509351715802146e-06, "loss": 0.85748261, "num_input_tokens_seen": 2684710, "step": 127, "time_per_iteration": 2.9062697887420654 }, { "auxiliary_loss_clip": 0.01738746, "auxiliary_loss_mlp": 0.01436168, "balance_loss_clip": 1.11886859, "balance_loss_mlp": 1.02418065, "epoch": 0.015391090001803644, "flos": 43762483049760.0, "grad_norm": 2.2183219702531884, "language_loss": 0.78417873, "learning_rate": 3.5150336739488763e-06, "loss": 0.81592786, "num_input_tokens_seen": 2706995, "step": 128, "time_per_iteration": 2.8952646255493164 }, { "auxiliary_loss_clip": 0.01737761, "auxiliary_loss_mlp": 0.01439071, "balance_loss_clip": 1.11912704, "balance_loss_mlp": 1.02822852, "epoch": 0.015511332892442733, "flos": 18916351276320.0, "grad_norm": 1.9139040243159757, "language_loss": 0.84167719, "learning_rate": 3.5206714140744143e-06, "loss": 0.87344557, "num_input_tokens_seen": 2727050, "step": 129, "time_per_iteration": 2.7125539779663086 }, { "auxiliary_loss_clip": 0.01739941, "auxiliary_loss_mlp": 0.0144163, "balance_loss_clip": 1.12132812, "balance_loss_mlp": 1.03422046, "epoch": 0.015631575783081827, "flos": 24535295281440.0, "grad_norm": 3.0498171501411635, "language_loss": 0.87245703, "learning_rate": 3.5262656190928208e-06, "loss": 0.90427274, "num_input_tokens_seen": 2745350, "step": 130, "time_per_iteration": 2.787031412124634 }, { "auxiliary_loss_clip": 0.0177664, "auxiliary_loss_mlp": 0.01384799, "balance_loss_clip": 1.15458965, "balance_loss_mlp": 1.00943279, "epoch": 0.015751818673720917, "flos": 62328566842560.0, "grad_norm": 1.0576399515283432, "language_loss": 0.71453977, "learning_rate": 3.5318169562186737e-06, "loss": 0.74615419, "num_input_tokens_seen": 2814195, "step": 131, "time_per_iteration": 3.3560402393341064 }, { "auxiliary_loss_clip": 0.01733165, "auxiliary_loss_mlp": 0.01431264, "balance_loss_clip": 1.115152, "balance_loss_mlp": 1.02843165, "epoch": 0.015872061564360006, "flos": 23878560785760.0, "grad_norm": 4.169549015984614, "language_loss": 0.8224774, "learning_rate": 3.5373260774446292e-06, "loss": 0.85412169, "num_input_tokens_seen": 2834645, "step": 132, "time_per_iteration": 2.885373115539551 }, { "auxiliary_loss_clip": 0.0173368, "auxiliary_loss_mlp": 0.01421782, "balance_loss_clip": 1.1166631, "balance_loss_mlp": 1.02314627, "epoch": 0.0159923044549991, "flos": 23367910983840.0, "grad_norm": 2.9307930683268224, "language_loss": 0.90242743, "learning_rate": 3.542793620000961e-06, "loss": 0.93398207, "num_input_tokens_seen": 2854120, "step": 133, "time_per_iteration": 2.8286166191101074 }, { "auxiliary_loss_clip": 0.01730251, "auxiliary_loss_mlp": 0.01419612, "balance_loss_clip": 1.11429739, "balance_loss_mlp": 1.02727044, "epoch": 0.01611254734563819, "flos": 17858314956960.0, "grad_norm": 2.3709146964663033, "language_loss": 0.86960638, "learning_rate": 3.5482202067978894e-06, "loss": 0.90110505, "num_input_tokens_seen": 2871330, "step": 134, "time_per_iteration": 2.7813639640808105 }, { "auxiliary_loss_clip": 0.01728895, "auxiliary_loss_mlp": 0.01423898, "balance_loss_clip": 1.11275136, "balance_loss_mlp": 1.02488053, "epoch": 0.01623279023627728, "flos": 20954783337120.0, "grad_norm": 2.745659510688328, "language_loss": 0.76076913, "learning_rate": 3.553606446851471e-06, "loss": 0.79229701, "num_input_tokens_seen": 2888070, "step": 135, "time_per_iteration": 2.8055264949798584 }, { "auxiliary_loss_clip": 0.0172502, "auxiliary_loss_mlp": 0.01420679, "balance_loss_clip": 1.11082137, "balance_loss_mlp": 1.02509475, "epoch": 0.016353033126916373, "flos": 15742421936640.0, "grad_norm": 2.0652114956947103, "language_loss": 0.83517838, "learning_rate": 3.5589529356937613e-06, "loss": 0.86663544, "num_input_tokens_seen": 2906465, "step": 136, "time_per_iteration": 2.776736259460449 }, { "auxiliary_loss_clip": 0.01728027, "auxiliary_loss_mlp": 0.01417891, "balance_loss_clip": 1.11213017, "balance_loss_mlp": 1.02421403, "epoch": 0.016473276017555463, "flos": 18807290687520.0, "grad_norm": 2.5083927028449757, "language_loss": 0.77108186, "learning_rate": 3.5642602557679627e-06, "loss": 0.80254102, "num_input_tokens_seen": 2924915, "step": 137, "time_per_iteration": 2.7799534797668457 }, { "auxiliary_loss_clip": 0.0172809, "auxiliary_loss_mlp": 0.0141086, "balance_loss_clip": 1.11329126, "balance_loss_mlp": 1.02290535, "epoch": 0.016593518908194552, "flos": 24352653490560.0, "grad_norm": 2.317534423618739, "language_loss": 0.84245187, "learning_rate": 3.569528976809202e-06, "loss": 0.8738414, "num_input_tokens_seen": 2942130, "step": 138, "time_per_iteration": 2.8704051971435547 }, { "auxiliary_loss_clip": 0.01724481, "auxiliary_loss_mlp": 0.01408296, "balance_loss_clip": 1.1111201, "balance_loss_mlp": 1.02167606, "epoch": 0.016713761798833646, "flos": 22346144372160.0, "grad_norm": 2.060969117115394, "language_loss": 0.90016526, "learning_rate": 3.5747596562115522e-06, "loss": 0.93149304, "num_input_tokens_seen": 2962745, "step": 139, "time_per_iteration": 2.8114755153656006 }, { "auxiliary_loss_clip": 0.01723961, "auxiliary_loss_mlp": 0.01416957, "balance_loss_clip": 1.11116135, "balance_loss_mlp": 1.02556896, "epoch": 0.016834004689472735, "flos": 17821829707200.0, "grad_norm": 3.4566888585265954, "language_loss": 0.90836412, "learning_rate": 3.5799528393819138e-06, "loss": 0.9397732, "num_input_tokens_seen": 2981825, "step": 140, "time_per_iteration": 2.8186886310577393 }, { "auxiliary_loss_clip": 0.01718871, "auxiliary_loss_mlp": 0.01413477, "balance_loss_clip": 1.10648131, "balance_loss_mlp": 1.0253315, "epoch": 0.016954247580111825, "flos": 20519510921280.0, "grad_norm": 1.9212661562540314, "language_loss": 0.8799417, "learning_rate": 3.585109060081286e-06, "loss": 0.91126525, "num_input_tokens_seen": 3001625, "step": 141, "time_per_iteration": 2.78157377243042 }, { "auxiliary_loss_clip": 0.01718868, "auxiliary_loss_mlp": 0.01416916, "balance_loss_clip": 1.10683477, "balance_loss_mlp": 1.03220439, "epoch": 0.017074490470750915, "flos": 22088879592480.0, "grad_norm": 1.7855787952400675, "language_loss": 0.78407961, "learning_rate": 3.590228840753992e-06, "loss": 0.81543744, "num_input_tokens_seen": 3022055, "step": 142, "time_per_iteration": 2.8451383113861084 }, { "auxiliary_loss_clip": 0.01717423, "auxiliary_loss_mlp": 0.01405954, "balance_loss_clip": 1.10643911, "balance_loss_mlp": 1.02410257, "epoch": 0.01719473336139001, "flos": 15997279829760.0, "grad_norm": 5.333249666465971, "language_loss": 0.8727057, "learning_rate": 3.5953126928453423e-06, "loss": 0.90393949, "num_input_tokens_seen": 3039605, "step": 143, "time_per_iteration": 2.7619245052337646 }, { "auxiliary_loss_clip": 0.01716384, "auxiliary_loss_mlp": 0.01404826, "balance_loss_clip": 1.10638261, "balance_loss_mlp": 1.02411878, "epoch": 0.017314976252029098, "flos": 22492049447520.0, "grad_norm": 2.3399286055166093, "language_loss": 0.80498242, "learning_rate": 3.600361117108239e-06, "loss": 0.83619452, "num_input_tokens_seen": 3059405, "step": 144, "time_per_iteration": 2.835268259048462 }, { "auxiliary_loss_clip": 0.01714449, "auxiliary_loss_mlp": 0.01407058, "balance_loss_clip": 1.10434794, "balance_loss_mlp": 1.02711391, "epoch": 0.017435219142668188, "flos": 22018064513760.0, "grad_norm": 2.6439644143059775, "language_loss": 0.97267199, "learning_rate": 3.6053746038991616e-06, "loss": 1.00388706, "num_input_tokens_seen": 3078490, "step": 145, "time_per_iteration": 2.820075750350952 }, { "auxiliary_loss_clip": 0.01752186, "auxiliary_loss_mlp": 0.01346379, "balance_loss_clip": 1.1378504, "balance_loss_mlp": 1.00305641, "epoch": 0.01755546203330728, "flos": 72240556268160.0, "grad_norm": 1.044726201777019, "language_loss": 0.58401215, "learning_rate": 3.6103536334639843e-06, "loss": 0.61499786, "num_input_tokens_seen": 3131755, "step": 146, "time_per_iteration": 5.981119394302368 }, { "auxiliary_loss_clip": 0.01709726, "auxiliary_loss_mlp": 0.01401529, "balance_loss_clip": 1.1010102, "balance_loss_mlp": 1.02654386, "epoch": 0.01767570492394637, "flos": 25337072684160.0, "grad_norm": 7.99153949614411, "language_loss": 0.85441828, "learning_rate": 3.615298676214041e-06, "loss": 0.88553089, "num_input_tokens_seen": 3152035, "step": 147, "time_per_iteration": 2.92901873588562 }, { "auxiliary_loss_clip": 0.01711124, "auxiliary_loss_mlp": 0.01398209, "balance_loss_clip": 1.10199428, "balance_loss_mlp": 1.02589464, "epoch": 0.01779594781458546, "flos": 20449198774080.0, "grad_norm": 2.546639829819797, "language_loss": 0.89005613, "learning_rate": 3.6202101929928317e-06, "loss": 0.92114937, "num_input_tokens_seen": 3170625, "step": 148, "time_per_iteration": 2.921052932739258 }, { "auxiliary_loss_clip": 0.01709435, "auxiliary_loss_mlp": 0.01384754, "balance_loss_clip": 1.10109997, "balance_loss_mlp": 1.02045035, "epoch": 0.017916190705224554, "flos": 16253610593760.0, "grad_norm": 2.628031746403745, "language_loss": 0.88378108, "learning_rate": 3.6250886353337413e-06, "loss": 0.91472298, "num_input_tokens_seen": 3188155, "step": 149, "time_per_iteration": 2.7797887325286865 }, { "auxiliary_loss_clip": 0.01709384, "auxiliary_loss_mlp": 0.01395166, "balance_loss_clip": 1.10085499, "balance_loss_mlp": 1.02800179, "epoch": 0.018036433595863644, "flos": 23330599489440.0, "grad_norm": 1.9644502223610394, "language_loss": 0.86573851, "learning_rate": 3.6299344457091488e-06, "loss": 0.89678407, "num_input_tokens_seen": 3209015, "step": 150, "time_per_iteration": 2.8497543334960938 }, { "auxiliary_loss_clip": 0.01706872, "auxiliary_loss_mlp": 0.01387507, "balance_loss_clip": 1.09907794, "balance_loss_mlp": 1.02282178, "epoch": 0.018156676486502734, "flos": 18588019952160.0, "grad_norm": 2.925434283919997, "language_loss": 0.93723488, "learning_rate": 3.634748057771256e-06, "loss": 0.96817863, "num_input_tokens_seen": 3224955, "step": 151, "time_per_iteration": 2.8405745029449463 }, { "auxiliary_loss_clip": 0.01705051, "auxiliary_loss_mlp": 0.01386208, "balance_loss_clip": 1.09882832, "balance_loss_mlp": 1.0228579, "epoch": 0.018276919377141827, "flos": 25448719777920.0, "grad_norm": 1.7186595960285178, "language_loss": 0.85820574, "learning_rate": 3.639529896584965e-06, "loss": 0.88911837, "num_input_tokens_seen": 3246330, "step": 152, "time_per_iteration": 2.8509061336517334 }, { "auxiliary_loss_clip": 0.01703778, "auxiliary_loss_mlp": 0.0138719, "balance_loss_clip": 1.09764004, "balance_loss_mlp": 1.0249846, "epoch": 0.018397162267780917, "flos": 20047322171520.0, "grad_norm": 2.9344437506629095, "language_loss": 0.88779128, "learning_rate": 3.6442803788531233e-06, "loss": 0.91870093, "num_input_tokens_seen": 3264290, "step": 153, "time_per_iteration": 2.7850677967071533 }, { "auxiliary_loss_clip": 0.01701907, "auxiliary_loss_mlp": 0.01385741, "balance_loss_clip": 1.09565592, "balance_loss_mlp": 1.02181935, "epoch": 0.018517405158420007, "flos": 27565295348160.0, "grad_norm": 2.0634051747891453, "language_loss": 0.96111953, "learning_rate": 3.6489999131344357e-06, "loss": 0.99199599, "num_input_tokens_seen": 3287065, "step": 154, "time_per_iteration": 2.845689296722412 }, { "auxiliary_loss_clip": 0.0170152, "auxiliary_loss_mlp": 0.01377078, "balance_loss_clip": 1.09542811, "balance_loss_mlp": 1.02250159, "epoch": 0.0186376480490591, "flos": 19354066502400.0, "grad_norm": 1.817428959892808, "language_loss": 0.90630913, "learning_rate": 3.653688900054313e-06, "loss": 0.93709511, "num_input_tokens_seen": 3305595, "step": 155, "time_per_iteration": 2.8094027042388916 }, { "auxiliary_loss_clip": 0.01696945, "auxiliary_loss_mlp": 0.01375869, "balance_loss_clip": 1.09185624, "balance_loss_mlp": 1.02110219, "epoch": 0.01875789093969819, "flos": 26687853169920.0, "grad_norm": 2.0854442915916263, "language_loss": 0.75990069, "learning_rate": 3.6583477325089526e-06, "loss": 0.79062885, "num_input_tokens_seen": 3326135, "step": 156, "time_per_iteration": 2.794642210006714 }, { "auxiliary_loss_clip": 0.01695373, "auxiliary_loss_mlp": 0.01376136, "balance_loss_clip": 1.0916158, "balance_loss_mlp": 1.02251339, "epoch": 0.01887813383033728, "flos": 24353012727360.0, "grad_norm": 2.9561778504490537, "language_loss": 1.04126573, "learning_rate": 3.6629767958628916e-06, "loss": 1.07198071, "num_input_tokens_seen": 3343510, "step": 157, "time_per_iteration": 2.794386386871338 }, { "auxiliary_loss_clip": 0.01697653, "auxiliary_loss_mlp": 0.01382053, "balance_loss_clip": 1.09307694, "balance_loss_mlp": 1.02480626, "epoch": 0.018998376720976373, "flos": 14647541130720.0, "grad_norm": 2.0628537537909635, "language_loss": 0.85439801, "learning_rate": 3.667576468140291e-06, "loss": 0.88519508, "num_input_tokens_seen": 3361325, "step": 158, "time_per_iteration": 2.786101818084717 }, { "auxiliary_loss_clip": 0.01694067, "auxiliary_loss_mlp": 0.01377326, "balance_loss_clip": 1.09033835, "balance_loss_mlp": 1.02255893, "epoch": 0.019118619611615463, "flos": 29305271377440.0, "grad_norm": 2.4444971158915374, "language_loss": 0.88865542, "learning_rate": 3.672147120210184e-06, "loss": 0.91936934, "num_input_tokens_seen": 3377925, "step": 159, "time_per_iteration": 2.9239614009857178 }, { "auxiliary_loss_clip": 0.01693587, "auxiliary_loss_mlp": 0.01374903, "balance_loss_clip": 1.09142995, "balance_loss_mlp": 1.02509546, "epoch": 0.019238862502254553, "flos": 20886734381760.0, "grad_norm": 2.1249809470181273, "language_loss": 0.86116993, "learning_rate": 3.6766891159659177e-06, "loss": 0.89185476, "num_input_tokens_seen": 3396335, "step": 160, "time_per_iteration": 2.8486809730529785 }, { "auxiliary_loss_clip": 0.01690493, "auxiliary_loss_mlp": 0.01368539, "balance_loss_clip": 1.08853769, "balance_loss_mlp": 1.02044809, "epoch": 0.019359105392893646, "flos": 21360683391840.0, "grad_norm": 3.9754174950950154, "language_loss": 0.8787291, "learning_rate": 3.6812028124990075e-06, "loss": 0.9093194, "num_input_tokens_seen": 3413605, "step": 161, "time_per_iteration": 2.7754247188568115 }, { "auxiliary_loss_clip": 0.01688679, "auxiliary_loss_mlp": 0.01369047, "balance_loss_clip": 1.08751404, "balance_loss_mlp": 1.02038383, "epoch": 0.019479348283532736, "flos": 16283737352160.0, "grad_norm": 3.2065497774069565, "language_loss": 0.81397152, "learning_rate": 3.6856885602676016e-06, "loss": 0.84454876, "num_input_tokens_seen": 3429640, "step": 162, "time_per_iteration": 2.9281363487243652 }, { "auxiliary_loss_clip": 0.01687115, "auxiliary_loss_mlp": 0.01364829, "balance_loss_clip": 1.08604991, "balance_loss_mlp": 1.01902711, "epoch": 0.019599591174171826, "flos": 22091250555360.0, "grad_norm": 2.689717827870862, "language_loss": 0.94039714, "learning_rate": 3.6901467032597733e-06, "loss": 0.97091663, "num_input_tokens_seen": 3448125, "step": 163, "time_per_iteration": 2.840123414993286 }, { "auxiliary_loss_clip": 0.0168754, "auxiliary_loss_mlp": 0.01363354, "balance_loss_clip": 1.08640063, "balance_loss_mlp": 1.01964927, "epoch": 0.01971983406481092, "flos": 19609678792800.0, "grad_norm": 2.0915713880979574, "language_loss": 0.87343669, "learning_rate": 3.694577579151804e-06, "loss": 0.90394557, "num_input_tokens_seen": 3466535, "step": 164, "time_per_iteration": 2.8071022033691406 }, { "auxiliary_loss_clip": 0.01686501, "auxiliary_loss_mlp": 0.01368093, "balance_loss_clip": 1.0858202, "balance_loss_mlp": 1.02725017, "epoch": 0.01984007695545001, "flos": 19099352304000.0, "grad_norm": 2.2366998395422515, "language_loss": 0.73783624, "learning_rate": 3.6989815194616703e-06, "loss": 0.76838219, "num_input_tokens_seen": 3483730, "step": 165, "time_per_iteration": 2.762437105178833 }, { "auxiliary_loss_clip": 0.01682481, "auxiliary_loss_mlp": 0.0136081, "balance_loss_clip": 1.0816983, "balance_loss_mlp": 1.01977575, "epoch": 0.0199603198460891, "flos": 20848417024320.0, "grad_norm": 2.701546614254715, "language_loss": 0.79697371, "learning_rate": 3.703358849697888e-06, "loss": 0.82740664, "num_input_tokens_seen": 3503640, "step": 166, "time_per_iteration": 2.8404810428619385 }, { "auxiliary_loss_clip": 0.01683774, "auxiliary_loss_mlp": 0.01358823, "balance_loss_clip": 1.08419204, "balance_loss_mlp": 1.02141261, "epoch": 0.020080562736728192, "flos": 21870758414880.0, "grad_norm": 1.7326324558081787, "language_loss": 0.82555264, "learning_rate": 3.7077098895038803e-06, "loss": 0.85597861, "num_input_tokens_seen": 3523010, "step": 167, "time_per_iteration": 2.8405308723449707 }, { "auxiliary_loss_clip": 0.01680506, "auxiliary_loss_mlp": 0.01355051, "balance_loss_clip": 1.08167267, "balance_loss_mlp": 1.01783192, "epoch": 0.020200805627367282, "flos": 21688799173920.0, "grad_norm": 2.0937395345733774, "language_loss": 0.97183931, "learning_rate": 3.712034952798045e-06, "loss": 1.00219488, "num_input_tokens_seen": 3541125, "step": 168, "time_per_iteration": 2.870408773422241 }, { "auxiliary_loss_clip": 0.01679571, "auxiliary_loss_mlp": 0.01361166, "balance_loss_clip": 1.08002436, "balance_loss_mlp": 1.02184844, "epoch": 0.02032104851800637, "flos": 33543056672640.0, "grad_norm": 3.181121423318076, "language_loss": 0.84544015, "learning_rate": 3.7163343479096656e-06, "loss": 0.87584752, "num_input_tokens_seen": 3562700, "step": 169, "time_per_iteration": 2.8843648433685303 }, { "auxiliary_loss_clip": 0.01680766, "auxiliary_loss_mlp": 0.01351737, "balance_loss_clip": 1.08163977, "balance_loss_mlp": 1.01718843, "epoch": 0.020441291408645465, "flos": 31686979013280.0, "grad_norm": 2.073389598636501, "language_loss": 0.82601786, "learning_rate": 3.720608377710802e-06, "loss": 0.85634291, "num_input_tokens_seen": 3582790, "step": 170, "time_per_iteration": 2.885826826095581 }, { "auxiliary_loss_clip": 0.01676702, "auxiliary_loss_mlp": 0.01352638, "balance_loss_clip": 1.07925701, "balance_loss_mlp": 1.01923347, "epoch": 0.020561534299284555, "flos": 20886698458080.0, "grad_norm": 3.1709993702138206, "language_loss": 0.86229593, "learning_rate": 3.7248573397443277e-06, "loss": 0.89258927, "num_input_tokens_seen": 3601715, "step": 171, "time_per_iteration": 2.9430429935455322 }, { "auxiliary_loss_clip": 0.01678579, "auxiliary_loss_mlp": 0.01350218, "balance_loss_clip": 1.08125031, "balance_loss_mlp": 1.0194838, "epoch": 0.020681777189923645, "flos": 20996621215200.0, "grad_norm": 2.990513521727846, "language_loss": 0.97603679, "learning_rate": 3.729081526348224e-06, "loss": 1.00632477, "num_input_tokens_seen": 3620245, "step": 172, "time_per_iteration": 5.502302646636963 }, { "auxiliary_loss_clip": 0.01677257, "auxiliary_loss_mlp": 0.01352396, "balance_loss_clip": 1.08046174, "balance_loss_mlp": 1.01994491, "epoch": 0.020802020080562738, "flos": 28257545154240.0, "grad_norm": 2.3925817630482795, "language_loss": 0.85109216, "learning_rate": 3.7332812247762777e-06, "loss": 0.88138866, "num_input_tokens_seen": 3641545, "step": 173, "time_per_iteration": 3.851248025894165 }, { "auxiliary_loss_clip": 0.01674903, "auxiliary_loss_mlp": 0.01350571, "balance_loss_clip": 1.07842565, "balance_loss_mlp": 1.02136242, "epoch": 0.020922262971201828, "flos": 19681284192480.0, "grad_norm": 3.9494142199185034, "language_loss": 0.95300281, "learning_rate": 3.737456717315293e-06, "loss": 0.98325753, "num_input_tokens_seen": 3660510, "step": 174, "time_per_iteration": 2.765315532684326 }, { "auxiliary_loss_clip": 0.01674562, "auxiliary_loss_mlp": 0.01352784, "balance_loss_clip": 1.0789212, "balance_loss_mlp": 1.02185917, "epoch": 0.021042505861840918, "flos": 15666362000640.0, "grad_norm": 1.7340536222192982, "language_loss": 0.90643251, "learning_rate": 3.7416082813989552e-06, "loss": 0.93670595, "num_input_tokens_seen": 3677505, "step": 175, "time_per_iteration": 2.8471391201019287 }, { "auxiliary_loss_clip": 0.01671915, "auxiliary_loss_mlp": 0.01348631, "balance_loss_clip": 1.07655048, "balance_loss_mlp": 1.0203768, "epoch": 0.02116274875248001, "flos": 21142023359040.0, "grad_norm": 2.6698546895953656, "language_loss": 0.89682639, "learning_rate": 3.745736189718439e-06, "loss": 0.92703182, "num_input_tokens_seen": 3696760, "step": 176, "time_per_iteration": 2.8350143432617188 }, { "auxiliary_loss_clip": 0.01669377, "auxiliary_loss_mlp": 0.01342693, "balance_loss_clip": 1.07530165, "balance_loss_mlp": 1.01825261, "epoch": 0.0212829916431191, "flos": 24715781651520.0, "grad_norm": 2.6142687360776655, "language_loss": 0.73112178, "learning_rate": 3.749840710329894e-06, "loss": 0.76124245, "num_input_tokens_seen": 3717465, "step": 177, "time_per_iteration": 2.794100522994995 }, { "auxiliary_loss_clip": 0.01670551, "auxiliary_loss_mlp": 0.01346148, "balance_loss_clip": 1.07650065, "balance_loss_mlp": 1.02151704, "epoch": 0.02140323453375819, "flos": 16645500413280.0, "grad_norm": 3.0494545023069106, "language_loss": 0.98158181, "learning_rate": 3.7539221067588938e-06, "loss": 1.01174879, "num_input_tokens_seen": 3731440, "step": 178, "time_per_iteration": 2.858811140060425 }, { "auxiliary_loss_clip": 0.01670674, "auxiliary_loss_mlp": 0.01350216, "balance_loss_clip": 1.07647824, "balance_loss_mlp": 1.02711117, "epoch": 0.021523477424397284, "flos": 20299342093920.0, "grad_norm": 3.777863656603568, "language_loss": 0.93477255, "learning_rate": 3.757980638101964e-06, "loss": 0.96498144, "num_input_tokens_seen": 3744935, "step": 179, "time_per_iteration": 2.942237377166748 }, { "auxiliary_loss_clip": 0.01668705, "auxiliary_loss_mlp": 0.01346221, "balance_loss_clip": 1.07631111, "balance_loss_mlp": 1.02483249, "epoch": 0.021643720315036374, "flos": 26104017326400.0, "grad_norm": 3.0996412053758085, "language_loss": 0.89799732, "learning_rate": 3.7620165591252806e-06, "loss": 0.9281466, "num_input_tokens_seen": 3763035, "step": 180, "time_per_iteration": 2.907104015350342 }, { "auxiliary_loss_clip": 0.01668065, "auxiliary_loss_mlp": 0.01333756, "balance_loss_clip": 1.07568479, "balance_loss_mlp": 1.01656377, "epoch": 0.021763963205675464, "flos": 24787674440640.0, "grad_norm": 1.9906181053014926, "language_loss": 0.94407737, "learning_rate": 3.766030120360636e-06, "loss": 0.97409564, "num_input_tokens_seen": 3782665, "step": 181, "time_per_iteration": 2.9008007049560547 }, { "auxiliary_loss_clip": 0.0166719, "auxiliary_loss_mlp": 0.01337059, "balance_loss_clip": 1.07419229, "balance_loss_mlp": 1.01891315, "epoch": 0.021884206096314557, "flos": 25813572275520.0, "grad_norm": 3.0015000900521764, "language_loss": 0.90390396, "learning_rate": 3.7700215681987578e-06, "loss": 0.93394637, "num_input_tokens_seen": 3802435, "step": 182, "time_per_iteration": 2.828049659729004 }, { "auxiliary_loss_clip": 0.01661927, "auxiliary_loss_mlp": 0.01339057, "balance_loss_clip": 1.07127357, "balance_loss_mlp": 1.01747823, "epoch": 0.022004448986953647, "flos": 20082729710880.0, "grad_norm": 5.265744459675004, "language_loss": 0.82376808, "learning_rate": 3.7739911449800767e-06, "loss": 0.85377789, "num_input_tokens_seen": 3822490, "step": 183, "time_per_iteration": 2.8172802925109863 }, { "auxiliary_loss_clip": 0.0166191, "auxiliary_loss_mlp": 0.0134018, "balance_loss_clip": 1.07179081, "balance_loss_mlp": 1.0203172, "epoch": 0.022124691877592736, "flos": 20480618784960.0, "grad_norm": 1.998700938741619, "language_loss": 0.80708581, "learning_rate": 3.7779390890830114e-06, "loss": 0.8371067, "num_input_tokens_seen": 3841140, "step": 184, "time_per_iteration": 2.7977776527404785 }, { "auxiliary_loss_clip": 0.01658107, "auxiliary_loss_mlp": 0.01335779, "balance_loss_clip": 1.06952381, "balance_loss_mlp": 1.01858735, "epoch": 0.02224493476823183, "flos": 23586858406080.0, "grad_norm": 2.0251699976913926, "language_loss": 0.85791308, "learning_rate": 3.7818656350098723e-06, "loss": 0.88785195, "num_input_tokens_seen": 3862090, "step": 185, "time_per_iteration": 2.8498952388763428 }, { "auxiliary_loss_clip": 0.01658657, "auxiliary_loss_mlp": 0.01332006, "balance_loss_clip": 1.0696218, "balance_loss_mlp": 1.01519561, "epoch": 0.02236517765887092, "flos": 16909949928960.0, "grad_norm": 2.443552127733571, "language_loss": 0.76901191, "learning_rate": 3.7857710134704447e-06, "loss": 0.7989186, "num_input_tokens_seen": 3881025, "step": 186, "time_per_iteration": 2.7229461669921875 }, { "auxiliary_loss_clip": 0.01659537, "auxiliary_loss_mlp": 0.01331694, "balance_loss_clip": 1.07034254, "balance_loss_mlp": 1.01640964, "epoch": 0.02248542054951001, "flos": 43508199935520.0, "grad_norm": 2.0812612370204437, "language_loss": 0.79270583, "learning_rate": 3.7896554514633234e-06, "loss": 0.82261813, "num_input_tokens_seen": 3905310, "step": 187, "time_per_iteration": 2.947754144668579 }, { "auxiliary_loss_clip": 0.01654771, "auxiliary_loss_mlp": 0.01334773, "balance_loss_clip": 1.06695771, "balance_loss_mlp": 1.01987004, "epoch": 0.022605663440149103, "flos": 23367659518080.0, "grad_norm": 2.6640161428947433, "language_loss": 0.84136951, "learning_rate": 3.7935191723550955e-06, "loss": 0.87126493, "num_input_tokens_seen": 3924265, "step": 188, "time_per_iteration": 2.7723982334136963 }, { "auxiliary_loss_clip": 0.01655632, "auxiliary_loss_mlp": 0.0133636, "balance_loss_clip": 1.06889272, "balance_loss_mlp": 1.02031183, "epoch": 0.022725906330788193, "flos": 29019927489120.0, "grad_norm": 2.181104064073378, "language_loss": 0.887375, "learning_rate": 3.797362395957408e-06, "loss": 0.91729486, "num_input_tokens_seen": 3944830, "step": 189, "time_per_iteration": 2.809028148651123 }, { "auxiliary_loss_clip": 0.01655642, "auxiliary_loss_mlp": 0.0133278, "balance_loss_clip": 1.06850386, "balance_loss_mlp": 1.01844859, "epoch": 0.022846149221427282, "flos": 24496187603040.0, "grad_norm": 2.2647097791859845, "language_loss": 0.78496397, "learning_rate": 3.8011853386020055e-06, "loss": 0.81484818, "num_input_tokens_seen": 3965735, "step": 190, "time_per_iteration": 2.828310251235962 }, { "auxiliary_loss_clip": 0.01652863, "auxiliary_loss_mlp": 0.01330944, "balance_loss_clip": 1.06706333, "balance_loss_mlp": 1.02119017, "epoch": 0.022966392112066376, "flos": 15523546361760.0, "grad_norm": 2.8914377502102213, "language_loss": 0.89334607, "learning_rate": 3.804988213213804e-06, "loss": 0.92318416, "num_input_tokens_seen": 3983975, "step": 191, "time_per_iteration": 2.747213125228882 }, { "auxiliary_loss_clip": 0.01691591, "auxiliary_loss_mlp": 0.01295398, "balance_loss_clip": 1.10059547, "balance_loss_mlp": 1.005481, "epoch": 0.023086635002705466, "flos": 55650436960320.0, "grad_norm": 1.020634208985836, "language_loss": 0.63215798, "learning_rate": 3.808771229382049e-06, "loss": 0.6620279, "num_input_tokens_seen": 4043440, "step": 192, "time_per_iteration": 3.4316933155059814 }, { "auxiliary_loss_clip": 0.01651873, "auxiliary_loss_mlp": 0.01332737, "balance_loss_clip": 1.06687105, "balance_loss_mlp": 1.02012229, "epoch": 0.023206877893344555, "flos": 19313450029440.0, "grad_norm": 3.425750271772455, "language_loss": 0.84538168, "learning_rate": 3.8125345934296324e-06, "loss": 0.87522781, "num_input_tokens_seen": 4061750, "step": 193, "time_per_iteration": 2.7779510021209717 }, { "auxiliary_loss_clip": 0.0164777, "auxiliary_loss_mlp": 0.01328517, "balance_loss_clip": 1.06433344, "balance_loss_mlp": 1.01914549, "epoch": 0.02332712078398365, "flos": 23072975472960.0, "grad_norm": 4.4587103771960885, "language_loss": 0.87810004, "learning_rate": 3.81627850848061e-06, "loss": 0.9078629, "num_input_tokens_seen": 4082345, "step": 194, "time_per_iteration": 2.8838469982147217 }, { "auxiliary_loss_clip": 0.01649308, "auxiliary_loss_mlp": 0.01328733, "balance_loss_clip": 1.0656451, "balance_loss_mlp": 1.01974261, "epoch": 0.02344736367462274, "flos": 24425983226880.0, "grad_norm": 2.309428573902893, "language_loss": 0.86304557, "learning_rate": 3.820003174525994e-06, "loss": 0.89282608, "num_input_tokens_seen": 4101770, "step": 195, "time_per_iteration": 2.807778835296631 }, { "auxiliary_loss_clip": 0.01646718, "auxiliary_loss_mlp": 0.01318256, "balance_loss_clip": 1.06380391, "balance_loss_mlp": 1.01155472, "epoch": 0.02356760656526183, "flos": 21579810432480.0, "grad_norm": 2.393673397185942, "language_loss": 0.8290273, "learning_rate": 3.823708788487851e-06, "loss": 0.85867703, "num_input_tokens_seen": 4118770, "step": 196, "time_per_iteration": 2.8162360191345215 }, { "auxiliary_loss_clip": 0.01648159, "auxiliary_loss_mlp": 0.01318063, "balance_loss_clip": 1.06596243, "balance_loss_mlp": 1.01403129, "epoch": 0.02368784945590092, "flos": 25193610419040.0, "grad_norm": 1.8023719041822017, "language_loss": 0.84452093, "learning_rate": 3.827395544281781e-06, "loss": 0.87418318, "num_input_tokens_seen": 4141110, "step": 197, "time_per_iteration": 2.8035824298858643 }, { "auxiliary_loss_clip": 0.0164574, "auxiliary_loss_mlp": 0.01326833, "balance_loss_clip": 1.06433737, "balance_loss_mlp": 1.01955938, "epoch": 0.02380809234654001, "flos": 27562493301120.0, "grad_norm": 2.946465733889609, "language_loss": 0.78755748, "learning_rate": 3.831063632877802e-06, "loss": 0.81728327, "num_input_tokens_seen": 4161430, "step": 198, "time_per_iteration": 2.8320281505584717 }, { "auxiliary_loss_clip": 0.01648402, "auxiliary_loss_mlp": 0.01318281, "balance_loss_clip": 1.06687951, "balance_loss_mlp": 1.01844621, "epoch": 0.0239283352371791, "flos": 18259796399040.0, "grad_norm": 2.9148723394114304, "language_loss": 0.75890821, "learning_rate": 3.834713242359712e-06, "loss": 0.78857505, "num_input_tokens_seen": 4179260, "step": 199, "time_per_iteration": 5.456042766571045 }, { "auxiliary_loss_clip": 0.01643342, "auxiliary_loss_mlp": 0.01322537, "balance_loss_clip": 1.06419301, "balance_loss_mlp": 1.0165987, "epoch": 0.02404857812781819, "flos": 21395120991840.0, "grad_norm": 2.5269537088998777, "language_loss": 0.87175781, "learning_rate": 3.838344557982959e-06, "loss": 0.9014166, "num_input_tokens_seen": 4200640, "step": 200, "time_per_iteration": 2.796494245529175 }, { "auxiliary_loss_clip": 0.01641684, "auxiliary_loss_mlp": 0.01317388, "balance_loss_clip": 1.0622561, "balance_loss_mlp": 1.01526403, "epoch": 0.024168821018457284, "flos": 16654265791200.0, "grad_norm": 3.5347547740531917, "language_loss": 0.8499099, "learning_rate": 3.841957762231063e-06, "loss": 0.87950063, "num_input_tokens_seen": 4218170, "step": 201, "time_per_iteration": 2.7190499305725098 }, { "auxiliary_loss_clip": 0.01640034, "auxiliary_loss_mlp": 0.0131992, "balance_loss_clip": 1.0614326, "balance_loss_mlp": 1.01760554, "epoch": 0.024289063909096374, "flos": 22820883703200.0, "grad_norm": 3.1186451739401986, "language_loss": 0.87827355, "learning_rate": 3.8455530348706454e-06, "loss": 0.90787309, "num_input_tokens_seen": 4237770, "step": 202, "time_per_iteration": 2.7881247997283936 }, { "auxiliary_loss_clip": 0.01642106, "auxiliary_loss_mlp": 0.01313793, "balance_loss_clip": 1.06380844, "balance_loss_mlp": 1.01414847, "epoch": 0.024409306799735464, "flos": 17748607741920.0, "grad_norm": 1.933368234382941, "language_loss": 0.77544504, "learning_rate": 3.849130553005099e-06, "loss": 0.805004, "num_input_tokens_seen": 4255985, "step": 203, "time_per_iteration": 2.8116846084594727 }, { "auxiliary_loss_clip": 0.01636721, "auxiliary_loss_mlp": 0.01313102, "balance_loss_clip": 1.06071198, "balance_loss_mlp": 1.01345754, "epoch": 0.024529549690374557, "flos": 21616223834880.0, "grad_norm": 2.66598278369339, "language_loss": 0.83623552, "learning_rate": 3.852690491126933e-06, "loss": 0.8657338, "num_input_tokens_seen": 4276035, "step": 204, "time_per_iteration": 2.7799901962280273 }, { "auxiliary_loss_clip": 0.0163581, "auxiliary_loss_mlp": 0.0131897, "balance_loss_clip": 1.05926073, "balance_loss_mlp": 1.01760936, "epoch": 0.024649792581013647, "flos": 25551673341120.0, "grad_norm": 2.7073271068215052, "language_loss": 0.91355133, "learning_rate": 3.856233021168845e-06, "loss": 0.94309914, "num_input_tokens_seen": 4295730, "step": 205, "time_per_iteration": 2.7508506774902344 }, { "auxiliary_loss_clip": 0.01635622, "auxiliary_loss_mlp": 0.01307801, "balance_loss_clip": 1.06039429, "balance_loss_mlp": 1.01311553, "epoch": 0.024770035471652737, "flos": 34495588847520.0, "grad_norm": 2.085366191482327, "language_loss": 0.91159463, "learning_rate": 3.859758312553544e-06, "loss": 0.94102883, "num_input_tokens_seen": 4317950, "step": 206, "time_per_iteration": 2.8636834621429443 }, { "auxiliary_loss_clip": 0.01634751, "auxiliary_loss_mlp": 0.0131383, "balance_loss_clip": 1.06084025, "balance_loss_mlp": 1.01800013, "epoch": 0.02489027836229183, "flos": 21505438909440.0, "grad_norm": 2.2796501424695523, "language_loss": 0.91789448, "learning_rate": 3.8632665322423735e-06, "loss": 0.9473803, "num_input_tokens_seen": 4337605, "step": 207, "time_per_iteration": 2.8346827030181885 }, { "auxiliary_loss_clip": 0.01634276, "auxiliary_loss_mlp": 0.01309418, "balance_loss_clip": 1.06044316, "balance_loss_mlp": 1.01377892, "epoch": 0.02501052125293092, "flos": 23219024243040.0, "grad_norm": 1.8010490733603353, "language_loss": 0.85884142, "learning_rate": 3.866757844782762e-06, "loss": 0.88827837, "num_input_tokens_seen": 4358110, "step": 208, "time_per_iteration": 2.788247585296631 }, { "auxiliary_loss_clip": 0.01632548, "auxiliary_loss_mlp": 0.01311462, "balance_loss_clip": 1.05971813, "balance_loss_mlp": 1.01525068, "epoch": 0.02513076414357001, "flos": 26388930130560.0, "grad_norm": 2.320795062102826, "language_loss": 0.91413927, "learning_rate": 3.870232412354527e-06, "loss": 0.94357932, "num_input_tokens_seen": 4374955, "step": 209, "time_per_iteration": 2.779989004135132 }, { "auxiliary_loss_clip": 0.01631693, "auxiliary_loss_mlp": 0.01310187, "balance_loss_clip": 1.05954838, "balance_loss_mlp": 1.01531065, "epoch": 0.025251007034209103, "flos": 13590438827040.0, "grad_norm": 2.020723445791512, "language_loss": 0.9238956, "learning_rate": 3.873690394815086e-06, "loss": 0.95331442, "num_input_tokens_seen": 4391535, "step": 210, "time_per_iteration": 2.735196590423584 }, { "auxiliary_loss_clip": 0.01631663, "auxiliary_loss_mlp": 0.01310373, "balance_loss_clip": 1.0600127, "balance_loss_mlp": 1.01740479, "epoch": 0.025371249924848193, "flos": 15049238114880.0, "grad_norm": 3.0868973452960042, "language_loss": 0.91437769, "learning_rate": 3.877131949743587e-06, "loss": 0.94379807, "num_input_tokens_seen": 4408400, "step": 211, "time_per_iteration": 2.8320415019989014 }, { "auxiliary_loss_clip": 0.01628586, "auxiliary_loss_mlp": 0.01310067, "balance_loss_clip": 1.05787516, "balance_loss_mlp": 1.01614416, "epoch": 0.025491492815487283, "flos": 25553864685600.0, "grad_norm": 3.240797512557162, "language_loss": 0.77924478, "learning_rate": 3.880557232483993e-06, "loss": 0.80863136, "num_input_tokens_seen": 4427840, "step": 212, "time_per_iteration": 2.772639036178589 }, { "auxiliary_loss_clip": 0.01626798, "auxiliary_loss_mlp": 0.0130885, "balance_loss_clip": 1.05683255, "balance_loss_mlp": 1.01550007, "epoch": 0.025611735706126376, "flos": 20630762854560.0, "grad_norm": 1.904208236059277, "language_loss": 0.86958385, "learning_rate": 3.883966396187164e-06, "loss": 0.89894032, "num_input_tokens_seen": 4447110, "step": 213, "time_per_iteration": 2.751570701599121 }, { "auxiliary_loss_clip": 0.01626749, "auxiliary_loss_mlp": 0.01303998, "balance_loss_clip": 1.057042, "balance_loss_mlp": 1.01541615, "epoch": 0.025731978596765466, "flos": 19062292275360.0, "grad_norm": 2.5278264851867203, "language_loss": 0.90182376, "learning_rate": 3.887359591851937e-06, "loss": 0.93113124, "num_input_tokens_seen": 4464715, "step": 214, "time_per_iteration": 2.780702829360962 }, { "auxiliary_loss_clip": 0.01625706, "auxiliary_loss_mlp": 0.01301221, "balance_loss_clip": 1.05755234, "balance_loss_mlp": 1.01302075, "epoch": 0.025852221487404556, "flos": 22163825894400.0, "grad_norm": 1.6596966108635856, "language_loss": 0.92277342, "learning_rate": 3.890736968365265e-06, "loss": 0.9520427, "num_input_tokens_seen": 4485030, "step": 215, "time_per_iteration": 2.7778127193450928 }, { "auxiliary_loss_clip": 0.01624679, "auxiliary_loss_mlp": 0.01301855, "balance_loss_clip": 1.05663896, "balance_loss_mlp": 1.01479864, "epoch": 0.02597246437804365, "flos": 26541984018240.0, "grad_norm": 2.151557577650115, "language_loss": 0.85155737, "learning_rate": 3.894098672541412e-06, "loss": 0.88082266, "num_input_tokens_seen": 4505935, "step": 216, "time_per_iteration": 2.802992820739746 }, { "auxiliary_loss_clip": 0.01624222, "auxiliary_loss_mlp": 0.01297249, "balance_loss_clip": 1.05650282, "balance_loss_mlp": 1.01095605, "epoch": 0.02609270726868274, "flos": 32671685596320.0, "grad_norm": 1.7542908916248627, "language_loss": 0.75208563, "learning_rate": 3.89744484916025e-06, "loss": 0.78130031, "num_input_tokens_seen": 4527045, "step": 217, "time_per_iteration": 2.832982301712036 }, { "auxiliary_loss_clip": 0.01621633, "auxiliary_loss_mlp": 0.01309164, "balance_loss_clip": 1.05484009, "balance_loss_mlp": 1.01733947, "epoch": 0.02621295015932183, "flos": 26243563910400.0, "grad_norm": 2.045755507711167, "language_loss": 0.87395555, "learning_rate": 3.900775641004673e-06, "loss": 0.90326351, "num_input_tokens_seen": 4546360, "step": 218, "time_per_iteration": 2.857541084289551 }, { "auxiliary_loss_clip": 0.01621393, "auxiliary_loss_mlp": 0.01306764, "balance_loss_clip": 1.05627418, "balance_loss_mlp": 1.01837254, "epoch": 0.026333193049960922, "flos": 42921418350240.0, "grad_norm": 3.1713574165085237, "language_loss": 0.73959404, "learning_rate": 3.904091188897156e-06, "loss": 0.7688756, "num_input_tokens_seen": 4565495, "step": 219, "time_per_iteration": 2.9314963817596436 }, { "auxiliary_loss_clip": 0.01620281, "auxiliary_loss_mlp": 0.01303807, "balance_loss_clip": 1.05637026, "balance_loss_mlp": 1.01751387, "epoch": 0.026453435940600012, "flos": 17963855025120.0, "grad_norm": 2.160633336043363, "language_loss": 0.81928396, "learning_rate": 3.90739163173548e-06, "loss": 0.84852481, "num_input_tokens_seen": 4583330, "step": 220, "time_per_iteration": 2.772653102874756 }, { "auxiliary_loss_clip": 0.01618759, "auxiliary_loss_mlp": 0.01299538, "balance_loss_clip": 1.05494344, "balance_loss_mlp": 1.01324511, "epoch": 0.026573678831239102, "flos": 18984328384320.0, "grad_norm": 2.405806947394947, "language_loss": 0.88399327, "learning_rate": 3.910677106527646e-06, "loss": 0.91317618, "num_input_tokens_seen": 4600520, "step": 221, "time_per_iteration": 2.775749921798706 }, { "auxiliary_loss_clip": 0.01619805, "auxiliary_loss_mlp": 0.01303874, "balance_loss_clip": 1.05621552, "balance_loss_mlp": 1.01777172, "epoch": 0.026693921721878195, "flos": 29241461416320.0, "grad_norm": 2.0886423527962616, "language_loss": 0.84225309, "learning_rate": 3.913947748426004e-06, "loss": 0.87148982, "num_input_tokens_seen": 4617340, "step": 222, "time_per_iteration": 2.815901041030884 }, { "auxiliary_loss_clip": 0.01617382, "auxiliary_loss_mlp": 0.01296042, "balance_loss_clip": 1.05539572, "balance_loss_mlp": 1.01432693, "epoch": 0.026814164612517285, "flos": 14128090027200.0, "grad_norm": 3.7104681991523263, "language_loss": 0.76278168, "learning_rate": 3.9172036907606136e-06, "loss": 0.79191589, "num_input_tokens_seen": 4630820, "step": 223, "time_per_iteration": 2.7728359699249268 }, { "auxiliary_loss_clip": 0.01613177, "auxiliary_loss_mlp": 0.01304764, "balance_loss_clip": 1.05234897, "balance_loss_mlp": 1.01618207, "epoch": 0.026934407503156375, "flos": 23511983951520.0, "grad_norm": 1.799151154328196, "language_loss": 0.95017433, "learning_rate": 3.920445065071855e-06, "loss": 0.97935373, "num_input_tokens_seen": 4651985, "step": 224, "time_per_iteration": 2.7329976558685303 }, { "auxiliary_loss_clip": 0.01611412, "auxiliary_loss_mlp": 0.01301614, "balance_loss_clip": 1.05224824, "balance_loss_mlp": 1.01799083, "epoch": 0.027054650393795468, "flos": 28950369739200.0, "grad_norm": 2.2162938519552142, "language_loss": 0.80112255, "learning_rate": 3.923672001142322e-06, "loss": 0.83025289, "num_input_tokens_seen": 4672295, "step": 225, "time_per_iteration": 4.7290940284729 }, { "auxiliary_loss_clip": 0.01611547, "auxiliary_loss_mlp": 0.01292524, "balance_loss_clip": 1.05175424, "balance_loss_mlp": 1.01061749, "epoch": 0.027174893284434558, "flos": 31431582264960.0, "grad_norm": 2.8670916175703907, "language_loss": 0.84456617, "learning_rate": 3.926884627027996e-06, "loss": 0.87360686, "num_input_tokens_seen": 4696065, "step": 226, "time_per_iteration": 2.8421435356140137 }, { "auxiliary_loss_clip": 0.01610503, "auxiliary_loss_mlp": 0.0130205, "balance_loss_clip": 1.05160069, "balance_loss_mlp": 1.01823688, "epoch": 0.027295136175073648, "flos": 22054477916160.0, "grad_norm": 1.857509237550303, "language_loss": 0.77495289, "learning_rate": 3.930083069088744e-06, "loss": 0.8040784, "num_input_tokens_seen": 4716065, "step": 227, "time_per_iteration": 2.7981631755828857 }, { "auxiliary_loss_clip": 0.01658466, "auxiliary_loss_mlp": 0.01285853, "balance_loss_clip": 1.09168649, "balance_loss_mlp": 1.00814342, "epoch": 0.02741537906571274, "flos": 60800784583680.0, "grad_norm": 1.012980585010906, "language_loss": 0.59314692, "learning_rate": 3.933267452018137e-06, "loss": 0.62259007, "num_input_tokens_seen": 4775860, "step": 228, "time_per_iteration": 3.3402631282806396 }, { "auxiliary_loss_clip": 0.01610378, "auxiliary_loss_mlp": 0.01289395, "balance_loss_clip": 1.05250311, "balance_loss_mlp": 1.0114944, "epoch": 0.02753562195635183, "flos": 24606289978560.0, "grad_norm": 1.951289272543363, "language_loss": 0.8439272, "learning_rate": 3.936437898872622e-06, "loss": 0.87292492, "num_input_tokens_seen": 4795835, "step": 229, "time_per_iteration": 2.8371148109436035 }, { "auxiliary_loss_clip": 0.01606651, "auxiliary_loss_mlp": 0.01298111, "balance_loss_clip": 1.05007648, "balance_loss_mlp": 1.01677704, "epoch": 0.02765586484699092, "flos": 34094251100160.0, "grad_norm": 2.484842946319726, "language_loss": 0.79861045, "learning_rate": 3.9395945311000525e-06, "loss": 0.82765812, "num_input_tokens_seen": 4817460, "step": 230, "time_per_iteration": 2.868532657623291 }, { "auxiliary_loss_clip": 0.01604567, "auxiliary_loss_mlp": 0.01295333, "balance_loss_clip": 1.04963064, "balance_loss_mlp": 1.01438093, "epoch": 0.027776107737630014, "flos": 14829931455840.0, "grad_norm": 2.483108887731898, "language_loss": 0.90990019, "learning_rate": 3.942737468567608e-06, "loss": 0.93889916, "num_input_tokens_seen": 4835475, "step": 231, "time_per_iteration": 2.726203203201294 }, { "auxiliary_loss_clip": 0.01605541, "auxiliary_loss_mlp": 0.01285203, "balance_loss_clip": 1.05127215, "balance_loss_mlp": 1.00997281, "epoch": 0.027896350628269104, "flos": 47920364575200.0, "grad_norm": 1.985107540399628, "language_loss": 0.85913062, "learning_rate": 3.9458668295891026e-06, "loss": 0.8880381, "num_input_tokens_seen": 4857760, "step": 232, "time_per_iteration": 2.9998998641967773 }, { "auxiliary_loss_clip": 0.0160317, "auxiliary_loss_mlp": 0.01292263, "balance_loss_clip": 1.0500989, "balance_loss_mlp": 1.01550722, "epoch": 0.028016593518908194, "flos": 21684560179680.0, "grad_norm": 2.3959900706029305, "language_loss": 0.86699921, "learning_rate": 3.948982730951712e-06, "loss": 0.89595354, "num_input_tokens_seen": 4875855, "step": 233, "time_per_iteration": 2.819607973098755 }, { "auxiliary_loss_clip": 0.01603458, "auxiliary_loss_mlp": 0.012915, "balance_loss_clip": 1.05059159, "balance_loss_mlp": 1.01359904, "epoch": 0.028136836409547287, "flos": 18439492448160.0, "grad_norm": 2.1763999762224278, "language_loss": 0.81952119, "learning_rate": 3.9520852879421254e-06, "loss": 0.84847075, "num_input_tokens_seen": 4893200, "step": 234, "time_per_iteration": 2.703040838241577 }, { "auxiliary_loss_clip": 0.01601313, "auxiliary_loss_mlp": 0.01285348, "balance_loss_clip": 1.04978228, "balance_loss_mlp": 1.00992727, "epoch": 0.028257079300186377, "flos": 31576948485120.0, "grad_norm": 4.153960533942442, "language_loss": 0.81907511, "learning_rate": 3.955174614372137e-06, "loss": 0.84794176, "num_input_tokens_seen": 4912965, "step": 235, "time_per_iteration": 2.858286142349243 }, { "auxiliary_loss_clip": 0.01601636, "auxiliary_loss_mlp": 0.01289771, "balance_loss_clip": 1.04930985, "balance_loss_mlp": 1.01320541, "epoch": 0.028377322190825467, "flos": 23513349051360.0, "grad_norm": 2.1542740857814624, "language_loss": 0.84307885, "learning_rate": 3.9582508226037045e-06, "loss": 0.87199295, "num_input_tokens_seen": 4933105, "step": 236, "time_per_iteration": 2.797637701034546 }, { "auxiliary_loss_clip": 0.01598056, "auxiliary_loss_mlp": 0.01294201, "balance_loss_clip": 1.04778636, "balance_loss_mlp": 1.01649117, "epoch": 0.02849756508146456, "flos": 20479612921920.0, "grad_norm": 3.467130214055946, "language_loss": 0.94063485, "learning_rate": 3.9613140235734636e-06, "loss": 0.9695574, "num_input_tokens_seen": 4950085, "step": 237, "time_per_iteration": 2.790133237838745 }, { "auxiliary_loss_clip": 0.01598658, "auxiliary_loss_mlp": 0.0128821, "balance_loss_clip": 1.04867411, "balance_loss_mlp": 1.01355183, "epoch": 0.02861780797210365, "flos": 14283371183040.0, "grad_norm": 2.013366381880357, "language_loss": 0.81219852, "learning_rate": 3.96436432681674e-06, "loss": 0.84106719, "num_input_tokens_seen": 4968075, "step": 238, "time_per_iteration": 2.7552809715270996 }, { "auxiliary_loss_clip": 0.0159645, "auxiliary_loss_mlp": 0.01287855, "balance_loss_clip": 1.04884315, "balance_loss_mlp": 1.01415038, "epoch": 0.02873805086274274, "flos": 25808542960320.0, "grad_norm": 2.380454492900551, "language_loss": 0.89264631, "learning_rate": 3.967401840491044e-06, "loss": 0.92148936, "num_input_tokens_seen": 4987355, "step": 239, "time_per_iteration": 2.7620596885681152 }, { "auxiliary_loss_clip": 0.01596114, "auxiliary_loss_mlp": 0.01282513, "balance_loss_clip": 1.04777646, "balance_loss_mlp": 1.01052523, "epoch": 0.028858293753381833, "flos": 17304246635040.0, "grad_norm": 2.3649631884409437, "language_loss": 0.87532991, "learning_rate": 3.97042667139909e-06, "loss": 0.90411621, "num_input_tokens_seen": 5004680, "step": 240, "time_per_iteration": 2.7782554626464844 }, { "auxiliary_loss_clip": 0.01595743, "auxiliary_loss_mlp": 0.01285943, "balance_loss_clip": 1.0481075, "balance_loss_mlp": 1.01261973, "epoch": 0.028978536644020923, "flos": 23038358254560.0, "grad_norm": 1.977909994119393, "language_loss": 0.87573743, "learning_rate": 3.973438925011327e-06, "loss": 0.90455431, "num_input_tokens_seen": 5022965, "step": 241, "time_per_iteration": 2.766472816467285 }, { "auxiliary_loss_clip": 0.01594362, "auxiliary_loss_mlp": 0.01290262, "balance_loss_clip": 1.04790998, "balance_loss_mlp": 1.01713014, "epoch": 0.029098779534660012, "flos": 28329725332800.0, "grad_norm": 2.3497168771038144, "language_loss": 0.91212821, "learning_rate": 3.976438705488002e-06, "loss": 0.94097447, "num_input_tokens_seen": 5042625, "step": 242, "time_per_iteration": 2.874723434448242 }, { "auxiliary_loss_clip": 0.01595726, "auxiliary_loss_mlp": 0.01285489, "balance_loss_clip": 1.04960227, "balance_loss_mlp": 1.01311994, "epoch": 0.029219022425299106, "flos": 13881674198880.0, "grad_norm": 2.7484950485128365, "language_loss": 0.92869002, "learning_rate": 3.9794261157007744e-06, "loss": 0.95750213, "num_input_tokens_seen": 5060380, "step": 243, "time_per_iteration": 2.750938653945923 }, { "auxiliary_loss_clip": 0.01591473, "auxiliary_loss_mlp": 0.01285223, "balance_loss_clip": 1.04700673, "balance_loss_mlp": 1.01266325, "epoch": 0.029339265315938196, "flos": 19422510618240.0, "grad_norm": 7.160939997016582, "language_loss": 0.84716725, "learning_rate": 3.982401257253887e-06, "loss": 0.87593424, "num_input_tokens_seen": 5078720, "step": 244, "time_per_iteration": 2.7661588191986084 }, { "auxiliary_loss_clip": 0.01592658, "auxiliary_loss_mlp": 0.012843, "balance_loss_clip": 1.04851627, "balance_loss_mlp": 1.01364672, "epoch": 0.029459508206577285, "flos": 15669559208160.0, "grad_norm": 2.1812178703524716, "language_loss": 0.89882469, "learning_rate": 3.985364230504893e-06, "loss": 0.9275943, "num_input_tokens_seen": 5096605, "step": 245, "time_per_iteration": 2.677844285964966 }, { "auxiliary_loss_clip": 0.01590267, "auxiliary_loss_mlp": 0.01285473, "balance_loss_clip": 1.04749572, "balance_loss_mlp": 1.01462936, "epoch": 0.02957975109721638, "flos": 28220988057120.0, "grad_norm": 2.0125372545146667, "language_loss": 0.84318805, "learning_rate": 3.988315134584976e-06, "loss": 0.8719455, "num_input_tokens_seen": 5116285, "step": 246, "time_per_iteration": 2.7923967838287354 }, { "auxiliary_loss_clip": 0.01590407, "auxiliary_loss_mlp": 0.01286482, "balance_loss_clip": 1.04819703, "balance_loss_mlp": 1.01258624, "epoch": 0.02969999398785547, "flos": 24315880851360.0, "grad_norm": 2.2159909087696668, "language_loss": 0.80521965, "learning_rate": 3.991254067418851e-06, "loss": 0.83398849, "num_input_tokens_seen": 5136825, "step": 247, "time_per_iteration": 2.8429572582244873 }, { "auxiliary_loss_clip": 0.0158897, "auxiliary_loss_mlp": 0.01280671, "balance_loss_clip": 1.04798639, "balance_loss_mlp": 1.0113529, "epoch": 0.02982023687849456, "flos": 35078598446400.0, "grad_norm": 2.207641543686525, "language_loss": 0.82952911, "learning_rate": 3.994181125744254e-06, "loss": 0.85822546, "num_input_tokens_seen": 5158630, "step": 248, "time_per_iteration": 2.9778153896331787 }, { "auxiliary_loss_clip": 0.01588382, "auxiliary_loss_mlp": 0.01284121, "balance_loss_clip": 1.04740119, "balance_loss_mlp": 1.01442242, "epoch": 0.02994047976913365, "flos": 26177167444320.0, "grad_norm": 1.8854221915160894, "language_loss": 0.740628, "learning_rate": 3.99709640513106e-06, "loss": 0.76935309, "num_input_tokens_seen": 5179510, "step": 249, "time_per_iteration": 2.8855395317077637 }, { "auxiliary_loss_clip": 0.01587281, "auxiliary_loss_mlp": 0.01284108, "balance_loss_clip": 1.04726458, "balance_loss_mlp": 1.01307344, "epoch": 0.03006072265977274, "flos": 25625039001120.0, "grad_norm": 2.637253397835737, "language_loss": 0.85803992, "learning_rate": 4e-06, "loss": 0.88675386, "num_input_tokens_seen": 5199345, "step": 250, "time_per_iteration": 2.777620792388916 }, { "auxiliary_loss_clip": 0.01588232, "auxiliary_loss_mlp": 0.01282893, "balance_loss_clip": 1.04939842, "balance_loss_mlp": 1.01510119, "epoch": 0.03018096555041183, "flos": 22127089178880.0, "grad_norm": 2.676112624785935, "language_loss": 0.88564682, "learning_rate": 3.999999848300794e-06, "loss": 0.91435802, "num_input_tokens_seen": 5218330, "step": 251, "time_per_iteration": 3.6637983322143555 }, { "auxiliary_loss_clip": 0.01583307, "auxiliary_loss_mlp": 0.01280405, "balance_loss_clip": 1.04591155, "balance_loss_mlp": 1.01165998, "epoch": 0.030301208441050925, "flos": 30188209878720.0, "grad_norm": 1.750803801288195, "language_loss": 0.89105701, "learning_rate": 3.999999393203203e-06, "loss": 0.91969419, "num_input_tokens_seen": 5240740, "step": 252, "time_per_iteration": 3.8270022869110107 }, { "auxiliary_loss_clip": 0.01582202, "auxiliary_loss_mlp": 0.01281442, "balance_loss_clip": 1.04548109, "balance_loss_mlp": 1.01460361, "epoch": 0.030421451331690014, "flos": 23621403777120.0, "grad_norm": 1.8888351979463314, "language_loss": 0.85292006, "learning_rate": 3.999998634707293e-06, "loss": 0.88155645, "num_input_tokens_seen": 5260290, "step": 253, "time_per_iteration": 2.8165836334228516 }, { "auxiliary_loss_clip": 0.01584879, "auxiliary_loss_mlp": 0.01279746, "balance_loss_clip": 1.04841232, "balance_loss_mlp": 1.01214468, "epoch": 0.030541694222329104, "flos": 27928459432800.0, "grad_norm": 4.369299742850291, "language_loss": 0.96657264, "learning_rate": 3.999997572813182e-06, "loss": 0.99521887, "num_input_tokens_seen": 5278100, "step": 254, "time_per_iteration": 2.8643851280212402 }, { "auxiliary_loss_clip": 0.01580692, "auxiliary_loss_mlp": 0.01278492, "balance_loss_clip": 1.04441965, "balance_loss_mlp": 1.01279807, "epoch": 0.030661937112968194, "flos": 18588451036320.0, "grad_norm": 1.867136122298921, "language_loss": 0.87566233, "learning_rate": 3.999996207521028e-06, "loss": 0.9042542, "num_input_tokens_seen": 5296810, "step": 255, "time_per_iteration": 2.7205944061279297 }, { "auxiliary_loss_clip": 0.01580485, "auxiliary_loss_mlp": 0.01277505, "balance_loss_clip": 1.04659688, "balance_loss_mlp": 1.01123953, "epoch": 0.030782180003607287, "flos": 12969147794400.0, "grad_norm": 2.509841280082876, "language_loss": 0.82333064, "learning_rate": 3.999994538831039e-06, "loss": 0.85191053, "num_input_tokens_seen": 5313395, "step": 256, "time_per_iteration": 2.8163583278656006 }, { "auxiliary_loss_clip": 0.01578774, "auxiliary_loss_mlp": 0.01279789, "balance_loss_clip": 1.04439831, "balance_loss_mlp": 1.01409495, "epoch": 0.030902422894246377, "flos": 23335377338880.0, "grad_norm": 2.393031095890281, "language_loss": 0.85732162, "learning_rate": 3.99999256674347e-06, "loss": 0.88590717, "num_input_tokens_seen": 5333545, "step": 257, "time_per_iteration": 2.8489434719085693 }, { "auxiliary_loss_clip": 0.01619459, "auxiliary_loss_mlp": 0.01267037, "balance_loss_clip": 1.07771587, "balance_loss_mlp": 1.00458574, "epoch": 0.031022665784885467, "flos": 55094177293920.0, "grad_norm": 1.0237334714763826, "language_loss": 0.53519481, "learning_rate": 3.999990291258618e-06, "loss": 0.56405979, "num_input_tokens_seen": 5392235, "step": 258, "time_per_iteration": 3.256096124649048 }, { "auxiliary_loss_clip": 0.01578888, "auxiliary_loss_mlp": 0.01277128, "balance_loss_clip": 1.04636025, "balance_loss_mlp": 1.01219749, "epoch": 0.03114290867552456, "flos": 19317796794720.0, "grad_norm": 2.915461084454814, "language_loss": 0.86707151, "learning_rate": 3.999987712376829e-06, "loss": 0.89563167, "num_input_tokens_seen": 5410555, "step": 259, "time_per_iteration": 3.006847381591797 }, { "auxiliary_loss_clip": 0.01576941, "auxiliary_loss_mlp": 0.01278834, "balance_loss_clip": 1.04587901, "balance_loss_mlp": 1.01390338, "epoch": 0.031263151566163654, "flos": 20959453415520.0, "grad_norm": 1.9486397080258164, "language_loss": 0.82268846, "learning_rate": 3.999984830098494e-06, "loss": 0.85124612, "num_input_tokens_seen": 5430135, "step": 260, "time_per_iteration": 2.7611751556396484 }, { "auxiliary_loss_clip": 0.01575389, "auxiliary_loss_mlp": 0.01276725, "balance_loss_clip": 1.04508996, "balance_loss_mlp": 1.01217616, "epoch": 0.03138339445680274, "flos": 14793015121920.0, "grad_norm": 5.870685990163903, "language_loss": 0.98132724, "learning_rate": 3.999981644424051e-06, "loss": 1.00984836, "num_input_tokens_seen": 5444935, "step": 261, "time_per_iteration": 2.8054094314575195 }, { "auxiliary_loss_clip": 0.01574605, "auxiliary_loss_mlp": 0.01278046, "balance_loss_clip": 1.04447508, "balance_loss_mlp": 1.0125432, "epoch": 0.03150363734744183, "flos": 11655606955680.0, "grad_norm": 2.228854378952997, "language_loss": 0.86109751, "learning_rate": 3.999978155353982e-06, "loss": 0.88962406, "num_input_tokens_seen": 5462080, "step": 262, "time_per_iteration": 2.74337100982666 }, { "auxiliary_loss_clip": 0.01573874, "auxiliary_loss_mlp": 0.01278126, "balance_loss_clip": 1.04411149, "balance_loss_mlp": 1.01205075, "epoch": 0.03162388023808092, "flos": 33727746113280.0, "grad_norm": 3.688695211893667, "language_loss": 0.80285418, "learning_rate": 3.9999743628888186e-06, "loss": 0.83137417, "num_input_tokens_seen": 5483870, "step": 263, "time_per_iteration": 2.8790063858032227 }, { "auxiliary_loss_clip": 0.01572565, "auxiliary_loss_mlp": 0.01275601, "balance_loss_clip": 1.04419017, "balance_loss_mlp": 1.01219583, "epoch": 0.03174412312872001, "flos": 20810961835200.0, "grad_norm": 2.482648881125446, "language_loss": 0.89470875, "learning_rate": 3.999970267029133e-06, "loss": 0.92319041, "num_input_tokens_seen": 5502830, "step": 264, "time_per_iteration": 2.829014301300049 }, { "auxiliary_loss_clip": 0.01573975, "auxiliary_loss_mlp": 0.012731, "balance_loss_clip": 1.04580927, "balance_loss_mlp": 1.01064849, "epoch": 0.0318643660193591, "flos": 23727949708320.0, "grad_norm": 2.8660716274418623, "language_loss": 0.8018961, "learning_rate": 3.999965867775548e-06, "loss": 0.83036685, "num_input_tokens_seen": 5523225, "step": 265, "time_per_iteration": 2.7697768211364746 }, { "auxiliary_loss_clip": 0.01571055, "auxiliary_loss_mlp": 0.01277466, "balance_loss_clip": 1.04395175, "balance_loss_mlp": 1.01272583, "epoch": 0.0319846089099982, "flos": 13917872059200.0, "grad_norm": 2.5575199821311707, "language_loss": 0.8680225, "learning_rate": 3.9999611651287315e-06, "loss": 0.89650774, "num_input_tokens_seen": 5541380, "step": 266, "time_per_iteration": 2.748660087585449 }, { "auxiliary_loss_clip": 0.01571007, "auxiliary_loss_mlp": 0.01276295, "balance_loss_clip": 1.04489636, "balance_loss_mlp": 1.01346278, "epoch": 0.03210485180063729, "flos": 14753260817280.0, "grad_norm": 2.524187859208641, "language_loss": 0.78894073, "learning_rate": 3.999956159089396e-06, "loss": 0.81741375, "num_input_tokens_seen": 5558830, "step": 267, "time_per_iteration": 2.7144625186920166 }, { "auxiliary_loss_clip": 0.01570548, "auxiliary_loss_mlp": 0.01277825, "balance_loss_clip": 1.04447675, "balance_loss_mlp": 1.01289439, "epoch": 0.03222509469127638, "flos": 28913166015840.0, "grad_norm": 2.153724617351055, "language_loss": 0.79541272, "learning_rate": 3.999950849658302e-06, "loss": 0.82389647, "num_input_tokens_seen": 5577750, "step": 268, "time_per_iteration": 2.8561794757843018 }, { "auxiliary_loss_clip": 0.01568813, "auxiliary_loss_mlp": 0.01276308, "balance_loss_clip": 1.0442431, "balance_loss_mlp": 1.01137686, "epoch": 0.03234533758191547, "flos": 16946399255040.0, "grad_norm": 2.044338081443681, "language_loss": 0.84277511, "learning_rate": 3.999945236836254e-06, "loss": 0.87122631, "num_input_tokens_seen": 5596715, "step": 269, "time_per_iteration": 2.8587656021118164 }, { "auxiliary_loss_clip": 0.01568488, "auxiliary_loss_mlp": 0.01276364, "balance_loss_clip": 1.04401982, "balance_loss_mlp": 1.0137217, "epoch": 0.03246558047255456, "flos": 18989106233760.0, "grad_norm": 2.921122754300512, "language_loss": 0.94701231, "learning_rate": 3.999939320624103e-06, "loss": 0.97546089, "num_input_tokens_seen": 5611865, "step": 270, "time_per_iteration": 2.7655768394470215 }, { "auxiliary_loss_clip": 0.01566838, "auxiliary_loss_mlp": 0.01274471, "balance_loss_clip": 1.04329503, "balance_loss_mlp": 1.01182914, "epoch": 0.03258582336319365, "flos": 23728344868800.0, "grad_norm": 2.0038043328813315, "language_loss": 0.89965791, "learning_rate": 3.999933101022749e-06, "loss": 0.92807102, "num_input_tokens_seen": 5632270, "step": 271, "time_per_iteration": 2.825662851333618 }, { "auxiliary_loss_clip": 0.01566704, "auxiliary_loss_mlp": 0.01272838, "balance_loss_clip": 1.04438758, "balance_loss_mlp": 1.01172221, "epoch": 0.032706066253832745, "flos": 27670835416320.0, "grad_norm": 1.8248970004816334, "language_loss": 0.86791855, "learning_rate": 3.999926578033132e-06, "loss": 0.89631391, "num_input_tokens_seen": 5652085, "step": 272, "time_per_iteration": 2.7752254009246826 }, { "auxiliary_loss_clip": 0.01564886, "auxiliary_loss_mlp": 0.01279665, "balance_loss_clip": 1.0437212, "balance_loss_mlp": 1.01397109, "epoch": 0.032826309144471835, "flos": 45624703658400.0, "grad_norm": 2.528770215826405, "language_loss": 0.62709391, "learning_rate": 3.999919751656244e-06, "loss": 0.65553939, "num_input_tokens_seen": 5678985, "step": 273, "time_per_iteration": 2.983391046524048 }, { "auxiliary_loss_clip": 0.01563375, "auxiliary_loss_mlp": 0.01275384, "balance_loss_clip": 1.04305029, "balance_loss_mlp": 1.01197934, "epoch": 0.032946552035110925, "flos": 25812386794080.0, "grad_norm": 2.654501148656797, "language_loss": 0.75734866, "learning_rate": 3.9999126218931195e-06, "loss": 0.78573626, "num_input_tokens_seen": 5697020, "step": 274, "time_per_iteration": 2.7592146396636963 }, { "auxiliary_loss_clip": 0.01562397, "auxiliary_loss_mlp": 0.01280019, "balance_loss_clip": 1.04271507, "balance_loss_mlp": 1.01718616, "epoch": 0.033066794925750015, "flos": 15121993072320.0, "grad_norm": 2.2260883395584186, "language_loss": 0.8963092, "learning_rate": 3.99990518874484e-06, "loss": 0.9247334, "num_input_tokens_seen": 5713460, "step": 275, "time_per_iteration": 2.736752986907959 }, { "auxiliary_loss_clip": 0.01563689, "auxiliary_loss_mlp": 0.01269225, "balance_loss_clip": 1.04505253, "balance_loss_mlp": 1.01116085, "epoch": 0.033187037816389105, "flos": 22776603014880.0, "grad_norm": 2.1931684880952, "language_loss": 0.92437565, "learning_rate": 3.999897452212534e-06, "loss": 0.95270479, "num_input_tokens_seen": 5730790, "step": 276, "time_per_iteration": 2.7757632732391357 }, { "auxiliary_loss_clip": 0.01561163, "auxiliary_loss_mlp": 0.0127393, "balance_loss_clip": 1.04298949, "balance_loss_mlp": 1.01205063, "epoch": 0.033307280707028195, "flos": 23331425734080.0, "grad_norm": 2.29165948835632, "language_loss": 0.99929214, "learning_rate": 3.999889412297374e-06, "loss": 1.02764297, "num_input_tokens_seen": 5750215, "step": 277, "time_per_iteration": 2.8508787155151367 }, { "auxiliary_loss_clip": 0.01561802, "auxiliary_loss_mlp": 0.01268789, "balance_loss_clip": 1.04391062, "balance_loss_mlp": 1.00977087, "epoch": 0.03342752359766729, "flos": 28840303287360.0, "grad_norm": 1.912324050606732, "language_loss": 0.79051065, "learning_rate": 3.999881069000581e-06, "loss": 0.81881654, "num_input_tokens_seen": 5769945, "step": 278, "time_per_iteration": 4.7682204246521 }, { "auxiliary_loss_clip": 0.01559301, "auxiliary_loss_mlp": 0.01274353, "balance_loss_clip": 1.04351544, "balance_loss_mlp": 1.01419115, "epoch": 0.03354776648830638, "flos": 19384552497600.0, "grad_norm": 2.4484769428765283, "language_loss": 0.87087214, "learning_rate": 3.99987242232342e-06, "loss": 0.89920866, "num_input_tokens_seen": 5784950, "step": 279, "time_per_iteration": 4.500175952911377 }, { "auxiliary_loss_clip": 0.01559426, "auxiliary_loss_mlp": 0.01272169, "balance_loss_clip": 1.04319835, "balance_loss_mlp": 1.01353216, "epoch": 0.03366800937894547, "flos": 17858638270080.0, "grad_norm": 1.9126538898103607, "language_loss": 0.79815179, "learning_rate": 3.9998634722672026e-06, "loss": 0.82646775, "num_input_tokens_seen": 5805005, "step": 280, "time_per_iteration": 2.8489017486572266 }, { "auxiliary_loss_clip": 0.01558259, "auxiliary_loss_mlp": 0.01271093, "balance_loss_clip": 1.0436914, "balance_loss_mlp": 1.01264703, "epoch": 0.03378825226958456, "flos": 35951047233120.0, "grad_norm": 1.8274388871429708, "language_loss": 0.78745699, "learning_rate": 3.999854218833286e-06, "loss": 0.81575048, "num_input_tokens_seen": 5825825, "step": 281, "time_per_iteration": 2.924278974533081 }, { "auxiliary_loss_clip": 0.01555737, "auxiliary_loss_mlp": 0.01268895, "balance_loss_clip": 1.04229677, "balance_loss_mlp": 1.01140308, "epoch": 0.03390849516022365, "flos": 25702499960640.0, "grad_norm": 1.9348545524039213, "language_loss": 0.81871271, "learning_rate": 3.999844662023075e-06, "loss": 0.84695911, "num_input_tokens_seen": 5845700, "step": 282, "time_per_iteration": 2.8056893348693848 }, { "auxiliary_loss_clip": 0.01555588, "auxiliary_loss_mlp": 0.01263921, "balance_loss_clip": 1.04295683, "balance_loss_mlp": 1.00928998, "epoch": 0.03402873805086274, "flos": 21284515684800.0, "grad_norm": 1.7623275771630367, "language_loss": 0.92473626, "learning_rate": 3.999834801838018e-06, "loss": 0.9529314, "num_input_tokens_seen": 5864680, "step": 283, "time_per_iteration": 2.8439719676971436 }, { "auxiliary_loss_clip": 0.01555076, "auxiliary_loss_mlp": 0.01266496, "balance_loss_clip": 1.0437125, "balance_loss_mlp": 1.01110172, "epoch": 0.03414898094150183, "flos": 22710925022400.0, "grad_norm": 1.8318610621742524, "language_loss": 0.74086326, "learning_rate": 3.9998246382796115e-06, "loss": 0.76907897, "num_input_tokens_seen": 5884260, "step": 284, "time_per_iteration": 2.8354105949401855 }, { "auxiliary_loss_clip": 0.01551742, "auxiliary_loss_mlp": 0.0127266, "balance_loss_clip": 1.04042041, "balance_loss_mlp": 1.01306987, "epoch": 0.03426922383214093, "flos": 18879937873920.0, "grad_norm": 2.1224430057203127, "language_loss": 0.90700543, "learning_rate": 3.999814171349399e-06, "loss": 0.93524945, "num_input_tokens_seen": 5902120, "step": 285, "time_per_iteration": 2.736722469329834 }, { "auxiliary_loss_clip": 0.01552866, "auxiliary_loss_mlp": 0.01264091, "balance_loss_clip": 1.04218769, "balance_loss_mlp": 1.0098412, "epoch": 0.03438946672278002, "flos": 34752027382560.0, "grad_norm": 2.0573273571081794, "language_loss": 0.73671758, "learning_rate": 3.9998034010489655e-06, "loss": 0.76488715, "num_input_tokens_seen": 5925810, "step": 286, "time_per_iteration": 2.90596604347229 }, { "auxiliary_loss_clip": 0.01550938, "auxiliary_loss_mlp": 0.01268687, "balance_loss_clip": 1.04236495, "balance_loss_mlp": 1.0129112, "epoch": 0.03450970961341911, "flos": 22164113283840.0, "grad_norm": 6.442846525160166, "language_loss": 0.75896835, "learning_rate": 3.999792327379946e-06, "loss": 0.78716457, "num_input_tokens_seen": 5945185, "step": 287, "time_per_iteration": 2.771996021270752 }, { "auxiliary_loss_clip": 0.01550823, "auxiliary_loss_mlp": 0.01268439, "balance_loss_clip": 1.04246545, "balance_loss_mlp": 1.0128541, "epoch": 0.034629952504058197, "flos": 21725751431520.0, "grad_norm": 2.218216382460267, "language_loss": 0.96226549, "learning_rate": 3.999780950344021e-06, "loss": 0.99045813, "num_input_tokens_seen": 5963375, "step": 288, "time_per_iteration": 2.991957664489746 }, { "auxiliary_loss_clip": 0.01548391, "auxiliary_loss_mlp": 0.01270009, "balance_loss_clip": 1.0415554, "balance_loss_mlp": 1.01404238, "epoch": 0.034750195394697286, "flos": 20048004721440.0, "grad_norm": 1.9524444320533325, "language_loss": 0.82885861, "learning_rate": 3.999769269942916e-06, "loss": 0.85704267, "num_input_tokens_seen": 5983415, "step": 289, "time_per_iteration": 2.851966619491577 }, { "auxiliary_loss_clip": 0.01549489, "auxiliary_loss_mlp": 0.01264327, "balance_loss_clip": 1.04249454, "balance_loss_mlp": 1.01236653, "epoch": 0.034870438285336376, "flos": 27965878698240.0, "grad_norm": 2.0501201709076975, "language_loss": 0.81159985, "learning_rate": 3.999757286178402e-06, "loss": 0.83973801, "num_input_tokens_seen": 6005850, "step": 290, "time_per_iteration": 2.8907949924468994 }, { "auxiliary_loss_clip": 0.01550249, "auxiliary_loss_mlp": 0.0126578, "balance_loss_clip": 1.04219854, "balance_loss_mlp": 1.00981426, "epoch": 0.03499068117597547, "flos": 22017525658560.0, "grad_norm": 1.9891660932983055, "language_loss": 0.9081713, "learning_rate": 3.999744999052299e-06, "loss": 0.93633163, "num_input_tokens_seen": 6027240, "step": 291, "time_per_iteration": 3.0285253524780273 }, { "auxiliary_loss_clip": 0.01578707, "auxiliary_loss_mlp": 0.01251337, "balance_loss_clip": 1.06880951, "balance_loss_mlp": 1.00109339, "epoch": 0.03511092406661456, "flos": 57242172875040.0, "grad_norm": 0.9586843216286706, "language_loss": 0.61207163, "learning_rate": 3.9997324085664675e-06, "loss": 0.6403721, "num_input_tokens_seen": 6087470, "step": 292, "time_per_iteration": 3.281651258468628 }, { "auxiliary_loss_clip": 0.01543123, "auxiliary_loss_mlp": 0.01263889, "balance_loss_clip": 1.03974569, "balance_loss_mlp": 1.01078379, "epoch": 0.03523116695725365, "flos": 22928076260640.0, "grad_norm": 2.0787188849167153, "language_loss": 0.91795194, "learning_rate": 3.999719514722821e-06, "loss": 0.94602203, "num_input_tokens_seen": 6107600, "step": 293, "time_per_iteration": 3.3317153453826904 }, { "auxiliary_loss_clip": 0.01544873, "auxiliary_loss_mlp": 0.01263661, "balance_loss_clip": 1.04077482, "balance_loss_mlp": 1.01074672, "epoch": 0.03535140984789274, "flos": 36903256094880.0, "grad_norm": 2.634383892915242, "language_loss": 0.74958575, "learning_rate": 3.999706317523314e-06, "loss": 0.7776711, "num_input_tokens_seen": 6126160, "step": 294, "time_per_iteration": 3.1050608158111572 }, { "auxiliary_loss_clip": 0.0154392, "auxiliary_loss_mlp": 0.01263337, "balance_loss_clip": 1.04142213, "balance_loss_mlp": 1.01061344, "epoch": 0.03547165273853183, "flos": 20449162850400.0, "grad_norm": 2.2182662210178057, "language_loss": 0.86096418, "learning_rate": 3.999692816969948e-06, "loss": 0.88903677, "num_input_tokens_seen": 6145695, "step": 295, "time_per_iteration": 2.9124648571014404 }, { "auxiliary_loss_clip": 0.01570882, "auxiliary_loss_mlp": 0.01247681, "balance_loss_clip": 1.06482697, "balance_loss_mlp": 1.00048912, "epoch": 0.03559189562917092, "flos": 69850599395040.0, "grad_norm": 0.9965292427456093, "language_loss": 0.69380718, "learning_rate": 3.999679013064772e-06, "loss": 0.72199279, "num_input_tokens_seen": 6212440, "step": 296, "time_per_iteration": 3.377439022064209 }, { "auxiliary_loss_clip": 0.01539764, "auxiliary_loss_mlp": 0.01266031, "balance_loss_clip": 1.03900743, "balance_loss_mlp": 1.01330769, "epoch": 0.03571213851981002, "flos": 21651954687360.0, "grad_norm": 4.52209144039504, "language_loss": 0.85409588, "learning_rate": 3.99966490580988e-06, "loss": 0.88215387, "num_input_tokens_seen": 6229800, "step": 297, "time_per_iteration": 2.9457404613494873 }, { "auxiliary_loss_clip": 0.01539637, "auxiliary_loss_mlp": 0.01262434, "balance_loss_clip": 1.04017711, "balance_loss_mlp": 1.01009202, "epoch": 0.03583238141044911, "flos": 43945627772160.0, "grad_norm": 2.2624255117302767, "language_loss": 0.6581707, "learning_rate": 3.999650495207411e-06, "loss": 0.68619144, "num_input_tokens_seen": 6255825, "step": 298, "time_per_iteration": 3.0536742210388184 }, { "auxiliary_loss_clip": 0.01539248, "auxiliary_loss_mlp": 0.012608, "balance_loss_clip": 1.04040611, "balance_loss_mlp": 1.01093721, "epoch": 0.0359526243010882, "flos": 18910819029600.0, "grad_norm": 2.685794836061025, "language_loss": 0.90165579, "learning_rate": 3.999635781259553e-06, "loss": 0.92965627, "num_input_tokens_seen": 6271090, "step": 299, "time_per_iteration": 2.9440553188323975 }, { "auxiliary_loss_clip": 0.01558747, "auxiliary_loss_mlp": 0.01245693, "balance_loss_clip": 1.05839992, "balance_loss_mlp": 1.00002658, "epoch": 0.03607286719172729, "flos": 61668922528800.0, "grad_norm": 0.9148999612785308, "language_loss": 0.52265465, "learning_rate": 3.999620763968535e-06, "loss": 0.550699, "num_input_tokens_seen": 6329965, "step": 300, "time_per_iteration": 3.2219955921173096 }, { "auxiliary_loss_clip": 0.01537127, "auxiliary_loss_mlp": 0.01260777, "balance_loss_clip": 1.03994465, "balance_loss_mlp": 1.01091456, "epoch": 0.03619311008236638, "flos": 27819075530880.0, "grad_norm": 1.5781361316110463, "language_loss": 0.86302406, "learning_rate": 3.999605443336638e-06, "loss": 0.89100307, "num_input_tokens_seen": 6352095, "step": 301, "time_per_iteration": 2.9284112453460693 }, { "auxiliary_loss_clip": 0.01534085, "auxiliary_loss_mlp": 0.01262388, "balance_loss_clip": 1.03770101, "balance_loss_mlp": 1.01042747, "epoch": 0.03631335297300547, "flos": 13621140364320.0, "grad_norm": 2.502548203786903, "language_loss": 0.88944566, "learning_rate": 3.999589819366185e-06, "loss": 0.91741037, "num_input_tokens_seen": 6365885, "step": 302, "time_per_iteration": 2.927891969680786 }, { "auxiliary_loss_clip": 0.01532795, "auxiliary_loss_mlp": 0.01262373, "balance_loss_clip": 1.03746414, "balance_loss_mlp": 1.01270139, "epoch": 0.036433595863644565, "flos": 27631799585280.0, "grad_norm": 2.08001319475621, "language_loss": 0.84958434, "learning_rate": 3.999573892059547e-06, "loss": 0.877536, "num_input_tokens_seen": 6385015, "step": 303, "time_per_iteration": 2.8793137073516846 }, { "auxiliary_loss_clip": 0.0153471, "auxiliary_loss_mlp": 0.01258427, "balance_loss_clip": 1.0403955, "balance_loss_mlp": 1.00799251, "epoch": 0.036553838754283655, "flos": 24572031996960.0, "grad_norm": 1.9140338142258455, "language_loss": 0.80956531, "learning_rate": 3.999557661419138e-06, "loss": 0.8374967, "num_input_tokens_seen": 6405165, "step": 304, "time_per_iteration": 3.9298229217529297 }, { "auxiliary_loss_clip": 0.01533472, "auxiliary_loss_mlp": 0.01264875, "balance_loss_clip": 1.040447, "balance_loss_mlp": 1.0136776, "epoch": 0.036674081644922744, "flos": 23404324386240.0, "grad_norm": 2.029233729642231, "language_loss": 0.81574154, "learning_rate": 3.9995411274474225e-06, "loss": 0.84372497, "num_input_tokens_seen": 6424445, "step": 305, "time_per_iteration": 3.7628211975097656 }, { "auxiliary_loss_clip": 0.01531051, "auxiliary_loss_mlp": 0.01262143, "balance_loss_clip": 1.03874636, "balance_loss_mlp": 1.01056337, "epoch": 0.036794324535561834, "flos": 27489702420000.0, "grad_norm": 4.43135177637366, "language_loss": 0.81388348, "learning_rate": 3.999524290146908e-06, "loss": 0.84181547, "num_input_tokens_seen": 6444650, "step": 306, "time_per_iteration": 2.838444709777832 }, { "auxiliary_loss_clip": 0.01531486, "auxiliary_loss_mlp": 0.01265607, "balance_loss_clip": 1.03956699, "balance_loss_mlp": 1.01536274, "epoch": 0.036914567426200924, "flos": 19463486328000.0, "grad_norm": 2.4490813336843247, "language_loss": 0.92829335, "learning_rate": 3.9995071495201485e-06, "loss": 0.95626426, "num_input_tokens_seen": 6461755, "step": 307, "time_per_iteration": 2.8172223567962646 }, { "auxiliary_loss_clip": 0.01530968, "auxiliary_loss_mlp": 0.01258197, "balance_loss_clip": 1.03987551, "balance_loss_mlp": 1.00947857, "epoch": 0.037034810316840014, "flos": 22309335809280.0, "grad_norm": 3.260951745293635, "language_loss": 0.97576755, "learning_rate": 3.999489705569744e-06, "loss": 1.00365925, "num_input_tokens_seen": 6479455, "step": 308, "time_per_iteration": 2.881579875946045 }, { "auxiliary_loss_clip": 0.01530684, "auxiliary_loss_mlp": 0.01258169, "balance_loss_clip": 1.03909695, "balance_loss_mlp": 1.01021338, "epoch": 0.03715505320747911, "flos": 18588343265280.0, "grad_norm": 2.0693754978198258, "language_loss": 0.86133265, "learning_rate": 3.999471958298341e-06, "loss": 0.88922125, "num_input_tokens_seen": 6498365, "step": 309, "time_per_iteration": 2.872070074081421 }, { "auxiliary_loss_clip": 0.01528797, "auxiliary_loss_mlp": 0.01261125, "balance_loss_clip": 1.03912735, "balance_loss_mlp": 1.01278806, "epoch": 0.0372752960981182, "flos": 35955358074720.0, "grad_norm": 2.047967170394918, "language_loss": 0.7602278, "learning_rate": 3.999453907708631e-06, "loss": 0.78812706, "num_input_tokens_seen": 6520770, "step": 310, "time_per_iteration": 2.991701602935791 }, { "auxiliary_loss_clip": 0.01527127, "auxiliary_loss_mlp": 0.01255012, "balance_loss_clip": 1.0379771, "balance_loss_mlp": 1.00915492, "epoch": 0.03739553898875729, "flos": 20814051271680.0, "grad_norm": 2.4829226393951895, "language_loss": 0.81346905, "learning_rate": 3.999435553803353e-06, "loss": 0.84129047, "num_input_tokens_seen": 6540170, "step": 311, "time_per_iteration": 2.9024972915649414 }, { "auxiliary_loss_clip": 0.01527352, "auxiliary_loss_mlp": 0.01258824, "balance_loss_clip": 1.03804636, "balance_loss_mlp": 1.01086926, "epoch": 0.03751578187939638, "flos": 20264150096640.0, "grad_norm": 2.868519747183228, "language_loss": 0.83517367, "learning_rate": 3.999416896585292e-06, "loss": 0.86303544, "num_input_tokens_seen": 6557200, "step": 312, "time_per_iteration": 2.9536571502685547 }, { "auxiliary_loss_clip": 0.01525317, "auxiliary_loss_mlp": 0.01259061, "balance_loss_clip": 1.03703189, "balance_loss_mlp": 1.01110601, "epoch": 0.03763602477003547, "flos": 20668074348960.0, "grad_norm": 2.7990368474993046, "language_loss": 0.84947574, "learning_rate": 3.9993979360572775e-06, "loss": 0.87731957, "num_input_tokens_seen": 6577340, "step": 313, "time_per_iteration": 2.8384339809417725 }, { "auxiliary_loss_clip": 0.0152854, "auxiliary_loss_mlp": 0.01260034, "balance_loss_clip": 1.03987396, "balance_loss_mlp": 1.0118885, "epoch": 0.03775626766067456, "flos": 16691361743520.0, "grad_norm": 2.6259321765316797, "language_loss": 0.82490957, "learning_rate": 3.999378672222185e-06, "loss": 0.8527953, "num_input_tokens_seen": 6595125, "step": 314, "time_per_iteration": 2.766331672668457 }, { "auxiliary_loss_clip": 0.01524711, "auxiliary_loss_mlp": 0.01252131, "balance_loss_clip": 1.03824902, "balance_loss_mlp": 1.00684643, "epoch": 0.03787651055131366, "flos": 21141807816960.0, "grad_norm": 2.2008973380030294, "language_loss": 0.8263371, "learning_rate": 3.9993591050829385e-06, "loss": 0.85410553, "num_input_tokens_seen": 6612990, "step": 315, "time_per_iteration": 2.816284656524658 }, { "auxiliary_loss_clip": 0.01524782, "auxiliary_loss_mlp": 0.01253458, "balance_loss_clip": 1.0385139, "balance_loss_mlp": 1.0093174, "epoch": 0.037996753441952746, "flos": 22018100437440.0, "grad_norm": 1.957938300009072, "language_loss": 0.7930609, "learning_rate": 3.999339234642506e-06, "loss": 0.82084328, "num_input_tokens_seen": 6632740, "step": 316, "time_per_iteration": 2.810264825820923 }, { "auxiliary_loss_clip": 0.01525683, "auxiliary_loss_mlp": 0.01259039, "balance_loss_clip": 1.03927028, "balance_loss_mlp": 1.01127458, "epoch": 0.038116996332591836, "flos": 27709404239520.0, "grad_norm": 2.4074526556177864, "language_loss": 0.83892441, "learning_rate": 3.9993190609038994e-06, "loss": 0.86677164, "num_input_tokens_seen": 6651505, "step": 317, "time_per_iteration": 2.7508695125579834 }, { "auxiliary_loss_clip": 0.01523106, "auxiliary_loss_mlp": 0.01255256, "balance_loss_clip": 1.03834283, "balance_loss_mlp": 1.00939846, "epoch": 0.038237239223230926, "flos": 21178077524640.0, "grad_norm": 2.0049535786793706, "language_loss": 0.83010733, "learning_rate": 3.999298583870182e-06, "loss": 0.8578909, "num_input_tokens_seen": 6671090, "step": 318, "time_per_iteration": 2.9356422424316406 }, { "auxiliary_loss_clip": 0.01521203, "auxiliary_loss_mlp": 0.01259498, "balance_loss_clip": 1.03754866, "balance_loss_mlp": 1.01344979, "epoch": 0.038357482113870016, "flos": 25556630808960.0, "grad_norm": 1.9064292825237532, "language_loss": 0.77354711, "learning_rate": 3.999277803544458e-06, "loss": 0.80135411, "num_input_tokens_seen": 6691245, "step": 319, "time_per_iteration": 2.8485941886901855 }, { "auxiliary_loss_clip": 0.01530403, "auxiliary_loss_mlp": 0.01236268, "balance_loss_clip": 1.05023098, "balance_loss_mlp": 0.99975663, "epoch": 0.038477725004509106, "flos": 59227608002400.0, "grad_norm": 0.9622419864266122, "language_loss": 0.62401265, "learning_rate": 3.999256719929882e-06, "loss": 0.65167934, "num_input_tokens_seen": 6752520, "step": 320, "time_per_iteration": 3.206918478012085 }, { "auxiliary_loss_clip": 0.01528525, "auxiliary_loss_mlp": 0.01236179, "balance_loss_clip": 1.04897499, "balance_loss_mlp": 0.99966794, "epoch": 0.0385979678951482, "flos": 67317705902880.0, "grad_norm": 1.210206836170788, "language_loss": 0.67106891, "learning_rate": 3.999235333029651e-06, "loss": 0.69871593, "num_input_tokens_seen": 6806460, "step": 321, "time_per_iteration": 3.2307076454162598 }, { "auxiliary_loss_clip": 0.0152031, "auxiliary_loss_mlp": 0.01251725, "balance_loss_clip": 1.03852451, "balance_loss_mlp": 1.00891924, "epoch": 0.03871821078578729, "flos": 22746763645920.0, "grad_norm": 1.8283380137155973, "language_loss": 0.82043123, "learning_rate": 3.999213642847009e-06, "loss": 0.84815156, "num_input_tokens_seen": 6827045, "step": 322, "time_per_iteration": 2.766916036605835 }, { "auxiliary_loss_clip": 0.01520684, "auxiliary_loss_mlp": 0.01253796, "balance_loss_clip": 1.03834057, "balance_loss_mlp": 1.01003695, "epoch": 0.03883845367642638, "flos": 26280623939040.0, "grad_norm": 1.8662700249909745, "language_loss": 0.91014135, "learning_rate": 3.999191649385247e-06, "loss": 0.93788612, "num_input_tokens_seen": 6848220, "step": 323, "time_per_iteration": 2.9199905395507812 }, { "auxiliary_loss_clip": 0.01522266, "auxiliary_loss_mlp": 0.01234419, "balance_loss_clip": 1.04588103, "balance_loss_mlp": 0.99943316, "epoch": 0.03895869656706547, "flos": 56962864164960.0, "grad_norm": 0.9125736925494077, "language_loss": 0.59782523, "learning_rate": 3.999169352647702e-06, "loss": 0.62539208, "num_input_tokens_seen": 6909400, "step": 324, "time_per_iteration": 3.2153818607330322 }, { "auxiliary_loss_clip": 0.01517359, "auxiliary_loss_mlp": 0.01254732, "balance_loss_clip": 1.03721189, "balance_loss_mlp": 1.00982833, "epoch": 0.03907893945770456, "flos": 24863375139840.0, "grad_norm": 1.8217048408929197, "language_loss": 0.83039099, "learning_rate": 3.999146752637755e-06, "loss": 0.85811186, "num_input_tokens_seen": 6930445, "step": 325, "time_per_iteration": 2.7482662200927734 }, { "auxiliary_loss_clip": 0.01514933, "auxiliary_loss_mlp": 0.01254592, "balance_loss_clip": 1.03653121, "balance_loss_mlp": 1.011024, "epoch": 0.03919918234834365, "flos": 18368605522080.0, "grad_norm": 2.5249779581989253, "language_loss": 0.89464903, "learning_rate": 3.999123849358836e-06, "loss": 0.92234433, "num_input_tokens_seen": 6948110, "step": 326, "time_per_iteration": 2.87009859085083 }, { "auxiliary_loss_clip": 0.01515348, "auxiliary_loss_mlp": 0.01253645, "balance_loss_clip": 1.03668857, "balance_loss_mlp": 1.01064849, "epoch": 0.03931942523898275, "flos": 25225425590400.0, "grad_norm": 1.8660363099388553, "language_loss": 0.74711132, "learning_rate": 3.999100642814418e-06, "loss": 0.77480125, "num_input_tokens_seen": 6968550, "step": 327, "time_per_iteration": 2.757728338241577 }, { "auxiliary_loss_clip": 0.0151369, "auxiliary_loss_mlp": 0.01251765, "balance_loss_clip": 1.03710723, "balance_loss_mlp": 1.01067579, "epoch": 0.03943966812962184, "flos": 23257916379360.0, "grad_norm": 2.168451572374538, "language_loss": 0.88467634, "learning_rate": 3.999077133008022e-06, "loss": 0.91233093, "num_input_tokens_seen": 6987135, "step": 328, "time_per_iteration": 2.73954176902771 }, { "auxiliary_loss_clip": 0.01513919, "auxiliary_loss_mlp": 0.01249063, "balance_loss_clip": 1.03712118, "balance_loss_mlp": 1.00854671, "epoch": 0.03955991102026093, "flos": 29168850153600.0, "grad_norm": 1.8162309390744649, "language_loss": 0.90600109, "learning_rate": 3.9990533199432145e-06, "loss": 0.93363094, "num_input_tokens_seen": 7008630, "step": 329, "time_per_iteration": 2.7966082096099854 }, { "auxiliary_loss_clip": 0.01512262, "auxiliary_loss_mlp": 0.01250345, "balance_loss_clip": 1.03685915, "balance_loss_mlp": 1.00830197, "epoch": 0.03968015391090002, "flos": 17602451200800.0, "grad_norm": 2.475695977126588, "language_loss": 0.75748205, "learning_rate": 3.999029203623608e-06, "loss": 0.78510809, "num_input_tokens_seen": 7026350, "step": 330, "time_per_iteration": 2.737823247909546 }, { "auxiliary_loss_clip": 0.01511545, "auxiliary_loss_mlp": 0.0124874, "balance_loss_clip": 1.03695822, "balance_loss_mlp": 1.00822306, "epoch": 0.03980039680153911, "flos": 21799296709920.0, "grad_norm": 1.992118486972434, "language_loss": 0.8701703, "learning_rate": 3.99900478405286e-06, "loss": 0.89777315, "num_input_tokens_seen": 7045660, "step": 331, "time_per_iteration": 5.590032577514648 }, { "auxiliary_loss_clip": 0.01512364, "auxiliary_loss_mlp": 0.01249118, "balance_loss_clip": 1.03845453, "balance_loss_mlp": 1.00936437, "epoch": 0.0399206396921782, "flos": 15195143190240.0, "grad_norm": 2.795949467813754, "language_loss": 0.82576513, "learning_rate": 3.998980061234676e-06, "loss": 0.85337996, "num_input_tokens_seen": 7063575, "step": 332, "time_per_iteration": 2.750960350036621 }, { "auxiliary_loss_clip": 0.01511495, "auxiliary_loss_mlp": 0.01250411, "balance_loss_clip": 1.03793585, "balance_loss_mlp": 1.00817776, "epoch": 0.040040882582817294, "flos": 14422917767040.0, "grad_norm": 2.3312787144812406, "language_loss": 0.75730097, "learning_rate": 3.9989550351728055e-06, "loss": 0.7849201, "num_input_tokens_seen": 7080505, "step": 333, "time_per_iteration": 2.7204766273498535 }, { "auxiliary_loss_clip": 0.01509847, "auxiliary_loss_mlp": 0.01246584, "balance_loss_clip": 1.03719854, "balance_loss_mlp": 1.00759315, "epoch": 0.040161125473456384, "flos": 19280916384480.0, "grad_norm": 2.4669753632645666, "language_loss": 0.84566438, "learning_rate": 3.998929705871046e-06, "loss": 0.87322873, "num_input_tokens_seen": 7097860, "step": 334, "time_per_iteration": 2.7538278102874756 }, { "auxiliary_loss_clip": 0.01508858, "auxiliary_loss_mlp": 0.0124849, "balance_loss_clip": 1.03657353, "balance_loss_mlp": 1.00930846, "epoch": 0.040281368364095474, "flos": 17821111233600.0, "grad_norm": 2.3591476677580143, "language_loss": 0.89095151, "learning_rate": 3.99890407333324e-06, "loss": 0.91852504, "num_input_tokens_seen": 7116390, "step": 335, "time_per_iteration": 2.6806697845458984 }, { "auxiliary_loss_clip": 0.01508308, "auxiliary_loss_mlp": 0.01245656, "balance_loss_clip": 1.03537405, "balance_loss_mlp": 1.00685561, "epoch": 0.040401611254734564, "flos": 19573768321920.0, "grad_norm": 1.843942895422418, "language_loss": 0.87176275, "learning_rate": 3.998878137563275e-06, "loss": 0.89930236, "num_input_tokens_seen": 7135940, "step": 336, "time_per_iteration": 2.7277297973632812 }, { "auxiliary_loss_clip": 0.0150754, "auxiliary_loss_mlp": 0.0125033, "balance_loss_clip": 1.03593004, "balance_loss_mlp": 1.01038551, "epoch": 0.040521854145373654, "flos": 22054477916160.0, "grad_norm": 2.1571864112383303, "language_loss": 0.85424173, "learning_rate": 3.998851898565085e-06, "loss": 0.88182044, "num_input_tokens_seen": 7155745, "step": 337, "time_per_iteration": 2.6819944381713867 }, { "auxiliary_loss_clip": 0.01506061, "auxiliary_loss_mlp": 0.01248349, "balance_loss_clip": 1.03519714, "balance_loss_mlp": 1.00840497, "epoch": 0.04064209703601274, "flos": 22674655314720.0, "grad_norm": 2.022947429101867, "language_loss": 0.82951546, "learning_rate": 3.998825356342653e-06, "loss": 0.8570596, "num_input_tokens_seen": 7175920, "step": 338, "time_per_iteration": 2.727177143096924 }, { "auxiliary_loss_clip": 0.01505692, "auxiliary_loss_mlp": 0.01251398, "balance_loss_clip": 1.03622258, "balance_loss_mlp": 1.01030874, "epoch": 0.04076233992665183, "flos": 38582188286400.0, "grad_norm": 2.444458184477406, "language_loss": 0.73212433, "learning_rate": 3.998798510900003e-06, "loss": 0.75969523, "num_input_tokens_seen": 7198720, "step": 339, "time_per_iteration": 2.83905291557312 }, { "auxiliary_loss_clip": 0.0150288, "auxiliary_loss_mlp": 0.01249494, "balance_loss_clip": 1.03407431, "balance_loss_mlp": 1.00935888, "epoch": 0.04088258281729093, "flos": 25885321369920.0, "grad_norm": 1.998489431376658, "language_loss": 0.83934253, "learning_rate": 3.998771362241207e-06, "loss": 0.86686623, "num_input_tokens_seen": 7219125, "step": 340, "time_per_iteration": 2.8338563442230225 }, { "auxiliary_loss_clip": 0.01504508, "auxiliary_loss_mlp": 0.01245895, "balance_loss_clip": 1.03515935, "balance_loss_mlp": 1.00709462, "epoch": 0.04100282570793002, "flos": 19789841849760.0, "grad_norm": 1.7951032963596505, "language_loss": 0.87954742, "learning_rate": 3.998743910370385e-06, "loss": 0.90705144, "num_input_tokens_seen": 7237985, "step": 341, "time_per_iteration": 2.744910717010498 }, { "auxiliary_loss_clip": 0.01504569, "auxiliary_loss_mlp": 0.01245082, "balance_loss_clip": 1.03654134, "balance_loss_mlp": 1.00876117, "epoch": 0.04112306859856911, "flos": 22565163641760.0, "grad_norm": 1.973088756589312, "language_loss": 0.73281485, "learning_rate": 3.998716155291702e-06, "loss": 0.76031137, "num_input_tokens_seen": 7255825, "step": 342, "time_per_iteration": 2.6881401538848877 }, { "auxiliary_loss_clip": 0.01504568, "auxiliary_loss_mlp": 0.01245171, "balance_loss_clip": 1.03699255, "balance_loss_mlp": 1.00732493, "epoch": 0.0412433114892082, "flos": 25040664302400.0, "grad_norm": 1.9599930955189868, "language_loss": 0.90469092, "learning_rate": 3.998688097009366e-06, "loss": 0.93218839, "num_input_tokens_seen": 7276590, "step": 343, "time_per_iteration": 2.832521438598633 }, { "auxiliary_loss_clip": 0.01501368, "auxiliary_loss_mlp": 0.01245488, "balance_loss_clip": 1.03496468, "balance_loss_mlp": 1.00764203, "epoch": 0.04136355437984729, "flos": 25191383150880.0, "grad_norm": 2.493589081162619, "language_loss": 0.79829228, "learning_rate": 3.998659735527636e-06, "loss": 0.82576084, "num_input_tokens_seen": 7295680, "step": 344, "time_per_iteration": 2.8660309314727783 }, { "auxiliary_loss_clip": 0.01500753, "auxiliary_loss_mlp": 0.01243451, "balance_loss_clip": 1.03457248, "balance_loss_mlp": 1.0071305, "epoch": 0.04148379727048638, "flos": 22966788778560.0, "grad_norm": 2.2522670859342067, "language_loss": 0.77711987, "learning_rate": 3.998631070850813e-06, "loss": 0.80456197, "num_input_tokens_seen": 7316300, "step": 345, "time_per_iteration": 2.8237476348876953 }, { "auxiliary_loss_clip": 0.01502809, "auxiliary_loss_mlp": 0.01245053, "balance_loss_clip": 1.0363878, "balance_loss_mlp": 1.00911427, "epoch": 0.041604040161125476, "flos": 14063489745120.0, "grad_norm": 2.216157071949253, "language_loss": 0.83392942, "learning_rate": 3.9986021029832455e-06, "loss": 0.861408, "num_input_tokens_seen": 7333615, "step": 346, "time_per_iteration": 2.726844310760498 }, { "auxiliary_loss_clip": 0.01498933, "auxiliary_loss_mlp": 0.01245468, "balance_loss_clip": 1.03374803, "balance_loss_mlp": 1.00704956, "epoch": 0.041724283051764566, "flos": 12091885234560.0, "grad_norm": 2.8587799864606467, "language_loss": 0.91646516, "learning_rate": 3.9985728319293285e-06, "loss": 0.94390911, "num_input_tokens_seen": 7347590, "step": 347, "time_per_iteration": 2.6565895080566406 }, { "auxiliary_loss_clip": 0.01499496, "auxiliary_loss_mlp": 0.01246849, "balance_loss_clip": 1.03440309, "balance_loss_mlp": 1.0086211, "epoch": 0.041844525942403656, "flos": 12385311950880.0, "grad_norm": 2.132773330409099, "language_loss": 0.8507728, "learning_rate": 3.998543257693501e-06, "loss": 0.87823623, "num_input_tokens_seen": 7364345, "step": 348, "time_per_iteration": 2.690182685852051 }, { "auxiliary_loss_clip": 0.01498677, "auxiliary_loss_mlp": 0.01243343, "balance_loss_clip": 1.03478479, "balance_loss_mlp": 1.00740373, "epoch": 0.041964768833042745, "flos": 23769356502240.0, "grad_norm": 2.3505940680517785, "language_loss": 0.8783524, "learning_rate": 3.998513380280251e-06, "loss": 0.90577257, "num_input_tokens_seen": 7384625, "step": 349, "time_per_iteration": 2.732292890548706 }, { "auxiliary_loss_clip": 0.01497994, "auxiliary_loss_mlp": 0.01249778, "balance_loss_clip": 1.03447652, "balance_loss_mlp": 1.01212215, "epoch": 0.042085011723681835, "flos": 11875344698880.0, "grad_norm": 2.179098421312809, "language_loss": 0.94945771, "learning_rate": 3.99848319969411e-06, "loss": 0.97693539, "num_input_tokens_seen": 7402225, "step": 350, "time_per_iteration": 2.675565481185913 }, { "auxiliary_loss_clip": 0.01498587, "auxiliary_loss_mlp": 0.01247049, "balance_loss_clip": 1.03520751, "balance_loss_mlp": 1.0099659, "epoch": 0.042205254614320925, "flos": 16873967610720.0, "grad_norm": 2.176690155280794, "language_loss": 0.7916823, "learning_rate": 3.9984527159396564e-06, "loss": 0.81913865, "num_input_tokens_seen": 7420865, "step": 351, "time_per_iteration": 2.7099297046661377 }, { "auxiliary_loss_clip": 0.01494954, "auxiliary_loss_mlp": 0.01244757, "balance_loss_clip": 1.03308237, "balance_loss_mlp": 1.00862765, "epoch": 0.04232549750496002, "flos": 25118520422400.0, "grad_norm": 5.872861320891902, "language_loss": 0.84532416, "learning_rate": 3.9984219290215154e-06, "loss": 0.87272125, "num_input_tokens_seen": 7441040, "step": 352, "time_per_iteration": 2.7600889205932617 }, { "auxiliary_loss_clip": 0.014971, "auxiliary_loss_mlp": 0.01244825, "balance_loss_clip": 1.035151, "balance_loss_mlp": 1.00888586, "epoch": 0.04244574039559911, "flos": 26724553961760.0, "grad_norm": 1.609479197792013, "language_loss": 0.89037848, "learning_rate": 3.998390838944356e-06, "loss": 0.9177978, "num_input_tokens_seen": 7462545, "step": 353, "time_per_iteration": 2.7375755310058594 }, { "auxiliary_loss_clip": 0.01494886, "auxiliary_loss_mlp": 0.0124644, "balance_loss_clip": 1.03308487, "balance_loss_mlp": 1.01011992, "epoch": 0.0425659832862382, "flos": 20923255555200.0, "grad_norm": 2.0229969006376396, "language_loss": 0.90429801, "learning_rate": 3.998359445712895e-06, "loss": 0.93171126, "num_input_tokens_seen": 7481650, "step": 354, "time_per_iteration": 2.8291471004486084 }, { "auxiliary_loss_clip": 0.01493598, "auxiliary_loss_mlp": 0.01245209, "balance_loss_clip": 1.03283918, "balance_loss_mlp": 1.00831628, "epoch": 0.04268622617687729, "flos": 23331138344640.0, "grad_norm": 2.253075275131514, "language_loss": 0.81501758, "learning_rate": 3.9983277493318955e-06, "loss": 0.84240562, "num_input_tokens_seen": 7500945, "step": 355, "time_per_iteration": 2.773411989212036 }, { "auxiliary_loss_clip": 0.01494613, "auxiliary_loss_mlp": 0.01244749, "balance_loss_clip": 1.03358209, "balance_loss_mlp": 1.00804663, "epoch": 0.04280646906751638, "flos": 25994022721920.0, "grad_norm": 1.6414631532310346, "language_loss": 0.81047583, "learning_rate": 3.998295749806165e-06, "loss": 0.83786941, "num_input_tokens_seen": 7522170, "step": 356, "time_per_iteration": 2.8700742721557617 }, { "auxiliary_loss_clip": 0.0149417, "auxiliary_loss_mlp": 0.01244533, "balance_loss_clip": 1.03387702, "balance_loss_mlp": 1.00935721, "epoch": 0.04292671195815547, "flos": 26906836515840.0, "grad_norm": 2.0797550534661373, "language_loss": 0.83234435, "learning_rate": 3.998263447140558e-06, "loss": 0.85973138, "num_input_tokens_seen": 7542370, "step": 357, "time_per_iteration": 3.731904983520508 }, { "auxiliary_loss_clip": 0.01491398, "auxiliary_loss_mlp": 0.01242476, "balance_loss_clip": 1.03211212, "balance_loss_mlp": 1.00749111, "epoch": 0.04304695484879457, "flos": 39457331349120.0, "grad_norm": 1.7513429833314533, "language_loss": 0.81478816, "learning_rate": 3.998230841339976e-06, "loss": 0.84212691, "num_input_tokens_seen": 7564380, "step": 358, "time_per_iteration": 4.754363059997559 }, { "auxiliary_loss_clip": 0.01494363, "auxiliary_loss_mlp": 0.01245059, "balance_loss_clip": 1.0343101, "balance_loss_mlp": 1.01064539, "epoch": 0.04316719773943366, "flos": 19646307737280.0, "grad_norm": 2.1493121651823857, "language_loss": 0.84934646, "learning_rate": 3.998197932409363e-06, "loss": 0.87674069, "num_input_tokens_seen": 7582390, "step": 359, "time_per_iteration": 2.7435436248779297 }, { "auxiliary_loss_clip": 0.01491308, "auxiliary_loss_mlp": 0.01244291, "balance_loss_clip": 1.03287733, "balance_loss_mlp": 1.00987816, "epoch": 0.04328744063007275, "flos": 22452331066560.0, "grad_norm": 2.3418380956888094, "language_loss": 0.86019397, "learning_rate": 3.9981647203537125e-06, "loss": 0.88755, "num_input_tokens_seen": 7599890, "step": 360, "time_per_iteration": 2.7106404304504395 }, { "auxiliary_loss_clip": 0.01491672, "auxiliary_loss_mlp": 0.01244412, "balance_loss_clip": 1.03314161, "balance_loss_mlp": 1.00828183, "epoch": 0.04340768352071184, "flos": 21283042813920.0, "grad_norm": 2.293908798547335, "language_loss": 0.95760882, "learning_rate": 3.998131205178063e-06, "loss": 0.98496968, "num_input_tokens_seen": 7618360, "step": 361, "time_per_iteration": 2.654909610748291 }, { "auxiliary_loss_clip": 0.01491175, "auxiliary_loss_mlp": 0.01243564, "balance_loss_clip": 1.03276503, "balance_loss_mlp": 1.00819683, "epoch": 0.04352792641135093, "flos": 11583714166560.0, "grad_norm": 2.7815747903053447, "language_loss": 0.77298319, "learning_rate": 3.998097386887498e-06, "loss": 0.80033058, "num_input_tokens_seen": 7635435, "step": 362, "time_per_iteration": 2.6769371032714844 }, { "auxiliary_loss_clip": 0.01491037, "auxiliary_loss_mlp": 0.01242234, "balance_loss_clip": 1.03373957, "balance_loss_mlp": 1.00839305, "epoch": 0.04364816930199002, "flos": 23623559197920.0, "grad_norm": 1.7306302961663604, "language_loss": 0.84691387, "learning_rate": 3.998063265487148e-06, "loss": 0.8742466, "num_input_tokens_seen": 7656485, "step": 363, "time_per_iteration": 2.713604211807251 }, { "auxiliary_loss_clip": 0.01491783, "auxiliary_loss_mlp": 0.01240787, "balance_loss_clip": 1.0345279, "balance_loss_mlp": 1.00675583, "epoch": 0.043768412192629114, "flos": 14429743266240.0, "grad_norm": 1.8226893912064648, "language_loss": 0.81109959, "learning_rate": 3.99802884098219e-06, "loss": 0.83842528, "num_input_tokens_seen": 7674595, "step": 364, "time_per_iteration": 2.6867482662200928 }, { "auxiliary_loss_clip": 0.01488348, "auxiliary_loss_mlp": 0.01242909, "balance_loss_clip": 1.03143573, "balance_loss_mlp": 1.0073514, "epoch": 0.043888655083268203, "flos": 26468905747680.0, "grad_norm": 2.170313545981329, "language_loss": 0.82325363, "learning_rate": 3.997994113377845e-06, "loss": 0.85056615, "num_input_tokens_seen": 7693495, "step": 365, "time_per_iteration": 2.783507823944092 }, { "auxiliary_loss_clip": 0.0148968, "auxiliary_loss_mlp": 0.01242751, "balance_loss_clip": 1.03379488, "balance_loss_mlp": 1.00776517, "epoch": 0.04400889797390729, "flos": 27235275611040.0, "grad_norm": 2.3309180981562574, "language_loss": 0.83398318, "learning_rate": 3.9979590826793815e-06, "loss": 0.86130744, "num_input_tokens_seen": 7714685, "step": 366, "time_per_iteration": 2.734586715698242 }, { "auxiliary_loss_clip": 0.01489532, "auxiliary_loss_mlp": 0.01243892, "balance_loss_clip": 1.03355527, "balance_loss_mlp": 1.00890648, "epoch": 0.04412914086454638, "flos": 20119538273760.0, "grad_norm": 1.8063895857480794, "language_loss": 0.80979705, "learning_rate": 3.997923748892113e-06, "loss": 0.83713138, "num_input_tokens_seen": 7734005, "step": 367, "time_per_iteration": 2.813788652420044 }, { "auxiliary_loss_clip": 0.01488019, "auxiliary_loss_mlp": 0.0124123, "balance_loss_clip": 1.03335309, "balance_loss_mlp": 1.00796127, "epoch": 0.04424938375518547, "flos": 22604630556960.0, "grad_norm": 1.7347179465727138, "language_loss": 0.88567644, "learning_rate": 3.9978881120214015e-06, "loss": 0.91296893, "num_input_tokens_seen": 7755525, "step": 368, "time_per_iteration": 2.758821725845337 }, { "auxiliary_loss_clip": 0.0148666, "auxiliary_loss_mlp": 0.0124284, "balance_loss_clip": 1.03193736, "balance_loss_mlp": 1.00804543, "epoch": 0.04436962664582456, "flos": 24132376892160.0, "grad_norm": 1.8256292456257313, "language_loss": 0.79498982, "learning_rate": 3.997852172072652e-06, "loss": 0.82228482, "num_input_tokens_seen": 7776740, "step": 369, "time_per_iteration": 2.70344614982605 }, { "auxiliary_loss_clip": 0.01486311, "auxiliary_loss_mlp": 0.01241204, "balance_loss_clip": 1.0319556, "balance_loss_mlp": 1.00679088, "epoch": 0.04448986953646366, "flos": 18222915988800.0, "grad_norm": 3.076510686612557, "language_loss": 0.89112854, "learning_rate": 3.9978159290513155e-06, "loss": 0.91840369, "num_input_tokens_seen": 7794820, "step": 370, "time_per_iteration": 2.6884572505950928 }, { "auxiliary_loss_clip": 0.01488202, "auxiliary_loss_mlp": 0.01243555, "balance_loss_clip": 1.0332073, "balance_loss_mlp": 1.00799704, "epoch": 0.04461011242710275, "flos": 30117933655200.0, "grad_norm": 1.8279057285432216, "language_loss": 0.80139244, "learning_rate": 3.997779382962892e-06, "loss": 0.82871002, "num_input_tokens_seen": 7817705, "step": 371, "time_per_iteration": 2.8020427227020264 }, { "auxiliary_loss_clip": 0.01486071, "auxiliary_loss_mlp": 0.01242312, "balance_loss_clip": 1.03267264, "balance_loss_mlp": 1.00751734, "epoch": 0.04473035531774184, "flos": 29752542302400.0, "grad_norm": 2.176522120355585, "language_loss": 0.73948306, "learning_rate": 3.997742533812924e-06, "loss": 0.76676691, "num_input_tokens_seen": 7840970, "step": 372, "time_per_iteration": 2.7917356491088867 }, { "auxiliary_loss_clip": 0.01486227, "auxiliary_loss_mlp": 0.01243244, "balance_loss_clip": 1.03263128, "balance_loss_mlp": 1.00959432, "epoch": 0.04485059820838093, "flos": 13151574043200.0, "grad_norm": 2.9434870414320424, "language_loss": 0.92227638, "learning_rate": 3.997705381607001e-06, "loss": 0.94957101, "num_input_tokens_seen": 7857785, "step": 373, "time_per_iteration": 2.8402764797210693 }, { "auxiliary_loss_clip": 0.01488649, "auxiliary_loss_mlp": 0.01224672, "balance_loss_clip": 1.04292309, "balance_loss_mlp": 1.0003674, "epoch": 0.04497084109902002, "flos": 68094242167680.0, "grad_norm": 1.309800601899897, "language_loss": 0.60322559, "learning_rate": 3.997667926350761e-06, "loss": 0.63035882, "num_input_tokens_seen": 7916115, "step": 374, "time_per_iteration": 3.223118543624878 }, { "auxiliary_loss_clip": 0.01487855, "auxiliary_loss_mlp": 0.01224597, "balance_loss_clip": 1.04249644, "balance_loss_mlp": 1.00029242, "epoch": 0.04509108398965911, "flos": 64342296620640.0, "grad_norm": 0.9013919955019795, "language_loss": 0.57865489, "learning_rate": 3.997630168049886e-06, "loss": 0.60577941, "num_input_tokens_seen": 7974480, "step": 375, "time_per_iteration": 3.2670998573303223 }, { "auxiliary_loss_clip": 0.01486402, "auxiliary_loss_mlp": 0.01241473, "balance_loss_clip": 1.03402519, "balance_loss_mlp": 1.0083952, "epoch": 0.045211326880298205, "flos": 22271126222880.0, "grad_norm": 1.8971795812372692, "language_loss": 0.77640051, "learning_rate": 3.997592106710101e-06, "loss": 0.80367923, "num_input_tokens_seen": 7993940, "step": 376, "time_per_iteration": 2.7624635696411133 }, { "auxiliary_loss_clip": 0.01484306, "auxiliary_loss_mlp": 0.01239809, "balance_loss_clip": 1.03221822, "balance_loss_mlp": 1.00711226, "epoch": 0.045331569770937295, "flos": 32159455152480.0, "grad_norm": 2.5771571190834046, "language_loss": 0.6607461, "learning_rate": 3.997553742337182e-06, "loss": 0.68798721, "num_input_tokens_seen": 8013365, "step": 377, "time_per_iteration": 2.799708366394043 }, { "auxiliary_loss_clip": 0.01482724, "auxiliary_loss_mlp": 0.012408, "balance_loss_clip": 1.03180528, "balance_loss_mlp": 1.00829411, "epoch": 0.045451812661576385, "flos": 22163466657600.0, "grad_norm": 1.7560079368577617, "language_loss": 0.91166008, "learning_rate": 3.997515074936949e-06, "loss": 0.93889534, "num_input_tokens_seen": 8034240, "step": 378, "time_per_iteration": 2.9296774864196777 }, { "auxiliary_loss_clip": 0.01482785, "auxiliary_loss_mlp": 0.01242291, "balance_loss_clip": 1.03247356, "balance_loss_mlp": 1.0080688, "epoch": 0.045572055552215475, "flos": 16581977841600.0, "grad_norm": 2.42494493303557, "language_loss": 0.86897933, "learning_rate": 3.997476104515268e-06, "loss": 0.89623004, "num_input_tokens_seen": 8052430, "step": 379, "time_per_iteration": 2.70711350440979 }, { "auxiliary_loss_clip": 0.0148336, "auxiliary_loss_mlp": 0.01238751, "balance_loss_clip": 1.03279138, "balance_loss_mlp": 1.00872517, "epoch": 0.045692298442854565, "flos": 17603385216480.0, "grad_norm": 1.8555181553241906, "language_loss": 0.7753458, "learning_rate": 3.9974368310780485e-06, "loss": 0.80256695, "num_input_tokens_seen": 8069605, "step": 380, "time_per_iteration": 2.744906425476074 }, { "auxiliary_loss_clip": 0.01484699, "auxiliary_loss_mlp": 0.01244101, "balance_loss_clip": 1.03291833, "balance_loss_mlp": 1.0098784, "epoch": 0.045812541333493655, "flos": 26761111058880.0, "grad_norm": 2.4737985440592922, "language_loss": 0.74363923, "learning_rate": 3.997397254631251e-06, "loss": 0.77092725, "num_input_tokens_seen": 8090225, "step": 381, "time_per_iteration": 2.7082479000091553 }, { "auxiliary_loss_clip": 0.01480534, "auxiliary_loss_mlp": 0.01222639, "balance_loss_clip": 1.0390178, "balance_loss_mlp": 0.99986118, "epoch": 0.04593278422413275, "flos": 60250380477120.0, "grad_norm": 0.8154162541611637, "language_loss": 0.60023308, "learning_rate": 3.997357375180878e-06, "loss": 0.62726474, "num_input_tokens_seen": 8154505, "step": 382, "time_per_iteration": 3.3279731273651123 }, { "auxiliary_loss_clip": 0.01481372, "auxiliary_loss_mlp": 0.01238564, "balance_loss_clip": 1.03172171, "balance_loss_mlp": 1.00605786, "epoch": 0.04605302711477184, "flos": 21799260786240.0, "grad_norm": 1.8064298774730319, "language_loss": 0.7527383, "learning_rate": 3.997317192732979e-06, "loss": 0.77993762, "num_input_tokens_seen": 8173285, "step": 383, "time_per_iteration": 3.681335210800171 }, { "auxiliary_loss_clip": 0.01482864, "auxiliary_loss_mlp": 0.01238571, "balance_loss_clip": 1.03283942, "balance_loss_mlp": 1.00663781, "epoch": 0.04617327000541093, "flos": 19459714341600.0, "grad_norm": 1.9243661559366327, "language_loss": 0.8248837, "learning_rate": 3.99727670729365e-06, "loss": 0.85209799, "num_input_tokens_seen": 8191845, "step": 384, "time_per_iteration": 3.594774007797241 }, { "auxiliary_loss_clip": 0.01481507, "auxiliary_loss_mlp": 0.01238444, "balance_loss_clip": 1.03269339, "balance_loss_mlp": 1.00746417, "epoch": 0.04629351289605002, "flos": 25411480130880.0, "grad_norm": 2.0377285790517576, "language_loss": 0.78044736, "learning_rate": 3.997235918869033e-06, "loss": 0.80764687, "num_input_tokens_seen": 8212880, "step": 385, "time_per_iteration": 3.7438278198242188 }, { "auxiliary_loss_clip": 0.01479785, "auxiliary_loss_mlp": 0.0123849, "balance_loss_clip": 1.03164625, "balance_loss_mlp": 1.00712872, "epoch": 0.04641375578668911, "flos": 20558295286560.0, "grad_norm": 2.027431466456557, "language_loss": 0.82423007, "learning_rate": 3.997194827465315e-06, "loss": 0.85141277, "num_input_tokens_seen": 8231475, "step": 386, "time_per_iteration": 3.7498619556427 }, { "auxiliary_loss_clip": 0.01478719, "auxiliary_loss_mlp": 0.01235941, "balance_loss_clip": 1.03073227, "balance_loss_mlp": 1.0061059, "epoch": 0.0465339986773282, "flos": 13188669995520.0, "grad_norm": 2.8403171391602458, "language_loss": 0.91439199, "learning_rate": 3.997153433088728e-06, "loss": 0.94153857, "num_input_tokens_seen": 8248600, "step": 387, "time_per_iteration": 2.7976765632629395 }, { "auxiliary_loss_clip": 0.01478486, "auxiliary_loss_mlp": 0.01236724, "balance_loss_clip": 1.03129077, "balance_loss_mlp": 1.00555336, "epoch": 0.0466542415679673, "flos": 25556558961600.0, "grad_norm": 3.7042195137924727, "language_loss": 0.81282425, "learning_rate": 3.997111735745554e-06, "loss": 0.83997631, "num_input_tokens_seen": 8271570, "step": 388, "time_per_iteration": 2.7971079349517822 }, { "auxiliary_loss_clip": 0.01478533, "auxiliary_loss_mlp": 0.01238427, "balance_loss_clip": 1.03167605, "balance_loss_mlp": 1.00725639, "epoch": 0.04677448445860639, "flos": 22236760470240.0, "grad_norm": 3.5943423363458376, "language_loss": 0.82581806, "learning_rate": 3.997069735442118e-06, "loss": 0.85298765, "num_input_tokens_seen": 8291265, "step": 389, "time_per_iteration": 2.7341549396514893 }, { "auxiliary_loss_clip": 0.01478072, "auxiliary_loss_mlp": 0.01236772, "balance_loss_clip": 1.03108525, "balance_loss_mlp": 1.00674558, "epoch": 0.04689472734924548, "flos": 28147837939200.0, "grad_norm": 1.4659659710177348, "language_loss": 0.80481088, "learning_rate": 3.997027432184792e-06, "loss": 0.83195931, "num_input_tokens_seen": 8315925, "step": 390, "time_per_iteration": 2.7738285064697266 }, { "auxiliary_loss_clip": 0.01476261, "auxiliary_loss_mlp": 0.012376, "balance_loss_clip": 1.03035855, "balance_loss_mlp": 1.00757408, "epoch": 0.04701497023988457, "flos": 23148963561600.0, "grad_norm": 1.8430804599444843, "language_loss": 0.89439774, "learning_rate": 3.99698482597999e-06, "loss": 0.92153645, "num_input_tokens_seen": 8333605, "step": 391, "time_per_iteration": 2.7413175106048584 }, { "auxiliary_loss_clip": 0.01472129, "auxiliary_loss_mlp": 0.01222673, "balance_loss_clip": 1.03541672, "balance_loss_mlp": 0.99989456, "epoch": 0.04713521313052366, "flos": 64827705284640.0, "grad_norm": 0.8644086479342992, "language_loss": 0.63951647, "learning_rate": 3.99694191683418e-06, "loss": 0.66646445, "num_input_tokens_seen": 8394405, "step": 392, "time_per_iteration": 3.282386064529419 }, { "auxiliary_loss_clip": 0.01477635, "auxiliary_loss_mlp": 0.01239911, "balance_loss_clip": 1.03176069, "balance_loss_mlp": 1.00912213, "epoch": 0.047255456021162746, "flos": 18771595758720.0, "grad_norm": 2.1217603106191025, "language_loss": 0.8167305, "learning_rate": 3.996898704753867e-06, "loss": 0.84390604, "num_input_tokens_seen": 8412355, "step": 393, "time_per_iteration": 2.68327260017395 }, { "auxiliary_loss_clip": 0.01474346, "auxiliary_loss_mlp": 0.01239831, "balance_loss_clip": 1.02947474, "balance_loss_mlp": 1.00885081, "epoch": 0.04737569891180184, "flos": 22053831289920.0, "grad_norm": 2.359496844586914, "language_loss": 0.87703323, "learning_rate": 3.996855189745609e-06, "loss": 0.90417492, "num_input_tokens_seen": 8431620, "step": 394, "time_per_iteration": 2.7567660808563232 }, { "auxiliary_loss_clip": 0.01475832, "auxiliary_loss_mlp": 0.01237299, "balance_loss_clip": 1.03049481, "balance_loss_mlp": 1.00727284, "epoch": 0.04749594180244093, "flos": 29057382678240.0, "grad_norm": 1.7942750691866356, "language_loss": 0.92881131, "learning_rate": 3.996811371816007e-06, "loss": 0.95594263, "num_input_tokens_seen": 8454045, "step": 395, "time_per_iteration": 2.8053977489471436 }, { "auxiliary_loss_clip": 0.01475459, "auxiliary_loss_mlp": 0.01236895, "balance_loss_clip": 1.03081727, "balance_loss_mlp": 1.00744104, "epoch": 0.04761618469308002, "flos": 35112281649120.0, "grad_norm": 1.851615521388968, "language_loss": 0.77988446, "learning_rate": 3.996767250971707e-06, "loss": 0.80700797, "num_input_tokens_seen": 8476785, "step": 396, "time_per_iteration": 2.8565096855163574 }, { "auxiliary_loss_clip": 0.01475295, "auxiliary_loss_mlp": 0.01236532, "balance_loss_clip": 1.03076935, "balance_loss_mlp": 1.00784123, "epoch": 0.04773642758371911, "flos": 25630714942560.0, "grad_norm": 2.010267383913653, "language_loss": 0.86732507, "learning_rate": 3.996722827219403e-06, "loss": 0.89444327, "num_input_tokens_seen": 8498400, "step": 397, "time_per_iteration": 2.778878927230835 }, { "auxiliary_loss_clip": 0.01474582, "auxiliary_loss_mlp": 0.012406, "balance_loss_clip": 1.03079152, "balance_loss_mlp": 1.00942993, "epoch": 0.0478566704743582, "flos": 20631517251840.0, "grad_norm": 2.56827640908335, "language_loss": 0.82787049, "learning_rate": 3.996678100565833e-06, "loss": 0.85502231, "num_input_tokens_seen": 8517455, "step": 398, "time_per_iteration": 2.7512924671173096 }, { "auxiliary_loss_clip": 0.01472663, "auxiliary_loss_mlp": 0.01234782, "balance_loss_clip": 1.03000283, "balance_loss_mlp": 1.00704515, "epoch": 0.04797691336499729, "flos": 18835729032960.0, "grad_norm": 2.9778575066635162, "language_loss": 0.88488001, "learning_rate": 3.996633071017783e-06, "loss": 0.91195446, "num_input_tokens_seen": 8534085, "step": 399, "time_per_iteration": 3.0085482597351074 }, { "auxiliary_loss_clip": 0.0147293, "auxiliary_loss_mlp": 0.01234602, "balance_loss_clip": 1.03040695, "balance_loss_mlp": 1.00552917, "epoch": 0.04809715625563638, "flos": 21099682549440.0, "grad_norm": 2.237329216751116, "language_loss": 0.81675619, "learning_rate": 3.996587738582084e-06, "loss": 0.84383154, "num_input_tokens_seen": 8550885, "step": 400, "time_per_iteration": 2.816953182220459 }, { "auxiliary_loss_clip": 0.0147197, "auxiliary_loss_mlp": 0.01235079, "balance_loss_clip": 1.02952874, "balance_loss_mlp": 1.00600636, "epoch": 0.04821739914627548, "flos": 23805662133600.0, "grad_norm": 3.054084262814843, "language_loss": 0.86126828, "learning_rate": 3.9965421032656115e-06, "loss": 0.88833874, "num_input_tokens_seen": 8570815, "step": 401, "time_per_iteration": 2.795196771621704 }, { "auxiliary_loss_clip": 0.01470774, "auxiliary_loss_mlp": 0.01232729, "balance_loss_clip": 1.0291667, "balance_loss_mlp": 1.00518274, "epoch": 0.04833764203691457, "flos": 22200598533600.0, "grad_norm": 5.505710689845141, "language_loss": 0.94106174, "learning_rate": 3.99649616507529e-06, "loss": 0.96809679, "num_input_tokens_seen": 8589910, "step": 402, "time_per_iteration": 2.7989730834960938 }, { "auxiliary_loss_clip": 0.0146485, "auxiliary_loss_mlp": 0.01221197, "balance_loss_clip": 1.03257406, "balance_loss_mlp": 0.99994463, "epoch": 0.04845788492755366, "flos": 65904408708480.0, "grad_norm": 0.8974837625775585, "language_loss": 0.63173628, "learning_rate": 3.996449924018088e-06, "loss": 0.65859675, "num_input_tokens_seen": 8650370, "step": 403, "time_per_iteration": 3.2109405994415283 }, { "auxiliary_loss_clip": 0.01471627, "auxiliary_loss_mlp": 0.01235979, "balance_loss_clip": 1.03014255, "balance_loss_mlp": 1.00843251, "epoch": 0.04857812781819275, "flos": 19281311544960.0, "grad_norm": 1.755068794885718, "language_loss": 0.79214418, "learning_rate": 3.99640338010102e-06, "loss": 0.81922019, "num_input_tokens_seen": 8669475, "step": 404, "time_per_iteration": 2.754232406616211 }, { "auxiliary_loss_clip": 0.01470407, "auxiliary_loss_mlp": 0.01230983, "balance_loss_clip": 1.02915621, "balance_loss_mlp": 1.0051533, "epoch": 0.04869837070883184, "flos": 24062388058080.0, "grad_norm": 1.9114525213795721, "language_loss": 0.78503382, "learning_rate": 3.996356533331146e-06, "loss": 0.81204778, "num_input_tokens_seen": 8691345, "step": 405, "time_per_iteration": 2.8572824001312256 }, { "auxiliary_loss_clip": 0.01470294, "auxiliary_loss_mlp": 0.0123627, "balance_loss_clip": 1.02863348, "balance_loss_mlp": 1.00700629, "epoch": 0.04881861359947093, "flos": 25187180080320.0, "grad_norm": 2.291474135969221, "language_loss": 0.61826926, "learning_rate": 3.996309383715573e-06, "loss": 0.64533484, "num_input_tokens_seen": 8710125, "step": 406, "time_per_iteration": 2.762115001678467 }, { "auxiliary_loss_clip": 0.01469512, "auxiliary_loss_mlp": 0.01231872, "balance_loss_clip": 1.02868843, "balance_loss_mlp": 1.00508797, "epoch": 0.048938856490110025, "flos": 16362922648320.0, "grad_norm": 10.593584264702956, "language_loss": 0.73826325, "learning_rate": 3.996261931261454e-06, "loss": 0.76527703, "num_input_tokens_seen": 8728705, "step": 407, "time_per_iteration": 2.807014226913452 }, { "auxiliary_loss_clip": 0.01470989, "auxiliary_loss_mlp": 0.01233365, "balance_loss_clip": 1.03002393, "balance_loss_mlp": 1.0060091, "epoch": 0.049059099380749115, "flos": 29895106475520.0, "grad_norm": 1.755233313870908, "language_loss": 0.86731529, "learning_rate": 3.996214175975987e-06, "loss": 0.89435875, "num_input_tokens_seen": 8749225, "step": 408, "time_per_iteration": 2.7805354595184326 }, { "auxiliary_loss_clip": 0.01469914, "auxiliary_loss_mlp": 0.0123357, "balance_loss_clip": 1.02951097, "balance_loss_mlp": 1.0064044, "epoch": 0.049179342271388204, "flos": 35918872824960.0, "grad_norm": 2.183782663005992, "language_loss": 0.79170561, "learning_rate": 3.996166117866417e-06, "loss": 0.81874049, "num_input_tokens_seen": 8771160, "step": 409, "time_per_iteration": 2.9146201610565186 }, { "auxiliary_loss_clip": 0.01467774, "auxiliary_loss_mlp": 0.01233885, "balance_loss_clip": 1.02841306, "balance_loss_mlp": 1.00729203, "epoch": 0.049299585162027294, "flos": 14611235499360.0, "grad_norm": 4.49788825641225, "language_loss": 0.86572248, "learning_rate": 3.996117756940035e-06, "loss": 0.89273906, "num_input_tokens_seen": 8787845, "step": 410, "time_per_iteration": 4.579894304275513 }, { "auxiliary_loss_clip": 0.01468255, "auxiliary_loss_mlp": 0.01231694, "balance_loss_clip": 1.02861774, "balance_loss_mlp": 1.00510049, "epoch": 0.049419828052666384, "flos": 19567948685760.0, "grad_norm": 2.024611522433769, "language_loss": 0.97399658, "learning_rate": 3.996069093204175e-06, "loss": 1.00099599, "num_input_tokens_seen": 8803805, "step": 411, "time_per_iteration": 2.7221293449401855 }, { "auxiliary_loss_clip": 0.01468147, "auxiliary_loss_mlp": 0.01232683, "balance_loss_clip": 1.02919579, "balance_loss_mlp": 1.00608969, "epoch": 0.049540070943305474, "flos": 13659924729600.0, "grad_norm": 2.5470343520480982, "language_loss": 0.88104415, "learning_rate": 3.996020126666221e-06, "loss": 0.90805244, "num_input_tokens_seen": 8820785, "step": 412, "time_per_iteration": 4.644726037979126 }, { "auxiliary_loss_clip": 0.0146818, "auxiliary_loss_mlp": 0.01231696, "balance_loss_clip": 1.02906799, "balance_loss_mlp": 1.00510252, "epoch": 0.04966031383394457, "flos": 21832045896960.0, "grad_norm": 2.0026298823689097, "language_loss": 0.82078612, "learning_rate": 3.995970857333601e-06, "loss": 0.84778494, "num_input_tokens_seen": 8841195, "step": 413, "time_per_iteration": 2.8129498958587646 }, { "auxiliary_loss_clip": 0.01467624, "auxiliary_loss_mlp": 0.01232742, "balance_loss_clip": 1.02859509, "balance_loss_mlp": 1.00595784, "epoch": 0.04978055672458366, "flos": 28618805283840.0, "grad_norm": 1.9452610201698157, "language_loss": 0.79989922, "learning_rate": 3.995921285213789e-06, "loss": 0.82690287, "num_input_tokens_seen": 8861455, "step": 414, "time_per_iteration": 2.8101558685302734 }, { "auxiliary_loss_clip": 0.0146609, "auxiliary_loss_mlp": 0.01231774, "balance_loss_clip": 1.02831495, "balance_loss_mlp": 1.00594354, "epoch": 0.04990079961522275, "flos": 19828231054560.0, "grad_norm": 2.5890410839998514, "language_loss": 0.80613124, "learning_rate": 3.995871410314305e-06, "loss": 0.83310992, "num_input_tokens_seen": 8880015, "step": 415, "time_per_iteration": 2.751798152923584 }, { "auxiliary_loss_clip": 0.01432156, "auxiliary_loss_mlp": 0.01218313, "balance_loss_clip": 1.02849507, "balance_loss_mlp": 1.00011277, "epoch": 0.05002104250586184, "flos": 62735077599840.0, "grad_norm": 0.9084509713995458, "language_loss": 0.59634233, "learning_rate": 3.995821232642714e-06, "loss": 0.62284696, "num_input_tokens_seen": 8938420, "step": 416, "time_per_iteration": 3.3874709606170654 }, { "auxiliary_loss_clip": 0.01441569, "auxiliary_loss_mlp": 0.01231723, "balance_loss_clip": 1.0287354, "balance_loss_mlp": 1.0064652, "epoch": 0.05014128539650093, "flos": 27928531280160.0, "grad_norm": 2.2105239693442917, "language_loss": 0.8262133, "learning_rate": 3.995770752206629e-06, "loss": 0.85294622, "num_input_tokens_seen": 8959495, "step": 417, "time_per_iteration": 2.800260543823242 }, { "auxiliary_loss_clip": 0.01465848, "auxiliary_loss_mlp": 0.01231981, "balance_loss_clip": 1.02820635, "balance_loss_mlp": 1.00672328, "epoch": 0.05026152828714002, "flos": 17705584382400.0, "grad_norm": 16.140212492342105, "language_loss": 0.97197074, "learning_rate": 3.995719969013709e-06, "loss": 0.99894905, "num_input_tokens_seen": 8976675, "step": 418, "time_per_iteration": 2.7166049480438232 }, { "auxiliary_loss_clip": 0.01412315, "auxiliary_loss_mlp": 0.01232412, "balance_loss_clip": 1.02655983, "balance_loss_mlp": 1.00639105, "epoch": 0.05038177117777912, "flos": 19133286972480.0, "grad_norm": 2.6139310658815123, "language_loss": 0.86202931, "learning_rate": 3.995668883071655e-06, "loss": 0.88847655, "num_input_tokens_seen": 8992900, "step": 419, "time_per_iteration": 2.768481492996216 }, { "auxiliary_loss_clip": 0.01465762, "auxiliary_loss_mlp": 0.01231796, "balance_loss_clip": 1.02866507, "balance_loss_mlp": 1.00596547, "epoch": 0.050502014068418206, "flos": 20667715112160.0, "grad_norm": 2.384449564498219, "language_loss": 0.90550768, "learning_rate": 3.995617494388219e-06, "loss": 0.93248332, "num_input_tokens_seen": 9011020, "step": 420, "time_per_iteration": 2.7489140033721924 }, { "auxiliary_loss_clip": 0.01422167, "auxiliary_loss_mlp": 0.01232219, "balance_loss_clip": 1.02540576, "balance_loss_mlp": 1.00600767, "epoch": 0.050622256959057296, "flos": 21361114476000.0, "grad_norm": 1.9909993502805192, "language_loss": 0.80422288, "learning_rate": 3.995565802971196e-06, "loss": 0.83076674, "num_input_tokens_seen": 9030995, "step": 421, "time_per_iteration": 2.8370420932769775 }, { "auxiliary_loss_clip": 0.01428575, "auxiliary_loss_mlp": 0.01230059, "balance_loss_clip": 1.02799964, "balance_loss_mlp": 1.00422907, "epoch": 0.050742499849696386, "flos": 27673601539680.0, "grad_norm": 2.077099661100573, "language_loss": 0.67751086, "learning_rate": 3.995513808828427e-06, "loss": 0.70409715, "num_input_tokens_seen": 9053790, "step": 422, "time_per_iteration": 2.9158780574798584 }, { "auxiliary_loss_clip": 0.01420367, "auxiliary_loss_mlp": 0.01228821, "balance_loss_clip": 1.02552319, "balance_loss_mlp": 1.0052799, "epoch": 0.050862742740335476, "flos": 19866009556800.0, "grad_norm": 3.100478061927443, "language_loss": 0.76710808, "learning_rate": 3.9954615119678e-06, "loss": 0.79359996, "num_input_tokens_seen": 9072345, "step": 423, "time_per_iteration": 0.21869254112243652 }, { "auxiliary_loss_clip": 0.01450227, "auxiliary_loss_mlp": 0.01231813, "balance_loss_clip": 1.02718115, "balance_loss_mlp": 1.00617349, "epoch": 0.050982985630974566, "flos": 22085107606080.0, "grad_norm": 2.102397455363715, "language_loss": 0.80759811, "learning_rate": 3.995408912397248e-06, "loss": 0.83441854, "num_input_tokens_seen": 9090240, "step": 424, "time_per_iteration": 2.7736947536468506 }, { "auxiliary_loss_clip": 0.01412628, "auxiliary_loss_mlp": 0.01234752, "balance_loss_clip": 1.02556503, "balance_loss_mlp": 1.00777709, "epoch": 0.05110322852161366, "flos": 20740973001120.0, "grad_norm": 2.895054934985047, "language_loss": 0.93267906, "learning_rate": 3.99535601012475e-06, "loss": 0.95915294, "num_input_tokens_seen": 9105570, "step": 425, "time_per_iteration": 2.8772687911987305 }, { "auxiliary_loss_clip": 0.01394705, "auxiliary_loss_mlp": 0.00874784, "balance_loss_clip": 1.02525091, "balance_loss_mlp": 0.99989605, "epoch": 0.05122347141225275, "flos": 28547307655200.0, "grad_norm": 1.6351164933520992, "language_loss": 0.75686705, "learning_rate": 3.995302805158333e-06, "loss": 0.77956194, "num_input_tokens_seen": 9128225, "step": 426, "time_per_iteration": 2.9140093326568604 }, { "auxiliary_loss_clip": 0.0142197, "auxiliary_loss_mlp": 0.01232603, "balance_loss_clip": 1.02531099, "balance_loss_mlp": 1.00581908, "epoch": 0.05134371430289184, "flos": 19722690986400.0, "grad_norm": 1.9081589485487755, "language_loss": 0.83637941, "learning_rate": 3.9952492975060665e-06, "loss": 0.86292517, "num_input_tokens_seen": 9148295, "step": 427, "time_per_iteration": 2.8608615398406982 }, { "auxiliary_loss_clip": 0.01437125, "auxiliary_loss_mlp": 0.01229596, "balance_loss_clip": 1.02637088, "balance_loss_mlp": 1.00510097, "epoch": 0.05146395719353093, "flos": 34458959903040.0, "grad_norm": 2.6528504169397595, "language_loss": 0.84737039, "learning_rate": 3.995195487176067e-06, "loss": 0.87403756, "num_input_tokens_seen": 9168525, "step": 428, "time_per_iteration": 2.9661240577697754 }, { "auxiliary_loss_clip": 0.01463252, "auxiliary_loss_mlp": 0.01233019, "balance_loss_clip": 1.02762127, "balance_loss_mlp": 1.00757027, "epoch": 0.05158420008417002, "flos": 21760296802560.0, "grad_norm": 2.5635225652024123, "language_loss": 0.855739, "learning_rate": 3.995141374176499e-06, "loss": 0.88270169, "num_input_tokens_seen": 9186920, "step": 429, "time_per_iteration": 2.7519237995147705 }, { "auxiliary_loss_clip": 0.01384317, "auxiliary_loss_mlp": 0.00874603, "balance_loss_clip": 1.02380371, "balance_loss_mlp": 1.00013793, "epoch": 0.05170444297480911, "flos": 72553992474240.0, "grad_norm": 0.8755395256829545, "language_loss": 0.6310758, "learning_rate": 3.995086958515572e-06, "loss": 0.65366501, "num_input_tokens_seen": 9244940, "step": 430, "time_per_iteration": 3.404263734817505 }, { "auxiliary_loss_clip": 0.01450493, "auxiliary_loss_mlp": 0.00874584, "balance_loss_clip": 1.02638316, "balance_loss_mlp": 1.00011146, "epoch": 0.05182468586544821, "flos": 62416194203520.0, "grad_norm": 0.8647688951407816, "language_loss": 0.59928751, "learning_rate": 3.995032240201538e-06, "loss": 0.62253833, "num_input_tokens_seen": 9307335, "step": 431, "time_per_iteration": 3.3506247997283936 }, { "auxiliary_loss_clip": 0.01421577, "auxiliary_loss_mlp": 0.01217803, "balance_loss_clip": 1.02527094, "balance_loss_mlp": 1.00112772, "epoch": 0.0519449287560873, "flos": 41226010398720.0, "grad_norm": 0.942549103510543, "language_loss": 0.63179535, "learning_rate": 3.9949772192427e-06, "loss": 0.65818912, "num_input_tokens_seen": 9353960, "step": 432, "time_per_iteration": 3.030484914779663 }, { "auxiliary_loss_clip": 0.01420284, "auxiliary_loss_mlp": 0.01231958, "balance_loss_clip": 1.02502322, "balance_loss_mlp": 1.00650978, "epoch": 0.05206517164672639, "flos": 17494540169760.0, "grad_norm": 4.062333787416622, "language_loss": 0.79259562, "learning_rate": 3.994921895647405e-06, "loss": 0.81911808, "num_input_tokens_seen": 9372130, "step": 433, "time_per_iteration": 2.8135159015655518 }, { "auxiliary_loss_clip": 0.0144853, "auxiliary_loss_mlp": 0.01217194, "balance_loss_clip": 1.02512646, "balance_loss_mlp": 1.0005188, "epoch": 0.05218541453736548, "flos": 64002793032000.0, "grad_norm": 0.8367535080542751, "language_loss": 0.55336249, "learning_rate": 3.994866269424043e-06, "loss": 0.58001971, "num_input_tokens_seen": 9428500, "step": 434, "time_per_iteration": 3.1362130641937256 }, { "auxiliary_loss_clip": 0.01369195, "auxiliary_loss_mlp": 0.01230891, "balance_loss_clip": 1.02158427, "balance_loss_mlp": 1.00582361, "epoch": 0.05230565742800457, "flos": 19317329786880.0, "grad_norm": 2.1179801906883164, "language_loss": 0.78257376, "learning_rate": 3.9948103405810545e-06, "loss": 0.80857456, "num_input_tokens_seen": 9447450, "step": 435, "time_per_iteration": 3.0418262481689453 }, { "auxiliary_loss_clip": 0.0140526, "auxiliary_loss_mlp": 0.01230254, "balance_loss_clip": 1.02410722, "balance_loss_mlp": 1.00575924, "epoch": 0.05242590031864366, "flos": 25298647555680.0, "grad_norm": 1.8341566408434251, "language_loss": 0.85887384, "learning_rate": 3.994754109126923e-06, "loss": 0.88522899, "num_input_tokens_seen": 9468945, "step": 436, "time_per_iteration": 4.975503921508789 }, { "auxiliary_loss_clip": 0.01352134, "auxiliary_loss_mlp": 0.01228979, "balance_loss_clip": 1.02239895, "balance_loss_mlp": 1.00543749, "epoch": 0.052546143209282754, "flos": 26211640968000.0, "grad_norm": 1.690869856042698, "language_loss": 0.93548751, "learning_rate": 3.994697575070181e-06, "loss": 0.96129858, "num_input_tokens_seen": 9488405, "step": 437, "time_per_iteration": 3.2762343883514404 }, { "auxiliary_loss_clip": 0.01409673, "auxiliary_loss_mlp": 0.01229485, "balance_loss_clip": 1.02578425, "balance_loss_mlp": 1.00537145, "epoch": 0.052666386099921844, "flos": 22158149952960.0, "grad_norm": 2.0402201876602177, "language_loss": 0.9148519, "learning_rate": 3.994640738419402e-06, "loss": 0.94124341, "num_input_tokens_seen": 9507780, "step": 438, "time_per_iteration": 5.129714488983154 }, { "auxiliary_loss_clip": 0.01434237, "auxiliary_loss_mlp": 0.01229015, "balance_loss_clip": 1.02531862, "balance_loss_mlp": 1.00566435, "epoch": 0.052786628990560934, "flos": 23881829840640.0, "grad_norm": 1.7858642017125441, "language_loss": 0.81025362, "learning_rate": 3.9945835991832075e-06, "loss": 0.83688617, "num_input_tokens_seen": 9529665, "step": 439, "time_per_iteration": 2.8704276084899902 }, { "auxiliary_loss_clip": 0.01459629, "auxiliary_loss_mlp": 0.01230534, "balance_loss_clip": 1.02682829, "balance_loss_mlp": 1.00680184, "epoch": 0.052906871881200024, "flos": 24605032649760.0, "grad_norm": 2.281075962254016, "language_loss": 0.9282456, "learning_rate": 3.994526157370268e-06, "loss": 0.95514727, "num_input_tokens_seen": 9548280, "step": 440, "time_per_iteration": 3.0210225582122803 }, { "auxiliary_loss_clip": 0.01414033, "auxiliary_loss_mlp": 0.01217168, "balance_loss_clip": 1.02274132, "balance_loss_mlp": 1.00049353, "epoch": 0.053027114771839114, "flos": 56461662290880.0, "grad_norm": 0.9152353153745703, "language_loss": 0.59266466, "learning_rate": 3.994468412989296e-06, "loss": 0.61897665, "num_input_tokens_seen": 9609690, "step": 441, "time_per_iteration": 3.468338966369629 }, { "auxiliary_loss_clip": 0.01430427, "auxiliary_loss_mlp": 0.01227235, "balance_loss_clip": 1.02473068, "balance_loss_mlp": 1.00541019, "epoch": 0.053147357662478203, "flos": 17311108057920.0, "grad_norm": 2.100123419727409, "language_loss": 0.92757392, "learning_rate": 3.994410366049052e-06, "loss": 0.95415056, "num_input_tokens_seen": 9627550, "step": 442, "time_per_iteration": 2.8972530364990234 }, { "auxiliary_loss_clip": 0.01432153, "auxiliary_loss_mlp": 0.01228076, "balance_loss_clip": 1.02526224, "balance_loss_mlp": 1.00472569, "epoch": 0.0532676005531173, "flos": 17164987440480.0, "grad_norm": 2.0786074588057555, "language_loss": 0.82741052, "learning_rate": 3.994352016558341e-06, "loss": 0.85401273, "num_input_tokens_seen": 9644855, "step": 443, "time_per_iteration": 2.794853925704956 }, { "auxiliary_loss_clip": 0.01431889, "auxiliary_loss_mlp": 0.01227508, "balance_loss_clip": 1.02603531, "balance_loss_mlp": 1.00530219, "epoch": 0.05338784344375639, "flos": 27819973622880.0, "grad_norm": 1.985570847154065, "language_loss": 0.73839784, "learning_rate": 3.994293364526014e-06, "loss": 0.76499176, "num_input_tokens_seen": 9665740, "step": 444, "time_per_iteration": 2.833977699279785 }, { "auxiliary_loss_clip": 0.01416421, "auxiliary_loss_mlp": 0.01227974, "balance_loss_clip": 1.02434385, "balance_loss_mlp": 1.00481462, "epoch": 0.05350808633439548, "flos": 21507702101280.0, "grad_norm": 2.0063718823068473, "language_loss": 0.84856981, "learning_rate": 3.99423440996097e-06, "loss": 0.87501371, "num_input_tokens_seen": 9685280, "step": 445, "time_per_iteration": 2.824766159057617 }, { "auxiliary_loss_clip": 0.01410646, "auxiliary_loss_mlp": 0.01230483, "balance_loss_clip": 1.02462137, "balance_loss_mlp": 1.00751376, "epoch": 0.05362832922503457, "flos": 20084310352800.0, "grad_norm": 2.7883840092805556, "language_loss": 0.81208402, "learning_rate": 3.994175152872152e-06, "loss": 0.83849525, "num_input_tokens_seen": 9704365, "step": 446, "time_per_iteration": 2.7838284969329834 }, { "auxiliary_loss_clip": 0.01439666, "auxiliary_loss_mlp": 0.01228425, "balance_loss_clip": 1.02462304, "balance_loss_mlp": 1.00640953, "epoch": 0.05374857211567366, "flos": 26137233521280.0, "grad_norm": 2.0716461600183176, "language_loss": 0.78702945, "learning_rate": 3.994115593268548e-06, "loss": 0.81371033, "num_input_tokens_seen": 9724145, "step": 447, "time_per_iteration": 2.830439567565918 }, { "auxiliary_loss_clip": 0.01456859, "auxiliary_loss_mlp": 0.01228667, "balance_loss_clip": 1.02591801, "balance_loss_mlp": 1.00646138, "epoch": 0.05386881500631275, "flos": 27486828525600.0, "grad_norm": 3.5614620651254683, "language_loss": 0.82090867, "learning_rate": 3.994055731159195e-06, "loss": 0.8477639, "num_input_tokens_seen": 9741615, "step": 448, "time_per_iteration": 2.8224709033966064 }, { "auxiliary_loss_clip": 0.01434079, "auxiliary_loss_mlp": 0.01228288, "balance_loss_clip": 1.02630138, "balance_loss_mlp": 1.00570047, "epoch": 0.053989057896951846, "flos": 23585098145760.0, "grad_norm": 3.0437242555919872, "language_loss": 0.86539161, "learning_rate": 3.993995566553172e-06, "loss": 0.89201522, "num_input_tokens_seen": 9760580, "step": 449, "time_per_iteration": 2.7867276668548584 }, { "auxiliary_loss_clip": 0.01423576, "auxiliary_loss_mlp": 0.01227081, "balance_loss_clip": 1.02290559, "balance_loss_mlp": 1.00506544, "epoch": 0.054109300787590936, "flos": 25228874263680.0, "grad_norm": 1.5648047173961201, "language_loss": 0.77159321, "learning_rate": 3.993935099459607e-06, "loss": 0.79809982, "num_input_tokens_seen": 9782195, "step": 450, "time_per_iteration": 2.850494861602783 }, { "auxiliary_loss_clip": 0.01456262, "auxiliary_loss_mlp": 0.01226696, "balance_loss_clip": 1.02535641, "balance_loss_mlp": 1.00544322, "epoch": 0.054229543678230026, "flos": 23841536680800.0, "grad_norm": 2.519481202085559, "language_loss": 0.73882008, "learning_rate": 3.993874329887673e-06, "loss": 0.76564968, "num_input_tokens_seen": 9800850, "step": 451, "time_per_iteration": 2.719386339187622 }, { "auxiliary_loss_clip": 0.01429666, "auxiliary_loss_mlp": 0.01226643, "balance_loss_clip": 1.02449918, "balance_loss_mlp": 1.00462818, "epoch": 0.054349786568869116, "flos": 16320941075520.0, "grad_norm": 3.281896670293021, "language_loss": 0.86460769, "learning_rate": 3.993813257846589e-06, "loss": 0.8911708, "num_input_tokens_seen": 9817605, "step": 452, "time_per_iteration": 2.856834650039673 }, { "auxiliary_loss_clip": 0.0143005, "auxiliary_loss_mlp": 0.01227412, "balance_loss_clip": 1.02486837, "balance_loss_mlp": 1.00596857, "epoch": 0.054470029459508205, "flos": 18660738985920.0, "grad_norm": 2.253131566137972, "language_loss": 0.92898899, "learning_rate": 3.993751883345619e-06, "loss": 0.95556366, "num_input_tokens_seen": 9835965, "step": 453, "time_per_iteration": 2.8413612842559814 }, { "auxiliary_loss_clip": 0.01414522, "auxiliary_loss_mlp": 0.01227385, "balance_loss_clip": 1.02426445, "balance_loss_mlp": 1.00594163, "epoch": 0.054590272350147295, "flos": 17785308533760.0, "grad_norm": 3.067923185156366, "language_loss": 0.87459451, "learning_rate": 3.993690206394073e-06, "loss": 0.90101349, "num_input_tokens_seen": 9852265, "step": 454, "time_per_iteration": 2.8337607383728027 }, { "auxiliary_loss_clip": 0.01419002, "auxiliary_loss_mlp": 0.01230196, "balance_loss_clip": 1.02390254, "balance_loss_mlp": 1.00798953, "epoch": 0.054710515240786385, "flos": 17785955160000.0, "grad_norm": 2.5425281119826844, "language_loss": 0.88027883, "learning_rate": 3.993628227001307e-06, "loss": 0.90677077, "num_input_tokens_seen": 9870465, "step": 455, "time_per_iteration": 2.8747644424438477 }, { "auxiliary_loss_clip": 0.01416015, "auxiliary_loss_mlp": 0.0122767, "balance_loss_clip": 1.0248518, "balance_loss_mlp": 1.00641751, "epoch": 0.05483075813142548, "flos": 48210917397120.0, "grad_norm": 1.9056513848996952, "language_loss": 0.71482259, "learning_rate": 3.993565945176726e-06, "loss": 0.7412594, "num_input_tokens_seen": 9891490, "step": 456, "time_per_iteration": 3.0556788444519043 }, { "auxiliary_loss_clip": 0.01407181, "auxiliary_loss_mlp": 0.01223482, "balance_loss_clip": 1.02394271, "balance_loss_mlp": 1.00394654, "epoch": 0.05495100102206457, "flos": 19682254131840.0, "grad_norm": 2.060461585881473, "language_loss": 0.84016955, "learning_rate": 3.993503360929776e-06, "loss": 0.86647618, "num_input_tokens_seen": 9910375, "step": 457, "time_per_iteration": 2.8521080017089844 }, { "auxiliary_loss_clip": 0.01312514, "auxiliary_loss_mlp": 0.01227165, "balance_loss_clip": 1.01874137, "balance_loss_mlp": 1.00514925, "epoch": 0.05507124391270366, "flos": 26360060700960.0, "grad_norm": 1.6657098792222085, "language_loss": 0.8122654, "learning_rate": 3.99344047426995e-06, "loss": 0.8376621, "num_input_tokens_seen": 9931635, "step": 458, "time_per_iteration": 3.328765392303467 }, { "auxiliary_loss_clip": 0.01367806, "auxiliary_loss_mlp": 0.01228718, "balance_loss_clip": 1.02183223, "balance_loss_mlp": 1.00651193, "epoch": 0.05519148680334275, "flos": 22601325578400.0, "grad_norm": 2.3385739897702154, "language_loss": 0.93497157, "learning_rate": 3.993377285206789e-06, "loss": 0.96093684, "num_input_tokens_seen": 9951420, "step": 459, "time_per_iteration": 3.3789594173431396 }, { "auxiliary_loss_clip": 0.01401766, "auxiliary_loss_mlp": 0.01225628, "balance_loss_clip": 1.0240438, "balance_loss_mlp": 1.00494766, "epoch": 0.05531172969398184, "flos": 40552535468160.0, "grad_norm": 1.711113938363531, "language_loss": 0.86517495, "learning_rate": 3.99331379374988e-06, "loss": 0.89144886, "num_input_tokens_seen": 9975025, "step": 460, "time_per_iteration": 3.1355433464050293 }, { "auxiliary_loss_clip": 0.01425195, "auxiliary_loss_mlp": 0.01225777, "balance_loss_clip": 1.02318859, "balance_loss_mlp": 1.00509667, "epoch": 0.05543197258462093, "flos": 23477905588320.0, "grad_norm": 2.0349042297483635, "language_loss": 0.80467474, "learning_rate": 3.993249999908852e-06, "loss": 0.83118451, "num_input_tokens_seen": 9995175, "step": 461, "time_per_iteration": 2.899104595184326 }, { "auxiliary_loss_clip": 0.01454532, "auxiliary_loss_mlp": 0.01228385, "balance_loss_clip": 1.02522469, "balance_loss_mlp": 1.00656009, "epoch": 0.05555221547526003, "flos": 18624612972960.0, "grad_norm": 2.27886198033293, "language_loss": 0.87125218, "learning_rate": 3.993185903693384e-06, "loss": 0.89808142, "num_input_tokens_seen": 10011975, "step": 462, "time_per_iteration": 4.75354266166687 }, { "auxiliary_loss_clip": 0.01407935, "auxiliary_loss_mlp": 0.0122561, "balance_loss_clip": 1.02319932, "balance_loss_mlp": 1.00569272, "epoch": 0.05567245836589912, "flos": 23587109871840.0, "grad_norm": 2.408830635859359, "language_loss": 0.82292378, "learning_rate": 3.9931215051131995e-06, "loss": 0.8492592, "num_input_tokens_seen": 10032620, "step": 463, "time_per_iteration": 3.116323947906494 }, { "auxiliary_loss_clip": 0.01422718, "auxiliary_loss_mlp": 0.01227561, "balance_loss_clip": 1.02306676, "balance_loss_mlp": 1.0061177, "epoch": 0.05579270125653821, "flos": 27746679810240.0, "grad_norm": 1.6051724726451768, "language_loss": 0.79923517, "learning_rate": 3.993056804178068e-06, "loss": 0.82573789, "num_input_tokens_seen": 10054165, "step": 464, "time_per_iteration": 4.910808324813843 }, { "auxiliary_loss_clip": 0.01364743, "auxiliary_loss_mlp": 0.01227612, "balance_loss_clip": 1.02308476, "balance_loss_mlp": 1.00597835, "epoch": 0.0559129441471773, "flos": 27014172768000.0, "grad_norm": 2.0683703718320268, "language_loss": 0.84588265, "learning_rate": 3.992991800897803e-06, "loss": 0.8718062, "num_input_tokens_seen": 10073970, "step": 465, "time_per_iteration": 3.5150742530822754 }, { "auxiliary_loss_clip": 0.01453305, "auxiliary_loss_mlp": 0.0122451, "balance_loss_clip": 1.02454901, "balance_loss_mlp": 1.00535583, "epoch": 0.05603318703781639, "flos": 15229796332320.0, "grad_norm": 2.7336661354571894, "language_loss": 0.89979213, "learning_rate": 3.9929264952822665e-06, "loss": 0.9265703, "num_input_tokens_seen": 10091505, "step": 466, "time_per_iteration": 3.3196935653686523 }, { "auxiliary_loss_clip": 0.01434015, "auxiliary_loss_mlp": 0.01226034, "balance_loss_clip": 1.02433193, "balance_loss_mlp": 1.00554466, "epoch": 0.05615342992845548, "flos": 22266492068160.0, "grad_norm": 1.8848675567803153, "language_loss": 0.88059378, "learning_rate": 3.992860887341366e-06, "loss": 0.90719432, "num_input_tokens_seen": 10109675, "step": 467, "time_per_iteration": 2.981919288635254 }, { "auxiliary_loss_clip": 0.01396981, "auxiliary_loss_mlp": 0.01223893, "balance_loss_clip": 1.02205908, "balance_loss_mlp": 1.00454772, "epoch": 0.056273672819094574, "flos": 23584990374720.0, "grad_norm": 1.9669346024895682, "language_loss": 0.80893838, "learning_rate": 3.992794977085052e-06, "loss": 0.83514708, "num_input_tokens_seen": 10127675, "step": 468, "time_per_iteration": 3.1235666275024414 }, { "auxiliary_loss_clip": 0.01383653, "auxiliary_loss_mlp": 0.01227542, "balance_loss_clip": 1.02309966, "balance_loss_mlp": 1.00648046, "epoch": 0.056393915709733664, "flos": 19858717049760.0, "grad_norm": 1.873210694992401, "language_loss": 0.84806967, "learning_rate": 3.992728764523326e-06, "loss": 0.87418163, "num_input_tokens_seen": 10146620, "step": 469, "time_per_iteration": 2.9112813472747803 }, { "auxiliary_loss_clip": 0.01400344, "auxiliary_loss_mlp": 0.01226045, "balance_loss_clip": 1.02238965, "balance_loss_mlp": 1.00498295, "epoch": 0.05651415860037275, "flos": 22163789970720.0, "grad_norm": 2.055158495934863, "language_loss": 0.80806243, "learning_rate": 3.99266224966623e-06, "loss": 0.83432639, "num_input_tokens_seen": 10167535, "step": 470, "time_per_iteration": 2.9551055431365967 }, { "auxiliary_loss_clip": 0.01411842, "auxiliary_loss_mlp": 0.01224961, "balance_loss_clip": 1.023, "balance_loss_mlp": 1.00542498, "epoch": 0.05663440149101184, "flos": 19463558175360.0, "grad_norm": 2.053366270834822, "language_loss": 0.87799752, "learning_rate": 3.992595432523855e-06, "loss": 0.90436554, "num_input_tokens_seen": 10184825, "step": 471, "time_per_iteration": 2.847482681274414 }, { "auxiliary_loss_clip": 0.01382563, "auxiliary_loss_mlp": 0.01223396, "balance_loss_clip": 1.02064645, "balance_loss_mlp": 1.0046227, "epoch": 0.05675464438165093, "flos": 22670236702080.0, "grad_norm": 1.8772440462648818, "language_loss": 0.8585729, "learning_rate": 3.992528313106338e-06, "loss": 0.88463259, "num_input_tokens_seen": 10203025, "step": 472, "time_per_iteration": 2.8244082927703857 }, { "auxiliary_loss_clip": 0.01451769, "auxiliary_loss_mlp": 0.00874646, "balance_loss_clip": 1.02518022, "balance_loss_mlp": 0.9999814, "epoch": 0.05687488727229002, "flos": 16901184551040.0, "grad_norm": 2.4472197255613963, "language_loss": 0.82085109, "learning_rate": 3.9924608914238595e-06, "loss": 0.8441152, "num_input_tokens_seen": 10218020, "step": 473, "time_per_iteration": 2.6392130851745605 }, { "auxiliary_loss_clip": 0.01424764, "auxiliary_loss_mlp": 0.01224985, "balance_loss_clip": 1.02311385, "balance_loss_mlp": 1.00468612, "epoch": 0.05699513016292912, "flos": 29168993848320.0, "grad_norm": 2.422275371618027, "language_loss": 0.83860898, "learning_rate": 3.992393167486648e-06, "loss": 0.86510646, "num_input_tokens_seen": 10237170, "step": 474, "time_per_iteration": 2.8355093002319336 }, { "auxiliary_loss_clip": 0.01451046, "auxiliary_loss_mlp": 0.01225123, "balance_loss_clip": 1.02451825, "balance_loss_mlp": 1.00520539, "epoch": 0.05711537305356821, "flos": 18916207581600.0, "grad_norm": 2.297365941754308, "language_loss": 0.80640531, "learning_rate": 3.992325141304977e-06, "loss": 0.83316696, "num_input_tokens_seen": 10255125, "step": 475, "time_per_iteration": 2.701594591140747 }, { "auxiliary_loss_clip": 0.0138495, "auxiliary_loss_mlp": 0.01225587, "balance_loss_clip": 1.02235007, "balance_loss_mlp": 1.00624156, "epoch": 0.0572356159442073, "flos": 26758991561760.0, "grad_norm": 2.551189996919854, "language_loss": 0.86367464, "learning_rate": 3.992256812889166e-06, "loss": 0.88978004, "num_input_tokens_seen": 10271230, "step": 476, "time_per_iteration": 2.7975127696990967 }, { "auxiliary_loss_clip": 0.0145088, "auxiliary_loss_mlp": 0.01225626, "balance_loss_clip": 1.02493191, "balance_loss_mlp": 1.00570917, "epoch": 0.05735585883484639, "flos": 35116161406560.0, "grad_norm": 2.4706549134291853, "language_loss": 0.76772082, "learning_rate": 3.992188182249582e-06, "loss": 0.79448593, "num_input_tokens_seen": 10293125, "step": 477, "time_per_iteration": 2.849590301513672 }, { "auxiliary_loss_clip": 0.01397321, "auxiliary_loss_mlp": 0.01224941, "balance_loss_clip": 1.02200794, "balance_loss_mlp": 1.00445175, "epoch": 0.05747610172548548, "flos": 18734392035360.0, "grad_norm": 2.0746843794820826, "language_loss": 0.90515792, "learning_rate": 3.992119249396633e-06, "loss": 0.93138051, "num_input_tokens_seen": 10311810, "step": 478, "time_per_iteration": 2.7561068534851074 }, { "auxiliary_loss_clip": 0.01410076, "auxiliary_loss_mlp": 0.00874651, "balance_loss_clip": 1.02234602, "balance_loss_mlp": 0.99993169, "epoch": 0.05759634461612457, "flos": 27964765064160.0, "grad_norm": 1.7948156592819482, "language_loss": 0.81901073, "learning_rate": 3.992050014340778e-06, "loss": 0.84185791, "num_input_tokens_seen": 10332165, "step": 479, "time_per_iteration": 2.801950216293335 }, { "auxiliary_loss_clip": 0.01408646, "auxiliary_loss_mlp": 0.01213469, "balance_loss_clip": 1.01798773, "balance_loss_mlp": 0.99984616, "epoch": 0.057716587506763666, "flos": 69292040613120.0, "grad_norm": 0.8458973378755839, "language_loss": 0.54997629, "learning_rate": 3.99198047709252e-06, "loss": 0.57619745, "num_input_tokens_seen": 10393685, "step": 480, "time_per_iteration": 3.352386713027954 }, { "auxiliary_loss_clip": 0.01381748, "auxiliary_loss_mlp": 0.01226876, "balance_loss_clip": 1.02017641, "balance_loss_mlp": 1.00657713, "epoch": 0.057836830397402755, "flos": 25009208367840.0, "grad_norm": 1.870841184142586, "language_loss": 0.78975683, "learning_rate": 3.991910637662408e-06, "loss": 0.81584311, "num_input_tokens_seen": 10413975, "step": 481, "time_per_iteration": 2.8959648609161377 }, { "auxiliary_loss_clip": 0.01449275, "auxiliary_loss_mlp": 0.01221884, "balance_loss_clip": 1.02397716, "balance_loss_mlp": 1.0040648, "epoch": 0.057957073288041845, "flos": 25593906379680.0, "grad_norm": 1.9412668605688856, "language_loss": 0.80804968, "learning_rate": 3.9918404960610355e-06, "loss": 0.83476126, "num_input_tokens_seen": 10433005, "step": 482, "time_per_iteration": 2.838233232498169 }, { "auxiliary_loss_clip": 0.01434253, "auxiliary_loss_mlp": 0.01224213, "balance_loss_clip": 1.02331579, "balance_loss_mlp": 1.00524914, "epoch": 0.058077316178680935, "flos": 20777422327200.0, "grad_norm": 2.86537796720022, "language_loss": 0.77165341, "learning_rate": 3.991770052299043e-06, "loss": 0.7982381, "num_input_tokens_seen": 10451235, "step": 483, "time_per_iteration": 2.79559063911438 }, { "auxiliary_loss_clip": 0.01417959, "auxiliary_loss_mlp": 0.0122348, "balance_loss_clip": 1.02307951, "balance_loss_mlp": 1.00527883, "epoch": 0.058197559069320025, "flos": 18916494971040.0, "grad_norm": 2.327249830087299, "language_loss": 0.87699348, "learning_rate": 3.991699306387118e-06, "loss": 0.90340781, "num_input_tokens_seen": 10469705, "step": 484, "time_per_iteration": 2.7477989196777344 }, { "auxiliary_loss_clip": 0.014333, "auxiliary_loss_mlp": 0.01225954, "balance_loss_clip": 1.02366805, "balance_loss_mlp": 1.00641847, "epoch": 0.058317801959959115, "flos": 24863339216160.0, "grad_norm": 2.9166351670910893, "language_loss": 0.77975261, "learning_rate": 3.991628258335991e-06, "loss": 0.80634511, "num_input_tokens_seen": 10491910, "step": 485, "time_per_iteration": 2.81337571144104 }, { "auxiliary_loss_clip": 0.01383815, "auxiliary_loss_mlp": 0.01223178, "balance_loss_clip": 1.02162576, "balance_loss_mlp": 1.00421441, "epoch": 0.05843804485059821, "flos": 23257988226720.0, "grad_norm": 3.5984931298110125, "language_loss": 0.87559742, "learning_rate": 3.991556908156442e-06, "loss": 0.9016673, "num_input_tokens_seen": 10508435, "step": 486, "time_per_iteration": 2.839679002761841 }, { "auxiliary_loss_clip": 0.01410683, "auxiliary_loss_mlp": 0.01223964, "balance_loss_clip": 1.02239239, "balance_loss_mlp": 1.00480962, "epoch": 0.0585582877412373, "flos": 23150544203520.0, "grad_norm": 1.9045809214078784, "language_loss": 0.87807465, "learning_rate": 3.9914852558592914e-06, "loss": 0.90442115, "num_input_tokens_seen": 10529485, "step": 487, "time_per_iteration": 2.850512742996216 }, { "auxiliary_loss_clip": 0.01423847, "auxiliary_loss_mlp": 0.01224131, "balance_loss_clip": 1.02398336, "balance_loss_mlp": 1.00459528, "epoch": 0.05867853063187639, "flos": 23506415781120.0, "grad_norm": 2.7788891084266045, "language_loss": 0.80755544, "learning_rate": 3.991413301455413e-06, "loss": 0.83403528, "num_input_tokens_seen": 10545935, "step": 488, "time_per_iteration": 2.808908700942993 }, { "auxiliary_loss_clip": 0.01408408, "auxiliary_loss_mlp": 0.01222421, "balance_loss_clip": 1.02207136, "balance_loss_mlp": 1.00402927, "epoch": 0.05879877352251548, "flos": 29495816377920.0, "grad_norm": 2.150594766655358, "language_loss": 0.77720517, "learning_rate": 3.991341044955719e-06, "loss": 0.80351341, "num_input_tokens_seen": 10565690, "step": 489, "time_per_iteration": 4.780829668045044 }, { "auxiliary_loss_clip": 0.01423427, "auxiliary_loss_mlp": 0.00874618, "balance_loss_clip": 1.02249718, "balance_loss_mlp": 0.9999575, "epoch": 0.05891901641315457, "flos": 20157496394400.0, "grad_norm": 2.332405796732168, "language_loss": 0.81627387, "learning_rate": 3.991268486371172e-06, "loss": 0.83925438, "num_input_tokens_seen": 10584245, "step": 490, "time_per_iteration": 4.647362947463989 }, { "auxiliary_loss_clip": 0.01408321, "auxiliary_loss_mlp": 0.01225219, "balance_loss_clip": 1.02173185, "balance_loss_mlp": 1.00587356, "epoch": 0.05903925930379366, "flos": 24644212175520.0, "grad_norm": 3.4880567951254293, "language_loss": 0.87775099, "learning_rate": 3.991195625712779e-06, "loss": 0.90408647, "num_input_tokens_seen": 10601210, "step": 491, "time_per_iteration": 2.787471055984497 }, { "auxiliary_loss_clip": 0.01448437, "auxiliary_loss_mlp": 0.01224011, "balance_loss_clip": 1.02461338, "balance_loss_mlp": 1.00523865, "epoch": 0.05915950219443276, "flos": 21250401397920.0, "grad_norm": 2.4726197858738255, "language_loss": 0.81477946, "learning_rate": 3.991122462991592e-06, "loss": 0.84150392, "num_input_tokens_seen": 10620730, "step": 492, "time_per_iteration": 2.8765149116516113 }, { "auxiliary_loss_clip": 0.01448376, "auxiliary_loss_mlp": 0.01225337, "balance_loss_clip": 1.0238868, "balance_loss_mlp": 1.00541961, "epoch": 0.05927974508507185, "flos": 9902734325280.0, "grad_norm": 2.791047229237109, "language_loss": 0.8129499, "learning_rate": 3.991048998218712e-06, "loss": 0.83968705, "num_input_tokens_seen": 10634035, "step": 493, "time_per_iteration": 2.6662819385528564 }, { "auxiliary_loss_clip": 0.01432914, "auxiliary_loss_mlp": 0.01224056, "balance_loss_clip": 1.02229774, "balance_loss_mlp": 1.00604606, "epoch": 0.05939998797571094, "flos": 18259509009600.0, "grad_norm": 2.481702883299921, "language_loss": 0.76792824, "learning_rate": 3.990975231405281e-06, "loss": 0.79449791, "num_input_tokens_seen": 10652485, "step": 494, "time_per_iteration": 2.7949652671813965 }, { "auxiliary_loss_clip": 0.01433994, "auxiliary_loss_mlp": 0.01222699, "balance_loss_clip": 1.02280557, "balance_loss_mlp": 1.00411677, "epoch": 0.05952023086635003, "flos": 28256611138560.0, "grad_norm": 4.77058474189582, "language_loss": 0.78726065, "learning_rate": 3.990901162562491e-06, "loss": 0.81382757, "num_input_tokens_seen": 10673175, "step": 495, "time_per_iteration": 2.861504554748535 }, { "auxiliary_loss_clip": 0.01391285, "auxiliary_loss_mlp": 0.00874589, "balance_loss_clip": 1.02007508, "balance_loss_mlp": 0.99993503, "epoch": 0.05964047375698912, "flos": 14902506794880.0, "grad_norm": 1.9657795953242998, "language_loss": 0.9054864, "learning_rate": 3.9908267917015765e-06, "loss": 0.92814517, "num_input_tokens_seen": 10691235, "step": 496, "time_per_iteration": 2.869076728820801 }, { "auxiliary_loss_clip": 0.01432588, "auxiliary_loss_mlp": 0.01223066, "balance_loss_clip": 1.02183306, "balance_loss_mlp": 1.00410199, "epoch": 0.059760716647628206, "flos": 23185592506080.0, "grad_norm": 2.269892763640643, "language_loss": 0.93296635, "learning_rate": 3.990752118833821e-06, "loss": 0.95952296, "num_input_tokens_seen": 10708675, "step": 497, "time_per_iteration": 2.847987651824951 }, { "auxiliary_loss_clip": 0.01446602, "auxiliary_loss_mlp": 0.01223629, "balance_loss_clip": 1.02333355, "balance_loss_mlp": 1.00600028, "epoch": 0.0598809595382673, "flos": 22746979188000.0, "grad_norm": 2.4069036365706364, "language_loss": 0.77864283, "learning_rate": 3.990677143970553e-06, "loss": 0.80534518, "num_input_tokens_seen": 10729485, "step": 498, "time_per_iteration": 2.915013313293457 }, { "auxiliary_loss_clip": 0.01367305, "auxiliary_loss_mlp": 0.01223335, "balance_loss_clip": 1.02070546, "balance_loss_mlp": 1.00475276, "epoch": 0.06000120242890639, "flos": 22127232873600.0, "grad_norm": 2.033137619586344, "language_loss": 0.81298208, "learning_rate": 3.990601867123144e-06, "loss": 0.83888841, "num_input_tokens_seen": 10749210, "step": 499, "time_per_iteration": 3.008606433868408 }, { "auxiliary_loss_clip": 0.01365535, "auxiliary_loss_mlp": 0.01220221, "balance_loss_clip": 1.02072227, "balance_loss_mlp": 1.00392807, "epoch": 0.06012144531954548, "flos": 19171783948320.0, "grad_norm": 1.87280424083903, "language_loss": 0.84978014, "learning_rate": 3.990526288303014e-06, "loss": 0.87563771, "num_input_tokens_seen": 10768000, "step": 500, "time_per_iteration": 2.813972234725952 }, { "auxiliary_loss_clip": 0.01402619, "auxiliary_loss_mlp": 0.00874598, "balance_loss_clip": 1.020679, "balance_loss_mlp": 1.00005841, "epoch": 0.06024168821018457, "flos": 22783356666720.0, "grad_norm": 1.6546103847044322, "language_loss": 0.90911859, "learning_rate": 3.9904504075216295e-06, "loss": 0.93189085, "num_input_tokens_seen": 10788760, "step": 501, "time_per_iteration": 3.0020689964294434 }, { "auxiliary_loss_clip": 0.01390055, "auxiliary_loss_mlp": 0.01223976, "balance_loss_clip": 1.02004147, "balance_loss_mlp": 1.00558424, "epoch": 0.06036193110082366, "flos": 18770697666720.0, "grad_norm": 2.11310100932349, "language_loss": 0.93871409, "learning_rate": 3.990374224790501e-06, "loss": 0.96485448, "num_input_tokens_seen": 10806965, "step": 502, "time_per_iteration": 2.836899757385254 }, { "auxiliary_loss_clip": 0.01394526, "auxiliary_loss_mlp": 0.01221398, "balance_loss_clip": 1.02105272, "balance_loss_mlp": 1.00415087, "epoch": 0.06048217399146275, "flos": 17201580461280.0, "grad_norm": 1.96334988825795, "language_loss": 0.70728683, "learning_rate": 3.990297740121185e-06, "loss": 0.73344606, "num_input_tokens_seen": 10824900, "step": 503, "time_per_iteration": 2.9183993339538574 }, { "auxiliary_loss_clip": 0.01424414, "auxiliary_loss_mlp": 0.00874645, "balance_loss_clip": 1.02166462, "balance_loss_mlp": 1.00018549, "epoch": 0.06060241688210185, "flos": 24024142548000.0, "grad_norm": 1.7555096241621662, "language_loss": 0.7799871, "learning_rate": 3.990220953525284e-06, "loss": 0.80297768, "num_input_tokens_seen": 10842010, "step": 504, "time_per_iteration": 2.8382160663604736 }, { "auxiliary_loss_clip": 0.01404878, "auxiliary_loss_mlp": 0.01222404, "balance_loss_clip": 1.02037895, "balance_loss_mlp": 1.00458515, "epoch": 0.06072265977274094, "flos": 14611199575680.0, "grad_norm": 3.6397551462607542, "language_loss": 0.74130505, "learning_rate": 3.9901438650144465e-06, "loss": 0.76757789, "num_input_tokens_seen": 10858260, "step": 505, "time_per_iteration": 2.7273876667022705 }, { "auxiliary_loss_clip": 0.01431804, "auxiliary_loss_mlp": 0.01223525, "balance_loss_clip": 1.0221808, "balance_loss_mlp": 1.00665987, "epoch": 0.06084290266338003, "flos": 20558295286560.0, "grad_norm": 2.7197270558192916, "language_loss": 0.91600955, "learning_rate": 3.990066474600367e-06, "loss": 0.94256282, "num_input_tokens_seen": 10876230, "step": 506, "time_per_iteration": 2.775709390640259 }, { "auxiliary_loss_clip": 0.01430572, "auxiliary_loss_mlp": 0.01220546, "balance_loss_clip": 1.02152407, "balance_loss_mlp": 1.0040617, "epoch": 0.06096314555401912, "flos": 22309228038240.0, "grad_norm": 1.8977725109630104, "language_loss": 0.67840159, "learning_rate": 3.989988782294786e-06, "loss": 0.70491278, "num_input_tokens_seen": 10896320, "step": 507, "time_per_iteration": 2.746730089187622 }, { "auxiliary_loss_clip": 0.01389824, "auxiliary_loss_mlp": 0.01220352, "balance_loss_clip": 1.02023566, "balance_loss_mlp": 1.00386834, "epoch": 0.06108338844465821, "flos": 19131383017440.0, "grad_norm": 1.8505098515750984, "language_loss": 0.95006859, "learning_rate": 3.989910788109489e-06, "loss": 0.97617036, "num_input_tokens_seen": 10912970, "step": 508, "time_per_iteration": 2.777819871902466 }, { "auxiliary_loss_clip": 0.01374218, "auxiliary_loss_mlp": 0.01222972, "balance_loss_clip": 1.01865244, "balance_loss_mlp": 1.00515318, "epoch": 0.0612036313352973, "flos": 33584032382400.0, "grad_norm": 3.561404277618284, "language_loss": 0.750283, "learning_rate": 3.989832492056307e-06, "loss": 0.77625489, "num_input_tokens_seen": 10933995, "step": 509, "time_per_iteration": 2.8685483932495117 }, { "auxiliary_loss_clip": 0.01419155, "auxiliary_loss_mlp": 0.01223186, "balance_loss_clip": 1.02210796, "balance_loss_mlp": 1.00574851, "epoch": 0.06132387422593639, "flos": 27490564588320.0, "grad_norm": 2.1937976402227264, "language_loss": 0.81155396, "learning_rate": 3.989753894147119e-06, "loss": 0.83797735, "num_input_tokens_seen": 10954120, "step": 510, "time_per_iteration": 2.875173807144165 }, { "auxiliary_loss_clip": 0.01419736, "auxiliary_loss_mlp": 0.01221391, "balance_loss_clip": 1.02302635, "balance_loss_mlp": 1.00547957, "epoch": 0.061444117116575485, "flos": 25885069904160.0, "grad_norm": 2.1910430407780805, "language_loss": 0.79940033, "learning_rate": 3.989674994393846e-06, "loss": 0.82581162, "num_input_tokens_seen": 10973595, "step": 511, "time_per_iteration": 2.806725263595581 }, { "auxiliary_loss_clip": 0.01417461, "auxiliary_loss_mlp": 0.01220195, "balance_loss_clip": 1.02135289, "balance_loss_mlp": 1.00409222, "epoch": 0.061564360007214575, "flos": 28512043810560.0, "grad_norm": 2.4397911824039196, "language_loss": 0.93825197, "learning_rate": 3.98959579280846e-06, "loss": 0.96462858, "num_input_tokens_seen": 10991995, "step": 512, "time_per_iteration": 2.870290756225586 }, { "auxiliary_loss_clip": 0.01349695, "auxiliary_loss_mlp": 0.012212, "balance_loss_clip": 1.01821852, "balance_loss_mlp": 1.00490665, "epoch": 0.061684602897853665, "flos": 12094363968480.0, "grad_norm": 3.9546557024563658, "language_loss": 0.83183086, "learning_rate": 3.989516289402973e-06, "loss": 0.85753983, "num_input_tokens_seen": 11007625, "step": 513, "time_per_iteration": 2.8129522800445557 }, { "auxiliary_loss_clip": 0.01361443, "auxiliary_loss_mlp": 0.01220976, "balance_loss_clip": 1.01904988, "balance_loss_mlp": 1.00525498, "epoch": 0.061804845788492754, "flos": 19532648917440.0, "grad_norm": 2.120815171513418, "language_loss": 0.80396187, "learning_rate": 3.989436484189447e-06, "loss": 0.82978606, "num_input_tokens_seen": 11025570, "step": 514, "time_per_iteration": 2.823643207550049 }, { "auxiliary_loss_clip": 0.01429123, "auxiliary_loss_mlp": 0.01222244, "balance_loss_clip": 1.02144647, "balance_loss_mlp": 1.00499713, "epoch": 0.061925088679131844, "flos": 15341120112960.0, "grad_norm": 2.495212473352352, "language_loss": 0.80636919, "learning_rate": 3.9893563771799885e-06, "loss": 0.83288288, "num_input_tokens_seen": 11042045, "step": 515, "time_per_iteration": 3.6789050102233887 }, { "auxiliary_loss_clip": 0.01443882, "auxiliary_loss_mlp": 0.01221408, "balance_loss_clip": 1.0224812, "balance_loss_mlp": 1.00435138, "epoch": 0.062045331569770934, "flos": 25919938588320.0, "grad_norm": 2.2605573903986023, "language_loss": 0.86348498, "learning_rate": 3.989275968386749e-06, "loss": 0.89013779, "num_input_tokens_seen": 11059955, "step": 516, "time_per_iteration": 4.58419394493103 }, { "auxiliary_loss_clip": 0.01415506, "auxiliary_loss_mlp": 0.0122187, "balance_loss_clip": 1.02077067, "balance_loss_mlp": 1.00576758, "epoch": 0.06216557446041003, "flos": 28110634215840.0, "grad_norm": 2.191232766039071, "language_loss": 0.76789886, "learning_rate": 3.989195257821926e-06, "loss": 0.79427266, "num_input_tokens_seen": 11078440, "step": 517, "time_per_iteration": 2.806187391281128 }, { "auxiliary_loss_clip": 0.01392647, "auxiliary_loss_mlp": 0.0122087, "balance_loss_clip": 1.02181971, "balance_loss_mlp": 1.00457692, "epoch": 0.06228581735104912, "flos": 23478192977760.0, "grad_norm": 1.9084325132315754, "language_loss": 0.84357083, "learning_rate": 3.989114245497765e-06, "loss": 0.86970603, "num_input_tokens_seen": 11098240, "step": 518, "time_per_iteration": 2.814199447631836 }, { "auxiliary_loss_clip": 0.01426501, "auxiliary_loss_mlp": 0.01220371, "balance_loss_clip": 1.02055359, "balance_loss_mlp": 1.00445902, "epoch": 0.06240606024168821, "flos": 15195215037600.0, "grad_norm": 1.9583642133807733, "language_loss": 0.94660807, "learning_rate": 3.989032931426554e-06, "loss": 0.97307676, "num_input_tokens_seen": 11115395, "step": 519, "time_per_iteration": 2.6985998153686523 }, { "auxiliary_loss_clip": 0.01396361, "auxiliary_loss_mlp": 0.01221863, "balance_loss_clip": 1.02079976, "balance_loss_mlp": 1.00576043, "epoch": 0.06252630313232731, "flos": 20631840564960.0, "grad_norm": 2.3610925388104986, "language_loss": 0.86834306, "learning_rate": 3.9889513156206295e-06, "loss": 0.89452529, "num_input_tokens_seen": 11134835, "step": 520, "time_per_iteration": 2.889695644378662 }, { "auxiliary_loss_clip": 0.01379378, "auxiliary_loss_mlp": 0.01221133, "balance_loss_clip": 1.02052009, "balance_loss_mlp": 1.00464845, "epoch": 0.06264654602296639, "flos": 20778068953440.0, "grad_norm": 3.5288017105952614, "language_loss": 0.73573339, "learning_rate": 3.988869398092371e-06, "loss": 0.76173854, "num_input_tokens_seen": 11154745, "step": 521, "time_per_iteration": 2.8710334300994873 }, { "auxiliary_loss_clip": 0.01388592, "auxiliary_loss_mlp": 0.01221057, "balance_loss_clip": 1.01918471, "balance_loss_mlp": 1.00381041, "epoch": 0.06276678891360549, "flos": 29605810982400.0, "grad_norm": 2.60878731452996, "language_loss": 0.78941011, "learning_rate": 3.988787178854206e-06, "loss": 0.81550664, "num_input_tokens_seen": 11174280, "step": 522, "time_per_iteration": 2.7840158939361572 }, { "auxiliary_loss_clip": 0.014423, "auxiliary_loss_mlp": 0.01222125, "balance_loss_clip": 1.02202094, "balance_loss_mlp": 1.00583148, "epoch": 0.06288703180424457, "flos": 22126298857920.0, "grad_norm": 3.2952417782266163, "language_loss": 0.87466341, "learning_rate": 3.988704657918608e-06, "loss": 0.90130764, "num_input_tokens_seen": 11193340, "step": 523, "time_per_iteration": 2.8008947372436523 }, { "auxiliary_loss_clip": 0.01421989, "auxiliary_loss_mlp": 0.0122035, "balance_loss_clip": 1.02144051, "balance_loss_mlp": 1.00520146, "epoch": 0.06300727469488367, "flos": 14976698699520.0, "grad_norm": 3.159860710064686, "language_loss": 0.79810059, "learning_rate": 3.988621835298094e-06, "loss": 0.82452404, "num_input_tokens_seen": 11210555, "step": 524, "time_per_iteration": 2.766495943069458 }, { "auxiliary_loss_clip": 0.01441299, "auxiliary_loss_mlp": 0.01220746, "balance_loss_clip": 1.02206755, "balance_loss_mlp": 1.00521541, "epoch": 0.06312751758552275, "flos": 24535403052480.0, "grad_norm": 2.0430926318697273, "language_loss": 0.91775858, "learning_rate": 3.988538711005229e-06, "loss": 0.94437903, "num_input_tokens_seen": 11230010, "step": 525, "time_per_iteration": 2.8960416316986084 }, { "auxiliary_loss_clip": 0.01415457, "auxiliary_loss_mlp": 0.01218083, "balance_loss_clip": 1.02143884, "balance_loss_mlp": 1.00369728, "epoch": 0.06324776047616185, "flos": 21507019551360.0, "grad_norm": 2.1718461138942837, "language_loss": 0.88166994, "learning_rate": 3.988455285052622e-06, "loss": 0.90800536, "num_input_tokens_seen": 11246190, "step": 526, "time_per_iteration": 2.7303977012634277 }, { "auxiliary_loss_clip": 0.01427594, "auxiliary_loss_mlp": 0.0122093, "balance_loss_clip": 1.02175736, "balance_loss_mlp": 1.00463641, "epoch": 0.06336800336680094, "flos": 21688044776640.0, "grad_norm": 2.2261248495718524, "language_loss": 0.84046459, "learning_rate": 3.98837155745293e-06, "loss": 0.8669498, "num_input_tokens_seen": 11264230, "step": 527, "time_per_iteration": 2.8324971199035645 }, { "auxiliary_loss_clip": 0.01414807, "auxiliary_loss_mlp": 0.01220892, "balance_loss_clip": 1.0209707, "balance_loss_mlp": 1.00478911, "epoch": 0.06348824625744003, "flos": 19500905593440.0, "grad_norm": 2.289127868697516, "language_loss": 0.76161563, "learning_rate": 3.988287528218854e-06, "loss": 0.78797257, "num_input_tokens_seen": 11283015, "step": 528, "time_per_iteration": 2.7852683067321777 }, { "auxiliary_loss_clip": 0.01416756, "auxiliary_loss_mlp": 0.01219406, "balance_loss_clip": 1.02183211, "balance_loss_mlp": 1.00444746, "epoch": 0.06360848914807912, "flos": 15481241475840.0, "grad_norm": 2.0348259551967844, "language_loss": 0.90201396, "learning_rate": 3.98820319736314e-06, "loss": 0.92837554, "num_input_tokens_seen": 11299630, "step": 529, "time_per_iteration": 2.6721768379211426 }, { "auxiliary_loss_clip": 0.01378004, "auxiliary_loss_mlp": 0.01220471, "balance_loss_clip": 1.01918626, "balance_loss_mlp": 1.00494063, "epoch": 0.0637287320387182, "flos": 20593379512800.0, "grad_norm": 2.3329088403518234, "language_loss": 0.85198855, "learning_rate": 3.988118564898582e-06, "loss": 0.87797332, "num_input_tokens_seen": 11319170, "step": 530, "time_per_iteration": 2.8487110137939453 }, { "auxiliary_loss_clip": 0.01374878, "auxiliary_loss_mlp": 0.00874577, "balance_loss_clip": 1.01944363, "balance_loss_mlp": 1.00012505, "epoch": 0.0638489749293573, "flos": 17412229513440.0, "grad_norm": 2.4158720329855625, "language_loss": 0.89406979, "learning_rate": 3.988033630838019e-06, "loss": 0.91656435, "num_input_tokens_seen": 11333210, "step": 531, "time_per_iteration": 2.917285919189453 }, { "auxiliary_loss_clip": 0.01429757, "auxiliary_loss_mlp": 0.01219363, "balance_loss_clip": 1.02250409, "balance_loss_mlp": 1.00402355, "epoch": 0.0639692178199964, "flos": 23807673859680.0, "grad_norm": 1.846122843638522, "language_loss": 0.8815949, "learning_rate": 3.987948395194334e-06, "loss": 0.90808618, "num_input_tokens_seen": 11355590, "step": 532, "time_per_iteration": 2.8311948776245117 }, { "auxiliary_loss_clip": 0.01425814, "auxiliary_loss_mlp": 0.01219309, "balance_loss_clip": 1.02072227, "balance_loss_mlp": 1.00377893, "epoch": 0.06408946071063548, "flos": 18477235026720.0, "grad_norm": 2.0878331049436865, "language_loss": 0.76381028, "learning_rate": 3.987862857980458e-06, "loss": 0.79026151, "num_input_tokens_seen": 11371535, "step": 533, "time_per_iteration": 2.7513959407806396 }, { "auxiliary_loss_clip": 0.01370703, "auxiliary_loss_mlp": 0.01220575, "balance_loss_clip": 1.01866746, "balance_loss_mlp": 1.00466311, "epoch": 0.06420970360127458, "flos": 27162233264160.0, "grad_norm": 2.0485493134322095, "language_loss": 0.76557112, "learning_rate": 3.987777019209368e-06, "loss": 0.79148388, "num_input_tokens_seen": 11392050, "step": 534, "time_per_iteration": 2.8692688941955566 }, { "auxiliary_loss_clip": 0.01440283, "auxiliary_loss_mlp": 0.01221345, "balance_loss_clip": 1.02151942, "balance_loss_mlp": 1.0054332, "epoch": 0.06432994649191366, "flos": 23659685210880.0, "grad_norm": 1.7649642138677215, "language_loss": 0.81041074, "learning_rate": 3.987690878894084e-06, "loss": 0.83702701, "num_input_tokens_seen": 11411765, "step": 535, "time_per_iteration": 2.780388355255127 }, { "auxiliary_loss_clip": 0.01403701, "auxiliary_loss_mlp": 0.01219913, "balance_loss_clip": 1.02078235, "balance_loss_mlp": 1.00438285, "epoch": 0.06445018938255276, "flos": 23403965149440.0, "grad_norm": 2.5723831350936393, "language_loss": 0.84759343, "learning_rate": 3.987604437047673e-06, "loss": 0.8738296, "num_input_tokens_seen": 11431565, "step": 536, "time_per_iteration": 2.8710362911224365 }, { "auxiliary_loss_clip": 0.01413297, "auxiliary_loss_mlp": 0.01219365, "balance_loss_clip": 1.02023482, "balance_loss_mlp": 1.00402546, "epoch": 0.06457043227319184, "flos": 19646702897760.0, "grad_norm": 2.3379997995107193, "language_loss": 0.77499211, "learning_rate": 3.987517693683251e-06, "loss": 0.80131876, "num_input_tokens_seen": 11450140, "step": 537, "time_per_iteration": 2.8474550247192383 }, { "auxiliary_loss_clip": 0.01386441, "auxiliary_loss_mlp": 0.01219103, "balance_loss_clip": 1.01996529, "balance_loss_mlp": 1.0037632, "epoch": 0.06469067516383094, "flos": 16978753281600.0, "grad_norm": 2.7485818758401024, "language_loss": 0.96447098, "learning_rate": 3.9874306488139745e-06, "loss": 0.99052644, "num_input_tokens_seen": 11465400, "step": 538, "time_per_iteration": 2.8530235290527344 }, { "auxiliary_loss_clip": 0.01363376, "auxiliary_loss_mlp": 0.0122155, "balance_loss_clip": 1.01872134, "balance_loss_mlp": 1.00563848, "epoch": 0.06481091805447003, "flos": 23296413355200.0, "grad_norm": 2.181235011852502, "language_loss": 0.87807572, "learning_rate": 3.987343302453049e-06, "loss": 0.903925, "num_input_tokens_seen": 11486675, "step": 539, "time_per_iteration": 2.9659323692321777 }, { "auxiliary_loss_clip": 0.01402165, "auxiliary_loss_mlp": 0.01221022, "balance_loss_clip": 1.02094507, "balance_loss_mlp": 1.00472903, "epoch": 0.06493116094510912, "flos": 29172370674240.0, "grad_norm": 1.7391404801782597, "language_loss": 0.8241725, "learning_rate": 3.987255654613724e-06, "loss": 0.85040432, "num_input_tokens_seen": 11510440, "step": 540, "time_per_iteration": 3.8637332916259766 }, { "auxiliary_loss_clip": 0.01383305, "auxiliary_loss_mlp": 0.01219121, "balance_loss_clip": 1.01987362, "balance_loss_mlp": 1.00359011, "epoch": 0.06505140383574821, "flos": 19865075541120.0, "grad_norm": 2.1975229504352787, "language_loss": 0.70596802, "learning_rate": 3.987167705309296e-06, "loss": 0.7319923, "num_input_tokens_seen": 11529715, "step": 541, "time_per_iteration": 3.754382371902466 }, { "auxiliary_loss_clip": 0.01424721, "auxiliary_loss_mlp": 0.00874434, "balance_loss_clip": 1.02162945, "balance_loss_mlp": 0.99994558, "epoch": 0.0651716467263873, "flos": 17924711423040.0, "grad_norm": 1.9816587838457067, "language_loss": 0.95133734, "learning_rate": 3.987079454553108e-06, "loss": 0.97432888, "num_input_tokens_seen": 11547665, "step": 542, "time_per_iteration": 4.061153888702393 }, { "auxiliary_loss_clip": 0.01362664, "auxiliary_loss_mlp": 0.01218728, "balance_loss_clip": 1.02002168, "balance_loss_mlp": 1.00357938, "epoch": 0.0652918896170264, "flos": 20842848853920.0, "grad_norm": 1.971051862967314, "language_loss": 0.91006309, "learning_rate": 3.986990902358546e-06, "loss": 0.93587697, "num_input_tokens_seen": 11564605, "step": 543, "time_per_iteration": 2.903184175491333 }, { "auxiliary_loss_clip": 0.01414556, "auxiliary_loss_mlp": 0.0121809, "balance_loss_clip": 1.01972342, "balance_loss_mlp": 1.00313222, "epoch": 0.06541213250766549, "flos": 21872518675200.0, "grad_norm": 2.225502080500168, "language_loss": 0.9344877, "learning_rate": 3.986902048739045e-06, "loss": 0.96081424, "num_input_tokens_seen": 11584550, "step": 544, "time_per_iteration": 2.843106985092163 }, { "auxiliary_loss_clip": 0.01388431, "auxiliary_loss_mlp": 0.01219808, "balance_loss_clip": 1.01960135, "balance_loss_mlp": 1.00408697, "epoch": 0.06553237539830457, "flos": 23110753975200.0, "grad_norm": 2.6423662648033566, "language_loss": 0.80265439, "learning_rate": 3.986812893708082e-06, "loss": 0.82873678, "num_input_tokens_seen": 11600740, "step": 545, "time_per_iteration": 2.8685879707336426 }, { "auxiliary_loss_clip": 0.01408012, "auxiliary_loss_mlp": 0.01219926, "balance_loss_clip": 1.01978135, "balance_loss_mlp": 1.00382376, "epoch": 0.06565261828894367, "flos": 17923777407360.0, "grad_norm": 1.989743209725459, "language_loss": 0.81559181, "learning_rate": 3.9867234372791826e-06, "loss": 0.84187126, "num_input_tokens_seen": 11618695, "step": 546, "time_per_iteration": 2.8561458587646484 }, { "auxiliary_loss_clip": 0.01413829, "auxiliary_loss_mlp": 0.01216573, "balance_loss_clip": 1.02002108, "balance_loss_mlp": 1.00314057, "epoch": 0.06577286117958275, "flos": 22783069277280.0, "grad_norm": 1.5255612378017835, "language_loss": 0.86988437, "learning_rate": 3.986633679465918e-06, "loss": 0.89618838, "num_input_tokens_seen": 11638850, "step": 547, "time_per_iteration": 2.759730100631714 }, { "auxiliary_loss_clip": 0.0135472, "auxiliary_loss_mlp": 0.01219919, "balance_loss_clip": 1.01925528, "balance_loss_mlp": 1.00457907, "epoch": 0.06589310407022185, "flos": 23696206384320.0, "grad_norm": 2.536763628976149, "language_loss": 0.80562282, "learning_rate": 3.986543620281904e-06, "loss": 0.83136922, "num_input_tokens_seen": 11658500, "step": 548, "time_per_iteration": 2.853186845779419 }, { "auxiliary_loss_clip": 0.01398756, "auxiliary_loss_mlp": 0.01217296, "balance_loss_clip": 1.01908433, "balance_loss_mlp": 1.0042448, "epoch": 0.06601334696086093, "flos": 26864783095680.0, "grad_norm": 1.6749657818963308, "language_loss": 0.91019821, "learning_rate": 3.986453259740802e-06, "loss": 0.93635869, "num_input_tokens_seen": 11676670, "step": 549, "time_per_iteration": 2.8828439712524414 }, { "auxiliary_loss_clip": 0.01389118, "auxiliary_loss_mlp": 0.01218682, "balance_loss_clip": 1.01999855, "balance_loss_mlp": 1.00410509, "epoch": 0.06613358985150003, "flos": 12567702276000.0, "grad_norm": 3.3732667386573674, "language_loss": 0.79267788, "learning_rate": 3.986362597856319e-06, "loss": 0.81875587, "num_input_tokens_seen": 11693170, "step": 550, "time_per_iteration": 2.7979238033294678 }, { "auxiliary_loss_clip": 0.01397436, "auxiliary_loss_mlp": 0.00874438, "balance_loss_clip": 1.01876974, "balance_loss_mlp": 0.99993205, "epoch": 0.06625383274213913, "flos": 18332515432800.0, "grad_norm": 7.892864013766492, "language_loss": 0.81811386, "learning_rate": 3.986271634642211e-06, "loss": 0.84083265, "num_input_tokens_seen": 11710150, "step": 551, "time_per_iteration": 2.8181445598602295 }, { "auxiliary_loss_clip": 0.0143816, "auxiliary_loss_mlp": 0.01217791, "balance_loss_clip": 1.02149618, "balance_loss_mlp": 1.00397754, "epoch": 0.06637407563277821, "flos": 15375593636640.0, "grad_norm": 2.112584124199247, "language_loss": 0.81528872, "learning_rate": 3.986180370112274e-06, "loss": 0.84184825, "num_input_tokens_seen": 11726670, "step": 552, "time_per_iteration": 2.770925521850586 }, { "auxiliary_loss_clip": 0.0141267, "auxiliary_loss_mlp": 0.00874478, "balance_loss_clip": 1.01988482, "balance_loss_mlp": 0.99995685, "epoch": 0.0664943185234173, "flos": 24025256182080.0, "grad_norm": 1.6529194864005778, "language_loss": 0.74571931, "learning_rate": 3.986088804280354e-06, "loss": 0.76859081, "num_input_tokens_seen": 11746400, "step": 553, "time_per_iteration": 2.7975480556488037 }, { "auxiliary_loss_clip": 0.01397672, "auxiliary_loss_mlp": 0.01218475, "balance_loss_clip": 1.01977146, "balance_loss_mlp": 1.00428009, "epoch": 0.06661456141405639, "flos": 20957513536800.0, "grad_norm": 2.7115093828581593, "language_loss": 0.93800437, "learning_rate": 3.985996937160342e-06, "loss": 0.96416593, "num_input_tokens_seen": 11765590, "step": 554, "time_per_iteration": 2.845554828643799 }, { "auxiliary_loss_clip": 0.01411876, "auxiliary_loss_mlp": 0.01219622, "balance_loss_clip": 1.0205003, "balance_loss_mlp": 1.00466418, "epoch": 0.06673480430469549, "flos": 52223971557600.0, "grad_norm": 2.525126663535126, "language_loss": 0.69155711, "learning_rate": 3.985904768766173e-06, "loss": 0.71787214, "num_input_tokens_seen": 11788365, "step": 555, "time_per_iteration": 2.95804762840271 }, { "auxiliary_loss_clip": 0.01372984, "auxiliary_loss_mlp": 0.01217955, "balance_loss_clip": 1.01865053, "balance_loss_mlp": 1.00337827, "epoch": 0.06685504719533458, "flos": 16217089420320.0, "grad_norm": 2.252408821676372, "language_loss": 0.76340055, "learning_rate": 3.98581229911183e-06, "loss": 0.78930998, "num_input_tokens_seen": 11807285, "step": 556, "time_per_iteration": 2.7343943119049072 }, { "auxiliary_loss_clip": 0.01424882, "auxiliary_loss_mlp": 0.01218981, "balance_loss_clip": 1.02069259, "balance_loss_mlp": 1.00497675, "epoch": 0.06697529008597367, "flos": 22491546516000.0, "grad_norm": 1.788120897322753, "language_loss": 0.92184806, "learning_rate": 3.985719528211341e-06, "loss": 0.94828671, "num_input_tokens_seen": 11826655, "step": 557, "time_per_iteration": 2.886401414871216 }, { "auxiliary_loss_clip": 0.01397258, "auxiliary_loss_mlp": 0.01211966, "balance_loss_clip": 1.02242589, "balance_loss_mlp": 1.00139427, "epoch": 0.06709553297661276, "flos": 62688246330240.0, "grad_norm": 0.8422851525287763, "language_loss": 0.63046765, "learning_rate": 3.985626456078777e-06, "loss": 0.65655988, "num_input_tokens_seen": 11891310, "step": 558, "time_per_iteration": 3.388810396194458 }, { "auxiliary_loss_clip": 0.01371089, "auxiliary_loss_mlp": 0.0122082, "balance_loss_clip": 1.01936507, "balance_loss_mlp": 1.00605297, "epoch": 0.06721577586725185, "flos": 11216598477120.0, "grad_norm": 2.3991568907185044, "language_loss": 0.86042356, "learning_rate": 3.985533082728259e-06, "loss": 0.88634264, "num_input_tokens_seen": 11906965, "step": 559, "time_per_iteration": 2.816802978515625 }, { "auxiliary_loss_clip": 0.01437075, "auxiliary_loss_mlp": 0.01218362, "balance_loss_clip": 1.02058101, "balance_loss_mlp": 1.00454831, "epoch": 0.06733601875789094, "flos": 25922201780160.0, "grad_norm": 1.8545220122622532, "language_loss": 0.74847448, "learning_rate": 3.985439408173951e-06, "loss": 0.77502882, "num_input_tokens_seen": 11927190, "step": 560, "time_per_iteration": 2.7360928058624268 }, { "auxiliary_loss_clip": 0.01438565, "auxiliary_loss_mlp": 0.01219077, "balance_loss_clip": 1.0216682, "balance_loss_mlp": 1.00430942, "epoch": 0.06745626164853002, "flos": 20813656111200.0, "grad_norm": 2.0339724688979324, "language_loss": 0.70807314, "learning_rate": 3.9853454324300634e-06, "loss": 0.7346496, "num_input_tokens_seen": 11946400, "step": 561, "time_per_iteration": 2.6921563148498535 }, { "auxiliary_loss_clip": 0.01338498, "auxiliary_loss_mlp": 0.01219163, "balance_loss_clip": 1.01753283, "balance_loss_mlp": 1.00477707, "epoch": 0.06757650453916912, "flos": 19829272841280.0, "grad_norm": 1.9383822583158667, "language_loss": 0.7766006, "learning_rate": 3.985251155510852e-06, "loss": 0.80217719, "num_input_tokens_seen": 11965430, "step": 562, "time_per_iteration": 3.0697803497314453 }, { "auxiliary_loss_clip": 0.0134071, "auxiliary_loss_mlp": 0.01219209, "balance_loss_clip": 1.01819968, "balance_loss_mlp": 1.0052042, "epoch": 0.06769674742980822, "flos": 25739236676160.0, "grad_norm": 3.4212872883129006, "language_loss": 0.80251551, "learning_rate": 3.98515657743062e-06, "loss": 0.82811469, "num_input_tokens_seen": 11984895, "step": 563, "time_per_iteration": 3.0439207553863525 }, { "auxiliary_loss_clip": 0.01393516, "auxiliary_loss_mlp": 0.01217498, "balance_loss_clip": 1.01801741, "balance_loss_mlp": 1.00387526, "epoch": 0.0678169903204473, "flos": 13074795633600.0, "grad_norm": 2.1726888864375047, "language_loss": 0.77722889, "learning_rate": 3.985061698203711e-06, "loss": 0.803339, "num_input_tokens_seen": 12002010, "step": 564, "time_per_iteration": 2.7286880016326904 }, { "auxiliary_loss_clip": 0.01429046, "auxiliary_loss_mlp": 0.0120994, "balance_loss_clip": 1.02276635, "balance_loss_mlp": 1.00089478, "epoch": 0.0679372332110864, "flos": 70865863820640.0, "grad_norm": 0.8923920478086314, "language_loss": 0.63846171, "learning_rate": 3.984966517844523e-06, "loss": 0.66485155, "num_input_tokens_seen": 12057255, "step": 565, "time_per_iteration": 3.1991240978240967 }, { "auxiliary_loss_clip": 0.01437723, "auxiliary_loss_mlp": 0.01218635, "balance_loss_clip": 1.02145648, "balance_loss_mlp": 1.00386786, "epoch": 0.06805747610172548, "flos": 28256431520160.0, "grad_norm": 2.1871927747962565, "language_loss": 0.80658931, "learning_rate": 3.984871036367492e-06, "loss": 0.83315289, "num_input_tokens_seen": 12077280, "step": 566, "time_per_iteration": 4.662633180618286 }, { "auxiliary_loss_clip": 0.01411186, "auxiliary_loss_mlp": 0.00874395, "balance_loss_clip": 1.02030444, "balance_loss_mlp": 0.99992394, "epoch": 0.06817771899236458, "flos": 20120544136800.0, "grad_norm": 2.1620211565182497, "language_loss": 0.83224761, "learning_rate": 3.984775253787102e-06, "loss": 0.85510343, "num_input_tokens_seen": 12095570, "step": 567, "time_per_iteration": 2.818976640701294 }, { "auxiliary_loss_clip": 0.01421432, "auxiliary_loss_mlp": 0.01217866, "balance_loss_clip": 1.02008104, "balance_loss_mlp": 1.0038619, "epoch": 0.06829796188300366, "flos": 17930638830240.0, "grad_norm": 2.628081613905461, "language_loss": 0.88189715, "learning_rate": 3.984679170117885e-06, "loss": 0.90829015, "num_input_tokens_seen": 12111775, "step": 568, "time_per_iteration": 3.6910178661346436 }, { "auxiliary_loss_clip": 0.01409729, "auxiliary_loss_mlp": 0.01216893, "balance_loss_clip": 1.01959801, "balance_loss_mlp": 1.00365126, "epoch": 0.06841820477364276, "flos": 14501636055360.0, "grad_norm": 2.7817196331301473, "language_loss": 0.78579175, "learning_rate": 3.984582785374415e-06, "loss": 0.81205797, "num_input_tokens_seen": 12129215, "step": 569, "time_per_iteration": 3.0049917697906494 }, { "auxiliary_loss_clip": 0.01384875, "auxiliary_loss_mlp": 0.00874416, "balance_loss_clip": 1.01895428, "balance_loss_mlp": 0.99996328, "epoch": 0.06853844766428185, "flos": 21938484057120.0, "grad_norm": 1.9494005808964885, "language_loss": 0.80463147, "learning_rate": 3.9844860995713155e-06, "loss": 0.82722431, "num_input_tokens_seen": 12148755, "step": 570, "time_per_iteration": 2.851318836212158 }, { "auxiliary_loss_clip": 0.01410074, "auxiliary_loss_mlp": 0.01217526, "balance_loss_clip": 1.02035129, "balance_loss_mlp": 1.00409389, "epoch": 0.06865869055492094, "flos": 16800637874400.0, "grad_norm": 2.1138852255816656, "language_loss": 0.82801712, "learning_rate": 3.9843891127232524e-06, "loss": 0.85429305, "num_input_tokens_seen": 12166290, "step": 571, "time_per_iteration": 2.7445387840270996 }, { "auxiliary_loss_clip": 0.01350823, "auxiliary_loss_mlp": 0.01217078, "balance_loss_clip": 1.01617479, "balance_loss_mlp": 1.00364554, "epoch": 0.06877893344556003, "flos": 19937291643360.0, "grad_norm": 2.8301790289294755, "language_loss": 0.66809916, "learning_rate": 3.984291824844938e-06, "loss": 0.69377822, "num_input_tokens_seen": 12181385, "step": 572, "time_per_iteration": 2.8652257919311523 }, { "auxiliary_loss_clip": 0.01436245, "auxiliary_loss_mlp": 0.01218051, "balance_loss_clip": 1.02081037, "balance_loss_mlp": 1.00423765, "epoch": 0.06889917633619912, "flos": 23039400041280.0, "grad_norm": 2.665782251964031, "language_loss": 0.85048407, "learning_rate": 3.984194235951132e-06, "loss": 0.87702703, "num_input_tokens_seen": 12197530, "step": 573, "time_per_iteration": 2.7023825645446777 }, { "auxiliary_loss_clip": 0.01438721, "auxiliary_loss_mlp": 0.01217759, "balance_loss_clip": 1.02246678, "balance_loss_mlp": 1.00394511, "epoch": 0.06901941922683821, "flos": 20960567049600.0, "grad_norm": 3.754283671208553, "language_loss": 0.84988624, "learning_rate": 3.9840963460566375e-06, "loss": 0.87645102, "num_input_tokens_seen": 12216310, "step": 574, "time_per_iteration": 2.7374844551086426 }, { "auxiliary_loss_clip": 0.01334598, "auxiliary_loss_mlp": 0.01216183, "balance_loss_clip": 1.01836538, "balance_loss_mlp": 1.00370479, "epoch": 0.06913966211747731, "flos": 24821860574880.0, "grad_norm": 1.7112145277945567, "language_loss": 0.89316237, "learning_rate": 3.983998155176305e-06, "loss": 0.91867018, "num_input_tokens_seen": 12236670, "step": 575, "time_per_iteration": 2.9621310234069824 }, { "auxiliary_loss_clip": 0.01426597, "auxiliary_loss_mlp": 0.01209448, "balance_loss_clip": 1.02198482, "balance_loss_mlp": 1.00040293, "epoch": 0.06925990500811639, "flos": 58367481037920.0, "grad_norm": 0.8164919390674117, "language_loss": 0.57029796, "learning_rate": 3.9838996633250305e-06, "loss": 0.59665847, "num_input_tokens_seen": 12297185, "step": 576, "time_per_iteration": 3.1947453022003174 }, { "auxiliary_loss_clip": 0.01420062, "auxiliary_loss_mlp": 0.01215936, "balance_loss_clip": 1.01960003, "balance_loss_mlp": 1.0036478, "epoch": 0.06938014789875549, "flos": 12749948906400.0, "grad_norm": 2.0955441444401313, "language_loss": 0.88261235, "learning_rate": 3.983800870517753e-06, "loss": 0.90897238, "num_input_tokens_seen": 12313975, "step": 577, "time_per_iteration": 2.7406866550445557 }, { "auxiliary_loss_clip": 0.01409331, "auxiliary_loss_mlp": 0.01215277, "balance_loss_clip": 1.02045572, "balance_loss_mlp": 1.00337076, "epoch": 0.06950039078939457, "flos": 22820237076960.0, "grad_norm": 3.660880237310621, "language_loss": 0.77950925, "learning_rate": 3.983701776769463e-06, "loss": 0.80575526, "num_input_tokens_seen": 12331385, "step": 578, "time_per_iteration": 2.787930965423584 }, { "auxiliary_loss_clip": 0.01423778, "auxiliary_loss_mlp": 0.01215629, "balance_loss_clip": 1.02131712, "balance_loss_mlp": 1.00372219, "epoch": 0.06962063368003367, "flos": 21941357951520.0, "grad_norm": 1.910508794619384, "language_loss": 0.85623312, "learning_rate": 3.9836023820951885e-06, "loss": 0.88262719, "num_input_tokens_seen": 12350600, "step": 579, "time_per_iteration": 2.869927167892456 }, { "auxiliary_loss_clip": 0.01367748, "auxiliary_loss_mlp": 0.01215418, "balance_loss_clip": 1.01834202, "balance_loss_mlp": 1.00293946, "epoch": 0.06974087657067275, "flos": 20706032469600.0, "grad_norm": 1.9704118201295742, "language_loss": 0.68858492, "learning_rate": 3.983502686510011e-06, "loss": 0.71441662, "num_input_tokens_seen": 12371430, "step": 580, "time_per_iteration": 2.888857841491699 }, { "auxiliary_loss_clip": 0.01420494, "auxiliary_loss_mlp": 0.00874433, "balance_loss_clip": 1.02005136, "balance_loss_mlp": 1.00007915, "epoch": 0.06986111946131185, "flos": 22638241912320.0, "grad_norm": 1.9834503980574196, "language_loss": 0.73304909, "learning_rate": 3.9834026900290525e-06, "loss": 0.75599837, "num_input_tokens_seen": 12390825, "step": 581, "time_per_iteration": 2.8279571533203125 }, { "auxiliary_loss_clip": 0.01435682, "auxiliary_loss_mlp": 0.01217746, "balance_loss_clip": 1.0209254, "balance_loss_mlp": 1.00431395, "epoch": 0.06998136235195095, "flos": 26943465460320.0, "grad_norm": 2.311493333300096, "language_loss": 1.00237298, "learning_rate": 3.983302392667482e-06, "loss": 1.0289073, "num_input_tokens_seen": 12411670, "step": 582, "time_per_iteration": 2.819854497909546 }, { "auxiliary_loss_clip": 0.01409975, "auxiliary_loss_mlp": 0.01217434, "balance_loss_clip": 1.02106512, "balance_loss_mlp": 1.00457382, "epoch": 0.07010160524259003, "flos": 22492516455360.0, "grad_norm": 1.7064874740118385, "language_loss": 0.93289089, "learning_rate": 3.983201794440517e-06, "loss": 0.95916498, "num_input_tokens_seen": 12431245, "step": 583, "time_per_iteration": 2.8086001873016357 }, { "auxiliary_loss_clip": 0.01398421, "auxiliary_loss_mlp": 0.01216486, "balance_loss_clip": 1.019997, "balance_loss_mlp": 1.00381708, "epoch": 0.07022184813322913, "flos": 18332551356480.0, "grad_norm": 1.9294148032816822, "language_loss": 0.67841035, "learning_rate": 3.9831008953634165e-06, "loss": 0.70455945, "num_input_tokens_seen": 12450535, "step": 584, "time_per_iteration": 2.8373405933380127 }, { "auxiliary_loss_clip": 0.01357961, "auxiliary_loss_mlp": 0.01217463, "balance_loss_clip": 1.0183177, "balance_loss_mlp": 1.00498486, "epoch": 0.07034209102386821, "flos": 24675560339040.0, "grad_norm": 1.8083217840049468, "language_loss": 0.80827737, "learning_rate": 3.9829996954514864e-06, "loss": 0.83403158, "num_input_tokens_seen": 12469675, "step": 585, "time_per_iteration": 2.883077621459961 }, { "auxiliary_loss_clip": 0.01422089, "auxiliary_loss_mlp": 0.01216827, "balance_loss_clip": 1.02072227, "balance_loss_mlp": 1.00434875, "epoch": 0.0704623339145073, "flos": 25995890753280.0, "grad_norm": 1.803883485261475, "language_loss": 0.84325534, "learning_rate": 3.982898194720079e-06, "loss": 0.86964452, "num_input_tokens_seen": 12490405, "step": 586, "time_per_iteration": 2.860600471496582 }, { "auxiliary_loss_clip": 0.01396504, "auxiliary_loss_mlp": 0.00874433, "balance_loss_clip": 1.01995492, "balance_loss_mlp": 1.00011325, "epoch": 0.0705825768051464, "flos": 25338330012960.0, "grad_norm": 2.5275628626912514, "language_loss": 0.8240211, "learning_rate": 3.982796393184592e-06, "loss": 0.84673047, "num_input_tokens_seen": 12509485, "step": 587, "time_per_iteration": 2.806708812713623 }, { "auxiliary_loss_clip": 0.01413363, "auxiliary_loss_mlp": 0.01207828, "balance_loss_clip": 1.02208018, "balance_loss_mlp": 1.00030804, "epoch": 0.07070281969578548, "flos": 66047583584160.0, "grad_norm": 0.7923349637014395, "language_loss": 0.62655008, "learning_rate": 3.98269429086047e-06, "loss": 0.65276194, "num_input_tokens_seen": 12567325, "step": 588, "time_per_iteration": 3.17594051361084 }, { "auxiliary_loss_clip": 0.01394417, "auxiliary_loss_mlp": 0.0121648, "balance_loss_clip": 1.01853573, "balance_loss_mlp": 1.00419211, "epoch": 0.07082306258642458, "flos": 23653578185280.0, "grad_norm": 2.786566661771735, "language_loss": 0.86238402, "learning_rate": 3.982591887763199e-06, "loss": 0.888493, "num_input_tokens_seen": 12584785, "step": 589, "time_per_iteration": 2.811601161956787 }, { "auxiliary_loss_clip": 0.01396048, "auxiliary_loss_mlp": 0.01215694, "balance_loss_clip": 1.01925862, "balance_loss_mlp": 1.00321555, "epoch": 0.07094330547706366, "flos": 13880057633280.0, "grad_norm": 2.166237438096897, "language_loss": 0.81982934, "learning_rate": 3.982489183908316e-06, "loss": 0.84594679, "num_input_tokens_seen": 12601205, "step": 590, "time_per_iteration": 2.7880167961120605 }, { "auxiliary_loss_clip": 0.01357368, "auxiliary_loss_mlp": 0.01214457, "balance_loss_clip": 1.01772499, "balance_loss_mlp": 1.00369537, "epoch": 0.07106354836770276, "flos": 24645110267520.0, "grad_norm": 1.6807198230469567, "language_loss": 0.84570515, "learning_rate": 3.982386179311399e-06, "loss": 0.87142336, "num_input_tokens_seen": 12621725, "step": 591, "time_per_iteration": 2.932288408279419 }, { "auxiliary_loss_clip": 0.01410272, "auxiliary_loss_mlp": 0.01217878, "balance_loss_clip": 1.01981521, "balance_loss_mlp": 1.00406432, "epoch": 0.07118379125834184, "flos": 16217233115040.0, "grad_norm": 2.800177566840991, "language_loss": 0.87484634, "learning_rate": 3.982282873988075e-06, "loss": 0.90112782, "num_input_tokens_seen": 12639600, "step": 592, "time_per_iteration": 4.635165452957153 }, { "auxiliary_loss_clip": 0.01383653, "auxiliary_loss_mlp": 0.01216378, "balance_loss_clip": 1.01866531, "balance_loss_mlp": 1.00428057, "epoch": 0.07130403414898094, "flos": 19719996710400.0, "grad_norm": 1.9664920406780837, "language_loss": 0.86863905, "learning_rate": 3.982179267954016e-06, "loss": 0.89463937, "num_input_tokens_seen": 12660030, "step": 593, "time_per_iteration": 2.8207380771636963 }, { "auxiliary_loss_clip": 0.01433822, "auxiliary_loss_mlp": 0.01216099, "balance_loss_clip": 1.02028608, "balance_loss_mlp": 1.00381148, "epoch": 0.07142427703962004, "flos": 21871943896320.0, "grad_norm": 2.773840006382669, "language_loss": 0.95947701, "learning_rate": 3.982075361224937e-06, "loss": 0.98597622, "num_input_tokens_seen": 12678395, "step": 594, "time_per_iteration": 3.624497175216675 }, { "auxiliary_loss_clip": 0.01408374, "auxiliary_loss_mlp": 0.00874356, "balance_loss_clip": 1.02042198, "balance_loss_mlp": 1.00010252, "epoch": 0.07154451993025912, "flos": 18296604961920.0, "grad_norm": 1.9606686154896948, "language_loss": 0.87923443, "learning_rate": 3.981971153816602e-06, "loss": 0.90206182, "num_input_tokens_seen": 12696000, "step": 595, "time_per_iteration": 3.6670727729797363 }, { "auxiliary_loss_clip": 0.01434683, "auxiliary_loss_mlp": 0.01215592, "balance_loss_clip": 1.0214119, "balance_loss_mlp": 1.00444853, "epoch": 0.07166476282089822, "flos": 22160700534240.0, "grad_norm": 1.6165534304821036, "language_loss": 0.9632535, "learning_rate": 3.981866645744819e-06, "loss": 0.98975629, "num_input_tokens_seen": 12716715, "step": 596, "time_per_iteration": 2.7239980697631836 }, { "auxiliary_loss_clip": 0.01433733, "auxiliary_loss_mlp": 0.00874419, "balance_loss_clip": 1.02040029, "balance_loss_mlp": 1.00018358, "epoch": 0.0717850057115373, "flos": 14136352473600.0, "grad_norm": 2.3784396510217443, "language_loss": 0.81150591, "learning_rate": 3.9817618370254416e-06, "loss": 0.83458745, "num_input_tokens_seen": 12733370, "step": 597, "time_per_iteration": 2.690976858139038 }, { "auxiliary_loss_clip": 0.0143532, "auxiliary_loss_mlp": 0.01215977, "balance_loss_clip": 1.02139556, "balance_loss_mlp": 1.00407064, "epoch": 0.0719052486021764, "flos": 30917807102880.0, "grad_norm": 2.2534921111001247, "language_loss": 0.87094611, "learning_rate": 3.9816567276743684e-06, "loss": 0.89745909, "num_input_tokens_seen": 12753235, "step": 598, "time_per_iteration": 2.7454330921173096 }, { "auxiliary_loss_clip": 0.01380732, "auxiliary_loss_mlp": 0.01214304, "balance_loss_clip": 1.01839018, "balance_loss_mlp": 1.00335109, "epoch": 0.0720254914928155, "flos": 21287030342400.0, "grad_norm": 2.1805556080701725, "language_loss": 0.77349329, "learning_rate": 3.9815513177075466e-06, "loss": 0.79944366, "num_input_tokens_seen": 12772020, "step": 599, "time_per_iteration": 2.7336320877075195 }, { "auxiliary_loss_clip": 0.014205, "auxiliary_loss_mlp": 0.0121561, "balance_loss_clip": 1.02095616, "balance_loss_mlp": 1.0042758, "epoch": 0.07214573438345458, "flos": 27819183301920.0, "grad_norm": 1.5749170600285292, "language_loss": 0.70341754, "learning_rate": 3.9814456071409646e-06, "loss": 0.72977859, "num_input_tokens_seen": 12792555, "step": 600, "time_per_iteration": 2.7958319187164307 }, { "auxiliary_loss_clip": 0.01347136, "auxiliary_loss_mlp": 0.01217866, "balance_loss_clip": 1.01689076, "balance_loss_mlp": 1.00481522, "epoch": 0.07226597727409367, "flos": 25483588462080.0, "grad_norm": 4.808568297756377, "language_loss": 0.85047364, "learning_rate": 3.981339595990659e-06, "loss": 0.87612367, "num_input_tokens_seen": 12811085, "step": 601, "time_per_iteration": 2.9311156272888184 }, { "auxiliary_loss_clip": 0.0140731, "auxiliary_loss_mlp": 0.01214722, "balance_loss_clip": 1.01942658, "balance_loss_mlp": 1.00338769, "epoch": 0.07238622016473276, "flos": 23513851982880.0, "grad_norm": 1.9318147619019725, "language_loss": 0.81519926, "learning_rate": 3.981233284272713e-06, "loss": 0.84141958, "num_input_tokens_seen": 12830830, "step": 602, "time_per_iteration": 2.8131091594696045 }, { "auxiliary_loss_clip": 0.01378232, "auxiliary_loss_mlp": 0.01213752, "balance_loss_clip": 1.01873064, "balance_loss_mlp": 1.00318027, "epoch": 0.07250646305537185, "flos": 25453533551040.0, "grad_norm": 1.565190670003544, "language_loss": 0.90226412, "learning_rate": 3.981126672003253e-06, "loss": 0.92818397, "num_input_tokens_seen": 12853505, "step": 603, "time_per_iteration": 2.9172706604003906 }, { "auxiliary_loss_clip": 0.01401821, "auxiliary_loss_mlp": 0.01215707, "balance_loss_clip": 1.01846123, "balance_loss_mlp": 1.00399172, "epoch": 0.07262670594601094, "flos": 27155048528160.0, "grad_norm": 1.9275298055875099, "language_loss": 0.78264999, "learning_rate": 3.981019759198451e-06, "loss": 0.80882531, "num_input_tokens_seen": 12872455, "step": 604, "time_per_iteration": 2.8711531162261963 }, { "auxiliary_loss_clip": 0.01388745, "auxiliary_loss_mlp": 0.01214675, "balance_loss_clip": 1.01819372, "balance_loss_mlp": 1.00372243, "epoch": 0.07274694883665003, "flos": 26651619385920.0, "grad_norm": 2.343804333259754, "language_loss": 0.84272599, "learning_rate": 3.980912545874528e-06, "loss": 0.86876023, "num_input_tokens_seen": 12892620, "step": 605, "time_per_iteration": 2.860476016998291 }, { "auxiliary_loss_clip": 0.01420411, "auxiliary_loss_mlp": 0.00874403, "balance_loss_clip": 1.01977289, "balance_loss_mlp": 1.00017989, "epoch": 0.07286719172728913, "flos": 29862357288480.0, "grad_norm": 2.0385111116101884, "language_loss": 0.85300452, "learning_rate": 3.980805032047746e-06, "loss": 0.8759526, "num_input_tokens_seen": 12914090, "step": 606, "time_per_iteration": 2.83563232421875 }, { "auxiliary_loss_clip": 0.01380646, "auxiliary_loss_mlp": 0.01216478, "balance_loss_clip": 1.01742041, "balance_loss_mlp": 1.00438035, "epoch": 0.07298743461792821, "flos": 17382066831360.0, "grad_norm": 1.9586638691798686, "language_loss": 0.80840576, "learning_rate": 3.980697217734415e-06, "loss": 0.83437699, "num_input_tokens_seen": 12931830, "step": 607, "time_per_iteration": 2.7470178604125977 }, { "auxiliary_loss_clip": 0.01346304, "auxiliary_loss_mlp": 0.00874372, "balance_loss_clip": 1.01707542, "balance_loss_mlp": 1.00017464, "epoch": 0.07310767750856731, "flos": 19498211317440.0, "grad_norm": 1.7916258618813516, "language_loss": 0.91646379, "learning_rate": 3.980589102950891e-06, "loss": 0.93867052, "num_input_tokens_seen": 12949995, "step": 608, "time_per_iteration": 2.845811605453491 }, { "auxiliary_loss_clip": 0.01381657, "auxiliary_loss_mlp": 0.01216304, "balance_loss_clip": 1.01893103, "balance_loss_mlp": 1.00477898, "epoch": 0.07322792039920639, "flos": 29168706458880.0, "grad_norm": 2.873814839589236, "language_loss": 0.7615099, "learning_rate": 3.9804806877135755e-06, "loss": 0.78748959, "num_input_tokens_seen": 12968040, "step": 609, "time_per_iteration": 2.9268174171447754 }, { "auxiliary_loss_clip": 0.01419515, "auxiliary_loss_mlp": 0.00874371, "balance_loss_clip": 1.01974988, "balance_loss_mlp": 1.00010216, "epoch": 0.07334816328984549, "flos": 23477833740960.0, "grad_norm": 6.99811707646644, "language_loss": 0.85963738, "learning_rate": 3.980371972038915e-06, "loss": 0.88257617, "num_input_tokens_seen": 12988530, "step": 610, "time_per_iteration": 2.7898008823394775 }, { "auxiliary_loss_clip": 0.01432258, "auxiliary_loss_mlp": 0.01217092, "balance_loss_clip": 1.02006531, "balance_loss_mlp": 1.00499511, "epoch": 0.07346840618048459, "flos": 22962477936960.0, "grad_norm": 1.651089973886225, "language_loss": 0.84393334, "learning_rate": 3.980262955943399e-06, "loss": 0.87042683, "num_input_tokens_seen": 13008195, "step": 611, "time_per_iteration": 2.9148075580596924 }, { "auxiliary_loss_clip": 0.01379475, "auxiliary_loss_mlp": 0.01214953, "balance_loss_clip": 1.01919675, "balance_loss_mlp": 1.00400007, "epoch": 0.07358864907112367, "flos": 17673912905760.0, "grad_norm": 2.93943194516704, "language_loss": 0.86973864, "learning_rate": 3.980153639443569e-06, "loss": 0.89568293, "num_input_tokens_seen": 13024180, "step": 612, "time_per_iteration": 2.824692487716675 }, { "auxiliary_loss_clip": 0.01398166, "auxiliary_loss_mlp": 0.01216475, "balance_loss_clip": 1.01896381, "balance_loss_mlp": 1.00475979, "epoch": 0.07370889196176277, "flos": 24097041200160.0, "grad_norm": 2.2868817773077135, "language_loss": 0.79903722, "learning_rate": 3.980044022556005e-06, "loss": 0.82518363, "num_input_tokens_seen": 13043865, "step": 613, "time_per_iteration": 2.955554485321045 }, { "auxiliary_loss_clip": 0.01405644, "auxiliary_loss_mlp": 0.0121619, "balance_loss_clip": 1.01898515, "balance_loss_mlp": 1.00390244, "epoch": 0.07382913485240185, "flos": 25885932072480.0, "grad_norm": 2.2509972693278537, "language_loss": 0.72647494, "learning_rate": 3.9799341052973375e-06, "loss": 0.7526933, "num_input_tokens_seen": 13063700, "step": 614, "time_per_iteration": 2.977898359298706 }, { "auxiliary_loss_clip": 0.01382066, "auxiliary_loss_mlp": 0.01215246, "balance_loss_clip": 1.01947045, "balance_loss_mlp": 1.00391197, "epoch": 0.07394937774304094, "flos": 16873859839680.0, "grad_norm": 2.3087923731477957, "language_loss": 0.75513715, "learning_rate": 3.979823887684241e-06, "loss": 0.78111029, "num_input_tokens_seen": 13082640, "step": 615, "time_per_iteration": 2.925992965698242 }, { "auxiliary_loss_clip": 0.01432387, "auxiliary_loss_mlp": 0.01214949, "balance_loss_clip": 1.02012992, "balance_loss_mlp": 1.00380599, "epoch": 0.07406962063368003, "flos": 20703481888320.0, "grad_norm": 2.994521920533866, "language_loss": 0.84837222, "learning_rate": 3.979713369733434e-06, "loss": 0.87484562, "num_input_tokens_seen": 13100505, "step": 616, "time_per_iteration": 2.7921664714813232 }, { "auxiliary_loss_clip": 0.01420859, "auxiliary_loss_mlp": 0.01215359, "balance_loss_clip": 1.02057242, "balance_loss_mlp": 1.0028801, "epoch": 0.07418986352431912, "flos": 21430995539040.0, "grad_norm": 2.1285021303504634, "language_loss": 0.84994876, "learning_rate": 3.979602551461683e-06, "loss": 0.87631094, "num_input_tokens_seen": 13121285, "step": 617, "time_per_iteration": 2.8972373008728027 }, { "auxiliary_loss_clip": 0.01379766, "auxiliary_loss_mlp": 0.01215586, "balance_loss_clip": 1.01777887, "balance_loss_mlp": 1.00425184, "epoch": 0.07431010641495822, "flos": 12021142003200.0, "grad_norm": 2.2742222887932324, "language_loss": 0.92011303, "learning_rate": 3.979491432885799e-06, "loss": 0.94606662, "num_input_tokens_seen": 13137550, "step": 618, "time_per_iteration": 3.813138008117676 }, { "auxiliary_loss_clip": 0.01367196, "auxiliary_loss_mlp": 0.00874256, "balance_loss_clip": 1.01681352, "balance_loss_mlp": 1.00006914, "epoch": 0.0744303493055973, "flos": 20957585384160.0, "grad_norm": 2.1135964752883267, "language_loss": 0.83069855, "learning_rate": 3.97938001402264e-06, "loss": 0.85311306, "num_input_tokens_seen": 13156675, "step": 619, "time_per_iteration": 3.8567159175872803 }, { "auxiliary_loss_clip": 0.01366984, "auxiliary_loss_mlp": 0.01215678, "balance_loss_clip": 1.01841474, "balance_loss_mlp": 1.00453496, "epoch": 0.0745505921962364, "flos": 16253143585920.0, "grad_norm": 3.206035769224229, "language_loss": 0.79627812, "learning_rate": 3.979268294889105e-06, "loss": 0.82210475, "num_input_tokens_seen": 13172225, "step": 620, "time_per_iteration": 4.035698413848877 }, { "auxiliary_loss_clip": 0.01432299, "auxiliary_loss_mlp": 0.01215813, "balance_loss_clip": 1.02047944, "balance_loss_mlp": 1.00466955, "epoch": 0.07467083508687548, "flos": 50944652776800.0, "grad_norm": 1.960507495408954, "language_loss": 0.73954415, "learning_rate": 3.979156275502143e-06, "loss": 0.76602525, "num_input_tokens_seen": 13195885, "step": 621, "time_per_iteration": 3.8498032093048096 }, { "auxiliary_loss_clip": 0.01364674, "auxiliary_loss_mlp": 0.01216142, "balance_loss_clip": 1.01670206, "balance_loss_mlp": 1.004426, "epoch": 0.07479107797751458, "flos": 17529696243360.0, "grad_norm": 2.2379035445320676, "language_loss": 0.91316557, "learning_rate": 3.979043955878749e-06, "loss": 0.93897372, "num_input_tokens_seen": 13213730, "step": 622, "time_per_iteration": 2.9807822704315186 }, { "auxiliary_loss_clip": 0.0138045, "auxiliary_loss_mlp": 0.01214812, "balance_loss_clip": 1.01846528, "balance_loss_mlp": 1.0030961, "epoch": 0.07491132086815366, "flos": 23473953983520.0, "grad_norm": 2.0426761288213613, "language_loss": 0.83095413, "learning_rate": 3.978931336035959e-06, "loss": 0.85690671, "num_input_tokens_seen": 13232540, "step": 623, "time_per_iteration": 3.0018224716186523 }, { "auxiliary_loss_clip": 0.01405845, "auxiliary_loss_mlp": 0.01216087, "balance_loss_clip": 1.01956558, "balance_loss_mlp": 1.00456238, "epoch": 0.07503156375879276, "flos": 20157568241760.0, "grad_norm": 2.2506136094481937, "language_loss": 0.82007754, "learning_rate": 3.9788184159908595e-06, "loss": 0.84629685, "num_input_tokens_seen": 13249670, "step": 624, "time_per_iteration": 2.7861428260803223 }, { "auxiliary_loss_clip": 0.01393883, "auxiliary_loss_mlp": 0.01214106, "balance_loss_clip": 1.01908672, "balance_loss_mlp": 1.00296235, "epoch": 0.07515180664943186, "flos": 15115526809920.0, "grad_norm": 4.254638610716504, "language_loss": 0.83235091, "learning_rate": 3.97870519576058e-06, "loss": 0.8584308, "num_input_tokens_seen": 13266095, "step": 625, "time_per_iteration": 2.7976505756378174 }, { "auxiliary_loss_clip": 0.01365348, "auxiliary_loss_mlp": 0.00874323, "balance_loss_clip": 1.01776731, "balance_loss_mlp": 1.00016093, "epoch": 0.07527204954007094, "flos": 21287712892320.0, "grad_norm": 2.437790991263533, "language_loss": 0.81391895, "learning_rate": 3.978591675362295e-06, "loss": 0.83631563, "num_input_tokens_seen": 13284810, "step": 626, "time_per_iteration": 2.800794839859009 }, { "auxiliary_loss_clip": 0.01330793, "auxiliary_loss_mlp": 0.01214182, "balance_loss_clip": 1.01743865, "balance_loss_mlp": 1.0032295, "epoch": 0.07539229243071004, "flos": 21324198142080.0, "grad_norm": 1.8004504363564318, "language_loss": 0.87452441, "learning_rate": 3.978477854813226e-06, "loss": 0.89997423, "num_input_tokens_seen": 13304150, "step": 627, "time_per_iteration": 2.9693431854248047 }, { "auxiliary_loss_clip": 0.01417539, "auxiliary_loss_mlp": 0.01215789, "balance_loss_clip": 1.0192802, "balance_loss_mlp": 1.00426364, "epoch": 0.07551253532134912, "flos": 13042549378080.0, "grad_norm": 1.9583296898069062, "language_loss": 0.82365185, "learning_rate": 3.97836373413064e-06, "loss": 0.84998512, "num_input_tokens_seen": 13322205, "step": 628, "time_per_iteration": 2.778597831726074 }, { "auxiliary_loss_clip": 0.01430453, "auxiliary_loss_mlp": 0.01215871, "balance_loss_clip": 1.01935267, "balance_loss_mlp": 1.00434589, "epoch": 0.07563277821198822, "flos": 19208772129600.0, "grad_norm": 2.461339077827612, "language_loss": 0.74780744, "learning_rate": 3.978249313331848e-06, "loss": 0.77427071, "num_input_tokens_seen": 13340435, "step": 629, "time_per_iteration": 2.7739906311035156 }, { "auxiliary_loss_clip": 0.01418615, "auxiliary_loss_mlp": 0.00874291, "balance_loss_clip": 1.01948273, "balance_loss_mlp": 1.00010419, "epoch": 0.07575302110262731, "flos": 19537211224800.0, "grad_norm": 3.1465701368041556, "language_loss": 0.62054133, "learning_rate": 3.978134592434208e-06, "loss": 0.64347041, "num_input_tokens_seen": 13358185, "step": 630, "time_per_iteration": 2.8045215606689453 }, { "auxiliary_loss_clip": 0.01323978, "auxiliary_loss_mlp": 0.01208389, "balance_loss_clip": 1.01329219, "balance_loss_mlp": 1.00086963, "epoch": 0.0758732639932664, "flos": 67961844396000.0, "grad_norm": 0.9997594161602682, "language_loss": 0.59398687, "learning_rate": 3.978019571455123e-06, "loss": 0.6193105, "num_input_tokens_seen": 13410130, "step": 631, "time_per_iteration": 3.3913187980651855 }, { "auxiliary_loss_clip": 0.01431046, "auxiliary_loss_mlp": 0.01214064, "balance_loss_clip": 1.02039027, "balance_loss_mlp": 1.00349307, "epoch": 0.07599350688390549, "flos": 18989214004800.0, "grad_norm": 2.1611515853569405, "language_loss": 0.84417409, "learning_rate": 3.977904250412042e-06, "loss": 0.8706252, "num_input_tokens_seen": 13429085, "step": 632, "time_per_iteration": 2.800128221511841 }, { "auxiliary_loss_clip": 0.01400009, "auxiliary_loss_mlp": 0.01214157, "balance_loss_clip": 1.01874781, "balance_loss_mlp": 1.00339484, "epoch": 0.07611374977454458, "flos": 21069016935840.0, "grad_norm": 2.085046758013355, "language_loss": 0.85394311, "learning_rate": 3.97778862932246e-06, "loss": 0.88008475, "num_input_tokens_seen": 13446250, "step": 633, "time_per_iteration": 2.8741061687469482 }, { "auxiliary_loss_clip": 0.01307474, "auxiliary_loss_mlp": 0.01215577, "balance_loss_clip": 1.01532316, "balance_loss_mlp": 1.00405192, "epoch": 0.07623399266518367, "flos": 18514546521120.0, "grad_norm": 2.328237275614798, "language_loss": 0.93739378, "learning_rate": 3.9776727082039144e-06, "loss": 0.96262431, "num_input_tokens_seen": 13463220, "step": 634, "time_per_iteration": 4.271209716796875 }, { "auxiliary_loss_clip": 0.01422424, "auxiliary_loss_mlp": 0.01206643, "balance_loss_clip": 1.02248812, "balance_loss_mlp": 1.00064969, "epoch": 0.07635423555582276, "flos": 44663060077920.0, "grad_norm": 0.8125284697317212, "language_loss": 0.55529779, "learning_rate": 3.977556487073991e-06, "loss": 0.58158845, "num_input_tokens_seen": 13517775, "step": 635, "time_per_iteration": 3.8156092166900635 }, { "auxiliary_loss_clip": 0.01402412, "auxiliary_loss_mlp": 0.01213551, "balance_loss_clip": 1.01806784, "balance_loss_mlp": 1.00317049, "epoch": 0.07647447844646185, "flos": 21761158970880.0, "grad_norm": 1.763973224148524, "language_loss": 0.8177166, "learning_rate": 3.97743996595032e-06, "loss": 0.84387624, "num_input_tokens_seen": 13537815, "step": 636, "time_per_iteration": 2.8776116371154785 }, { "auxiliary_loss_clip": 0.01430006, "auxiliary_loss_mlp": 0.01214738, "balance_loss_clip": 1.0195713, "balance_loss_mlp": 1.00397646, "epoch": 0.07659472133710095, "flos": 23806811691360.0, "grad_norm": 1.6844988382772894, "language_loss": 0.81751835, "learning_rate": 3.9773231448505804e-06, "loss": 0.84396583, "num_input_tokens_seen": 13559605, "step": 637, "time_per_iteration": 2.8424179553985596 }, { "auxiliary_loss_clip": 0.01378768, "auxiliary_loss_mlp": 0.00874279, "balance_loss_clip": 1.01792049, "balance_loss_mlp": 1.00007153, "epoch": 0.07671496422774003, "flos": 21469995446400.0, "grad_norm": 2.6817214360196866, "language_loss": 0.78234142, "learning_rate": 3.977206023792491e-06, "loss": 0.80487186, "num_input_tokens_seen": 13579495, "step": 638, "time_per_iteration": 2.8788435459136963 }, { "auxiliary_loss_clip": 0.01404756, "auxiliary_loss_mlp": 0.01214277, "balance_loss_clip": 1.01962268, "balance_loss_mlp": 1.00351477, "epoch": 0.07683520711837913, "flos": 16980980549760.0, "grad_norm": 2.2007375274409475, "language_loss": 0.80946714, "learning_rate": 3.97708860279382e-06, "loss": 0.83565748, "num_input_tokens_seen": 13597605, "step": 639, "time_per_iteration": 2.7606818675994873 }, { "auxiliary_loss_clip": 0.01377488, "auxiliary_loss_mlp": 0.01214789, "balance_loss_clip": 1.01747811, "balance_loss_mlp": 1.00440872, "epoch": 0.07695545000901821, "flos": 23476756030560.0, "grad_norm": 2.722473260728529, "language_loss": 0.78055525, "learning_rate": 3.97697088187238e-06, "loss": 0.80647802, "num_input_tokens_seen": 13618120, "step": 640, "time_per_iteration": 2.8909926414489746 }, { "auxiliary_loss_clip": 0.01377808, "auxiliary_loss_mlp": 0.01213275, "balance_loss_clip": 1.01827478, "balance_loss_mlp": 1.00270367, "epoch": 0.07707569289965731, "flos": 17634266372160.0, "grad_norm": 2.1324033984862973, "language_loss": 0.91856724, "learning_rate": 3.976852861046029e-06, "loss": 0.94447803, "num_input_tokens_seen": 13634735, "step": 641, "time_per_iteration": 2.790947914123535 }, { "auxiliary_loss_clip": 0.01339619, "auxiliary_loss_mlp": 0.01213375, "balance_loss_clip": 1.01642966, "balance_loss_mlp": 1.00356698, "epoch": 0.0771959357902964, "flos": 25775686002240.0, "grad_norm": 1.890492109759795, "language_loss": 0.80293667, "learning_rate": 3.97673454033267e-06, "loss": 0.82846665, "num_input_tokens_seen": 13656835, "step": 642, "time_per_iteration": 2.989783763885498 }, { "auxiliary_loss_clip": 0.01389703, "auxiliary_loss_mlp": 0.01215021, "balance_loss_clip": 1.01794624, "balance_loss_mlp": 1.00406802, "epoch": 0.07731617868093549, "flos": 19828662138720.0, "grad_norm": 2.1131973460344797, "language_loss": 0.82661021, "learning_rate": 3.976615919750254e-06, "loss": 0.85265744, "num_input_tokens_seen": 13674535, "step": 643, "time_per_iteration": 2.762906551361084 }, { "auxiliary_loss_clip": 0.01403342, "auxiliary_loss_mlp": 0.01213936, "balance_loss_clip": 1.01899672, "balance_loss_mlp": 1.00336456, "epoch": 0.07743642157157458, "flos": 21324665149920.0, "grad_norm": 2.082987597362828, "language_loss": 0.86763632, "learning_rate": 3.976496999316775e-06, "loss": 0.89380908, "num_input_tokens_seen": 13693290, "step": 644, "time_per_iteration": 2.7772414684295654 }, { "auxiliary_loss_clip": 0.01379361, "auxiliary_loss_mlp": 0.01213857, "balance_loss_clip": 1.01902556, "balance_loss_mlp": 1.00347602, "epoch": 0.07755666446221367, "flos": 19969142738400.0, "grad_norm": 2.239120733319114, "language_loss": 0.8403275, "learning_rate": 3.976377779050271e-06, "loss": 0.86625963, "num_input_tokens_seen": 13711420, "step": 645, "time_per_iteration": 4.727953672409058 }, { "auxiliary_loss_clip": 0.01417144, "auxiliary_loss_mlp": 0.01214348, "balance_loss_clip": 1.0190357, "balance_loss_mlp": 1.00244153, "epoch": 0.07767690735285276, "flos": 23623235884800.0, "grad_norm": 2.7709689111857085, "language_loss": 0.8419444, "learning_rate": 3.976258258968831e-06, "loss": 0.86825931, "num_input_tokens_seen": 13729965, "step": 646, "time_per_iteration": 3.7328078746795654 }, { "auxiliary_loss_clip": 0.01358033, "auxiliary_loss_mlp": 0.01214086, "balance_loss_clip": 1.01786387, "balance_loss_mlp": 1.00389671, "epoch": 0.07779715024349185, "flos": 22236257538720.0, "grad_norm": 2.387855946183521, "language_loss": 0.74511361, "learning_rate": 3.976138439090583e-06, "loss": 0.77083486, "num_input_tokens_seen": 13748045, "step": 647, "time_per_iteration": 3.755221128463745 }, { "auxiliary_loss_clip": 0.01355202, "auxiliary_loss_mlp": 0.01215073, "balance_loss_clip": 1.0170486, "balance_loss_mlp": 1.00412071, "epoch": 0.07791739313413094, "flos": 20955106650240.0, "grad_norm": 2.57494241994628, "language_loss": 0.85098892, "learning_rate": 3.976018319433706e-06, "loss": 0.8766917, "num_input_tokens_seen": 13765590, "step": 648, "time_per_iteration": 2.805098533630371 }, { "auxiliary_loss_clip": 0.01402914, "auxiliary_loss_mlp": 0.01215507, "balance_loss_clip": 1.01825225, "balance_loss_mlp": 1.00455475, "epoch": 0.07803763602477004, "flos": 19312336395360.0, "grad_norm": 2.4013675207435727, "language_loss": 0.91260737, "learning_rate": 3.9758979000164205e-06, "loss": 0.93879163, "num_input_tokens_seen": 13782410, "step": 649, "time_per_iteration": 2.72432279586792 }, { "auxiliary_loss_clip": 0.01362147, "auxiliary_loss_mlp": 0.01216326, "balance_loss_clip": 1.01677358, "balance_loss_mlp": 1.00499153, "epoch": 0.07815787891540912, "flos": 22710817251360.0, "grad_norm": 1.921837309274087, "language_loss": 0.71952599, "learning_rate": 3.975777180856995e-06, "loss": 0.74531072, "num_input_tokens_seen": 13801530, "step": 650, "time_per_iteration": 2.7830991744995117 }, { "auxiliary_loss_clip": 0.01428654, "auxiliary_loss_mlp": 0.0121546, "balance_loss_clip": 1.01911092, "balance_loss_mlp": 1.00374472, "epoch": 0.07827812180604822, "flos": 22711140564480.0, "grad_norm": 1.7735266933538358, "language_loss": 0.85923481, "learning_rate": 3.975656161973742e-06, "loss": 0.88567591, "num_input_tokens_seen": 13820615, "step": 651, "time_per_iteration": 2.745069742202759 }, { "auxiliary_loss_clip": 0.01428296, "auxiliary_loss_mlp": 0.01214849, "balance_loss_clip": 1.0188942, "balance_loss_mlp": 1.00408721, "epoch": 0.0783983646966873, "flos": 21725607736800.0, "grad_norm": 2.8462767581369515, "language_loss": 0.88834095, "learning_rate": 3.9755348433850194e-06, "loss": 0.91477239, "num_input_tokens_seen": 13835955, "step": 652, "time_per_iteration": 2.7740042209625244 }, { "auxiliary_loss_clip": 0.0138062, "auxiliary_loss_mlp": 0.01206626, "balance_loss_clip": 1.01865053, "balance_loss_mlp": 1.00063252, "epoch": 0.0785186075873264, "flos": 60640905196800.0, "grad_norm": 0.9713340681857007, "language_loss": 0.63637745, "learning_rate": 3.975413225109232e-06, "loss": 0.66224992, "num_input_tokens_seen": 13896505, "step": 653, "time_per_iteration": 3.3673276901245117 }, { "auxiliary_loss_clip": 0.01401275, "auxiliary_loss_mlp": 0.01216069, "balance_loss_clip": 1.01735616, "balance_loss_mlp": 1.00435305, "epoch": 0.0786388504779655, "flos": 23877914159520.0, "grad_norm": 3.6684258259063895, "language_loss": 0.93249208, "learning_rate": 3.975291307164829e-06, "loss": 0.95866549, "num_input_tokens_seen": 13915150, "step": 654, "time_per_iteration": 2.7737135887145996 }, { "auxiliary_loss_clip": 0.01363888, "auxiliary_loss_mlp": 0.01212563, "balance_loss_clip": 1.01659548, "balance_loss_mlp": 1.00275493, "epoch": 0.07875909336860458, "flos": 15158693864160.0, "grad_norm": 1.935019249658165, "language_loss": 0.8509689, "learning_rate": 3.975169089570306e-06, "loss": 0.87673342, "num_input_tokens_seen": 13933525, "step": 655, "time_per_iteration": 2.7978408336639404 }, { "auxiliary_loss_clip": 0.01415564, "auxiliary_loss_mlp": 0.01213808, "balance_loss_clip": 1.01946187, "balance_loss_mlp": 1.00342703, "epoch": 0.07887933625924368, "flos": 22236868241280.0, "grad_norm": 1.8867705318630508, "language_loss": 0.91453779, "learning_rate": 3.975046572344202e-06, "loss": 0.94083154, "num_input_tokens_seen": 13949985, "step": 656, "time_per_iteration": 2.751882314682007 }, { "auxiliary_loss_clip": 0.01380218, "auxiliary_loss_mlp": 0.01213556, "balance_loss_clip": 1.01848376, "balance_loss_mlp": 1.00336647, "epoch": 0.07899957914988276, "flos": 20777745640320.0, "grad_norm": 1.8974692780094415, "language_loss": 0.71181905, "learning_rate": 3.974923755505103e-06, "loss": 0.73775679, "num_input_tokens_seen": 13969215, "step": 657, "time_per_iteration": 2.8048036098480225 }, { "auxiliary_loss_clip": 0.01378414, "auxiliary_loss_mlp": 0.01214224, "balance_loss_clip": 1.01881862, "balance_loss_mlp": 1.00403428, "epoch": 0.07911982204052186, "flos": 23003058486240.0, "grad_norm": 1.6326363902771102, "language_loss": 0.91145217, "learning_rate": 3.974800639071641e-06, "loss": 0.93737853, "num_input_tokens_seen": 13989935, "step": 658, "time_per_iteration": 2.8589606285095215 }, { "auxiliary_loss_clip": 0.0132604, "auxiliary_loss_mlp": 0.00874356, "balance_loss_clip": 1.01630771, "balance_loss_mlp": 1.00022817, "epoch": 0.07924006493116094, "flos": 23111400601440.0, "grad_norm": 2.1521853142963607, "language_loss": 1.00675523, "learning_rate": 3.974677223062492e-06, "loss": 1.02875924, "num_input_tokens_seen": 14007150, "step": 659, "time_per_iteration": 2.92033052444458 }, { "auxiliary_loss_clip": 0.01375363, "auxiliary_loss_mlp": 0.0121377, "balance_loss_clip": 1.01765132, "balance_loss_mlp": 1.00319862, "epoch": 0.07936030782180004, "flos": 16472162855520.0, "grad_norm": 2.363581045636267, "language_loss": 0.74744785, "learning_rate": 3.974553507496378e-06, "loss": 0.77333915, "num_input_tokens_seen": 14025725, "step": 660, "time_per_iteration": 3.1989011764526367 }, { "auxiliary_loss_clip": 0.01389516, "auxiliary_loss_mlp": 0.012148, "balance_loss_clip": 1.01822209, "balance_loss_mlp": 1.00327563, "epoch": 0.07948055071243913, "flos": 23733302336640.0, "grad_norm": 2.2288445074921763, "language_loss": 0.89189804, "learning_rate": 3.974429492392068e-06, "loss": 0.91794115, "num_input_tokens_seen": 14045750, "step": 661, "time_per_iteration": 2.8374521732330322 }, { "auxiliary_loss_clip": 0.01428193, "auxiliary_loss_mlp": 0.00874337, "balance_loss_clip": 1.01941872, "balance_loss_mlp": 1.00026917, "epoch": 0.07960079360307822, "flos": 19573337237760.0, "grad_norm": 2.096019009903549, "language_loss": 0.91170502, "learning_rate": 3.974305177768373e-06, "loss": 0.93473029, "num_input_tokens_seen": 14063960, "step": 662, "time_per_iteration": 2.682705879211426 }, { "auxiliary_loss_clip": 0.01364131, "auxiliary_loss_mlp": 0.01215161, "balance_loss_clip": 1.01759815, "balance_loss_mlp": 1.00478005, "epoch": 0.07972103649371731, "flos": 23513420898720.0, "grad_norm": 3.246738779593591, "language_loss": 0.86501378, "learning_rate": 3.974180563644152e-06, "loss": 0.89080667, "num_input_tokens_seen": 14082525, "step": 663, "time_per_iteration": 2.9900429248809814 }, { "auxiliary_loss_clip": 0.01389202, "auxiliary_loss_mlp": 0.01214423, "balance_loss_clip": 1.01816857, "balance_loss_mlp": 1.00366151, "epoch": 0.0798412793843564, "flos": 16726877053920.0, "grad_norm": 2.072128088202716, "language_loss": 0.89000446, "learning_rate": 3.97405565003831e-06, "loss": 0.91604066, "num_input_tokens_seen": 14098610, "step": 664, "time_per_iteration": 2.8302087783813477 }, { "auxiliary_loss_clip": 0.01367245, "auxiliary_loss_mlp": 0.01213642, "balance_loss_clip": 1.016922, "balance_loss_mlp": 1.00326157, "epoch": 0.07996152227499549, "flos": 18223347072960.0, "grad_norm": 1.9910956013811778, "language_loss": 0.78502381, "learning_rate": 3.973930436969794e-06, "loss": 0.81083274, "num_input_tokens_seen": 14117065, "step": 665, "time_per_iteration": 2.896267890930176 }, { "auxiliary_loss_clip": 0.01402055, "auxiliary_loss_mlp": 0.01215424, "balance_loss_clip": 1.01869905, "balance_loss_mlp": 1.00485277, "epoch": 0.08008176516563459, "flos": 20594888307360.0, "grad_norm": 1.8980048515632189, "language_loss": 0.85769963, "learning_rate": 3.973804924457602e-06, "loss": 0.88387442, "num_input_tokens_seen": 14135145, "step": 666, "time_per_iteration": 2.853886842727661 }, { "auxiliary_loss_clip": 0.01399185, "auxiliary_loss_mlp": 0.0121332, "balance_loss_clip": 1.01753092, "balance_loss_mlp": 1.00293946, "epoch": 0.08020200805627367, "flos": 31834321035840.0, "grad_norm": 1.663529849434744, "language_loss": 0.85468638, "learning_rate": 3.973679112520771e-06, "loss": 0.88081139, "num_input_tokens_seen": 14156860, "step": 667, "time_per_iteration": 2.879521608352661 }, { "auxiliary_loss_clip": 0.01373358, "auxiliary_loss_mlp": 0.01213293, "balance_loss_clip": 1.01699901, "balance_loss_mlp": 1.00329351, "epoch": 0.08032225094691277, "flos": 17783512349760.0, "grad_norm": 1.8400541560184511, "language_loss": 0.98692536, "learning_rate": 3.973553001178389e-06, "loss": 1.01279187, "num_input_tokens_seen": 14174365, "step": 668, "time_per_iteration": 3.0198676586151123 }, { "auxiliary_loss_clip": 0.01350768, "auxiliary_loss_mlp": 0.01212993, "balance_loss_clip": 1.01603818, "balance_loss_mlp": 1.00337493, "epoch": 0.08044249383755185, "flos": 24061705508160.0, "grad_norm": 2.2770377996940745, "language_loss": 0.75494206, "learning_rate": 3.973426590449585e-06, "loss": 0.78057969, "num_input_tokens_seen": 14192320, "step": 669, "time_per_iteration": 2.8606784343719482 }, { "auxiliary_loss_clip": 0.01336178, "auxiliary_loss_mlp": 0.01214148, "balance_loss_clip": 1.01642442, "balance_loss_mlp": 1.00376797, "epoch": 0.08056273672819095, "flos": 18223634462400.0, "grad_norm": 2.02344840640808, "language_loss": 0.75164723, "learning_rate": 3.9732998803535364e-06, "loss": 0.77715051, "num_input_tokens_seen": 14210380, "step": 670, "time_per_iteration": 2.8982479572296143 }, { "auxiliary_loss_clip": 0.01427029, "auxiliary_loss_mlp": 0.01214369, "balance_loss_clip": 1.01897335, "balance_loss_mlp": 1.00437021, "epoch": 0.08068297961883003, "flos": 19676865579840.0, "grad_norm": 5.78178696492537, "language_loss": 0.8515954, "learning_rate": 3.973172870909465e-06, "loss": 0.87800938, "num_input_tokens_seen": 14225145, "step": 671, "time_per_iteration": 4.56633186340332 }, { "auxiliary_loss_clip": 0.01386387, "auxiliary_loss_mlp": 0.01215061, "balance_loss_clip": 1.01711583, "balance_loss_mlp": 1.00429916, "epoch": 0.08080322250946913, "flos": 23148748019520.0, "grad_norm": 2.70343300774048, "language_loss": 0.80550456, "learning_rate": 3.973045562136638e-06, "loss": 0.83151901, "num_input_tokens_seen": 14241960, "step": 672, "time_per_iteration": 3.734288215637207 }, { "auxiliary_loss_clip": 0.01414485, "auxiliary_loss_mlp": 0.01213698, "balance_loss_clip": 1.01887417, "balance_loss_mlp": 1.0038898, "epoch": 0.08092346540010822, "flos": 21763637704800.0, "grad_norm": 2.082141856214767, "language_loss": 0.9149797, "learning_rate": 3.972917954054368e-06, "loss": 0.94126153, "num_input_tokens_seen": 14260515, "step": 673, "time_per_iteration": 3.7265639305114746 }, { "auxiliary_loss_clip": 0.01389021, "auxiliary_loss_mlp": 0.01215366, "balance_loss_clip": 1.01844573, "balance_loss_mlp": 1.00441313, "epoch": 0.08104370829074731, "flos": 21032495762400.0, "grad_norm": 2.286287218599469, "language_loss": 0.819713, "learning_rate": 3.972790046682013e-06, "loss": 0.84575689, "num_input_tokens_seen": 14279190, "step": 674, "time_per_iteration": 2.8059256076812744 }, { "auxiliary_loss_clip": 0.01367555, "auxiliary_loss_mlp": 0.01213773, "balance_loss_clip": 1.01593161, "balance_loss_mlp": 1.00339246, "epoch": 0.0811639511813864, "flos": 20083196718720.0, "grad_norm": 6.138737290032202, "language_loss": 0.78805292, "learning_rate": 3.972661840038977e-06, "loss": 0.8138662, "num_input_tokens_seen": 14299480, "step": 675, "time_per_iteration": 2.8495492935180664 }, { "auxiliary_loss_clip": 0.01401406, "auxiliary_loss_mlp": 0.01213546, "balance_loss_clip": 1.01879978, "balance_loss_mlp": 1.00354707, "epoch": 0.08128419407202549, "flos": 16836727963680.0, "grad_norm": 2.900590264410846, "language_loss": 0.83582354, "learning_rate": 3.972533334144707e-06, "loss": 0.86197311, "num_input_tokens_seen": 14316405, "step": 676, "time_per_iteration": 2.7309083938598633 }, { "auxiliary_loss_clip": 0.01412199, "auxiliary_loss_mlp": 0.01215894, "balance_loss_clip": 1.01770306, "balance_loss_mlp": 1.0049417, "epoch": 0.08140443696266458, "flos": 23769284654880.0, "grad_norm": 1.9306180266814348, "language_loss": 0.7863459, "learning_rate": 3.972404529018699e-06, "loss": 0.81262684, "num_input_tokens_seen": 14336265, "step": 677, "time_per_iteration": 2.7795305252075195 }, { "auxiliary_loss_clip": 0.014014, "auxiliary_loss_mlp": 0.01212538, "balance_loss_clip": 1.01817429, "balance_loss_mlp": 1.00272965, "epoch": 0.08152467985330367, "flos": 24390144603360.0, "grad_norm": 1.9191280613726114, "language_loss": 0.85681367, "learning_rate": 3.972275424680493e-06, "loss": 0.88295305, "num_input_tokens_seen": 14356375, "step": 678, "time_per_iteration": 2.9369542598724365 }, { "auxiliary_loss_clip": 0.01425914, "auxiliary_loss_mlp": 0.01213269, "balance_loss_clip": 1.01865983, "balance_loss_mlp": 1.00346065, "epoch": 0.08164492274394276, "flos": 19317760871040.0, "grad_norm": 2.156428972406956, "language_loss": 0.91711932, "learning_rate": 3.972146021149673e-06, "loss": 0.94351119, "num_input_tokens_seen": 14374650, "step": 679, "time_per_iteration": 3.1025452613830566 }, { "auxiliary_loss_clip": 0.01367293, "auxiliary_loss_mlp": 0.01211392, "balance_loss_clip": 1.01710343, "balance_loss_mlp": 1.00310969, "epoch": 0.08176516563458186, "flos": 14830470311040.0, "grad_norm": 2.1483429781518883, "language_loss": 0.78368235, "learning_rate": 3.972016318445868e-06, "loss": 0.80946916, "num_input_tokens_seen": 14392650, "step": 680, "time_per_iteration": 2.844996690750122 }, { "auxiliary_loss_clip": 0.01399973, "auxiliary_loss_mlp": 0.01213968, "balance_loss_clip": 1.01755321, "balance_loss_mlp": 1.00396943, "epoch": 0.08188540852522094, "flos": 22602331441440.0, "grad_norm": 2.1780658512465427, "language_loss": 0.92323077, "learning_rate": 3.971886316588757e-06, "loss": 0.94937015, "num_input_tokens_seen": 14413155, "step": 681, "time_per_iteration": 2.8454458713531494 }, { "auxiliary_loss_clip": 0.01362696, "auxiliary_loss_mlp": 0.01213521, "balance_loss_clip": 1.0167563, "balance_loss_mlp": 1.00371289, "epoch": 0.08200565141586004, "flos": 19463737793760.0, "grad_norm": 4.558659586463943, "language_loss": 0.73442674, "learning_rate": 3.9717560155980595e-06, "loss": 0.76018894, "num_input_tokens_seen": 14428805, "step": 682, "time_per_iteration": 2.8426289558410645 }, { "auxiliary_loss_clip": 0.01399573, "auxiliary_loss_mlp": 0.01212342, "balance_loss_clip": 1.01751804, "balance_loss_mlp": 1.00272393, "epoch": 0.08212589430649912, "flos": 20594672765280.0, "grad_norm": 1.9408138336527372, "language_loss": 0.91870308, "learning_rate": 3.971625415493542e-06, "loss": 0.94482225, "num_input_tokens_seen": 14447125, "step": 683, "time_per_iteration": 2.787945032119751 }, { "auxiliary_loss_clip": 0.01362127, "auxiliary_loss_mlp": 0.01212482, "balance_loss_clip": 1.01695693, "balance_loss_mlp": 1.00324631, "epoch": 0.08224613719713822, "flos": 25953621791040.0, "grad_norm": 1.9041194327779878, "language_loss": 0.874354, "learning_rate": 3.971494516295017e-06, "loss": 0.90010011, "num_input_tokens_seen": 14466575, "step": 684, "time_per_iteration": 2.863784074783325 }, { "auxiliary_loss_clip": 0.01371632, "auxiliary_loss_mlp": 0.01211509, "balance_loss_clip": 1.01680756, "balance_loss_mlp": 1.00284553, "epoch": 0.08236638008777732, "flos": 23768745799680.0, "grad_norm": 2.1527574364211257, "language_loss": 0.8501097, "learning_rate": 3.971363318022341e-06, "loss": 0.87594104, "num_input_tokens_seen": 14487915, "step": 685, "time_per_iteration": 2.8572006225585938 }, { "auxiliary_loss_clip": 0.01400216, "auxiliary_loss_mlp": 0.01212728, "balance_loss_clip": 1.01787555, "balance_loss_mlp": 1.00330114, "epoch": 0.0824866229784164, "flos": 38799159906240.0, "grad_norm": 2.0709018082722515, "language_loss": 0.68477678, "learning_rate": 3.971231820695417e-06, "loss": 0.71090621, "num_input_tokens_seen": 14511530, "step": 686, "time_per_iteration": 3.0444447994232178 }, { "auxiliary_loss_clip": 0.01387283, "auxiliary_loss_mlp": 0.01212712, "balance_loss_clip": 1.01732337, "balance_loss_mlp": 1.0034759, "epoch": 0.0826068658690555, "flos": 23107772309760.0, "grad_norm": 2.172923204738928, "language_loss": 0.81315261, "learning_rate": 3.971100024334193e-06, "loss": 0.83915257, "num_input_tokens_seen": 14529050, "step": 687, "time_per_iteration": 2.8141086101531982 }, { "auxiliary_loss_clip": 0.01374444, "auxiliary_loss_mlp": 0.01211433, "balance_loss_clip": 1.01707184, "balance_loss_mlp": 1.00257826, "epoch": 0.08272710875969458, "flos": 21136383341280.0, "grad_norm": 2.7046593561553887, "language_loss": 0.86385447, "learning_rate": 3.970967928958663e-06, "loss": 0.88971323, "num_input_tokens_seen": 14546165, "step": 688, "time_per_iteration": 2.900355100631714 }, { "auxiliary_loss_clip": 0.01346013, "auxiliary_loss_mlp": 0.01213366, "balance_loss_clip": 1.01580477, "balance_loss_mlp": 1.00393903, "epoch": 0.08284735165033368, "flos": 19063010748960.0, "grad_norm": 1.8482379096368067, "language_loss": 0.83423656, "learning_rate": 3.970835534588865e-06, "loss": 0.85983038, "num_input_tokens_seen": 14563660, "step": 689, "time_per_iteration": 2.8248188495635986 }, { "auxiliary_loss_clip": 0.01384375, "auxiliary_loss_mlp": 0.01213439, "balance_loss_clip": 1.01772547, "balance_loss_mlp": 1.0038209, "epoch": 0.08296759454097276, "flos": 16727451832800.0, "grad_norm": 1.8471357363379635, "language_loss": 0.8570863, "learning_rate": 3.970702841244883e-06, "loss": 0.88306439, "num_input_tokens_seen": 14581980, "step": 690, "time_per_iteration": 2.8717234134674072 }, { "auxiliary_loss_clip": 0.01401452, "auxiliary_loss_mlp": 0.01213291, "balance_loss_clip": 1.01771748, "balance_loss_mlp": 1.00348234, "epoch": 0.08308783743161186, "flos": 18004938505920.0, "grad_norm": 1.7770027566585331, "language_loss": 0.82206947, "learning_rate": 3.970569848946847e-06, "loss": 0.84821689, "num_input_tokens_seen": 14601795, "step": 691, "time_per_iteration": 2.6962673664093018 }, { "auxiliary_loss_clip": 0.01411771, "auxiliary_loss_mlp": 0.01211583, "balance_loss_clip": 1.01785147, "balance_loss_mlp": 1.00311029, "epoch": 0.08320808032225095, "flos": 15079795957440.0, "grad_norm": 2.2671318240320426, "language_loss": 0.83118433, "learning_rate": 3.970436557714932e-06, "loss": 0.85741794, "num_input_tokens_seen": 14618315, "step": 692, "time_per_iteration": 2.713465690612793 }, { "auxiliary_loss_clip": 0.01373021, "auxiliary_loss_mlp": 0.01212624, "balance_loss_clip": 1.01611996, "balance_loss_mlp": 1.00281608, "epoch": 0.08332832321289003, "flos": 22383096629760.0, "grad_norm": 2.329649904576744, "language_loss": 0.86524808, "learning_rate": 3.970302967569358e-06, "loss": 0.89110458, "num_input_tokens_seen": 14636905, "step": 693, "time_per_iteration": 2.7741658687591553 }, { "auxiliary_loss_clip": 0.01399025, "auxiliary_loss_mlp": 0.01212618, "balance_loss_clip": 1.0184325, "balance_loss_mlp": 1.00319099, "epoch": 0.08344856610352913, "flos": 24717398217120.0, "grad_norm": 2.2267264068476953, "language_loss": 0.68441778, "learning_rate": 3.9701690785303896e-06, "loss": 0.71053421, "num_input_tokens_seen": 14656100, "step": 694, "time_per_iteration": 2.758695602416992 }, { "auxiliary_loss_clip": 0.01411089, "auxiliary_loss_mlp": 0.01212791, "balance_loss_clip": 1.01756155, "balance_loss_mlp": 1.00374556, "epoch": 0.08356880899416821, "flos": 25370217031680.0, "grad_norm": 2.065793190343104, "language_loss": 0.88357645, "learning_rate": 3.970034890618339e-06, "loss": 0.90981531, "num_input_tokens_seen": 14675790, "step": 695, "time_per_iteration": 2.790623664855957 }, { "auxiliary_loss_clip": 0.01411281, "auxiliary_loss_mlp": 0.01212114, "balance_loss_clip": 1.01746798, "balance_loss_mlp": 1.00345051, "epoch": 0.08368905188480731, "flos": 24353084574720.0, "grad_norm": 2.177751380041594, "language_loss": 0.88024563, "learning_rate": 3.969900403853562e-06, "loss": 0.9064796, "num_input_tokens_seen": 14694830, "step": 696, "time_per_iteration": 2.7742204666137695 }, { "auxiliary_loss_clip": 0.01424653, "auxiliary_loss_mlp": 0.01212768, "balance_loss_clip": 1.01834714, "balance_loss_mlp": 1.00334084, "epoch": 0.08380929477544641, "flos": 18037328456160.0, "grad_norm": 1.7196492189951462, "language_loss": 0.77837169, "learning_rate": 3.96976561825646e-06, "loss": 0.80474591, "num_input_tokens_seen": 14711920, "step": 697, "time_per_iteration": 3.5780298709869385 }, { "auxiliary_loss_clip": 0.01341328, "auxiliary_loss_mlp": 0.01213771, "balance_loss_clip": 1.01553428, "balance_loss_mlp": 1.00415361, "epoch": 0.08392953766608549, "flos": 26286299880480.0, "grad_norm": 1.924766778869223, "language_loss": 0.86866498, "learning_rate": 3.969630533847479e-06, "loss": 0.894216, "num_input_tokens_seen": 14730880, "step": 698, "time_per_iteration": 3.8885436058044434 }, { "auxiliary_loss_clip": 0.01408477, "auxiliary_loss_mlp": 0.01212152, "balance_loss_clip": 1.01685905, "balance_loss_mlp": 1.00310636, "epoch": 0.08404978055672459, "flos": 22492157218560.0, "grad_norm": 1.858401065752002, "language_loss": 0.83763826, "learning_rate": 3.969495150647113e-06, "loss": 0.86384451, "num_input_tokens_seen": 14749050, "step": 699, "time_per_iteration": 3.6447958946228027 }, { "auxiliary_loss_clip": 0.01358754, "auxiliary_loss_mlp": 0.01211596, "balance_loss_clip": 1.01685524, "balance_loss_mlp": 1.00293207, "epoch": 0.08417002344736367, "flos": 24826889890080.0, "grad_norm": 2.119049886560861, "language_loss": 0.76305622, "learning_rate": 3.969359468675899e-06, "loss": 0.78875971, "num_input_tokens_seen": 14769180, "step": 700, "time_per_iteration": 2.847179412841797 }, { "auxiliary_loss_clip": 0.01398892, "auxiliary_loss_mlp": 0.01211761, "balance_loss_clip": 1.01734948, "balance_loss_mlp": 1.00309706, "epoch": 0.08429026633800277, "flos": 16945932247200.0, "grad_norm": 2.0321422155940643, "language_loss": 0.89149284, "learning_rate": 3.969223487954418e-06, "loss": 0.91759932, "num_input_tokens_seen": 14786640, "step": 701, "time_per_iteration": 2.6961922645568848 }, { "auxiliary_loss_clip": 0.01343416, "auxiliary_loss_mlp": 0.01212524, "balance_loss_clip": 1.01688886, "balance_loss_mlp": 1.00328827, "epoch": 0.08441050922864185, "flos": 23841931841280.0, "grad_norm": 2.0770196294251413, "language_loss": 0.82772481, "learning_rate": 3.969087208503301e-06, "loss": 0.8532843, "num_input_tokens_seen": 14806720, "step": 702, "time_per_iteration": 2.8215346336364746 }, { "auxiliary_loss_clip": 0.01360245, "auxiliary_loss_mlp": 0.01212083, "balance_loss_clip": 1.01636648, "balance_loss_mlp": 1.00284719, "epoch": 0.08453075211928095, "flos": 25520217406560.0, "grad_norm": 2.5934089080135614, "language_loss": 0.84557211, "learning_rate": 3.968950630343219e-06, "loss": 0.87129533, "num_input_tokens_seen": 14823705, "step": 703, "time_per_iteration": 2.903006076812744 }, { "auxiliary_loss_clip": 0.01386855, "auxiliary_loss_mlp": 0.01212486, "balance_loss_clip": 1.01643384, "balance_loss_mlp": 1.00344014, "epoch": 0.08465099500992004, "flos": 19532505222720.0, "grad_norm": 2.987612638093932, "language_loss": 0.93471313, "learning_rate": 3.968813753494892e-06, "loss": 0.96070653, "num_input_tokens_seen": 14841865, "step": 704, "time_per_iteration": 2.80679988861084 }, { "auxiliary_loss_clip": 0.01374857, "auxiliary_loss_mlp": 0.00874214, "balance_loss_clip": 1.01709855, "balance_loss_mlp": 1.0000118, "epoch": 0.08477123790055913, "flos": 29351312326080.0, "grad_norm": 2.26047499415282, "language_loss": 0.75534338, "learning_rate": 3.968676577979084e-06, "loss": 0.77783406, "num_input_tokens_seen": 14861415, "step": 705, "time_per_iteration": 3.013421058654785 }, { "auxiliary_loss_clip": 0.01341665, "auxiliary_loss_mlp": 0.01213291, "balance_loss_clip": 1.01513028, "balance_loss_mlp": 1.00424552, "epoch": 0.08489148079119822, "flos": 18624505201920.0, "grad_norm": 2.0621650021909517, "language_loss": 0.77956271, "learning_rate": 3.968539103816605e-06, "loss": 0.80511224, "num_input_tokens_seen": 14879215, "step": 706, "time_per_iteration": 2.805828332901001 }, { "auxiliary_loss_clip": 0.01386014, "auxiliary_loss_mlp": 0.00874209, "balance_loss_clip": 1.01823473, "balance_loss_mlp": 1.00005388, "epoch": 0.0850117236818373, "flos": 23471403402240.0, "grad_norm": 1.8171414640588097, "language_loss": 0.89300764, "learning_rate": 3.9684013310283085e-06, "loss": 0.91560996, "num_input_tokens_seen": 14897900, "step": 707, "time_per_iteration": 2.899636745452881 }, { "auxiliary_loss_clip": 0.0137401, "auxiliary_loss_mlp": 0.01211605, "balance_loss_clip": 1.01707029, "balance_loss_mlp": 1.00313199, "epoch": 0.0851319665724764, "flos": 40625074883520.0, "grad_norm": 2.0291053349710353, "language_loss": 0.64112467, "learning_rate": 3.9682632596350956e-06, "loss": 0.66698086, "num_input_tokens_seen": 14919065, "step": 708, "time_per_iteration": 2.977829694747925 }, { "auxiliary_loss_clip": 0.01397246, "auxiliary_loss_mlp": 0.01211661, "balance_loss_clip": 1.01733088, "balance_loss_mlp": 1.00261593, "epoch": 0.0852522094631155, "flos": 15879561634080.0, "grad_norm": 2.0109273646320838, "language_loss": 0.78400612, "learning_rate": 3.968124889657911e-06, "loss": 0.81009519, "num_input_tokens_seen": 14934165, "step": 709, "time_per_iteration": 2.71402645111084 }, { "auxiliary_loss_clip": 0.01344907, "auxiliary_loss_mlp": 0.0121338, "balance_loss_clip": 1.0159421, "balance_loss_mlp": 1.00376201, "epoch": 0.08537245235375458, "flos": 14567098505760.0, "grad_norm": 2.304818739473457, "language_loss": 0.9061805, "learning_rate": 3.967986221117746e-06, "loss": 0.93176341, "num_input_tokens_seen": 14950105, "step": 710, "time_per_iteration": 2.8828675746917725 }, { "auxiliary_loss_clip": 0.01299933, "auxiliary_loss_mlp": 0.01212512, "balance_loss_clip": 1.01375365, "balance_loss_mlp": 1.00308537, "epoch": 0.08549269524439368, "flos": 26468941671360.0, "grad_norm": 2.4069179394697344, "language_loss": 0.86841381, "learning_rate": 3.967847254035635e-06, "loss": 0.89353836, "num_input_tokens_seen": 14969490, "step": 711, "time_per_iteration": 3.330493688583374 }, { "auxiliary_loss_clip": 0.01354587, "auxiliary_loss_mlp": 0.01211213, "balance_loss_clip": 1.01558757, "balance_loss_mlp": 1.00197721, "epoch": 0.08561293813503276, "flos": 13590223284960.0, "grad_norm": 2.2265402232958498, "language_loss": 0.86543041, "learning_rate": 3.967707988432661e-06, "loss": 0.89108843, "num_input_tokens_seen": 14987195, "step": 712, "time_per_iteration": 3.4338529109954834 }, { "auxiliary_loss_clip": 0.01423009, "auxiliary_loss_mlp": 0.01213038, "balance_loss_clip": 1.0174042, "balance_loss_mlp": 1.00380206, "epoch": 0.08573318102567186, "flos": 26943537307680.0, "grad_norm": 2.4071550197926994, "language_loss": 0.87842023, "learning_rate": 3.967568424329949e-06, "loss": 0.90478069, "num_input_tokens_seen": 15007620, "step": 713, "time_per_iteration": 2.91747784614563 }, { "auxiliary_loss_clip": 0.01390911, "auxiliary_loss_mlp": 0.0120429, "balance_loss_clip": 1.02087128, "balance_loss_mlp": 0.99982184, "epoch": 0.08585342391631094, "flos": 67302774861120.0, "grad_norm": 0.8218997284238535, "language_loss": 0.55530322, "learning_rate": 3.967428561748671e-06, "loss": 0.5812552, "num_input_tokens_seen": 15075590, "step": 714, "time_per_iteration": 3.971116065979004 }, { "auxiliary_loss_clip": 0.01347425, "auxiliary_loss_mlp": 0.01214495, "balance_loss_clip": 1.01568222, "balance_loss_mlp": 1.0043056, "epoch": 0.08597366680695004, "flos": 22456605984480.0, "grad_norm": 2.0466194967474554, "language_loss": 0.87541026, "learning_rate": 3.967288400710045e-06, "loss": 0.90102947, "num_input_tokens_seen": 15095055, "step": 715, "time_per_iteration": 3.051071882247925 }, { "auxiliary_loss_clip": 0.0134765, "auxiliary_loss_mlp": 0.01210933, "balance_loss_clip": 1.01656842, "balance_loss_mlp": 1.00303245, "epoch": 0.08609390969758914, "flos": 23550516851040.0, "grad_norm": 1.8751510459124614, "language_loss": 0.88458818, "learning_rate": 3.9671479412353335e-06, "loss": 0.91017401, "num_input_tokens_seen": 15113520, "step": 716, "time_per_iteration": 2.9751393795013428 }, { "auxiliary_loss_clip": 0.01397905, "auxiliary_loss_mlp": 0.0121314, "balance_loss_clip": 1.01700616, "balance_loss_mlp": 1.00371289, "epoch": 0.08621415258822822, "flos": 25885860225120.0, "grad_norm": 5.048272185142492, "language_loss": 0.74256378, "learning_rate": 3.967007183345843e-06, "loss": 0.76867425, "num_input_tokens_seen": 15133375, "step": 717, "time_per_iteration": 2.8296053409576416 }, { "auxiliary_loss_clip": 0.01395461, "auxiliary_loss_mlp": 0.01211065, "balance_loss_clip": 1.0164206, "balance_loss_mlp": 1.00278223, "epoch": 0.08633439547886732, "flos": 13589576658720.0, "grad_norm": 2.6218899180326205, "language_loss": 0.89503479, "learning_rate": 3.966866127062927e-06, "loss": 0.92110008, "num_input_tokens_seen": 15150500, "step": 718, "time_per_iteration": 2.847456693649292 }, { "auxiliary_loss_clip": 0.01389178, "auxiliary_loss_mlp": 0.01204537, "balance_loss_clip": 1.02023768, "balance_loss_mlp": 1.0000695, "epoch": 0.0864546383695064, "flos": 57767371767360.0, "grad_norm": 0.9101240568513261, "language_loss": 0.62687254, "learning_rate": 3.966724772407982e-06, "loss": 0.65280968, "num_input_tokens_seen": 15208015, "step": 719, "time_per_iteration": 3.234435796737671 }, { "auxiliary_loss_clip": 0.01363114, "auxiliary_loss_mlp": 0.01212706, "balance_loss_clip": 1.01685607, "balance_loss_mlp": 1.00366044, "epoch": 0.0865748812601455, "flos": 20046962934720.0, "grad_norm": 2.1200620254441134, "language_loss": 0.8900255, "learning_rate": 3.966583119402454e-06, "loss": 0.91578364, "num_input_tokens_seen": 15224780, "step": 720, "time_per_iteration": 2.9150798320770264 }, { "auxiliary_loss_clip": 0.01398351, "auxiliary_loss_mlp": 0.008742, "balance_loss_clip": 1.01757193, "balance_loss_mlp": 0.9999786, "epoch": 0.08669512415078459, "flos": 35262461642400.0, "grad_norm": 1.6842810149086296, "language_loss": 0.82080573, "learning_rate": 3.9664411680678305e-06, "loss": 0.84353125, "num_input_tokens_seen": 15246535, "step": 721, "time_per_iteration": 2.8992321491241455 }, { "auxiliary_loss_clip": 0.01354877, "auxiliary_loss_mlp": 0.0120462, "balance_loss_clip": 1.02021766, "balance_loss_mlp": 1.00015247, "epoch": 0.08681536704142367, "flos": 65654903443680.0, "grad_norm": 0.8397949293306567, "language_loss": 0.61423278, "learning_rate": 3.966298918425644e-06, "loss": 0.63982785, "num_input_tokens_seen": 15304025, "step": 722, "time_per_iteration": 3.979058265686035 }, { "auxiliary_loss_clip": 0.01407584, "auxiliary_loss_mlp": 0.01212993, "balance_loss_clip": 1.01678073, "balance_loss_mlp": 1.00375652, "epoch": 0.08693560993206277, "flos": 34529954600160.0, "grad_norm": 1.7070091479638652, "language_loss": 0.82798874, "learning_rate": 3.966156370497476e-06, "loss": 0.85419446, "num_input_tokens_seen": 15327635, "step": 723, "time_per_iteration": 4.332622766494751 }, { "auxiliary_loss_clip": 0.01405762, "auxiliary_loss_mlp": 0.01212543, "balance_loss_clip": 1.01701832, "balance_loss_mlp": 1.00311613, "epoch": 0.08705585282270185, "flos": 23149430569440.0, "grad_norm": 1.921109567613933, "language_loss": 0.88956654, "learning_rate": 3.96601352430495e-06, "loss": 0.91574955, "num_input_tokens_seen": 15347405, "step": 724, "time_per_iteration": 6.247823238372803 }, { "auxiliary_loss_clip": 0.01372861, "auxiliary_loss_mlp": 0.01211178, "balance_loss_clip": 1.01671934, "balance_loss_mlp": 1.00327659, "epoch": 0.08717609571334095, "flos": 29497612561920.0, "grad_norm": 1.514987409642183, "language_loss": 0.8321169, "learning_rate": 3.965870379869735e-06, "loss": 0.85795724, "num_input_tokens_seen": 15369450, "step": 725, "time_per_iteration": 3.1226511001586914 }, { "auxiliary_loss_clip": 0.01409336, "auxiliary_loss_mlp": 0.01212403, "balance_loss_clip": 1.01733565, "balance_loss_mlp": 1.00335729, "epoch": 0.08729633860398003, "flos": 20667499570080.0, "grad_norm": 2.1041528505746863, "language_loss": 0.870408, "learning_rate": 3.965726937213547e-06, "loss": 0.89662534, "num_input_tokens_seen": 15388085, "step": 726, "time_per_iteration": 2.775360345840454 }, { "auxiliary_loss_clip": 0.014099, "auxiliary_loss_mlp": 0.01214156, "balance_loss_clip": 1.01702154, "balance_loss_mlp": 1.00472903, "epoch": 0.08741658149461913, "flos": 18369503614080.0, "grad_norm": 2.2474215039494227, "language_loss": 0.80622542, "learning_rate": 3.965583196358144e-06, "loss": 0.83246601, "num_input_tokens_seen": 15407120, "step": 727, "time_per_iteration": 2.8626554012298584 }, { "auxiliary_loss_clip": 0.01422407, "auxiliary_loss_mlp": 0.01212167, "balance_loss_clip": 1.01751566, "balance_loss_mlp": 1.00312138, "epoch": 0.08753682438525823, "flos": 18729686033280.0, "grad_norm": 2.7230725742576314, "language_loss": 0.74423873, "learning_rate": 3.965439157325335e-06, "loss": 0.77058446, "num_input_tokens_seen": 15424485, "step": 728, "time_per_iteration": 2.7573001384735107 }, { "auxiliary_loss_clip": 0.01395272, "auxiliary_loss_mlp": 0.01212425, "balance_loss_clip": 1.01650858, "balance_loss_mlp": 1.0033797, "epoch": 0.08765706727589731, "flos": 27776124018720.0, "grad_norm": 2.5040417147241665, "language_loss": 0.75592172, "learning_rate": 3.965294820136968e-06, "loss": 0.78199869, "num_input_tokens_seen": 15446285, "step": 729, "time_per_iteration": 2.989445209503174 }, { "auxiliary_loss_clip": 0.01371793, "auxiliary_loss_mlp": 0.01212509, "balance_loss_clip": 1.01570749, "balance_loss_mlp": 1.00327277, "epoch": 0.08777731016653641, "flos": 24389138740320.0, "grad_norm": 1.9822271648818584, "language_loss": 0.87188506, "learning_rate": 3.965150184814938e-06, "loss": 0.89772809, "num_input_tokens_seen": 15465770, "step": 730, "time_per_iteration": 3.075639247894287 }, { "auxiliary_loss_clip": 0.01397106, "auxiliary_loss_mlp": 0.01210875, "balance_loss_clip": 1.01734686, "balance_loss_mlp": 1.00316453, "epoch": 0.08789755305717549, "flos": 21981866653440.0, "grad_norm": 2.0235548700689727, "language_loss": 0.76730454, "learning_rate": 3.965005251381189e-06, "loss": 0.79338437, "num_input_tokens_seen": 15483705, "step": 731, "time_per_iteration": 2.9855587482452393 }, { "auxiliary_loss_clip": 0.0139453, "auxiliary_loss_mlp": 0.01204631, "balance_loss_clip": 1.01885676, "balance_loss_mlp": 1.00016296, "epoch": 0.08801779594781459, "flos": 58360152607200.0, "grad_norm": 0.9008238327829404, "language_loss": 0.64628631, "learning_rate": 3.964860019857705e-06, "loss": 0.67227793, "num_input_tokens_seen": 15548620, "step": 732, "time_per_iteration": 3.4222538471221924 }, { "auxiliary_loss_clip": 0.01421526, "auxiliary_loss_mlp": 0.01210475, "balance_loss_clip": 1.0178237, "balance_loss_mlp": 1.00333691, "epoch": 0.08813803883845367, "flos": 23294796789600.0, "grad_norm": 1.817560733969405, "language_loss": 0.84037924, "learning_rate": 3.964714490266518e-06, "loss": 0.86669928, "num_input_tokens_seen": 15569265, "step": 733, "time_per_iteration": 2.7647783756256104 }, { "auxiliary_loss_clip": 0.01381609, "auxiliary_loss_mlp": 0.01204617, "balance_loss_clip": 1.01635659, "balance_loss_mlp": 1.00014937, "epoch": 0.08825828172909277, "flos": 63424956443040.0, "grad_norm": 0.8887895399931093, "language_loss": 0.64597631, "learning_rate": 3.964568662629706e-06, "loss": 0.67183858, "num_input_tokens_seen": 15630570, "step": 734, "time_per_iteration": 3.178680419921875 }, { "auxiliary_loss_clip": 0.01408926, "auxiliary_loss_mlp": 0.01212343, "balance_loss_clip": 1.01686072, "balance_loss_mlp": 1.00348842, "epoch": 0.08837852461973186, "flos": 26720997517440.0, "grad_norm": 2.6828195277877596, "language_loss": 0.84254766, "learning_rate": 3.9644225369693895e-06, "loss": 0.86876029, "num_input_tokens_seen": 15650870, "step": 735, "time_per_iteration": 2.9110231399536133 }, { "auxiliary_loss_clip": 0.01421774, "auxiliary_loss_mlp": 0.01212198, "balance_loss_clip": 1.01818037, "balance_loss_mlp": 1.00391507, "epoch": 0.08849876751037095, "flos": 27265438293120.0, "grad_norm": 2.4217349399756807, "language_loss": 0.86584365, "learning_rate": 3.964276113307735e-06, "loss": 0.8921833, "num_input_tokens_seen": 15670835, "step": 736, "time_per_iteration": 2.8161978721618652 }, { "auxiliary_loss_clip": 0.01345558, "auxiliary_loss_mlp": 0.01212007, "balance_loss_clip": 1.01524317, "balance_loss_mlp": 1.00315285, "epoch": 0.08861901040101004, "flos": 19828769909760.0, "grad_norm": 1.9103168677948428, "language_loss": 0.80907905, "learning_rate": 3.9641293916669574e-06, "loss": 0.83465469, "num_input_tokens_seen": 15689795, "step": 737, "time_per_iteration": 2.8248915672302246 }, { "auxiliary_loss_clip": 0.0135905, "auxiliary_loss_mlp": 0.01213064, "balance_loss_clip": 1.01634979, "balance_loss_mlp": 1.00382805, "epoch": 0.08873925329164913, "flos": 23658715271520.0, "grad_norm": 2.729150190246389, "language_loss": 0.82884324, "learning_rate": 3.9639823720693115e-06, "loss": 0.85456431, "num_input_tokens_seen": 15711650, "step": 738, "time_per_iteration": 2.968928813934326 }, { "auxiliary_loss_clip": 0.01350086, "auxiliary_loss_mlp": 0.01203012, "balance_loss_clip": 1.01823556, "balance_loss_mlp": 1.00007045, "epoch": 0.08885949618228822, "flos": 71831579986080.0, "grad_norm": 0.8522480559663518, "language_loss": 0.60013807, "learning_rate": 3.963835054537102e-06, "loss": 0.625669, "num_input_tokens_seen": 15780615, "step": 739, "time_per_iteration": 3.426863670349121 }, { "auxiliary_loss_clip": 0.01393707, "auxiliary_loss_mlp": 0.01211529, "balance_loss_clip": 1.016222, "balance_loss_mlp": 1.00305617, "epoch": 0.08897973907292732, "flos": 22346180295840.0, "grad_norm": 2.0856678905657615, "language_loss": 0.60834312, "learning_rate": 3.963687439092676e-06, "loss": 0.63439554, "num_input_tokens_seen": 15801300, "step": 740, "time_per_iteration": 2.8410279750823975 }, { "auxiliary_loss_clip": 0.01395605, "auxiliary_loss_mlp": 0.01211687, "balance_loss_clip": 1.01706433, "balance_loss_mlp": 1.00359583, "epoch": 0.0890999819635664, "flos": 21251838345120.0, "grad_norm": 2.1042297935617382, "language_loss": 0.80280882, "learning_rate": 3.963539525758427e-06, "loss": 0.82888174, "num_input_tokens_seen": 15820860, "step": 741, "time_per_iteration": 2.764317512512207 }, { "auxiliary_loss_clip": 0.01376917, "auxiliary_loss_mlp": 0.01211687, "balance_loss_clip": 1.01596594, "balance_loss_mlp": 1.00321436, "epoch": 0.0892202248542055, "flos": 25370899581600.0, "grad_norm": 4.507676548066569, "language_loss": 0.67593765, "learning_rate": 3.9633913145567925e-06, "loss": 0.70182371, "num_input_tokens_seen": 15841350, "step": 742, "time_per_iteration": 2.8089921474456787 }, { "auxiliary_loss_clip": 0.01383711, "auxiliary_loss_mlp": 0.01210155, "balance_loss_clip": 1.0172931, "balance_loss_mlp": 1.00225413, "epoch": 0.08934046774484458, "flos": 24457906169280.0, "grad_norm": 1.8928837314886924, "language_loss": 0.8171218, "learning_rate": 3.9632428055102575e-06, "loss": 0.84306049, "num_input_tokens_seen": 15861360, "step": 743, "time_per_iteration": 2.841416597366333 }, { "auxiliary_loss_clip": 0.01396488, "auxiliary_loss_mlp": 0.01211764, "balance_loss_clip": 1.01750588, "balance_loss_mlp": 1.00290942, "epoch": 0.08946071063548368, "flos": 35772788131200.0, "grad_norm": 2.0534865803877884, "language_loss": 0.67175186, "learning_rate": 3.9630939986413495e-06, "loss": 0.69783437, "num_input_tokens_seen": 15883160, "step": 744, "time_per_iteration": 2.8457062244415283 }, { "auxiliary_loss_clip": 0.01369172, "auxiliary_loss_mlp": 0.01210218, "balance_loss_clip": 1.01640129, "balance_loss_mlp": 1.00250828, "epoch": 0.08958095352612276, "flos": 14356593148320.0, "grad_norm": 1.8364046175825475, "language_loss": 0.78225732, "learning_rate": 3.962944893972643e-06, "loss": 0.80805123, "num_input_tokens_seen": 15901610, "step": 745, "time_per_iteration": 2.8486790657043457 }, { "auxiliary_loss_clip": 0.01371941, "auxiliary_loss_mlp": 0.01211231, "balance_loss_clip": 1.01651382, "balance_loss_mlp": 1.00313962, "epoch": 0.08970119641676186, "flos": 17853285641760.0, "grad_norm": 6.761052740609932, "language_loss": 0.90944397, "learning_rate": 3.962795491526756e-06, "loss": 0.93527573, "num_input_tokens_seen": 15918770, "step": 746, "time_per_iteration": 2.8770551681518555 }, { "auxiliary_loss_clip": 0.01422696, "auxiliary_loss_mlp": 0.01212233, "balance_loss_clip": 1.01851857, "balance_loss_mlp": 1.00299668, "epoch": 0.08982143930740095, "flos": 20811680308800.0, "grad_norm": 2.4613778482969892, "language_loss": 0.8911202, "learning_rate": 3.962645791326354e-06, "loss": 0.9174695, "num_input_tokens_seen": 15938025, "step": 747, "time_per_iteration": 2.9093658924102783 }, { "auxiliary_loss_clip": 0.01395444, "auxiliary_loss_mlp": 0.01211931, "balance_loss_clip": 1.01740861, "balance_loss_mlp": 1.00364912, "epoch": 0.08994168219804004, "flos": 24097723750080.0, "grad_norm": 3.1084872289495133, "language_loss": 0.83189249, "learning_rate": 3.962495793394146e-06, "loss": 0.8579663, "num_input_tokens_seen": 15957215, "step": 748, "time_per_iteration": 2.9085915088653564 }, { "auxiliary_loss_clip": 0.01409042, "auxiliary_loss_mlp": 0.01203222, "balance_loss_clip": 1.0174557, "balance_loss_mlp": 1.00028038, "epoch": 0.09006192508867913, "flos": 57188924475840.0, "grad_norm": 0.833536565429098, "language_loss": 0.61199677, "learning_rate": 3.9623454977528864e-06, "loss": 0.63811946, "num_input_tokens_seen": 16015870, "step": 749, "time_per_iteration": 4.068679332733154 }, { "auxiliary_loss_clip": 0.01371202, "auxiliary_loss_mlp": 0.01212063, "balance_loss_clip": 1.0162673, "balance_loss_mlp": 1.00358987, "epoch": 0.09018216797931822, "flos": 20487516131520.0, "grad_norm": 1.9500929556411135, "language_loss": 0.85179245, "learning_rate": 3.962194904425375e-06, "loss": 0.87762511, "num_input_tokens_seen": 16036500, "step": 750, "time_per_iteration": 5.838853597640991 }, { "auxiliary_loss_clip": 0.01393761, "auxiliary_loss_mlp": 0.01212799, "balance_loss_clip": 1.01638615, "balance_loss_mlp": 1.00356269, "epoch": 0.09030241086995731, "flos": 22638134141280.0, "grad_norm": 2.4134478507757513, "language_loss": 0.68006462, "learning_rate": 3.9620440134344566e-06, "loss": 0.70613021, "num_input_tokens_seen": 16054655, "step": 751, "time_per_iteration": 2.779067039489746 }, { "auxiliary_loss_clip": 0.01343859, "auxiliary_loss_mlp": 0.01212808, "balance_loss_clip": 1.01452768, "balance_loss_mlp": 1.00357199, "epoch": 0.09042265376059641, "flos": 21871512812160.0, "grad_norm": 4.106609543352887, "language_loss": 0.82691324, "learning_rate": 3.9618928248030215e-06, "loss": 0.85247988, "num_input_tokens_seen": 16074165, "step": 752, "time_per_iteration": 2.8491883277893066 }, { "auxiliary_loss_clip": 0.0139444, "auxiliary_loss_mlp": 0.01211349, "balance_loss_clip": 1.0168519, "balance_loss_mlp": 1.0032568, "epoch": 0.0905428966512355, "flos": 24316204164480.0, "grad_norm": 2.284580822488857, "language_loss": 0.82556397, "learning_rate": 3.961741338554005e-06, "loss": 0.85162187, "num_input_tokens_seen": 16092505, "step": 753, "time_per_iteration": 2.85516357421875 }, { "auxiliary_loss_clip": 0.01380089, "auxiliary_loss_mlp": 0.01212256, "balance_loss_clip": 1.01657212, "balance_loss_mlp": 1.00340116, "epoch": 0.09066313954187459, "flos": 35845076080800.0, "grad_norm": 1.8446599579695615, "language_loss": 0.75379401, "learning_rate": 3.9615895547103865e-06, "loss": 0.77971745, "num_input_tokens_seen": 16116150, "step": 754, "time_per_iteration": 3.018127202987671 }, { "auxiliary_loss_clip": 0.01380694, "auxiliary_loss_mlp": 0.01211778, "balance_loss_clip": 1.01591372, "balance_loss_mlp": 1.00330496, "epoch": 0.09078338243251367, "flos": 29168742382560.0, "grad_norm": 2.76022528078676, "language_loss": 0.78186452, "learning_rate": 3.961437473295193e-06, "loss": 0.80778921, "num_input_tokens_seen": 16136295, "step": 755, "time_per_iteration": 2.8982765674591064 }, { "auxiliary_loss_clip": 0.01359137, "auxiliary_loss_mlp": 0.01211207, "balance_loss_clip": 1.01573348, "balance_loss_mlp": 1.00330567, "epoch": 0.09090362532315277, "flos": 21907710672480.0, "grad_norm": 2.8427910320751977, "language_loss": 0.72412902, "learning_rate": 3.961285094331495e-06, "loss": 0.74983251, "num_input_tokens_seen": 16154210, "step": 756, "time_per_iteration": 2.912888288497925 }, { "auxiliary_loss_clip": 0.01420612, "auxiliary_loss_mlp": 0.01211258, "balance_loss_clip": 1.01738751, "balance_loss_mlp": 1.00316691, "epoch": 0.09102386821379185, "flos": 27344515818240.0, "grad_norm": 1.6879298503605338, "language_loss": 0.8586095, "learning_rate": 3.961132417842406e-06, "loss": 0.88492823, "num_input_tokens_seen": 16173995, "step": 757, "time_per_iteration": 2.7710113525390625 }, { "auxiliary_loss_clip": 0.01408147, "auxiliary_loss_mlp": 0.01210468, "balance_loss_clip": 1.01742029, "balance_loss_mlp": 1.00275779, "epoch": 0.09114411110443095, "flos": 20813512416480.0, "grad_norm": 2.636327282477052, "language_loss": 0.7492671, "learning_rate": 3.960979443851089e-06, "loss": 0.77545327, "num_input_tokens_seen": 16191020, "step": 758, "time_per_iteration": 2.761991500854492 }, { "auxiliary_loss_clip": 0.01394461, "auxiliary_loss_mlp": 0.01212795, "balance_loss_clip": 1.01720643, "balance_loss_mlp": 1.00413144, "epoch": 0.09126435399507005, "flos": 26145962975520.0, "grad_norm": 2.4143185328887538, "language_loss": 0.78757322, "learning_rate": 3.96082617238075e-06, "loss": 0.81364578, "num_input_tokens_seen": 16213645, "step": 759, "time_per_iteration": 2.849501132965088 }, { "auxiliary_loss_clip": 0.0137889, "auxiliary_loss_mlp": 0.01209833, "balance_loss_clip": 1.01549447, "balance_loss_mlp": 1.00231397, "epoch": 0.09138459688570913, "flos": 24388923198240.0, "grad_norm": 2.3847438220320316, "language_loss": 0.79844302, "learning_rate": 3.960672603454639e-06, "loss": 0.82433021, "num_input_tokens_seen": 16233625, "step": 760, "time_per_iteration": 2.8231000900268555 }, { "auxiliary_loss_clip": 0.01408395, "auxiliary_loss_mlp": 0.01212533, "balance_loss_clip": 1.01735783, "balance_loss_mlp": 1.00348783, "epoch": 0.09150483977634823, "flos": 21032675380800.0, "grad_norm": 2.6745961103233085, "language_loss": 0.76834673, "learning_rate": 3.960518737096054e-06, "loss": 0.79455596, "num_input_tokens_seen": 16253255, "step": 761, "time_per_iteration": 2.8494603633880615 }, { "auxiliary_loss_clip": 0.01396169, "auxiliary_loss_mlp": 0.01210318, "balance_loss_clip": 1.01648831, "balance_loss_mlp": 1.00260758, "epoch": 0.09162508266698731, "flos": 22857009716160.0, "grad_norm": 2.3364894216926295, "language_loss": 0.72851038, "learning_rate": 3.960364573328334e-06, "loss": 0.75457519, "num_input_tokens_seen": 16272580, "step": 762, "time_per_iteration": 2.822849750518799 }, { "auxiliary_loss_clip": 0.01355555, "auxiliary_loss_mlp": 0.01212048, "balance_loss_clip": 1.015661, "balance_loss_mlp": 1.00357509, "epoch": 0.0917453255576264, "flos": 21724422255360.0, "grad_norm": 2.279697384620974, "language_loss": 0.88738519, "learning_rate": 3.9602101121748675e-06, "loss": 0.91306126, "num_input_tokens_seen": 16293075, "step": 763, "time_per_iteration": 2.88354229927063 }, { "auxiliary_loss_clip": 0.01370077, "auxiliary_loss_mlp": 0.01209767, "balance_loss_clip": 1.01545072, "balance_loss_mlp": 1.00224781, "epoch": 0.0918655684482655, "flos": 14609223773280.0, "grad_norm": 2.0857926642477436, "language_loss": 0.72670019, "learning_rate": 3.960055353659085e-06, "loss": 0.75249863, "num_input_tokens_seen": 16310185, "step": 764, "time_per_iteration": 2.6955904960632324 }, { "auxiliary_loss_clip": 0.01356763, "auxiliary_loss_mlp": 0.01211394, "balance_loss_clip": 1.01556766, "balance_loss_mlp": 1.0033021, "epoch": 0.09198581133890459, "flos": 23435025923520.0, "grad_norm": 1.6807150532826225, "language_loss": 0.8354578, "learning_rate": 3.959900297804465e-06, "loss": 0.86113942, "num_input_tokens_seen": 16330355, "step": 765, "time_per_iteration": 2.768444776535034 }, { "auxiliary_loss_clip": 0.01380951, "auxiliary_loss_mlp": 0.01210864, "balance_loss_clip": 1.01727176, "balance_loss_mlp": 1.0029633, "epoch": 0.09210605422954368, "flos": 16795895948640.0, "grad_norm": 2.7330636799922976, "language_loss": 0.77155638, "learning_rate": 3.9597449446345276e-06, "loss": 0.7974745, "num_input_tokens_seen": 16347600, "step": 766, "time_per_iteration": 3.004547595977783 }, { "auxiliary_loss_clip": 0.01394901, "auxiliary_loss_mlp": 0.01210445, "balance_loss_clip": 1.01711249, "balance_loss_mlp": 1.00273514, "epoch": 0.09222629712018277, "flos": 22674260154240.0, "grad_norm": 2.3819022572484045, "language_loss": 0.83103573, "learning_rate": 3.95958929417284e-06, "loss": 0.85708916, "num_input_tokens_seen": 16365755, "step": 767, "time_per_iteration": 3.062824010848999 }, { "auxiliary_loss_clip": 0.01395372, "auxiliary_loss_mlp": 0.01202842, "balance_loss_clip": 1.01701057, "balance_loss_mlp": 0.99990028, "epoch": 0.09234654001082186, "flos": 69976795579200.0, "grad_norm": 0.7270588090406656, "language_loss": 0.58827996, "learning_rate": 3.9594333464430145e-06, "loss": 0.6142621, "num_input_tokens_seen": 16435245, "step": 768, "time_per_iteration": 3.4630770683288574 }, { "auxiliary_loss_clip": 0.01308281, "auxiliary_loss_mlp": 0.01210357, "balance_loss_clip": 1.01476979, "balance_loss_mlp": 1.0028379, "epoch": 0.09246678290146094, "flos": 20011447624320.0, "grad_norm": 2.337826826825126, "language_loss": 0.88164485, "learning_rate": 3.959277101468709e-06, "loss": 0.90683126, "num_input_tokens_seen": 16454795, "step": 769, "time_per_iteration": 3.0443146228790283 }, { "auxiliary_loss_clip": 0.01394029, "auxiliary_loss_mlp": 0.01209925, "balance_loss_clip": 1.01632214, "balance_loss_mlp": 1.00240576, "epoch": 0.09258702579210004, "flos": 17747458184160.0, "grad_norm": 2.507091383543436, "language_loss": 0.78858054, "learning_rate": 3.959120559273624e-06, "loss": 0.81462002, "num_input_tokens_seen": 16472580, "step": 770, "time_per_iteration": 2.985687732696533 }, { "auxiliary_loss_clip": 0.01381576, "auxiliary_loss_mlp": 0.01210269, "balance_loss_clip": 1.01646161, "balance_loss_mlp": 1.00274897, "epoch": 0.09270726868273914, "flos": 20886554763360.0, "grad_norm": 1.8192880276141685, "language_loss": 0.83575046, "learning_rate": 3.958963719881509e-06, "loss": 0.86166883, "num_input_tokens_seen": 16490670, "step": 771, "time_per_iteration": 2.8454604148864746 }, { "auxiliary_loss_clip": 0.0139374, "auxiliary_loss_mlp": 0.01211759, "balance_loss_clip": 1.017066, "balance_loss_mlp": 1.00309551, "epoch": 0.09282751157337822, "flos": 17015705539200.0, "grad_norm": 2.1668750578321627, "language_loss": 0.93970716, "learning_rate": 3.958806583316154e-06, "loss": 0.96576214, "num_input_tokens_seen": 16508640, "step": 772, "time_per_iteration": 2.7367618083953857 }, { "auxiliary_loss_clip": 0.01419907, "auxiliary_loss_mlp": 0.0121165, "balance_loss_clip": 1.01756454, "balance_loss_mlp": 1.00355852, "epoch": 0.09294775446401732, "flos": 32523661023840.0, "grad_norm": 1.748484929624479, "language_loss": 0.78768438, "learning_rate": 3.9586491496013985e-06, "loss": 0.81400001, "num_input_tokens_seen": 16531035, "step": 773, "time_per_iteration": 2.8623337745666504 }, { "auxiliary_loss_clip": 0.01407584, "auxiliary_loss_mlp": 0.01211118, "balance_loss_clip": 1.01757884, "balance_loss_mlp": 1.00340819, "epoch": 0.0930679973546564, "flos": 18259760475360.0, "grad_norm": 2.059482412341383, "language_loss": 0.83108842, "learning_rate": 3.958491418761124e-06, "loss": 0.85727537, "num_input_tokens_seen": 16548605, "step": 774, "time_per_iteration": 2.933993339538574 }, { "auxiliary_loss_clip": 0.01391334, "auxiliary_loss_mlp": 0.0121085, "balance_loss_clip": 1.01623142, "balance_loss_mlp": 1.0029496, "epoch": 0.0931882402452955, "flos": 21099754396800.0, "grad_norm": 2.1720789692207605, "language_loss": 0.72640324, "learning_rate": 3.958333390819258e-06, "loss": 0.75242513, "num_input_tokens_seen": 16565535, "step": 775, "time_per_iteration": 4.922531843185425 }, { "auxiliary_loss_clip": 0.01420141, "auxiliary_loss_mlp": 0.01210325, "balance_loss_clip": 1.0177362, "balance_loss_mlp": 1.00280595, "epoch": 0.0933084831359346, "flos": 24207287270400.0, "grad_norm": 2.303464970910634, "language_loss": 0.79802102, "learning_rate": 3.9581750657997754e-06, "loss": 0.82432568, "num_input_tokens_seen": 16584900, "step": 776, "time_per_iteration": 4.3111817836761475 }, { "auxiliary_loss_clip": 0.01380251, "auxiliary_loss_mlp": 0.01209843, "balance_loss_clip": 1.0160768, "balance_loss_mlp": 1.00270462, "epoch": 0.09342872602657368, "flos": 25480283483520.0, "grad_norm": 1.7889826091353636, "language_loss": 0.89450538, "learning_rate": 3.95801644372669e-06, "loss": 0.92040628, "num_input_tokens_seen": 16604805, "step": 777, "time_per_iteration": 3.1763079166412354 }, { "auxiliary_loss_clip": 0.01390828, "auxiliary_loss_mlp": 0.01210013, "balance_loss_clip": 1.01561689, "balance_loss_mlp": 1.00268435, "epoch": 0.09354896891721277, "flos": 23149071332640.0, "grad_norm": 3.1323945089176037, "language_loss": 0.84457368, "learning_rate": 3.957857524624068e-06, "loss": 0.8705821, "num_input_tokens_seen": 16623685, "step": 778, "time_per_iteration": 3.040957450866699 }, { "auxiliary_loss_clip": 0.01380298, "auxiliary_loss_mlp": 0.01209589, "balance_loss_clip": 1.01585984, "balance_loss_mlp": 1.00264144, "epoch": 0.09366921180785186, "flos": 24279575220000.0, "grad_norm": 1.660638602359544, "language_loss": 0.89533126, "learning_rate": 3.957698308516016e-06, "loss": 0.92123008, "num_input_tokens_seen": 16644985, "step": 779, "time_per_iteration": 3.2539613246917725 }, { "auxiliary_loss_clip": 0.01393219, "auxiliary_loss_mlp": 0.0087414, "balance_loss_clip": 1.0171659, "balance_loss_mlp": 1.00020576, "epoch": 0.09378945469849095, "flos": 18730045270080.0, "grad_norm": 2.011050425260515, "language_loss": 0.82644677, "learning_rate": 3.957538795426688e-06, "loss": 0.84912038, "num_input_tokens_seen": 16662410, "step": 780, "time_per_iteration": 2.8412394523620605 }, { "auxiliary_loss_clip": 0.01377867, "auxiliary_loss_mlp": 0.01210425, "balance_loss_clip": 1.01477742, "balance_loss_mlp": 1.00271487, "epoch": 0.09390969758913004, "flos": 23218844624640.0, "grad_norm": 2.743809115403482, "language_loss": 0.76867664, "learning_rate": 3.9573789853802804e-06, "loss": 0.79455954, "num_input_tokens_seen": 16680885, "step": 781, "time_per_iteration": 3.00675892829895 }, { "auxiliary_loss_clip": 0.01369514, "auxiliary_loss_mlp": 0.00874157, "balance_loss_clip": 1.01594305, "balance_loss_mlp": 1.00015259, "epoch": 0.09402994047976913, "flos": 19646738821440.0, "grad_norm": 1.8521837045666578, "language_loss": 0.74791431, "learning_rate": 3.957218878401037e-06, "loss": 0.77035105, "num_input_tokens_seen": 16699375, "step": 782, "time_per_iteration": 2.7963459491729736 }, { "auxiliary_loss_clip": 0.01420157, "auxiliary_loss_mlp": 0.01212261, "balance_loss_clip": 1.01802647, "balance_loss_mlp": 1.00378823, "epoch": 0.09415018337040823, "flos": 29420474915520.0, "grad_norm": 2.4209920207006235, "language_loss": 0.89308071, "learning_rate": 3.957058474513246e-06, "loss": 0.91940486, "num_input_tokens_seen": 16719230, "step": 783, "time_per_iteration": 3.058093309402466 }, { "auxiliary_loss_clip": 0.01392953, "auxiliary_loss_mlp": 0.01210608, "balance_loss_clip": 1.01674521, "balance_loss_mlp": 1.00308824, "epoch": 0.09427042626104731, "flos": 24572103844320.0, "grad_norm": 1.7168040124163297, "language_loss": 0.78417742, "learning_rate": 3.956897773741241e-06, "loss": 0.81021303, "num_input_tokens_seen": 16738220, "step": 784, "time_per_iteration": 2.980221748352051 }, { "auxiliary_loss_clip": 0.01380375, "auxiliary_loss_mlp": 0.01209796, "balance_loss_clip": 1.01682055, "balance_loss_mlp": 1.00284874, "epoch": 0.09439066915168641, "flos": 26359593693120.0, "grad_norm": 2.1399474135769636, "language_loss": 0.71713734, "learning_rate": 3.956736776109398e-06, "loss": 0.74303913, "num_input_tokens_seen": 16759395, "step": 785, "time_per_iteration": 2.9924018383026123 }, { "auxiliary_loss_clip": 0.01406545, "auxiliary_loss_mlp": 0.00874217, "balance_loss_clip": 1.01687956, "balance_loss_mlp": 1.00019693, "epoch": 0.09451091204232549, "flos": 19427288467680.0, "grad_norm": 2.086944963251979, "language_loss": 0.83690161, "learning_rate": 3.956575481642143e-06, "loss": 0.8597092, "num_input_tokens_seen": 16778285, "step": 786, "time_per_iteration": 2.8129501342773438 }, { "auxiliary_loss_clip": 0.01350557, "auxiliary_loss_mlp": 0.01210711, "balance_loss_clip": 1.01450372, "balance_loss_mlp": 1.00319171, "epoch": 0.09463115493296459, "flos": 25368061610880.0, "grad_norm": 4.8695258739356335, "language_loss": 0.75009871, "learning_rate": 3.956413890363943e-06, "loss": 0.77571142, "num_input_tokens_seen": 16795265, "step": 787, "time_per_iteration": 2.9583911895751953 }, { "auxiliary_loss_clip": 0.01392088, "auxiliary_loss_mlp": 0.0121134, "balance_loss_clip": 1.01605415, "balance_loss_mlp": 1.00343907, "epoch": 0.09475139782360369, "flos": 10123262389440.0, "grad_norm": 2.1134138806032476, "language_loss": 0.81918049, "learning_rate": 3.956252002299312e-06, "loss": 0.84521472, "num_input_tokens_seen": 16811165, "step": 788, "time_per_iteration": 2.6982922554016113 }, { "auxiliary_loss_clip": 0.01419689, "auxiliary_loss_mlp": 0.01209655, "balance_loss_clip": 1.01805282, "balance_loss_mlp": 1.00232649, "epoch": 0.09487164071424277, "flos": 17231096517120.0, "grad_norm": 1.9600089305198962, "language_loss": 0.90763867, "learning_rate": 3.956089817472807e-06, "loss": 0.93393213, "num_input_tokens_seen": 16828470, "step": 789, "time_per_iteration": 2.6784448623657227 }, { "auxiliary_loss_clip": 0.01367698, "auxiliary_loss_mlp": 0.01210426, "balance_loss_clip": 1.01635027, "balance_loss_mlp": 1.00252521, "epoch": 0.09499188360488187, "flos": 30849578529120.0, "grad_norm": 2.486729195595696, "language_loss": 0.85425502, "learning_rate": 3.955927335909032e-06, "loss": 0.88003623, "num_input_tokens_seen": 16851680, "step": 790, "time_per_iteration": 2.8207390308380127 }, { "auxiliary_loss_clip": 0.01319362, "auxiliary_loss_mlp": 0.01209151, "balance_loss_clip": 1.01503074, "balance_loss_mlp": 1.00239432, "epoch": 0.09511212649552095, "flos": 29351707486560.0, "grad_norm": 2.2237618719935544, "language_loss": 0.75810003, "learning_rate": 3.955764557632634e-06, "loss": 0.78338522, "num_input_tokens_seen": 16871490, "step": 791, "time_per_iteration": 2.8817155361175537 }, { "auxiliary_loss_clip": 0.01381128, "auxiliary_loss_mlp": 0.01210937, "balance_loss_clip": 1.01596427, "balance_loss_mlp": 1.003227, "epoch": 0.09523236938616005, "flos": 10378695061440.0, "grad_norm": 2.4270780851900833, "language_loss": 0.94527215, "learning_rate": 3.955601482668309e-06, "loss": 0.97119284, "num_input_tokens_seen": 16889350, "step": 792, "time_per_iteration": 2.729097604751587 }, { "auxiliary_loss_clip": 0.01356015, "auxiliary_loss_mlp": 0.01210933, "balance_loss_clip": 1.0151546, "balance_loss_mlp": 1.00322223, "epoch": 0.09535261227679913, "flos": 19061825267520.0, "grad_norm": 1.8099422821192481, "language_loss": 0.88500321, "learning_rate": 3.955438111040794e-06, "loss": 0.91067272, "num_input_tokens_seen": 16907625, "step": 793, "time_per_iteration": 2.7703659534454346 }, { "auxiliary_loss_clip": 0.01343575, "auxiliary_loss_mlp": 0.01210394, "balance_loss_clip": 1.01554322, "balance_loss_mlp": 1.002684, "epoch": 0.09547285516743823, "flos": 20923004089440.0, "grad_norm": 2.3370600509584727, "language_loss": 0.80120707, "learning_rate": 3.955274442774873e-06, "loss": 0.82674682, "num_input_tokens_seen": 16926205, "step": 794, "time_per_iteration": 2.817439079284668 }, { "auxiliary_loss_clip": 0.01406043, "auxiliary_loss_mlp": 0.01211417, "balance_loss_clip": 1.0170927, "balance_loss_mlp": 1.00332522, "epoch": 0.09559309805807732, "flos": 30154418904960.0, "grad_norm": 2.4348079889265803, "language_loss": 0.70942265, "learning_rate": 3.9551104778953725e-06, "loss": 0.73559725, "num_input_tokens_seen": 16946500, "step": 795, "time_per_iteration": 2.736668825149536 }, { "auxiliary_loss_clip": 0.01356337, "auxiliary_loss_mlp": 0.01210257, "balance_loss_clip": 1.0158515, "balance_loss_mlp": 1.00254691, "epoch": 0.0957133409487164, "flos": 21066753744000.0, "grad_norm": 1.8119535464332606, "language_loss": 0.85313988, "learning_rate": 3.954946216427167e-06, "loss": 0.87880582, "num_input_tokens_seen": 16966960, "step": 796, "time_per_iteration": 2.7612643241882324 }, { "auxiliary_loss_clip": 0.01343402, "auxiliary_loss_mlp": 0.01203193, "balance_loss_clip": 1.01261294, "balance_loss_mlp": 1.00025105, "epoch": 0.0958335838393555, "flos": 71298010876320.0, "grad_norm": 0.8748307767757819, "language_loss": 0.61556995, "learning_rate": 3.954781658395176e-06, "loss": 0.64103585, "num_input_tokens_seen": 17023215, "step": 797, "time_per_iteration": 3.193286418914795 }, { "auxiliary_loss_clip": 0.01381694, "auxiliary_loss_mlp": 0.01210823, "balance_loss_clip": 1.01633787, "balance_loss_mlp": 1.00273132, "epoch": 0.09595382672999458, "flos": 21872985683040.0, "grad_norm": 2.5079943146822465, "language_loss": 0.92409694, "learning_rate": 3.95461680382436e-06, "loss": 0.95002216, "num_input_tokens_seen": 17042140, "step": 798, "time_per_iteration": 2.7041850090026855 }, { "auxiliary_loss_clip": 0.01394632, "auxiliary_loss_mlp": 0.01211435, "balance_loss_clip": 1.01714587, "balance_loss_mlp": 1.00334358, "epoch": 0.09607406962063368, "flos": 18695571746400.0, "grad_norm": 3.4848452861799855, "language_loss": 0.86541235, "learning_rate": 3.9544516527397295e-06, "loss": 0.89147305, "num_input_tokens_seen": 17058490, "step": 799, "time_per_iteration": 2.6448123455047607 }, { "auxiliary_loss_clip": 0.01379343, "auxiliary_loss_mlp": 0.01210555, "balance_loss_clip": 1.01612735, "balance_loss_mlp": 1.00341702, "epoch": 0.09619431251127276, "flos": 22568468620320.0, "grad_norm": 2.7168930202508306, "language_loss": 0.80359751, "learning_rate": 3.954286205166338e-06, "loss": 0.8294965, "num_input_tokens_seen": 17079655, "step": 800, "time_per_iteration": 2.8084537982940674 }, { "auxiliary_loss_clip": 0.01393229, "auxiliary_loss_mlp": 0.0121214, "balance_loss_clip": 1.01719546, "balance_loss_mlp": 1.00366688, "epoch": 0.09631455540191186, "flos": 14246239307040.0, "grad_norm": 5.9379303302971005, "language_loss": 0.84321761, "learning_rate": 3.954120461129282e-06, "loss": 0.86927128, "num_input_tokens_seen": 17097065, "step": 801, "time_per_iteration": 3.6186673641204834 }, { "auxiliary_loss_clip": 0.01418581, "auxiliary_loss_mlp": 0.01209924, "balance_loss_clip": 1.01763523, "balance_loss_mlp": 1.00278616, "epoch": 0.09643479829255096, "flos": 20740398222240.0, "grad_norm": 2.350423241084128, "language_loss": 0.83710033, "learning_rate": 3.953954420653706e-06, "loss": 0.86338532, "num_input_tokens_seen": 17114090, "step": 802, "time_per_iteration": 4.707710266113281 }, { "auxiliary_loss_clip": 0.01394024, "auxiliary_loss_mlp": 0.01210322, "balance_loss_clip": 1.01620483, "balance_loss_mlp": 1.00299287, "epoch": 0.09655504118319004, "flos": 24420486903840.0, "grad_norm": 2.0389685889531473, "language_loss": 0.8831442, "learning_rate": 3.953788083764798e-06, "loss": 0.90918767, "num_input_tokens_seen": 17133325, "step": 803, "time_per_iteration": 3.705557346343994 }, { "auxiliary_loss_clip": 0.01326095, "auxiliary_loss_mlp": 0.01210061, "balance_loss_clip": 1.01355159, "balance_loss_mlp": 1.00292301, "epoch": 0.09667528407382914, "flos": 18441971182080.0, "grad_norm": 6.3456540882457695, "language_loss": 0.91995931, "learning_rate": 3.953621450487792e-06, "loss": 0.94532096, "num_input_tokens_seen": 17151945, "step": 804, "time_per_iteration": 2.807003974914551 }, { "auxiliary_loss_clip": 0.01405764, "auxiliary_loss_mlp": 0.0120451, "balance_loss_clip": 1.01617026, "balance_loss_mlp": 1.00004256, "epoch": 0.09679552696446822, "flos": 70816495178880.0, "grad_norm": 0.8404248429694475, "language_loss": 0.61196941, "learning_rate": 3.953454520847964e-06, "loss": 0.63807213, "num_input_tokens_seen": 17216790, "step": 805, "time_per_iteration": 3.345938205718994 }, { "auxiliary_loss_clip": 0.01381508, "auxiliary_loss_mlp": 0.01211847, "balance_loss_clip": 1.01630282, "balance_loss_mlp": 1.00318265, "epoch": 0.09691576985510732, "flos": 21945525098400.0, "grad_norm": 2.1362770137954246, "language_loss": 0.73685604, "learning_rate": 3.9532872948706395e-06, "loss": 0.76278961, "num_input_tokens_seen": 17236285, "step": 806, "time_per_iteration": 2.7428689002990723 }, { "auxiliary_loss_clip": 0.0137778, "auxiliary_loss_mlp": 0.01210361, "balance_loss_clip": 1.01604545, "balance_loss_mlp": 1.00284135, "epoch": 0.09703601274574641, "flos": 17965220124960.0, "grad_norm": 2.355205780958683, "language_loss": 0.83001339, "learning_rate": 3.9531197725811845e-06, "loss": 0.8558948, "num_input_tokens_seen": 17251670, "step": 807, "time_per_iteration": 2.80669903755188 }, { "auxiliary_loss_clip": 0.01417719, "auxiliary_loss_mlp": 0.01209093, "balance_loss_clip": 1.0173012, "balance_loss_mlp": 1.00252676, "epoch": 0.0971562556363855, "flos": 22162173405120.0, "grad_norm": 2.047486415122819, "language_loss": 0.8795042, "learning_rate": 3.952951954005013e-06, "loss": 0.90577233, "num_input_tokens_seen": 17271355, "step": 808, "time_per_iteration": 2.668193817138672 }, { "auxiliary_loss_clip": 0.0139348, "auxiliary_loss_mlp": 0.01210512, "balance_loss_clip": 1.01660204, "balance_loss_mlp": 1.00299239, "epoch": 0.0972764985270246, "flos": 25848728349120.0, "grad_norm": 1.8321243133455725, "language_loss": 0.84593153, "learning_rate": 3.952783839167584e-06, "loss": 0.87197143, "num_input_tokens_seen": 17291400, "step": 809, "time_per_iteration": 2.8949146270751953 }, { "auxiliary_loss_clip": 0.01391992, "auxiliary_loss_mlp": 0.01211869, "balance_loss_clip": 1.0163238, "balance_loss_mlp": 1.00377738, "epoch": 0.09739674141766368, "flos": 20339383788000.0, "grad_norm": 3.521756007699334, "language_loss": 0.74229467, "learning_rate": 3.952615428094398e-06, "loss": 0.76833326, "num_input_tokens_seen": 17310920, "step": 810, "time_per_iteration": 2.7212820053100586 }, { "auxiliary_loss_clip": 0.01356605, "auxiliary_loss_mlp": 0.01209396, "balance_loss_clip": 1.0153296, "balance_loss_mlp": 1.00263894, "epoch": 0.09751698430830277, "flos": 15743068562880.0, "grad_norm": 2.4752541424603476, "language_loss": 0.73231274, "learning_rate": 3.952446720811004e-06, "loss": 0.75797272, "num_input_tokens_seen": 17329245, "step": 811, "time_per_iteration": 2.788357973098755 }, { "auxiliary_loss_clip": 0.01344258, "auxiliary_loss_mlp": 0.01204753, "balance_loss_clip": 1.01456118, "balance_loss_mlp": 1.00028551, "epoch": 0.09763722719894186, "flos": 63716838441120.0, "grad_norm": 0.847169584010686, "language_loss": 0.63649315, "learning_rate": 3.952277717342995e-06, "loss": 0.66198319, "num_input_tokens_seen": 17395680, "step": 812, "time_per_iteration": 3.43731951713562 }, { "auxiliary_loss_clip": 0.01388117, "auxiliary_loss_mlp": 0.01210879, "balance_loss_clip": 1.01674366, "balance_loss_mlp": 1.00316918, "epoch": 0.09775747008958095, "flos": 22090927242240.0, "grad_norm": 2.180072925121036, "language_loss": 0.85285163, "learning_rate": 3.952108417716009e-06, "loss": 0.87884164, "num_input_tokens_seen": 17415135, "step": 813, "time_per_iteration": 2.731567859649658 }, { "auxiliary_loss_clip": 0.01406242, "auxiliary_loss_mlp": 0.0121134, "balance_loss_clip": 1.01805484, "balance_loss_mlp": 1.00324869, "epoch": 0.09787771298022005, "flos": 21286060403040.0, "grad_norm": 1.8979549839208842, "language_loss": 0.84723175, "learning_rate": 3.951938821955727e-06, "loss": 0.8734076, "num_input_tokens_seen": 17434535, "step": 814, "time_per_iteration": 2.751437187194824 }, { "auxiliary_loss_clip": 0.01379669, "auxiliary_loss_mlp": 0.0121011, "balance_loss_clip": 1.01655769, "balance_loss_mlp": 1.00239956, "epoch": 0.09799795587085913, "flos": 22054585687200.0, "grad_norm": 1.5789856591169487, "language_loss": 0.76577604, "learning_rate": 3.9517689300878786e-06, "loss": 0.79167378, "num_input_tokens_seen": 17454270, "step": 815, "time_per_iteration": 2.78328013420105 }, { "auxiliary_loss_clip": 0.01416571, "auxiliary_loss_mlp": 0.01208984, "balance_loss_clip": 1.0166564, "balance_loss_mlp": 1.00279951, "epoch": 0.09811819876149823, "flos": 22163754047040.0, "grad_norm": 1.7249181770295003, "language_loss": 0.7869885, "learning_rate": 3.951598742138236e-06, "loss": 0.81324404, "num_input_tokens_seen": 17472995, "step": 816, "time_per_iteration": 2.7264771461486816 }, { "auxiliary_loss_clip": 0.0139036, "auxiliary_loss_mlp": 0.01210671, "balance_loss_clip": 1.01583672, "balance_loss_mlp": 1.00353348, "epoch": 0.09823844165213731, "flos": 22231120452480.0, "grad_norm": 2.2342823584246903, "language_loss": 0.7962274, "learning_rate": 3.951428258132615e-06, "loss": 0.82223773, "num_input_tokens_seen": 17491115, "step": 817, "time_per_iteration": 2.7595036029815674 }, { "auxiliary_loss_clip": 0.01374155, "auxiliary_loss_mlp": 0.01210232, "balance_loss_clip": 1.01601982, "balance_loss_mlp": 1.00271273, "epoch": 0.09835868454277641, "flos": 22487738605920.0, "grad_norm": 2.814905095563752, "language_loss": 0.84254313, "learning_rate": 3.951257478096879e-06, "loss": 0.86838704, "num_input_tokens_seen": 17509480, "step": 818, "time_per_iteration": 2.7601702213287354 }, { "auxiliary_loss_clip": 0.01368937, "auxiliary_loss_mlp": 0.00874224, "balance_loss_clip": 1.01546288, "balance_loss_mlp": 1.00031233, "epoch": 0.0984789274334155, "flos": 16362563411520.0, "grad_norm": 3.0853136839365516, "language_loss": 0.68741971, "learning_rate": 3.951086402056936e-06, "loss": 0.70985132, "num_input_tokens_seen": 17524080, "step": 819, "time_per_iteration": 2.7824225425720215 }, { "auxiliary_loss_clip": 0.01274335, "auxiliary_loss_mlp": 0.00874219, "balance_loss_clip": 1.01248789, "balance_loss_mlp": 1.0003773, "epoch": 0.09859917032405459, "flos": 24243556978080.0, "grad_norm": 2.1840219036621393, "language_loss": 0.83821559, "learning_rate": 3.950915030038735e-06, "loss": 0.85970116, "num_input_tokens_seen": 17543875, "step": 820, "time_per_iteration": 3.1292343139648438 }, { "auxiliary_loss_clip": 0.01391472, "auxiliary_loss_mlp": 0.01209689, "balance_loss_clip": 1.01725566, "balance_loss_mlp": 1.00293195, "epoch": 0.09871941321469369, "flos": 17420204570400.0, "grad_norm": 2.0413291395383384, "language_loss": 0.83977431, "learning_rate": 3.9507433620682765e-06, "loss": 0.86578584, "num_input_tokens_seen": 17560810, "step": 821, "time_per_iteration": 2.905609130859375 }, { "auxiliary_loss_clip": 0.01355349, "auxiliary_loss_mlp": 0.01209656, "balance_loss_clip": 1.0145328, "balance_loss_mlp": 1.00232744, "epoch": 0.09883965610533277, "flos": 28477354744800.0, "grad_norm": 1.7331584536756968, "language_loss": 0.88046688, "learning_rate": 3.9505713981716e-06, "loss": 0.90611696, "num_input_tokens_seen": 17583640, "step": 822, "time_per_iteration": 2.8609421253204346 }, { "auxiliary_loss_clip": 0.01379162, "auxiliary_loss_mlp": 0.01209272, "balance_loss_clip": 1.01695716, "balance_loss_mlp": 1.003088, "epoch": 0.09895989899597187, "flos": 23693943192480.0, "grad_norm": 1.7579756552453578, "language_loss": 0.8118788, "learning_rate": 3.950399138374795e-06, "loss": 0.83776313, "num_input_tokens_seen": 17602720, "step": 823, "time_per_iteration": 2.8178348541259766 }, { "auxiliary_loss_clip": 0.01391439, "auxiliary_loss_mlp": 0.01209895, "balance_loss_clip": 1.0157907, "balance_loss_mlp": 1.00275755, "epoch": 0.09908014188661095, "flos": 24679619714880.0, "grad_norm": 2.329040178201206, "language_loss": 0.7428298, "learning_rate": 3.95022658270399e-06, "loss": 0.76884317, "num_input_tokens_seen": 17623085, "step": 824, "time_per_iteration": 2.806596040725708 }, { "auxiliary_loss_clip": 0.01378185, "auxiliary_loss_mlp": 0.01210155, "balance_loss_clip": 1.01686871, "balance_loss_mlp": 1.0035888, "epoch": 0.09920038477725004, "flos": 14064315989760.0, "grad_norm": 1.7390715322897192, "language_loss": 0.78089672, "learning_rate": 3.9500537311853635e-06, "loss": 0.8067801, "num_input_tokens_seen": 17641040, "step": 825, "time_per_iteration": 2.8337535858154297 }, { "auxiliary_loss_clip": 0.01404336, "auxiliary_loss_mlp": 0.01210669, "balance_loss_clip": 1.01645494, "balance_loss_mlp": 1.00334001, "epoch": 0.09932062766788914, "flos": 13407078562560.0, "grad_norm": 2.4526922249066248, "language_loss": 0.82868975, "learning_rate": 3.949880583845136e-06, "loss": 0.8548398, "num_input_tokens_seen": 17659115, "step": 826, "time_per_iteration": 2.7195799350738525 }, { "auxiliary_loss_clip": 0.01380281, "auxiliary_loss_mlp": 0.01209386, "balance_loss_clip": 1.01589525, "balance_loss_mlp": 1.00282025, "epoch": 0.09944087055852822, "flos": 19500761898720.0, "grad_norm": 2.0964469199468514, "language_loss": 0.8128534, "learning_rate": 3.949707140709575e-06, "loss": 0.83875012, "num_input_tokens_seen": 17678845, "step": 827, "time_per_iteration": 3.9027602672576904 }, { "auxiliary_loss_clip": 0.01403813, "auxiliary_loss_mlp": 0.01209708, "balance_loss_clip": 1.01677966, "balance_loss_mlp": 1.00295162, "epoch": 0.09956111344916732, "flos": 17749110673440.0, "grad_norm": 2.6610678511357904, "language_loss": 0.83363444, "learning_rate": 3.949533401804991e-06, "loss": 0.8597697, "num_input_tokens_seen": 17695750, "step": 828, "time_per_iteration": 5.542954206466675 }, { "auxiliary_loss_clip": 0.01392422, "auxiliary_loss_mlp": 0.00874245, "balance_loss_clip": 1.01689649, "balance_loss_mlp": 1.00038922, "epoch": 0.0996813563398064, "flos": 17967591087840.0, "grad_norm": 2.108865762232669, "language_loss": 0.90907919, "learning_rate": 3.949359367157739e-06, "loss": 0.93174589, "num_input_tokens_seen": 17714445, "step": 829, "time_per_iteration": 2.747926712036133 }, { "auxiliary_loss_clip": 0.01392857, "auxiliary_loss_mlp": 0.01210141, "balance_loss_clip": 1.01545942, "balance_loss_mlp": 1.00262117, "epoch": 0.0998015992304455, "flos": 17457049056960.0, "grad_norm": 2.0699064590574103, "language_loss": 0.75193942, "learning_rate": 3.949185036794222e-06, "loss": 0.77796936, "num_input_tokens_seen": 17732455, "step": 830, "time_per_iteration": 2.7830796241760254 }, { "auxiliary_loss_clip": 0.01415351, "auxiliary_loss_mlp": 0.01209266, "balance_loss_clip": 1.01631641, "balance_loss_mlp": 1.00308192, "epoch": 0.0999218421210846, "flos": 25888770043200.0, "grad_norm": 1.7170142024108057, "language_loss": 0.78624761, "learning_rate": 3.949010410740884e-06, "loss": 0.8124938, "num_input_tokens_seen": 17755280, "step": 831, "time_per_iteration": 2.76137638092041 }, { "auxiliary_loss_clip": 0.01377821, "auxiliary_loss_mlp": 0.00874182, "balance_loss_clip": 1.01578188, "balance_loss_mlp": 1.00038934, "epoch": 0.10004208501172368, "flos": 21215927874240.0, "grad_norm": 2.242742764557407, "language_loss": 0.86475289, "learning_rate": 3.948835489024216e-06, "loss": 0.88727295, "num_input_tokens_seen": 17775015, "step": 832, "time_per_iteration": 2.8209266662597656 }, { "auxiliary_loss_clip": 0.01401083, "auxiliary_loss_mlp": 0.01208545, "balance_loss_clip": 1.01583242, "balance_loss_mlp": 1.00255084, "epoch": 0.10016232790236278, "flos": 17348419552320.0, "grad_norm": 1.9593828912836426, "language_loss": 0.90394783, "learning_rate": 3.948660271670755e-06, "loss": 0.93004423, "num_input_tokens_seen": 17792165, "step": 833, "time_per_iteration": 2.770923614501953 }, { "auxiliary_loss_clip": 0.01364319, "auxiliary_loss_mlp": 0.01208229, "balance_loss_clip": 1.01482391, "balance_loss_mlp": 1.00185382, "epoch": 0.10028257079300186, "flos": 25666553566080.0, "grad_norm": 2.311488722974811, "language_loss": 0.83796799, "learning_rate": 3.948484758707079e-06, "loss": 0.86369348, "num_input_tokens_seen": 17811765, "step": 834, "time_per_iteration": 2.848304271697998 }, { "auxiliary_loss_clip": 0.01365937, "auxiliary_loss_mlp": 0.01209312, "balance_loss_clip": 1.01586938, "balance_loss_mlp": 1.00274658, "epoch": 0.10040281368364096, "flos": 25156047458880.0, "grad_norm": 2.5504703261886283, "language_loss": 0.83385932, "learning_rate": 3.948308950159815e-06, "loss": 0.85961187, "num_input_tokens_seen": 17830445, "step": 835, "time_per_iteration": 2.7549967765808105 }, { "auxiliary_loss_clip": 0.01365831, "auxiliary_loss_mlp": 0.01208978, "balance_loss_clip": 1.01502228, "balance_loss_mlp": 1.00260246, "epoch": 0.10052305657428004, "flos": 17603313369120.0, "grad_norm": 3.8918444417458837, "language_loss": 0.7564016, "learning_rate": 3.9481328460556326e-06, "loss": 0.78214967, "num_input_tokens_seen": 17847665, "step": 836, "time_per_iteration": 2.801356077194214 }, { "auxiliary_loss_clip": 0.01377561, "auxiliary_loss_mlp": 0.01209791, "balance_loss_clip": 1.01571465, "balance_loss_mlp": 1.00265265, "epoch": 0.10064329946491914, "flos": 18660164207040.0, "grad_norm": 2.3120182558924114, "language_loss": 0.89294958, "learning_rate": 3.9479564464212455e-06, "loss": 0.91882312, "num_input_tokens_seen": 17866825, "step": 837, "time_per_iteration": 2.7267990112304688 }, { "auxiliary_loss_clip": 0.01415733, "auxiliary_loss_mlp": 0.01210202, "balance_loss_clip": 1.01576185, "balance_loss_mlp": 1.0030638, "epoch": 0.10076354235555823, "flos": 17199065803680.0, "grad_norm": 3.7351515359320135, "language_loss": 0.76193535, "learning_rate": 3.947779751283414e-06, "loss": 0.78819466, "num_input_tokens_seen": 17883995, "step": 838, "time_per_iteration": 2.643587350845337 }, { "auxiliary_loss_clip": 0.01390101, "auxiliary_loss_mlp": 0.00874198, "balance_loss_clip": 1.01730812, "balance_loss_mlp": 1.00038552, "epoch": 0.10088378524619732, "flos": 22962262394880.0, "grad_norm": 1.7560831610883259, "language_loss": 0.75989538, "learning_rate": 3.947602760668944e-06, "loss": 0.78253841, "num_input_tokens_seen": 17903785, "step": 839, "time_per_iteration": 2.713542938232422 }, { "auxiliary_loss_clip": 0.01389278, "auxiliary_loss_mlp": 0.01209261, "balance_loss_clip": 1.01556706, "balance_loss_mlp": 1.00250387, "epoch": 0.10100402813683641, "flos": 37885843180800.0, "grad_norm": 1.8810181370873267, "language_loss": 0.71402907, "learning_rate": 3.947425474604684e-06, "loss": 0.74001443, "num_input_tokens_seen": 17927720, "step": 840, "time_per_iteration": 2.8245773315429688 }, { "auxiliary_loss_clip": 0.01375958, "auxiliary_loss_mlp": 0.01210069, "balance_loss_clip": 1.01506662, "balance_loss_mlp": 1.00293064, "epoch": 0.1011242710274755, "flos": 21543468877440.0, "grad_norm": 2.1704469518087706, "language_loss": 0.92123783, "learning_rate": 3.947247893117528e-06, "loss": 0.94709814, "num_input_tokens_seen": 17946225, "step": 841, "time_per_iteration": 2.8012773990631104 }, { "auxiliary_loss_clip": 0.0140362, "auxiliary_loss_mlp": 0.01209706, "balance_loss_clip": 1.01623249, "balance_loss_mlp": 1.00294924, "epoch": 0.10124451391811459, "flos": 13621463677440.0, "grad_norm": 4.994730440006534, "language_loss": 0.70181966, "learning_rate": 3.947070016234413e-06, "loss": 0.7279529, "num_input_tokens_seen": 17962015, "step": 842, "time_per_iteration": 2.641984462738037 }, { "auxiliary_loss_clip": 0.01377041, "auxiliary_loss_mlp": 0.01208381, "balance_loss_clip": 1.01513541, "balance_loss_mlp": 1.00200605, "epoch": 0.10136475680875369, "flos": 16649236476000.0, "grad_norm": 2.1337492713396164, "language_loss": 0.74759984, "learning_rate": 3.946891843982326e-06, "loss": 0.77345407, "num_input_tokens_seen": 17979680, "step": 843, "time_per_iteration": 2.8708224296569824 }, { "auxiliary_loss_clip": 0.01389053, "auxiliary_loss_mlp": 0.01208796, "balance_loss_clip": 1.01538205, "balance_loss_mlp": 1.00223017, "epoch": 0.10148499969939277, "flos": 19461043517760.0, "grad_norm": 2.1300953251163124, "language_loss": 0.74451298, "learning_rate": 3.9467133763882935e-06, "loss": 0.77049148, "num_input_tokens_seen": 17998145, "step": 844, "time_per_iteration": 2.750619888305664 }, { "auxiliary_loss_clip": 0.01402922, "auxiliary_loss_mlp": 0.01208927, "balance_loss_clip": 1.01635325, "balance_loss_mlp": 1.00274253, "epoch": 0.10160524259003187, "flos": 21104999254080.0, "grad_norm": 2.49868885210494, "language_loss": 0.86268789, "learning_rate": 3.9465346134793905e-06, "loss": 0.88880646, "num_input_tokens_seen": 18017955, "step": 845, "time_per_iteration": 2.9550633430480957 }, { "auxiliary_loss_clip": 0.01340618, "auxiliary_loss_mlp": 0.012094, "balance_loss_clip": 1.01416922, "balance_loss_mlp": 1.00264311, "epoch": 0.10172548548067095, "flos": 17712697271040.0, "grad_norm": 2.033922039030081, "language_loss": 0.79436636, "learning_rate": 3.9463555552827335e-06, "loss": 0.81986654, "num_input_tokens_seen": 18035125, "step": 846, "time_per_iteration": 2.7987399101257324 }, { "auxiliary_loss_clip": 0.01402226, "auxiliary_loss_mlp": 0.0120874, "balance_loss_clip": 1.01576543, "balance_loss_mlp": 1.00274682, "epoch": 0.10184572837131005, "flos": 21104855559360.0, "grad_norm": 2.4893063455834565, "language_loss": 0.86508495, "learning_rate": 3.946176201825487e-06, "loss": 0.89119458, "num_input_tokens_seen": 18053160, "step": 847, "time_per_iteration": 2.7405874729156494 }, { "auxiliary_loss_clip": 0.01365768, "auxiliary_loss_mlp": 0.01209037, "balance_loss_clip": 1.01541233, "balance_loss_mlp": 1.00247109, "epoch": 0.10196597126194913, "flos": 26067603924000.0, "grad_norm": 1.9579595363858893, "language_loss": 0.83253586, "learning_rate": 3.9459965531348575e-06, "loss": 0.85828394, "num_input_tokens_seen": 18072815, "step": 848, "time_per_iteration": 2.7994441986083984 }, { "auxiliary_loss_clip": 0.01365793, "auxiliary_loss_mlp": 0.00874208, "balance_loss_clip": 1.01487088, "balance_loss_mlp": 1.00032818, "epoch": 0.10208621415258823, "flos": 29314647457920.0, "grad_norm": 1.9988693981281491, "language_loss": 0.85042882, "learning_rate": 3.945816609238098e-06, "loss": 0.87282878, "num_input_tokens_seen": 18092225, "step": 849, "time_per_iteration": 2.8217499256134033 }, { "auxiliary_loss_clip": 0.01335062, "auxiliary_loss_mlp": 0.01211031, "balance_loss_clip": 1.01458991, "balance_loss_mlp": 1.00370181, "epoch": 0.10220645704322733, "flos": 23805805828320.0, "grad_norm": 3.6091521573499965, "language_loss": 0.85237974, "learning_rate": 3.945636370162507e-06, "loss": 0.87784064, "num_input_tokens_seen": 18112335, "step": 850, "time_per_iteration": 2.877453327178955 }, { "auxiliary_loss_clip": 0.01390553, "auxiliary_loss_mlp": 0.01208017, "balance_loss_clip": 1.0150125, "balance_loss_mlp": 1.00240493, "epoch": 0.10232669993386641, "flos": 23218557235200.0, "grad_norm": 1.853164177894148, "language_loss": 0.78910518, "learning_rate": 3.945455835935425e-06, "loss": 0.81509089, "num_input_tokens_seen": 18131520, "step": 851, "time_per_iteration": 2.7387373447418213 }, { "auxiliary_loss_clip": 0.01376723, "auxiliary_loss_mlp": 0.01208881, "balance_loss_clip": 1.01550174, "balance_loss_mlp": 1.00212455, "epoch": 0.1024469428245055, "flos": 22922939174400.0, "grad_norm": 2.4712247983861366, "language_loss": 0.75032783, "learning_rate": 3.94527500658424e-06, "loss": 0.77618384, "num_input_tokens_seen": 18149185, "step": 852, "time_per_iteration": 2.780174493789673 }, { "auxiliary_loss_clip": 0.01329135, "auxiliary_loss_mlp": 0.01209113, "balance_loss_clip": 1.01444411, "balance_loss_mlp": 1.00292861, "epoch": 0.10256718571514459, "flos": 31359509857440.0, "grad_norm": 1.805049136973282, "language_loss": 0.81238014, "learning_rate": 3.945093882136382e-06, "loss": 0.83776259, "num_input_tokens_seen": 18172960, "step": 853, "time_per_iteration": 3.980661153793335 }, { "auxiliary_loss_clip": 0.01369413, "auxiliary_loss_mlp": 0.0087418, "balance_loss_clip": 1.0143348, "balance_loss_mlp": 1.00033474, "epoch": 0.10268742860578368, "flos": 23474887999200.0, "grad_norm": 1.9740081043223847, "language_loss": 0.85081327, "learning_rate": 3.944912462619329e-06, "loss": 0.87324923, "num_input_tokens_seen": 18191925, "step": 854, "time_per_iteration": 4.5296385288238525 }, { "auxiliary_loss_clip": 0.01385591, "auxiliary_loss_mlp": 0.01211056, "balance_loss_clip": 1.01634169, "balance_loss_mlp": 1.00334525, "epoch": 0.10280767149642277, "flos": 25520325177600.0, "grad_norm": 1.9252462869570415, "language_loss": 0.80799973, "learning_rate": 3.9447307480606025e-06, "loss": 0.83396626, "num_input_tokens_seen": 18212010, "step": 855, "time_per_iteration": 3.8443825244903564 }, { "auxiliary_loss_clip": 0.01376305, "auxiliary_loss_mlp": 0.01208933, "balance_loss_clip": 1.01542306, "balance_loss_mlp": 1.00236702, "epoch": 0.10292791438706186, "flos": 17347701078720.0, "grad_norm": 2.9962353281356555, "language_loss": 0.89982361, "learning_rate": 3.944548738487767e-06, "loss": 0.92567593, "num_input_tokens_seen": 18229525, "step": 856, "time_per_iteration": 2.7745797634124756 }, { "auxiliary_loss_clip": 0.01414917, "auxiliary_loss_mlp": 0.01210494, "balance_loss_clip": 1.01614022, "balance_loss_mlp": 1.00297427, "epoch": 0.10304815727770096, "flos": 27052705667520.0, "grad_norm": 2.4934150132701, "language_loss": 0.90835392, "learning_rate": 3.944366433928434e-06, "loss": 0.93460798, "num_input_tokens_seen": 18249505, "step": 857, "time_per_iteration": 2.650834321975708 }, { "auxiliary_loss_clip": 0.0138816, "auxiliary_loss_mlp": 0.01208077, "balance_loss_clip": 1.01541638, "balance_loss_mlp": 1.00246477, "epoch": 0.10316840016834004, "flos": 22782602269440.0, "grad_norm": 1.6593318029383235, "language_loss": 0.8341428, "learning_rate": 3.9441838344102594e-06, "loss": 0.86010516, "num_input_tokens_seen": 18269230, "step": 858, "time_per_iteration": 2.8579189777374268 }, { "auxiliary_loss_clip": 0.0138109, "auxiliary_loss_mlp": 0.01211087, "balance_loss_clip": 1.01662314, "balance_loss_mlp": 1.00375807, "epoch": 0.10328864305897914, "flos": 20704595522400.0, "grad_norm": 3.092617053292826, "language_loss": 0.67429662, "learning_rate": 3.944000939960943e-06, "loss": 0.70021832, "num_input_tokens_seen": 18287955, "step": 859, "time_per_iteration": 2.7364039421081543 }, { "auxiliary_loss_clip": 0.01402294, "auxiliary_loss_mlp": 0.01208723, "balance_loss_clip": 1.01611066, "balance_loss_mlp": 1.00253844, "epoch": 0.10340888594961822, "flos": 28478827615680.0, "grad_norm": 1.5448667235364548, "language_loss": 0.80028939, "learning_rate": 3.943817750608229e-06, "loss": 0.82639956, "num_input_tokens_seen": 18310505, "step": 860, "time_per_iteration": 2.798354387283325 }, { "auxiliary_loss_clip": 0.01390985, "auxiliary_loss_mlp": 0.0120888, "balance_loss_clip": 1.01521933, "balance_loss_mlp": 1.00231457, "epoch": 0.10352912884025732, "flos": 13370341847040.0, "grad_norm": 2.3421923103297377, "language_loss": 0.81754458, "learning_rate": 3.943634266379908e-06, "loss": 0.84354323, "num_input_tokens_seen": 18327400, "step": 861, "time_per_iteration": 2.6926755905151367 }, { "auxiliary_loss_clip": 0.01400369, "auxiliary_loss_mlp": 0.01209201, "balance_loss_clip": 1.01554775, "balance_loss_mlp": 1.00301707, "epoch": 0.10364937173089642, "flos": 25558570687680.0, "grad_norm": 2.4269164269604184, "language_loss": 0.84951723, "learning_rate": 3.943450487303815e-06, "loss": 0.87561297, "num_input_tokens_seen": 18347895, "step": 862, "time_per_iteration": 2.7477715015411377 }, { "auxiliary_loss_clip": 0.01387868, "auxiliary_loss_mlp": 0.01208646, "balance_loss_clip": 1.01583552, "balance_loss_mlp": 1.00284338, "epoch": 0.1037696146215355, "flos": 21215496790080.0, "grad_norm": 1.673841973868734, "language_loss": 0.85210419, "learning_rate": 3.943266413407827e-06, "loss": 0.87806928, "num_input_tokens_seen": 18367170, "step": 863, "time_per_iteration": 2.754957914352417 }, { "auxiliary_loss_clip": 0.01396876, "auxiliary_loss_mlp": 0.01208389, "balance_loss_clip": 1.0158298, "balance_loss_mlp": 1.00182295, "epoch": 0.1038898575121746, "flos": 25807393402560.0, "grad_norm": 2.4408224911423835, "language_loss": 0.84802938, "learning_rate": 3.94308204471987e-06, "loss": 0.87408203, "num_input_tokens_seen": 18386185, "step": 864, "time_per_iteration": 2.7400972843170166 }, { "auxiliary_loss_clip": 0.01364742, "auxiliary_loss_mlp": 0.01207906, "balance_loss_clip": 1.01593542, "balance_loss_mlp": 1.00248504, "epoch": 0.10401010040281368, "flos": 19062435970080.0, "grad_norm": 2.240109476606388, "language_loss": 0.74633682, "learning_rate": 3.942897381267912e-06, "loss": 0.77206326, "num_input_tokens_seen": 18402550, "step": 865, "time_per_iteration": 2.773190498352051 }, { "auxiliary_loss_clip": 0.01389848, "auxiliary_loss_mlp": 0.01209738, "balance_loss_clip": 1.01518941, "balance_loss_mlp": 1.00298119, "epoch": 0.10413034329345278, "flos": 16355127209760.0, "grad_norm": 2.3134929557441453, "language_loss": 0.66059172, "learning_rate": 3.942712423079965e-06, "loss": 0.68658757, "num_input_tokens_seen": 18418940, "step": 866, "time_per_iteration": 2.66862154006958 }, { "auxiliary_loss_clip": 0.01375902, "auxiliary_loss_mlp": 0.01209729, "balance_loss_clip": 1.01479983, "balance_loss_mlp": 1.00297284, "epoch": 0.10425058618409186, "flos": 17236520992800.0, "grad_norm": 2.2036951682116115, "language_loss": 0.89992034, "learning_rate": 3.942527170184088e-06, "loss": 0.9257766, "num_input_tokens_seen": 18435560, "step": 867, "time_per_iteration": 2.7942984104156494 }, { "auxiliary_loss_clip": 0.01414967, "auxiliary_loss_mlp": 0.01209838, "balance_loss_clip": 1.01656985, "balance_loss_mlp": 1.00346279, "epoch": 0.10437082907473096, "flos": 17967375545760.0, "grad_norm": 2.613896233769852, "language_loss": 0.77441818, "learning_rate": 3.942341622608385e-06, "loss": 0.80066627, "num_input_tokens_seen": 18452590, "step": 868, "time_per_iteration": 2.6281185150146484 }, { "auxiliary_loss_clip": 0.01367182, "auxiliary_loss_mlp": 0.01208957, "balance_loss_clip": 1.01585913, "balance_loss_mlp": 1.00277233, "epoch": 0.10449107196537005, "flos": 36283330162080.0, "grad_norm": 1.704440453956905, "language_loss": 0.77872491, "learning_rate": 3.942155780381001e-06, "loss": 0.80448633, "num_input_tokens_seen": 18476325, "step": 869, "time_per_iteration": 2.8843986988067627 }, { "auxiliary_loss_clip": 0.01375494, "auxiliary_loss_mlp": 0.01210113, "balance_loss_clip": 1.01485562, "balance_loss_mlp": 1.00316608, "epoch": 0.10461131485600914, "flos": 23802105689280.0, "grad_norm": 2.3207552745639624, "language_loss": 0.75884581, "learning_rate": 3.94196964353013e-06, "loss": 0.78470188, "num_input_tokens_seen": 18495775, "step": 870, "time_per_iteration": 2.795814037322998 }, { "auxiliary_loss_clip": 0.01389356, "auxiliary_loss_mlp": 0.00874082, "balance_loss_clip": 1.01546264, "balance_loss_mlp": 1.00029182, "epoch": 0.10473155774664823, "flos": 18405485932320.0, "grad_norm": 2.5765647889902095, "language_loss": 0.80641055, "learning_rate": 3.941783212084008e-06, "loss": 0.82904494, "num_input_tokens_seen": 18513530, "step": 871, "time_per_iteration": 2.742699384689331 }, { "auxiliary_loss_clip": 0.013757, "auxiliary_loss_mlp": 0.01208607, "balance_loss_clip": 1.01519418, "balance_loss_mlp": 1.00280428, "epoch": 0.10485180063728732, "flos": 25592649050880.0, "grad_norm": 2.730598951858785, "language_loss": 0.78710997, "learning_rate": 3.941596486070916e-06, "loss": 0.812953, "num_input_tokens_seen": 18531575, "step": 872, "time_per_iteration": 2.7813382148742676 }, { "auxiliary_loss_clip": 0.01312044, "auxiliary_loss_mlp": 0.01209213, "balance_loss_clip": 1.01308084, "balance_loss_mlp": 1.00283813, "epoch": 0.10497204352792641, "flos": 27088759833120.0, "grad_norm": 2.275237082791633, "language_loss": 0.58951008, "learning_rate": 3.941409465519182e-06, "loss": 0.61472267, "num_input_tokens_seen": 18552100, "step": 873, "time_per_iteration": 2.87137508392334 }, { "auxiliary_loss_clip": 0.0140133, "auxiliary_loss_mlp": 0.01207631, "balance_loss_clip": 1.01568198, "balance_loss_mlp": 1.00239992, "epoch": 0.10509228641856551, "flos": 32858494534080.0, "grad_norm": 1.7064120936363139, "language_loss": 0.85182953, "learning_rate": 3.941222150457176e-06, "loss": 0.8779192, "num_input_tokens_seen": 18575355, "step": 874, "time_per_iteration": 2.920267105102539 }, { "auxiliary_loss_clip": 0.01400074, "auxiliary_loss_mlp": 0.01210289, "balance_loss_clip": 1.01508832, "balance_loss_mlp": 1.00353217, "epoch": 0.10521252930920459, "flos": 14319173882880.0, "grad_norm": 2.4442082678369297, "language_loss": 0.71335644, "learning_rate": 3.941034540913311e-06, "loss": 0.73945999, "num_input_tokens_seen": 18592885, "step": 875, "time_per_iteration": 2.648155689239502 }, { "auxiliary_loss_clip": 0.01388125, "auxiliary_loss_mlp": 0.00874123, "balance_loss_clip": 1.01515079, "balance_loss_mlp": 1.00026882, "epoch": 0.10533277219984369, "flos": 21687038913600.0, "grad_norm": 2.0123111962871856, "language_loss": 0.82170999, "learning_rate": 3.940846636916051e-06, "loss": 0.84433246, "num_input_tokens_seen": 18612920, "step": 876, "time_per_iteration": 2.7244672775268555 }, { "auxiliary_loss_clip": 0.0136271, "auxiliary_loss_mlp": 0.01207345, "balance_loss_clip": 1.01565182, "balance_loss_mlp": 1.0021143, "epoch": 0.10545301509048277, "flos": 22269797046720.0, "grad_norm": 1.9382195333212469, "language_loss": 0.86446047, "learning_rate": 3.940658438493899e-06, "loss": 0.89016104, "num_input_tokens_seen": 18630765, "step": 877, "time_per_iteration": 2.7337565422058105 }, { "auxiliary_loss_clip": 0.01412789, "auxiliary_loss_mlp": 0.01209284, "balance_loss_clip": 1.01519132, "balance_loss_mlp": 1.00329041, "epoch": 0.10557325798112187, "flos": 22199736365280.0, "grad_norm": 2.289469076489344, "language_loss": 0.75674033, "learning_rate": 3.940469945675405e-06, "loss": 0.78296107, "num_input_tokens_seen": 18649150, "step": 878, "time_per_iteration": 2.668224573135376 }, { "auxiliary_loss_clip": 0.0133608, "auxiliary_loss_mlp": 0.01206395, "balance_loss_clip": 1.01301956, "balance_loss_mlp": 1.00211811, "epoch": 0.10569350087176095, "flos": 25775901544320.0, "grad_norm": 1.9615115758272794, "language_loss": 0.91515136, "learning_rate": 3.940281158489163e-06, "loss": 0.94057608, "num_input_tokens_seen": 18668380, "step": 879, "time_per_iteration": 3.916621685028076 }, { "auxiliary_loss_clip": 0.01323969, "auxiliary_loss_mlp": 0.01208062, "balance_loss_clip": 1.01431704, "balance_loss_mlp": 1.0030216, "epoch": 0.10581374376240005, "flos": 17311395447360.0, "grad_norm": 2.0026590567881204, "language_loss": 0.82853878, "learning_rate": 3.940092076963812e-06, "loss": 0.85385907, "num_input_tokens_seen": 18685875, "step": 880, "time_per_iteration": 3.8828322887420654 }, { "auxiliary_loss_clip": 0.01386218, "auxiliary_loss_mlp": 0.01207654, "balance_loss_clip": 1.01468992, "balance_loss_mlp": 1.00242293, "epoch": 0.10593398665303914, "flos": 34349468230080.0, "grad_norm": 2.0972754607690285, "language_loss": 0.7883321, "learning_rate": 3.9399027011280355e-06, "loss": 0.81427079, "num_input_tokens_seen": 18707970, "step": 881, "time_per_iteration": 4.006841659545898 }, { "auxiliary_loss_clip": 0.01361628, "auxiliary_loss_mlp": 0.01208918, "balance_loss_clip": 1.01459241, "balance_loss_mlp": 1.00273323, "epoch": 0.10605422954367823, "flos": 23257988226720.0, "grad_norm": 2.3415094818493025, "language_loss": 0.76960802, "learning_rate": 3.939713031010561e-06, "loss": 0.79531348, "num_input_tokens_seen": 18726335, "step": 882, "time_per_iteration": 2.7874958515167236 }, { "auxiliary_loss_clip": 0.01339737, "auxiliary_loss_mlp": 0.01207873, "balance_loss_clip": 1.01463497, "balance_loss_mlp": 1.00226068, "epoch": 0.10617447243431732, "flos": 22820129305920.0, "grad_norm": 2.0275457339569045, "language_loss": 0.77636272, "learning_rate": 3.939523066640163e-06, "loss": 0.80183876, "num_input_tokens_seen": 18745230, "step": 883, "time_per_iteration": 2.799908399581909 }, { "auxiliary_loss_clip": 0.01400875, "auxiliary_loss_mlp": 0.01207646, "balance_loss_clip": 1.01550078, "balance_loss_mlp": 1.00260592, "epoch": 0.10629471532495641, "flos": 24386588159040.0, "grad_norm": 2.203661779954765, "language_loss": 0.80852914, "learning_rate": 3.939332808045657e-06, "loss": 0.8346144, "num_input_tokens_seen": 18764880, "step": 884, "time_per_iteration": 2.8554821014404297 }, { "auxiliary_loss_clip": 0.0134095, "auxiliary_loss_mlp": 0.01207658, "balance_loss_clip": 1.0128895, "balance_loss_mlp": 1.00242686, "epoch": 0.1064149582155955, "flos": 21105502185600.0, "grad_norm": 1.720042399491381, "language_loss": 0.84662455, "learning_rate": 3.939142255255906e-06, "loss": 0.87211066, "num_input_tokens_seen": 18785765, "step": 885, "time_per_iteration": 2.835474729537964 }, { "auxiliary_loss_clip": 0.01388604, "auxiliary_loss_mlp": 0.01207384, "balance_loss_clip": 1.01516926, "balance_loss_mlp": 1.00234377, "epoch": 0.1065352011062346, "flos": 20702044941120.0, "grad_norm": 1.9331605029114636, "language_loss": 0.87099814, "learning_rate": 3.938951408299817e-06, "loss": 0.89695799, "num_input_tokens_seen": 18804605, "step": 886, "time_per_iteration": 2.711636543273926 }, { "auxiliary_loss_clip": 0.01310695, "auxiliary_loss_mlp": 0.01203224, "balance_loss_clip": 1.01211083, "balance_loss_mlp": 1.00028253, "epoch": 0.10665544399687368, "flos": 62659664290080.0, "grad_norm": 0.8011932338473903, "language_loss": 0.54445291, "learning_rate": 3.938760267206342e-06, "loss": 0.56959212, "num_input_tokens_seen": 18866425, "step": 887, "time_per_iteration": 3.444770574569702 }, { "auxiliary_loss_clip": 0.01412951, "auxiliary_loss_mlp": 0.01208259, "balance_loss_clip": 1.01619983, "balance_loss_mlp": 1.00264668, "epoch": 0.10677568688751278, "flos": 26140394805120.0, "grad_norm": 2.2410356262164908, "language_loss": 0.78955173, "learning_rate": 3.938568832004475e-06, "loss": 0.81576377, "num_input_tokens_seen": 18885130, "step": 888, "time_per_iteration": 2.8662686347961426 }, { "auxiliary_loss_clip": 0.01374701, "auxiliary_loss_mlp": 0.01207035, "balance_loss_clip": 1.01490951, "balance_loss_mlp": 1.00237632, "epoch": 0.10689592977815186, "flos": 12786541927200.0, "grad_norm": 1.9377469657879445, "language_loss": 0.75224102, "learning_rate": 3.938377102723257e-06, "loss": 0.77805841, "num_input_tokens_seen": 18902265, "step": 889, "time_per_iteration": 2.8024747371673584 }, { "auxiliary_loss_clip": 0.01339507, "auxiliary_loss_mlp": 0.01208107, "balance_loss_clip": 1.01438177, "balance_loss_mlp": 1.00249493, "epoch": 0.10701617266879096, "flos": 22126694018400.0, "grad_norm": 2.652702435867121, "language_loss": 0.83362621, "learning_rate": 3.938185079391774e-06, "loss": 0.85910231, "num_input_tokens_seen": 18919310, "step": 890, "time_per_iteration": 2.8980140686035156 }, { "auxiliary_loss_clip": 0.01412066, "auxiliary_loss_mlp": 0.01208014, "balance_loss_clip": 1.01534081, "balance_loss_mlp": 1.00278354, "epoch": 0.10713641555943004, "flos": 19745633008800.0, "grad_norm": 3.4781065708827925, "language_loss": 1.06353843, "learning_rate": 3.937992762039157e-06, "loss": 1.08973932, "num_input_tokens_seen": 18932635, "step": 891, "time_per_iteration": 2.737046957015991 }, { "auxiliary_loss_clip": 0.01385986, "auxiliary_loss_mlp": 0.01207885, "balance_loss_clip": 1.01498151, "balance_loss_mlp": 1.00284541, "epoch": 0.10725665845006914, "flos": 23953004156160.0, "grad_norm": 2.882963396645172, "language_loss": 0.80389506, "learning_rate": 3.937800150694577e-06, "loss": 0.82983375, "num_input_tokens_seen": 18953810, "step": 892, "time_per_iteration": 2.8074562549591064 }, { "auxiliary_loss_clip": 0.01320217, "auxiliary_loss_mlp": 0.01207959, "balance_loss_clip": 1.01317358, "balance_loss_mlp": 1.00253701, "epoch": 0.10737690134070824, "flos": 18551714320800.0, "grad_norm": 3.0373187289199146, "language_loss": 0.75870252, "learning_rate": 3.937607245387255e-06, "loss": 0.7839843, "num_input_tokens_seen": 18973175, "step": 893, "time_per_iteration": 2.8165104389190674 }, { "auxiliary_loss_clip": 0.0138502, "auxiliary_loss_mlp": 0.01206656, "balance_loss_clip": 1.01484156, "balance_loss_mlp": 1.00199795, "epoch": 0.10749714423134732, "flos": 22707620043840.0, "grad_norm": 2.011936387546864, "language_loss": 0.72455013, "learning_rate": 3.937414046146455e-06, "loss": 0.75046694, "num_input_tokens_seen": 18991130, "step": 894, "time_per_iteration": 2.808988094329834 }, { "auxiliary_loss_clip": 0.01412545, "auxiliary_loss_mlp": 0.01206506, "balance_loss_clip": 1.01623368, "balance_loss_mlp": 1.00184774, "epoch": 0.10761738712198642, "flos": 21106076964480.0, "grad_norm": 2.100846275183303, "language_loss": 0.75085831, "learning_rate": 3.9372205530014845e-06, "loss": 0.77704877, "num_input_tokens_seen": 19009610, "step": 895, "time_per_iteration": 2.717827558517456 }, { "auxiliary_loss_clip": 0.01412771, "auxiliary_loss_mlp": 0.01207929, "balance_loss_clip": 1.01572347, "balance_loss_mlp": 1.00269818, "epoch": 0.1077376300126255, "flos": 23766734073600.0, "grad_norm": 4.5362145225941255, "language_loss": 0.71329272, "learning_rate": 3.937026765981696e-06, "loss": 0.73949969, "num_input_tokens_seen": 19029680, "step": 896, "time_per_iteration": 2.7861127853393555 }, { "auxiliary_loss_clip": 0.01345455, "auxiliary_loss_mlp": 0.01207665, "balance_loss_clip": 1.01395774, "balance_loss_mlp": 1.00224352, "epoch": 0.1078578729032646, "flos": 20919591339840.0, "grad_norm": 1.8467250883673947, "language_loss": 0.79435915, "learning_rate": 3.936832685116488e-06, "loss": 0.81989038, "num_input_tokens_seen": 19047775, "step": 897, "time_per_iteration": 2.7495322227478027 }, { "auxiliary_loss_clip": 0.01412474, "auxiliary_loss_mlp": 0.01207715, "balance_loss_clip": 1.01586246, "balance_loss_mlp": 1.00286603, "epoch": 0.10797811579390369, "flos": 14829895532160.0, "grad_norm": 3.631643205329271, "language_loss": 0.89900112, "learning_rate": 3.936638310435301e-06, "loss": 0.92520308, "num_input_tokens_seen": 19065640, "step": 898, "time_per_iteration": 2.733896017074585 }, { "auxiliary_loss_clip": 0.01394321, "auxiliary_loss_mlp": 0.01207657, "balance_loss_clip": 1.01533091, "balance_loss_mlp": 1.002617, "epoch": 0.10809835868454278, "flos": 19536995682720.0, "grad_norm": 2.0470867647561795, "language_loss": 0.81186318, "learning_rate": 3.936443641967623e-06, "loss": 0.837883, "num_input_tokens_seen": 19084470, "step": 899, "time_per_iteration": 2.7273566722869873 }, { "auxiliary_loss_clip": 0.01369415, "auxiliary_loss_mlp": 0.01206559, "balance_loss_clip": 1.01409018, "balance_loss_mlp": 1.0019002, "epoch": 0.10821860157518187, "flos": 18442330418880.0, "grad_norm": 2.0777228621630224, "language_loss": 0.8307361, "learning_rate": 3.936248679742983e-06, "loss": 0.85649586, "num_input_tokens_seen": 19102965, "step": 900, "time_per_iteration": 2.7446465492248535 }, { "auxiliary_loss_clip": 0.01335772, "auxiliary_loss_mlp": 0.0120421, "balance_loss_clip": 1.01285625, "balance_loss_mlp": 1.00126767, "epoch": 0.10833884446582095, "flos": 49359490895520.0, "grad_norm": 1.052332343911505, "language_loss": 0.70246029, "learning_rate": 3.936053423790959e-06, "loss": 0.72786009, "num_input_tokens_seen": 19151285, "step": 901, "time_per_iteration": 3.119508743286133 }, { "auxiliary_loss_clip": 0.01411328, "auxiliary_loss_mlp": 0.01207287, "balance_loss_clip": 1.01523936, "balance_loss_mlp": 1.00224686, "epoch": 0.10845908735646005, "flos": 20411923203360.0, "grad_norm": 2.0039230176079514, "language_loss": 0.77176696, "learning_rate": 3.935857874141168e-06, "loss": 0.79795313, "num_input_tokens_seen": 19170120, "step": 902, "time_per_iteration": 2.632932662963867 }, { "auxiliary_loss_clip": 0.01357989, "auxiliary_loss_mlp": 0.01206878, "balance_loss_clip": 1.01373768, "balance_loss_mlp": 1.00241017, "epoch": 0.10857933024709913, "flos": 14027758892640.0, "grad_norm": 2.385241623731553, "language_loss": 0.83834046, "learning_rate": 3.935662030823279e-06, "loss": 0.86398911, "num_input_tokens_seen": 19186305, "step": 903, "time_per_iteration": 2.7110493183135986 }, { "auxiliary_loss_clip": 0.0139383, "auxiliary_loss_mlp": 0.01206727, "balance_loss_clip": 1.0149436, "balance_loss_mlp": 1.0018779, "epoch": 0.10869957313773823, "flos": 13369012670880.0, "grad_norm": 5.326807128568777, "language_loss": 0.7250818, "learning_rate": 3.935465893866998e-06, "loss": 0.75108743, "num_input_tokens_seen": 19204530, "step": 904, "time_per_iteration": 2.6921887397766113 }, { "auxiliary_loss_clip": 0.0136205, "auxiliary_loss_mlp": 0.0120615, "balance_loss_clip": 1.01460218, "balance_loss_mlp": 1.00187266, "epoch": 0.10881981602837733, "flos": 25807106013120.0, "grad_norm": 3.215034886316314, "language_loss": 0.80215251, "learning_rate": 3.935269463302079e-06, "loss": 0.82783455, "num_input_tokens_seen": 19222735, "step": 905, "time_per_iteration": 2.7862389087677 }, { "auxiliary_loss_clip": 0.01394889, "auxiliary_loss_mlp": 0.01207809, "balance_loss_clip": 1.01556456, "balance_loss_mlp": 1.00276911, "epoch": 0.10894005891901641, "flos": 20777566021920.0, "grad_norm": 2.1694088708627897, "language_loss": 0.76728249, "learning_rate": 3.935072739158322e-06, "loss": 0.79330945, "num_input_tokens_seen": 19242445, "step": 906, "time_per_iteration": 4.618542909622192 }, { "auxiliary_loss_clip": 0.0137278, "auxiliary_loss_mlp": 0.01206745, "balance_loss_clip": 1.01476169, "balance_loss_mlp": 1.00227702, "epoch": 0.10906030180965551, "flos": 26649895049280.0, "grad_norm": 1.7050229048971364, "language_loss": 0.79713225, "learning_rate": 3.934875721465569e-06, "loss": 0.82292753, "num_input_tokens_seen": 19262865, "step": 907, "time_per_iteration": 4.595600843429565 }, { "auxiliary_loss_clip": 0.01372279, "auxiliary_loss_mlp": 0.01207666, "balance_loss_clip": 1.01400781, "balance_loss_mlp": 1.00224471, "epoch": 0.10918054470029459, "flos": 36534416068800.0, "grad_norm": 4.067106392346537, "language_loss": 0.71415299, "learning_rate": 3.9346784102537076e-06, "loss": 0.73995245, "num_input_tokens_seen": 19285000, "step": 908, "time_per_iteration": 2.8539962768554688 }, { "auxiliary_loss_clip": 0.01411545, "auxiliary_loss_mlp": 0.01207963, "balance_loss_clip": 1.01529014, "balance_loss_mlp": 1.00311351, "epoch": 0.10930078759093369, "flos": 21762559994400.0, "grad_norm": 1.9265245222398881, "language_loss": 0.78353524, "learning_rate": 3.934480805552669e-06, "loss": 0.80973029, "num_input_tokens_seen": 19306010, "step": 909, "time_per_iteration": 2.739118814468384 }, { "auxiliary_loss_clip": 0.01411154, "auxiliary_loss_mlp": 0.00874112, "balance_loss_clip": 1.01557493, "balance_loss_mlp": 1.00043797, "epoch": 0.10942103048157277, "flos": 22601792586240.0, "grad_norm": 1.9902629835068697, "language_loss": 0.8808583, "learning_rate": 3.93428290739243e-06, "loss": 0.90371096, "num_input_tokens_seen": 19325380, "step": 910, "time_per_iteration": 2.746933937072754 }, { "auxiliary_loss_clip": 0.01370415, "auxiliary_loss_mlp": 0.01208632, "balance_loss_clip": 1.01423573, "balance_loss_mlp": 1.00359178, "epoch": 0.10954127337221187, "flos": 15045789441600.0, "grad_norm": 2.4866010456617276, "language_loss": 0.79471421, "learning_rate": 3.9340847158030125e-06, "loss": 0.82050467, "num_input_tokens_seen": 19338960, "step": 911, "time_per_iteration": 2.755124568939209 }, { "auxiliary_loss_clip": 0.01396051, "auxiliary_loss_mlp": 0.01206863, "balance_loss_clip": 1.0150528, "balance_loss_mlp": 1.00258565, "epoch": 0.10966151626285096, "flos": 21650984748000.0, "grad_norm": 1.9373654127066438, "language_loss": 0.75481242, "learning_rate": 3.9338862308144814e-06, "loss": 0.78084159, "num_input_tokens_seen": 19357780, "step": 912, "time_per_iteration": 2.7463648319244385 }, { "auxiliary_loss_clip": 0.01410719, "auxiliary_loss_mlp": 0.01205263, "balance_loss_clip": 1.01502192, "balance_loss_mlp": 1.00155842, "epoch": 0.10978175915349005, "flos": 20121370381440.0, "grad_norm": 1.6606046926274594, "language_loss": 0.84320867, "learning_rate": 3.933687452456946e-06, "loss": 0.86936849, "num_input_tokens_seen": 19377680, "step": 913, "time_per_iteration": 2.695868730545044 }, { "auxiliary_loss_clip": 0.01344664, "auxiliary_loss_mlp": 0.01207826, "balance_loss_clip": 1.01347995, "balance_loss_mlp": 1.0027858, "epoch": 0.10990200204412914, "flos": 20412677600640.0, "grad_norm": 2.144072059258737, "language_loss": 0.86073965, "learning_rate": 3.933488380760562e-06, "loss": 0.88626456, "num_input_tokens_seen": 19397040, "step": 914, "time_per_iteration": 2.785557508468628 }, { "auxiliary_loss_clip": 0.01410833, "auxiliary_loss_mlp": 0.00874089, "balance_loss_clip": 1.01519942, "balance_loss_mlp": 1.00042939, "epoch": 0.11002224493476823, "flos": 17530127327520.0, "grad_norm": 2.124362209391365, "language_loss": 0.87176287, "learning_rate": 3.9332890157555286e-06, "loss": 0.89461207, "num_input_tokens_seen": 19413975, "step": 915, "time_per_iteration": 2.6645212173461914 }, { "auxiliary_loss_clip": 0.01372455, "auxiliary_loss_mlp": 0.01207588, "balance_loss_clip": 1.01485634, "balance_loss_mlp": 1.00273919, "epoch": 0.11014248782540732, "flos": 12203101244160.0, "grad_norm": 2.008174625589888, "language_loss": 0.7619164, "learning_rate": 3.933089357472088e-06, "loss": 0.78771687, "num_input_tokens_seen": 19432005, "step": 916, "time_per_iteration": 2.765516757965088 }, { "auxiliary_loss_clip": 0.01410845, "auxiliary_loss_mlp": 0.01208355, "balance_loss_clip": 1.01538074, "balance_loss_mlp": 1.00369668, "epoch": 0.11026273071604642, "flos": 22382988858720.0, "grad_norm": 2.0449690211220677, "language_loss": 0.85937166, "learning_rate": 3.932889405940529e-06, "loss": 0.88556373, "num_input_tokens_seen": 19450100, "step": 917, "time_per_iteration": 2.706446647644043 }, { "auxiliary_loss_clip": 0.01363079, "auxiliary_loss_mlp": 0.01206755, "balance_loss_clip": 1.01475036, "balance_loss_mlp": 1.00228691, "epoch": 0.1103829736066855, "flos": 19829057299200.0, "grad_norm": 2.4192919828131805, "language_loss": 0.80128759, "learning_rate": 3.932689161191184e-06, "loss": 0.82698596, "num_input_tokens_seen": 19467805, "step": 918, "time_per_iteration": 2.7444069385528564 }, { "auxiliary_loss_clip": 0.01388047, "auxiliary_loss_mlp": 0.01207083, "balance_loss_clip": 1.0145843, "balance_loss_mlp": 1.002424, "epoch": 0.1105032164973246, "flos": 22669625999520.0, "grad_norm": 2.1770355650070883, "language_loss": 0.88176781, "learning_rate": 3.93248862325443e-06, "loss": 0.90771914, "num_input_tokens_seen": 19486710, "step": 919, "time_per_iteration": 2.7201621532440186 }, { "auxiliary_loss_clip": 0.01386745, "auxiliary_loss_mlp": 0.0120383, "balance_loss_clip": 1.01431012, "balance_loss_mlp": 1.00088823, "epoch": 0.11062345938796368, "flos": 66483538549920.0, "grad_norm": 0.9404111866020836, "language_loss": 0.64379722, "learning_rate": 3.932287792160688e-06, "loss": 0.66970295, "num_input_tokens_seen": 19545170, "step": 920, "time_per_iteration": 3.214261770248413 }, { "auxiliary_loss_clip": 0.01398433, "auxiliary_loss_mlp": 0.01206856, "balance_loss_clip": 1.01500034, "balance_loss_mlp": 1.00219715, "epoch": 0.11074370227860278, "flos": 21907782519840.0, "grad_norm": 9.972518259616416, "language_loss": 0.80522466, "learning_rate": 3.932086667940424e-06, "loss": 0.83127749, "num_input_tokens_seen": 19561875, "step": 921, "time_per_iteration": 2.7373104095458984 }, { "auxiliary_loss_clip": 0.01385651, "auxiliary_loss_mlp": 0.00874091, "balance_loss_clip": 1.01435137, "balance_loss_mlp": 1.00046766, "epoch": 0.11086394516924186, "flos": 28658128504320.0, "grad_norm": 2.018452462911281, "language_loss": 0.81834894, "learning_rate": 3.93188525062415e-06, "loss": 0.84094638, "num_input_tokens_seen": 19582340, "step": 922, "time_per_iteration": 2.8667116165161133 }, { "auxiliary_loss_clip": 0.01386892, "auxiliary_loss_mlp": 0.01208426, "balance_loss_clip": 1.01510811, "balance_loss_mlp": 1.00338614, "epoch": 0.11098418805988096, "flos": 24535259357760.0, "grad_norm": 2.249113093062477, "language_loss": 0.86240786, "learning_rate": 3.931683540242418e-06, "loss": 0.88836098, "num_input_tokens_seen": 19603405, "step": 923, "time_per_iteration": 2.851604461669922 }, { "auxiliary_loss_clip": 0.01397835, "auxiliary_loss_mlp": 0.01206456, "balance_loss_clip": 1.01514935, "balance_loss_mlp": 1.00256026, "epoch": 0.11110443095052006, "flos": 22960394363520.0, "grad_norm": 2.623674556426651, "language_loss": 0.91413814, "learning_rate": 3.9314815368258295e-06, "loss": 0.94018102, "num_input_tokens_seen": 19619885, "step": 924, "time_per_iteration": 2.8039908409118652 }, { "auxiliary_loss_clip": 0.01393181, "auxiliary_loss_mlp": 0.01206282, "balance_loss_clip": 1.01541471, "balance_loss_mlp": 1.00238657, "epoch": 0.11122467384115914, "flos": 18950034479040.0, "grad_norm": 1.8932529177524118, "language_loss": 0.79077172, "learning_rate": 3.9312792404050275e-06, "loss": 0.81676632, "num_input_tokens_seen": 19637940, "step": 925, "time_per_iteration": 2.81071138381958 }, { "auxiliary_loss_clip": 0.01410108, "auxiliary_loss_mlp": 0.01205681, "balance_loss_clip": 1.01534295, "balance_loss_mlp": 1.00197625, "epoch": 0.11134491673179824, "flos": 25082969188320.0, "grad_norm": 1.8848062621888098, "language_loss": 0.77425253, "learning_rate": 3.9310766510107e-06, "loss": 0.80041045, "num_input_tokens_seen": 19657115, "step": 926, "time_per_iteration": 2.6958975791931152 }, { "auxiliary_loss_clip": 0.01353957, "auxiliary_loss_mlp": 0.01207187, "balance_loss_clip": 1.01361895, "balance_loss_mlp": 1.00252891, "epoch": 0.11146515962243732, "flos": 24499133344800.0, "grad_norm": 1.9702724305037607, "language_loss": 0.92154825, "learning_rate": 3.9308737686735806e-06, "loss": 0.94715977, "num_input_tokens_seen": 19677075, "step": 927, "time_per_iteration": 3.0051450729370117 }, { "auxiliary_loss_clip": 0.01410679, "auxiliary_loss_mlp": 0.01206704, "balance_loss_clip": 1.01528382, "balance_loss_mlp": 1.00242686, "epoch": 0.11158540251307641, "flos": 22343773409280.0, "grad_norm": 2.020416405841549, "language_loss": 0.82743627, "learning_rate": 3.9306705934244455e-06, "loss": 0.85361004, "num_input_tokens_seen": 19697155, "step": 928, "time_per_iteration": 2.7137644290924072 }, { "auxiliary_loss_clip": 0.01373051, "auxiliary_loss_mlp": 0.01205792, "balance_loss_clip": 1.01479983, "balance_loss_mlp": 1.00285053, "epoch": 0.11170564540371551, "flos": 19902315188160.0, "grad_norm": 1.891246376847444, "language_loss": 0.88491762, "learning_rate": 3.930467125294116e-06, "loss": 0.91070604, "num_input_tokens_seen": 19716705, "step": 929, "time_per_iteration": 2.7674403190612793 }, { "auxiliary_loss_clip": 0.01300945, "auxiliary_loss_mlp": 0.01202783, "balance_loss_clip": 1.01330519, "balance_loss_mlp": 1.00136745, "epoch": 0.1118258882943546, "flos": 64586269638720.0, "grad_norm": 0.9310150975309438, "language_loss": 0.60485721, "learning_rate": 3.930263364313458e-06, "loss": 0.6298945, "num_input_tokens_seen": 19767275, "step": 930, "time_per_iteration": 3.2183375358581543 }, { "auxiliary_loss_clip": 0.01361198, "auxiliary_loss_mlp": 0.01206061, "balance_loss_clip": 1.0151453, "balance_loss_mlp": 1.00254655, "epoch": 0.11194613118499369, "flos": 17201975621760.0, "grad_norm": 2.1047097230228218, "language_loss": 0.8333903, "learning_rate": 3.930059310513384e-06, "loss": 0.85906291, "num_input_tokens_seen": 19786315, "step": 931, "time_per_iteration": 2.769747257232666 }, { "auxiliary_loss_clip": 0.01363534, "auxiliary_loss_mlp": 0.00874103, "balance_loss_clip": 1.01484013, "balance_loss_mlp": 1.00048542, "epoch": 0.11206637407563277, "flos": 31863477854880.0, "grad_norm": 1.8872202081652443, "language_loss": 0.84229791, "learning_rate": 3.929854963924846e-06, "loss": 0.86467427, "num_input_tokens_seen": 19806580, "step": 932, "time_per_iteration": 5.6645472049713135 }, { "auxiliary_loss_clip": 0.01360185, "auxiliary_loss_mlp": 0.01205052, "balance_loss_clip": 1.01513314, "balance_loss_mlp": 1.00211012, "epoch": 0.11218661696627187, "flos": 21945632869440.0, "grad_norm": 1.9109481586980324, "language_loss": 0.774342, "learning_rate": 3.929650324578845e-06, "loss": 0.79999435, "num_input_tokens_seen": 19826045, "step": 933, "time_per_iteration": 3.785114049911499 }, { "auxiliary_loss_clip": 0.01362644, "auxiliary_loss_mlp": 0.01206668, "balance_loss_clip": 1.01430464, "balance_loss_mlp": 1.00181818, "epoch": 0.11230685985691095, "flos": 25878208481280.0, "grad_norm": 9.58679362692527, "language_loss": 0.81840944, "learning_rate": 3.929445392506423e-06, "loss": 0.8441025, "num_input_tokens_seen": 19843985, "step": 934, "time_per_iteration": 2.8794357776641846 }, { "auxiliary_loss_clip": 0.01384083, "auxiliary_loss_mlp": 0.01205893, "balance_loss_clip": 1.01458108, "balance_loss_mlp": 1.00256968, "epoch": 0.11242710274755005, "flos": 22231515612960.0, "grad_norm": 3.851823858025841, "language_loss": 0.75733602, "learning_rate": 3.92924016773867e-06, "loss": 0.78323579, "num_input_tokens_seen": 19860480, "step": 935, "time_per_iteration": 2.766389846801758 }, { "auxiliary_loss_clip": 0.01371978, "auxiliary_loss_mlp": 0.00874008, "balance_loss_clip": 1.01403236, "balance_loss_mlp": 1.00044823, "epoch": 0.11254734563818915, "flos": 17712194339520.0, "grad_norm": 2.28963242063212, "language_loss": 0.73188806, "learning_rate": 3.9290346503067175e-06, "loss": 0.75434786, "num_input_tokens_seen": 19877145, "step": 936, "time_per_iteration": 2.8234312534332275 }, { "auxiliary_loss_clip": 0.01396402, "auxiliary_loss_mlp": 0.01205846, "balance_loss_clip": 1.01489615, "balance_loss_mlp": 1.00214136, "epoch": 0.11266758852882823, "flos": 54930418149600.0, "grad_norm": 1.684777988171217, "language_loss": 0.7904495, "learning_rate": 3.9288288402417415e-06, "loss": 0.81647205, "num_input_tokens_seen": 19903405, "step": 937, "time_per_iteration": 3.0990257263183594 }, { "auxiliary_loss_clip": 0.01385343, "auxiliary_loss_mlp": 0.01207158, "balance_loss_clip": 1.01479983, "balance_loss_mlp": 1.0024997, "epoch": 0.11278783141946733, "flos": 18878141689920.0, "grad_norm": 7.288293008236961, "language_loss": 0.70554066, "learning_rate": 3.928622737574964e-06, "loss": 0.7314657, "num_input_tokens_seen": 19918740, "step": 938, "time_per_iteration": 2.921698808670044 }, { "auxiliary_loss_clip": 0.013715, "auxiliary_loss_mlp": 0.01205855, "balance_loss_clip": 1.01461351, "balance_loss_mlp": 1.00215065, "epoch": 0.11290807431010641, "flos": 26469264984480.0, "grad_norm": 1.812635241368181, "language_loss": 0.90897512, "learning_rate": 3.928416342337652e-06, "loss": 0.93474865, "num_input_tokens_seen": 19938475, "step": 939, "time_per_iteration": 2.855752944946289 }, { "auxiliary_loss_clip": 0.01374121, "auxiliary_loss_mlp": 0.01206421, "balance_loss_clip": 1.01496983, "balance_loss_mlp": 1.00290668, "epoch": 0.1130283172007455, "flos": 22710601709280.0, "grad_norm": 1.7173943868898087, "language_loss": 0.82422471, "learning_rate": 3.928209654561113e-06, "loss": 0.85003018, "num_input_tokens_seen": 19959310, "step": 940, "time_per_iteration": 2.823850631713867 }, { "auxiliary_loss_clip": 0.01357779, "auxiliary_loss_mlp": 0.01205822, "balance_loss_clip": 1.01400459, "balance_loss_mlp": 1.00230825, "epoch": 0.1131485600913846, "flos": 23219922335040.0, "grad_norm": 3.1266919065614944, "language_loss": 0.81424004, "learning_rate": 3.928002674276703e-06, "loss": 0.83987606, "num_input_tokens_seen": 19978700, "step": 941, "time_per_iteration": 2.798712730407715 }, { "auxiliary_loss_clip": 0.01347019, "auxiliary_loss_mlp": 0.01205686, "balance_loss_clip": 1.01436853, "balance_loss_mlp": 1.00179029, "epoch": 0.11326880298202369, "flos": 14064280066080.0, "grad_norm": 2.265166823872486, "language_loss": 0.75335073, "learning_rate": 3.92779540151582e-06, "loss": 0.77887785, "num_input_tokens_seen": 19995785, "step": 942, "time_per_iteration": 2.773622751235962 }, { "auxiliary_loss_clip": 0.01367514, "auxiliary_loss_mlp": 0.01205791, "balance_loss_clip": 1.0135653, "balance_loss_mlp": 1.00208628, "epoch": 0.11338904587266278, "flos": 16325395611840.0, "grad_norm": 2.1816019285374946, "language_loss": 0.85685241, "learning_rate": 3.927587836309907e-06, "loss": 0.88258541, "num_input_tokens_seen": 20013615, "step": 943, "time_per_iteration": 2.7766690254211426 }, { "auxiliary_loss_clip": 0.01372286, "auxiliary_loss_mlp": 0.01206329, "balance_loss_clip": 1.01395869, "balance_loss_mlp": 1.00224233, "epoch": 0.11350928876330187, "flos": 24426270616320.0, "grad_norm": 1.730098843162238, "language_loss": 0.78101194, "learning_rate": 3.927379978690452e-06, "loss": 0.8067981, "num_input_tokens_seen": 20032880, "step": 944, "time_per_iteration": 2.7989397048950195 }, { "auxiliary_loss_clip": 0.01370979, "auxiliary_loss_mlp": 0.01205768, "balance_loss_clip": 1.01409173, "balance_loss_mlp": 1.00206268, "epoch": 0.11362953165394096, "flos": 24497085695040.0, "grad_norm": 2.2502605373248152, "language_loss": 0.87261534, "learning_rate": 3.927171828688987e-06, "loss": 0.89838284, "num_input_tokens_seen": 20052405, "step": 945, "time_per_iteration": 2.8316376209259033 }, { "auxiliary_loss_clip": 0.01410291, "auxiliary_loss_mlp": 0.0120644, "balance_loss_clip": 1.01579857, "balance_loss_mlp": 1.00273561, "epoch": 0.11374977454458005, "flos": 24060843339840.0, "grad_norm": 1.9813246155235764, "language_loss": 0.82284176, "learning_rate": 3.926963386337088e-06, "loss": 0.8490091, "num_input_tokens_seen": 20070635, "step": 946, "time_per_iteration": 2.6931467056274414 }, { "auxiliary_loss_clip": 0.01410418, "auxiliary_loss_mlp": 0.01206995, "balance_loss_clip": 1.01571822, "balance_loss_mlp": 1.00271773, "epoch": 0.11387001743521914, "flos": 39457654662240.0, "grad_norm": 2.570205798690783, "language_loss": 0.70003933, "learning_rate": 3.926754651666375e-06, "loss": 0.72621346, "num_input_tokens_seen": 20091195, "step": 947, "time_per_iteration": 2.8766632080078125 }, { "auxiliary_loss_clip": 0.01349855, "auxiliary_loss_mlp": 0.0120603, "balance_loss_clip": 1.01418889, "balance_loss_mlp": 1.00251555, "epoch": 0.11399026032585824, "flos": 25082466256800.0, "grad_norm": 2.4394207387169655, "language_loss": 0.78052723, "learning_rate": 3.926545624708513e-06, "loss": 0.80608606, "num_input_tokens_seen": 20110435, "step": 948, "time_per_iteration": 2.94179368019104 }, { "auxiliary_loss_clip": 0.0135148, "auxiliary_loss_mlp": 0.01206097, "balance_loss_clip": 1.01333559, "balance_loss_mlp": 1.00239241, "epoch": 0.11411050321649732, "flos": 17961843299040.0, "grad_norm": 1.9286267209936083, "language_loss": 0.857265, "learning_rate": 3.926336305495213e-06, "loss": 0.88284081, "num_input_tokens_seen": 20128995, "step": 949, "time_per_iteration": 2.9146220684051514 }, { "auxiliary_loss_clip": 0.0135926, "auxiliary_loss_mlp": 0.01205365, "balance_loss_clip": 1.01390016, "balance_loss_mlp": 1.00185108, "epoch": 0.11423074610713642, "flos": 22455420503040.0, "grad_norm": 2.943680560558695, "language_loss": 0.89180529, "learning_rate": 3.926126694058226e-06, "loss": 0.91745162, "num_input_tokens_seen": 20148145, "step": 950, "time_per_iteration": 2.8503570556640625 }, { "auxiliary_loss_clip": 0.01316696, "auxiliary_loss_mlp": 0.01205552, "balance_loss_clip": 1.01270843, "balance_loss_mlp": 1.00203753, "epoch": 0.1143509889977755, "flos": 19717697594880.0, "grad_norm": 1.5176698059941895, "language_loss": 0.82138562, "learning_rate": 3.92591679042935e-06, "loss": 0.8466081, "num_input_tokens_seen": 20168035, "step": 951, "time_per_iteration": 2.8542959690093994 }, { "auxiliary_loss_clip": 0.01383127, "auxiliary_loss_mlp": 0.01206868, "balance_loss_clip": 1.01443911, "balance_loss_mlp": 1.00259137, "epoch": 0.1144712318884146, "flos": 19822878426240.0, "grad_norm": 1.7827061773499986, "language_loss": 0.8250584, "learning_rate": 3.92570659464043e-06, "loss": 0.85095835, "num_input_tokens_seen": 20186095, "step": 952, "time_per_iteration": 2.797544002532959 }, { "auxiliary_loss_clip": 0.01383029, "auxiliary_loss_mlp": 0.00874051, "balance_loss_clip": 1.01479328, "balance_loss_mlp": 1.00041389, "epoch": 0.1145914747790537, "flos": 14939207586720.0, "grad_norm": 1.969716067308668, "language_loss": 0.79956591, "learning_rate": 3.925496106723349e-06, "loss": 0.82213676, "num_input_tokens_seen": 20203535, "step": 953, "time_per_iteration": 2.736149311065674 }, { "auxiliary_loss_clip": 0.01397228, "auxiliary_loss_mlp": 0.01205377, "balance_loss_clip": 1.01552308, "balance_loss_mlp": 1.00205398, "epoch": 0.11471171766969278, "flos": 19865039617440.0, "grad_norm": 1.9022960554445674, "language_loss": 0.83807594, "learning_rate": 3.9252853267100405e-06, "loss": 0.86410195, "num_input_tokens_seen": 20222780, "step": 954, "time_per_iteration": 2.8703930377960205 }, { "auxiliary_loss_clip": 0.01358423, "auxiliary_loss_mlp": 0.01206625, "balance_loss_clip": 1.01468158, "balance_loss_mlp": 1.00234747, "epoch": 0.11483196056033187, "flos": 22526487047520.0, "grad_norm": 1.783922662751073, "language_loss": 0.8361938, "learning_rate": 3.9250742546324786e-06, "loss": 0.8618443, "num_input_tokens_seen": 20243015, "step": 955, "time_per_iteration": 2.829127311706543 }, { "auxiliary_loss_clip": 0.01374028, "auxiliary_loss_mlp": 0.01205848, "balance_loss_clip": 1.01421547, "balance_loss_mlp": 1.00252414, "epoch": 0.11495220345097096, "flos": 28220305507200.0, "grad_norm": 1.7400949653457878, "language_loss": 0.86796427, "learning_rate": 3.924862890522683e-06, "loss": 0.89376295, "num_input_tokens_seen": 20263025, "step": 956, "time_per_iteration": 2.796678066253662 }, { "auxiliary_loss_clip": 0.01398006, "auxiliary_loss_mlp": 0.01207236, "balance_loss_clip": 1.01589179, "balance_loss_mlp": 1.00276852, "epoch": 0.11507244634161005, "flos": 17492276977920.0, "grad_norm": 2.578658173898003, "language_loss": 0.85943174, "learning_rate": 3.9246512344127174e-06, "loss": 0.8854841, "num_input_tokens_seen": 20280685, "step": 957, "time_per_iteration": 2.769347906112671 }, { "auxiliary_loss_clip": 0.01306198, "auxiliary_loss_mlp": 0.01205822, "balance_loss_clip": 1.01266623, "balance_loss_mlp": 1.00230801, "epoch": 0.11519268923224914, "flos": 22564948099680.0, "grad_norm": 1.8022175294216929, "language_loss": 0.81844521, "learning_rate": 3.9244392863346895e-06, "loss": 0.84356546, "num_input_tokens_seen": 20300090, "step": 958, "time_per_iteration": 4.736105680465698 }, { "auxiliary_loss_clip": 0.01361235, "auxiliary_loss_mlp": 0.01206551, "balance_loss_clip": 1.01459324, "balance_loss_mlp": 1.00265586, "epoch": 0.11531293212288823, "flos": 16982848581120.0, "grad_norm": 1.8132497784805155, "language_loss": 0.92327082, "learning_rate": 3.9242270463207524e-06, "loss": 0.94894862, "num_input_tokens_seen": 20318480, "step": 959, "time_per_iteration": 4.688525915145874 }, { "auxiliary_loss_clip": 0.01327825, "auxiliary_loss_mlp": 0.01206638, "balance_loss_clip": 1.01233566, "balance_loss_mlp": 1.00255156, "epoch": 0.11543317501352733, "flos": 12422012742720.0, "grad_norm": 14.142407117548297, "language_loss": 0.85262203, "learning_rate": 3.924014514403102e-06, "loss": 0.87796658, "num_input_tokens_seen": 20334635, "step": 960, "time_per_iteration": 2.786985397338867 }, { "auxiliary_loss_clip": 0.01329955, "auxiliary_loss_mlp": 0.01206919, "balance_loss_clip": 1.01300943, "balance_loss_mlp": 1.00206971, "epoch": 0.11555341790416641, "flos": 19821657021120.0, "grad_norm": 2.22275924626034, "language_loss": 0.91344547, "learning_rate": 3.92380169061398e-06, "loss": 0.93881416, "num_input_tokens_seen": 20352415, "step": 961, "time_per_iteration": 2.8250186443328857 }, { "auxiliary_loss_clip": 0.013573, "auxiliary_loss_mlp": 0.00874, "balance_loss_clip": 1.01343381, "balance_loss_mlp": 1.00032139, "epoch": 0.11567366079480551, "flos": 25738877439360.0, "grad_norm": 1.9782407000572308, "language_loss": 0.83718634, "learning_rate": 3.9235885749856705e-06, "loss": 0.85949928, "num_input_tokens_seen": 20371095, "step": 962, "time_per_iteration": 2.82328200340271 }, { "auxiliary_loss_clip": 0.01359975, "auxiliary_loss_mlp": 0.01207171, "balance_loss_clip": 1.01419067, "balance_loss_mlp": 1.00251293, "epoch": 0.1157939036854446, "flos": 18223311149280.0, "grad_norm": 1.893304496537814, "language_loss": 0.82434356, "learning_rate": 3.9233751675505035e-06, "loss": 0.85001498, "num_input_tokens_seen": 20389805, "step": 963, "time_per_iteration": 2.752997636795044 }, { "auxiliary_loss_clip": 0.01372766, "auxiliary_loss_mlp": 0.01208944, "balance_loss_clip": 1.01521683, "balance_loss_mlp": 1.00371289, "epoch": 0.11591414657608369, "flos": 23073765793920.0, "grad_norm": 2.1937493102057157, "language_loss": 0.84881896, "learning_rate": 3.923161468340853e-06, "loss": 0.87463605, "num_input_tokens_seen": 20409640, "step": 964, "time_per_iteration": 2.809668779373169 }, { "auxiliary_loss_clip": 0.01345747, "auxiliary_loss_mlp": 0.01206187, "balance_loss_clip": 1.01359046, "balance_loss_mlp": 1.00171959, "epoch": 0.11603438946672277, "flos": 19461726067680.0, "grad_norm": 1.7118883432496856, "language_loss": 0.8158685, "learning_rate": 3.9229474773891374e-06, "loss": 0.84138787, "num_input_tokens_seen": 20428180, "step": 965, "time_per_iteration": 2.847818374633789 }, { "auxiliary_loss_clip": 0.01360532, "auxiliary_loss_mlp": 0.01206886, "balance_loss_clip": 1.01348174, "balance_loss_mlp": 1.0018456, "epoch": 0.11615463235736187, "flos": 26831997984960.0, "grad_norm": 2.3977052815330504, "language_loss": 0.8387298, "learning_rate": 3.922733194727818e-06, "loss": 0.86440396, "num_input_tokens_seen": 20447975, "step": 966, "time_per_iteration": 2.8710873126983643 }, { "auxiliary_loss_clip": 0.01387267, "auxiliary_loss_mlp": 0.0120611, "balance_loss_clip": 1.01464415, "balance_loss_mlp": 1.00164175, "epoch": 0.11627487524800097, "flos": 18580332284640.0, "grad_norm": 2.6905676041272804, "language_loss": 0.87655485, "learning_rate": 3.922518620389402e-06, "loss": 0.90248859, "num_input_tokens_seen": 20464840, "step": 967, "time_per_iteration": 2.824620246887207 }, { "auxiliary_loss_clip": 0.01295764, "auxiliary_loss_mlp": 0.01205685, "balance_loss_clip": 1.01306057, "balance_loss_mlp": 1.00217104, "epoch": 0.11639511813864005, "flos": 18150484344480.0, "grad_norm": 2.06600396109198, "language_loss": 0.89766777, "learning_rate": 3.922303754406439e-06, "loss": 0.92268229, "num_input_tokens_seen": 20482680, "step": 968, "time_per_iteration": 2.8887343406677246 }, { "auxiliary_loss_clip": 0.01372691, "auxiliary_loss_mlp": 0.01205451, "balance_loss_clip": 1.01508331, "balance_loss_mlp": 1.00212789, "epoch": 0.11651536102927915, "flos": 20922034150080.0, "grad_norm": 2.50298335556148, "language_loss": 0.78931427, "learning_rate": 3.922088596811526e-06, "loss": 0.81509566, "num_input_tokens_seen": 20501810, "step": 969, "time_per_iteration": 2.8410732746124268 }, { "auxiliary_loss_clip": 0.01397206, "auxiliary_loss_mlp": 0.01206061, "balance_loss_clip": 1.01522768, "balance_loss_mlp": 1.00216556, "epoch": 0.11663560391991823, "flos": 16508612181600.0, "grad_norm": 2.271134356687807, "language_loss": 0.86915702, "learning_rate": 3.9218731476373e-06, "loss": 0.89518964, "num_input_tokens_seen": 20517995, "step": 970, "time_per_iteration": 2.8023128509521484 }, { "auxiliary_loss_clip": 0.01384568, "auxiliary_loss_mlp": 0.01207578, "balance_loss_clip": 1.0154078, "balance_loss_mlp": 1.00215673, "epoch": 0.11675584681055733, "flos": 19865039617440.0, "grad_norm": 2.000779289020552, "language_loss": 0.84944725, "learning_rate": 3.9216574069164455e-06, "loss": 0.87536877, "num_input_tokens_seen": 20536970, "step": 971, "time_per_iteration": 2.718750476837158 }, { "auxiliary_loss_clip": 0.01408228, "auxiliary_loss_mlp": 0.01205739, "balance_loss_clip": 1.01458299, "balance_loss_mlp": 1.00203371, "epoch": 0.11687608970119642, "flos": 21944375540640.0, "grad_norm": 1.6534880063750863, "language_loss": 0.80226904, "learning_rate": 3.921441374681691e-06, "loss": 0.82840872, "num_input_tokens_seen": 20557030, "step": 972, "time_per_iteration": 2.7497193813323975 }, { "auxiliary_loss_clip": 0.0135774, "auxiliary_loss_mlp": 0.01205497, "balance_loss_clip": 1.01363122, "balance_loss_mlp": 1.00160146, "epoch": 0.1169963325918355, "flos": 24061166652960.0, "grad_norm": 2.64952434612226, "language_loss": 0.64896023, "learning_rate": 3.921225050965808e-06, "loss": 0.67459261, "num_input_tokens_seen": 20576915, "step": 973, "time_per_iteration": 2.7643260955810547 }, { "auxiliary_loss_clip": 0.01342302, "auxiliary_loss_mlp": 0.01206807, "balance_loss_clip": 1.01248336, "balance_loss_mlp": 1.0023396, "epoch": 0.1171165754824746, "flos": 23368162449600.0, "grad_norm": 2.168213247937705, "language_loss": 0.74792534, "learning_rate": 3.921008435801612e-06, "loss": 0.7734164, "num_input_tokens_seen": 20596000, "step": 974, "time_per_iteration": 2.8017044067382812 }, { "auxiliary_loss_clip": 0.01397207, "auxiliary_loss_mlp": 0.01207965, "balance_loss_clip": 1.01509523, "balance_loss_mlp": 1.00273418, "epoch": 0.11723681837311369, "flos": 18552253176000.0, "grad_norm": 4.373899705986265, "language_loss": 0.76058412, "learning_rate": 3.920791529221963e-06, "loss": 0.78663588, "num_input_tokens_seen": 20614675, "step": 975, "time_per_iteration": 2.7447571754455566 }, { "auxiliary_loss_clip": 0.01370696, "auxiliary_loss_mlp": 0.0087399, "balance_loss_clip": 1.01362562, "balance_loss_mlp": 1.00026274, "epoch": 0.11735706126375278, "flos": 23550552774720.0, "grad_norm": 1.854595103739956, "language_loss": 0.76658618, "learning_rate": 3.920574331259768e-06, "loss": 0.78903306, "num_input_tokens_seen": 20635875, "step": 976, "time_per_iteration": 2.7999441623687744 }, { "auxiliary_loss_clip": 0.01369826, "auxiliary_loss_mlp": 0.01206518, "balance_loss_clip": 1.01378548, "balance_loss_mlp": 1.00205004, "epoch": 0.11747730415439187, "flos": 22381551911520.0, "grad_norm": 1.9497570805398519, "language_loss": 0.79659307, "learning_rate": 3.9203568419479716e-06, "loss": 0.82235652, "num_input_tokens_seen": 20656430, "step": 977, "time_per_iteration": 2.82180118560791 }, { "auxiliary_loss_clip": 0.01372334, "auxiliary_loss_mlp": 0.01206908, "balance_loss_clip": 1.01480973, "balance_loss_mlp": 1.00244009, "epoch": 0.11759754704503096, "flos": 22200742228320.0, "grad_norm": 1.6451095015592105, "language_loss": 0.7518428, "learning_rate": 3.92013906131957e-06, "loss": 0.77763522, "num_input_tokens_seen": 20675360, "step": 978, "time_per_iteration": 2.761861562728882 }, { "auxiliary_loss_clip": 0.01345427, "auxiliary_loss_mlp": 0.01206281, "balance_loss_clip": 1.01397741, "balance_loss_mlp": 1.00238514, "epoch": 0.11771778993567006, "flos": 22309766893440.0, "grad_norm": 1.7292722811643206, "language_loss": 0.82745314, "learning_rate": 3.9199209894076e-06, "loss": 0.85297024, "num_input_tokens_seen": 20695675, "step": 979, "time_per_iteration": 2.837623357772827 }, { "auxiliary_loss_clip": 0.01409303, "auxiliary_loss_mlp": 0.01206748, "balance_loss_clip": 1.01483881, "balance_loss_mlp": 1.00189877, "epoch": 0.11783803282630914, "flos": 21288179900160.0, "grad_norm": 2.0198933979181586, "language_loss": 0.89938605, "learning_rate": 3.919702626245142e-06, "loss": 0.92554659, "num_input_tokens_seen": 20715330, "step": 980, "time_per_iteration": 2.7566347122192383 }, { "auxiliary_loss_clip": 0.01370645, "auxiliary_loss_mlp": 0.01207206, "balance_loss_clip": 1.01388288, "balance_loss_mlp": 1.00235665, "epoch": 0.11795827571694824, "flos": 25371546207840.0, "grad_norm": 2.2133867941803413, "language_loss": 0.66403162, "learning_rate": 3.919483971865322e-06, "loss": 0.6898101, "num_input_tokens_seen": 20735325, "step": 981, "time_per_iteration": 2.896611213684082 }, { "auxiliary_loss_clip": 0.01357817, "auxiliary_loss_mlp": 0.01206579, "balance_loss_clip": 1.01414025, "balance_loss_mlp": 1.00249231, "epoch": 0.11807851860758732, "flos": 23622230021760.0, "grad_norm": 2.398344515981454, "language_loss": 0.87837261, "learning_rate": 3.91926502630131e-06, "loss": 0.90401655, "num_input_tokens_seen": 20755940, "step": 982, "time_per_iteration": 2.8507704734802246 }, { "auxiliary_loss_clip": 0.01386941, "auxiliary_loss_mlp": 0.0120629, "balance_loss_clip": 1.01463175, "balance_loss_mlp": 1.00201273, "epoch": 0.11819876149822642, "flos": 24972507576000.0, "grad_norm": 1.9161946647758672, "language_loss": 0.71861315, "learning_rate": 3.91904578958632e-06, "loss": 0.74454546, "num_input_tokens_seen": 20775355, "step": 983, "time_per_iteration": 2.8280539512634277 }, { "auxiliary_loss_clip": 0.01409399, "auxiliary_loss_mlp": 0.01208658, "balance_loss_clip": 1.01525927, "balance_loss_mlp": 1.00323653, "epoch": 0.11831900438886551, "flos": 23003238104640.0, "grad_norm": 2.60419297519983, "language_loss": 0.84053892, "learning_rate": 3.918826261753608e-06, "loss": 0.86671954, "num_input_tokens_seen": 20794935, "step": 984, "time_per_iteration": 3.6620335578918457 }, { "auxiliary_loss_clip": 0.01365664, "auxiliary_loss_mlp": 0.01206764, "balance_loss_clip": 1.01348996, "balance_loss_mlp": 1.0024873, "epoch": 0.1184392472795046, "flos": 27965160224640.0, "grad_norm": 5.808713948908919, "language_loss": 0.71294111, "learning_rate": 3.918606442836478e-06, "loss": 0.73866534, "num_input_tokens_seen": 20817155, "step": 985, "time_per_iteration": 4.747543811798096 }, { "auxiliary_loss_clip": 0.01382575, "auxiliary_loss_mlp": 0.012059, "balance_loss_clip": 1.01437402, "balance_loss_mlp": 1.00219524, "epoch": 0.1185594901701437, "flos": 19898507278080.0, "grad_norm": 2.1157541733796883, "language_loss": 0.77259874, "learning_rate": 3.918386332868277e-06, "loss": 0.79848349, "num_input_tokens_seen": 20835125, "step": 986, "time_per_iteration": 2.656412124633789 }, { "auxiliary_loss_clip": 0.01397539, "auxiliary_loss_mlp": 0.01207511, "balance_loss_clip": 1.01556361, "balance_loss_mlp": 1.00266182, "epoch": 0.11867973306078278, "flos": 18912363747840.0, "grad_norm": 1.7834253184600233, "language_loss": 0.94311148, "learning_rate": 3.918165931882394e-06, "loss": 0.96916193, "num_input_tokens_seen": 20853525, "step": 987, "time_per_iteration": 2.736374855041504 }, { "auxiliary_loss_clip": 0.01320225, "auxiliary_loss_mlp": 0.01206447, "balance_loss_clip": 1.01305747, "balance_loss_mlp": 1.00217044, "epoch": 0.11879997595142187, "flos": 16982812657440.0, "grad_norm": 4.1602579853639, "language_loss": 0.75589752, "learning_rate": 3.917945239912264e-06, "loss": 0.78116429, "num_input_tokens_seen": 20871000, "step": 988, "time_per_iteration": 2.8261499404907227 }, { "auxiliary_loss_clip": 0.01327747, "auxiliary_loss_mlp": 0.01206188, "balance_loss_clip": 1.01251078, "balance_loss_mlp": 1.00210142, "epoch": 0.11892021884206096, "flos": 17530378793280.0, "grad_norm": 2.0253135950204495, "language_loss": 0.76033771, "learning_rate": 3.917724256991367e-06, "loss": 0.78567708, "num_input_tokens_seen": 20889745, "step": 989, "time_per_iteration": 2.8286309242248535 }, { "auxiliary_loss_clip": 0.01372821, "auxiliary_loss_mlp": 0.01207137, "balance_loss_clip": 1.01453507, "balance_loss_mlp": 1.00247812, "epoch": 0.11904046173270005, "flos": 30955908918240.0, "grad_norm": 3.1618621777897657, "language_loss": 0.81575775, "learning_rate": 3.9175029831532245e-06, "loss": 0.84155732, "num_input_tokens_seen": 20909260, "step": 990, "time_per_iteration": 2.811521530151367 }, { "auxiliary_loss_clip": 0.01337719, "auxiliary_loss_mlp": 0.01206437, "balance_loss_clip": 1.01286125, "balance_loss_mlp": 1.00215983, "epoch": 0.11916070462333915, "flos": 20157244928640.0, "grad_norm": 2.0413518356179905, "language_loss": 0.88659108, "learning_rate": 3.917281418431404e-06, "loss": 0.91203266, "num_input_tokens_seen": 20928305, "step": 991, "time_per_iteration": 2.8006389141082764 }, { "auxiliary_loss_clip": 0.01358066, "auxiliary_loss_mlp": 0.01206331, "balance_loss_clip": 1.01398039, "balance_loss_mlp": 1.00186312, "epoch": 0.11928094751397823, "flos": 23551127553600.0, "grad_norm": 2.0861436551016954, "language_loss": 0.76743686, "learning_rate": 3.917059562859516e-06, "loss": 0.79308081, "num_input_tokens_seen": 20947630, "step": 992, "time_per_iteration": 2.805609941482544 }, { "auxiliary_loss_clip": 0.01371166, "auxiliary_loss_mlp": 0.01205937, "balance_loss_clip": 1.0141356, "balance_loss_mlp": 1.00242281, "epoch": 0.11940119040461733, "flos": 23908436078400.0, "grad_norm": 2.1580209934150845, "language_loss": 0.88828933, "learning_rate": 3.916837416471218e-06, "loss": 0.91406041, "num_input_tokens_seen": 20964250, "step": 993, "time_per_iteration": 2.7869696617126465 }, { "auxiliary_loss_clip": 0.0139544, "auxiliary_loss_mlp": 0.01205797, "balance_loss_clip": 1.0141499, "balance_loss_mlp": 1.00151956, "epoch": 0.11952143329525641, "flos": 13844542322880.0, "grad_norm": 2.2373746176679603, "language_loss": 0.71971846, "learning_rate": 3.916614979300207e-06, "loss": 0.74573082, "num_input_tokens_seen": 20979095, "step": 994, "time_per_iteration": 2.7116942405700684 }, { "auxiliary_loss_clip": 0.01332729, "auxiliary_loss_mlp": 0.01205981, "balance_loss_clip": 1.01328349, "balance_loss_mlp": 1.00208569, "epoch": 0.11964167618589551, "flos": 27015537867840.0, "grad_norm": 1.5543347209459406, "language_loss": 0.78883332, "learning_rate": 3.9163922513802274e-06, "loss": 0.81422049, "num_input_tokens_seen": 21001430, "step": 995, "time_per_iteration": 2.8912229537963867 }, { "auxiliary_loss_clip": 0.0140791, "auxiliary_loss_mlp": 0.01206469, "balance_loss_clip": 1.01382172, "balance_loss_mlp": 1.00219214, "epoch": 0.1197619190765346, "flos": 12567630428640.0, "grad_norm": 4.536915013999774, "language_loss": 0.82514179, "learning_rate": 3.916169232745067e-06, "loss": 0.85128558, "num_input_tokens_seen": 21019105, "step": 996, "time_per_iteration": 2.8486714363098145 }, { "auxiliary_loss_clip": 0.01371334, "auxiliary_loss_mlp": 0.01207155, "balance_loss_clip": 1.01407135, "balance_loss_mlp": 1.00211477, "epoch": 0.11988216196717369, "flos": 16909446997440.0, "grad_norm": 2.7105983213977676, "language_loss": 0.92050886, "learning_rate": 3.915945923428559e-06, "loss": 0.94629377, "num_input_tokens_seen": 21035630, "step": 997, "time_per_iteration": 2.8218235969543457 }, { "auxiliary_loss_clip": 0.01396687, "auxiliary_loss_mlp": 0.01206561, "balance_loss_clip": 1.01481938, "balance_loss_mlp": 1.00228357, "epoch": 0.12000240485781279, "flos": 16216586488800.0, "grad_norm": 2.3454471140748088, "language_loss": 0.82889384, "learning_rate": 3.915722323464577e-06, "loss": 0.85492635, "num_input_tokens_seen": 21054235, "step": 998, "time_per_iteration": 2.7845516204833984 }, { "auxiliary_loss_clip": 0.01383845, "auxiliary_loss_mlp": 0.01206945, "balance_loss_clip": 1.01415229, "balance_loss_mlp": 1.00247681, "epoch": 0.12012264774845187, "flos": 49344905881440.0, "grad_norm": 3.8973936540126246, "language_loss": 0.70709008, "learning_rate": 3.91549843288704e-06, "loss": 0.73299795, "num_input_tokens_seen": 21077915, "step": 999, "time_per_iteration": 2.965009927749634 }, { "auxiliary_loss_clip": 0.01361047, "auxiliary_loss_mlp": 0.00873956, "balance_loss_clip": 1.01364207, "balance_loss_mlp": 1.00029576, "epoch": 0.12024289063909097, "flos": 26979447778560.0, "grad_norm": 6.161771075829625, "language_loss": 0.79079342, "learning_rate": 3.915274251729916e-06, "loss": 0.81314343, "num_input_tokens_seen": 21099205, "step": 1000, "time_per_iteration": 2.8095619678497314 }, { "auxiliary_loss_clip": 0.01339634, "auxiliary_loss_mlp": 0.01207447, "balance_loss_clip": 1.01263404, "balance_loss_mlp": 1.00278878, "epoch": 0.12036313352973005, "flos": 19537318995840.0, "grad_norm": 1.8993764221771856, "language_loss": 0.90035689, "learning_rate": 3.91504978002721e-06, "loss": 0.92582774, "num_input_tokens_seen": 21118260, "step": 1001, "time_per_iteration": 2.8228697776794434 }, { "auxiliary_loss_clip": 0.01382392, "auxiliary_loss_mlp": 0.00873923, "balance_loss_clip": 1.01441455, "balance_loss_mlp": 1.0002768, "epoch": 0.12048337642036915, "flos": 17268264316800.0, "grad_norm": 3.6208205346769184, "language_loss": 0.76156557, "learning_rate": 3.914825017812974e-06, "loss": 0.78412873, "num_input_tokens_seen": 21134910, "step": 1002, "time_per_iteration": 2.7401556968688965 }, { "auxiliary_loss_clip": 0.01372187, "auxiliary_loss_mlp": 0.01205982, "balance_loss_clip": 1.01471448, "balance_loss_mlp": 1.00208688, "epoch": 0.12060361931100824, "flos": 22856973792480.0, "grad_norm": 2.310844108261579, "language_loss": 0.72618926, "learning_rate": 3.9145999651213065e-06, "loss": 0.75197101, "num_input_tokens_seen": 21154150, "step": 1003, "time_per_iteration": 2.870558500289917 }, { "auxiliary_loss_clip": 0.01383215, "auxiliary_loss_mlp": 0.01206879, "balance_loss_clip": 1.01383448, "balance_loss_mlp": 1.00222063, "epoch": 0.12072386220164733, "flos": 16726805206560.0, "grad_norm": 3.2806025722456256, "language_loss": 0.88403815, "learning_rate": 3.9143746219863465e-06, "loss": 0.90993905, "num_input_tokens_seen": 21171255, "step": 1004, "time_per_iteration": 2.7190630435943604 }, { "auxiliary_loss_clip": 0.01370066, "auxiliary_loss_mlp": 0.01201806, "balance_loss_clip": 1.01329732, "balance_loss_mlp": 1.00038993, "epoch": 0.12084410509228642, "flos": 55144205771040.0, "grad_norm": 0.9354442387689665, "language_loss": 0.64754796, "learning_rate": 3.914148988442278e-06, "loss": 0.67326677, "num_input_tokens_seen": 21227045, "step": 1005, "time_per_iteration": 3.266775131225586 }, { "auxiliary_loss_clip": 0.01370481, "auxiliary_loss_mlp": 0.01206541, "balance_loss_clip": 1.01425886, "balance_loss_mlp": 1.0018822, "epoch": 0.1209643479829255, "flos": 26760248890560.0, "grad_norm": 3.0764555481548626, "language_loss": 0.95096898, "learning_rate": 3.91392306452333e-06, "loss": 0.97673917, "num_input_tokens_seen": 21244120, "step": 1006, "time_per_iteration": 2.8068277835845947 }, { "auxiliary_loss_clip": 0.01408415, "auxiliary_loss_mlp": 0.01207104, "balance_loss_clip": 1.0149045, "balance_loss_mlp": 1.00206351, "epoch": 0.1210845908735646, "flos": 11035034396640.0, "grad_norm": 2.9120091931471475, "language_loss": 0.66381145, "learning_rate": 3.913696850263774e-06, "loss": 0.68996656, "num_input_tokens_seen": 21258485, "step": 1007, "time_per_iteration": 2.68648099899292 }, { "auxiliary_loss_clip": 0.01396282, "auxiliary_loss_mlp": 0.0120688, "balance_loss_clip": 1.01469135, "balance_loss_mlp": 1.00241232, "epoch": 0.1212048337642037, "flos": 20484642237120.0, "grad_norm": 2.0315674865210522, "language_loss": 0.79061079, "learning_rate": 3.913470345697929e-06, "loss": 0.8166424, "num_input_tokens_seen": 21277115, "step": 1008, "time_per_iteration": 2.805110454559326 }, { "auxiliary_loss_clip": 0.01330113, "auxiliary_loss_mlp": 0.01206938, "balance_loss_clip": 1.0127933, "balance_loss_mlp": 1.00285172, "epoch": 0.12132507665484278, "flos": 22346072524800.0, "grad_norm": 2.070487473741749, "language_loss": 0.85423803, "learning_rate": 3.913243550860153e-06, "loss": 0.87960851, "num_input_tokens_seen": 21294880, "step": 1009, "time_per_iteration": 2.800138235092163 }, { "auxiliary_loss_clip": 0.01383771, "auxiliary_loss_mlp": 0.01206861, "balance_loss_clip": 1.01482797, "balance_loss_mlp": 1.00258362, "epoch": 0.12144531954548188, "flos": 29314970771040.0, "grad_norm": 1.9242366636269261, "language_loss": 0.76096243, "learning_rate": 3.913016465784852e-06, "loss": 0.78686875, "num_input_tokens_seen": 21315555, "step": 1010, "time_per_iteration": 4.72818922996521 }, { "auxiliary_loss_clip": 0.01331112, "auxiliary_loss_mlp": 0.01207508, "balance_loss_clip": 1.01275504, "balance_loss_mlp": 1.00284982, "epoch": 0.12156556243612096, "flos": 20485252939680.0, "grad_norm": 3.222212375720931, "language_loss": 0.71816063, "learning_rate": 3.912789090506474e-06, "loss": 0.74354684, "num_input_tokens_seen": 21334815, "step": 1011, "time_per_iteration": 4.647396087646484 }, { "auxiliary_loss_clip": 0.01358131, "auxiliary_loss_mlp": 0.01206465, "balance_loss_clip": 1.01368427, "balance_loss_mlp": 1.00199747, "epoch": 0.12168580532676006, "flos": 16472019160800.0, "grad_norm": 3.002625468479693, "language_loss": 0.71866548, "learning_rate": 3.9125614250595114e-06, "loss": 0.74431145, "num_input_tokens_seen": 21351025, "step": 1012, "time_per_iteration": 2.8263447284698486 }, { "auxiliary_loss_clip": 0.0138514, "auxiliary_loss_mlp": 0.01207153, "balance_loss_clip": 1.01407957, "balance_loss_mlp": 1.00230408, "epoch": 0.12180604821739914, "flos": 15341299731360.0, "grad_norm": 2.199014760809938, "language_loss": 0.88793504, "learning_rate": 3.912333469478502e-06, "loss": 0.913858, "num_input_tokens_seen": 21368990, "step": 1013, "time_per_iteration": 2.7391037940979004 }, { "auxiliary_loss_clip": 0.01380271, "auxiliary_loss_mlp": 0.01206735, "balance_loss_clip": 1.01398897, "balance_loss_mlp": 1.00245786, "epoch": 0.12192629110803824, "flos": 19318048260480.0, "grad_norm": 2.826694784840055, "language_loss": 0.77602875, "learning_rate": 3.912105223798025e-06, "loss": 0.80189884, "num_input_tokens_seen": 21388410, "step": 1014, "time_per_iteration": 2.760671377182007 }, { "auxiliary_loss_clip": 0.01359307, "auxiliary_loss_mlp": 0.01201944, "balance_loss_clip": 1.01440656, "balance_loss_mlp": 1.00052834, "epoch": 0.12204653399867733, "flos": 47725378247520.0, "grad_norm": 0.9910021459063877, "language_loss": 0.67602575, "learning_rate": 3.9118766880527065e-06, "loss": 0.70163828, "num_input_tokens_seen": 21442845, "step": 1015, "time_per_iteration": 3.212027072906494 }, { "auxiliary_loss_clip": 0.01323939, "auxiliary_loss_mlp": 0.01205754, "balance_loss_clip": 1.01251638, "balance_loss_mlp": 1.00185847, "epoch": 0.12216677688931642, "flos": 18221946049440.0, "grad_norm": 1.742293101721692, "language_loss": 0.73803961, "learning_rate": 3.9116478622772145e-06, "loss": 0.76333654, "num_input_tokens_seen": 21461420, "step": 1016, "time_per_iteration": 2.844709873199463 }, { "auxiliary_loss_clip": 0.01382251, "auxiliary_loss_mlp": 0.01206004, "balance_loss_clip": 1.01499403, "balance_loss_mlp": 1.00210845, "epoch": 0.12228701977995551, "flos": 27525648814560.0, "grad_norm": 1.731923923157274, "language_loss": 0.88020319, "learning_rate": 3.911418746506261e-06, "loss": 0.90608573, "num_input_tokens_seen": 21481550, "step": 1017, "time_per_iteration": 2.7309303283691406 }, { "auxiliary_loss_clip": 0.01383153, "auxiliary_loss_mlp": 0.01206883, "balance_loss_clip": 1.01533258, "balance_loss_mlp": 1.00260627, "epoch": 0.1224072626705946, "flos": 21798147152160.0, "grad_norm": 1.68539502331701, "language_loss": 0.78261459, "learning_rate": 3.911189340774604e-06, "loss": 0.80851495, "num_input_tokens_seen": 21501680, "step": 1018, "time_per_iteration": 2.7764716148376465 }, { "auxiliary_loss_clip": 0.01380766, "auxiliary_loss_mlp": 0.01206742, "balance_loss_clip": 1.01411438, "balance_loss_mlp": 1.00284696, "epoch": 0.1225275055612337, "flos": 20703769277760.0, "grad_norm": 1.8479163290835028, "language_loss": 0.79434991, "learning_rate": 3.910959645117043e-06, "loss": 0.820225, "num_input_tokens_seen": 21521015, "step": 1019, "time_per_iteration": 2.7599477767944336 }, { "auxiliary_loss_clip": 0.01353081, "auxiliary_loss_mlp": 0.00873464, "balance_loss_clip": 1.01492262, "balance_loss_mlp": 0.99991411, "epoch": 0.12264774845187278, "flos": 57745327836960.0, "grad_norm": 0.8429118015738482, "language_loss": 0.56751406, "learning_rate": 3.910729659568423e-06, "loss": 0.5897795, "num_input_tokens_seen": 21578200, "step": 1020, "time_per_iteration": 3.262470006942749 }, { "auxiliary_loss_clip": 0.01368224, "auxiliary_loss_mlp": 0.01205144, "balance_loss_clip": 1.01413751, "balance_loss_mlp": 1.00201118, "epoch": 0.12276799134251187, "flos": 26396294484960.0, "grad_norm": 2.3425752119024126, "language_loss": 0.82069182, "learning_rate": 3.9104993841636344e-06, "loss": 0.84642553, "num_input_tokens_seen": 21598770, "step": 1021, "time_per_iteration": 2.8620212078094482 }, { "auxiliary_loss_clip": 0.01356124, "auxiliary_loss_mlp": 0.00873842, "balance_loss_clip": 1.01409101, "balance_loss_mlp": 1.00024056, "epoch": 0.12288823423315097, "flos": 21064203162720.0, "grad_norm": 1.7194919874696113, "language_loss": 0.81055081, "learning_rate": 3.910268818937608e-06, "loss": 0.83285046, "num_input_tokens_seen": 21616925, "step": 1022, "time_per_iteration": 2.754556655883789 }, { "auxiliary_loss_clip": 0.01322503, "auxiliary_loss_mlp": 0.01206635, "balance_loss_clip": 1.01351643, "balance_loss_mlp": 1.00293016, "epoch": 0.12300847712379005, "flos": 12312449222400.0, "grad_norm": 3.3434304755147832, "language_loss": 0.87416708, "learning_rate": 3.9100379639253196e-06, "loss": 0.89945841, "num_input_tokens_seen": 21633645, "step": 1023, "time_per_iteration": 2.805326223373413 }, { "auxiliary_loss_clip": 0.01368168, "auxiliary_loss_mlp": 0.01205087, "balance_loss_clip": 1.01349831, "balance_loss_mlp": 1.00176322, "epoch": 0.12312872001442915, "flos": 16762248669600.0, "grad_norm": 4.889405485621669, "language_loss": 0.86295712, "learning_rate": 3.909806819161791e-06, "loss": 0.88868976, "num_input_tokens_seen": 21649120, "step": 1024, "time_per_iteration": 2.7592546939849854 }, { "auxiliary_loss_clip": 0.01350584, "auxiliary_loss_mlp": 0.01205961, "balance_loss_clip": 1.0129329, "balance_loss_mlp": 1.0022558, "epoch": 0.12324896290506823, "flos": 18404947077120.0, "grad_norm": 2.675227644340687, "language_loss": 0.86104912, "learning_rate": 3.909575384682086e-06, "loss": 0.88661456, "num_input_tokens_seen": 21668000, "step": 1025, "time_per_iteration": 2.7841403484344482 }, { "auxiliary_loss_clip": 0.0139442, "auxiliary_loss_mlp": 0.01206091, "balance_loss_clip": 1.014974, "balance_loss_mlp": 1.00238609, "epoch": 0.12336920579570733, "flos": 18915417260640.0, "grad_norm": 1.7723101278997757, "language_loss": 0.69478226, "learning_rate": 3.9093436605213144e-06, "loss": 0.72078741, "num_input_tokens_seen": 21688500, "step": 1026, "time_per_iteration": 2.8016955852508545 }, { "auxiliary_loss_clip": 0.01358308, "auxiliary_loss_mlp": 0.01206445, "balance_loss_clip": 1.01303482, "balance_loss_mlp": 1.00274038, "epoch": 0.12348944868634643, "flos": 23878381167360.0, "grad_norm": 2.4669746539451425, "language_loss": 0.79462391, "learning_rate": 3.909111646714627e-06, "loss": 0.82027143, "num_input_tokens_seen": 21709345, "step": 1027, "time_per_iteration": 2.801405429840088 }, { "auxiliary_loss_clip": 0.01406434, "auxiliary_loss_mlp": 0.01205076, "balance_loss_clip": 1.01465023, "balance_loss_mlp": 1.00194311, "epoch": 0.12360969157698551, "flos": 19026094415040.0, "grad_norm": 3.1840076855853794, "language_loss": 0.72444201, "learning_rate": 3.9088793432972206e-06, "loss": 0.75055718, "num_input_tokens_seen": 21728165, "step": 1028, "time_per_iteration": 2.7428011894226074 }, { "auxiliary_loss_clip": 0.01319061, "auxiliary_loss_mlp": 0.01205726, "balance_loss_clip": 1.01240444, "balance_loss_mlp": 1.0024029, "epoch": 0.1237299344676246, "flos": 13224616390080.0, "grad_norm": 2.875620886550981, "language_loss": 0.81601167, "learning_rate": 3.908646750304336e-06, "loss": 0.84125954, "num_input_tokens_seen": 21745850, "step": 1029, "time_per_iteration": 2.8287832736968994 }, { "auxiliary_loss_clip": 0.01365414, "auxiliary_loss_mlp": 0.01206292, "balance_loss_clip": 1.01477289, "balance_loss_mlp": 1.00220609, "epoch": 0.12385017735826369, "flos": 20485684023840.0, "grad_norm": 1.6875967511470487, "language_loss": 0.87329692, "learning_rate": 3.908413867771257e-06, "loss": 0.899014, "num_input_tokens_seen": 21764760, "step": 1030, "time_per_iteration": 2.770303964614868 }, { "auxiliary_loss_clip": 0.01381538, "auxiliary_loss_mlp": 0.01206311, "balance_loss_clip": 1.01457286, "balance_loss_mlp": 1.00241578, "epoch": 0.12397042024890279, "flos": 17347844773440.0, "grad_norm": 1.6844708459798663, "language_loss": 0.80833137, "learning_rate": 3.908180695733311e-06, "loss": 0.8342098, "num_input_tokens_seen": 21784250, "step": 1031, "time_per_iteration": 2.712346076965332 }, { "auxiliary_loss_clip": 0.01333514, "auxiliary_loss_mlp": 0.01204947, "balance_loss_clip": 1.01251388, "balance_loss_mlp": 1.00162363, "epoch": 0.12409066313954187, "flos": 20412354287520.0, "grad_norm": 1.834324763448528, "language_loss": 0.82779771, "learning_rate": 3.907947234225871e-06, "loss": 0.85318232, "num_input_tokens_seen": 21803260, "step": 1032, "time_per_iteration": 2.9283974170684814 }, { "auxiliary_loss_clip": 0.0130513, "auxiliary_loss_mlp": 0.01205767, "balance_loss_clip": 1.0125649, "balance_loss_mlp": 1.00263453, "epoch": 0.12421090603018096, "flos": 20736698083200.0, "grad_norm": 2.3920537887850752, "language_loss": 0.86836195, "learning_rate": 3.907713483284352e-06, "loss": 0.89347088, "num_input_tokens_seen": 21822735, "step": 1033, "time_per_iteration": 2.9167728424072266 }, { "auxiliary_loss_clip": 0.0129251, "auxiliary_loss_mlp": 0.01205396, "balance_loss_clip": 1.01288986, "balance_loss_mlp": 1.00226355, "epoch": 0.12433114892082006, "flos": 24498845955360.0, "grad_norm": 2.3299519547183194, "language_loss": 0.97400492, "learning_rate": 3.907479442944216e-06, "loss": 0.99898398, "num_input_tokens_seen": 21841140, "step": 1034, "time_per_iteration": 3.0439958572387695 }, { "auxiliary_loss_clip": 0.01382637, "auxiliary_loss_mlp": 0.01205496, "balance_loss_clip": 1.0149982, "balance_loss_mlp": 1.0019815, "epoch": 0.12445139181145914, "flos": 19682325979200.0, "grad_norm": 2.2831444238029364, "language_loss": 0.92191601, "learning_rate": 3.907245113240963e-06, "loss": 0.94779742, "num_input_tokens_seen": 21859260, "step": 1035, "time_per_iteration": 3.0492703914642334 }, { "auxiliary_loss_clip": 0.01356257, "auxiliary_loss_mlp": 0.01206193, "balance_loss_clip": 1.01333976, "balance_loss_mlp": 1.0021069, "epoch": 0.12457163470209824, "flos": 46423104235200.0, "grad_norm": 2.2391304847348628, "language_loss": 0.73730922, "learning_rate": 3.907010494210144e-06, "loss": 0.76293373, "num_input_tokens_seen": 21881920, "step": 1036, "time_per_iteration": 4.348992347717285 }, { "auxiliary_loss_clip": 0.01384465, "auxiliary_loss_mlp": 0.01205192, "balance_loss_clip": 1.01426697, "balance_loss_mlp": 1.00186861, "epoch": 0.12469187759273732, "flos": 20376300121920.0, "grad_norm": 2.0659221120871147, "language_loss": 0.92019629, "learning_rate": 3.9067755858873495e-06, "loss": 0.9460929, "num_input_tokens_seen": 21898720, "step": 1037, "time_per_iteration": 5.45753288269043 }, { "auxiliary_loss_clip": 0.01341196, "auxiliary_loss_mlp": 0.01200949, "balance_loss_clip": 1.01221895, "balance_loss_mlp": 1.00105882, "epoch": 0.12481212048337642, "flos": 69224674207680.0, "grad_norm": 0.8648350447198745, "language_loss": 0.62779963, "learning_rate": 3.906540388308214e-06, "loss": 0.65322107, "num_input_tokens_seen": 21958305, "step": 1038, "time_per_iteration": 3.3456428050994873 }, { "auxiliary_loss_clip": 0.01321069, "auxiliary_loss_mlp": 0.01205358, "balance_loss_clip": 1.01267719, "balance_loss_mlp": 1.0022254, "epoch": 0.12493236337401552, "flos": 18223706309760.0, "grad_norm": 3.922358047335924, "language_loss": 0.81425726, "learning_rate": 3.906304901508417e-06, "loss": 0.83952153, "num_input_tokens_seen": 21977205, "step": 1039, "time_per_iteration": 2.8033134937286377 }, { "auxiliary_loss_clip": 0.01383154, "auxiliary_loss_mlp": 0.01205411, "balance_loss_clip": 1.01518202, "balance_loss_mlp": 1.00208735, "epoch": 0.12505260626465461, "flos": 30044388376800.0, "grad_norm": 2.297796834555176, "language_loss": 0.75679266, "learning_rate": 3.9060691255236835e-06, "loss": 0.78267837, "num_input_tokens_seen": 21997770, "step": 1040, "time_per_iteration": 2.7899997234344482 }, { "auxiliary_loss_clip": 0.01393401, "auxiliary_loss_mlp": 0.0120565, "balance_loss_clip": 1.01452243, "balance_loss_mlp": 1.00232708, "epoch": 0.1251728491552937, "flos": 24433994207520.0, "grad_norm": 2.350109344224879, "language_loss": 0.80575931, "learning_rate": 3.905833060389778e-06, "loss": 0.8317498, "num_input_tokens_seen": 22021890, "step": 1041, "time_per_iteration": 2.8325283527374268 }, { "auxiliary_loss_clip": 0.01407232, "auxiliary_loss_mlp": 0.00873869, "balance_loss_clip": 1.01514137, "balance_loss_mlp": 1.00021935, "epoch": 0.12529309204593278, "flos": 27119820607200.0, "grad_norm": 3.755966378135217, "language_loss": 0.7837168, "learning_rate": 3.905596706142513e-06, "loss": 0.80652785, "num_input_tokens_seen": 22043300, "step": 1042, "time_per_iteration": 2.792600393295288 }, { "auxiliary_loss_clip": 0.01354846, "auxiliary_loss_mlp": 0.01205849, "balance_loss_clip": 1.01341248, "balance_loss_mlp": 1.00195384, "epoch": 0.12541333493657186, "flos": 30774165219360.0, "grad_norm": 2.2027955917090165, "language_loss": 0.86316824, "learning_rate": 3.9053600628177435e-06, "loss": 0.88877523, "num_input_tokens_seen": 22062910, "step": 1043, "time_per_iteration": 2.878199338912964 }, { "auxiliary_loss_clip": 0.01405713, "auxiliary_loss_mlp": 0.01205145, "balance_loss_clip": 1.01427388, "balance_loss_mlp": 1.00220323, "epoch": 0.12553357782721097, "flos": 23659577439840.0, "grad_norm": 2.3491794029564512, "language_loss": 0.84451783, "learning_rate": 3.905123130451367e-06, "loss": 0.87062639, "num_input_tokens_seen": 22084010, "step": 1044, "time_per_iteration": 2.76068377494812 }, { "auxiliary_loss_clip": 0.01407297, "auxiliary_loss_mlp": 0.01205858, "balance_loss_clip": 1.0157994, "balance_loss_mlp": 1.00253475, "epoch": 0.12565382071785006, "flos": 24863770300320.0, "grad_norm": 2.818276842070426, "language_loss": 0.79230475, "learning_rate": 3.904885909079326e-06, "loss": 0.81843638, "num_input_tokens_seen": 22102795, "step": 1045, "time_per_iteration": 2.741476058959961 }, { "auxiliary_loss_clip": 0.01381615, "auxiliary_loss_mlp": 0.01206718, "balance_loss_clip": 1.0138526, "balance_loss_mlp": 1.00263131, "epoch": 0.12577406360848914, "flos": 21360791162880.0, "grad_norm": 2.925163940136036, "language_loss": 0.77846545, "learning_rate": 3.904648398737607e-06, "loss": 0.80434883, "num_input_tokens_seen": 22121360, "step": 1046, "time_per_iteration": 2.7749953269958496 }, { "auxiliary_loss_clip": 0.01405961, "auxiliary_loss_mlp": 0.01205555, "balance_loss_clip": 1.01496291, "balance_loss_mlp": 1.00204134, "epoch": 0.12589430649912825, "flos": 36138071712960.0, "grad_norm": 1.9290528973191678, "language_loss": 0.78157783, "learning_rate": 3.9044105994622406e-06, "loss": 0.807693, "num_input_tokens_seen": 22142505, "step": 1047, "time_per_iteration": 2.7697696685791016 }, { "auxiliary_loss_clip": 0.01370812, "auxiliary_loss_mlp": 0.00873874, "balance_loss_clip": 1.01457119, "balance_loss_mlp": 1.00020933, "epoch": 0.12601454938976733, "flos": 25337683386720.0, "grad_norm": 1.783779903119995, "language_loss": 0.81705594, "learning_rate": 3.9041725112893005e-06, "loss": 0.83950281, "num_input_tokens_seen": 22163730, "step": 1048, "time_per_iteration": 2.837782859802246 }, { "auxiliary_loss_clip": 0.01327522, "auxiliary_loss_mlp": 0.01205716, "balance_loss_clip": 1.01218629, "balance_loss_mlp": 1.00182068, "epoch": 0.12613479228040642, "flos": 15560103458880.0, "grad_norm": 1.6499362334270187, "language_loss": 0.75139546, "learning_rate": 3.903934134254904e-06, "loss": 0.77672791, "num_input_tokens_seen": 22181520, "step": 1049, "time_per_iteration": 2.769099712371826 }, { "auxiliary_loss_clip": 0.01392076, "auxiliary_loss_mlp": 0.01205576, "balance_loss_clip": 1.01489651, "balance_loss_mlp": 1.00206232, "epoch": 0.1262550351710455, "flos": 21470606148960.0, "grad_norm": 2.9493042739798803, "language_loss": 0.84928501, "learning_rate": 3.903695468395213e-06, "loss": 0.87526155, "num_input_tokens_seen": 22199390, "step": 1050, "time_per_iteration": 2.7012858390808105 }, { "auxiliary_loss_clip": 0.0138124, "auxiliary_loss_mlp": 0.01206028, "balance_loss_clip": 1.01474178, "balance_loss_mlp": 1.00251412, "epoch": 0.1263752780616846, "flos": 31576732943040.0, "grad_norm": 3.1295660722356407, "language_loss": 0.55746585, "learning_rate": 3.903456513746434e-06, "loss": 0.58333856, "num_input_tokens_seen": 22220365, "step": 1051, "time_per_iteration": 2.9093101024627686 }, { "auxiliary_loss_clip": 0.01405301, "auxiliary_loss_mlp": 0.01205499, "balance_loss_clip": 1.01449561, "balance_loss_mlp": 1.00236607, "epoch": 0.1264955209523237, "flos": 28768230879840.0, "grad_norm": 1.7111460558516152, "language_loss": 0.87543964, "learning_rate": 3.903217270344815e-06, "loss": 0.90154755, "num_input_tokens_seen": 22240615, "step": 1052, "time_per_iteration": 2.7321910858154297 }, { "auxiliary_loss_clip": 0.01344717, "auxiliary_loss_mlp": 0.0120486, "balance_loss_clip": 1.01334131, "balance_loss_mlp": 1.00191796, "epoch": 0.12661576384296278, "flos": 29241138103200.0, "grad_norm": 1.7975000694539787, "language_loss": 0.8214916, "learning_rate": 3.902977738226648e-06, "loss": 0.84698731, "num_input_tokens_seen": 22261350, "step": 1053, "time_per_iteration": 2.9182815551757812 }, { "auxiliary_loss_clip": 0.01382153, "auxiliary_loss_mlp": 0.01206385, "balance_loss_clip": 1.01440358, "balance_loss_mlp": 1.00267982, "epoch": 0.12673600673360189, "flos": 20850356903040.0, "grad_norm": 3.1026509462873855, "language_loss": 0.90919912, "learning_rate": 3.902737917428273e-06, "loss": 0.93508446, "num_input_tokens_seen": 22279515, "step": 1054, "time_per_iteration": 2.725785970687866 }, { "auxiliary_loss_clip": 0.01405533, "auxiliary_loss_mlp": 0.01206427, "balance_loss_clip": 1.01493287, "balance_loss_mlp": 1.00272155, "epoch": 0.12685624962424097, "flos": 25263707024160.0, "grad_norm": 1.7211094838891805, "language_loss": 0.83869171, "learning_rate": 3.902497807986068e-06, "loss": 0.86481136, "num_input_tokens_seen": 22299535, "step": 1055, "time_per_iteration": 2.7315785884857178 }, { "auxiliary_loss_clip": 0.01353962, "auxiliary_loss_mlp": 0.01205817, "balance_loss_clip": 1.01353645, "balance_loss_mlp": 1.00192118, "epoch": 0.12697649251488005, "flos": 27527121685440.0, "grad_norm": 1.812375509692444, "language_loss": 0.84054446, "learning_rate": 3.902257409936458e-06, "loss": 0.86614221, "num_input_tokens_seen": 22320300, "step": 1056, "time_per_iteration": 2.942173957824707 }, { "auxiliary_loss_clip": 0.01364002, "auxiliary_loss_mlp": 0.0120556, "balance_loss_clip": 1.01451027, "balance_loss_mlp": 1.0022366, "epoch": 0.12709673540551916, "flos": 21251874268800.0, "grad_norm": 1.8238403641468426, "language_loss": 0.83862847, "learning_rate": 3.902016723315912e-06, "loss": 0.86432409, "num_input_tokens_seen": 22338240, "step": 1057, "time_per_iteration": 2.7641761302948 }, { "auxiliary_loss_clip": 0.01393576, "auxiliary_loss_mlp": 0.01206185, "balance_loss_clip": 1.01481199, "balance_loss_mlp": 1.00247979, "epoch": 0.12721697829615825, "flos": 25337719310400.0, "grad_norm": 2.279827699933701, "language_loss": 0.69383532, "learning_rate": 3.901775748160941e-06, "loss": 0.7198329, "num_input_tokens_seen": 22357420, "step": 1058, "time_per_iteration": 2.7698397636413574 }, { "auxiliary_loss_clip": 0.0134587, "auxiliary_loss_mlp": 0.01200045, "balance_loss_clip": 1.01319003, "balance_loss_mlp": 1.00015473, "epoch": 0.12733722118679733, "flos": 61943322903840.0, "grad_norm": 0.7963460756950651, "language_loss": 0.60863531, "learning_rate": 3.901534484508101e-06, "loss": 0.63409448, "num_input_tokens_seen": 22420095, "step": 1059, "time_per_iteration": 3.3427271842956543 }, { "auxiliary_loss_clip": 0.01367954, "auxiliary_loss_mlp": 0.01206329, "balance_loss_clip": 1.01399255, "balance_loss_mlp": 1.00262463, "epoch": 0.1274574640774364, "flos": 26976753502560.0, "grad_norm": 1.8979933240115772, "language_loss": 0.75003445, "learning_rate": 3.901292932393991e-06, "loss": 0.77577734, "num_input_tokens_seen": 22438975, "step": 1060, "time_per_iteration": 2.809826374053955 }, { "auxiliary_loss_clip": 0.01406008, "auxiliary_loss_mlp": 0.01206806, "balance_loss_clip": 1.01514232, "balance_loss_mlp": 1.00271964, "epoch": 0.12757770696807552, "flos": 22236329386080.0, "grad_norm": 2.4123324469589917, "language_loss": 0.85082901, "learning_rate": 3.9010510918552555e-06, "loss": 0.87695718, "num_input_tokens_seen": 22458050, "step": 1061, "time_per_iteration": 2.7111001014709473 }, { "auxiliary_loss_clip": 0.01380122, "auxiliary_loss_mlp": 0.01207273, "balance_loss_clip": 1.01415348, "balance_loss_mlp": 1.0033772, "epoch": 0.1276979498587146, "flos": 28547918357760.0, "grad_norm": 3.0336418590691125, "language_loss": 0.74230319, "learning_rate": 3.900808962928581e-06, "loss": 0.76817721, "num_input_tokens_seen": 22475665, "step": 1062, "time_per_iteration": 4.629395246505737 }, { "auxiliary_loss_clip": 0.01404947, "auxiliary_loss_mlp": 0.01205028, "balance_loss_clip": 1.01446331, "balance_loss_mlp": 1.00227642, "epoch": 0.1278181927493537, "flos": 17420348265120.0, "grad_norm": 5.592741554162594, "language_loss": 0.89428008, "learning_rate": 3.900566545650698e-06, "loss": 0.92037976, "num_input_tokens_seen": 22493335, "step": 1063, "time_per_iteration": 3.6809580326080322 }, { "auxiliary_loss_clip": 0.01381053, "auxiliary_loss_mlp": 0.01206146, "balance_loss_clip": 1.01430595, "balance_loss_mlp": 1.00225043, "epoch": 0.1279384356399928, "flos": 21138646533120.0, "grad_norm": 2.220835914773506, "language_loss": 0.81528348, "learning_rate": 3.900323840058381e-06, "loss": 0.84115547, "num_input_tokens_seen": 22511045, "step": 1064, "time_per_iteration": 2.7285311222076416 }, { "auxiliary_loss_clip": 0.0139329, "auxiliary_loss_mlp": 0.01204967, "balance_loss_clip": 1.01474023, "balance_loss_mlp": 1.00183403, "epoch": 0.12805867853063188, "flos": 26576744931360.0, "grad_norm": 2.0857002681782597, "language_loss": 0.8163712, "learning_rate": 3.900080846188449e-06, "loss": 0.8423537, "num_input_tokens_seen": 22529635, "step": 1065, "time_per_iteration": 2.825352907180786 }, { "auxiliary_loss_clip": 0.01405269, "auxiliary_loss_mlp": 0.01205779, "balance_loss_clip": 1.0142976, "balance_loss_mlp": 1.00245595, "epoch": 0.12817892142127096, "flos": 16436216460960.0, "grad_norm": 2.2593108617179425, "language_loss": 0.81285596, "learning_rate": 3.8998375640777625e-06, "loss": 0.83896643, "num_input_tokens_seen": 22547505, "step": 1066, "time_per_iteration": 2.739586591720581 }, { "auxiliary_loss_clip": 0.01352009, "auxiliary_loss_mlp": 0.01201367, "balance_loss_clip": 1.01206744, "balance_loss_mlp": 0.99995083, "epoch": 0.12829916431191005, "flos": 60757078674240.0, "grad_norm": 0.7084458620155444, "language_loss": 0.52652854, "learning_rate": 3.899593993763229e-06, "loss": 0.55206227, "num_input_tokens_seen": 22608465, "step": 1067, "time_per_iteration": 3.196582555770874 }, { "auxiliary_loss_clip": 0.01343988, "auxiliary_loss_mlp": 0.01207346, "balance_loss_clip": 1.01310682, "balance_loss_mlp": 1.00325954, "epoch": 0.12841940720254916, "flos": 29786908055040.0, "grad_norm": 1.834945648254034, "language_loss": 0.81307411, "learning_rate": 3.899350135281796e-06, "loss": 0.8385874, "num_input_tokens_seen": 22629465, "step": 1068, "time_per_iteration": 2.8526878356933594 }, { "auxiliary_loss_clip": 0.01349664, "auxiliary_loss_mlp": 0.01205313, "balance_loss_clip": 1.01335812, "balance_loss_mlp": 1.00256205, "epoch": 0.12853965009318824, "flos": 25951861530720.0, "grad_norm": 1.8819485604351567, "language_loss": 0.79407907, "learning_rate": 3.8991059886704585e-06, "loss": 0.81962883, "num_input_tokens_seen": 22648970, "step": 1069, "time_per_iteration": 2.848834753036499 }, { "auxiliary_loss_clip": 0.01342938, "auxiliary_loss_mlp": 0.01206117, "balance_loss_clip": 1.01370716, "balance_loss_mlp": 1.00279343, "epoch": 0.12865989298382732, "flos": 30846884253120.0, "grad_norm": 1.999724708945607, "language_loss": 0.82845676, "learning_rate": 3.898861553966252e-06, "loss": 0.85394728, "num_input_tokens_seen": 22668620, "step": 1070, "time_per_iteration": 2.840841770172119 }, { "auxiliary_loss_clip": 0.01302466, "auxiliary_loss_mlp": 0.01206, "balance_loss_clip": 1.0124402, "balance_loss_mlp": 1.00267649, "epoch": 0.12878013587446643, "flos": 25885788377760.0, "grad_norm": 2.353486109052701, "language_loss": 0.88046098, "learning_rate": 3.898616831206257e-06, "loss": 0.90554565, "num_input_tokens_seen": 22689045, "step": 1071, "time_per_iteration": 3.052572727203369 }, { "auxiliary_loss_clip": 0.01356266, "auxiliary_loss_mlp": 0.01206826, "balance_loss_clip": 1.01362324, "balance_loss_mlp": 1.00293005, "epoch": 0.12890037876510552, "flos": 23333150070720.0, "grad_norm": 2.3469464454503393, "language_loss": 0.76917303, "learning_rate": 3.8983718204276e-06, "loss": 0.79480398, "num_input_tokens_seen": 22711265, "step": 1072, "time_per_iteration": 3.0547780990600586 }, { "auxiliary_loss_clip": 0.01367806, "auxiliary_loss_mlp": 0.01205723, "balance_loss_clip": 1.01375639, "balance_loss_mlp": 1.00297165, "epoch": 0.1290206216557446, "flos": 23587253566560.0, "grad_norm": 1.9664353563311094, "language_loss": 0.82629937, "learning_rate": 3.898126521667446e-06, "loss": 0.85203469, "num_input_tokens_seen": 22731420, "step": 1073, "time_per_iteration": 2.8574721813201904 }, { "auxiliary_loss_clip": 0.0139281, "auxiliary_loss_mlp": 0.01207001, "balance_loss_clip": 1.01462317, "balance_loss_mlp": 1.00329602, "epoch": 0.12914086454638368, "flos": 24170622402240.0, "grad_norm": 1.7094945898659029, "language_loss": 0.83212298, "learning_rate": 3.897880934963007e-06, "loss": 0.8581211, "num_input_tokens_seen": 22750970, "step": 1074, "time_per_iteration": 2.7517387866973877 }, { "auxiliary_loss_clip": 0.01378166, "auxiliary_loss_mlp": 0.01205699, "balance_loss_clip": 1.01377928, "balance_loss_mlp": 1.00237536, "epoch": 0.1292611074370228, "flos": 20267167685760.0, "grad_norm": 2.0182699854991277, "language_loss": 0.78116006, "learning_rate": 3.89763506035154e-06, "loss": 0.80699873, "num_input_tokens_seen": 22768820, "step": 1075, "time_per_iteration": 2.764589786529541 }, { "auxiliary_loss_clip": 0.01392528, "auxiliary_loss_mlp": 0.01204557, "balance_loss_clip": 1.01458931, "balance_loss_mlp": 1.00199616, "epoch": 0.12938135032766188, "flos": 27377696089440.0, "grad_norm": 1.6526760552683377, "language_loss": 0.8132199, "learning_rate": 3.897388897870343e-06, "loss": 0.83919072, "num_input_tokens_seen": 22789460, "step": 1076, "time_per_iteration": 2.8136398792266846 }, { "auxiliary_loss_clip": 0.01377537, "auxiliary_loss_mlp": 0.01207213, "balance_loss_clip": 1.01372302, "balance_loss_mlp": 1.00312614, "epoch": 0.12950159321830096, "flos": 29277120421440.0, "grad_norm": 2.5611422038239486, "language_loss": 0.74775708, "learning_rate": 3.89714244755676e-06, "loss": 0.77360451, "num_input_tokens_seen": 22810820, "step": 1077, "time_per_iteration": 2.901721239089966 }, { "auxiliary_loss_clip": 0.01343087, "auxiliary_loss_mlp": 0.01207147, "balance_loss_clip": 1.01337767, "balance_loss_mlp": 1.00306058, "epoch": 0.12962183610894007, "flos": 24534900120960.0, "grad_norm": 4.249984575092628, "language_loss": 0.86380422, "learning_rate": 3.896895709448175e-06, "loss": 0.88930655, "num_input_tokens_seen": 22830570, "step": 1078, "time_per_iteration": 2.910507917404175 }, { "auxiliary_loss_clip": 0.01301782, "auxiliary_loss_mlp": 0.01206759, "balance_loss_clip": 1.01127648, "balance_loss_mlp": 1.00324488, "epoch": 0.12974207899957915, "flos": 11215951850880.0, "grad_norm": 2.3589608273929947, "language_loss": 0.76819986, "learning_rate": 3.896648683582019e-06, "loss": 0.79328525, "num_input_tokens_seen": 22845905, "step": 1079, "time_per_iteration": 2.8266685009002686 }, { "auxiliary_loss_clip": 0.01306475, "auxiliary_loss_mlp": 0.01205354, "balance_loss_clip": 1.01122761, "balance_loss_mlp": 1.00203085, "epoch": 0.12986232189021824, "flos": 24717901148640.0, "grad_norm": 1.9645147742441436, "language_loss": 0.80669129, "learning_rate": 3.896401369995766e-06, "loss": 0.83180964, "num_input_tokens_seen": 22865710, "step": 1080, "time_per_iteration": 2.8638975620269775 }, { "auxiliary_loss_clip": 0.01404773, "auxiliary_loss_mlp": 0.01205157, "balance_loss_clip": 1.01495099, "balance_loss_mlp": 1.00202417, "epoch": 0.12998256478085732, "flos": 23915369348640.0, "grad_norm": 1.8235564246805966, "language_loss": 0.79593444, "learning_rate": 3.896153768726932e-06, "loss": 0.82203364, "num_input_tokens_seen": 22886020, "step": 1081, "time_per_iteration": 2.713181972503662 }, { "auxiliary_loss_clip": 0.01379001, "auxiliary_loss_mlp": 0.01204813, "balance_loss_clip": 1.01433527, "balance_loss_mlp": 1.00206232, "epoch": 0.13010280767149643, "flos": 18624217812480.0, "grad_norm": 2.0372807693139547, "language_loss": 0.87791133, "learning_rate": 3.8959058798130806e-06, "loss": 0.90374947, "num_input_tokens_seen": 22903995, "step": 1082, "time_per_iteration": 2.784740686416626 }, { "auxiliary_loss_clip": 0.0137844, "auxiliary_loss_mlp": 0.0087388, "balance_loss_clip": 1.01433063, "balance_loss_mlp": 1.00025463, "epoch": 0.1302230505621355, "flos": 22784003292960.0, "grad_norm": 1.792256663495603, "language_loss": 0.7523911, "learning_rate": 3.895657703291814e-06, "loss": 0.77491438, "num_input_tokens_seen": 22924100, "step": 1083, "time_per_iteration": 2.792592763900757 }, { "auxiliary_loss_clip": 0.01375541, "auxiliary_loss_mlp": 0.01206477, "balance_loss_clip": 1.0142101, "balance_loss_mlp": 1.00239027, "epoch": 0.1303432934527746, "flos": 21323623363200.0, "grad_norm": 2.400043910228162, "language_loss": 0.79464352, "learning_rate": 3.895409239200781e-06, "loss": 0.82046372, "num_input_tokens_seen": 22939985, "step": 1084, "time_per_iteration": 2.860105276107788 }, { "auxiliary_loss_clip": 0.01392283, "auxiliary_loss_mlp": 0.0120535, "balance_loss_clip": 1.01437688, "balance_loss_mlp": 1.00202703, "epoch": 0.1304635363434137, "flos": 20922501157920.0, "grad_norm": 2.2678847805005407, "language_loss": 0.91521949, "learning_rate": 3.895160487577673e-06, "loss": 0.94119591, "num_input_tokens_seen": 22957555, "step": 1085, "time_per_iteration": 2.7283289432525635 }, { "auxiliary_loss_clip": 0.01364957, "auxiliary_loss_mlp": 0.01200344, "balance_loss_clip": 1.01228476, "balance_loss_mlp": 1.00045431, "epoch": 0.1305837792340528, "flos": 63245691478080.0, "grad_norm": 0.7866451167854347, "language_loss": 0.60983086, "learning_rate": 3.894911448460226e-06, "loss": 0.63548386, "num_input_tokens_seen": 23016870, "step": 1086, "time_per_iteration": 3.142085313796997 }, { "auxiliary_loss_clip": 0.01284463, "auxiliary_loss_mlp": 0.01206255, "balance_loss_clip": 1.01159763, "balance_loss_mlp": 1.0027411, "epoch": 0.13070402212469187, "flos": 26428864053600.0, "grad_norm": 2.0528384195226845, "language_loss": 0.72794116, "learning_rate": 3.8946621218862195e-06, "loss": 0.75284839, "num_input_tokens_seen": 23037870, "step": 1087, "time_per_iteration": 3.081364870071411 }, { "auxiliary_loss_clip": 0.01338714, "auxiliary_loss_mlp": 0.01204897, "balance_loss_clip": 1.01317692, "balance_loss_mlp": 1.00195479, "epoch": 0.13082426501533098, "flos": 27673421921280.0, "grad_norm": 2.1459915276350823, "language_loss": 0.88517523, "learning_rate": 3.894412507893475e-06, "loss": 0.91061133, "num_input_tokens_seen": 23058150, "step": 1088, "time_per_iteration": 4.9172163009643555 }, { "auxiliary_loss_clip": 0.01327911, "auxiliary_loss_mlp": 0.01206424, "balance_loss_clip": 1.01200175, "balance_loss_mlp": 1.00252831, "epoch": 0.13094450790597006, "flos": 24826782119040.0, "grad_norm": 1.9749138052782125, "language_loss": 0.72461808, "learning_rate": 3.894162606519859e-06, "loss": 0.74996138, "num_input_tokens_seen": 23077100, "step": 1089, "time_per_iteration": 4.665381669998169 }, { "auxiliary_loss_clip": 0.01315145, "auxiliary_loss_mlp": 0.0120578, "balance_loss_clip": 1.01195216, "balance_loss_mlp": 1.00226581, "epoch": 0.13106475079660915, "flos": 19062615588480.0, "grad_norm": 2.1210175534156597, "language_loss": 0.77372217, "learning_rate": 3.893912417803282e-06, "loss": 0.79893142, "num_input_tokens_seen": 23096815, "step": 1090, "time_per_iteration": 2.740370750427246 }, { "auxiliary_loss_clip": 0.01342528, "auxiliary_loss_mlp": 0.01206346, "balance_loss_clip": 1.01282978, "balance_loss_mlp": 1.0026412, "epoch": 0.13118499368724823, "flos": 28913201939520.0, "grad_norm": 2.6345145389259574, "language_loss": 0.76952142, "learning_rate": 3.8936619417816975e-06, "loss": 0.79501015, "num_input_tokens_seen": 23117145, "step": 1091, "time_per_iteration": 2.905059337615967 }, { "auxiliary_loss_clip": 0.01331559, "auxiliary_loss_mlp": 0.0120552, "balance_loss_clip": 1.01271725, "balance_loss_mlp": 1.00238705, "epoch": 0.13130523657788734, "flos": 14283407106720.0, "grad_norm": 1.6384006535591245, "language_loss": 0.71370614, "learning_rate": 3.8934111784931015e-06, "loss": 0.73907691, "num_input_tokens_seen": 23134595, "step": 1092, "time_per_iteration": 2.7368710041046143 }, { "auxiliary_loss_clip": 0.01353411, "auxiliary_loss_mlp": 0.01200358, "balance_loss_clip": 1.01184225, "balance_loss_mlp": 1.00046813, "epoch": 0.13142547946852642, "flos": 70174188793440.0, "grad_norm": 0.9194141559302972, "language_loss": 0.59083998, "learning_rate": 3.893160127975535e-06, "loss": 0.61637771, "num_input_tokens_seen": 23195285, "step": 1093, "time_per_iteration": 3.4105849266052246 }, { "auxiliary_loss_clip": 0.01331847, "auxiliary_loss_mlp": 0.01205284, "balance_loss_clip": 1.01260436, "balance_loss_mlp": 1.0025332, "epoch": 0.1315457223591655, "flos": 45806016273120.0, "grad_norm": 3.230187502913906, "language_loss": 0.8096931, "learning_rate": 3.8929087902670826e-06, "loss": 0.83506441, "num_input_tokens_seen": 23216915, "step": 1094, "time_per_iteration": 2.9866294860839844 }, { "auxiliary_loss_clip": 0.01376124, "auxiliary_loss_mlp": 0.01200237, "balance_loss_clip": 1.01190448, "balance_loss_mlp": 1.00034738, "epoch": 0.13166596524980462, "flos": 62881198217280.0, "grad_norm": 1.0672630968295256, "language_loss": 0.60728824, "learning_rate": 3.8926571654058715e-06, "loss": 0.63305181, "num_input_tokens_seen": 23273560, "step": 1095, "time_per_iteration": 3.152944564819336 }, { "auxiliary_loss_clip": 0.01328685, "auxiliary_loss_mlp": 0.01205088, "balance_loss_clip": 1.01234972, "balance_loss_mlp": 1.00195539, "epoch": 0.1317862081404437, "flos": 23586535092960.0, "grad_norm": 2.906490876273844, "language_loss": 0.76686019, "learning_rate": 3.892405253430074e-06, "loss": 0.79219794, "num_input_tokens_seen": 23291080, "step": 1096, "time_per_iteration": 2.800882339477539 }, { "auxiliary_loss_clip": 0.01365321, "auxiliary_loss_mlp": 0.00873949, "balance_loss_clip": 1.01411438, "balance_loss_mlp": 1.00033665, "epoch": 0.13190645103108278, "flos": 20260773270720.0, "grad_norm": 1.9222004005648516, "language_loss": 0.82235783, "learning_rate": 3.892153054377904e-06, "loss": 0.84475052, "num_input_tokens_seen": 23308485, "step": 1097, "time_per_iteration": 2.7990264892578125 }, { "auxiliary_loss_clip": 0.01288004, "auxiliary_loss_mlp": 0.01199944, "balance_loss_clip": 1.01108134, "balance_loss_mlp": 1.00005424, "epoch": 0.13202669392172187, "flos": 53455466414880.0, "grad_norm": 0.9343082363395683, "language_loss": 0.59408271, "learning_rate": 3.891900568287619e-06, "loss": 0.61896217, "num_input_tokens_seen": 23360870, "step": 1098, "time_per_iteration": 3.3093178272247314 }, { "auxiliary_loss_clip": 0.01343966, "auxiliary_loss_mlp": 0.01206934, "balance_loss_clip": 1.01369822, "balance_loss_mlp": 1.00265694, "epoch": 0.13214693681236098, "flos": 15851302907040.0, "grad_norm": 2.395792241489654, "language_loss": 0.723818, "learning_rate": 3.891647795197523e-06, "loss": 0.74932694, "num_input_tokens_seen": 23376910, "step": 1099, "time_per_iteration": 3.3503947257995605 }, { "auxiliary_loss_clip": 0.01354528, "auxiliary_loss_mlp": 0.01206356, "balance_loss_clip": 1.01384842, "balance_loss_mlp": 1.00284171, "epoch": 0.13226717970300006, "flos": 19353850960320.0, "grad_norm": 7.857666217559197, "language_loss": 0.68844527, "learning_rate": 3.8913947351459605e-06, "loss": 0.71405411, "num_input_tokens_seen": 23394450, "step": 1100, "time_per_iteration": 2.727564573287964 }, { "auxiliary_loss_clip": 0.01404424, "auxiliary_loss_mlp": 0.01205371, "balance_loss_clip": 1.01511836, "balance_loss_mlp": 1.00242865, "epoch": 0.13238742259363914, "flos": 20698093336320.0, "grad_norm": 2.198441917721087, "language_loss": 0.6750682, "learning_rate": 3.89114138817132e-06, "loss": 0.70116615, "num_input_tokens_seen": 23411115, "step": 1101, "time_per_iteration": 2.770080089569092 }, { "auxiliary_loss_clip": 0.01379556, "auxiliary_loss_mlp": 0.01205024, "balance_loss_clip": 1.01463914, "balance_loss_mlp": 1.00246358, "epoch": 0.13250766548427825, "flos": 21032459838720.0, "grad_norm": 1.8764165865191444, "language_loss": 0.84385443, "learning_rate": 3.890887754312035e-06, "loss": 0.86970019, "num_input_tokens_seen": 23429360, "step": 1102, "time_per_iteration": 2.7808730602264404 }, { "auxiliary_loss_clip": 0.01364253, "auxiliary_loss_mlp": 0.01205704, "balance_loss_clip": 1.01293945, "balance_loss_mlp": 1.00276256, "epoch": 0.13262790837491734, "flos": 22637882675520.0, "grad_norm": 1.809506969018571, "language_loss": 0.87495959, "learning_rate": 3.890633833606581e-06, "loss": 0.90065914, "num_input_tokens_seen": 23449050, "step": 1103, "time_per_iteration": 2.8309061527252197 }, { "auxiliary_loss_clip": 0.01378038, "auxiliary_loss_mlp": 0.01205124, "balance_loss_clip": 1.01472604, "balance_loss_mlp": 1.00218248, "epoch": 0.13274815126555642, "flos": 19683152223840.0, "grad_norm": 1.9476658212562223, "language_loss": 0.69627404, "learning_rate": 3.890379626093477e-06, "loss": 0.72210562, "num_input_tokens_seen": 23468800, "step": 1104, "time_per_iteration": 2.7775864601135254 }, { "auxiliary_loss_clip": 0.01315556, "auxiliary_loss_mlp": 0.01204907, "balance_loss_clip": 1.01305556, "balance_loss_mlp": 1.00196517, "epoch": 0.1328683941561955, "flos": 21317695956000.0, "grad_norm": 2.1178469804090696, "language_loss": 0.92386711, "learning_rate": 3.890125131811287e-06, "loss": 0.94907165, "num_input_tokens_seen": 23486850, "step": 1105, "time_per_iteration": 2.818239450454712 }, { "auxiliary_loss_clip": 0.01378008, "auxiliary_loss_mlp": 0.0120494, "balance_loss_clip": 1.01386666, "balance_loss_mlp": 1.00237989, "epoch": 0.1329886370468346, "flos": 13699140179040.0, "grad_norm": 1.9063354984327705, "language_loss": 0.75659657, "learning_rate": 3.889870350798618e-06, "loss": 0.78242606, "num_input_tokens_seen": 23504195, "step": 1106, "time_per_iteration": 2.7294259071350098 }, { "auxiliary_loss_clip": 0.01403385, "auxiliary_loss_mlp": 0.01204503, "balance_loss_clip": 1.01426125, "balance_loss_mlp": 1.0019424, "epoch": 0.1331088799374737, "flos": 21032423915040.0, "grad_norm": 1.7429632230858803, "language_loss": 0.78494173, "learning_rate": 3.889615283094119e-06, "loss": 0.81102061, "num_input_tokens_seen": 23523385, "step": 1107, "time_per_iteration": 2.6752724647521973 }, { "auxiliary_loss_clip": 0.0140458, "auxiliary_loss_mlp": 0.01205739, "balance_loss_clip": 1.0150677, "balance_loss_mlp": 1.00222516, "epoch": 0.13322912282811278, "flos": 18260443025280.0, "grad_norm": 2.020909733370813, "language_loss": 0.84581119, "learning_rate": 3.889359928736485e-06, "loss": 0.87191439, "num_input_tokens_seen": 23541330, "step": 1108, "time_per_iteration": 2.664991617202759 }, { "auxiliary_loss_clip": 0.01351274, "auxiliary_loss_mlp": 0.00873846, "balance_loss_clip": 1.01336002, "balance_loss_mlp": 1.00032568, "epoch": 0.1333493657187519, "flos": 24460887834720.0, "grad_norm": 2.31851073424735, "language_loss": 0.91288507, "learning_rate": 3.889104287764451e-06, "loss": 0.93513626, "num_input_tokens_seen": 23561705, "step": 1109, "time_per_iteration": 2.8227925300598145 }, { "auxiliary_loss_clip": 0.01353158, "auxiliary_loss_mlp": 0.0120473, "balance_loss_clip": 1.01333904, "balance_loss_mlp": 1.00197864, "epoch": 0.13346960860939097, "flos": 22158940273920.0, "grad_norm": 2.376528566546786, "language_loss": 0.90212268, "learning_rate": 3.888848360216798e-06, "loss": 0.92770147, "num_input_tokens_seen": 23579350, "step": 1110, "time_per_iteration": 2.7753713130950928 }, { "auxiliary_loss_clip": 0.01361231, "auxiliary_loss_mlp": 0.01199989, "balance_loss_clip": 1.01140642, "balance_loss_mlp": 1.00009882, "epoch": 0.13358985150003005, "flos": 67931250629760.0, "grad_norm": 0.7916124792824024, "language_loss": 0.56581098, "learning_rate": 3.888592146132351e-06, "loss": 0.59142321, "num_input_tokens_seen": 23640620, "step": 1111, "time_per_iteration": 3.4464187622070312 }, { "auxiliary_loss_clip": 0.01378387, "auxiliary_loss_mlp": 0.01205012, "balance_loss_clip": 1.01432252, "balance_loss_mlp": 1.00207007, "epoch": 0.13371009439066917, "flos": 26834297100480.0, "grad_norm": 1.763855440499182, "language_loss": 0.78370905, "learning_rate": 3.888335645549978e-06, "loss": 0.80954301, "num_input_tokens_seen": 23661040, "step": 1112, "time_per_iteration": 2.7690372467041016 }, { "auxiliary_loss_clip": 0.01403743, "auxiliary_loss_mlp": 0.01204667, "balance_loss_clip": 1.01510692, "balance_loss_mlp": 1.00191617, "epoch": 0.13383033728130825, "flos": 26322856977600.0, "grad_norm": 2.2995007018786104, "language_loss": 0.81510067, "learning_rate": 3.888078858508588e-06, "loss": 0.84118474, "num_input_tokens_seen": 23680900, "step": 1113, "time_per_iteration": 2.7364604473114014 }, { "auxiliary_loss_clip": 0.01353522, "auxiliary_loss_mlp": 0.01204542, "balance_loss_clip": 1.0143944, "balance_loss_mlp": 1.00198185, "epoch": 0.13395058017194733, "flos": 22563942236640.0, "grad_norm": 2.0203681278030716, "language_loss": 0.84572983, "learning_rate": 3.8878217850471365e-06, "loss": 0.87131047, "num_input_tokens_seen": 23700815, "step": 1114, "time_per_iteration": 3.910313844680786 }, { "auxiliary_loss_clip": 0.01404622, "auxiliary_loss_mlp": 0.01204975, "balance_loss_clip": 1.0154078, "balance_loss_mlp": 1.00184202, "epoch": 0.13407082306258641, "flos": 25810949846880.0, "grad_norm": 2.8998436068723117, "language_loss": 0.74204075, "learning_rate": 3.887564425204621e-06, "loss": 0.7681368, "num_input_tokens_seen": 23722500, "step": 1115, "time_per_iteration": 5.762893915176392 }, { "auxiliary_loss_clip": 0.01314646, "auxiliary_loss_mlp": 0.01199961, "balance_loss_clip": 1.00829411, "balance_loss_mlp": 1.00007081, "epoch": 0.13419106595322552, "flos": 68338394804160.0, "grad_norm": 0.8399192603703084, "language_loss": 0.54628927, "learning_rate": 3.887306779020083e-06, "loss": 0.57143533, "num_input_tokens_seen": 23777155, "step": 1116, "time_per_iteration": 3.2783076763153076 }, { "auxiliary_loss_clip": 0.01381751, "auxiliary_loss_mlp": 0.01206269, "balance_loss_clip": 1.01456022, "balance_loss_mlp": 1.00294554, "epoch": 0.1343113088438646, "flos": 20449091003040.0, "grad_norm": 2.4598304701559757, "language_loss": 0.704512, "learning_rate": 3.887048846532608e-06, "loss": 0.73039216, "num_input_tokens_seen": 23794130, "step": 1117, "time_per_iteration": 2.925220251083374 }, { "auxiliary_loss_clip": 0.01337354, "auxiliary_loss_mlp": 0.01200025, "balance_loss_clip": 1.01121259, "balance_loss_mlp": 1.00013471, "epoch": 0.1344315517345037, "flos": 67389814234080.0, "grad_norm": 0.7728246650343287, "language_loss": 0.58142984, "learning_rate": 3.8867906277813224e-06, "loss": 0.60680366, "num_input_tokens_seen": 23852285, "step": 1118, "time_per_iteration": 3.257277488708496 }, { "auxiliary_loss_clip": 0.01390472, "auxiliary_loss_mlp": 0.00873876, "balance_loss_clip": 1.01472092, "balance_loss_mlp": 1.00043154, "epoch": 0.1345517946251428, "flos": 40734458785440.0, "grad_norm": 2.3269384129007293, "language_loss": 0.73672354, "learning_rate": 3.886532122805399e-06, "loss": 0.75936699, "num_input_tokens_seen": 23874765, "step": 1119, "time_per_iteration": 2.9678092002868652 }, { "auxiliary_loss_clip": 0.01316247, "auxiliary_loss_mlp": 0.01205254, "balance_loss_clip": 1.01265955, "balance_loss_mlp": 1.00231242, "epoch": 0.13467203751578188, "flos": 22816860251040.0, "grad_norm": 1.7586714771052068, "language_loss": 0.89729285, "learning_rate": 3.886273331644053e-06, "loss": 0.92250788, "num_input_tokens_seen": 23893635, "step": 1120, "time_per_iteration": 2.9144959449768066 }, { "auxiliary_loss_clip": 0.01306417, "auxiliary_loss_mlp": 0.01204424, "balance_loss_clip": 1.01167321, "balance_loss_mlp": 1.00224495, "epoch": 0.13479228040642097, "flos": 17091585856800.0, "grad_norm": 2.09750381475037, "language_loss": 0.82609069, "learning_rate": 3.886014254336542e-06, "loss": 0.85119915, "num_input_tokens_seen": 23910110, "step": 1121, "time_per_iteration": 2.7652363777160645 }, { "auxiliary_loss_clip": 0.01390877, "auxiliary_loss_mlp": 0.01204053, "balance_loss_clip": 1.01484728, "balance_loss_mlp": 1.00187421, "epoch": 0.13491252329706005, "flos": 23730536213280.0, "grad_norm": 1.6749154159578654, "language_loss": 0.9271971, "learning_rate": 3.885754890922168e-06, "loss": 0.9531464, "num_input_tokens_seen": 23930440, "step": 1122, "time_per_iteration": 2.774810314178467 }, { "auxiliary_loss_clip": 0.0128594, "auxiliary_loss_mlp": 0.0120485, "balance_loss_clip": 1.01271021, "balance_loss_mlp": 1.00209856, "epoch": 0.13503276618769916, "flos": 34127072134560.0, "grad_norm": 2.5177114034831716, "language_loss": 0.78563011, "learning_rate": 3.885495241440277e-06, "loss": 0.81053793, "num_input_tokens_seen": 23954535, "step": 1123, "time_per_iteration": 3.258749008178711 }, { "auxiliary_loss_clip": 0.01403122, "auxiliary_loss_mlp": 0.01205241, "balance_loss_clip": 1.01480031, "balance_loss_mlp": 1.00210881, "epoch": 0.13515300907833824, "flos": 17712338034240.0, "grad_norm": 1.7721870498679622, "language_loss": 0.74076867, "learning_rate": 3.885235305930257e-06, "loss": 0.76685232, "num_input_tokens_seen": 23972735, "step": 1124, "time_per_iteration": 2.824251651763916 }, { "auxiliary_loss_clip": 0.01353657, "auxiliary_loss_mlp": 0.0120513, "balance_loss_clip": 1.0143522, "balance_loss_mlp": 1.0023787, "epoch": 0.13527325196897733, "flos": 20260881041760.0, "grad_norm": 1.9502874456364188, "language_loss": 0.85501146, "learning_rate": 3.884975084431539e-06, "loss": 0.88059926, "num_input_tokens_seen": 23987685, "step": 1125, "time_per_iteration": 2.8470568656921387 }, { "auxiliary_loss_clip": 0.01390623, "auxiliary_loss_mlp": 0.00873888, "balance_loss_clip": 1.01487994, "balance_loss_mlp": 1.00038397, "epoch": 0.13539349485961644, "flos": 18186502586400.0, "grad_norm": 2.5705788069390865, "language_loss": 0.91385794, "learning_rate": 3.8847145769836e-06, "loss": 0.93650305, "num_input_tokens_seen": 24004105, "step": 1126, "time_per_iteration": 2.750379800796509 }, { "auxiliary_loss_clip": 0.01403952, "auxiliary_loss_mlp": 0.01205338, "balance_loss_clip": 1.01523757, "balance_loss_mlp": 1.00239635, "epoch": 0.13551373775025552, "flos": 19317473481600.0, "grad_norm": 2.5897216018947256, "language_loss": 0.66663909, "learning_rate": 3.884453783625959e-06, "loss": 0.69273204, "num_input_tokens_seen": 24021715, "step": 1127, "time_per_iteration": 2.768949508666992 }, { "auxiliary_loss_clip": 0.01350221, "auxiliary_loss_mlp": 0.01204681, "balance_loss_clip": 1.01327646, "balance_loss_mlp": 1.00212073, "epoch": 0.1356339806408946, "flos": 20850823910880.0, "grad_norm": 2.1839952066865447, "language_loss": 0.84783214, "learning_rate": 3.884192704398176e-06, "loss": 0.87338114, "num_input_tokens_seen": 24038915, "step": 1128, "time_per_iteration": 2.789409875869751 }, { "auxiliary_loss_clip": 0.01392256, "auxiliary_loss_mlp": 0.01205343, "balance_loss_clip": 1.01524854, "balance_loss_mlp": 1.00221002, "epoch": 0.13575422353153369, "flos": 50476056395040.0, "grad_norm": 1.6755833999786314, "language_loss": 0.74641955, "learning_rate": 3.883931339339858e-06, "loss": 0.77239561, "num_input_tokens_seen": 24063300, "step": 1129, "time_per_iteration": 3.030661106109619 }, { "auxiliary_loss_clip": 0.01387176, "auxiliary_loss_mlp": 0.0120553, "balance_loss_clip": 1.01481938, "balance_loss_mlp": 1.00277925, "epoch": 0.1358744664221728, "flos": 18150807657600.0, "grad_norm": 2.1072181510004815, "language_loss": 0.78841686, "learning_rate": 3.883669688490654e-06, "loss": 0.81434387, "num_input_tokens_seen": 24081070, "step": 1130, "time_per_iteration": 2.7085483074188232 }, { "auxiliary_loss_clip": 0.01365986, "auxiliary_loss_mlp": 0.00873885, "balance_loss_clip": 1.01401329, "balance_loss_mlp": 1.00036108, "epoch": 0.13599470931281188, "flos": 18442977045120.0, "grad_norm": 2.3192922418808797, "language_loss": 0.85777378, "learning_rate": 3.883407751890256e-06, "loss": 0.88017249, "num_input_tokens_seen": 24099675, "step": 1131, "time_per_iteration": 2.7844793796539307 }, { "auxiliary_loss_clip": 0.01354466, "auxiliary_loss_mlp": 0.01205325, "balance_loss_clip": 1.01469672, "balance_loss_mlp": 1.00238323, "epoch": 0.13611495220345096, "flos": 26680776204960.0, "grad_norm": 1.8143505624277614, "language_loss": 0.85780901, "learning_rate": 3.8831455295783994e-06, "loss": 0.883407, "num_input_tokens_seen": 24118925, "step": 1132, "time_per_iteration": 2.8064045906066895 }, { "auxiliary_loss_clip": 0.01368606, "auxiliary_loss_mlp": 0.01203625, "balance_loss_clip": 1.01397657, "balance_loss_mlp": 1.00182784, "epoch": 0.13623519509409007, "flos": 21686248592640.0, "grad_norm": 2.2565866697940353, "language_loss": 0.73897636, "learning_rate": 3.882883021594864e-06, "loss": 0.76469874, "num_input_tokens_seen": 24137065, "step": 1133, "time_per_iteration": 2.8145740032196045 }, { "auxiliary_loss_clip": 0.01348554, "auxiliary_loss_mlp": 0.0120441, "balance_loss_clip": 1.01388526, "balance_loss_mlp": 1.00223088, "epoch": 0.13635543798472916, "flos": 14830398463680.0, "grad_norm": 1.979371474918412, "language_loss": 0.86956954, "learning_rate": 3.8826202279794705e-06, "loss": 0.89509916, "num_input_tokens_seen": 24154125, "step": 1134, "time_per_iteration": 2.7599620819091797 }, { "auxiliary_loss_clip": 0.01404091, "auxiliary_loss_mlp": 0.01204094, "balance_loss_clip": 1.01575875, "balance_loss_mlp": 1.00172424, "epoch": 0.13647568087536824, "flos": 22890333682080.0, "grad_norm": 2.063625377145208, "language_loss": 0.70552969, "learning_rate": 3.882357148772085e-06, "loss": 0.73161149, "num_input_tokens_seen": 24171550, "step": 1135, "time_per_iteration": 2.6215734481811523 }, { "auxiliary_loss_clip": 0.01337763, "auxiliary_loss_mlp": 0.01204718, "balance_loss_clip": 1.01272273, "balance_loss_mlp": 1.00215816, "epoch": 0.13659592376600732, "flos": 19937938269600.0, "grad_norm": 2.2668635236048824, "language_loss": 0.83865863, "learning_rate": 3.882093784012617e-06, "loss": 0.86408347, "num_input_tokens_seen": 24190190, "step": 1136, "time_per_iteration": 2.7773239612579346 }, { "auxiliary_loss_clip": 0.01363689, "auxiliary_loss_mlp": 0.01203763, "balance_loss_clip": 1.0137012, "balance_loss_mlp": 1.0021559, "epoch": 0.13671616665664643, "flos": 21428588652480.0, "grad_norm": 1.7648721963798863, "language_loss": 0.83942074, "learning_rate": 3.881830133741019e-06, "loss": 0.86509526, "num_input_tokens_seen": 24209055, "step": 1137, "time_per_iteration": 2.7297909259796143 }, { "auxiliary_loss_clip": 0.01330317, "auxiliary_loss_mlp": 0.01204352, "balance_loss_clip": 1.01267362, "balance_loss_mlp": 1.00217271, "epoch": 0.13683640954728551, "flos": 22778147733120.0, "grad_norm": 46.55030407398501, "language_loss": 0.76526141, "learning_rate": 3.881566197997285e-06, "loss": 0.79060805, "num_input_tokens_seen": 24225490, "step": 1138, "time_per_iteration": 2.835125684738159 }, { "auxiliary_loss_clip": 0.01352848, "auxiliary_loss_mlp": 0.01203981, "balance_loss_clip": 1.01384079, "balance_loss_mlp": 1.00237429, "epoch": 0.1369566524379246, "flos": 21725895126240.0, "grad_norm": 1.5588664520650064, "language_loss": 0.75076663, "learning_rate": 3.881301976821456e-06, "loss": 0.776335, "num_input_tokens_seen": 24245520, "step": 1139, "time_per_iteration": 2.756279706954956 }, { "auxiliary_loss_clip": 0.01376298, "auxiliary_loss_mlp": 0.01203787, "balance_loss_clip": 1.01396656, "balance_loss_mlp": 1.00179851, "epoch": 0.1370768953285637, "flos": 18624469278240.0, "grad_norm": 2.9753865519565315, "language_loss": 0.90652376, "learning_rate": 3.881037470253612e-06, "loss": 0.93232465, "num_input_tokens_seen": 24265035, "step": 1140, "time_per_iteration": 3.6678760051727295 }, { "auxiliary_loss_clip": 0.01313707, "auxiliary_loss_mlp": 0.01203969, "balance_loss_clip": 1.012676, "balance_loss_mlp": 1.00179052, "epoch": 0.1371971382192028, "flos": 14939530899840.0, "grad_norm": 2.7074977169891215, "language_loss": 0.79039448, "learning_rate": 3.88077267833388e-06, "loss": 0.81557119, "num_input_tokens_seen": 24281550, "step": 1141, "time_per_iteration": 4.731753826141357 }, { "auxiliary_loss_clip": 0.01326536, "auxiliary_loss_mlp": 0.01204541, "balance_loss_clip": 1.01294839, "balance_loss_mlp": 1.00236249, "epoch": 0.13731738110984187, "flos": 19023795299520.0, "grad_norm": 2.662462978189122, "language_loss": 0.83909154, "learning_rate": 3.880507601102427e-06, "loss": 0.86440229, "num_input_tokens_seen": 24299485, "step": 1142, "time_per_iteration": 3.9140682220458984 }, { "auxiliary_loss_clip": 0.01402115, "auxiliary_loss_mlp": 0.01204522, "balance_loss_clip": 1.01501906, "balance_loss_mlp": 1.00215244, "epoch": 0.13743762400048098, "flos": 18187472525760.0, "grad_norm": 2.071453192905789, "language_loss": 0.82453221, "learning_rate": 3.880242238599467e-06, "loss": 0.85059857, "num_input_tokens_seen": 24316010, "step": 1143, "time_per_iteration": 2.7455315589904785 }, { "auxiliary_loss_clip": 0.01402179, "auxiliary_loss_mlp": 0.01203458, "balance_loss_clip": 1.0148226, "balance_loss_mlp": 1.00166082, "epoch": 0.13755786689112007, "flos": 21031992830880.0, "grad_norm": 2.3547377625998704, "language_loss": 0.83334112, "learning_rate": 3.879976590865254e-06, "loss": 0.85939747, "num_input_tokens_seen": 24335465, "step": 1144, "time_per_iteration": 2.7974159717559814 }, { "auxiliary_loss_clip": 0.01353274, "auxiliary_loss_mlp": 0.01204785, "balance_loss_clip": 1.0134207, "balance_loss_mlp": 1.00298786, "epoch": 0.13767810978175915, "flos": 21360647468160.0, "grad_norm": 1.9973829564667598, "language_loss": 0.8728081, "learning_rate": 3.879710657940087e-06, "loss": 0.89838862, "num_input_tokens_seen": 24354415, "step": 1145, "time_per_iteration": 2.7094674110412598 }, { "auxiliary_loss_clip": 0.01377675, "auxiliary_loss_mlp": 0.01203705, "balance_loss_clip": 1.01398873, "balance_loss_mlp": 1.00190735, "epoch": 0.13779835267239823, "flos": 30592098207360.0, "grad_norm": 1.7910178555780978, "language_loss": 0.70055461, "learning_rate": 3.879444439864308e-06, "loss": 0.72636837, "num_input_tokens_seen": 24373990, "step": 1146, "time_per_iteration": 2.8273911476135254 }, { "auxiliary_loss_clip": 0.0139015, "auxiliary_loss_mlp": 0.00873905, "balance_loss_clip": 1.01470804, "balance_loss_mlp": 1.00038695, "epoch": 0.13791859556303734, "flos": 22669877465280.0, "grad_norm": 1.6841431758848129, "language_loss": 0.86028814, "learning_rate": 3.879177936678301e-06, "loss": 0.88292873, "num_input_tokens_seen": 24392995, "step": 1147, "time_per_iteration": 2.784358501434326 }, { "auxiliary_loss_clip": 0.01366161, "auxiliary_loss_mlp": 0.01204219, "balance_loss_clip": 1.01413107, "balance_loss_mlp": 1.00184917, "epoch": 0.13803883845367643, "flos": 35224180208640.0, "grad_norm": 2.0665589129546156, "language_loss": 0.77443749, "learning_rate": 3.878911148422496e-06, "loss": 0.80014133, "num_input_tokens_seen": 24414470, "step": 1148, "time_per_iteration": 2.862670421600342 }, { "auxiliary_loss_clip": 0.01378158, "auxiliary_loss_mlp": 0.01203989, "balance_loss_clip": 1.01432407, "balance_loss_mlp": 1.00161934, "epoch": 0.1381590813443155, "flos": 32014555940160.0, "grad_norm": 2.0480098243813614, "language_loss": 0.70534986, "learning_rate": 3.878644075137364e-06, "loss": 0.73117131, "num_input_tokens_seen": 24435120, "step": 1149, "time_per_iteration": 2.8311421871185303 }, { "auxiliary_loss_clip": 0.01353414, "auxiliary_loss_mlp": 0.01205583, "balance_loss_clip": 1.01407075, "balance_loss_mlp": 1.00321293, "epoch": 0.13827932423495462, "flos": 17821865630880.0, "grad_norm": 2.1532283403168413, "language_loss": 0.79450941, "learning_rate": 3.878376716863418e-06, "loss": 0.82009935, "num_input_tokens_seen": 24451420, "step": 1150, "time_per_iteration": 2.808966875076294 }, { "auxiliary_loss_clip": 0.01364754, "auxiliary_loss_mlp": 0.01204513, "balance_loss_clip": 1.01418412, "balance_loss_mlp": 1.002334, "epoch": 0.1383995671255937, "flos": 19427108849280.0, "grad_norm": 2.383248812228804, "language_loss": 0.7210148, "learning_rate": 3.878109073641219e-06, "loss": 0.7467075, "num_input_tokens_seen": 24470450, "step": 1151, "time_per_iteration": 2.8084511756896973 }, { "auxiliary_loss_clip": 0.01312652, "auxiliary_loss_mlp": 0.01204514, "balance_loss_clip": 1.01224339, "balance_loss_mlp": 1.00233483, "epoch": 0.13851981001623279, "flos": 28296616908960.0, "grad_norm": 1.5611665876095284, "language_loss": 0.81282008, "learning_rate": 3.877841145511366e-06, "loss": 0.83799171, "num_input_tokens_seen": 24493190, "step": 1152, "time_per_iteration": 2.893019437789917 }, { "auxiliary_loss_clip": 0.01391723, "auxiliary_loss_mlp": 0.01204527, "balance_loss_clip": 1.01560616, "balance_loss_mlp": 1.00215757, "epoch": 0.13864005290687187, "flos": 21213089903520.0, "grad_norm": 1.922839565778061, "language_loss": 0.82646966, "learning_rate": 3.8775729325145035e-06, "loss": 0.85243213, "num_input_tokens_seen": 24512425, "step": 1153, "time_per_iteration": 2.7491416931152344 }, { "auxiliary_loss_clip": 0.01309529, "auxiliary_loss_mlp": 0.01198498, "balance_loss_clip": 1.00879872, "balance_loss_mlp": 1.0001334, "epoch": 0.13876029579751098, "flos": 71653428655200.0, "grad_norm": 0.7960456329724781, "language_loss": 0.64721936, "learning_rate": 3.877304434691321e-06, "loss": 0.67229968, "num_input_tokens_seen": 24579275, "step": 1154, "time_per_iteration": 3.524287700653076 }, { "auxiliary_loss_clip": 0.01341724, "auxiliary_loss_mlp": 0.01202427, "balance_loss_clip": 1.0138042, "balance_loss_mlp": 1.00158286, "epoch": 0.13888053868815006, "flos": 21941357951520.0, "grad_norm": 1.8894889531500938, "language_loss": 0.79552847, "learning_rate": 3.877035652082548e-06, "loss": 0.82097006, "num_input_tokens_seen": 24598720, "step": 1155, "time_per_iteration": 3.4202048778533936 }, { "auxiliary_loss_clip": 0.01364783, "auxiliary_loss_mlp": 0.01204601, "balance_loss_clip": 1.01459837, "balance_loss_mlp": 1.00223088, "epoch": 0.13900078157878915, "flos": 19608637006080.0, "grad_norm": 1.8112918621877536, "language_loss": 0.85446334, "learning_rate": 3.87676658472896e-06, "loss": 0.88015711, "num_input_tokens_seen": 24617530, "step": 1156, "time_per_iteration": 2.797696590423584 }, { "auxiliary_loss_clip": 0.0138986, "auxiliary_loss_mlp": 0.0120464, "balance_loss_clip": 1.01425529, "balance_loss_mlp": 1.00246072, "epoch": 0.13912102446942826, "flos": 22638062293920.0, "grad_norm": 2.2973752776816143, "language_loss": 0.85436082, "learning_rate": 3.876497232671372e-06, "loss": 0.88030583, "num_input_tokens_seen": 24637485, "step": 1157, "time_per_iteration": 2.7004804611206055 }, { "auxiliary_loss_clip": 0.01320087, "auxiliary_loss_mlp": 0.01204419, "balance_loss_clip": 1.01221609, "balance_loss_mlp": 1.00223994, "epoch": 0.13924126736006734, "flos": 29643338018880.0, "grad_norm": 2.3028888007625072, "language_loss": 0.83277357, "learning_rate": 3.876227595950647e-06, "loss": 0.85801864, "num_input_tokens_seen": 24656915, "step": 1158, "time_per_iteration": 2.922999143600464 }, { "auxiliary_loss_clip": 0.01401962, "auxiliary_loss_mlp": 0.01203443, "balance_loss_clip": 1.01489162, "balance_loss_mlp": 1.00202656, "epoch": 0.13936151025070642, "flos": 27417665936160.0, "grad_norm": 2.0732615176414946, "language_loss": 0.78859717, "learning_rate": 3.875957674607686e-06, "loss": 0.81465125, "num_input_tokens_seen": 24679190, "step": 1159, "time_per_iteration": 2.7925236225128174 }, { "auxiliary_loss_clip": 0.01389318, "auxiliary_loss_mlp": 0.00873972, "balance_loss_clip": 1.01404703, "balance_loss_mlp": 1.00036025, "epoch": 0.1394817531413455, "flos": 16399336050720.0, "grad_norm": 3.222276514948939, "language_loss": 0.88028753, "learning_rate": 3.8756874686834386e-06, "loss": 0.90292048, "num_input_tokens_seen": 24697405, "step": 1160, "time_per_iteration": 2.718519926071167 }, { "auxiliary_loss_clip": 0.01387461, "auxiliary_loss_mlp": 0.00873872, "balance_loss_clip": 1.0142982, "balance_loss_mlp": 1.00032365, "epoch": 0.13960199603198462, "flos": 30922333486560.0, "grad_norm": 1.5566423782264809, "language_loss": 0.80118316, "learning_rate": 3.875416978218893e-06, "loss": 0.82379651, "num_input_tokens_seen": 24720600, "step": 1161, "time_per_iteration": 2.8557002544403076 }, { "auxiliary_loss_clip": 0.01364937, "auxiliary_loss_mlp": 0.01204999, "balance_loss_clip": 1.01432204, "balance_loss_mlp": 1.00243819, "epoch": 0.1397222389226237, "flos": 18113783552640.0, "grad_norm": 4.023044583220836, "language_loss": 0.82571578, "learning_rate": 3.8751462032550835e-06, "loss": 0.85141516, "num_input_tokens_seen": 24737605, "step": 1162, "time_per_iteration": 2.7134053707122803 }, { "auxiliary_loss_clip": 0.01350482, "auxiliary_loss_mlp": 0.01202597, "balance_loss_clip": 1.01353025, "balance_loss_mlp": 1.00156283, "epoch": 0.13984248181326278, "flos": 16872782129280.0, "grad_norm": 2.459819641090905, "language_loss": 0.83146977, "learning_rate": 3.874875143833085e-06, "loss": 0.85700053, "num_input_tokens_seen": 24755845, "step": 1163, "time_per_iteration": 2.6628124713897705 }, { "auxiliary_loss_clip": 0.01377514, "auxiliary_loss_mlp": 0.01203742, "balance_loss_clip": 1.01394892, "balance_loss_mlp": 1.00194502, "epoch": 0.1399627247039019, "flos": 54121420087200.0, "grad_norm": 2.0823271148614473, "language_loss": 0.68650126, "learning_rate": 3.874603799994019e-06, "loss": 0.71231383, "num_input_tokens_seen": 24779380, "step": 1164, "time_per_iteration": 2.987988233566284 }, { "auxiliary_loss_clip": 0.01351809, "auxiliary_loss_mlp": 0.01203631, "balance_loss_clip": 1.01378846, "balance_loss_mlp": 1.00183344, "epoch": 0.14008296759454097, "flos": 11765529712800.0, "grad_norm": 2.1682348359528887, "language_loss": 0.87222147, "learning_rate": 3.874332171779046e-06, "loss": 0.89777583, "num_input_tokens_seen": 24794260, "step": 1165, "time_per_iteration": 2.7201809883117676 }, { "auxiliary_loss_clip": 0.0133757, "auxiliary_loss_mlp": 0.01202711, "balance_loss_clip": 1.0127542, "balance_loss_mlp": 1.00186741, "epoch": 0.14020321048518006, "flos": 22017525658560.0, "grad_norm": 1.8224883956611941, "language_loss": 0.75602102, "learning_rate": 3.874060259229373e-06, "loss": 0.78142381, "num_input_tokens_seen": 24815835, "step": 1166, "time_per_iteration": 3.7250285148620605 }, { "auxiliary_loss_clip": 0.01378255, "auxiliary_loss_mlp": 0.01204782, "balance_loss_clip": 1.01454961, "balance_loss_mlp": 1.00241196, "epoch": 0.14032345337581917, "flos": 23404324386240.0, "grad_norm": 2.120006223902677, "language_loss": 0.93826127, "learning_rate": 3.873788062386249e-06, "loss": 0.96409172, "num_input_tokens_seen": 24834095, "step": 1167, "time_per_iteration": 3.6838903427124023 }, { "auxiliary_loss_clip": 0.01337955, "auxiliary_loss_mlp": 0.01204408, "balance_loss_clip": 1.01271522, "balance_loss_mlp": 1.00222874, "epoch": 0.14044369626645825, "flos": 29645780829120.0, "grad_norm": 1.7303182628147078, "language_loss": 0.82309878, "learning_rate": 3.873515581290965e-06, "loss": 0.84852242, "num_input_tokens_seen": 24858900, "step": 1168, "time_per_iteration": 3.862926721572876 }, { "auxiliary_loss_clip": 0.01329697, "auxiliary_loss_mlp": 0.01204501, "balance_loss_clip": 1.01292443, "balance_loss_mlp": 1.00251281, "epoch": 0.14056393915709733, "flos": 18332982440640.0, "grad_norm": 3.164661870641983, "language_loss": 0.75712574, "learning_rate": 3.8732428159848575e-06, "loss": 0.78246772, "num_input_tokens_seen": 24877875, "step": 1169, "time_per_iteration": 2.8098719120025635 }, { "auxiliary_loss_clip": 0.01377559, "auxiliary_loss_mlp": 0.01204235, "balance_loss_clip": 1.01553929, "balance_loss_mlp": 1.00224686, "epoch": 0.14068418204773642, "flos": 26687529856800.0, "grad_norm": 1.7891278158316883, "language_loss": 0.78173953, "learning_rate": 3.872969766509304e-06, "loss": 0.80755746, "num_input_tokens_seen": 24898430, "step": 1170, "time_per_iteration": 2.792276382446289 }, { "auxiliary_loss_clip": 0.01310842, "auxiliary_loss_mlp": 0.01198769, "balance_loss_clip": 1.00886178, "balance_loss_mlp": 1.00040436, "epoch": 0.14080442493837553, "flos": 65259349408800.0, "grad_norm": 0.7625474586040255, "language_loss": 0.55648053, "learning_rate": 3.872696432905726e-06, "loss": 0.5815767, "num_input_tokens_seen": 24959250, "step": 1171, "time_per_iteration": 3.400402784347534 }, { "auxiliary_loss_clip": 0.01380249, "auxiliary_loss_mlp": 0.01204658, "balance_loss_clip": 1.01379085, "balance_loss_mlp": 1.00228834, "epoch": 0.1409246678290146, "flos": 25776727788960.0, "grad_norm": 2.931400135861234, "language_loss": 0.71729326, "learning_rate": 3.872422815215589e-06, "loss": 0.74314231, "num_input_tokens_seen": 24978330, "step": 1172, "time_per_iteration": 2.7646596431732178 }, { "auxiliary_loss_clip": 0.01389624, "auxiliary_loss_mlp": 0.01204605, "balance_loss_clip": 1.01420999, "balance_loss_mlp": 1.00242603, "epoch": 0.1410449107196537, "flos": 21868531146720.0, "grad_norm": 2.1349627791410737, "language_loss": 0.74312371, "learning_rate": 3.8721489134803994e-06, "loss": 0.76906604, "num_input_tokens_seen": 24997120, "step": 1173, "time_per_iteration": 2.743565082550049 }, { "auxiliary_loss_clip": 0.01375194, "auxiliary_loss_mlp": 0.01204119, "balance_loss_clip": 1.01374948, "balance_loss_mlp": 1.00232196, "epoch": 0.1411651536102928, "flos": 16684141083840.0, "grad_norm": 2.3525067543857445, "language_loss": 0.72657645, "learning_rate": 3.871874727741707e-06, "loss": 0.75236958, "num_input_tokens_seen": 25014350, "step": 1174, "time_per_iteration": 2.723146438598633 }, { "auxiliary_loss_clip": 0.01375998, "auxiliary_loss_mlp": 0.01203121, "balance_loss_clip": 1.01467681, "balance_loss_mlp": 1.00170517, "epoch": 0.1412853965009319, "flos": 20992274449920.0, "grad_norm": 2.2395753285863047, "language_loss": 0.96598351, "learning_rate": 3.871600258041108e-06, "loss": 0.99177468, "num_input_tokens_seen": 25033875, "step": 1175, "time_per_iteration": 2.670928716659546 }, { "auxiliary_loss_clip": 0.0136789, "auxiliary_loss_mlp": 0.01203998, "balance_loss_clip": 1.01428866, "balance_loss_mlp": 1.00200987, "epoch": 0.14140563939157097, "flos": 20335288488480.0, "grad_norm": 2.295822588910416, "language_loss": 0.85655677, "learning_rate": 3.871325504420238e-06, "loss": 0.8822757, "num_input_tokens_seen": 25052865, "step": 1176, "time_per_iteration": 2.760253429412842 }, { "auxiliary_loss_clip": 0.0140153, "auxiliary_loss_mlp": 0.01203829, "balance_loss_clip": 1.01488566, "balance_loss_mlp": 1.00203133, "epoch": 0.14152588228221005, "flos": 21068837317440.0, "grad_norm": 2.020607406087465, "language_loss": 0.81801486, "learning_rate": 3.871050466920776e-06, "loss": 0.84406841, "num_input_tokens_seen": 25072770, "step": 1177, "time_per_iteration": 2.694315195083618 }, { "auxiliary_loss_clip": 0.01353936, "auxiliary_loss_mlp": 0.01203209, "balance_loss_clip": 1.01327801, "balance_loss_mlp": 1.00198412, "epoch": 0.14164612517284916, "flos": 18223167454560.0, "grad_norm": 2.201889405472648, "language_loss": 0.79725474, "learning_rate": 3.870775145584447e-06, "loss": 0.82282621, "num_input_tokens_seen": 25090550, "step": 1178, "time_per_iteration": 2.818401336669922 }, { "auxiliary_loss_clip": 0.01363911, "auxiliary_loss_mlp": 0.01203828, "balance_loss_clip": 1.01368153, "balance_loss_mlp": 1.00164866, "epoch": 0.14176636806348825, "flos": 22744464530400.0, "grad_norm": 2.605441360955488, "language_loss": 0.65442848, "learning_rate": 3.8704995404530145e-06, "loss": 0.68010592, "num_input_tokens_seen": 25106175, "step": 1179, "time_per_iteration": 2.774763584136963 }, { "auxiliary_loss_clip": 0.01400783, "auxiliary_loss_mlp": 0.01202401, "balance_loss_clip": 1.01451266, "balance_loss_mlp": 1.00174809, "epoch": 0.14188661095412733, "flos": 22091106860640.0, "grad_norm": 1.8830161953669213, "language_loss": 0.85024941, "learning_rate": 3.87022365156829e-06, "loss": 0.87628126, "num_input_tokens_seen": 25126890, "step": 1180, "time_per_iteration": 2.7283430099487305 }, { "auxiliary_loss_clip": 0.01275955, "auxiliary_loss_mlp": 0.01203321, "balance_loss_clip": 1.01096725, "balance_loss_mlp": 1.00171471, "epoch": 0.14200685384476644, "flos": 24352402024800.0, "grad_norm": 2.0424787756410665, "language_loss": 0.81214952, "learning_rate": 3.869947478972123e-06, "loss": 0.83694232, "num_input_tokens_seen": 25147915, "step": 1181, "time_per_iteration": 2.870713233947754 }, { "auxiliary_loss_clip": 0.01388339, "auxiliary_loss_mlp": 0.01202625, "balance_loss_clip": 1.01440728, "balance_loss_mlp": 1.00178158, "epoch": 0.14212709673540552, "flos": 24022058974560.0, "grad_norm": 2.12046994426665, "language_loss": 0.82126141, "learning_rate": 3.869671022706412e-06, "loss": 0.84717107, "num_input_tokens_seen": 25166645, "step": 1182, "time_per_iteration": 2.8515844345092773 }, { "auxiliary_loss_clip": 0.013397, "auxiliary_loss_mlp": 0.0120344, "balance_loss_clip": 1.01341391, "balance_loss_mlp": 1.0018332, "epoch": 0.1422473396260446, "flos": 26431809795360.0, "grad_norm": 2.0714857256804007, "language_loss": 0.64874542, "learning_rate": 3.869394282813092e-06, "loss": 0.67417681, "num_input_tokens_seen": 25185845, "step": 1183, "time_per_iteration": 2.8958516120910645 }, { "auxiliary_loss_clip": 0.01353876, "auxiliary_loss_mlp": 0.01203925, "balance_loss_clip": 1.01388025, "balance_loss_mlp": 1.00212765, "epoch": 0.1423675825166837, "flos": 17055316149120.0, "grad_norm": 2.2719852763627255, "language_loss": 0.8930316, "learning_rate": 3.869117259334147e-06, "loss": 0.91860962, "num_input_tokens_seen": 25203770, "step": 1184, "time_per_iteration": 2.742222547531128 }, { "auxiliary_loss_clip": 0.01388861, "auxiliary_loss_mlp": 0.01203483, "balance_loss_clip": 1.01452565, "balance_loss_mlp": 1.00225794, "epoch": 0.1424878254073228, "flos": 17929489272480.0, "grad_norm": 1.8845708209268637, "language_loss": 0.82235247, "learning_rate": 3.868839952311599e-06, "loss": 0.8482759, "num_input_tokens_seen": 25221725, "step": 1185, "time_per_iteration": 2.840550661087036 }, { "auxiliary_loss_clip": 0.01348456, "auxiliary_loss_mlp": 0.01203849, "balance_loss_clip": 1.01287234, "balance_loss_mlp": 1.00205159, "epoch": 0.14260806829796188, "flos": 20303617011840.0, "grad_norm": 2.393777335638762, "language_loss": 0.80843258, "learning_rate": 3.868562361787516e-06, "loss": 0.83395559, "num_input_tokens_seen": 25240855, "step": 1186, "time_per_iteration": 2.740417242050171 }, { "auxiliary_loss_clip": 0.0129311, "auxiliary_loss_mlp": 0.01203716, "balance_loss_clip": 1.01149392, "balance_loss_mlp": 1.00229979, "epoch": 0.14272831118860096, "flos": 23185736200800.0, "grad_norm": 2.354705837026507, "language_loss": 0.68843782, "learning_rate": 3.868284487804009e-06, "loss": 0.71340609, "num_input_tokens_seen": 25260085, "step": 1187, "time_per_iteration": 2.8954696655273438 }, { "auxiliary_loss_clip": 0.01375167, "auxiliary_loss_mlp": 0.01202715, "balance_loss_clip": 1.01441157, "balance_loss_mlp": 1.00148952, "epoch": 0.14284855407924008, "flos": 27232222098240.0, "grad_norm": 1.6167317440728861, "language_loss": 0.7804116, "learning_rate": 3.86800633040323e-06, "loss": 0.80619037, "num_input_tokens_seen": 25280675, "step": 1188, "time_per_iteration": 2.7854297161102295 }, { "auxiliary_loss_clip": 0.01349729, "auxiliary_loss_mlp": 0.00873725, "balance_loss_clip": 1.01346421, "balance_loss_mlp": 1.00027764, "epoch": 0.14296879696987916, "flos": 28184215417920.0, "grad_norm": 2.2637925897417275, "language_loss": 0.78218842, "learning_rate": 3.867727889627376e-06, "loss": 0.80442297, "num_input_tokens_seen": 25300290, "step": 1189, "time_per_iteration": 2.8425333499908447 }, { "auxiliary_loss_clip": 0.01338552, "auxiliary_loss_mlp": 0.0120324, "balance_loss_clip": 1.01376188, "balance_loss_mlp": 1.00163269, "epoch": 0.14308903986051824, "flos": 19390300286400.0, "grad_norm": 2.4517228725645763, "language_loss": 0.78323889, "learning_rate": 3.867449165518687e-06, "loss": 0.80865681, "num_input_tokens_seen": 25316760, "step": 1190, "time_per_iteration": 2.787273645401001 }, { "auxiliary_loss_clip": 0.01400641, "auxiliary_loss_mlp": 0.00873794, "balance_loss_clip": 1.01399684, "balance_loss_mlp": 1.00022888, "epoch": 0.14320928275115732, "flos": 17457516064800.0, "grad_norm": 5.755635658613133, "language_loss": 0.71216476, "learning_rate": 3.867170158119444e-06, "loss": 0.73490912, "num_input_tokens_seen": 25335760, "step": 1191, "time_per_iteration": 2.691046953201294 }, { "auxiliary_loss_clip": 0.01401617, "auxiliary_loss_mlp": 0.01203499, "balance_loss_clip": 1.01500297, "balance_loss_mlp": 1.00189233, "epoch": 0.14332952564179643, "flos": 21466079765280.0, "grad_norm": 2.428110434734086, "language_loss": 0.75581694, "learning_rate": 3.866890867471972e-06, "loss": 0.7818681, "num_input_tokens_seen": 25354230, "step": 1192, "time_per_iteration": 3.5874710083007812 }, { "auxiliary_loss_clip": 0.0137719, "auxiliary_loss_mlp": 0.01203963, "balance_loss_clip": 1.0144155, "balance_loss_mlp": 1.00216532, "epoch": 0.14344976853243552, "flos": 16397001011520.0, "grad_norm": 2.632980931876501, "language_loss": 0.89594114, "learning_rate": 3.86661129361864e-06, "loss": 0.92175269, "num_input_tokens_seen": 25368720, "step": 1193, "time_per_iteration": 3.6944801807403564 }, { "auxiliary_loss_clip": 0.01349637, "auxiliary_loss_mlp": 0.01203968, "balance_loss_clip": 1.01302862, "balance_loss_mlp": 1.00217009, "epoch": 0.1435700114230746, "flos": 18916746436800.0, "grad_norm": 2.3696195120090335, "language_loss": 0.85840017, "learning_rate": 3.866331436601859e-06, "loss": 0.88393629, "num_input_tokens_seen": 25386715, "step": 1194, "time_per_iteration": 4.4511401653289795 }, { "auxiliary_loss_clip": 0.01401069, "auxiliary_loss_mlp": 0.01203363, "balance_loss_clip": 1.01521385, "balance_loss_mlp": 1.00175595, "epoch": 0.1436902543137137, "flos": 19755404249760.0, "grad_norm": 1.9037976189408985, "language_loss": 0.73690879, "learning_rate": 3.866051296464083e-06, "loss": 0.7629531, "num_input_tokens_seen": 25405550, "step": 1195, "time_per_iteration": 2.71989369392395 }, { "auxiliary_loss_clip": 0.01400937, "auxiliary_loss_mlp": 0.00873706, "balance_loss_clip": 1.01438594, "balance_loss_mlp": 1.00022483, "epoch": 0.1438104972043528, "flos": 14684816701440.0, "grad_norm": 3.0471511232169464, "language_loss": 0.85061038, "learning_rate": 3.86577087324781e-06, "loss": 0.87335682, "num_input_tokens_seen": 25422040, "step": 1196, "time_per_iteration": 2.684997797012329 }, { "auxiliary_loss_clip": 0.01375221, "auxiliary_loss_mlp": 0.01202658, "balance_loss_clip": 1.01452947, "balance_loss_mlp": 1.00162327, "epoch": 0.14393074009499188, "flos": 17092340254080.0, "grad_norm": 2.5722956089935782, "language_loss": 0.77201039, "learning_rate": 3.865490166995578e-06, "loss": 0.79778916, "num_input_tokens_seen": 25440270, "step": 1197, "time_per_iteration": 2.6973917484283447 }, { "auxiliary_loss_clip": 0.01375771, "auxiliary_loss_mlp": 0.01203437, "balance_loss_clip": 1.01395154, "balance_loss_mlp": 1.0018301, "epoch": 0.144050982985631, "flos": 30476212119360.0, "grad_norm": 2.1376264423000464, "language_loss": 0.84170645, "learning_rate": 3.86520917774997e-06, "loss": 0.86749852, "num_input_tokens_seen": 25459705, "step": 1198, "time_per_iteration": 2.7977004051208496 }, { "auxiliary_loss_clip": 0.01375235, "auxiliary_loss_mlp": 0.01203778, "balance_loss_clip": 1.0145812, "balance_loss_mlp": 1.00236177, "epoch": 0.14417122587627007, "flos": 17858494575360.0, "grad_norm": 2.125289052818784, "language_loss": 0.75277352, "learning_rate": 3.864927905553614e-06, "loss": 0.77856362, "num_input_tokens_seen": 25477615, "step": 1199, "time_per_iteration": 2.7679007053375244 }, { "auxiliary_loss_clip": 0.01336267, "auxiliary_loss_mlp": 0.01203221, "balance_loss_clip": 1.01280951, "balance_loss_mlp": 1.00199616, "epoch": 0.14429146876690915, "flos": 21613924719360.0, "grad_norm": 1.862325623946837, "language_loss": 0.89014214, "learning_rate": 3.8646463504491765e-06, "loss": 0.915537, "num_input_tokens_seen": 25497750, "step": 1200, "time_per_iteration": 2.793971300125122 }, { "auxiliary_loss_clip": 0.01376719, "auxiliary_loss_mlp": 0.0120381, "balance_loss_clip": 1.01389575, "balance_loss_mlp": 1.00201201, "epoch": 0.14441171165754824, "flos": 23258131921440.0, "grad_norm": 1.801733615592377, "language_loss": 0.83129537, "learning_rate": 3.8643645124793705e-06, "loss": 0.85710073, "num_input_tokens_seen": 25516650, "step": 1201, "time_per_iteration": 2.783717632293701 }, { "auxiliary_loss_clip": 0.01376557, "auxiliary_loss_mlp": 0.01202951, "balance_loss_clip": 1.01385128, "balance_loss_mlp": 1.00172544, "epoch": 0.14453195454818735, "flos": 42854231563200.0, "grad_norm": 1.5963550906133837, "language_loss": 0.74737167, "learning_rate": 3.8640823916869515e-06, "loss": 0.77316678, "num_input_tokens_seen": 25540960, "step": 1202, "time_per_iteration": 2.8995909690856934 }, { "auxiliary_loss_clip": 0.01400529, "auxiliary_loss_mlp": 0.01203136, "balance_loss_clip": 1.01424646, "balance_loss_mlp": 1.00191116, "epoch": 0.14465219743882643, "flos": 27235886313600.0, "grad_norm": 1.5091656271158829, "language_loss": 0.78381807, "learning_rate": 3.863799988114714e-06, "loss": 0.80985475, "num_input_tokens_seen": 25562990, "step": 1203, "time_per_iteration": 2.761199474334717 }, { "auxiliary_loss_clip": 0.01400687, "auxiliary_loss_mlp": 0.01203915, "balance_loss_clip": 1.014184, "balance_loss_mlp": 1.00192642, "epoch": 0.1447724403294655, "flos": 16690715117280.0, "grad_norm": 2.3595322679579267, "language_loss": 0.71174622, "learning_rate": 3.863517301805502e-06, "loss": 0.73779225, "num_input_tokens_seen": 25581380, "step": 1204, "time_per_iteration": 2.7100212574005127 }, { "auxiliary_loss_clip": 0.01335546, "auxiliary_loss_mlp": 0.01203283, "balance_loss_clip": 1.01294219, "balance_loss_mlp": 1.00205755, "epoch": 0.14489268322010462, "flos": 20073748790880.0, "grad_norm": 2.3231794906132284, "language_loss": 0.97151387, "learning_rate": 3.863234332802196e-06, "loss": 0.99690205, "num_input_tokens_seen": 25593585, "step": 1205, "time_per_iteration": 2.754676103591919 }, { "auxiliary_loss_clip": 0.01374277, "auxiliary_loss_mlp": 0.0120345, "balance_loss_clip": 1.01404822, "balance_loss_mlp": 1.0022248, "epoch": 0.1450129261107437, "flos": 27125640243360.0, "grad_norm": 2.3014655228271663, "language_loss": 0.74137783, "learning_rate": 3.862951081147723e-06, "loss": 0.76715505, "num_input_tokens_seen": 25613750, "step": 1206, "time_per_iteration": 2.806460380554199 }, { "auxiliary_loss_clip": 0.01383655, "auxiliary_loss_mlp": 0.01203565, "balance_loss_clip": 1.01434577, "balance_loss_mlp": 1.00214911, "epoch": 0.1451331690013828, "flos": 25702356265920.0, "grad_norm": 2.1446620445412834, "language_loss": 0.78298312, "learning_rate": 3.862667546885053e-06, "loss": 0.80885535, "num_input_tokens_seen": 25632300, "step": 1207, "time_per_iteration": 2.7831616401672363 }, { "auxiliary_loss_clip": 0.01366588, "auxiliary_loss_mlp": 0.01203744, "balance_loss_clip": 1.01323676, "balance_loss_mlp": 1.0023278, "epoch": 0.14525341189202187, "flos": 25737404568480.0, "grad_norm": 1.824090914575218, "language_loss": 0.73123312, "learning_rate": 3.8623837300571965e-06, "loss": 0.75693643, "num_input_tokens_seen": 25651285, "step": 1208, "time_per_iteration": 2.717665672302246 }, { "auxiliary_loss_clip": 0.01401042, "auxiliary_loss_mlp": 0.01203569, "balance_loss_clip": 1.01459241, "balance_loss_mlp": 1.00215268, "epoch": 0.14537365478266098, "flos": 23073909488640.0, "grad_norm": 1.9162637387434602, "language_loss": 0.83889484, "learning_rate": 3.8620996307072085e-06, "loss": 0.86494094, "num_input_tokens_seen": 25671990, "step": 1209, "time_per_iteration": 2.7057149410247803 }, { "auxiliary_loss_clip": 0.01354152, "auxiliary_loss_mlp": 0.01202539, "balance_loss_clip": 1.01351643, "balance_loss_mlp": 1.00169504, "epoch": 0.14549389767330007, "flos": 20595032002080.0, "grad_norm": 2.0228702673947017, "language_loss": 0.64835036, "learning_rate": 3.861815248878188e-06, "loss": 0.67391729, "num_input_tokens_seen": 25689475, "step": 1210, "time_per_iteration": 2.7511889934539795 }, { "auxiliary_loss_clip": 0.0134863, "auxiliary_loss_mlp": 0.01202503, "balance_loss_clip": 1.01334476, "balance_loss_mlp": 1.00185013, "epoch": 0.14561414056393915, "flos": 15121813453920.0, "grad_norm": 2.61801621767564, "language_loss": 0.80129993, "learning_rate": 3.861530584613274e-06, "loss": 0.82681125, "num_input_tokens_seen": 25707475, "step": 1211, "time_per_iteration": 2.7125344276428223 }, { "auxiliary_loss_clip": 0.01375086, "auxiliary_loss_mlp": 0.00873654, "balance_loss_clip": 1.01410985, "balance_loss_mlp": 1.00016916, "epoch": 0.14573438345457826, "flos": 19427504009760.0, "grad_norm": 2.9050414602875723, "language_loss": 0.82728899, "learning_rate": 3.86124563795565e-06, "loss": 0.84977639, "num_input_tokens_seen": 25726290, "step": 1212, "time_per_iteration": 2.685851573944092 }, { "auxiliary_loss_clip": 0.01400421, "auxiliary_loss_mlp": 0.01203401, "balance_loss_clip": 1.01463175, "balance_loss_mlp": 1.00236702, "epoch": 0.14585462634521734, "flos": 24828434608320.0, "grad_norm": 2.064687438202166, "language_loss": 0.69863546, "learning_rate": 3.860960408948543e-06, "loss": 0.72467363, "num_input_tokens_seen": 25748040, "step": 1213, "time_per_iteration": 2.7476131916046143 }, { "auxiliary_loss_clip": 0.0137319, "auxiliary_loss_mlp": 0.01201975, "balance_loss_clip": 1.01322687, "balance_loss_mlp": 1.00151265, "epoch": 0.14597486923585642, "flos": 15448635983520.0, "grad_norm": 3.5553715786724305, "language_loss": 0.89581579, "learning_rate": 3.860674897635222e-06, "loss": 0.92156744, "num_input_tokens_seen": 25764525, "step": 1214, "time_per_iteration": 2.7301323413848877 }, { "auxiliary_loss_clip": 0.01374761, "auxiliary_loss_mlp": 0.01202795, "balance_loss_clip": 1.01396787, "balance_loss_mlp": 1.00156999, "epoch": 0.1460951121264955, "flos": 16655163883200.0, "grad_norm": 1.9380864004083849, "language_loss": 0.83419812, "learning_rate": 3.860389104058998e-06, "loss": 0.85997367, "num_input_tokens_seen": 25782755, "step": 1215, "time_per_iteration": 2.8193485736846924 }, { "auxiliary_loss_clip": 0.01353142, "auxiliary_loss_mlp": 0.01203541, "balance_loss_clip": 1.01313043, "balance_loss_mlp": 1.00193393, "epoch": 0.14621535501713462, "flos": 24863303292480.0, "grad_norm": 1.8114509840951003, "language_loss": 0.7253207, "learning_rate": 3.860103028263227e-06, "loss": 0.75088751, "num_input_tokens_seen": 25805860, "step": 1216, "time_per_iteration": 2.8342020511627197 }, { "auxiliary_loss_clip": 0.01328867, "auxiliary_loss_mlp": 0.01204098, "balance_loss_clip": 1.01241231, "balance_loss_mlp": 1.00230002, "epoch": 0.1463355979077737, "flos": 25228012095360.0, "grad_norm": 1.943406347331017, "language_loss": 0.69996399, "learning_rate": 3.859816670291304e-06, "loss": 0.72529364, "num_input_tokens_seen": 25824955, "step": 1217, "time_per_iteration": 2.8380513191223145 }, { "auxiliary_loss_clip": 0.01294742, "auxiliary_loss_mlp": 0.01203319, "balance_loss_clip": 1.01158512, "balance_loss_mlp": 1.00190258, "epoch": 0.14645584079841278, "flos": 22054154603040.0, "grad_norm": 2.0542558171772938, "language_loss": 0.89835882, "learning_rate": 3.859530030186672e-06, "loss": 0.92333949, "num_input_tokens_seen": 25841965, "step": 1218, "time_per_iteration": 2.854975700378418 }, { "auxiliary_loss_clip": 0.01359992, "auxiliary_loss_mlp": 0.01203263, "balance_loss_clip": 1.0138762, "balance_loss_mlp": 1.00165606, "epoch": 0.1465760836890519, "flos": 23623882511040.0, "grad_norm": 2.365130869634339, "language_loss": 0.82685935, "learning_rate": 3.859243107992813e-06, "loss": 0.85249186, "num_input_tokens_seen": 25860770, "step": 1219, "time_per_iteration": 4.617147445678711 }, { "auxiliary_loss_clip": 0.01358711, "auxiliary_loss_mlp": 0.0120378, "balance_loss_clip": 1.01321959, "balance_loss_mlp": 1.00198257, "epoch": 0.14669632657969098, "flos": 37407906642240.0, "grad_norm": 3.7809465146505508, "language_loss": 0.7828728, "learning_rate": 3.858955903753252e-06, "loss": 0.80849767, "num_input_tokens_seen": 25879410, "step": 1220, "time_per_iteration": 3.8105220794677734 }, { "auxiliary_loss_clip": 0.01386648, "auxiliary_loss_mlp": 0.01203211, "balance_loss_clip": 1.01408625, "balance_loss_mlp": 1.00217664, "epoch": 0.14681656947033006, "flos": 28365923193120.0, "grad_norm": 1.89201523636362, "language_loss": 0.8339498, "learning_rate": 3.858668417511559e-06, "loss": 0.85984838, "num_input_tokens_seen": 25902160, "step": 1221, "time_per_iteration": 2.7991065979003906 }, { "auxiliary_loss_clip": 0.01363618, "auxiliary_loss_mlp": 0.01203766, "balance_loss_clip": 1.01412225, "balance_loss_mlp": 1.00215924, "epoch": 0.14693681236096917, "flos": 18479498218560.0, "grad_norm": 2.7164716853777358, "language_loss": 0.76926154, "learning_rate": 3.8583806493113445e-06, "loss": 0.79493535, "num_input_tokens_seen": 25920505, "step": 1222, "time_per_iteration": 2.711432933807373 }, { "auxiliary_loss_clip": 0.0137375, "auxiliary_loss_mlp": 0.01203966, "balance_loss_clip": 1.0134325, "balance_loss_mlp": 1.00255024, "epoch": 0.14705705525160825, "flos": 20777817487680.0, "grad_norm": 1.934591478127707, "language_loss": 0.81885207, "learning_rate": 3.858092599196263e-06, "loss": 0.84462923, "num_input_tokens_seen": 25938460, "step": 1223, "time_per_iteration": 2.780458688735962 }, { "auxiliary_loss_clip": 0.0137513, "auxiliary_loss_mlp": 0.01202571, "balance_loss_clip": 1.01446223, "balance_loss_mlp": 1.00153685, "epoch": 0.14717729814224734, "flos": 29932956825120.0, "grad_norm": 4.258829843153812, "language_loss": 0.82498848, "learning_rate": 3.857804267210012e-06, "loss": 0.85076547, "num_input_tokens_seen": 25957760, "step": 1224, "time_per_iteration": 2.7961924076080322 }, { "auxiliary_loss_clip": 0.01348512, "auxiliary_loss_mlp": 0.01203807, "balance_loss_clip": 1.01264632, "balance_loss_mlp": 1.00239074, "epoch": 0.14729754103288642, "flos": 20047501789920.0, "grad_norm": 5.502078391856664, "language_loss": 0.88420606, "learning_rate": 3.857515653396331e-06, "loss": 0.90972918, "num_input_tokens_seen": 25974970, "step": 1225, "time_per_iteration": 2.8204729557037354 }, { "auxiliary_loss_clip": 0.01322506, "auxiliary_loss_mlp": 0.01203096, "balance_loss_clip": 1.01274085, "balance_loss_mlp": 1.00187063, "epoch": 0.14741778392352553, "flos": 19281527087040.0, "grad_norm": 3.901807612596251, "language_loss": 0.86524612, "learning_rate": 3.857226757799002e-06, "loss": 0.8905021, "num_input_tokens_seen": 25992525, "step": 1226, "time_per_iteration": 2.8002007007598877 }, { "auxiliary_loss_clip": 0.01362342, "auxiliary_loss_mlp": 0.01203332, "balance_loss_clip": 1.01402533, "balance_loss_mlp": 1.00172567, "epoch": 0.1475380268141646, "flos": 25411120894080.0, "grad_norm": 2.3007244776579734, "language_loss": 0.74531209, "learning_rate": 3.85693758046185e-06, "loss": 0.77096885, "num_input_tokens_seen": 26010815, "step": 1227, "time_per_iteration": 2.761817216873169 }, { "auxiliary_loss_clip": 0.01400844, "auxiliary_loss_mlp": 0.01203348, "balance_loss_clip": 1.01505971, "balance_loss_mlp": 1.00212276, "epoch": 0.1476582697048037, "flos": 20847662627040.0, "grad_norm": 2.3447442102021663, "language_loss": 0.82920402, "learning_rate": 3.8566481214287435e-06, "loss": 0.85524595, "num_input_tokens_seen": 26028935, "step": 1228, "time_per_iteration": 2.7017860412597656 }, { "auxiliary_loss_clip": 0.0136319, "auxiliary_loss_mlp": 0.01203711, "balance_loss_clip": 1.01451612, "balance_loss_mlp": 1.00229502, "epoch": 0.1477785125954428, "flos": 14028118129440.0, "grad_norm": 4.140815136258555, "language_loss": 0.90745449, "learning_rate": 3.8563583807435935e-06, "loss": 0.93312347, "num_input_tokens_seen": 26045080, "step": 1229, "time_per_iteration": 2.750249147415161 }, { "auxiliary_loss_clip": 0.01376941, "auxiliary_loss_mlp": 0.00873602, "balance_loss_clip": 1.01425028, "balance_loss_mlp": 1.00007439, "epoch": 0.1478987554860819, "flos": 20516708874240.0, "grad_norm": 1.9886662069152712, "language_loss": 0.77842999, "learning_rate": 3.856068358450353e-06, "loss": 0.80093539, "num_input_tokens_seen": 26065030, "step": 1230, "time_per_iteration": 2.7455432415008545 }, { "auxiliary_loss_clip": 0.01348394, "auxiliary_loss_mlp": 0.01202435, "balance_loss_clip": 1.01321483, "balance_loss_mlp": 1.00159144, "epoch": 0.14801899837672097, "flos": 17857021704480.0, "grad_norm": 2.3567592589168562, "language_loss": 0.85856777, "learning_rate": 3.8557780545930186e-06, "loss": 0.88407606, "num_input_tokens_seen": 26083445, "step": 1231, "time_per_iteration": 2.7299017906188965 }, { "auxiliary_loss_clip": 0.01349904, "auxiliary_loss_mlp": 0.01203424, "balance_loss_clip": 1.01276684, "balance_loss_mlp": 1.0018177, "epoch": 0.14813924126736006, "flos": 20881417677120.0, "grad_norm": 1.7705530279813644, "language_loss": 0.79206449, "learning_rate": 3.855487469215628e-06, "loss": 0.81759775, "num_input_tokens_seen": 26102375, "step": 1232, "time_per_iteration": 2.7235586643218994 }, { "auxiliary_loss_clip": 0.01335433, "auxiliary_loss_mlp": 0.01202883, "balance_loss_clip": 1.01297772, "balance_loss_mlp": 1.00165772, "epoch": 0.14825948415799917, "flos": 37414085515200.0, "grad_norm": 1.8890063672030353, "language_loss": 0.72339749, "learning_rate": 3.855196602362264e-06, "loss": 0.74878067, "num_input_tokens_seen": 26125295, "step": 1233, "time_per_iteration": 2.9366703033447266 }, { "auxiliary_loss_clip": 0.0137821, "auxiliary_loss_mlp": 0.01203309, "balance_loss_clip": 1.01370609, "balance_loss_mlp": 1.0018934, "epoch": 0.14837972704863825, "flos": 22014651764160.0, "grad_norm": 3.4389455326063043, "language_loss": 0.93961877, "learning_rate": 3.854905454077051e-06, "loss": 0.96543396, "num_input_tokens_seen": 26142905, "step": 1234, "time_per_iteration": 2.706873655319214 }, { "auxiliary_loss_clip": 0.01289894, "auxiliary_loss_mlp": 0.01203817, "balance_loss_clip": 1.01161027, "balance_loss_mlp": 1.0022099, "epoch": 0.14849996993927733, "flos": 20996333825760.0, "grad_norm": 1.8476536934681504, "language_loss": 0.88168508, "learning_rate": 3.854614024404155e-06, "loss": 0.90662217, "num_input_tokens_seen": 26161215, "step": 1235, "time_per_iteration": 3.0152177810668945 }, { "auxiliary_loss_clip": 0.01360905, "auxiliary_loss_mlp": 0.01202846, "balance_loss_clip": 1.01290822, "balance_loss_mlp": 1.00181115, "epoch": 0.14862021282991644, "flos": 20048004721440.0, "grad_norm": 1.9787155184534586, "language_loss": 0.89177978, "learning_rate": 3.8543223133877865e-06, "loss": 0.91741729, "num_input_tokens_seen": 26179810, "step": 1236, "time_per_iteration": 2.9053850173950195 }, { "auxiliary_loss_clip": 0.01375265, "auxiliary_loss_mlp": 0.01203825, "balance_loss_clip": 1.01404631, "balance_loss_mlp": 1.00183702, "epoch": 0.14874045572055553, "flos": 22712038656480.0, "grad_norm": 2.6471766272143054, "language_loss": 0.88002354, "learning_rate": 3.854030321072198e-06, "loss": 0.90581441, "num_input_tokens_seen": 26199715, "step": 1237, "time_per_iteration": 2.812110662460327 }, { "auxiliary_loss_clip": 0.01344296, "auxiliary_loss_mlp": 0.01202969, "balance_loss_clip": 1.0129168, "balance_loss_mlp": 1.00155258, "epoch": 0.1488606986111946, "flos": 25411300512480.0, "grad_norm": 1.9795560458476782, "language_loss": 0.73049933, "learning_rate": 3.853738047501682e-06, "loss": 0.75597203, "num_input_tokens_seen": 26220275, "step": 1238, "time_per_iteration": 2.801676034927368 }, { "auxiliary_loss_clip": 0.01376871, "auxiliary_loss_mlp": 0.01203997, "balance_loss_clip": 1.01474595, "balance_loss_mlp": 1.00219989, "epoch": 0.1489809415018337, "flos": 17018759052000.0, "grad_norm": 2.674926374278635, "language_loss": 0.77850223, "learning_rate": 3.85344549272058e-06, "loss": 0.80431092, "num_input_tokens_seen": 26238255, "step": 1239, "time_per_iteration": 2.7308828830718994 }, { "auxiliary_loss_clip": 0.01388275, "auxiliary_loss_mlp": 0.01203578, "balance_loss_clip": 1.01489949, "balance_loss_mlp": 1.00197172, "epoch": 0.1491011843924728, "flos": 33659409768480.0, "grad_norm": 1.9343577616433147, "language_loss": 0.82252252, "learning_rate": 3.853152656773269e-06, "loss": 0.84844112, "num_input_tokens_seen": 26259690, "step": 1240, "time_per_iteration": 2.86106538772583 }, { "auxiliary_loss_clip": 0.01351693, "auxiliary_loss_mlp": 0.01203621, "balance_loss_clip": 1.01344085, "balance_loss_mlp": 1.00163317, "epoch": 0.14922142728311188, "flos": 21179011540320.0, "grad_norm": 2.5381209628009977, "language_loss": 0.85023224, "learning_rate": 3.852859539704174e-06, "loss": 0.87578541, "num_input_tokens_seen": 26278990, "step": 1241, "time_per_iteration": 2.7212605476379395 }, { "auxiliary_loss_clip": 0.01318381, "auxiliary_loss_mlp": 0.01203769, "balance_loss_clip": 1.01139069, "balance_loss_mlp": 1.00235283, "epoch": 0.14934167017375097, "flos": 29860561104480.0, "grad_norm": 2.0372323160471053, "language_loss": 0.76449209, "learning_rate": 3.85256614155776e-06, "loss": 0.78971362, "num_input_tokens_seen": 26299120, "step": 1242, "time_per_iteration": 2.910153388977051 }, { "auxiliary_loss_clip": 0.01387876, "auxiliary_loss_mlp": 0.01203273, "balance_loss_clip": 1.01442456, "balance_loss_mlp": 1.00166678, "epoch": 0.14946191306439008, "flos": 17019226059840.0, "grad_norm": 1.7902208492501286, "language_loss": 0.74757707, "learning_rate": 3.852272462378535e-06, "loss": 0.77348864, "num_input_tokens_seen": 26316995, "step": 1243, "time_per_iteration": 2.7404088973999023 }, { "auxiliary_loss_clip": 0.013569, "auxiliary_loss_mlp": 0.01202875, "balance_loss_clip": 1.01253247, "balance_loss_mlp": 1.00203156, "epoch": 0.14958215595502916, "flos": 15669056276640.0, "grad_norm": 2.1779817344483146, "language_loss": 0.77833027, "learning_rate": 3.85197850221105e-06, "loss": 0.80392802, "num_input_tokens_seen": 26333295, "step": 1244, "time_per_iteration": 2.8156142234802246 }, { "auxiliary_loss_clip": 0.01374084, "auxiliary_loss_mlp": 0.01202793, "balance_loss_clip": 1.01376128, "balance_loss_mlp": 1.00156784, "epoch": 0.14970239884566824, "flos": 33108574577760.0, "grad_norm": 2.178051678623162, "language_loss": 0.75578141, "learning_rate": 3.851684261099899e-06, "loss": 0.78155023, "num_input_tokens_seen": 26355035, "step": 1245, "time_per_iteration": 5.553380966186523 }, { "auxiliary_loss_clip": 0.01376409, "auxiliary_loss_mlp": 0.01203743, "balance_loss_clip": 1.01484203, "balance_loss_mlp": 1.00232661, "epoch": 0.14982264173630733, "flos": 17821254928320.0, "grad_norm": 1.9656727314076257, "language_loss": 0.86364329, "learning_rate": 3.851389739089718e-06, "loss": 0.88944483, "num_input_tokens_seen": 26371655, "step": 1246, "time_per_iteration": 3.7119317054748535 }, { "auxiliary_loss_clip": 0.01375298, "auxiliary_loss_mlp": 0.01203125, "balance_loss_clip": 1.0140928, "balance_loss_mlp": 1.0017091, "epoch": 0.14994288462694644, "flos": 32409571119840.0, "grad_norm": 1.8902656323854243, "language_loss": 0.80457681, "learning_rate": 3.851094936225186e-06, "loss": 0.83036101, "num_input_tokens_seen": 26392540, "step": 1247, "time_per_iteration": 2.7706809043884277 }, { "auxiliary_loss_clip": 0.01348722, "auxiliary_loss_mlp": 0.01203449, "balance_loss_clip": 1.01405168, "balance_loss_mlp": 1.00203276, "epoch": 0.15006312751758552, "flos": 31794674502240.0, "grad_norm": 1.4339435675935477, "language_loss": 0.7654593, "learning_rate": 3.850799852551024e-06, "loss": 0.79098094, "num_input_tokens_seen": 26414960, "step": 1248, "time_per_iteration": 2.8231396675109863 }, { "auxiliary_loss_clip": 0.0138722, "auxiliary_loss_mlp": 0.0120369, "balance_loss_clip": 1.01430142, "balance_loss_mlp": 1.00189245, "epoch": 0.1501833704082246, "flos": 16618032007200.0, "grad_norm": 2.41343455861689, "language_loss": 0.85877562, "learning_rate": 3.850504488111995e-06, "loss": 0.88468474, "num_input_tokens_seen": 26431635, "step": 1249, "time_per_iteration": 2.655393362045288 }, { "auxiliary_loss_clip": 0.01359676, "auxiliary_loss_mlp": 0.01202813, "balance_loss_clip": 1.01281857, "balance_loss_mlp": 1.00158751, "epoch": 0.15030361329886371, "flos": 23471187860160.0, "grad_norm": 2.0358037466780474, "language_loss": 0.82452738, "learning_rate": 3.850208842952907e-06, "loss": 0.85015225, "num_input_tokens_seen": 26450440, "step": 1250, "time_per_iteration": 2.757669448852539 }, { "auxiliary_loss_clip": 0.01332249, "auxiliary_loss_mlp": 0.01204039, "balance_loss_clip": 1.01271999, "balance_loss_mlp": 1.00243258, "epoch": 0.1504238561895028, "flos": 25629421690080.0, "grad_norm": 1.9288837364629408, "language_loss": 0.79537588, "learning_rate": 3.849912917118608e-06, "loss": 0.82073879, "num_input_tokens_seen": 26471480, "step": 1251, "time_per_iteration": 2.8361103534698486 }, { "auxiliary_loss_clip": 0.01375313, "auxiliary_loss_mlp": 0.01199251, "balance_loss_clip": 1.01497436, "balance_loss_mlp": 1.00088716, "epoch": 0.15054409908014188, "flos": 52095179363040.0, "grad_norm": 0.8773269296495713, "language_loss": 0.59229916, "learning_rate": 3.849616710653992e-06, "loss": 0.61804479, "num_input_tokens_seen": 26532950, "step": 1252, "time_per_iteration": 3.239177942276001 }, { "auxiliary_loss_clip": 0.01374891, "auxiliary_loss_mlp": 0.01203855, "balance_loss_clip": 1.01351774, "balance_loss_mlp": 1.00205708, "epoch": 0.150664341970781, "flos": 18880261187040.0, "grad_norm": 1.680921144918278, "language_loss": 0.74846804, "learning_rate": 3.84932022360399e-06, "loss": 0.77425551, "num_input_tokens_seen": 26551615, "step": 1253, "time_per_iteration": 2.719313859939575 }, { "auxiliary_loss_clip": 0.01348379, "auxiliary_loss_mlp": 0.01202632, "balance_loss_clip": 1.01303291, "balance_loss_mlp": 1.00178838, "epoch": 0.15078458486142007, "flos": 22163251115520.0, "grad_norm": 3.774255386515959, "language_loss": 0.84615469, "learning_rate": 3.849023456013581e-06, "loss": 0.87166476, "num_input_tokens_seen": 26569175, "step": 1254, "time_per_iteration": 2.850315809249878 }, { "auxiliary_loss_clip": 0.01387248, "auxiliary_loss_mlp": 0.0120413, "balance_loss_clip": 1.01402032, "balance_loss_mlp": 1.00233245, "epoch": 0.15090482775205916, "flos": 26651906775360.0, "grad_norm": 2.038977881923882, "language_loss": 0.62085557, "learning_rate": 3.848726407927784e-06, "loss": 0.64676934, "num_input_tokens_seen": 26589560, "step": 1255, "time_per_iteration": 2.825345277786255 }, { "auxiliary_loss_clip": 0.0135317, "auxiliary_loss_mlp": 0.01204029, "balance_loss_clip": 1.01372051, "balance_loss_mlp": 1.00242233, "epoch": 0.15102507064269824, "flos": 21799009320480.0, "grad_norm": 2.6343685696864285, "language_loss": 0.8630386, "learning_rate": 3.84842907939166e-06, "loss": 0.8886106, "num_input_tokens_seen": 26608785, "step": 1256, "time_per_iteration": 2.7830119132995605 }, { "auxiliary_loss_clip": 0.01340244, "auxiliary_loss_mlp": 0.01202947, "balance_loss_clip": 1.01355815, "balance_loss_mlp": 1.00172198, "epoch": 0.15114531353333735, "flos": 22820919626880.0, "grad_norm": 5.94383635823814, "language_loss": 0.71111929, "learning_rate": 3.8481314704503146e-06, "loss": 0.73655117, "num_input_tokens_seen": 26628615, "step": 1257, "time_per_iteration": 2.927659034729004 }, { "auxiliary_loss_clip": 0.01374439, "auxiliary_loss_mlp": 0.0120316, "balance_loss_clip": 1.01399672, "balance_loss_mlp": 1.00212586, "epoch": 0.15126555642397643, "flos": 19682685216000.0, "grad_norm": 2.603665492400634, "language_loss": 0.87874371, "learning_rate": 3.847833581148895e-06, "loss": 0.90451968, "num_input_tokens_seen": 26647525, "step": 1258, "time_per_iteration": 2.6873831748962402 }, { "auxiliary_loss_clip": 0.01399717, "auxiliary_loss_mlp": 0.01203707, "balance_loss_clip": 1.01444876, "balance_loss_mlp": 1.00190961, "epoch": 0.15138579931461552, "flos": 28726033764960.0, "grad_norm": 2.543718590784504, "language_loss": 0.81423783, "learning_rate": 3.84753541153259e-06, "loss": 0.84027207, "num_input_tokens_seen": 26667095, "step": 1259, "time_per_iteration": 2.7397947311401367 }, { "auxiliary_loss_clip": 0.01375975, "auxiliary_loss_mlp": 0.01203633, "balance_loss_clip": 1.01442528, "balance_loss_mlp": 1.00202584, "epoch": 0.15150604220525463, "flos": 22127017331520.0, "grad_norm": 1.8243538061993965, "language_loss": 0.83378053, "learning_rate": 3.847236961646633e-06, "loss": 0.85957658, "num_input_tokens_seen": 26686075, "step": 1260, "time_per_iteration": 2.746530055999756 }, { "auxiliary_loss_clip": 0.01373828, "auxiliary_loss_mlp": 0.01204102, "balance_loss_clip": 1.01429868, "balance_loss_mlp": 1.00230432, "epoch": 0.1516262850958937, "flos": 12968716710240.0, "grad_norm": 2.672283457309638, "language_loss": 0.77944851, "learning_rate": 3.846938231536296e-06, "loss": 0.80522782, "num_input_tokens_seen": 26701695, "step": 1261, "time_per_iteration": 2.743723154067993 }, { "auxiliary_loss_clip": 0.01383618, "auxiliary_loss_mlp": 0.01203561, "balance_loss_clip": 1.01465809, "balance_loss_mlp": 1.00252664, "epoch": 0.1517465279865328, "flos": 21797141289120.0, "grad_norm": 1.8664844486565855, "language_loss": 0.80865741, "learning_rate": 3.8466392212468995e-06, "loss": 0.83452922, "num_input_tokens_seen": 26721885, "step": 1262, "time_per_iteration": 2.7724099159240723 }, { "auxiliary_loss_clip": 0.01348165, "auxiliary_loss_mlp": 0.01199092, "balance_loss_clip": 1.01291227, "balance_loss_mlp": 1.0007273, "epoch": 0.15186677087717187, "flos": 58174600998240.0, "grad_norm": 0.8211400417243524, "language_loss": 0.61922139, "learning_rate": 3.8463399308238e-06, "loss": 0.64469391, "num_input_tokens_seen": 26780990, "step": 1263, "time_per_iteration": 3.2625796794891357 }, { "auxiliary_loss_clip": 0.01375754, "auxiliary_loss_mlp": 0.0120297, "balance_loss_clip": 1.01438928, "balance_loss_mlp": 1.00174451, "epoch": 0.15198701376781099, "flos": 32669709793920.0, "grad_norm": 2.686662777103594, "language_loss": 0.64088982, "learning_rate": 3.846040360312402e-06, "loss": 0.66667706, "num_input_tokens_seen": 26804250, "step": 1264, "time_per_iteration": 2.8396856784820557 }, { "auxiliary_loss_clip": 0.01399457, "auxiliary_loss_mlp": 0.0120349, "balance_loss_clip": 1.01443231, "balance_loss_mlp": 1.00207436, "epoch": 0.15210725665845007, "flos": 28402588061280.0, "grad_norm": 2.0534837698840636, "language_loss": 0.81353104, "learning_rate": 3.8457405097581485e-06, "loss": 0.83956051, "num_input_tokens_seen": 26823240, "step": 1265, "time_per_iteration": 2.744319200515747 }, { "auxiliary_loss_clip": 0.01338586, "auxiliary_loss_mlp": 0.01203766, "balance_loss_clip": 1.01375234, "balance_loss_mlp": 1.00215936, "epoch": 0.15222749954908915, "flos": 19938189735360.0, "grad_norm": 3.6804656862399328, "language_loss": 0.7799952, "learning_rate": 3.8454403792065275e-06, "loss": 0.80541873, "num_input_tokens_seen": 26842060, "step": 1266, "time_per_iteration": 2.8368449211120605 }, { "auxiliary_loss_clip": 0.013434, "auxiliary_loss_mlp": 0.01202379, "balance_loss_clip": 1.01283002, "balance_loss_mlp": 1.00191665, "epoch": 0.15234774243972826, "flos": 21324234065760.0, "grad_norm": 2.088193819261915, "language_loss": 0.84952116, "learning_rate": 3.845139968703068e-06, "loss": 0.87497902, "num_input_tokens_seen": 26859580, "step": 1267, "time_per_iteration": 2.782621383666992 }, { "auxiliary_loss_clip": 0.01323907, "auxiliary_loss_mlp": 0.01203286, "balance_loss_clip": 1.01245213, "balance_loss_mlp": 1.00206065, "epoch": 0.15246798533036734, "flos": 25957824861600.0, "grad_norm": 2.039059643572066, "language_loss": 0.82987106, "learning_rate": 3.844839278293342e-06, "loss": 0.85514301, "num_input_tokens_seen": 26880430, "step": 1268, "time_per_iteration": 2.895690679550171 }, { "auxiliary_loss_clip": 0.01400372, "auxiliary_loss_mlp": 0.01203322, "balance_loss_clip": 1.01530862, "balance_loss_mlp": 1.00171566, "epoch": 0.15258822822100643, "flos": 25811919786240.0, "grad_norm": 6.113464579763363, "language_loss": 0.76582772, "learning_rate": 3.8445383080229654e-06, "loss": 0.79186463, "num_input_tokens_seen": 26896445, "step": 1269, "time_per_iteration": 2.804309368133545 }, { "auxiliary_loss_clip": 0.0136298, "auxiliary_loss_mlp": 0.01203629, "balance_loss_clip": 1.0135411, "balance_loss_mlp": 1.0016408, "epoch": 0.1527084711116455, "flos": 25265467284480.0, "grad_norm": 2.177250505891513, "language_loss": 0.73604894, "learning_rate": 3.844237057937593e-06, "loss": 0.76171505, "num_input_tokens_seen": 26915450, "step": 1270, "time_per_iteration": 2.8019964694976807 }, { "auxiliary_loss_clip": 0.01386217, "auxiliary_loss_mlp": 0.01203414, "balance_loss_clip": 1.01479697, "balance_loss_mlp": 1.00199819, "epoch": 0.15282871400228462, "flos": 29240240011200.0, "grad_norm": 2.529371375734944, "language_loss": 0.77914393, "learning_rate": 3.843935528082926e-06, "loss": 0.80504018, "num_input_tokens_seen": 26936475, "step": 1271, "time_per_iteration": 3.800590991973877 }, { "auxiliary_loss_clip": 0.0137787, "auxiliary_loss_mlp": 0.01202587, "balance_loss_clip": 1.0139178, "balance_loss_mlp": 1.00136185, "epoch": 0.1529489568929237, "flos": 20882962395360.0, "grad_norm": 1.7785333997436807, "language_loss": 0.84934872, "learning_rate": 3.843633718504704e-06, "loss": 0.8751533, "num_input_tokens_seen": 26954920, "step": 1272, "time_per_iteration": 3.622030019760132 }, { "auxiliary_loss_clip": 0.01336141, "auxiliary_loss_mlp": 0.01203374, "balance_loss_clip": 1.01289868, "balance_loss_mlp": 1.00195801, "epoch": 0.1530691997835628, "flos": 20083843344960.0, "grad_norm": 2.4471307950615735, "language_loss": 0.90243727, "learning_rate": 3.843331629248715e-06, "loss": 0.92783242, "num_input_tokens_seen": 26972520, "step": 1273, "time_per_iteration": 2.7452664375305176 }, { "auxiliary_loss_clip": 0.01399667, "auxiliary_loss_mlp": 0.01202364, "balance_loss_clip": 1.01498508, "balance_loss_mlp": 1.00171137, "epoch": 0.1531894426742019, "flos": 28759824738720.0, "grad_norm": 2.027143851709797, "language_loss": 0.76951933, "learning_rate": 3.843029260360782e-06, "loss": 0.79553962, "num_input_tokens_seen": 26990890, "step": 1274, "time_per_iteration": 2.754120349884033 }, { "auxiliary_loss_clip": 0.01374249, "auxiliary_loss_mlp": 0.0120311, "balance_loss_clip": 1.01361012, "balance_loss_mlp": 1.00207567, "epoch": 0.15330968556484098, "flos": 22236293462400.0, "grad_norm": 2.0619642206721243, "language_loss": 0.78484797, "learning_rate": 3.8427266118867755e-06, "loss": 0.8106215, "num_input_tokens_seen": 27010640, "step": 1275, "time_per_iteration": 2.6994948387145996 }, { "auxiliary_loss_clip": 0.01349495, "auxiliary_loss_mlp": 0.01202841, "balance_loss_clip": 1.01291537, "balance_loss_mlp": 1.00180602, "epoch": 0.15342992845548006, "flos": 27527516845920.0, "grad_norm": 2.1926198613173318, "language_loss": 0.82683796, "learning_rate": 3.842423683872608e-06, "loss": 0.85236126, "num_input_tokens_seen": 27031215, "step": 1276, "time_per_iteration": 2.787810802459717 }, { "auxiliary_loss_clip": 0.01387739, "auxiliary_loss_mlp": 0.01203233, "balance_loss_clip": 1.01516438, "balance_loss_mlp": 1.00200737, "epoch": 0.15355017134611917, "flos": 19609606945440.0, "grad_norm": 2.3264114627878345, "language_loss": 0.77799869, "learning_rate": 3.842120476364232e-06, "loss": 0.80390841, "num_input_tokens_seen": 27049665, "step": 1277, "time_per_iteration": 2.7384274005889893 }, { "auxiliary_loss_clip": 0.01386435, "auxiliary_loss_mlp": 0.01203837, "balance_loss_clip": 1.01441669, "balance_loss_mlp": 1.00203991, "epoch": 0.15367041423675826, "flos": 18478600126560.0, "grad_norm": 2.5841341120414456, "language_loss": 0.83939326, "learning_rate": 3.841816989407644e-06, "loss": 0.86529601, "num_input_tokens_seen": 27065155, "step": 1278, "time_per_iteration": 2.7183215618133545 }, { "auxiliary_loss_clip": 0.01337259, "auxiliary_loss_mlp": 0.01203038, "balance_loss_clip": 1.01251447, "balance_loss_mlp": 1.00181258, "epoch": 0.15379065712739734, "flos": 41427678530880.0, "grad_norm": 2.558934159763573, "language_loss": 0.76954925, "learning_rate": 3.841513223048884e-06, "loss": 0.79495221, "num_input_tokens_seen": 27085840, "step": 1279, "time_per_iteration": 3.0459916591644287 }, { "auxiliary_loss_clip": 0.01333935, "auxiliary_loss_mlp": 0.012033, "balance_loss_clip": 1.01212692, "balance_loss_mlp": 1.00188351, "epoch": 0.15391090001803642, "flos": 22054226450400.0, "grad_norm": 2.3694031119691026, "language_loss": 0.7856338, "learning_rate": 3.841209177334031e-06, "loss": 0.81100613, "num_input_tokens_seen": 27104200, "step": 1280, "time_per_iteration": 2.790585994720459 }, { "auxiliary_loss_clip": 0.01375178, "auxiliary_loss_mlp": 0.01202571, "balance_loss_clip": 1.01414657, "balance_loss_mlp": 1.00172734, "epoch": 0.15403114290867553, "flos": 15450360320160.0, "grad_norm": 1.8263742650743577, "language_loss": 0.74624765, "learning_rate": 3.84090485230921e-06, "loss": 0.77202511, "num_input_tokens_seen": 27122440, "step": 1281, "time_per_iteration": 2.738870143890381 }, { "auxiliary_loss_clip": 0.01398919, "auxiliary_loss_mlp": 0.01202643, "balance_loss_clip": 1.01439619, "balance_loss_mlp": 1.00179946, "epoch": 0.15415138579931462, "flos": 17929165959360.0, "grad_norm": 2.4846204720655263, "language_loss": 0.76037741, "learning_rate": 3.840600248020588e-06, "loss": 0.78639305, "num_input_tokens_seen": 27139380, "step": 1282, "time_per_iteration": 2.7561450004577637 }, { "auxiliary_loss_clip": 0.01375755, "auxiliary_loss_mlp": 0.0120295, "balance_loss_clip": 1.01445556, "balance_loss_mlp": 1.00172448, "epoch": 0.1542716286899537, "flos": 11429331102720.0, "grad_norm": 2.2246588119293955, "language_loss": 0.7949748, "learning_rate": 3.840295364514371e-06, "loss": 0.82076186, "num_input_tokens_seen": 27156760, "step": 1283, "time_per_iteration": 2.798936605453491 }, { "auxiliary_loss_clip": 0.01361823, "auxiliary_loss_mlp": 0.01202724, "balance_loss_clip": 1.01352, "balance_loss_mlp": 1.00168991, "epoch": 0.1543918715805928, "flos": 17420348265120.0, "grad_norm": 2.941963017075892, "language_loss": 0.78786367, "learning_rate": 3.83999020183681e-06, "loss": 0.81350911, "num_input_tokens_seen": 27175455, "step": 1284, "time_per_iteration": 2.746840476989746 }, { "auxiliary_loss_clip": 0.01292374, "auxiliary_loss_mlp": 0.01203235, "balance_loss_clip": 1.01081681, "balance_loss_mlp": 1.00181866, "epoch": 0.1545121144712319, "flos": 17786386244160.0, "grad_norm": 1.8429800531941638, "language_loss": 0.78646278, "learning_rate": 3.839684760034199e-06, "loss": 0.81141889, "num_input_tokens_seen": 27193660, "step": 1285, "time_per_iteration": 2.943279504776001 }, { "auxiliary_loss_clip": 0.01339233, "auxiliary_loss_mlp": 0.01202825, "balance_loss_clip": 1.01252604, "balance_loss_mlp": 1.00179029, "epoch": 0.15463235736187098, "flos": 28220197736160.0, "grad_norm": 3.7513319559828555, "language_loss": 0.65812218, "learning_rate": 3.8393790391528716e-06, "loss": 0.68354273, "num_input_tokens_seen": 27214355, "step": 1286, "time_per_iteration": 2.8727781772613525 }, { "auxiliary_loss_clip": 0.01360739, "auxiliary_loss_mlp": 0.01202629, "balance_loss_clip": 1.0134871, "balance_loss_mlp": 1.00159478, "epoch": 0.15475260025251006, "flos": 22856901945120.0, "grad_norm": 2.9146362307256166, "language_loss": 0.88916612, "learning_rate": 3.8390730392392075e-06, "loss": 0.91479969, "num_input_tokens_seen": 27234335, "step": 1287, "time_per_iteration": 2.8061912059783936 }, { "auxiliary_loss_clip": 0.01398599, "auxiliary_loss_mlp": 0.01202588, "balance_loss_clip": 1.01424694, "balance_loss_mlp": 1.00193501, "epoch": 0.15487284314314917, "flos": 17602882284960.0, "grad_norm": 2.237410535356977, "language_loss": 0.79385662, "learning_rate": 3.838766760339626e-06, "loss": 0.81986845, "num_input_tokens_seen": 27252860, "step": 1288, "time_per_iteration": 2.753263473510742 }, { "auxiliary_loss_clip": 0.01348951, "auxiliary_loss_mlp": 0.01202974, "balance_loss_clip": 1.01289129, "balance_loss_mlp": 1.00193906, "epoch": 0.15499308603378825, "flos": 20082047160960.0, "grad_norm": 2.3948928295315777, "language_loss": 0.79229945, "learning_rate": 3.838460202500587e-06, "loss": 0.81781864, "num_input_tokens_seen": 27268650, "step": 1289, "time_per_iteration": 2.7188024520874023 }, { "auxiliary_loss_clip": 0.0132054, "auxiliary_loss_mlp": 0.01203086, "balance_loss_clip": 1.01207209, "balance_loss_mlp": 1.0018611, "epoch": 0.15511332892442733, "flos": 15918058609920.0, "grad_norm": 2.743714144948564, "language_loss": 0.74394333, "learning_rate": 3.838153365768599e-06, "loss": 0.76917964, "num_input_tokens_seen": 27285160, "step": 1290, "time_per_iteration": 2.8111181259155273 }, { "auxiliary_loss_clip": 0.01321341, "auxiliary_loss_mlp": 0.01203718, "balance_loss_clip": 1.01220727, "balance_loss_mlp": 1.00249243, "epoch": 0.15523357181506645, "flos": 41282491929120.0, "grad_norm": 2.122358741964015, "language_loss": 0.75223053, "learning_rate": 3.837846250190206e-06, "loss": 0.77748108, "num_input_tokens_seen": 27308025, "step": 1291, "time_per_iteration": 2.9435315132141113 }, { "auxiliary_loss_clip": 0.0132882, "auxiliary_loss_mlp": 0.00873584, "balance_loss_clip": 1.01315641, "balance_loss_mlp": 1.00021625, "epoch": 0.15535381470570553, "flos": 18478779744960.0, "grad_norm": 2.278821934167389, "language_loss": 0.76974404, "learning_rate": 3.837538855811998e-06, "loss": 0.79176807, "num_input_tokens_seen": 27326200, "step": 1292, "time_per_iteration": 2.7992947101593018 }, { "auxiliary_loss_clip": 0.01363092, "auxiliary_loss_mlp": 0.01203546, "balance_loss_clip": 1.01411796, "balance_loss_mlp": 1.00213051, "epoch": 0.1554740575963446, "flos": 13918159448640.0, "grad_norm": 2.039806275951555, "language_loss": 0.70859063, "learning_rate": 3.837231182680606e-06, "loss": 0.73425704, "num_input_tokens_seen": 27344165, "step": 1293, "time_per_iteration": 2.7830398082733154 }, { "auxiliary_loss_clip": 0.01384834, "auxiliary_loss_mlp": 0.01202449, "balance_loss_clip": 1.01431799, "balance_loss_mlp": 1.0014143, "epoch": 0.1555943004869837, "flos": 20847087848160.0, "grad_norm": 1.7703718879132881, "language_loss": 0.76276803, "learning_rate": 3.836923230842706e-06, "loss": 0.78864086, "num_input_tokens_seen": 27363280, "step": 1294, "time_per_iteration": 2.7688331604003906 }, { "auxiliary_loss_clip": 0.0133497, "auxiliary_loss_mlp": 0.01202929, "balance_loss_clip": 1.01301718, "balance_loss_mlp": 1.00170338, "epoch": 0.1557145433776228, "flos": 22085897927040.0, "grad_norm": 2.240244709263175, "language_loss": 0.8044852, "learning_rate": 3.836615000345011e-06, "loss": 0.8298642, "num_input_tokens_seen": 27381460, "step": 1295, "time_per_iteration": 2.9778964519500732 }, { "auxiliary_loss_clip": 0.01397962, "auxiliary_loss_mlp": 0.01201887, "balance_loss_clip": 1.01402164, "balance_loss_mlp": 1.00123405, "epoch": 0.1558347862682619, "flos": 19791997270560.0, "grad_norm": 2.261051633626586, "language_loss": 0.78209579, "learning_rate": 3.836306491234282e-06, "loss": 0.80809432, "num_input_tokens_seen": 27399310, "step": 1296, "time_per_iteration": 2.6913669109344482 }, { "auxiliary_loss_clip": 0.01347349, "auxiliary_loss_mlp": 0.01202485, "balance_loss_clip": 1.01266265, "balance_loss_mlp": 1.00183153, "epoch": 0.15595502915890097, "flos": 17237095771680.0, "grad_norm": 2.0721663482795316, "language_loss": 0.74866438, "learning_rate": 3.835997703557317e-06, "loss": 0.77416271, "num_input_tokens_seen": 27416050, "step": 1297, "time_per_iteration": 4.7174224853515625 }, { "auxiliary_loss_clip": 0.01348085, "auxiliary_loss_mlp": 0.01202295, "balance_loss_clip": 1.01374125, "balance_loss_mlp": 1.00145161, "epoch": 0.15607527204954008, "flos": 19719529702560.0, "grad_norm": 1.652302810732751, "language_loss": 0.80030459, "learning_rate": 3.83568863736096e-06, "loss": 0.82580841, "num_input_tokens_seen": 27434920, "step": 1298, "time_per_iteration": 4.685322999954224 }, { "auxiliary_loss_clip": 0.01351122, "auxiliary_loss_mlp": 0.01202721, "balance_loss_clip": 1.01299918, "balance_loss_mlp": 1.00168645, "epoch": 0.15619551494017916, "flos": 18515660155200.0, "grad_norm": 2.6816422863397857, "language_loss": 0.89550447, "learning_rate": 3.8353792926920975e-06, "loss": 0.92104292, "num_input_tokens_seen": 27453570, "step": 1299, "time_per_iteration": 2.8042356967926025 }, { "auxiliary_loss_clip": 0.01384511, "auxiliary_loss_mlp": 0.01203214, "balance_loss_clip": 1.0140059, "balance_loss_mlp": 1.00160742, "epoch": 0.15631575783081825, "flos": 19902135569760.0, "grad_norm": 2.130135010556561, "language_loss": 0.81691748, "learning_rate": 3.835069669597655e-06, "loss": 0.84279478, "num_input_tokens_seen": 27471960, "step": 1300, "time_per_iteration": 2.8470020294189453 }, { "auxiliary_loss_clip": 0.01384918, "auxiliary_loss_mlp": 0.00873612, "balance_loss_clip": 1.01373172, "balance_loss_mlp": 1.0002346, "epoch": 0.15643600072145733, "flos": 20777673792960.0, "grad_norm": 2.0440476900638997, "language_loss": 0.79993922, "learning_rate": 3.834759768124603e-06, "loss": 0.82252455, "num_input_tokens_seen": 27490835, "step": 1301, "time_per_iteration": 2.8391692638397217 }, { "auxiliary_loss_clip": 0.01337872, "auxiliary_loss_mlp": 0.01203233, "balance_loss_clip": 1.01330853, "balance_loss_mlp": 1.00162673, "epoch": 0.15655624361209644, "flos": 18546397616160.0, "grad_norm": 2.031481294595024, "language_loss": 0.766298, "learning_rate": 3.834449588319953e-06, "loss": 0.79170907, "num_input_tokens_seen": 27508870, "step": 1302, "time_per_iteration": 2.7301456928253174 }, { "auxiliary_loss_clip": 0.01372236, "auxiliary_loss_mlp": 0.01202559, "balance_loss_clip": 1.0137403, "balance_loss_mlp": 1.00190568, "epoch": 0.15667648650273552, "flos": 25229556813600.0, "grad_norm": 1.9253072832598284, "language_loss": 0.85206366, "learning_rate": 3.834139130230758e-06, "loss": 0.87781155, "num_input_tokens_seen": 27528175, "step": 1303, "time_per_iteration": 2.811805486679077 }, { "auxiliary_loss_clip": 0.01360528, "auxiliary_loss_mlp": 0.01203115, "balance_loss_clip": 1.01249695, "balance_loss_mlp": 1.00150824, "epoch": 0.1567967293933746, "flos": 24827105432160.0, "grad_norm": 1.941182444650204, "language_loss": 0.81438935, "learning_rate": 3.833828393904117e-06, "loss": 0.84002578, "num_input_tokens_seen": 27548455, "step": 1304, "time_per_iteration": 2.7697901725769043 }, { "auxiliary_loss_clip": 0.0133574, "auxiliary_loss_mlp": 0.01202711, "balance_loss_clip": 1.01262641, "balance_loss_mlp": 1.00186682, "epoch": 0.15691697228401372, "flos": 19164563288640.0, "grad_norm": 2.199588920216594, "language_loss": 0.77874249, "learning_rate": 3.833517379387165e-06, "loss": 0.80412698, "num_input_tokens_seen": 27564910, "step": 1305, "time_per_iteration": 2.81132173538208 }, { "auxiliary_loss_clip": 0.01385338, "auxiliary_loss_mlp": 0.0120219, "balance_loss_clip": 1.01420009, "balance_loss_mlp": 1.0015372, "epoch": 0.1570372151746528, "flos": 24790943495520.0, "grad_norm": 2.120010483077081, "language_loss": 0.88715076, "learning_rate": 3.833206086727085e-06, "loss": 0.91302609, "num_input_tokens_seen": 27584260, "step": 1306, "time_per_iteration": 2.762688398361206 }, { "auxiliary_loss_clip": 0.01348049, "auxiliary_loss_mlp": 0.01202899, "balance_loss_clip": 1.01294553, "balance_loss_mlp": 1.00167346, "epoch": 0.15715745806529188, "flos": 24863662529280.0, "grad_norm": 2.332642495646675, "language_loss": 0.70747733, "learning_rate": 3.8328945159710994e-06, "loss": 0.73298681, "num_input_tokens_seen": 27604440, "step": 1307, "time_per_iteration": 2.8124096393585205 }, { "auxiliary_loss_clip": 0.01386549, "auxiliary_loss_mlp": 0.00873545, "balance_loss_clip": 1.01486039, "balance_loss_mlp": 1.00023878, "epoch": 0.157277700955931, "flos": 21872159438400.0, "grad_norm": 1.8876454563376932, "language_loss": 0.88932943, "learning_rate": 3.832582667166473e-06, "loss": 0.91193038, "num_input_tokens_seen": 27624250, "step": 1308, "time_per_iteration": 2.730271100997925 }, { "auxiliary_loss_clip": 0.01371004, "auxiliary_loss_mlp": 0.01202584, "balance_loss_clip": 1.01315343, "balance_loss_mlp": 1.00154924, "epoch": 0.15739794384657008, "flos": 24533355402720.0, "grad_norm": 1.8206122209110396, "language_loss": 0.81566048, "learning_rate": 3.8322705403605125e-06, "loss": 0.84139639, "num_input_tokens_seen": 27644595, "step": 1309, "time_per_iteration": 2.7920007705688477 }, { "auxiliary_loss_clip": 0.01346316, "auxiliary_loss_mlp": 0.01202279, "balance_loss_clip": 1.01216292, "balance_loss_mlp": 1.00181687, "epoch": 0.15751818673720916, "flos": 17745338687040.0, "grad_norm": 1.9443717821996056, "language_loss": 0.81461501, "learning_rate": 3.831958135600568e-06, "loss": 0.84010094, "num_input_tokens_seen": 27662145, "step": 1310, "time_per_iteration": 2.7109451293945312 }, { "auxiliary_loss_clip": 0.01383337, "auxiliary_loss_mlp": 0.01201932, "balance_loss_clip": 1.01364183, "balance_loss_mlp": 1.00146961, "epoch": 0.15763842962784824, "flos": 17858530499040.0, "grad_norm": 1.7429700891166104, "language_loss": 0.7936846, "learning_rate": 3.831645452934032e-06, "loss": 0.81953728, "num_input_tokens_seen": 27680575, "step": 1311, "time_per_iteration": 2.6660335063934326 }, { "auxiliary_loss_clip": 0.0139864, "auxiliary_loss_mlp": 0.01202664, "balance_loss_clip": 1.0148747, "balance_loss_mlp": 1.00181985, "epoch": 0.15775867251848735, "flos": 26980920649440.0, "grad_norm": 1.8402654723740721, "language_loss": 0.79599029, "learning_rate": 3.831332492408336e-06, "loss": 0.8220033, "num_input_tokens_seen": 27701985, "step": 1312, "time_per_iteration": 2.8530383110046387 }, { "auxiliary_loss_clip": 0.01359176, "auxiliary_loss_mlp": 0.01202144, "balance_loss_clip": 1.01274979, "balance_loss_mlp": 1.00168145, "epoch": 0.15787891540912644, "flos": 19240407682560.0, "grad_norm": 21.271342424743917, "language_loss": 0.6948331, "learning_rate": 3.831019254070957e-06, "loss": 0.72044623, "num_input_tokens_seen": 27719770, "step": 1313, "time_per_iteration": 2.72806715965271 }, { "auxiliary_loss_clip": 0.01317547, "auxiliary_loss_mlp": 0.01202222, "balance_loss_clip": 1.01138198, "balance_loss_mlp": 1.00137794, "epoch": 0.15799915829976552, "flos": 27271114234560.0, "grad_norm": 2.2340658931991206, "language_loss": 0.95070946, "learning_rate": 3.8307057379694135e-06, "loss": 0.97590709, "num_input_tokens_seen": 27739105, "step": 1314, "time_per_iteration": 2.863790988922119 }, { "auxiliary_loss_clip": 0.01397168, "auxiliary_loss_mlp": 0.01202807, "balance_loss_clip": 1.0132575, "balance_loss_mlp": 1.00158167, "epoch": 0.15811940119040463, "flos": 20405528788320.0, "grad_norm": 2.386907099240455, "language_loss": 0.82531095, "learning_rate": 3.830391944151264e-06, "loss": 0.85131073, "num_input_tokens_seen": 27754985, "step": 1315, "time_per_iteration": 2.658160448074341 }, { "auxiliary_loss_clip": 0.01359362, "auxiliary_loss_mlp": 0.01202068, "balance_loss_clip": 1.01209497, "balance_loss_mlp": 1.00160527, "epoch": 0.1582396440810437, "flos": 32599361723040.0, "grad_norm": 1.992916228014343, "language_loss": 0.67194206, "learning_rate": 3.830077872664114e-06, "loss": 0.69755638, "num_input_tokens_seen": 27776110, "step": 1316, "time_per_iteration": 2.928251028060913 }, { "auxiliary_loss_clip": 0.01305587, "auxiliary_loss_mlp": 0.01202166, "balance_loss_clip": 1.01123369, "balance_loss_mlp": 1.00208497, "epoch": 0.1583598869716828, "flos": 33800572918080.0, "grad_norm": 1.8974525133265165, "language_loss": 0.73098588, "learning_rate": 3.829763523555604e-06, "loss": 0.75606346, "num_input_tokens_seen": 27796510, "step": 1317, "time_per_iteration": 3.0314137935638428 }, { "auxiliary_loss_clip": 0.01371444, "auxiliary_loss_mlp": 0.01201136, "balance_loss_clip": 1.01354587, "balance_loss_mlp": 1.00124621, "epoch": 0.15848012986232188, "flos": 24681344051520.0, "grad_norm": 2.078694052746443, "language_loss": 0.78184605, "learning_rate": 3.829448896873423e-06, "loss": 0.80757189, "num_input_tokens_seen": 27815610, "step": 1318, "time_per_iteration": 2.7178969383239746 }, { "auxiliary_loss_clip": 0.01312823, "auxiliary_loss_mlp": 0.00873526, "balance_loss_clip": 1.01217127, "balance_loss_mlp": 1.0002923, "epoch": 0.158600372752961, "flos": 22602079975680.0, "grad_norm": 3.4905450431505356, "language_loss": 0.79347384, "learning_rate": 3.829133992665299e-06, "loss": 0.8153373, "num_input_tokens_seen": 27834735, "step": 1319, "time_per_iteration": 2.9483630657196045 }, { "auxiliary_loss_clip": 0.01384669, "auxiliary_loss_mlp": 0.01202156, "balance_loss_clip": 1.01346612, "balance_loss_mlp": 1.00150275, "epoch": 0.15872061564360007, "flos": 27927956501280.0, "grad_norm": 2.4436120664721277, "language_loss": 0.88903958, "learning_rate": 3.828818810979002e-06, "loss": 0.91490787, "num_input_tokens_seen": 27853065, "step": 1320, "time_per_iteration": 2.815673351287842 }, { "auxiliary_loss_clip": 0.01397172, "auxiliary_loss_mlp": 0.01202266, "balance_loss_clip": 1.01422906, "balance_loss_mlp": 1.00180364, "epoch": 0.15884085853423915, "flos": 23696817086880.0, "grad_norm": 3.073429791396155, "language_loss": 0.80704319, "learning_rate": 3.8285033518623454e-06, "loss": 0.8330375, "num_input_tokens_seen": 27873315, "step": 1321, "time_per_iteration": 2.759995460510254 }, { "auxiliary_loss_clip": 0.01376742, "auxiliary_loss_mlp": 0.01203492, "balance_loss_clip": 1.01355934, "balance_loss_mlp": 1.00207615, "epoch": 0.15896110142487826, "flos": 23112370540800.0, "grad_norm": 2.421227779030754, "language_loss": 0.81162393, "learning_rate": 3.8281876153631845e-06, "loss": 0.8374263, "num_input_tokens_seen": 27890070, "step": 1322, "time_per_iteration": 2.7797627449035645 }, { "auxiliary_loss_clip": 0.01329951, "auxiliary_loss_mlp": 0.01202385, "balance_loss_clip": 1.01187897, "balance_loss_mlp": 1.00154114, "epoch": 0.15908134431551735, "flos": 14685247785600.0, "grad_norm": 2.0726921482549265, "language_loss": 0.64764225, "learning_rate": 3.827871601529416e-06, "loss": 0.67296559, "num_input_tokens_seen": 27908590, "step": 1323, "time_per_iteration": 5.681685209274292 }, { "auxiliary_loss_clip": 0.01347022, "auxiliary_loss_mlp": 0.01202303, "balance_loss_clip": 1.01243997, "balance_loss_mlp": 1.00184107, "epoch": 0.15920158720615643, "flos": 20193622407360.0, "grad_norm": 1.6621955126065664, "language_loss": 0.80444515, "learning_rate": 3.827555310408979e-06, "loss": 0.82993835, "num_input_tokens_seen": 27927985, "step": 1324, "time_per_iteration": 3.7261600494384766 }, { "auxiliary_loss_clip": 0.01321349, "auxiliary_loss_mlp": 0.01202015, "balance_loss_clip": 1.01151896, "balance_loss_mlp": 1.00174308, "epoch": 0.1593218300967955, "flos": 24826638424320.0, "grad_norm": 1.7046394274675338, "language_loss": 0.82714474, "learning_rate": 3.827238742049854e-06, "loss": 0.85237837, "num_input_tokens_seen": 27948280, "step": 1325, "time_per_iteration": 2.865055799484253 }, { "auxiliary_loss_clip": 0.01397569, "auxiliary_loss_mlp": 0.01202145, "balance_loss_clip": 1.01387453, "balance_loss_mlp": 1.00149179, "epoch": 0.15944207298743462, "flos": 28328719469760.0, "grad_norm": 1.975520823491543, "language_loss": 0.51875395, "learning_rate": 3.826921896500066e-06, "loss": 0.54475105, "num_input_tokens_seen": 27969565, "step": 1326, "time_per_iteration": 2.8438875675201416 }, { "auxiliary_loss_clip": 0.01342379, "auxiliary_loss_mlp": 0.01202671, "balance_loss_clip": 1.01251936, "balance_loss_mlp": 1.00201797, "epoch": 0.1595623158780737, "flos": 22964848899840.0, "grad_norm": 2.3313086210863916, "language_loss": 0.78214854, "learning_rate": 3.826604773807678e-06, "loss": 0.80759907, "num_input_tokens_seen": 27987540, "step": 1327, "time_per_iteration": 2.7694895267486572 }, { "auxiliary_loss_clip": 0.01372245, "auxiliary_loss_mlp": 0.01203026, "balance_loss_clip": 1.0131948, "balance_loss_mlp": 1.00161004, "epoch": 0.1596825587687128, "flos": 19710548782560.0, "grad_norm": 2.58898547986274, "language_loss": 0.73553014, "learning_rate": 3.826287374020798e-06, "loss": 0.76128292, "num_input_tokens_seen": 28002345, "step": 1328, "time_per_iteration": 2.727693557739258 }, { "auxiliary_loss_clip": 0.01397362, "auxiliary_loss_mlp": 0.01202628, "balance_loss_clip": 1.01418102, "balance_loss_mlp": 1.00159299, "epoch": 0.1598028016593519, "flos": 22637738980800.0, "grad_norm": 2.76290782415737, "language_loss": 0.82384276, "learning_rate": 3.825969697187575e-06, "loss": 0.84984267, "num_input_tokens_seen": 28021675, "step": 1329, "time_per_iteration": 2.687892198562622 }, { "auxiliary_loss_clip": 0.01350628, "auxiliary_loss_mlp": 0.01203143, "balance_loss_clip": 1.01228714, "balance_loss_mlp": 1.00210822, "epoch": 0.15992304454999098, "flos": 20482917900480.0, "grad_norm": 2.0528537472525827, "language_loss": 0.69747293, "learning_rate": 3.8256517433562015e-06, "loss": 0.7230106, "num_input_tokens_seen": 28039615, "step": 1330, "time_per_iteration": 2.7803256511688232 }, { "auxiliary_loss_clip": 0.01396541, "auxiliary_loss_mlp": 0.01202377, "balance_loss_clip": 1.01366067, "balance_loss_mlp": 1.0019151, "epoch": 0.16004328744063007, "flos": 17676104250240.0, "grad_norm": 2.3147145158121862, "language_loss": 0.91393697, "learning_rate": 3.82533351257491e-06, "loss": 0.93992615, "num_input_tokens_seen": 28057565, "step": 1331, "time_per_iteration": 2.721457004547119 }, { "auxiliary_loss_clip": 0.01371113, "auxiliary_loss_mlp": 0.01202638, "balance_loss_clip": 1.01273954, "balance_loss_mlp": 1.00179458, "epoch": 0.16016353033126918, "flos": 24098729613120.0, "grad_norm": 3.48206954381927, "language_loss": 0.88679808, "learning_rate": 3.825015004891975e-06, "loss": 0.91253567, "num_input_tokens_seen": 28076305, "step": 1332, "time_per_iteration": 2.780000925064087 }, { "auxiliary_loss_clip": 0.01384748, "auxiliary_loss_mlp": 0.0120257, "balance_loss_clip": 1.01367927, "balance_loss_mlp": 1.00172591, "epoch": 0.16028377322190826, "flos": 27634853098080.0, "grad_norm": 1.9076395041818552, "language_loss": 0.75676429, "learning_rate": 3.824696220355716e-06, "loss": 0.78263742, "num_input_tokens_seen": 28097895, "step": 1333, "time_per_iteration": 2.789147138595581 }, { "auxiliary_loss_clip": 0.01346136, "auxiliary_loss_mlp": 0.01202308, "balance_loss_clip": 1.01155925, "balance_loss_mlp": 1.00165534, "epoch": 0.16040401611254734, "flos": 20961213675840.0, "grad_norm": 1.932391273485654, "language_loss": 0.78969151, "learning_rate": 3.824377159014491e-06, "loss": 0.81517589, "num_input_tokens_seen": 28118790, "step": 1334, "time_per_iteration": 2.7747175693511963 }, { "auxiliary_loss_clip": 0.01370464, "auxiliary_loss_mlp": 0.01202098, "balance_loss_clip": 1.01235676, "balance_loss_mlp": 1.00144458, "epoch": 0.16052425900318643, "flos": 21247060495680.0, "grad_norm": 2.0505418654698646, "language_loss": 0.84982169, "learning_rate": 3.824057820916702e-06, "loss": 0.87554729, "num_input_tokens_seen": 28135995, "step": 1335, "time_per_iteration": 2.666867256164551 }, { "auxiliary_loss_clip": 0.01350055, "auxiliary_loss_mlp": 0.01203085, "balance_loss_clip": 1.01250708, "balance_loss_mlp": 1.00166893, "epoch": 0.16064450189382554, "flos": 15524013369600.0, "grad_norm": 2.0821680126465534, "language_loss": 0.71885097, "learning_rate": 3.8237382061107904e-06, "loss": 0.74438238, "num_input_tokens_seen": 28152715, "step": 1336, "time_per_iteration": 2.712193250656128 }, { "auxiliary_loss_clip": 0.01282128, "auxiliary_loss_mlp": 0.01201974, "balance_loss_clip": 1.01034999, "balance_loss_mlp": 1.00151205, "epoch": 0.16076474478446462, "flos": 21178508608800.0, "grad_norm": 1.7544202160318099, "language_loss": 0.78561705, "learning_rate": 3.823418314645243e-06, "loss": 0.81045806, "num_input_tokens_seen": 28171590, "step": 1337, "time_per_iteration": 2.9052793979644775 }, { "auxiliary_loss_clip": 0.01306821, "auxiliary_loss_mlp": 0.01202025, "balance_loss_clip": 1.01112831, "balance_loss_mlp": 1.00175369, "epoch": 0.1608849876751037, "flos": 18366486024960.0, "grad_norm": 2.0324902405220793, "language_loss": 0.75746524, "learning_rate": 3.823098146568588e-06, "loss": 0.78255373, "num_input_tokens_seen": 28191295, "step": 1338, "time_per_iteration": 2.7932000160217285 }, { "auxiliary_loss_clip": 0.01374366, "auxiliary_loss_mlp": 0.01202104, "balance_loss_clip": 1.01241231, "balance_loss_mlp": 1.00145102, "epoch": 0.1610052305657428, "flos": 29497037783040.0, "grad_norm": 2.1083239790540045, "language_loss": 0.71709073, "learning_rate": 3.822777701929394e-06, "loss": 0.74285543, "num_input_tokens_seen": 28213120, "step": 1339, "time_per_iteration": 2.837815046310425 }, { "auxiliary_loss_clip": 0.01384288, "auxiliary_loss_mlp": 0.01202117, "balance_loss_clip": 1.01280618, "balance_loss_mlp": 1.00146365, "epoch": 0.1611254734563819, "flos": 26797883698080.0, "grad_norm": 1.998195535553589, "language_loss": 0.73874915, "learning_rate": 3.8224569807762714e-06, "loss": 0.76461315, "num_input_tokens_seen": 28232440, "step": 1340, "time_per_iteration": 2.7691078186035156 }, { "auxiliary_loss_clip": 0.01337703, "auxiliary_loss_mlp": 0.01202275, "balance_loss_clip": 1.0120151, "balance_loss_mlp": 1.00143123, "epoch": 0.16124571634702098, "flos": 22419581879520.0, "grad_norm": 2.084128731406412, "language_loss": 0.76594949, "learning_rate": 3.822135983157873e-06, "loss": 0.79134923, "num_input_tokens_seen": 28251715, "step": 1341, "time_per_iteration": 2.8647420406341553 }, { "auxiliary_loss_clip": 0.01395738, "auxiliary_loss_mlp": 0.00873498, "balance_loss_clip": 1.01315713, "balance_loss_mlp": 1.00037158, "epoch": 0.16136595923766006, "flos": 10999123925760.0, "grad_norm": 2.1077583242059617, "language_loss": 0.84393561, "learning_rate": 3.821814709122896e-06, "loss": 0.86662805, "num_input_tokens_seen": 28269765, "step": 1342, "time_per_iteration": 2.645594835281372 }, { "auxiliary_loss_clip": 0.01358178, "auxiliary_loss_mlp": 0.01202104, "balance_loss_clip": 1.01243818, "balance_loss_mlp": 1.00164151, "epoch": 0.16148620212829917, "flos": 21214993858560.0, "grad_norm": 2.7731625612696718, "language_loss": 0.8473624, "learning_rate": 3.821493158720076e-06, "loss": 0.87296522, "num_input_tokens_seen": 28288870, "step": 1343, "time_per_iteration": 2.7910406589508057 }, { "auxiliary_loss_clip": 0.01347098, "auxiliary_loss_mlp": 0.0120288, "balance_loss_clip": 1.01257181, "balance_loss_mlp": 1.00184584, "epoch": 0.16160644501893826, "flos": 16758476683200.0, "grad_norm": 4.7569250397421445, "language_loss": 0.73046899, "learning_rate": 3.821171331998191e-06, "loss": 0.75596881, "num_input_tokens_seen": 28305400, "step": 1344, "time_per_iteration": 2.8241918087005615 }, { "auxiliary_loss_clip": 0.01332565, "auxiliary_loss_mlp": 0.01198488, "balance_loss_clip": 1.00889349, "balance_loss_mlp": 1.00012386, "epoch": 0.16172668790957734, "flos": 64444998718080.0, "grad_norm": 0.7048924706201246, "language_loss": 0.54441261, "learning_rate": 3.820849229006064e-06, "loss": 0.56972313, "num_input_tokens_seen": 28373150, "step": 1345, "time_per_iteration": 3.4657931327819824 }, { "auxiliary_loss_clip": 0.01395771, "auxiliary_loss_mlp": 0.012019, "balance_loss_clip": 1.01249158, "balance_loss_mlp": 1.00143778, "epoch": 0.16184693080021645, "flos": 23257700837280.0, "grad_norm": 1.8322159841362264, "language_loss": 0.70617819, "learning_rate": 3.8205268497925564e-06, "loss": 0.73215485, "num_input_tokens_seen": 28393620, "step": 1346, "time_per_iteration": 2.7115235328674316 }, { "auxiliary_loss_clip": 0.01396547, "auxiliary_loss_mlp": 0.01202272, "balance_loss_clip": 1.01348591, "balance_loss_mlp": 1.00161898, "epoch": 0.16196717369085553, "flos": 17451121649760.0, "grad_norm": 2.2679602457132253, "language_loss": 0.78512079, "learning_rate": 3.8202041944065725e-06, "loss": 0.81110895, "num_input_tokens_seen": 28409440, "step": 1347, "time_per_iteration": 2.650920867919922 }, { "auxiliary_loss_clip": 0.01396355, "auxiliary_loss_mlp": 0.01201814, "balance_loss_clip": 1.01357687, "balance_loss_mlp": 1.00154281, "epoch": 0.16208741658149461, "flos": 23873387775840.0, "grad_norm": 2.4811544214511394, "language_loss": 0.73728633, "learning_rate": 3.819881262897061e-06, "loss": 0.76326799, "num_input_tokens_seen": 28427575, "step": 1348, "time_per_iteration": 2.7040188312530518 }, { "auxiliary_loss_clip": 0.01326246, "auxiliary_loss_mlp": 0.01202691, "balance_loss_clip": 1.01199734, "balance_loss_mlp": 1.00184703, "epoch": 0.1622076594721337, "flos": 25884818438400.0, "grad_norm": 2.0551352984714266, "language_loss": 0.74025357, "learning_rate": 3.819558055313008e-06, "loss": 0.76554286, "num_input_tokens_seen": 28448260, "step": 1349, "time_per_iteration": 3.8243558406829834 }, { "auxiliary_loss_clip": 0.01379675, "auxiliary_loss_mlp": 0.0120261, "balance_loss_clip": 1.01257265, "balance_loss_mlp": 1.00195694, "epoch": 0.1623279023627728, "flos": 21539768738400.0, "grad_norm": 1.7906824258253995, "language_loss": 0.77363652, "learning_rate": 3.819234571703444e-06, "loss": 0.79945934, "num_input_tokens_seen": 28467085, "step": 1350, "time_per_iteration": 4.656007528305054 }, { "auxiliary_loss_clip": 0.01384783, "auxiliary_loss_mlp": 0.01202703, "balance_loss_clip": 1.01331675, "balance_loss_mlp": 1.00166893, "epoch": 0.1624481452534119, "flos": 22085430919200.0, "grad_norm": 2.4059786289875524, "language_loss": 0.85544169, "learning_rate": 3.8189108121174435e-06, "loss": 0.8813166, "num_input_tokens_seen": 28486850, "step": 1351, "time_per_iteration": 2.7107114791870117 }, { "auxiliary_loss_clip": 0.01320319, "auxiliary_loss_mlp": 0.01202276, "balance_loss_clip": 1.01151454, "balance_loss_mlp": 1.00181389, "epoch": 0.16256838814405097, "flos": 27087502504320.0, "grad_norm": 1.737703651960423, "language_loss": 0.83356869, "learning_rate": 3.818586776604118e-06, "loss": 0.85879463, "num_input_tokens_seen": 28507490, "step": 1352, "time_per_iteration": 2.8661978244781494 }, { "auxiliary_loss_clip": 0.0137274, "auxiliary_loss_mlp": 0.01202559, "balance_loss_clip": 1.01370525, "balance_loss_mlp": 1.00190592, "epoch": 0.16268863103469008, "flos": 20120364518400.0, "grad_norm": 2.0484457579075555, "language_loss": 0.6152162, "learning_rate": 3.818262465212625e-06, "loss": 0.64096916, "num_input_tokens_seen": 28527615, "step": 1353, "time_per_iteration": 2.9720964431762695 }, { "auxiliary_loss_clip": 0.01383957, "auxiliary_loss_mlp": 0.01202454, "balance_loss_clip": 1.01347017, "balance_loss_mlp": 1.00180066, "epoch": 0.16280887392532917, "flos": 18332802822240.0, "grad_norm": 2.243559876215624, "language_loss": 0.77584749, "learning_rate": 3.817937877992161e-06, "loss": 0.80171156, "num_input_tokens_seen": 28544910, "step": 1354, "time_per_iteration": 2.778076648712158 }, { "auxiliary_loss_clip": 0.01357787, "auxiliary_loss_mlp": 0.00873683, "balance_loss_clip": 1.01298237, "balance_loss_mlp": 1.00041223, "epoch": 0.16292911681596825, "flos": 11874338835840.0, "grad_norm": 2.261163235669378, "language_loss": 0.85663331, "learning_rate": 3.817613014991967e-06, "loss": 0.87894809, "num_input_tokens_seen": 28561050, "step": 1355, "time_per_iteration": 2.781752586364746 }, { "auxiliary_loss_clip": 0.01345872, "auxiliary_loss_mlp": 0.01202406, "balance_loss_clip": 1.01215816, "balance_loss_mlp": 1.00175285, "epoch": 0.16304935970660733, "flos": 26103478471200.0, "grad_norm": 2.5838134672505686, "language_loss": 0.76742649, "learning_rate": 3.817287876261323e-06, "loss": 0.79290926, "num_input_tokens_seen": 28581385, "step": 1356, "time_per_iteration": 2.836056709289551 }, { "auxiliary_loss_clip": 0.01343215, "auxiliary_loss_mlp": 0.01201889, "balance_loss_clip": 1.01153922, "balance_loss_mlp": 1.00123644, "epoch": 0.16316960259724644, "flos": 29351958952320.0, "grad_norm": 1.873641309386189, "language_loss": 0.80032015, "learning_rate": 3.816962461849553e-06, "loss": 0.82577115, "num_input_tokens_seen": 28603255, "step": 1357, "time_per_iteration": 2.7507541179656982 }, { "auxiliary_loss_clip": 0.01358962, "auxiliary_loss_mlp": 0.01202353, "balance_loss_clip": 1.01323414, "balance_loss_mlp": 1.00208163, "epoch": 0.16328984548788553, "flos": 20886770305440.0, "grad_norm": 1.822693996585561, "language_loss": 0.84489214, "learning_rate": 3.8166367718060235e-06, "loss": 0.87050527, "num_input_tokens_seen": 28623145, "step": 1358, "time_per_iteration": 2.7109155654907227 }, { "auxiliary_loss_clip": 0.01383854, "auxiliary_loss_mlp": 0.01201976, "balance_loss_clip": 1.01318169, "balance_loss_mlp": 1.0015142, "epoch": 0.1634100883785246, "flos": 18041100442560.0, "grad_norm": 8.289763296305644, "language_loss": 0.76298308, "learning_rate": 3.816310806180139e-06, "loss": 0.78884137, "num_input_tokens_seen": 28641555, "step": 1359, "time_per_iteration": 2.694573402404785 }, { "auxiliary_loss_clip": 0.01343398, "auxiliary_loss_mlp": 0.01202657, "balance_loss_clip": 1.01149774, "balance_loss_mlp": 1.00200415, "epoch": 0.16353033126916372, "flos": 24572139768000.0, "grad_norm": 1.6818528631746568, "language_loss": 0.81025875, "learning_rate": 3.81598456502135e-06, "loss": 0.83571935, "num_input_tokens_seen": 28661575, "step": 1360, "time_per_iteration": 2.8011932373046875 }, { "auxiliary_loss_clip": 0.01343552, "auxiliary_loss_mlp": 0.01202306, "balance_loss_clip": 1.01146412, "balance_loss_mlp": 1.00184369, "epoch": 0.1636505741598028, "flos": 19892903184000.0, "grad_norm": 1.9885465207565818, "language_loss": 0.87207109, "learning_rate": 3.8156580483791455e-06, "loss": 0.89752966, "num_input_tokens_seen": 28676765, "step": 1361, "time_per_iteration": 2.761486768722534 }, { "auxiliary_loss_clip": 0.01395899, "auxiliary_loss_mlp": 0.01202025, "balance_loss_clip": 1.0130291, "balance_loss_mlp": 1.00137162, "epoch": 0.16377081705044189, "flos": 28402623984960.0, "grad_norm": 2.6471790391847967, "language_loss": 0.76565355, "learning_rate": 3.815331256303059e-06, "loss": 0.79163277, "num_input_tokens_seen": 28696795, "step": 1362, "time_per_iteration": 2.7209255695343018 }, { "auxiliary_loss_clip": 0.01318677, "auxiliary_loss_mlp": 0.01203208, "balance_loss_clip": 1.01111877, "balance_loss_mlp": 1.0023644, "epoch": 0.163891059941081, "flos": 21908069909280.0, "grad_norm": 2.0379105489599776, "language_loss": 0.76792341, "learning_rate": 3.815004188842665e-06, "loss": 0.79314226, "num_input_tokens_seen": 28714835, "step": 1363, "time_per_iteration": 2.8176136016845703 }, { "auxiliary_loss_clip": 0.01371102, "auxiliary_loss_mlp": 0.012014, "balance_loss_clip": 1.01248503, "balance_loss_mlp": 1.00150967, "epoch": 0.16401130283172008, "flos": 26797632232320.0, "grad_norm": 1.5202487271167302, "language_loss": 0.79654741, "learning_rate": 3.814676846047578e-06, "loss": 0.82227242, "num_input_tokens_seen": 28735710, "step": 1364, "time_per_iteration": 2.810062885284424 }, { "auxiliary_loss_clip": 0.01383806, "auxiliary_loss_mlp": 0.01202908, "balance_loss_clip": 1.01308346, "balance_loss_mlp": 1.00206423, "epoch": 0.16413154572235916, "flos": 32997430415520.0, "grad_norm": 2.603334958418176, "language_loss": 0.69531381, "learning_rate": 3.8143492279674565e-06, "loss": 0.72118104, "num_input_tokens_seen": 28758405, "step": 1365, "time_per_iteration": 2.8445940017700195 }, { "auxiliary_loss_clip": 0.01315013, "auxiliary_loss_mlp": 0.01198687, "balance_loss_clip": 1.00657344, "balance_loss_mlp": 1.00032318, "epoch": 0.16425178861299825, "flos": 40113647969760.0, "grad_norm": 0.841378165502469, "language_loss": 0.58457029, "learning_rate": 3.8140213346519997e-06, "loss": 0.60970724, "num_input_tokens_seen": 28809000, "step": 1366, "time_per_iteration": 3.1295998096466064 }, { "auxiliary_loss_clip": 0.01340721, "auxiliary_loss_mlp": 0.01202254, "balance_loss_clip": 1.01192331, "balance_loss_mlp": 1.00198209, "epoch": 0.16437203150363736, "flos": 25447498372800.0, "grad_norm": 1.7062192855408216, "language_loss": 0.77312517, "learning_rate": 3.813693166150948e-06, "loss": 0.7985549, "num_input_tokens_seen": 28829210, "step": 1367, "time_per_iteration": 2.893352746963501 }, { "auxiliary_loss_clip": 0.01334539, "auxiliary_loss_mlp": 0.01202305, "balance_loss_clip": 1.01216316, "balance_loss_mlp": 1.0018425, "epoch": 0.16449227439427644, "flos": 23476899725280.0, "grad_norm": 2.3810530289032967, "language_loss": 0.85487825, "learning_rate": 3.813364722514086e-06, "loss": 0.8802467, "num_input_tokens_seen": 28847545, "step": 1368, "time_per_iteration": 2.831829309463501 }, { "auxiliary_loss_clip": 0.01383588, "auxiliary_loss_mlp": 0.01202287, "balance_loss_clip": 1.01312447, "balance_loss_mlp": 1.00163388, "epoch": 0.16461251728491552, "flos": 13545224123040.0, "grad_norm": 4.1105202584363685, "language_loss": 0.80359977, "learning_rate": 3.8130360037912368e-06, "loss": 0.82945853, "num_input_tokens_seen": 28863990, "step": 1369, "time_per_iteration": 2.6471288204193115 }, { "auxiliary_loss_clip": 0.01370656, "auxiliary_loss_mlp": 0.01202594, "balance_loss_clip": 1.01242292, "balance_loss_mlp": 1.00155962, "epoch": 0.16473276017555463, "flos": 23003309952000.0, "grad_norm": 2.6239735070199472, "language_loss": 0.82111907, "learning_rate": 3.812707010032268e-06, "loss": 0.84685159, "num_input_tokens_seen": 28883045, "step": 1370, "time_per_iteration": 2.679739236831665 }, { "auxiliary_loss_clip": 0.01374747, "auxiliary_loss_mlp": 0.01202376, "balance_loss_clip": 1.01284683, "balance_loss_mlp": 1.00210392, "epoch": 0.16485300306619372, "flos": 24790692029760.0, "grad_norm": 3.0443409640746393, "language_loss": 0.78983343, "learning_rate": 3.8123777412870863e-06, "loss": 0.81560469, "num_input_tokens_seen": 28902545, "step": 1371, "time_per_iteration": 2.73933482170105 }, { "auxiliary_loss_clip": 0.0135773, "auxiliary_loss_mlp": 0.01202444, "balance_loss_clip": 1.01123023, "balance_loss_mlp": 1.00179148, "epoch": 0.1649732459568328, "flos": 21106508048640.0, "grad_norm": 1.9652359359325828, "language_loss": 0.78560829, "learning_rate": 3.812048197605643e-06, "loss": 0.81121004, "num_input_tokens_seen": 28921440, "step": 1372, "time_per_iteration": 2.7643070220947266 }, { "auxiliary_loss_clip": 0.01371472, "auxiliary_loss_mlp": 0.0120226, "balance_loss_clip": 1.01231551, "balance_loss_mlp": 1.00160658, "epoch": 0.16509348884747188, "flos": 20266736601600.0, "grad_norm": 2.036888760769494, "language_loss": 0.81388563, "learning_rate": 3.8117183790379277e-06, "loss": 0.83962291, "num_input_tokens_seen": 28939890, "step": 1373, "time_per_iteration": 2.6993625164031982 }, { "auxiliary_loss_clip": 0.0139513, "auxiliary_loss_mlp": 0.01202413, "balance_loss_clip": 1.01273084, "balance_loss_mlp": 1.00214148, "epoch": 0.165213731738111, "flos": 11035501404480.0, "grad_norm": 2.624514298947671, "language_loss": 0.94210982, "learning_rate": 3.811388285633976e-06, "loss": 0.96808529, "num_input_tokens_seen": 28955875, "step": 1374, "time_per_iteration": 2.6738977432250977 }, { "auxiliary_loss_clip": 0.01315036, "auxiliary_loss_mlp": 0.01202174, "balance_loss_clip": 1.01049113, "balance_loss_mlp": 1.00152135, "epoch": 0.16533397462875007, "flos": 29972064503520.0, "grad_norm": 4.978692630935644, "language_loss": 0.62653756, "learning_rate": 3.811057917443861e-06, "loss": 0.65170968, "num_input_tokens_seen": 28975140, "step": 1375, "time_per_iteration": 4.743656396865845 }, { "auxiliary_loss_clip": 0.01344891, "auxiliary_loss_mlp": 0.01198686, "balance_loss_clip": 1.00773966, "balance_loss_mlp": 1.00032127, "epoch": 0.16545421751938916, "flos": 65556799577280.0, "grad_norm": 0.8562273866031808, "language_loss": 0.6829145, "learning_rate": 3.8107272745177e-06, "loss": 0.70835018, "num_input_tokens_seen": 29047470, "step": 1376, "time_per_iteration": 4.408194303512573 }, { "auxiliary_loss_clip": 0.01331219, "auxiliary_loss_mlp": 0.01202272, "balance_loss_clip": 1.01146603, "balance_loss_mlp": 1.00161874, "epoch": 0.16557446041002827, "flos": 22492372760640.0, "grad_norm": 2.8937397947171144, "language_loss": 0.78849053, "learning_rate": 3.8103963569056513e-06, "loss": 0.81382549, "num_input_tokens_seen": 29066605, "step": 1377, "time_per_iteration": 3.6135008335113525 }, { "auxiliary_loss_clip": 0.01370671, "auxiliary_loss_mlp": 0.01201728, "balance_loss_clip": 1.01282454, "balance_loss_mlp": 1.00164723, "epoch": 0.16569470330066735, "flos": 24602733534240.0, "grad_norm": 1.861616682875588, "language_loss": 0.88150871, "learning_rate": 3.8100651646579146e-06, "loss": 0.90723264, "num_input_tokens_seen": 29085815, "step": 1378, "time_per_iteration": 2.7731308937072754 }, { "auxiliary_loss_clip": 0.01370822, "auxiliary_loss_mlp": 0.01202028, "balance_loss_clip": 1.01292133, "balance_loss_mlp": 1.00156522, "epoch": 0.16581494619130643, "flos": 15006214755360.0, "grad_norm": 1.9991786578931345, "language_loss": 0.92387456, "learning_rate": 3.8097336978247317e-06, "loss": 0.94960308, "num_input_tokens_seen": 29102520, "step": 1379, "time_per_iteration": 2.7007553577423096 }, { "auxiliary_loss_clip": 0.01356809, "auxiliary_loss_mlp": 0.0120242, "balance_loss_clip": 1.01223779, "balance_loss_mlp": 1.00157583, "epoch": 0.16593518908194552, "flos": 17420348265120.0, "grad_norm": 2.7853179886883783, "language_loss": 0.88627446, "learning_rate": 3.8094019564563854e-06, "loss": 0.91186678, "num_input_tokens_seen": 29119450, "step": 1380, "time_per_iteration": 2.71110463142395 }, { "auxiliary_loss_clip": 0.01395141, "auxiliary_loss_mlp": 0.00873675, "balance_loss_clip": 1.01274753, "balance_loss_mlp": 1.0004859, "epoch": 0.16605543197258463, "flos": 20412641676960.0, "grad_norm": 2.189583857330301, "language_loss": 0.75177908, "learning_rate": 3.809069940603201e-06, "loss": 0.77446723, "num_input_tokens_seen": 29137405, "step": 1381, "time_per_iteration": 2.7416605949401855 }, { "auxiliary_loss_clip": 0.01357936, "auxiliary_loss_mlp": 0.01201149, "balance_loss_clip": 1.01285076, "balance_loss_mlp": 1.00144982, "epoch": 0.1661756748632237, "flos": 14209754057280.0, "grad_norm": 2.104445074276428, "language_loss": 0.7825346, "learning_rate": 3.8087376503155452e-06, "loss": 0.8081255, "num_input_tokens_seen": 29154890, "step": 1382, "time_per_iteration": 2.718836545944214 }, { "auxiliary_loss_clip": 0.01341548, "auxiliary_loss_mlp": 0.01198958, "balance_loss_clip": 1.00788212, "balance_loss_mlp": 1.00059402, "epoch": 0.1662959177538628, "flos": 66080907550080.0, "grad_norm": 0.8961187508070136, "language_loss": 0.56233346, "learning_rate": 3.808405085643826e-06, "loss": 0.58773851, "num_input_tokens_seen": 29219770, "step": 1383, "time_per_iteration": 3.3623294830322266 }, { "auxiliary_loss_clip": 0.01395204, "auxiliary_loss_mlp": 0.00873654, "balance_loss_clip": 1.0129714, "balance_loss_mlp": 1.00044918, "epoch": 0.1664161606445019, "flos": 20740577840640.0, "grad_norm": 2.0405405499274485, "language_loss": 0.88935775, "learning_rate": 3.8080722466384925e-06, "loss": 0.91204631, "num_input_tokens_seen": 29237620, "step": 1384, "time_per_iteration": 2.65592622756958 }, { "auxiliary_loss_clip": 0.01395219, "auxiliary_loss_mlp": 0.01201944, "balance_loss_clip": 1.01276731, "balance_loss_mlp": 1.00129104, "epoch": 0.166536403535141, "flos": 25260940900800.0, "grad_norm": 2.6884919876866546, "language_loss": 0.70549655, "learning_rate": 3.8077391333500376e-06, "loss": 0.73146814, "num_input_tokens_seen": 29256760, "step": 1385, "time_per_iteration": 2.666867971420288 }, { "auxiliary_loss_clip": 0.01357834, "auxiliary_loss_mlp": 0.01202639, "balance_loss_clip": 1.01268685, "balance_loss_mlp": 1.00236702, "epoch": 0.16665664642578007, "flos": 25447462449120.0, "grad_norm": 1.7366158323164556, "language_loss": 0.76916373, "learning_rate": 3.8074057458289934e-06, "loss": 0.79476845, "num_input_tokens_seen": 29277450, "step": 1386, "time_per_iteration": 2.785964012145996 }, { "auxiliary_loss_clip": 0.01357834, "auxiliary_loss_mlp": 0.01202105, "balance_loss_clip": 1.01262391, "balance_loss_mlp": 1.00183296, "epoch": 0.16677688931641918, "flos": 22200778152000.0, "grad_norm": 2.1501397617003675, "language_loss": 0.82441443, "learning_rate": 3.807072084125934e-06, "loss": 0.85001385, "num_input_tokens_seen": 29299300, "step": 1387, "time_per_iteration": 2.805316686630249 }, { "auxiliary_loss_clip": 0.01344295, "auxiliary_loss_mlp": 0.01201416, "balance_loss_clip": 1.01236486, "balance_loss_mlp": 1.00133562, "epoch": 0.16689713220705826, "flos": 16945968170880.0, "grad_norm": 2.1291810865688405, "language_loss": 0.80196321, "learning_rate": 3.806738148291477e-06, "loss": 0.82742035, "num_input_tokens_seen": 29316125, "step": 1388, "time_per_iteration": 2.6908912658691406 }, { "auxiliary_loss_clip": 0.01308252, "auxiliary_loss_mlp": 0.01202192, "balance_loss_clip": 1.01135778, "balance_loss_mlp": 1.00153875, "epoch": 0.16701737509769735, "flos": 36244438025760.0, "grad_norm": 2.080669472702609, "language_loss": 0.70968235, "learning_rate": 3.8064039383762793e-06, "loss": 0.73478675, "num_input_tokens_seen": 29338490, "step": 1389, "time_per_iteration": 2.940669059753418 }, { "auxiliary_loss_clip": 0.01369821, "auxiliary_loss_mlp": 0.01201963, "balance_loss_clip": 1.0128603, "balance_loss_mlp": 1.00188208, "epoch": 0.16713761798833643, "flos": 23258670776640.0, "grad_norm": 2.235421800136789, "language_loss": 0.77211827, "learning_rate": 3.8060694544310396e-06, "loss": 0.79783607, "num_input_tokens_seen": 29357000, "step": 1390, "time_per_iteration": 2.696415901184082 }, { "auxiliary_loss_clip": 0.01395442, "auxiliary_loss_mlp": 0.01201712, "balance_loss_clip": 1.01319718, "balance_loss_mlp": 1.00144076, "epoch": 0.16725786087897554, "flos": 25302527313120.0, "grad_norm": 2.085621931024103, "language_loss": 0.78785872, "learning_rate": 3.8057346965065006e-06, "loss": 0.81383026, "num_input_tokens_seen": 29378230, "step": 1391, "time_per_iteration": 2.6501574516296387 }, { "auxiliary_loss_clip": 0.01347517, "auxiliary_loss_mlp": 0.01201589, "balance_loss_clip": 1.01166439, "balance_loss_mlp": 1.00150824, "epoch": 0.16737810376961462, "flos": 31831554912480.0, "grad_norm": 1.5649509356040945, "language_loss": 0.8453266, "learning_rate": 3.805399664653443e-06, "loss": 0.87081766, "num_input_tokens_seen": 29400370, "step": 1392, "time_per_iteration": 2.84938907623291 }, { "auxiliary_loss_clip": 0.01395688, "auxiliary_loss_mlp": 0.01201943, "balance_loss_clip": 1.0131247, "balance_loss_mlp": 1.00167179, "epoch": 0.1674983466602537, "flos": 27961855246080.0, "grad_norm": 2.0951842376438097, "language_loss": 0.74165362, "learning_rate": 3.805064358922692e-06, "loss": 0.76762992, "num_input_tokens_seen": 29418660, "step": 1393, "time_per_iteration": 2.6553497314453125 }, { "auxiliary_loss_clip": 0.01373072, "auxiliary_loss_mlp": 0.01202482, "balance_loss_clip": 1.01197124, "balance_loss_mlp": 1.00182939, "epoch": 0.16761858955089282, "flos": 21762667765440.0, "grad_norm": 2.000124096448862, "language_loss": 0.80883247, "learning_rate": 3.8047287793651136e-06, "loss": 0.83458805, "num_input_tokens_seen": 29440105, "step": 1394, "time_per_iteration": 2.694471836090088 }, { "auxiliary_loss_clip": 0.01337263, "auxiliary_loss_mlp": 0.01201892, "balance_loss_clip": 1.01206338, "balance_loss_mlp": 1.00162053, "epoch": 0.1677388324415319, "flos": 23805518438880.0, "grad_norm": 2.0971198681398864, "language_loss": 0.88250577, "learning_rate": 3.8043929260316137e-06, "loss": 0.90789735, "num_input_tokens_seen": 29458260, "step": 1395, "time_per_iteration": 2.760206460952759 }, { "auxiliary_loss_clip": 0.01346535, "auxiliary_loss_mlp": 0.01202893, "balance_loss_clip": 1.01251304, "balance_loss_mlp": 1.00224018, "epoch": 0.16785907533217098, "flos": 20558870065440.0, "grad_norm": 2.0704506847436717, "language_loss": 0.83450443, "learning_rate": 3.8040567989731417e-06, "loss": 0.8599987, "num_input_tokens_seen": 29476205, "step": 1396, "time_per_iteration": 2.72131609916687 }, { "auxiliary_loss_clip": 0.01370364, "auxiliary_loss_mlp": 0.01200997, "balance_loss_clip": 1.01235747, "balance_loss_mlp": 1.00129771, "epoch": 0.16797931822281006, "flos": 15669666979200.0, "grad_norm": 3.2614145888696977, "language_loss": 0.79491234, "learning_rate": 3.8037203982406876e-06, "loss": 0.82062596, "num_input_tokens_seen": 29494370, "step": 1397, "time_per_iteration": 2.6601290702819824 }, { "auxiliary_loss_clip": 0.01395194, "auxiliary_loss_mlp": 0.01201445, "balance_loss_clip": 1.01319957, "balance_loss_mlp": 1.00136399, "epoch": 0.16809956111344918, "flos": 16541109902880.0, "grad_norm": 2.476308265656654, "language_loss": 0.73037976, "learning_rate": 3.8033837238852835e-06, "loss": 0.75634611, "num_input_tokens_seen": 29511070, "step": 1398, "time_per_iteration": 2.7120280265808105 }, { "auxiliary_loss_clip": 0.01356987, "auxiliary_loss_mlp": 0.01201357, "balance_loss_clip": 1.01201153, "balance_loss_mlp": 1.00127649, "epoch": 0.16821980400408826, "flos": 23258095997760.0, "grad_norm": 2.3387043004445207, "language_loss": 0.69507623, "learning_rate": 3.8030467759580017e-06, "loss": 0.72065967, "num_input_tokens_seen": 29531990, "step": 1399, "time_per_iteration": 2.75551438331604 }, { "auxiliary_loss_clip": 0.01383415, "auxiliary_loss_mlp": 0.01201615, "balance_loss_clip": 1.01321149, "balance_loss_mlp": 1.00153375, "epoch": 0.16834004689472734, "flos": 20774763974880.0, "grad_norm": 2.1688910516183943, "language_loss": 0.87006295, "learning_rate": 3.802709554509958e-06, "loss": 0.89591324, "num_input_tokens_seen": 29549790, "step": 1400, "time_per_iteration": 2.683403253555298 }, { "auxiliary_loss_clip": 0.01357826, "auxiliary_loss_mlp": 0.01201889, "balance_loss_clip": 1.01190686, "balance_loss_mlp": 1.0018084, "epoch": 0.16846028978536645, "flos": 26687314314720.0, "grad_norm": 1.7511337191807241, "language_loss": 0.79289025, "learning_rate": 3.8023720595923083e-06, "loss": 0.81848741, "num_input_tokens_seen": 29569045, "step": 1401, "time_per_iteration": 3.654433012008667 }, { "auxiliary_loss_clip": 0.01332589, "auxiliary_loss_mlp": 0.01202024, "balance_loss_clip": 1.01135123, "balance_loss_mlp": 1.001562, "epoch": 0.16858053267600553, "flos": 18843308929440.0, "grad_norm": 2.6799192601214537, "language_loss": 0.87220639, "learning_rate": 3.80203429125625e-06, "loss": 0.89755255, "num_input_tokens_seen": 29587220, "step": 1402, "time_per_iteration": 3.7835826873779297 }, { "auxiliary_loss_clip": 0.01296371, "auxiliary_loss_mlp": 0.01201478, "balance_loss_clip": 1.01186204, "balance_loss_mlp": 1.00177908, "epoch": 0.16870077556664462, "flos": 27744560313120.0, "grad_norm": 2.73225720852643, "language_loss": 0.70371705, "learning_rate": 3.8016962495530225e-06, "loss": 0.72869557, "num_input_tokens_seen": 29606410, "step": 1403, "time_per_iteration": 3.658017635345459 }, { "auxiliary_loss_clip": 0.0139488, "auxiliary_loss_mlp": 0.01201686, "balance_loss_clip": 1.01302505, "balance_loss_mlp": 1.00141406, "epoch": 0.1688210184572837, "flos": 13730775732000.0, "grad_norm": 2.4240890382903113, "language_loss": 0.7650677, "learning_rate": 3.8013579345339063e-06, "loss": 0.79103339, "num_input_tokens_seen": 29621275, "step": 1404, "time_per_iteration": 2.6350343227386475 }, { "auxiliary_loss_clip": 0.0133215, "auxiliary_loss_mlp": 0.01201584, "balance_loss_clip": 1.01145971, "balance_loss_mlp": 1.00150323, "epoch": 0.1689412613479228, "flos": 26468797976640.0, "grad_norm": 1.8438111411782045, "language_loss": 0.69161355, "learning_rate": 3.801019346250224e-06, "loss": 0.71695083, "num_input_tokens_seen": 29641420, "step": 1405, "time_per_iteration": 2.7565672397613525 }, { "auxiliary_loss_clip": 0.01369665, "auxiliary_loss_mlp": 0.01201804, "balance_loss_clip": 1.01260424, "balance_loss_mlp": 1.00172281, "epoch": 0.1690615042385619, "flos": 21138862075200.0, "grad_norm": 2.38366328396077, "language_loss": 0.8359617, "learning_rate": 3.8006804847533395e-06, "loss": 0.86167634, "num_input_tokens_seen": 29660935, "step": 1406, "time_per_iteration": 2.713038921356201 }, { "auxiliary_loss_clip": 0.01394488, "auxiliary_loss_mlp": 0.01201946, "balance_loss_clip": 1.01257813, "balance_loss_mlp": 1.00186467, "epoch": 0.16918174712920098, "flos": 20849351040000.0, "grad_norm": 1.9321908449668357, "language_loss": 0.8513335, "learning_rate": 3.8003413500946556e-06, "loss": 0.87729788, "num_input_tokens_seen": 29681045, "step": 1407, "time_per_iteration": 2.642219305038452 }, { "auxiliary_loss_clip": 0.01348575, "auxiliary_loss_mlp": 0.01201805, "balance_loss_clip": 1.01173747, "balance_loss_mlp": 1.00172448, "epoch": 0.1693019900198401, "flos": 16983279665280.0, "grad_norm": 3.4806610451522664, "language_loss": 0.83266473, "learning_rate": 3.8000019423256216e-06, "loss": 0.85816854, "num_input_tokens_seen": 29698810, "step": 1408, "time_per_iteration": 2.714073896408081 }, { "auxiliary_loss_clip": 0.01358362, "auxiliary_loss_mlp": 0.01201359, "balance_loss_clip": 1.01182628, "balance_loss_mlp": 1.00127792, "epoch": 0.16942223291047917, "flos": 26796913758720.0, "grad_norm": 1.7352530432465703, "language_loss": 0.87862724, "learning_rate": 3.7996622614977234e-06, "loss": 0.90422446, "num_input_tokens_seen": 29720000, "step": 1409, "time_per_iteration": 2.825984239578247 }, { "auxiliary_loss_clip": 0.01357789, "auxiliary_loss_mlp": 0.01201024, "balance_loss_clip": 1.01313627, "balance_loss_mlp": 1.00151527, "epoch": 0.16954247580111825, "flos": 18583709110560.0, "grad_norm": 1.9204916507417868, "language_loss": 0.79091328, "learning_rate": 3.799322307662492e-06, "loss": 0.81650138, "num_input_tokens_seen": 29737820, "step": 1410, "time_per_iteration": 2.7128043174743652 }, { "auxiliary_loss_clip": 0.01317339, "auxiliary_loss_mlp": 0.01201854, "balance_loss_clip": 1.01083541, "balance_loss_mlp": 1.00177288, "epoch": 0.16966271869175734, "flos": 13983657822720.0, "grad_norm": 2.187983466276237, "language_loss": 0.83381569, "learning_rate": 3.798982080871496e-06, "loss": 0.8590076, "num_input_tokens_seen": 29752960, "step": 1411, "time_per_iteration": 2.784599542617798 }, { "auxiliary_loss_clip": 0.01394533, "auxiliary_loss_mlp": 0.01201935, "balance_loss_clip": 1.01247716, "balance_loss_mlp": 1.00185466, "epoch": 0.16978296158239645, "flos": 37487343404160.0, "grad_norm": 2.180866444857061, "language_loss": 0.67991436, "learning_rate": 3.798641581176349e-06, "loss": 0.70587903, "num_input_tokens_seen": 29775240, "step": 1412, "time_per_iteration": 2.8828608989715576 }, { "auxiliary_loss_clip": 0.01357028, "auxiliary_loss_mlp": 0.01201501, "balance_loss_clip": 1.01169014, "balance_loss_mlp": 1.00180125, "epoch": 0.16990320447303553, "flos": 28328970935520.0, "grad_norm": 4.15950722956562, "language_loss": 0.74793887, "learning_rate": 3.7983008086287044e-06, "loss": 0.77352417, "num_input_tokens_seen": 29796560, "step": 1413, "time_per_iteration": 2.78906512260437 }, { "auxiliary_loss_clip": 0.01357149, "auxiliary_loss_mlp": 0.01202191, "balance_loss_clip": 1.01216221, "balance_loss_mlp": 1.00153804, "epoch": 0.1700234473636746, "flos": 20188197931680.0, "grad_norm": 2.385415285071186, "language_loss": 0.79141361, "learning_rate": 3.797959763280257e-06, "loss": 0.81700695, "num_input_tokens_seen": 29815245, "step": 1414, "time_per_iteration": 2.7631094455718994 }, { "auxiliary_loss_clip": 0.01373285, "auxiliary_loss_mlp": 0.01202259, "balance_loss_clip": 1.01220393, "balance_loss_mlp": 1.00198686, "epoch": 0.17014369025431372, "flos": 24858669137760.0, "grad_norm": 2.56327498339167, "language_loss": 0.78747118, "learning_rate": 3.797618445182743e-06, "loss": 0.81322658, "num_input_tokens_seen": 29836640, "step": 1415, "time_per_iteration": 2.758209228515625 }, { "auxiliary_loss_clip": 0.01324888, "auxiliary_loss_mlp": 0.01201876, "balance_loss_clip": 1.01128244, "balance_loss_mlp": 1.00179565, "epoch": 0.1702639331449528, "flos": 16467241311360.0, "grad_norm": 2.4631188178533354, "language_loss": 0.84964842, "learning_rate": 3.79727685438794e-06, "loss": 0.87491608, "num_input_tokens_seen": 29850830, "step": 1416, "time_per_iteration": 2.7160937786102295 }, { "auxiliary_loss_clip": 0.01365589, "auxiliary_loss_mlp": 0.01198944, "balance_loss_clip": 1.00931752, "balance_loss_mlp": 1.00057924, "epoch": 0.1703841760355919, "flos": 52508897570880.0, "grad_norm": 0.8342919376838426, "language_loss": 0.61647487, "learning_rate": 3.796934990947667e-06, "loss": 0.64212024, "num_input_tokens_seen": 29912515, "step": 1417, "time_per_iteration": 3.3119399547576904 }, { "auxiliary_loss_clip": 0.01365821, "auxiliary_loss_mlp": 0.01198975, "balance_loss_clip": 1.00953126, "balance_loss_mlp": 1.00061083, "epoch": 0.170504418926231, "flos": 49370663160000.0, "grad_norm": 0.8828831244221037, "language_loss": 0.62484694, "learning_rate": 3.7965928549137854e-06, "loss": 0.65049493, "num_input_tokens_seen": 29969330, "step": 1418, "time_per_iteration": 3.1699585914611816 }, { "auxiliary_loss_clip": 0.01348765, "auxiliary_loss_mlp": 0.01202821, "balance_loss_clip": 1.01133609, "balance_loss_mlp": 1.00216794, "epoch": 0.17062466181687008, "flos": 25849231280640.0, "grad_norm": 1.9090501496379868, "language_loss": 0.77652013, "learning_rate": 3.7962504463381953e-06, "loss": 0.80203593, "num_input_tokens_seen": 29990820, "step": 1419, "time_per_iteration": 2.9071362018585205 }, { "auxiliary_loss_clip": 0.01357962, "auxiliary_loss_mlp": 0.00873646, "balance_loss_clip": 1.013026, "balance_loss_mlp": 1.000489, "epoch": 0.17074490470750917, "flos": 20960423354880.0, "grad_norm": 2.7202812758109425, "language_loss": 0.78723574, "learning_rate": 3.7959077652728412e-06, "loss": 0.80955184, "num_input_tokens_seen": 30009275, "step": 1420, "time_per_iteration": 2.8056137561798096 }, { "auxiliary_loss_clip": 0.01357761, "auxiliary_loss_mlp": 0.01201471, "balance_loss_clip": 1.01208055, "balance_loss_mlp": 1.00158083, "epoch": 0.17086514759814825, "flos": 20959776728640.0, "grad_norm": 1.9854184690823213, "language_loss": 0.77432334, "learning_rate": 3.795564811769707e-06, "loss": 0.79991567, "num_input_tokens_seen": 30027630, "step": 1421, "time_per_iteration": 2.7463393211364746 }, { "auxiliary_loss_clip": 0.01347031, "auxiliary_loss_mlp": 0.01201693, "balance_loss_clip": 1.01203823, "balance_loss_mlp": 1.00161242, "epoch": 0.17098539048878736, "flos": 28474085689920.0, "grad_norm": 1.8724283415956597, "language_loss": 0.77766007, "learning_rate": 3.795221585880818e-06, "loss": 0.80314732, "num_input_tokens_seen": 30048310, "step": 1422, "time_per_iteration": 2.813868284225464 }, { "auxiliary_loss_clip": 0.01328, "auxiliary_loss_mlp": 0.01202305, "balance_loss_clip": 1.01108062, "balance_loss_mlp": 1.00222385, "epoch": 0.17110563337942644, "flos": 16290023996160.0, "grad_norm": 1.9487982001191255, "language_loss": 0.91367787, "learning_rate": 3.794878087658242e-06, "loss": 0.93898094, "num_input_tokens_seen": 30066080, "step": 1423, "time_per_iteration": 2.735811710357666 }, { "auxiliary_loss_clip": 0.01382714, "auxiliary_loss_mlp": 0.01202275, "balance_loss_clip": 1.01264572, "balance_loss_mlp": 1.00181246, "epoch": 0.17122587627006552, "flos": 29674219174560.0, "grad_norm": 1.939007855292022, "language_loss": 0.78424728, "learning_rate": 3.7945343171540873e-06, "loss": 0.8100971, "num_input_tokens_seen": 30086955, "step": 1424, "time_per_iteration": 2.811002492904663 }, { "auxiliary_loss_clip": 0.01395521, "auxiliary_loss_mlp": 0.0120168, "balance_loss_clip": 1.01298273, "balance_loss_mlp": 1.00159883, "epoch": 0.17134611916070464, "flos": 25338401860320.0, "grad_norm": 2.2165257006164802, "language_loss": 0.78690374, "learning_rate": 3.7941902744205033e-06, "loss": 0.81287575, "num_input_tokens_seen": 30107990, "step": 1425, "time_per_iteration": 2.7676432132720947 }, { "auxiliary_loss_clip": 0.01358285, "auxiliary_loss_mlp": 0.01201988, "balance_loss_clip": 1.0127207, "balance_loss_mlp": 1.0015254, "epoch": 0.17146636205134372, "flos": 13953854377440.0, "grad_norm": 1.8985117904820317, "language_loss": 0.83520031, "learning_rate": 3.7938459595096817e-06, "loss": 0.86080307, "num_input_tokens_seen": 30126535, "step": 1426, "time_per_iteration": 2.6811017990112305 }, { "auxiliary_loss_clip": 0.01380898, "auxiliary_loss_mlp": 0.01202502, "balance_loss_clip": 1.01252747, "balance_loss_mlp": 1.00203943, "epoch": 0.1715866049419828, "flos": 23915225653920.0, "grad_norm": 1.8427597518923757, "language_loss": 0.85999262, "learning_rate": 3.7935013724738545e-06, "loss": 0.88582659, "num_input_tokens_seen": 30147035, "step": 1427, "time_per_iteration": 3.79127836227417 }, { "auxiliary_loss_clip": 0.01368554, "auxiliary_loss_mlp": 0.01201854, "balance_loss_clip": 1.01217258, "balance_loss_mlp": 1.0017736, "epoch": 0.17170684783262188, "flos": 22709380304160.0, "grad_norm": 1.7427801891880728, "language_loss": 0.77853161, "learning_rate": 3.7931565133652945e-06, "loss": 0.8042357, "num_input_tokens_seen": 30167110, "step": 1428, "time_per_iteration": 3.6851205825805664 }, { "auxiliary_loss_clip": 0.01394343, "auxiliary_loss_mlp": 0.01202368, "balance_loss_clip": 1.01239336, "balance_loss_mlp": 1.00209641, "epoch": 0.171827090723261, "flos": 26613302028480.0, "grad_norm": 2.121421897421299, "language_loss": 0.67999911, "learning_rate": 3.792811382236317e-06, "loss": 0.70596623, "num_input_tokens_seen": 30185620, "step": 1429, "time_per_iteration": 3.7017152309417725 }, { "auxiliary_loss_clip": 0.0138258, "auxiliary_loss_mlp": 0.01202123, "balance_loss_clip": 1.0131321, "balance_loss_mlp": 1.00185192, "epoch": 0.17194733361390008, "flos": 28148520489120.0, "grad_norm": 2.4369458551684415, "language_loss": 0.78213632, "learning_rate": 3.792465979139279e-06, "loss": 0.8079834, "num_input_tokens_seen": 30208225, "step": 1430, "time_per_iteration": 2.790398120880127 }, { "auxiliary_loss_clip": 0.01315529, "auxiliary_loss_mlp": 0.01197343, "balance_loss_clip": 1.00840354, "balance_loss_mlp": 1.00050473, "epoch": 0.17206757650453916, "flos": 65530718985600.0, "grad_norm": 0.9307417594916372, "language_loss": 0.6567347, "learning_rate": 3.792120304126576e-06, "loss": 0.68186343, "num_input_tokens_seen": 30271600, "step": 1431, "time_per_iteration": 3.386317491531372 }, { "auxiliary_loss_clip": 0.01272026, "auxiliary_loss_mlp": 0.01201814, "balance_loss_clip": 1.0091188, "balance_loss_mlp": 1.00154257, "epoch": 0.17218781939517827, "flos": 22273497185760.0, "grad_norm": 3.204233137205264, "language_loss": 0.83579624, "learning_rate": 3.791774357250649e-06, "loss": 0.86053467, "num_input_tokens_seen": 30290430, "step": 1432, "time_per_iteration": 2.8707334995269775 }, { "auxiliary_loss_clip": 0.01358046, "auxiliary_loss_mlp": 0.01202038, "balance_loss_clip": 1.01258659, "balance_loss_mlp": 1.00195765, "epoch": 0.17230806228581735, "flos": 14137322412960.0, "grad_norm": 3.0876045814217266, "language_loss": 0.79552817, "learning_rate": 3.7914281385639757e-06, "loss": 0.82112896, "num_input_tokens_seen": 30308305, "step": 1433, "time_per_iteration": 2.757293224334717 }, { "auxiliary_loss_clip": 0.01380837, "auxiliary_loss_mlp": 0.01201713, "balance_loss_clip": 1.0116173, "balance_loss_mlp": 1.00182295, "epoch": 0.17242830517645644, "flos": 20704846988160.0, "grad_norm": 2.118642079909253, "language_loss": 0.79578489, "learning_rate": 3.7910816481190784e-06, "loss": 0.82161039, "num_input_tokens_seen": 30328120, "step": 1434, "time_per_iteration": 2.727506637573242 }, { "auxiliary_loss_clip": 0.01360914, "auxiliary_loss_mlp": 0.0120143, "balance_loss_clip": 1.01155496, "balance_loss_mlp": 1.00153959, "epoch": 0.17254854806709552, "flos": 30774596303520.0, "grad_norm": 2.109501579914878, "language_loss": 0.75288695, "learning_rate": 3.7907348859685193e-06, "loss": 0.77851033, "num_input_tokens_seen": 30349825, "step": 1435, "time_per_iteration": 2.8632619380950928 }, { "auxiliary_loss_clip": 0.01368521, "auxiliary_loss_mlp": 0.01201391, "balance_loss_clip": 1.01257694, "balance_loss_mlp": 1.00150061, "epoch": 0.17266879095773463, "flos": 26614738975680.0, "grad_norm": 2.1439756353462833, "language_loss": 0.80436492, "learning_rate": 3.790387852164902e-06, "loss": 0.83006406, "num_input_tokens_seen": 30370555, "step": 1436, "time_per_iteration": 2.808216094970703 }, { "auxiliary_loss_clip": 0.01373046, "auxiliary_loss_mlp": 0.01202022, "balance_loss_clip": 1.01182151, "balance_loss_mlp": 1.00155997, "epoch": 0.1727890338483737, "flos": 20266305517440.0, "grad_norm": 1.9668124167742072, "language_loss": 0.76470912, "learning_rate": 3.7900405467608707e-06, "loss": 0.79045975, "num_input_tokens_seen": 30390100, "step": 1437, "time_per_iteration": 2.7269303798675537 }, { "auxiliary_loss_clip": 0.01332536, "auxiliary_loss_mlp": 0.01201777, "balance_loss_clip": 1.01153588, "balance_loss_mlp": 1.00150526, "epoch": 0.1729092767390128, "flos": 18179820781920.0, "grad_norm": 3.016802561596278, "language_loss": 0.79417992, "learning_rate": 3.7896929698091114e-06, "loss": 0.81952298, "num_input_tokens_seen": 30402915, "step": 1438, "time_per_iteration": 2.8082354068756104 }, { "auxiliary_loss_clip": 0.01394888, "auxiliary_loss_mlp": 0.01202125, "balance_loss_clip": 1.01313353, "balance_loss_mlp": 1.00185347, "epoch": 0.1730295196296519, "flos": 26759530416960.0, "grad_norm": 3.4746965227230198, "language_loss": 0.68396676, "learning_rate": 3.7893451213623518e-06, "loss": 0.70993692, "num_input_tokens_seen": 30420145, "step": 1439, "time_per_iteration": 2.7689483165740967 }, { "auxiliary_loss_clip": 0.01370184, "auxiliary_loss_mlp": 0.00873676, "balance_loss_clip": 1.01298261, "balance_loss_mlp": 1.00042748, "epoch": 0.173149762520291, "flos": 23842542543840.0, "grad_norm": 2.10733779843401, "language_loss": 0.82626104, "learning_rate": 3.7889970014733606e-06, "loss": 0.84869969, "num_input_tokens_seen": 30439250, "step": 1440, "time_per_iteration": 2.8622193336486816 }, { "auxiliary_loss_clip": 0.01332249, "auxiliary_loss_mlp": 0.01201673, "balance_loss_clip": 1.01188016, "balance_loss_mlp": 1.00197387, "epoch": 0.17327000541093007, "flos": 23368198373280.0, "grad_norm": 1.7187188363407595, "language_loss": 0.77855551, "learning_rate": 3.7886486101949463e-06, "loss": 0.80389476, "num_input_tokens_seen": 30460430, "step": 1441, "time_per_iteration": 2.98724365234375 }, { "auxiliary_loss_clip": 0.01345799, "auxiliary_loss_mlp": 0.01201985, "balance_loss_clip": 1.0125128, "balance_loss_mlp": 1.00190449, "epoch": 0.17339024830156918, "flos": 18221299423200.0, "grad_norm": 2.6495290865208396, "language_loss": 0.8827498, "learning_rate": 3.7882999475799594e-06, "loss": 0.90822768, "num_input_tokens_seen": 30478465, "step": 1442, "time_per_iteration": 2.8725972175598145 }, { "auxiliary_loss_clip": 0.01320159, "auxiliary_loss_mlp": 0.01201914, "balance_loss_clip": 1.01137817, "balance_loss_mlp": 1.00164223, "epoch": 0.17351049119220827, "flos": 23332036436640.0, "grad_norm": 2.055236658986139, "language_loss": 0.81540203, "learning_rate": 3.787951013681293e-06, "loss": 0.84062266, "num_input_tokens_seen": 30496510, "step": 1443, "time_per_iteration": 2.80037260055542 }, { "auxiliary_loss_clip": 0.01382398, "auxiliary_loss_mlp": 0.01202185, "balance_loss_clip": 1.01242042, "balance_loss_mlp": 1.00153184, "epoch": 0.17363073408284735, "flos": 23803506712800.0, "grad_norm": 2.9636072792155836, "language_loss": 0.77740175, "learning_rate": 3.787601808551879e-06, "loss": 0.80324757, "num_input_tokens_seen": 30516325, "step": 1444, "time_per_iteration": 2.7266664505004883 }, { "auxiliary_loss_clip": 0.01335562, "auxiliary_loss_mlp": 0.01202685, "balance_loss_clip": 1.01168287, "balance_loss_mlp": 1.00203204, "epoch": 0.17375097697348643, "flos": 18515300918400.0, "grad_norm": 2.569687510261161, "language_loss": 0.84189111, "learning_rate": 3.7872523322446926e-06, "loss": 0.86727357, "num_input_tokens_seen": 30535210, "step": 1445, "time_per_iteration": 2.7680039405822754 }, { "auxiliary_loss_clip": 0.01329231, "auxiliary_loss_mlp": 0.01202051, "balance_loss_clip": 1.01085103, "balance_loss_mlp": 1.00177979, "epoch": 0.17387121986412554, "flos": 38877914118240.0, "grad_norm": 1.6555150104007432, "language_loss": 0.59967476, "learning_rate": 3.7869025848127478e-06, "loss": 0.6249876, "num_input_tokens_seen": 30559405, "step": 1446, "time_per_iteration": 2.9661166667938232 }, { "auxiliary_loss_clip": 0.0138071, "auxiliary_loss_mlp": 0.01201613, "balance_loss_clip": 1.01210976, "balance_loss_mlp": 1.00153232, "epoch": 0.17399146275476463, "flos": 20375725343040.0, "grad_norm": 3.0342181760968208, "language_loss": 0.80718243, "learning_rate": 3.786552566309102e-06, "loss": 0.83300567, "num_input_tokens_seen": 30577615, "step": 1447, "time_per_iteration": 2.7635929584503174 }, { "auxiliary_loss_clip": 0.01342695, "auxiliary_loss_mlp": 0.00873689, "balance_loss_clip": 1.01166117, "balance_loss_mlp": 1.00041699, "epoch": 0.1741117056454037, "flos": 19164347746560.0, "grad_norm": 2.034456757690641, "language_loss": 0.86253476, "learning_rate": 3.7862022767868517e-06, "loss": 0.88469863, "num_input_tokens_seen": 30595205, "step": 1448, "time_per_iteration": 2.7249138355255127 }, { "auxiliary_loss_clip": 0.01322177, "auxiliary_loss_mlp": 0.0120206, "balance_loss_clip": 1.01174855, "balance_loss_mlp": 1.00236034, "epoch": 0.17423194853604282, "flos": 25374312331200.0, "grad_norm": 2.6449962688715205, "language_loss": 0.8467716, "learning_rate": 3.7858517162991367e-06, "loss": 0.87201393, "num_input_tokens_seen": 30615280, "step": 1449, "time_per_iteration": 2.777221918106079 }, { "auxiliary_loss_clip": 0.01345367, "auxiliary_loss_mlp": 0.01201712, "balance_loss_clip": 1.01159275, "balance_loss_mlp": 1.00163138, "epoch": 0.1743521914266819, "flos": 25191886082400.0, "grad_norm": 2.3581229334045344, "language_loss": 0.60932267, "learning_rate": 3.7855008848991363e-06, "loss": 0.63479346, "num_input_tokens_seen": 30633485, "step": 1450, "time_per_iteration": 2.7364418506622314 }, { "auxiliary_loss_clip": 0.01347753, "auxiliary_loss_mlp": 0.0120124, "balance_loss_clip": 1.01167595, "balance_loss_mlp": 1.00134993, "epoch": 0.17447243431732098, "flos": 25666589489760.0, "grad_norm": 1.7950237934159636, "language_loss": 0.77614927, "learning_rate": 3.7851497826400714e-06, "loss": 0.8016392, "num_input_tokens_seen": 30653625, "step": 1451, "time_per_iteration": 2.8807642459869385 }, { "auxiliary_loss_clip": 0.01394697, "auxiliary_loss_mlp": 0.01202157, "balance_loss_clip": 1.01328015, "balance_loss_mlp": 1.00150394, "epoch": 0.17459267720796007, "flos": 36281965062240.0, "grad_norm": 5.203827372880638, "language_loss": 0.76148641, "learning_rate": 3.7847984095752034e-06, "loss": 0.7874549, "num_input_tokens_seen": 30677080, "step": 1452, "time_per_iteration": 2.768364191055298 }, { "auxiliary_loss_clip": 0.01393643, "auxiliary_loss_mlp": 0.01201481, "balance_loss_clip": 1.01216984, "balance_loss_mlp": 1.00159061, "epoch": 0.17471292009859918, "flos": 20011124311200.0, "grad_norm": 2.017094913232364, "language_loss": 0.80273718, "learning_rate": 3.784446765757836e-06, "loss": 0.82868844, "num_input_tokens_seen": 30695725, "step": 1453, "time_per_iteration": 3.588878870010376 }, { "auxiliary_loss_clip": 0.01330815, "auxiliary_loss_mlp": 0.01202051, "balance_loss_clip": 1.01127779, "balance_loss_mlp": 1.00158906, "epoch": 0.17483316298923826, "flos": 27819255149280.0, "grad_norm": 2.33584559733057, "language_loss": 0.77602553, "learning_rate": 3.7840948512413133e-06, "loss": 0.80135417, "num_input_tokens_seen": 30713310, "step": 1454, "time_per_iteration": 3.744553565979004 }, { "auxiliary_loss_clip": 0.01318469, "auxiliary_loss_mlp": 0.01202582, "balance_loss_clip": 1.01164007, "balance_loss_mlp": 1.00173867, "epoch": 0.17495340587987734, "flos": 44017951645440.0, "grad_norm": 1.861756448657423, "language_loss": 0.78635788, "learning_rate": 3.7837426660790196e-06, "loss": 0.81156838, "num_input_tokens_seen": 30734725, "step": 1455, "time_per_iteration": 3.8612377643585205 }, { "auxiliary_loss_clip": 0.01393897, "auxiliary_loss_mlp": 0.01201534, "balance_loss_clip": 1.01259685, "balance_loss_mlp": 1.0016439, "epoch": 0.17507364877051645, "flos": 20885836289760.0, "grad_norm": 1.9732067824677648, "language_loss": 0.81823373, "learning_rate": 3.783390210324382e-06, "loss": 0.84418803, "num_input_tokens_seen": 30754450, "step": 1456, "time_per_iteration": 2.6391217708587646 }, { "auxiliary_loss_clip": 0.01332608, "auxiliary_loss_mlp": 0.01201377, "balance_loss_clip": 1.01187229, "balance_loss_mlp": 1.00148678, "epoch": 0.17519389166115554, "flos": 24717613759200.0, "grad_norm": 1.8444727013323, "language_loss": 0.72608179, "learning_rate": 3.7830374840308676e-06, "loss": 0.75142163, "num_input_tokens_seen": 30774605, "step": 1457, "time_per_iteration": 2.7859325408935547 }, { "auxiliary_loss_clip": 0.01369125, "auxiliary_loss_mlp": 0.01202559, "balance_loss_clip": 1.01151586, "balance_loss_mlp": 1.00171566, "epoch": 0.17531413455179462, "flos": 23798154084480.0, "grad_norm": 2.292851080360323, "language_loss": 0.82850051, "learning_rate": 3.7826844872519842e-06, "loss": 0.85421741, "num_input_tokens_seen": 30792460, "step": 1458, "time_per_iteration": 2.696977376937866 }, { "auxiliary_loss_clip": 0.01346786, "auxiliary_loss_mlp": 0.01201591, "balance_loss_clip": 1.01180089, "balance_loss_mlp": 1.00189102, "epoch": 0.1754343774424337, "flos": 24572391233760.0, "grad_norm": 2.255066815720762, "language_loss": 0.72827733, "learning_rate": 3.782331220041282e-06, "loss": 0.75376117, "num_input_tokens_seen": 30812525, "step": 1459, "time_per_iteration": 2.8715147972106934 }, { "auxiliary_loss_clip": 0.01346996, "auxiliary_loss_mlp": 0.01201612, "balance_loss_clip": 1.01227212, "balance_loss_mlp": 1.00134039, "epoch": 0.17555462033307281, "flos": 18114609797280.0, "grad_norm": 1.933793027516654, "language_loss": 0.82846546, "learning_rate": 3.7819776824523504e-06, "loss": 0.85395157, "num_input_tokens_seen": 30830390, "step": 1460, "time_per_iteration": 2.7129461765289307 }, { "auxiliary_loss_clip": 0.01367385, "auxiliary_loss_mlp": 0.01202577, "balance_loss_clip": 1.0124197, "balance_loss_mlp": 1.00211453, "epoch": 0.1756748632237119, "flos": 28366030964160.0, "grad_norm": 2.094487730108026, "language_loss": 0.83826292, "learning_rate": 3.7816238745388213e-06, "loss": 0.86396247, "num_input_tokens_seen": 30849935, "step": 1461, "time_per_iteration": 2.7617387771606445 }, { "auxiliary_loss_clip": 0.01367412, "auxiliary_loss_mlp": 0.0120179, "balance_loss_clip": 1.01189482, "balance_loss_mlp": 1.00170934, "epoch": 0.17579510611435098, "flos": 25732950032160.0, "grad_norm": 1.9392568063865445, "language_loss": 0.87170625, "learning_rate": 3.781269796354367e-06, "loss": 0.89739835, "num_input_tokens_seen": 30869555, "step": 1462, "time_per_iteration": 2.7650444507598877 }, { "auxiliary_loss_clip": 0.01356359, "auxiliary_loss_mlp": 0.01202147, "balance_loss_clip": 1.01261258, "balance_loss_mlp": 1.00187516, "epoch": 0.1759153490049901, "flos": 18588091799520.0, "grad_norm": 2.1389196714540217, "language_loss": 0.86185515, "learning_rate": 3.7809154479527006e-06, "loss": 0.88744009, "num_input_tokens_seen": 30888760, "step": 1463, "time_per_iteration": 2.723837375640869 }, { "auxiliary_loss_clip": 0.0133073, "auxiliary_loss_mlp": 0.01201382, "balance_loss_clip": 1.01144576, "balance_loss_mlp": 1.00130129, "epoch": 0.17603559189562917, "flos": 18619332192000.0, "grad_norm": 2.1885088558712344, "language_loss": 0.84201843, "learning_rate": 3.780560829387577e-06, "loss": 0.86733955, "num_input_tokens_seen": 30907260, "step": 1464, "time_per_iteration": 2.7613611221313477 }, { "auxiliary_loss_clip": 0.01355716, "auxiliary_loss_mlp": 0.01197264, "balance_loss_clip": 1.00830567, "balance_loss_mlp": 1.00042558, "epoch": 0.17615583478626826, "flos": 60530802821280.0, "grad_norm": 0.8546857860204033, "language_loss": 0.5790652, "learning_rate": 3.7802059407127915e-06, "loss": 0.60459495, "num_input_tokens_seen": 30965810, "step": 1465, "time_per_iteration": 3.22127366065979 }, { "auxiliary_loss_clip": 0.01368863, "auxiliary_loss_mlp": 0.01201616, "balance_loss_clip": 1.01248336, "balance_loss_mlp": 1.00153518, "epoch": 0.17627607767690734, "flos": 23616230767200.0, "grad_norm": 2.7827395116119886, "language_loss": 0.85966539, "learning_rate": 3.7798507819821797e-06, "loss": 0.88537014, "num_input_tokens_seen": 30982935, "step": 1466, "time_per_iteration": 2.8022167682647705 }, { "auxiliary_loss_clip": 0.01331931, "auxiliary_loss_mlp": 0.01201977, "balance_loss_clip": 1.01113105, "balance_loss_mlp": 1.00170577, "epoch": 0.17639632056754645, "flos": 17639080145280.0, "grad_norm": 2.63205018246018, "language_loss": 0.7910133, "learning_rate": 3.7794953532496197e-06, "loss": 0.81635243, "num_input_tokens_seen": 30998840, "step": 1467, "time_per_iteration": 2.7353765964508057 }, { "auxiliary_loss_clip": 0.01295102, "auxiliary_loss_mlp": 0.00873042, "balance_loss_clip": 1.01145887, "balance_loss_mlp": 1.00016499, "epoch": 0.17651656345818553, "flos": 57932639706240.0, "grad_norm": 0.8495161871297038, "language_loss": 0.57947338, "learning_rate": 3.7791396545690295e-06, "loss": 0.6011548, "num_input_tokens_seen": 31060075, "step": 1468, "time_per_iteration": 3.346752166748047 }, { "auxiliary_loss_clip": 0.01369104, "auxiliary_loss_mlp": 0.01201349, "balance_loss_clip": 1.01225412, "balance_loss_mlp": 1.00164962, "epoch": 0.17663680634882462, "flos": 22929513207840.0, "grad_norm": 1.9733536331960508, "language_loss": 0.80828416, "learning_rate": 3.7787836859943685e-06, "loss": 0.83398873, "num_input_tokens_seen": 31078800, "step": 1469, "time_per_iteration": 2.8201873302459717 }, { "auxiliary_loss_clip": 0.01369479, "auxiliary_loss_mlp": 0.01201802, "balance_loss_clip": 1.01203215, "balance_loss_mlp": 1.00153041, "epoch": 0.17675704923946373, "flos": 22637990446560.0, "grad_norm": 2.723280282701542, "language_loss": 0.78853703, "learning_rate": 3.7784274475796363e-06, "loss": 0.81424981, "num_input_tokens_seen": 31097430, "step": 1470, "time_per_iteration": 2.7470476627349854 }, { "auxiliary_loss_clip": 0.01331734, "auxiliary_loss_mlp": 0.012023, "balance_loss_clip": 1.0110817, "balance_loss_mlp": 1.0014559, "epoch": 0.1768772921301028, "flos": 27126538335360.0, "grad_norm": 2.543233417862593, "language_loss": 0.75952697, "learning_rate": 3.7780709393788745e-06, "loss": 0.78486729, "num_input_tokens_seen": 31117905, "step": 1471, "time_per_iteration": 2.8054428100585938 }, { "auxiliary_loss_clip": 0.01392999, "auxiliary_loss_mlp": 0.01201422, "balance_loss_clip": 1.01245236, "balance_loss_mlp": 1.00134134, "epoch": 0.1769975350207419, "flos": 19172143185120.0, "grad_norm": 2.074954221206031, "language_loss": 0.75581431, "learning_rate": 3.777714161446165e-06, "loss": 0.78175855, "num_input_tokens_seen": 31137610, "step": 1472, "time_per_iteration": 2.7302908897399902 }, { "auxiliary_loss_clip": 0.01371114, "auxiliary_loss_mlp": 0.01201605, "balance_loss_clip": 1.01236582, "balance_loss_mlp": 1.00152421, "epoch": 0.177117777911381, "flos": 36134946352800.0, "grad_norm": 1.9915530188327044, "language_loss": 0.69153577, "learning_rate": 3.7773571138356304e-06, "loss": 0.71726298, "num_input_tokens_seen": 31157780, "step": 1473, "time_per_iteration": 2.984924793243408 }, { "auxiliary_loss_clip": 0.01303131, "auxiliary_loss_mlp": 0.01201505, "balance_loss_clip": 1.01095939, "balance_loss_mlp": 1.00180531, "epoch": 0.17723802080202009, "flos": 22090603929120.0, "grad_norm": 3.265210196776146, "language_loss": 0.88998747, "learning_rate": 3.776999796601435e-06, "loss": 0.91503382, "num_input_tokens_seen": 31176540, "step": 1474, "time_per_iteration": 2.9630608558654785 }, { "auxiliary_loss_clip": 0.01379857, "auxiliary_loss_mlp": 0.01201586, "balance_loss_clip": 1.01249337, "balance_loss_mlp": 1.00150549, "epoch": 0.17735826369265917, "flos": 30222683402400.0, "grad_norm": 2.176923685352601, "language_loss": 0.72751045, "learning_rate": 3.776642209797783e-06, "loss": 0.75332481, "num_input_tokens_seen": 31198370, "step": 1475, "time_per_iteration": 2.8155179023742676 }, { "auxiliary_loss_clip": 0.01382508, "auxiliary_loss_mlp": 0.01202091, "balance_loss_clip": 1.01314199, "balance_loss_mlp": 1.00181913, "epoch": 0.17747850658329825, "flos": 21397599725760.0, "grad_norm": 1.7339906792215123, "language_loss": 0.77909744, "learning_rate": 3.7762843534789205e-06, "loss": 0.80494344, "num_input_tokens_seen": 31217120, "step": 1476, "time_per_iteration": 2.7288780212402344 }, { "auxiliary_loss_clip": 0.01359134, "auxiliary_loss_mlp": 0.01201659, "balance_loss_clip": 1.01167369, "balance_loss_mlp": 1.00157797, "epoch": 0.17759874947393736, "flos": 16983351512640.0, "grad_norm": 2.1537110957276027, "language_loss": 0.87964457, "learning_rate": 3.7759262276991343e-06, "loss": 0.90525246, "num_input_tokens_seen": 31234730, "step": 1477, "time_per_iteration": 2.777695417404175 }, { "auxiliary_loss_clip": 0.01356901, "auxiliary_loss_mlp": 0.0120243, "balance_loss_clip": 1.01269293, "balance_loss_mlp": 1.00215876, "epoch": 0.17771899236457644, "flos": 11546115282720.0, "grad_norm": 3.776657060585399, "language_loss": 0.80444127, "learning_rate": 3.7755678325127506e-06, "loss": 0.83003461, "num_input_tokens_seen": 31252410, "step": 1478, "time_per_iteration": 2.7439956665039062 }, { "auxiliary_loss_clip": 0.01293126, "auxiliary_loss_mlp": 0.01201351, "balance_loss_clip": 1.00988102, "balance_loss_mlp": 1.00146055, "epoch": 0.17783923525521553, "flos": 18807757695360.0, "grad_norm": 1.76016756045793, "language_loss": 0.75564998, "learning_rate": 3.7752091679741393e-06, "loss": 0.78059477, "num_input_tokens_seen": 31270200, "step": 1479, "time_per_iteration": 3.7078709602355957 }, { "auxiliary_loss_clip": 0.01367694, "auxiliary_loss_mlp": 0.01202153, "balance_loss_clip": 1.01185751, "balance_loss_mlp": 1.0018816, "epoch": 0.17795947814585464, "flos": 30408378706080.0, "grad_norm": 5.135798256926455, "language_loss": 0.76904249, "learning_rate": 3.774850234137708e-06, "loss": 0.79474092, "num_input_tokens_seen": 31287495, "step": 1480, "time_per_iteration": 4.102688550949097 }, { "auxiliary_loss_clip": 0.01381713, "auxiliary_loss_mlp": 0.01201854, "balance_loss_clip": 1.0130136, "balance_loss_mlp": 1.00196409, "epoch": 0.17807972103649372, "flos": 24389066892960.0, "grad_norm": 2.4768761860575927, "language_loss": 0.82544577, "learning_rate": 3.7744910310579076e-06, "loss": 0.8512814, "num_input_tokens_seen": 31306420, "step": 1481, "time_per_iteration": 3.657154083251953 }, { "auxiliary_loss_clip": 0.01393451, "auxiliary_loss_mlp": 0.01200168, "balance_loss_clip": 1.01311135, "balance_loss_mlp": 1.00104082, "epoch": 0.1781999639271328, "flos": 20301569362080.0, "grad_norm": 2.137314196752638, "language_loss": 0.85627431, "learning_rate": 3.774131558789229e-06, "loss": 0.88221049, "num_input_tokens_seen": 31325750, "step": 1482, "time_per_iteration": 2.629467248916626 }, { "auxiliary_loss_clip": 0.0139377, "auxiliary_loss_mlp": 0.00873568, "balance_loss_clip": 1.01305258, "balance_loss_mlp": 1.00041151, "epoch": 0.1783202068177719, "flos": 15924488948640.0, "grad_norm": 2.1960517325087756, "language_loss": 0.70050174, "learning_rate": 3.773771817386203e-06, "loss": 0.72317505, "num_input_tokens_seen": 31343080, "step": 1483, "time_per_iteration": 2.5612545013427734 }, { "auxiliary_loss_clip": 0.01354817, "auxiliary_loss_mlp": 0.01201762, "balance_loss_clip": 1.01228833, "balance_loss_mlp": 1.00168109, "epoch": 0.178440449708411, "flos": 20631768717600.0, "grad_norm": 3.464854167969205, "language_loss": 0.7949152, "learning_rate": 3.773411806903403e-06, "loss": 0.82048094, "num_input_tokens_seen": 31362160, "step": 1484, "time_per_iteration": 2.676892042160034 }, { "auxiliary_loss_clip": 0.01315913, "auxiliary_loss_mlp": 0.01202026, "balance_loss_clip": 1.01217651, "balance_loss_mlp": 1.00194502, "epoch": 0.17856069259905008, "flos": 21686068974240.0, "grad_norm": 1.818691437195138, "language_loss": 0.94732606, "learning_rate": 3.7730515273954415e-06, "loss": 0.97250545, "num_input_tokens_seen": 31380770, "step": 1485, "time_per_iteration": 2.782229423522949 }, { "auxiliary_loss_clip": 0.01394071, "auxiliary_loss_mlp": 0.01201329, "balance_loss_clip": 1.0135498, "balance_loss_mlp": 1.00124824, "epoch": 0.17868093548968916, "flos": 26572972944960.0, "grad_norm": 2.626087417604498, "language_loss": 0.85348517, "learning_rate": 3.772690978916973e-06, "loss": 0.87943918, "num_input_tokens_seen": 31400525, "step": 1486, "time_per_iteration": 2.7353055477142334 }, { "auxiliary_loss_clip": 0.01368451, "auxiliary_loss_mlp": 0.01202041, "balance_loss_clip": 1.01260352, "balance_loss_mlp": 1.00157917, "epoch": 0.17880117838032827, "flos": 18581014834560.0, "grad_norm": 2.8209926539788337, "language_loss": 0.8674438, "learning_rate": 3.772330161522693e-06, "loss": 0.89314872, "num_input_tokens_seen": 31418435, "step": 1487, "time_per_iteration": 2.662527561187744 }, { "auxiliary_loss_clip": 0.01343344, "auxiliary_loss_mlp": 0.01201347, "balance_loss_clip": 1.01207781, "balance_loss_mlp": 1.00164723, "epoch": 0.17892142127096736, "flos": 26541229620960.0, "grad_norm": 2.0155722294026224, "language_loss": 0.79866517, "learning_rate": 3.7719690752673365e-06, "loss": 0.82411206, "num_input_tokens_seen": 31439230, "step": 1488, "time_per_iteration": 2.818390130996704 }, { "auxiliary_loss_clip": 0.01332497, "auxiliary_loss_mlp": 0.01201918, "balance_loss_clip": 1.01186657, "balance_loss_mlp": 1.00164676, "epoch": 0.17904166416160644, "flos": 23872633378560.0, "grad_norm": 1.83194734332046, "language_loss": 0.77992678, "learning_rate": 3.7716077202056796e-06, "loss": 0.80527091, "num_input_tokens_seen": 31457705, "step": 1489, "time_per_iteration": 2.7662947177886963 }, { "auxiliary_loss_clip": 0.01355776, "auxiliary_loss_mlp": 0.01201184, "balance_loss_clip": 1.01220846, "balance_loss_mlp": 1.00129437, "epoch": 0.17916190705224552, "flos": 19134436530240.0, "grad_norm": 1.9991544874396359, "language_loss": 0.93742472, "learning_rate": 3.7712460963925404e-06, "loss": 0.96299434, "num_input_tokens_seen": 31473645, "step": 1490, "time_per_iteration": 2.7280967235565186 }, { "auxiliary_loss_clip": 0.01369346, "auxiliary_loss_mlp": 0.01201122, "balance_loss_clip": 1.0130837, "balance_loss_mlp": 1.00123203, "epoch": 0.17928214994288463, "flos": 25152131777760.0, "grad_norm": 1.8205987147565386, "language_loss": 0.75324756, "learning_rate": 3.7708842038827775e-06, "loss": 0.77895224, "num_input_tokens_seen": 31492605, "step": 1491, "time_per_iteration": 2.7063686847686768 }, { "auxiliary_loss_clip": 0.01381832, "auxiliary_loss_mlp": 0.0120141, "balance_loss_clip": 1.01278591, "balance_loss_mlp": 1.00171077, "epoch": 0.17940239283352372, "flos": 22384641348000.0, "grad_norm": 2.364926316592329, "language_loss": 0.86135679, "learning_rate": 3.770522042731288e-06, "loss": 0.88718921, "num_input_tokens_seen": 31514500, "step": 1492, "time_per_iteration": 2.7153587341308594 }, { "auxiliary_loss_clip": 0.01316961, "auxiliary_loss_mlp": 0.01201281, "balance_loss_clip": 1.01119363, "balance_loss_mlp": 1.00158179, "epoch": 0.1795226357241628, "flos": 23178695159520.0, "grad_norm": 1.9066786191848588, "language_loss": 0.87786114, "learning_rate": 3.7701596129930122e-06, "loss": 0.90304357, "num_input_tokens_seen": 31533225, "step": 1493, "time_per_iteration": 2.7921128273010254 }, { "auxiliary_loss_clip": 0.01345195, "auxiliary_loss_mlp": 0.01202099, "balance_loss_clip": 1.01239955, "balance_loss_mlp": 1.00201809, "epoch": 0.1796428786148019, "flos": 22090424310720.0, "grad_norm": 1.8447711405266272, "language_loss": 0.7330761, "learning_rate": 3.7697969147229315e-06, "loss": 0.75854909, "num_input_tokens_seen": 31551385, "step": 1494, "time_per_iteration": 2.8261959552764893 }, { "auxiliary_loss_clip": 0.01381264, "auxiliary_loss_mlp": 0.01201136, "balance_loss_clip": 1.01299322, "balance_loss_mlp": 1.00162756, "epoch": 0.179763121505441, "flos": 21324629226240.0, "grad_norm": 2.323953410038779, "language_loss": 0.85495675, "learning_rate": 3.7694339479760647e-06, "loss": 0.88078076, "num_input_tokens_seen": 31570415, "step": 1495, "time_per_iteration": 2.7099413871765137 }, { "auxiliary_loss_clip": 0.01338736, "auxiliary_loss_mlp": 0.01197058, "balance_loss_clip": 1.00942945, "balance_loss_mlp": 1.00021958, "epoch": 0.17988336439608008, "flos": 68161895962560.0, "grad_norm": 0.7802359643620931, "language_loss": 0.57302868, "learning_rate": 3.769070712807476e-06, "loss": 0.59838665, "num_input_tokens_seen": 31632445, "step": 1496, "time_per_iteration": 3.4060161113739014 }, { "auxiliary_loss_clip": 0.01283313, "auxiliary_loss_mlp": 0.01200787, "balance_loss_clip": 1.00989914, "balance_loss_mlp": 1.00127864, "epoch": 0.18000360728671919, "flos": 21945058090560.0, "grad_norm": 1.8912710529074448, "language_loss": 0.78632104, "learning_rate": 3.768707209272266e-06, "loss": 0.81116205, "num_input_tokens_seen": 31652575, "step": 1497, "time_per_iteration": 2.882187843322754 }, { "auxiliary_loss_clip": 0.0135628, "auxiliary_loss_mlp": 0.01201488, "balance_loss_clip": 1.01206028, "balance_loss_mlp": 1.00178814, "epoch": 0.18012385017735827, "flos": 18986340110400.0, "grad_norm": 2.5066976094137763, "language_loss": 0.76753175, "learning_rate": 3.768343437425579e-06, "loss": 0.79310942, "num_input_tokens_seen": 31671145, "step": 1498, "time_per_iteration": 2.7473087310791016 }, { "auxiliary_loss_clip": 0.01270303, "auxiliary_loss_mlp": 0.01201493, "balance_loss_clip": 1.01028669, "balance_loss_mlp": 1.00160265, "epoch": 0.18024409306799735, "flos": 19748111742720.0, "grad_norm": 2.323593159495284, "language_loss": 0.85826916, "learning_rate": 3.7679793973225987e-06, "loss": 0.88298714, "num_input_tokens_seen": 31686955, "step": 1499, "time_per_iteration": 2.84818172454834 }, { "auxiliary_loss_clip": 0.01294399, "auxiliary_loss_mlp": 0.01197161, "balance_loss_clip": 1.00695801, "balance_loss_mlp": 1.0003227, "epoch": 0.18036433595863643, "flos": 67227217856640.0, "grad_norm": 0.849169074342832, "language_loss": 0.61620069, "learning_rate": 3.767615089018549e-06, "loss": 0.64111626, "num_input_tokens_seen": 31749300, "step": 1500, "time_per_iteration": 3.361711025238037 }, { "auxiliary_loss_clip": 0.01357053, "auxiliary_loss_mlp": 0.01201246, "balance_loss_clip": 1.01242375, "balance_loss_mlp": 1.00135565, "epoch": 0.18048457884927555, "flos": 18181473271200.0, "grad_norm": 2.0108319522553306, "language_loss": 0.86304784, "learning_rate": 3.7672505125686966e-06, "loss": 0.88863087, "num_input_tokens_seen": 31765665, "step": 1501, "time_per_iteration": 2.6895411014556885 }, { "auxiliary_loss_clip": 0.01319676, "auxiliary_loss_mlp": 0.0120203, "balance_loss_clip": 1.01124537, "balance_loss_mlp": 1.00214028, "epoch": 0.18060482173991463, "flos": 15813775870560.0, "grad_norm": 3.1610582888652337, "language_loss": 0.84235072, "learning_rate": 3.7668856680283455e-06, "loss": 0.86756778, "num_input_tokens_seen": 31782690, "step": 1502, "time_per_iteration": 2.8786301612854004 }, { "auxiliary_loss_clip": 0.01356128, "auxiliary_loss_mlp": 0.01201651, "balance_loss_clip": 1.01247454, "balance_loss_mlp": 1.0015707, "epoch": 0.1807250646305537, "flos": 18587409249600.0, "grad_norm": 1.8270826994182503, "language_loss": 0.82712412, "learning_rate": 3.7665205554528437e-06, "loss": 0.85270196, "num_input_tokens_seen": 31802045, "step": 1503, "time_per_iteration": 2.6937174797058105 }, { "auxiliary_loss_clip": 0.01344546, "auxiliary_loss_mlp": 0.01201755, "balance_loss_clip": 1.01123798, "balance_loss_mlp": 1.00186491, "epoch": 0.18084530752119282, "flos": 23149143180000.0, "grad_norm": 1.8040556662145117, "language_loss": 0.74077642, "learning_rate": 3.7661551748975782e-06, "loss": 0.7662394, "num_input_tokens_seen": 31820220, "step": 1504, "time_per_iteration": 2.8202292919158936 }, { "auxiliary_loss_clip": 0.01338193, "auxiliary_loss_mlp": 0.01197133, "balance_loss_clip": 1.00917768, "balance_loss_mlp": 1.00029469, "epoch": 0.1809655504118319, "flos": 59803181399520.0, "grad_norm": 0.8144134729712119, "language_loss": 0.6049757, "learning_rate": 3.7657895264179772e-06, "loss": 0.63032901, "num_input_tokens_seen": 31876195, "step": 1505, "time_per_iteration": 5.419504880905151 }, { "auxiliary_loss_clip": 0.01369504, "auxiliary_loss_mlp": 0.01201713, "balance_loss_clip": 1.01289999, "balance_loss_mlp": 1.00182247, "epoch": 0.181085793302471, "flos": 44201958536160.0, "grad_norm": 2.4484886524856315, "language_loss": 0.74616492, "learning_rate": 3.765423610069509e-06, "loss": 0.77187705, "num_input_tokens_seen": 31901585, "step": 1506, "time_per_iteration": 3.182666063308716 }, { "auxiliary_loss_clip": 0.01346782, "auxiliary_loss_mlp": 0.01201641, "balance_loss_clip": 1.011204, "balance_loss_mlp": 1.00175095, "epoch": 0.18120603619311007, "flos": 34898399465760.0, "grad_norm": 1.871352613071777, "language_loss": 0.72052765, "learning_rate": 3.765057425907683e-06, "loss": 0.74601191, "num_input_tokens_seen": 31923045, "step": 1507, "time_per_iteration": 3.8586294651031494 }, { "auxiliary_loss_clip": 0.01378541, "auxiliary_loss_mlp": 0.01200987, "balance_loss_clip": 1.01173759, "balance_loss_mlp": 1.00147808, "epoch": 0.18132627908374918, "flos": 21506768085600.0, "grad_norm": 2.07418163974665, "language_loss": 0.78579128, "learning_rate": 3.764690973988048e-06, "loss": 0.81158656, "num_input_tokens_seen": 31943385, "step": 1508, "time_per_iteration": 2.759404182434082 }, { "auxiliary_loss_clip": 0.01334826, "auxiliary_loss_mlp": 0.01200873, "balance_loss_clip": 1.01091719, "balance_loss_mlp": 1.00155485, "epoch": 0.18144652197438826, "flos": 29057705991360.0, "grad_norm": 2.362853485497586, "language_loss": 0.74043959, "learning_rate": 3.7643242543661967e-06, "loss": 0.7657966, "num_input_tokens_seen": 31966045, "step": 1509, "time_per_iteration": 3.0392959117889404 }, { "auxiliary_loss_clip": 0.01323181, "auxiliary_loss_mlp": 0.01197024, "balance_loss_clip": 1.00636125, "balance_loss_mlp": 1.00018549, "epoch": 0.18156676486502735, "flos": 68675096345760.0, "grad_norm": 0.8411055049458789, "language_loss": 0.60460532, "learning_rate": 3.7639572670977573e-06, "loss": 0.62980735, "num_input_tokens_seen": 32021540, "step": 1510, "time_per_iteration": 3.2135047912597656 }, { "auxiliary_loss_clip": 0.01335337, "auxiliary_loss_mlp": 0.01201126, "balance_loss_clip": 1.01120353, "balance_loss_mlp": 1.0014267, "epoch": 0.18168700775566646, "flos": 26471528176320.0, "grad_norm": 1.8587566604081378, "language_loss": 0.76638913, "learning_rate": 3.7635900122384042e-06, "loss": 0.79175377, "num_input_tokens_seen": 32044535, "step": 1511, "time_per_iteration": 2.827857732772827 }, { "auxiliary_loss_clip": 0.0135433, "auxiliary_loss_mlp": 0.01201772, "balance_loss_clip": 1.01142764, "balance_loss_mlp": 1.00150025, "epoch": 0.18180725064630554, "flos": 15005675900160.0, "grad_norm": 2.0282605870387256, "language_loss": 0.86659372, "learning_rate": 3.7632224898438477e-06, "loss": 0.89215469, "num_input_tokens_seen": 32061010, "step": 1512, "time_per_iteration": 2.7296743392944336 }, { "auxiliary_loss_clip": 0.0134306, "auxiliary_loss_mlp": 0.01201015, "balance_loss_clip": 1.01123214, "balance_loss_mlp": 1.00150645, "epoch": 0.18192749353694462, "flos": 19682397826560.0, "grad_norm": 1.753444428220311, "language_loss": 0.79229814, "learning_rate": 3.762854699969842e-06, "loss": 0.81773889, "num_input_tokens_seen": 32081520, "step": 1513, "time_per_iteration": 2.8266730308532715 }, { "auxiliary_loss_clip": 0.01366679, "auxiliary_loss_mlp": 0.01201321, "balance_loss_clip": 1.01152349, "balance_loss_mlp": 1.00143135, "epoch": 0.1820477364275837, "flos": 20702727491040.0, "grad_norm": 1.9816521062121024, "language_loss": 0.7290374, "learning_rate": 3.762486642672179e-06, "loss": 0.75471735, "num_input_tokens_seen": 32098460, "step": 1514, "time_per_iteration": 2.70467472076416 }, { "auxiliary_loss_clip": 0.01349835, "auxiliary_loss_mlp": 0.01201101, "balance_loss_clip": 1.01138902, "balance_loss_mlp": 1.00140154, "epoch": 0.18216797931822282, "flos": 17128717732800.0, "grad_norm": 3.0500661929983104, "language_loss": 0.87042075, "learning_rate": 3.7621183180066946e-06, "loss": 0.89593011, "num_input_tokens_seen": 32116420, "step": 1515, "time_per_iteration": 2.7722620964050293 }, { "auxiliary_loss_clip": 0.01366998, "auxiliary_loss_mlp": 0.01201243, "balance_loss_clip": 1.01195717, "balance_loss_mlp": 1.00173461, "epoch": 0.1822882222088619, "flos": 29242575050400.0, "grad_norm": 1.5387147214090227, "language_loss": 0.73689801, "learning_rate": 3.7617497260292625e-06, "loss": 0.76258039, "num_input_tokens_seen": 32138475, "step": 1516, "time_per_iteration": 2.8272085189819336 }, { "auxiliary_loss_clip": 0.01340963, "auxiliary_loss_mlp": 0.01201329, "balance_loss_clip": 1.01156139, "balance_loss_mlp": 1.00163007, "epoch": 0.18240846509950098, "flos": 17702746411680.0, "grad_norm": 2.8138705989772363, "language_loss": 0.78635919, "learning_rate": 3.7613808667957967e-06, "loss": 0.81178212, "num_input_tokens_seen": 32151165, "step": 1517, "time_per_iteration": 2.7092010974884033 }, { "auxiliary_loss_clip": 0.0135658, "auxiliary_loss_mlp": 0.01201354, "balance_loss_clip": 1.01227784, "balance_loss_mlp": 1.00146437, "epoch": 0.1825287079901401, "flos": 14790033456480.0, "grad_norm": 3.760326675766989, "language_loss": 0.90849501, "learning_rate": 3.7610117403622547e-06, "loss": 0.93407434, "num_input_tokens_seen": 32167725, "step": 1518, "time_per_iteration": 2.713390827178955 }, { "auxiliary_loss_clip": 0.01346461, "auxiliary_loss_mlp": 0.0120077, "balance_loss_clip": 1.01169729, "balance_loss_mlp": 1.00145197, "epoch": 0.18264895088077918, "flos": 21946243572000.0, "grad_norm": 1.8794203840515111, "language_loss": 0.89989448, "learning_rate": 3.7606423467846313e-06, "loss": 0.92536676, "num_input_tokens_seen": 32187330, "step": 1519, "time_per_iteration": 2.762075662612915 }, { "auxiliary_loss_clip": 0.01332092, "auxiliary_loss_mlp": 0.01201236, "balance_loss_clip": 1.01089418, "balance_loss_mlp": 1.0013454, "epoch": 0.18276919377141826, "flos": 20886770305440.0, "grad_norm": 1.555183228292317, "language_loss": 0.79971427, "learning_rate": 3.760272686118964e-06, "loss": 0.82504755, "num_input_tokens_seen": 32205550, "step": 1520, "time_per_iteration": 2.805131196975708 }, { "auxiliary_loss_clip": 0.01351734, "auxiliary_loss_mlp": 0.01200953, "balance_loss_clip": 1.01096272, "balance_loss_mlp": 1.00125408, "epoch": 0.18288943666205737, "flos": 21469887675360.0, "grad_norm": 2.043532542469835, "language_loss": 0.9238174, "learning_rate": 3.7599027584213297e-06, "loss": 0.94934428, "num_input_tokens_seen": 32224430, "step": 1521, "time_per_iteration": 2.834587812423706 }, { "auxiliary_loss_clip": 0.01379845, "auxiliary_loss_mlp": 0.01201467, "balance_loss_clip": 1.01234651, "balance_loss_mlp": 1.00176787, "epoch": 0.18300967955269645, "flos": 21539373577920.0, "grad_norm": 3.040225455551343, "language_loss": 0.78508276, "learning_rate": 3.7595325637478465e-06, "loss": 0.81089586, "num_input_tokens_seen": 32242455, "step": 1522, "time_per_iteration": 2.7816569805145264 }, { "auxiliary_loss_clip": 0.01353972, "auxiliary_loss_mlp": 0.01201445, "balance_loss_clip": 1.01214838, "balance_loss_mlp": 1.00155473, "epoch": 0.18312992244333554, "flos": 28876249681920.0, "grad_norm": 2.8398313281111762, "language_loss": 0.81646776, "learning_rate": 3.7591621021546723e-06, "loss": 0.84202194, "num_input_tokens_seen": 32264450, "step": 1523, "time_per_iteration": 2.7582173347473145 }, { "auxiliary_loss_clip": 0.01379939, "auxiliary_loss_mlp": 0.01201343, "balance_loss_clip": 1.01185226, "balance_loss_mlp": 1.00164366, "epoch": 0.18325016533397462, "flos": 20120113052640.0, "grad_norm": 2.127414469778929, "language_loss": 0.81751573, "learning_rate": 3.7587913736980062e-06, "loss": 0.84332854, "num_input_tokens_seen": 32284090, "step": 1524, "time_per_iteration": 2.754271984100342 }, { "auxiliary_loss_clip": 0.0130396, "auxiliary_loss_mlp": 0.01201526, "balance_loss_clip": 1.01057196, "balance_loss_mlp": 1.001827, "epoch": 0.18337040822461373, "flos": 23329198465920.0, "grad_norm": 1.69375248953615, "language_loss": 0.84475023, "learning_rate": 3.7584203784340865e-06, "loss": 0.8698051, "num_input_tokens_seen": 32303260, "step": 1525, "time_per_iteration": 2.856062889099121 }, { "auxiliary_loss_clip": 0.01355153, "auxiliary_loss_mlp": 0.01200622, "balance_loss_clip": 1.01200676, "balance_loss_mlp": 1.00149488, "epoch": 0.1834906511152528, "flos": 25009567604640.0, "grad_norm": 1.964691227010593, "language_loss": 0.86106873, "learning_rate": 3.7580491164191938e-06, "loss": 0.88662648, "num_input_tokens_seen": 32321570, "step": 1526, "time_per_iteration": 2.7634434700012207 }, { "auxiliary_loss_clip": 0.01358674, "auxiliary_loss_mlp": 0.0119705, "balance_loss_clip": 1.0086422, "balance_loss_mlp": 1.00021195, "epoch": 0.1836108940058919, "flos": 67251524978880.0, "grad_norm": 0.7476292880545282, "language_loss": 0.61213601, "learning_rate": 3.757677587709648e-06, "loss": 0.63769329, "num_input_tokens_seen": 32384835, "step": 1527, "time_per_iteration": 3.3617424964904785 }, { "auxiliary_loss_clip": 0.01317299, "auxiliary_loss_mlp": 0.01200939, "balance_loss_clip": 1.01103854, "balance_loss_mlp": 1.0012399, "epoch": 0.183731136896531, "flos": 25738733744640.0, "grad_norm": 1.9017230823595206, "language_loss": 0.75653946, "learning_rate": 3.7573057923618095e-06, "loss": 0.78172183, "num_input_tokens_seen": 32404930, "step": 1528, "time_per_iteration": 2.929544687271118 }, { "auxiliary_loss_clip": 0.01320992, "auxiliary_loss_mlp": 0.01200703, "balance_loss_clip": 1.0106622, "balance_loss_mlp": 1.00138462, "epoch": 0.1838513797871701, "flos": 20449414316160.0, "grad_norm": 1.7715278849312712, "language_loss": 0.74236512, "learning_rate": 3.7569337304320793e-06, "loss": 0.767582, "num_input_tokens_seen": 32424515, "step": 1529, "time_per_iteration": 2.7837657928466797 }, { "auxiliary_loss_clip": 0.01337505, "auxiliary_loss_mlp": 0.01196913, "balance_loss_clip": 1.00849366, "balance_loss_mlp": 1.00007439, "epoch": 0.18397162267780917, "flos": 68565173588640.0, "grad_norm": 0.8469494250193802, "language_loss": 0.64502972, "learning_rate": 3.756561401976899e-06, "loss": 0.67037398, "num_input_tokens_seen": 32484220, "step": 1530, "time_per_iteration": 3.2085108757019043 }, { "auxiliary_loss_clip": 0.01392315, "auxiliary_loss_mlp": 0.01201105, "balance_loss_clip": 1.01247394, "balance_loss_mlp": 1.00159633, "epoch": 0.18409186556844825, "flos": 31941118432800.0, "grad_norm": 1.7805744864290118, "language_loss": 0.825957, "learning_rate": 3.7561888070527514e-06, "loss": 0.85189122, "num_input_tokens_seen": 32506260, "step": 1531, "time_per_iteration": 3.649850606918335 }, { "auxiliary_loss_clip": 0.01329195, "auxiliary_loss_mlp": 0.00873477, "balance_loss_clip": 1.01065791, "balance_loss_mlp": 1.00039792, "epoch": 0.18421210845908736, "flos": 20120544136800.0, "grad_norm": 2.022003598227063, "language_loss": 0.79996854, "learning_rate": 3.7558159457161577e-06, "loss": 0.82199526, "num_input_tokens_seen": 32524225, "step": 1532, "time_per_iteration": 4.583298206329346 }, { "auxiliary_loss_clip": 0.01355418, "auxiliary_loss_mlp": 0.00873485, "balance_loss_clip": 1.01186347, "balance_loss_mlp": 1.00044012, "epoch": 0.18433235134972645, "flos": 23110502509440.0, "grad_norm": 2.7078112954688245, "language_loss": 0.78311694, "learning_rate": 3.755442818023681e-06, "loss": 0.80540597, "num_input_tokens_seen": 32543850, "step": 1533, "time_per_iteration": 3.677558422088623 }, { "auxiliary_loss_clip": 0.0133026, "auxiliary_loss_mlp": 0.01201442, "balance_loss_clip": 1.01097465, "balance_loss_mlp": 1.0019331, "epoch": 0.18445259424036553, "flos": 18291360104640.0, "grad_norm": 2.061789398316136, "language_loss": 0.76157045, "learning_rate": 3.7550694240319246e-06, "loss": 0.78688753, "num_input_tokens_seen": 32561725, "step": 1534, "time_per_iteration": 2.764585494995117 }, { "auxiliary_loss_clip": 0.01376253, "auxiliary_loss_mlp": 0.01200585, "balance_loss_clip": 1.01192403, "balance_loss_mlp": 1.00145733, "epoch": 0.18457283713100464, "flos": 21324090371040.0, "grad_norm": 4.636266351491047, "language_loss": 0.7612884, "learning_rate": 3.7546957637975326e-06, "loss": 0.7870568, "num_input_tokens_seen": 32579135, "step": 1535, "time_per_iteration": 2.7578160762786865 }, { "auxiliary_loss_clip": 0.01318151, "auxiliary_loss_mlp": 0.01199769, "balance_loss_clip": 1.01062727, "balance_loss_mlp": 1.00121379, "epoch": 0.18469308002164372, "flos": 20375689419360.0, "grad_norm": 1.4851215117589447, "language_loss": 0.74239206, "learning_rate": 3.7543218373771873e-06, "loss": 0.76757127, "num_input_tokens_seen": 32598460, "step": 1536, "time_per_iteration": 2.83676815032959 }, { "auxiliary_loss_clip": 0.01305927, "auxiliary_loss_mlp": 0.00873502, "balance_loss_clip": 1.01032126, "balance_loss_mlp": 1.00040615, "epoch": 0.1848133229122828, "flos": 26435905094880.0, "grad_norm": 1.54799407882126, "language_loss": 0.78130925, "learning_rate": 3.753947644827615e-06, "loss": 0.80310351, "num_input_tokens_seen": 32621920, "step": 1537, "time_per_iteration": 2.8975586891174316 }, { "auxiliary_loss_clip": 0.01340995, "auxiliary_loss_mlp": 0.01196857, "balance_loss_clip": 1.00819397, "balance_loss_mlp": 1.00001907, "epoch": 0.1849335658029219, "flos": 70547483355840.0, "grad_norm": 0.9431481839680556, "language_loss": 0.57193929, "learning_rate": 3.753573186205579e-06, "loss": 0.59731781, "num_input_tokens_seen": 32690040, "step": 1538, "time_per_iteration": 3.4605870246887207 }, { "auxiliary_loss_clip": 0.01367916, "auxiliary_loss_mlp": 0.00873449, "balance_loss_clip": 1.01208651, "balance_loss_mlp": 1.00034916, "epoch": 0.185053808693561, "flos": 17384150404800.0, "grad_norm": 3.9061613441103633, "language_loss": 0.7809577, "learning_rate": 3.753198461567885e-06, "loss": 0.80337137, "num_input_tokens_seen": 32707285, "step": 1539, "time_per_iteration": 2.683645009994507 }, { "auxiliary_loss_clip": 0.01316801, "auxiliary_loss_mlp": 0.01199808, "balance_loss_clip": 1.01101398, "balance_loss_mlp": 1.00144362, "epoch": 0.18517405158420008, "flos": 28986172439040.0, "grad_norm": 1.8010720471727089, "language_loss": 0.92030424, "learning_rate": 3.7528234709713783e-06, "loss": 0.94547033, "num_input_tokens_seen": 32730030, "step": 1540, "time_per_iteration": 2.8461148738861084 }, { "auxiliary_loss_clip": 0.01376195, "auxiliary_loss_mlp": 0.01200762, "balance_loss_clip": 1.01242852, "balance_loss_mlp": 1.0014441, "epoch": 0.18529429447483917, "flos": 26794973880000.0, "grad_norm": 2.0645582056074656, "language_loss": 0.8448149, "learning_rate": 3.7524482144729447e-06, "loss": 0.87058443, "num_input_tokens_seen": 32749485, "step": 1541, "time_per_iteration": 2.7238829135894775 }, { "auxiliary_loss_clip": 0.01355457, "auxiliary_loss_mlp": 0.0120083, "balance_loss_clip": 1.01252961, "balance_loss_mlp": 1.00132143, "epoch": 0.18541453736547828, "flos": 13581601601760.0, "grad_norm": 3.588912649925367, "language_loss": 0.83584559, "learning_rate": 3.7520726921295106e-06, "loss": 0.86140847, "num_input_tokens_seen": 32766205, "step": 1542, "time_per_iteration": 2.787541389465332 }, { "auxiliary_loss_clip": 0.01379125, "auxiliary_loss_mlp": 0.01200433, "balance_loss_clip": 1.01213789, "balance_loss_mlp": 1.00149632, "epoch": 0.18553478025611736, "flos": 24025435800480.0, "grad_norm": 2.5945110688773525, "language_loss": 0.72226584, "learning_rate": 3.751696903998042e-06, "loss": 0.74806142, "num_input_tokens_seen": 32784840, "step": 1543, "time_per_iteration": 2.7548258304595947 }, { "auxiliary_loss_clip": 0.01365021, "auxiliary_loss_mlp": 0.01200826, "balance_loss_clip": 1.01144147, "balance_loss_mlp": 1.00131762, "epoch": 0.18565502314675644, "flos": 25885177675200.0, "grad_norm": 1.5564412175220517, "language_loss": 0.69869292, "learning_rate": 3.7513208501355456e-06, "loss": 0.72435141, "num_input_tokens_seen": 32805945, "step": 1544, "time_per_iteration": 2.6907451152801514 }, { "auxiliary_loss_clip": 0.01353455, "auxiliary_loss_mlp": 0.01200151, "balance_loss_clip": 1.01175642, "balance_loss_mlp": 1.001405, "epoch": 0.18577526603739553, "flos": 19610073953280.0, "grad_norm": 1.9296364528991246, "language_loss": 0.83672833, "learning_rate": 3.750944530599069e-06, "loss": 0.86226451, "num_input_tokens_seen": 32825515, "step": 1545, "time_per_iteration": 2.771368980407715 }, { "auxiliary_loss_clip": 0.01376851, "auxiliary_loss_mlp": 0.01201054, "balance_loss_clip": 1.01275671, "balance_loss_mlp": 1.00154483, "epoch": 0.18589550892803464, "flos": 18474899987520.0, "grad_norm": 2.178275636317797, "language_loss": 0.80622333, "learning_rate": 3.7505679454456992e-06, "loss": 0.8320024, "num_input_tokens_seen": 32842125, "step": 1546, "time_per_iteration": 2.693984031677246 }, { "auxiliary_loss_clip": 0.01278394, "auxiliary_loss_mlp": 0.01200582, "balance_loss_clip": 1.0100131, "balance_loss_mlp": 1.0010736, "epoch": 0.18601575181867372, "flos": 23549977995840.0, "grad_norm": 2.0345955017997532, "language_loss": 0.69770253, "learning_rate": 3.750191094732564e-06, "loss": 0.72249234, "num_input_tokens_seen": 32862990, "step": 1547, "time_per_iteration": 3.005155324935913 }, { "auxiliary_loss_clip": 0.01289412, "auxiliary_loss_mlp": 0.00873427, "balance_loss_clip": 1.01035404, "balance_loss_mlp": 1.00033998, "epoch": 0.1861359947093128, "flos": 26360204395680.0, "grad_norm": 3.8679751944562772, "language_loss": 0.75208211, "learning_rate": 3.7498139785168313e-06, "loss": 0.77371049, "num_input_tokens_seen": 32883595, "step": 1548, "time_per_iteration": 3.0429131984710693 }, { "auxiliary_loss_clip": 0.01365692, "auxiliary_loss_mlp": 0.01200342, "balance_loss_clip": 1.01156783, "balance_loss_mlp": 1.00121498, "epoch": 0.1862562375999519, "flos": 23331210192000.0, "grad_norm": 1.6207113382954212, "language_loss": 0.77073354, "learning_rate": 3.749436596855709e-06, "loss": 0.79639387, "num_input_tokens_seen": 32902895, "step": 1549, "time_per_iteration": 2.7298636436462402 }, { "auxiliary_loss_clip": 0.0137902, "auxiliary_loss_mlp": 0.01200636, "balance_loss_clip": 1.01202798, "balance_loss_mlp": 1.00150871, "epoch": 0.186376480490591, "flos": 16648230612960.0, "grad_norm": 2.3962654104210417, "language_loss": 0.90879548, "learning_rate": 3.749058949806446e-06, "loss": 0.93459201, "num_input_tokens_seen": 32919620, "step": 1550, "time_per_iteration": 2.697033166885376 }, { "auxiliary_loss_clip": 0.01370031, "auxiliary_loss_mlp": 0.01200479, "balance_loss_clip": 1.01145911, "balance_loss_mlp": 1.00135159, "epoch": 0.18649672338123008, "flos": 21468666270240.0, "grad_norm": 1.6222562738754565, "language_loss": 0.84408122, "learning_rate": 3.748681037426331e-06, "loss": 0.86978626, "num_input_tokens_seen": 32938830, "step": 1551, "time_per_iteration": 2.733222484588623 }, { "auxiliary_loss_clip": 0.01392248, "auxiliary_loss_mlp": 0.01201242, "balance_loss_clip": 1.01294303, "balance_loss_mlp": 1.00154257, "epoch": 0.1866169662718692, "flos": 12312736611840.0, "grad_norm": 2.081345555028197, "language_loss": 0.91518241, "learning_rate": 3.7483028597726936e-06, "loss": 0.94111735, "num_input_tokens_seen": 32955600, "step": 1552, "time_per_iteration": 2.6891961097717285 }, { "auxiliary_loss_clip": 0.01318692, "auxiliary_loss_mlp": 0.01200905, "balance_loss_clip": 1.01015997, "balance_loss_mlp": 1.00120509, "epoch": 0.18673720916250827, "flos": 23581290235680.0, "grad_norm": 2.227996065843076, "language_loss": 0.62285781, "learning_rate": 3.7479244169029017e-06, "loss": 0.64805377, "num_input_tokens_seen": 32975390, "step": 1553, "time_per_iteration": 2.7609076499938965 }, { "auxiliary_loss_clip": 0.01379161, "auxiliary_loss_mlp": 0.01200585, "balance_loss_clip": 1.01189685, "balance_loss_mlp": 1.00126684, "epoch": 0.18685745205314735, "flos": 19718380144800.0, "grad_norm": 2.5034281182466054, "language_loss": 0.73656076, "learning_rate": 3.7475457088743658e-06, "loss": 0.76235825, "num_input_tokens_seen": 32992640, "step": 1554, "time_per_iteration": 2.7185006141662598 }, { "auxiliary_loss_clip": 0.01353481, "auxiliary_loss_mlp": 0.01200753, "balance_loss_clip": 1.01167655, "balance_loss_mlp": 1.00143516, "epoch": 0.18697769494378644, "flos": 34204137933600.0, "grad_norm": 1.9892908889971763, "language_loss": 0.74678564, "learning_rate": 3.7471667357445348e-06, "loss": 0.77232796, "num_input_tokens_seen": 33012470, "step": 1555, "time_per_iteration": 2.8747398853302 }, { "auxiliary_loss_clip": 0.01281155, "auxiliary_loss_mlp": 0.01201546, "balance_loss_clip": 1.00959384, "balance_loss_mlp": 1.00203729, "epoch": 0.18709793783442555, "flos": 34241341656960.0, "grad_norm": 2.096847963202218, "language_loss": 0.7254377, "learning_rate": 3.7467874975709e-06, "loss": 0.75026464, "num_input_tokens_seen": 33033275, "step": 1556, "time_per_iteration": 2.9606385231018066 }, { "auxiliary_loss_clip": 0.01379151, "auxiliary_loss_mlp": 0.01200816, "balance_loss_clip": 1.01257443, "balance_loss_mlp": 1.00149798, "epoch": 0.18721818072506463, "flos": 40734566556480.0, "grad_norm": 3.7687792737321026, "language_loss": 0.78428674, "learning_rate": 3.7464079944109904e-06, "loss": 0.81008643, "num_input_tokens_seen": 33055135, "step": 1557, "time_per_iteration": 3.8131279945373535 }, { "auxiliary_loss_clip": 0.01342645, "auxiliary_loss_mlp": 0.01200585, "balance_loss_clip": 1.01240993, "balance_loss_mlp": 1.00107646, "epoch": 0.18733842361570371, "flos": 22157395555680.0, "grad_norm": 2.2786493249660693, "language_loss": 0.77801639, "learning_rate": 3.746028226322376e-06, "loss": 0.80344874, "num_input_tokens_seen": 33071015, "step": 1558, "time_per_iteration": 3.784454107284546 }, { "auxiliary_loss_clip": 0.01355503, "auxiliary_loss_mlp": 0.0120033, "balance_loss_clip": 1.01155293, "balance_loss_mlp": 1.00139308, "epoch": 0.18745866650634282, "flos": 18914950252800.0, "grad_norm": 3.2484126603633987, "language_loss": 0.75364393, "learning_rate": 3.745648193362669e-06, "loss": 0.77920228, "num_input_tokens_seen": 33090370, "step": 1559, "time_per_iteration": 3.6786892414093018 }, { "auxiliary_loss_clip": 0.01355094, "auxiliary_loss_mlp": 0.01200057, "balance_loss_clip": 1.01212907, "balance_loss_mlp": 1.0013113, "epoch": 0.1875789093969819, "flos": 19314635510880.0, "grad_norm": 13.220934355687461, "language_loss": 0.72669637, "learning_rate": 3.745267895589518e-06, "loss": 0.75224793, "num_input_tokens_seen": 33108910, "step": 1560, "time_per_iteration": 2.7478537559509277 }, { "auxiliary_loss_clip": 0.01344864, "auxiliary_loss_mlp": 0.01200888, "balance_loss_clip": 1.01067948, "balance_loss_mlp": 1.00118923, "epoch": 0.187699152287621, "flos": 17018974594080.0, "grad_norm": 1.9134315980693752, "language_loss": 0.81982636, "learning_rate": 3.7448873330606154e-06, "loss": 0.84528387, "num_input_tokens_seen": 33126680, "step": 1561, "time_per_iteration": 2.663027286529541 }, { "auxiliary_loss_clip": 0.01327991, "auxiliary_loss_mlp": 0.01200403, "balance_loss_clip": 1.01077998, "balance_loss_mlp": 1.00165725, "epoch": 0.18781939517826007, "flos": 22346395837920.0, "grad_norm": 2.2162030228631058, "language_loss": 0.87352109, "learning_rate": 3.7445065058336914e-06, "loss": 0.89880502, "num_input_tokens_seen": 33145550, "step": 1562, "time_per_iteration": 2.7775235176086426 }, { "auxiliary_loss_clip": 0.0132832, "auxiliary_loss_mlp": 0.01200533, "balance_loss_clip": 1.01169801, "balance_loss_mlp": 1.00121462, "epoch": 0.18793963806889918, "flos": 14611486965120.0, "grad_norm": 4.928968729372051, "language_loss": 0.86605167, "learning_rate": 3.7441254139665176e-06, "loss": 0.8913402, "num_input_tokens_seen": 33161735, "step": 1563, "time_per_iteration": 2.75006365776062 }, { "auxiliary_loss_clip": 0.01391732, "auxiliary_loss_mlp": 0.01200954, "balance_loss_clip": 1.01291287, "balance_loss_mlp": 1.00163603, "epoch": 0.18805988095953827, "flos": 17457084980640.0, "grad_norm": 1.7867766524051474, "language_loss": 0.83149606, "learning_rate": 3.743744057516905e-06, "loss": 0.85742295, "num_input_tokens_seen": 33179795, "step": 1564, "time_per_iteration": 2.675140142440796 }, { "auxiliary_loss_clip": 0.01315082, "auxiliary_loss_mlp": 0.01200877, "balance_loss_clip": 1.01075983, "balance_loss_mlp": 1.00136876, "epoch": 0.18818012385017735, "flos": 15043885486560.0, "grad_norm": 2.6111894653172265, "language_loss": 0.87262088, "learning_rate": 3.743362436542706e-06, "loss": 0.89778054, "num_input_tokens_seen": 33194485, "step": 1565, "time_per_iteration": 2.729652166366577 }, { "auxiliary_loss_clip": 0.01390895, "auxiliary_loss_mlp": 0.01200585, "balance_loss_clip": 1.01202154, "balance_loss_mlp": 1.00126719, "epoch": 0.18830036674081646, "flos": 47551991556960.0, "grad_norm": 1.833563130951063, "language_loss": 0.76952231, "learning_rate": 3.7429805511018115e-06, "loss": 0.7954371, "num_input_tokens_seen": 33216145, "step": 1566, "time_per_iteration": 2.848886013031006 }, { "auxiliary_loss_clip": 0.0132827, "auxiliary_loss_mlp": 0.00873455, "balance_loss_clip": 1.01135254, "balance_loss_mlp": 1.00024629, "epoch": 0.18842060963145554, "flos": 30044639842560.0, "grad_norm": 2.0486295262245298, "language_loss": 0.78156853, "learning_rate": 3.7425984012521524e-06, "loss": 0.80358589, "num_input_tokens_seen": 33236345, "step": 1567, "time_per_iteration": 2.773702383041382 }, { "auxiliary_loss_clip": 0.01315743, "auxiliary_loss_mlp": 0.00872898, "balance_loss_clip": 1.0089426, "balance_loss_mlp": 1.00026655, "epoch": 0.18854085252209463, "flos": 70318405455840.0, "grad_norm": 0.7375921581614913, "language_loss": 0.60413861, "learning_rate": 3.7422159870517025e-06, "loss": 0.62602502, "num_input_tokens_seen": 33301600, "step": 1568, "time_per_iteration": 3.3321304321289062 }, { "auxiliary_loss_clip": 0.01357512, "auxiliary_loss_mlp": 0.01200931, "balance_loss_clip": 1.01148832, "balance_loss_mlp": 1.00161302, "epoch": 0.1886610954127337, "flos": 21289329457920.0, "grad_norm": 1.5397410959852973, "language_loss": 0.79153156, "learning_rate": 3.7418333085584717e-06, "loss": 0.81711602, "num_input_tokens_seen": 33322785, "step": 1569, "time_per_iteration": 2.798156499862671 }, { "auxiliary_loss_clip": 0.01329819, "auxiliary_loss_mlp": 0.01200628, "balance_loss_clip": 1.0115242, "balance_loss_mlp": 1.00131023, "epoch": 0.18878133830337282, "flos": 17266827369600.0, "grad_norm": 2.0986368321155684, "language_loss": 0.90967304, "learning_rate": 3.7414503658305128e-06, "loss": 0.93497753, "num_input_tokens_seen": 33340020, "step": 1570, "time_per_iteration": 2.7138800621032715 }, { "auxiliary_loss_clip": 0.01324598, "auxiliary_loss_mlp": 0.01200559, "balance_loss_clip": 1.01059747, "balance_loss_mlp": 1.00124145, "epoch": 0.1889015811940119, "flos": 25775218994400.0, "grad_norm": 2.912868809271119, "language_loss": 0.77617878, "learning_rate": 3.7410671589259185e-06, "loss": 0.80143034, "num_input_tokens_seen": 33358620, "step": 1571, "time_per_iteration": 2.840848684310913 }, { "auxiliary_loss_clip": 0.0139248, "auxiliary_loss_mlp": 0.01200476, "balance_loss_clip": 1.0129106, "balance_loss_mlp": 1.00115788, "epoch": 0.18902182408465099, "flos": 21032208372960.0, "grad_norm": 2.27820655548039, "language_loss": 0.79844427, "learning_rate": 3.7406836879028205e-06, "loss": 0.82437378, "num_input_tokens_seen": 33378845, "step": 1572, "time_per_iteration": 2.660871744155884 }, { "auxiliary_loss_clip": 0.0136712, "auxiliary_loss_mlp": 0.0120038, "balance_loss_clip": 1.01199365, "balance_loss_mlp": 1.00144339, "epoch": 0.1891420669752901, "flos": 22272132085920.0, "grad_norm": 2.129487153377808, "language_loss": 0.76564676, "learning_rate": 3.7402999528193907e-06, "loss": 0.79132175, "num_input_tokens_seen": 33398345, "step": 1573, "time_per_iteration": 2.7644762992858887 }, { "auxiliary_loss_clip": 0.01327429, "auxiliary_loss_mlp": 0.00873392, "balance_loss_clip": 1.01055455, "balance_loss_mlp": 1.00017917, "epoch": 0.18926230986592918, "flos": 22017813048000.0, "grad_norm": 2.966132099302536, "language_loss": 0.85377395, "learning_rate": 3.739915953733842e-06, "loss": 0.87578219, "num_input_tokens_seen": 33416390, "step": 1574, "time_per_iteration": 2.7617247104644775 }, { "auxiliary_loss_clip": 0.01391068, "auxiliary_loss_mlp": 0.01200218, "balance_loss_clip": 1.01232159, "balance_loss_mlp": 1.00128174, "epoch": 0.18938255275656826, "flos": 24462684018720.0, "grad_norm": 1.5421836788698107, "language_loss": 0.81631041, "learning_rate": 3.7395316907044264e-06, "loss": 0.84222329, "num_input_tokens_seen": 33437175, "step": 1575, "time_per_iteration": 2.720950126647949 }, { "auxiliary_loss_clip": 0.01366508, "auxiliary_loss_mlp": 0.01199952, "balance_loss_clip": 1.01174331, "balance_loss_mlp": 1.00101531, "epoch": 0.18950279564720737, "flos": 24427060937280.0, "grad_norm": 1.7111052777281768, "language_loss": 0.79554015, "learning_rate": 3.7391471637894364e-06, "loss": 0.82120472, "num_input_tokens_seen": 33459440, "step": 1576, "time_per_iteration": 2.7288684844970703 }, { "auxiliary_loss_clip": 0.01341647, "auxiliary_loss_mlp": 0.01200684, "balance_loss_clip": 1.01174617, "balance_loss_mlp": 1.00136626, "epoch": 0.18962303853784646, "flos": 19756302341760.0, "grad_norm": 1.8766302073166419, "language_loss": 0.84729296, "learning_rate": 3.738762373047205e-06, "loss": 0.87271631, "num_input_tokens_seen": 33479360, "step": 1577, "time_per_iteration": 2.755039930343628 }, { "auxiliary_loss_clip": 0.01336745, "auxiliary_loss_mlp": 0.01200872, "balance_loss_clip": 1.01107883, "balance_loss_mlp": 1.00155449, "epoch": 0.18974328142848554, "flos": 21032064678240.0, "grad_norm": 1.5950450648496455, "language_loss": 0.83240402, "learning_rate": 3.738377318536103e-06, "loss": 0.85778022, "num_input_tokens_seen": 33499245, "step": 1578, "time_per_iteration": 2.7848541736602783 }, { "auxiliary_loss_clip": 0.01390129, "auxiliary_loss_mlp": 0.01199735, "balance_loss_clip": 1.01220345, "balance_loss_mlp": 1.00137079, "epoch": 0.18986352431912462, "flos": 12966130205280.0, "grad_norm": 2.0954548347271102, "language_loss": 0.71954381, "learning_rate": 3.7379920003145447e-06, "loss": 0.74544239, "num_input_tokens_seen": 33513520, "step": 1579, "time_per_iteration": 2.607973575592041 }, { "auxiliary_loss_clip": 0.01356088, "auxiliary_loss_mlp": 0.01200468, "balance_loss_clip": 1.01223707, "balance_loss_mlp": 1.00153112, "epoch": 0.18998376720976373, "flos": 23767919555040.0, "grad_norm": 1.6545110900028575, "language_loss": 0.83637613, "learning_rate": 3.7376064184409817e-06, "loss": 0.8619417, "num_input_tokens_seen": 33533100, "step": 1580, "time_per_iteration": 2.7735469341278076 }, { "auxiliary_loss_clip": 0.0135578, "auxiliary_loss_mlp": 0.01200479, "balance_loss_clip": 1.01235867, "balance_loss_mlp": 1.00135207, "epoch": 0.19010401010040281, "flos": 22966034381280.0, "grad_norm": 1.4499143524714724, "language_loss": 0.86928928, "learning_rate": 3.7372205729739063e-06, "loss": 0.8948518, "num_input_tokens_seen": 33554915, "step": 1581, "time_per_iteration": 2.7546887397766113 }, { "auxiliary_loss_clip": 0.01379553, "auxiliary_loss_mlp": 0.01200079, "balance_loss_clip": 1.01285791, "balance_loss_mlp": 1.00114202, "epoch": 0.1902242529910419, "flos": 19135657935360.0, "grad_norm": 2.920537785149895, "language_loss": 0.71935534, "learning_rate": 3.7368344639718514e-06, "loss": 0.7451517, "num_input_tokens_seen": 33572850, "step": 1582, "time_per_iteration": 2.7386744022369385 }, { "auxiliary_loss_clip": 0.01378768, "auxiliary_loss_mlp": 0.01200552, "balance_loss_clip": 1.01202512, "balance_loss_mlp": 1.00161505, "epoch": 0.190344495881681, "flos": 25483947698880.0, "grad_norm": 1.6464871671702397, "language_loss": 0.80289716, "learning_rate": 3.7364480914933895e-06, "loss": 0.82869035, "num_input_tokens_seen": 33593090, "step": 1583, "time_per_iteration": 4.560103893280029 }, { "auxiliary_loss_clip": 0.01302349, "auxiliary_loss_mlp": 0.00873382, "balance_loss_clip": 1.01069641, "balance_loss_mlp": 1.00018787, "epoch": 0.1904647387723201, "flos": 26792854382880.0, "grad_norm": 1.884162448143911, "language_loss": 0.81285661, "learning_rate": 3.7360614555971325e-06, "loss": 0.83461392, "num_input_tokens_seen": 33612745, "step": 1584, "time_per_iteration": 3.7429587841033936 }, { "auxiliary_loss_clip": 0.01366599, "auxiliary_loss_mlp": 0.00873387, "balance_loss_clip": 1.01171279, "balance_loss_mlp": 1.00020826, "epoch": 0.19058498166295917, "flos": 23987765069280.0, "grad_norm": 1.8119617572732973, "language_loss": 0.85065067, "learning_rate": 3.735674556341733e-06, "loss": 0.87305057, "num_input_tokens_seen": 33632360, "step": 1585, "time_per_iteration": 3.5972707271575928 }, { "auxiliary_loss_clip": 0.01340546, "auxiliary_loss_mlp": 0.01200073, "balance_loss_clip": 1.0112201, "balance_loss_mlp": 1.00113702, "epoch": 0.19070522455359826, "flos": 28293311930400.0, "grad_norm": 2.194868620861168, "language_loss": 0.82837564, "learning_rate": 3.7352873937858835e-06, "loss": 0.85378188, "num_input_tokens_seen": 33653895, "step": 1586, "time_per_iteration": 2.78855037689209 }, { "auxiliary_loss_clip": 0.01329099, "auxiliary_loss_mlp": 0.00873419, "balance_loss_clip": 1.01132023, "balance_loss_mlp": 1.00021327, "epoch": 0.19082546744423737, "flos": 25660230998400.0, "grad_norm": 1.8392862475277163, "language_loss": 0.71516067, "learning_rate": 3.734899967988316e-06, "loss": 0.73718584, "num_input_tokens_seen": 33672075, "step": 1587, "time_per_iteration": 2.7577855587005615 }, { "auxiliary_loss_clip": 0.01339288, "auxiliary_loss_mlp": 0.01200786, "balance_loss_clip": 1.01104665, "balance_loss_mlp": 1.00165892, "epoch": 0.19094571033487645, "flos": 19719493778880.0, "grad_norm": 1.7868300901294065, "language_loss": 0.83792913, "learning_rate": 3.7345122790078026e-06, "loss": 0.86332983, "num_input_tokens_seen": 33689640, "step": 1588, "time_per_iteration": 2.791254758834839 }, { "auxiliary_loss_clip": 0.01365279, "auxiliary_loss_mlp": 0.01200811, "balance_loss_clip": 1.01151395, "balance_loss_mlp": 1.00149274, "epoch": 0.19106595322551553, "flos": 21616331605920.0, "grad_norm": 2.6030720079711296, "language_loss": 0.92896533, "learning_rate": 3.7341243269031556e-06, "loss": 0.95462632, "num_input_tokens_seen": 33708630, "step": 1589, "time_per_iteration": 2.6940829753875732 }, { "auxiliary_loss_clip": 0.01354147, "auxiliary_loss_mlp": 0.01200173, "balance_loss_clip": 1.01213217, "balance_loss_mlp": 1.00142717, "epoch": 0.19118619611615464, "flos": 29896902659520.0, "grad_norm": 1.5714829322956958, "language_loss": 0.77323472, "learning_rate": 3.7337361117332275e-06, "loss": 0.79877794, "num_input_tokens_seen": 33730370, "step": 1590, "time_per_iteration": 2.9204256534576416 }, { "auxiliary_loss_clip": 0.01340399, "auxiliary_loss_mlp": 0.01200747, "balance_loss_clip": 1.01089334, "balance_loss_mlp": 1.00161934, "epoch": 0.19130643900679373, "flos": 17273437326720.0, "grad_norm": 1.908217531083368, "language_loss": 0.77160692, "learning_rate": 3.7333476335569087e-06, "loss": 0.79701841, "num_input_tokens_seen": 33748370, "step": 1591, "time_per_iteration": 2.7979190349578857 }, { "auxiliary_loss_clip": 0.01353585, "auxiliary_loss_mlp": 0.01200559, "balance_loss_clip": 1.01181018, "balance_loss_mlp": 1.00143206, "epoch": 0.1914266818974328, "flos": 24826351034880.0, "grad_norm": 2.2855521785889104, "language_loss": 0.67070472, "learning_rate": 3.7329588924331325e-06, "loss": 0.69624615, "num_input_tokens_seen": 33769575, "step": 1592, "time_per_iteration": 2.778458833694458 }, { "auxiliary_loss_clip": 0.01341324, "auxiliary_loss_mlp": 0.01200287, "balance_loss_clip": 1.01166117, "balance_loss_mlp": 1.00135088, "epoch": 0.1915469247880719, "flos": 18952477289280.0, "grad_norm": 1.8429206072882276, "language_loss": 0.82507616, "learning_rate": 3.732569888420871e-06, "loss": 0.8504923, "num_input_tokens_seen": 33789110, "step": 1593, "time_per_iteration": 2.8328282833099365 }, { "auxiliary_loss_clip": 0.01390772, "auxiliary_loss_mlp": 0.01200973, "balance_loss_clip": 1.01210499, "balance_loss_mlp": 1.00146437, "epoch": 0.191667167678711, "flos": 21032962770240.0, "grad_norm": 2.0566290588527987, "language_loss": 0.82327938, "learning_rate": 3.732180621579134e-06, "loss": 0.84919679, "num_input_tokens_seen": 33808325, "step": 1594, "time_per_iteration": 2.685377836227417 }, { "auxiliary_loss_clip": 0.01342231, "auxiliary_loss_mlp": 0.01201187, "balance_loss_clip": 1.01198733, "balance_loss_mlp": 1.0018692, "epoch": 0.1917874105693501, "flos": 34237677441600.0, "grad_norm": 1.879088711434243, "language_loss": 0.81337637, "learning_rate": 3.7317910919669745e-06, "loss": 0.83881056, "num_input_tokens_seen": 33829520, "step": 1595, "time_per_iteration": 2.8905773162841797 }, { "auxiliary_loss_clip": 0.0136552, "auxiliary_loss_mlp": 0.01200612, "balance_loss_clip": 1.01225364, "balance_loss_mlp": 1.00129414, "epoch": 0.19190765345998917, "flos": 23550624622080.0, "grad_norm": 6.127615851166354, "language_loss": 0.76423961, "learning_rate": 3.7314012996434826e-06, "loss": 0.78990096, "num_input_tokens_seen": 33848250, "step": 1596, "time_per_iteration": 2.6870946884155273 }, { "auxiliary_loss_clip": 0.01352764, "auxiliary_loss_mlp": 0.0120039, "balance_loss_clip": 1.01217139, "balance_loss_mlp": 1.0012629, "epoch": 0.19202789635062828, "flos": 19861339478400.0, "grad_norm": 2.04660760325488, "language_loss": 0.80802977, "learning_rate": 3.7310112446677907e-06, "loss": 0.8335613, "num_input_tokens_seen": 33866160, "step": 1597, "time_per_iteration": 2.714186429977417 }, { "auxiliary_loss_clip": 0.01391384, "auxiliary_loss_mlp": 0.01200922, "balance_loss_clip": 1.01314569, "balance_loss_mlp": 1.00141323, "epoch": 0.19214813924126736, "flos": 20922968165760.0, "grad_norm": 2.155804426594054, "language_loss": 0.68712544, "learning_rate": 3.7306209270990695e-06, "loss": 0.71304846, "num_input_tokens_seen": 33884165, "step": 1598, "time_per_iteration": 2.656404972076416 }, { "auxiliary_loss_clip": 0.01350066, "auxiliary_loss_mlp": 0.01200858, "balance_loss_clip": 1.01115322, "balance_loss_mlp": 1.00134957, "epoch": 0.19226838213190645, "flos": 26359737387840.0, "grad_norm": 1.9219942206042355, "language_loss": 0.86428177, "learning_rate": 3.7302303469965292e-06, "loss": 0.88979095, "num_input_tokens_seen": 33903705, "step": 1599, "time_per_iteration": 2.7480716705322266 }, { "auxiliary_loss_clip": 0.01365736, "auxiliary_loss_mlp": 0.01200884, "balance_loss_clip": 1.0120523, "balance_loss_mlp": 1.00175643, "epoch": 0.19238862502254553, "flos": 20850536521440.0, "grad_norm": 1.9482089480872198, "language_loss": 0.70998514, "learning_rate": 3.7298395044194206e-06, "loss": 0.73565131, "num_input_tokens_seen": 33922515, "step": 1600, "time_per_iteration": 2.7112386226654053 }, { "auxiliary_loss_clip": 0.01390628, "auxiliary_loss_mlp": 0.01200578, "balance_loss_clip": 1.01283336, "balance_loss_mlp": 1.00145042, "epoch": 0.19250886791318464, "flos": 21726074744640.0, "grad_norm": 3.516781917894686, "language_loss": 0.94316804, "learning_rate": 3.7294483994270356e-06, "loss": 0.96908009, "num_input_tokens_seen": 33940840, "step": 1601, "time_per_iteration": 2.625812530517578 }, { "auxiliary_loss_clip": 0.01303285, "auxiliary_loss_mlp": 0.01200462, "balance_loss_clip": 1.00994015, "balance_loss_mlp": 1.00171602, "epoch": 0.19262911080382372, "flos": 23367839136480.0, "grad_norm": 2.1335347024755364, "language_loss": 0.77961761, "learning_rate": 3.7290570320787033e-06, "loss": 0.80465508, "num_input_tokens_seen": 33960420, "step": 1602, "time_per_iteration": 2.8137221336364746 }, { "auxiliary_loss_clip": 0.01365786, "auxiliary_loss_mlp": 0.01200411, "balance_loss_clip": 1.0124445, "balance_loss_mlp": 1.00147486, "epoch": 0.1927493536944628, "flos": 21943513372320.0, "grad_norm": 1.8563326875170698, "language_loss": 0.7106272, "learning_rate": 3.728665402433793e-06, "loss": 0.7362892, "num_input_tokens_seen": 33978990, "step": 1603, "time_per_iteration": 2.688084125518799 }, { "auxiliary_loss_clip": 0.01352726, "auxiliary_loss_mlp": 0.01200753, "balance_loss_clip": 1.01257801, "balance_loss_mlp": 1.00162601, "epoch": 0.19286959658510192, "flos": 16545600362880.0, "grad_norm": 2.3482548326556754, "language_loss": 0.86028564, "learning_rate": 3.7282735105517164e-06, "loss": 0.88582039, "num_input_tokens_seen": 33997115, "step": 1604, "time_per_iteration": 2.7760231494903564 }, { "auxiliary_loss_clip": 0.01320469, "auxiliary_loss_mlp": 0.01201132, "balance_loss_clip": 1.01080656, "balance_loss_mlp": 1.00181448, "epoch": 0.192989839475741, "flos": 21616978232160.0, "grad_norm": 1.8343820298437068, "language_loss": 0.67271519, "learning_rate": 3.727881356491922e-06, "loss": 0.69793123, "num_input_tokens_seen": 34015525, "step": 1605, "time_per_iteration": 2.7801437377929688 }, { "auxiliary_loss_clip": 0.01390631, "auxiliary_loss_mlp": 0.01200715, "balance_loss_clip": 1.01277316, "balance_loss_mlp": 1.00177872, "epoch": 0.19311008236638008, "flos": 19281527087040.0, "grad_norm": 2.5045586007719156, "language_loss": 0.75271964, "learning_rate": 3.7274889403139002e-06, "loss": 0.77863312, "num_input_tokens_seen": 34033150, "step": 1606, "time_per_iteration": 2.754033088684082 }, { "auxiliary_loss_clip": 0.01292912, "auxiliary_loss_mlp": 0.01200194, "balance_loss_clip": 1.00928354, "balance_loss_mlp": 1.00144768, "epoch": 0.1932303252570192, "flos": 28652380715520.0, "grad_norm": 2.065904541664042, "language_loss": 0.78294384, "learning_rate": 3.727096262077179e-06, "loss": 0.80787486, "num_input_tokens_seen": 34052145, "step": 1607, "time_per_iteration": 2.8467025756835938 }, { "auxiliary_loss_clip": 0.01366981, "auxiliary_loss_mlp": 0.01199774, "balance_loss_clip": 1.01157522, "balance_loss_mlp": 1.0010283, "epoch": 0.19335056814765827, "flos": 18369000682560.0, "grad_norm": 1.7910658331439688, "language_loss": 0.85323429, "learning_rate": 3.7267033218413285e-06, "loss": 0.8789019, "num_input_tokens_seen": 34069940, "step": 1608, "time_per_iteration": 2.7247536182403564 }, { "auxiliary_loss_clip": 0.01310923, "auxiliary_loss_mlp": 0.01201147, "balance_loss_clip": 1.01023841, "balance_loss_mlp": 1.00182939, "epoch": 0.19347081103829736, "flos": 13260886097760.0, "grad_norm": 2.5463460792056654, "language_loss": 0.81181669, "learning_rate": 3.726310119665957e-06, "loss": 0.83693737, "num_input_tokens_seen": 34086275, "step": 1609, "time_per_iteration": 4.586663484573364 }, { "auxiliary_loss_clip": 0.01366089, "auxiliary_loss_mlp": 0.01200861, "balance_loss_clip": 1.01196587, "balance_loss_mlp": 1.00135207, "epoch": 0.19359105392893644, "flos": 20300132414880.0, "grad_norm": 1.7722022267075017, "language_loss": 0.85325313, "learning_rate": 3.725916655610713e-06, "loss": 0.87892264, "num_input_tokens_seen": 34105605, "step": 1610, "time_per_iteration": 3.6450507640838623 }, { "auxiliary_loss_clip": 0.01351519, "auxiliary_loss_mlp": 0.01200859, "balance_loss_clip": 1.01114058, "balance_loss_mlp": 1.00154102, "epoch": 0.19371129681957555, "flos": 20484606313440.0, "grad_norm": 2.2495245341403125, "language_loss": 0.75713664, "learning_rate": 3.725522929735284e-06, "loss": 0.78266042, "num_input_tokens_seen": 34122540, "step": 1611, "time_per_iteration": 3.5688278675079346 }, { "auxiliary_loss_clip": 0.01365698, "auxiliary_loss_mlp": 0.01200764, "balance_loss_clip": 1.01191497, "balance_loss_mlp": 1.00163651, "epoch": 0.19383153971021463, "flos": 30445510582080.0, "grad_norm": 2.053168985336404, "language_loss": 0.74328929, "learning_rate": 3.725128942099399e-06, "loss": 0.76895392, "num_input_tokens_seen": 34142940, "step": 1612, "time_per_iteration": 2.809877872467041 }, { "auxiliary_loss_clip": 0.01356003, "auxiliary_loss_mlp": 0.01200396, "balance_loss_clip": 1.0111959, "balance_loss_mlp": 1.00107861, "epoch": 0.19395178260085372, "flos": 24569948423520.0, "grad_norm": 1.6185624896556858, "language_loss": 0.80077893, "learning_rate": 3.7247346927628245e-06, "loss": 0.82634294, "num_input_tokens_seen": 34162875, "step": 1613, "time_per_iteration": 2.755859851837158 }, { "auxiliary_loss_clip": 0.01339394, "auxiliary_loss_mlp": 0.0087332, "balance_loss_clip": 1.01091766, "balance_loss_mlp": 1.00014973, "epoch": 0.19407202549149283, "flos": 28950621204960.0, "grad_norm": 1.8417793162048506, "language_loss": 0.79270339, "learning_rate": 3.7243401817853694e-06, "loss": 0.81483054, "num_input_tokens_seen": 34183565, "step": 1614, "time_per_iteration": 2.7674248218536377 }, { "auxiliary_loss_clip": 0.01377084, "auxiliary_loss_mlp": 0.0120006, "balance_loss_clip": 1.01187515, "balance_loss_mlp": 1.00131452, "epoch": 0.1941922683821319, "flos": 18004507421760.0, "grad_norm": 1.9696724854828431, "language_loss": 0.72137606, "learning_rate": 3.723945409226879e-06, "loss": 0.74714756, "num_input_tokens_seen": 34202055, "step": 1615, "time_per_iteration": 2.7199273109436035 }, { "auxiliary_loss_clip": 0.01378017, "auxiliary_loss_mlp": 0.0120044, "balance_loss_clip": 1.01221275, "balance_loss_mlp": 1.00150347, "epoch": 0.194312511272771, "flos": 9720344000160.0, "grad_norm": 2.2224562614672037, "language_loss": 0.79703921, "learning_rate": 3.723550375147241e-06, "loss": 0.82282376, "num_input_tokens_seen": 34216830, "step": 1616, "time_per_iteration": 2.705573320388794 }, { "auxiliary_loss_clip": 0.01338923, "auxiliary_loss_mlp": 0.01199606, "balance_loss_clip": 1.01119578, "balance_loss_mlp": 1.00105047, "epoch": 0.19443275416341008, "flos": 27016220417760.0, "grad_norm": 1.6682588355471657, "language_loss": 0.79964942, "learning_rate": 3.7231550796063816e-06, "loss": 0.82503468, "num_input_tokens_seen": 34236840, "step": 1617, "time_per_iteration": 2.855468273162842 }, { "auxiliary_loss_clip": 0.0134923, "auxiliary_loss_mlp": 0.01201087, "balance_loss_clip": 1.01083481, "balance_loss_mlp": 1.00157857, "epoch": 0.1945529970540492, "flos": 15846632828640.0, "grad_norm": 1.8878747337453525, "language_loss": 0.64873976, "learning_rate": 3.722759522664266e-06, "loss": 0.67424291, "num_input_tokens_seen": 34254140, "step": 1618, "time_per_iteration": 2.6868319511413574 }, { "auxiliary_loss_clip": 0.01313798, "auxiliary_loss_mlp": 0.01200083, "balance_loss_clip": 1.01067328, "balance_loss_mlp": 1.0009563, "epoch": 0.19467323994468827, "flos": 19314994747680.0, "grad_norm": 1.960508814692113, "language_loss": 0.81649619, "learning_rate": 3.7223637043809016e-06, "loss": 0.84163505, "num_input_tokens_seen": 34273120, "step": 1619, "time_per_iteration": 2.803391695022583 }, { "auxiliary_loss_clip": 0.0132681, "auxiliary_loss_mlp": 0.01200394, "balance_loss_clip": 1.01085162, "balance_loss_mlp": 1.00145745, "epoch": 0.19479348283532735, "flos": 24133239060480.0, "grad_norm": 5.426741794902186, "language_loss": 0.86428487, "learning_rate": 3.7219676248163322e-06, "loss": 0.88955688, "num_input_tokens_seen": 34290285, "step": 1620, "time_per_iteration": 2.767240047454834 }, { "auxiliary_loss_clip": 0.0136963, "auxiliary_loss_mlp": 0.01200464, "balance_loss_clip": 1.01197588, "balance_loss_mlp": 1.00133657, "epoch": 0.19491372572596646, "flos": 25775650078560.0, "grad_norm": 1.7262293619913973, "language_loss": 0.9341737, "learning_rate": 3.721571284030643e-06, "loss": 0.95987463, "num_input_tokens_seen": 34310095, "step": 1621, "time_per_iteration": 2.8091819286346436 }, { "auxiliary_loss_clip": 0.0137375, "auxiliary_loss_mlp": 0.01200089, "balance_loss_clip": 1.01171315, "balance_loss_mlp": 1.00134301, "epoch": 0.19503396861660555, "flos": 19645230026880.0, "grad_norm": 2.461961999594414, "language_loss": 0.79096872, "learning_rate": 3.7211746820839587e-06, "loss": 0.81670713, "num_input_tokens_seen": 34327190, "step": 1622, "time_per_iteration": 2.670743227005005 }, { "auxiliary_loss_clip": 0.01271024, "auxiliary_loss_mlp": 0.01199603, "balance_loss_clip": 1.00940323, "balance_loss_mlp": 1.00104845, "epoch": 0.19515421150724463, "flos": 21033034617600.0, "grad_norm": 1.6045460266648461, "language_loss": 0.80914223, "learning_rate": 3.7207778190364437e-06, "loss": 0.83384854, "num_input_tokens_seen": 34345615, "step": 1623, "time_per_iteration": 2.9393463134765625 }, { "auxiliary_loss_clip": 0.0130266, "auxiliary_loss_mlp": 0.01199953, "balance_loss_clip": 1.01039481, "balance_loss_mlp": 1.00139761, "epoch": 0.1952744543978837, "flos": 32961268478880.0, "grad_norm": 1.7452607921081054, "language_loss": 0.74206418, "learning_rate": 3.720380694948302e-06, "loss": 0.76709032, "num_input_tokens_seen": 34368500, "step": 1624, "time_per_iteration": 3.139427423477173 }, { "auxiliary_loss_clip": 0.0131374, "auxiliary_loss_mlp": 0.01197121, "balance_loss_clip": 1.01159215, "balance_loss_mlp": 1.00028288, "epoch": 0.19539469728852282, "flos": 64044343520640.0, "grad_norm": 1.034863923067736, "language_loss": 0.7124992, "learning_rate": 3.719983309879777e-06, "loss": 0.73760784, "num_input_tokens_seen": 34428280, "step": 1625, "time_per_iteration": 3.3254501819610596 }, { "auxiliary_loss_clip": 0.01340803, "auxiliary_loss_mlp": 0.01200408, "balance_loss_clip": 1.0120728, "balance_loss_mlp": 1.001472, "epoch": 0.1955149401791619, "flos": 13370916625920.0, "grad_norm": 1.789516743270783, "language_loss": 0.7745229, "learning_rate": 3.719585663891151e-06, "loss": 0.79993498, "num_input_tokens_seen": 34445815, "step": 1626, "time_per_iteration": 2.7947440147399902 }, { "auxiliary_loss_clip": 0.01308705, "auxiliary_loss_mlp": 0.01200012, "balance_loss_clip": 1.01077294, "balance_loss_mlp": 1.00107574, "epoch": 0.195635183069801, "flos": 18728895712320.0, "grad_norm": 3.160881286345082, "language_loss": 0.78924125, "learning_rate": 3.719187757042747e-06, "loss": 0.81432837, "num_input_tokens_seen": 34463635, "step": 1627, "time_per_iteration": 2.7510058879852295 }, { "auxiliary_loss_clip": 0.01335274, "auxiliary_loss_mlp": 0.01197, "balance_loss_clip": 1.01140594, "balance_loss_mlp": 1.00016165, "epoch": 0.1957554259604401, "flos": 69313989981600.0, "grad_norm": 0.7233018219152998, "language_loss": 0.5494616, "learning_rate": 3.7187895893949275e-06, "loss": 0.57478434, "num_input_tokens_seen": 34530105, "step": 1628, "time_per_iteration": 3.3659329414367676 }, { "auxiliary_loss_clip": 0.01315353, "auxiliary_loss_mlp": 0.0119998, "balance_loss_clip": 1.01063919, "balance_loss_mlp": 1.0010438, "epoch": 0.19587566885107918, "flos": 21069268401600.0, "grad_norm": 3.2361629532777547, "language_loss": 0.76134378, "learning_rate": 3.7183911610080937e-06, "loss": 0.78649712, "num_input_tokens_seen": 34546970, "step": 1629, "time_per_iteration": 2.865273952484131 }, { "auxiliary_loss_clip": 0.01341545, "auxiliary_loss_mlp": 0.01200989, "balance_loss_clip": 1.01201057, "balance_loss_mlp": 1.00148034, "epoch": 0.19599591174171827, "flos": 22194671126400.0, "grad_norm": 3.0704930492840927, "language_loss": 0.75957668, "learning_rate": 3.7179924719426872e-06, "loss": 0.78500205, "num_input_tokens_seen": 34564865, "step": 1630, "time_per_iteration": 2.741806983947754 }, { "auxiliary_loss_clip": 0.01365563, "auxiliary_loss_mlp": 0.01200585, "balance_loss_clip": 1.01197982, "balance_loss_mlp": 1.00145793, "epoch": 0.19611615463235738, "flos": 23768386562880.0, "grad_norm": 2.4538352188619466, "language_loss": 0.75928706, "learning_rate": 3.7175935222591885e-06, "loss": 0.78494859, "num_input_tokens_seen": 34584165, "step": 1631, "time_per_iteration": 2.762312650680542 }, { "auxiliary_loss_clip": 0.01342875, "auxiliary_loss_mlp": 0.01200626, "balance_loss_clip": 1.01119208, "balance_loss_mlp": 1.0014987, "epoch": 0.19623639752299646, "flos": 28618230504960.0, "grad_norm": 3.055791923965045, "language_loss": 0.74076676, "learning_rate": 3.717194312018118e-06, "loss": 0.76620179, "num_input_tokens_seen": 34603150, "step": 1632, "time_per_iteration": 2.7907497882843018 }, { "auxiliary_loss_clip": 0.01366624, "auxiliary_loss_mlp": 0.01200543, "balance_loss_clip": 1.01181722, "balance_loss_mlp": 1.00141561, "epoch": 0.19635664041363554, "flos": 21032711304480.0, "grad_norm": 1.8342065747030398, "language_loss": 0.75978887, "learning_rate": 3.716794841280036e-06, "loss": 0.78546053, "num_input_tokens_seen": 34621855, "step": 1633, "time_per_iteration": 2.7636947631835938 }, { "auxiliary_loss_clip": 0.01378926, "auxiliary_loss_mlp": 0.01200712, "balance_loss_clip": 1.01256692, "balance_loss_mlp": 1.00158477, "epoch": 0.19647688330427462, "flos": 18879758255520.0, "grad_norm": 2.409712418489749, "language_loss": 0.77642512, "learning_rate": 3.7163951101055407e-06, "loss": 0.80222148, "num_input_tokens_seen": 34639915, "step": 1634, "time_per_iteration": 3.9542653560638428 }, { "auxiliary_loss_clip": 0.01354048, "auxiliary_loss_mlp": 0.01200179, "balance_loss_clip": 1.01147556, "balance_loss_mlp": 1.00124288, "epoch": 0.19659712619491373, "flos": 24242515191360.0, "grad_norm": 1.8718184356988072, "language_loss": 0.79123586, "learning_rate": 3.715995118555273e-06, "loss": 0.81677818, "num_input_tokens_seen": 34659890, "step": 1635, "time_per_iteration": 2.8524301052093506 }, { "auxiliary_loss_clip": 0.01322211, "auxiliary_loss_mlp": 0.01200081, "balance_loss_clip": 1.01104653, "balance_loss_mlp": 1.00114441, "epoch": 0.19671736908555282, "flos": 24717434140800.0, "grad_norm": 1.982822004778642, "language_loss": 0.85986626, "learning_rate": 3.71559486668991e-06, "loss": 0.88508922, "num_input_tokens_seen": 34678750, "step": 1636, "time_per_iteration": 4.733439683914185 }, { "auxiliary_loss_clip": 0.01378146, "auxiliary_loss_mlp": 0.00873288, "balance_loss_clip": 1.01274192, "balance_loss_mlp": 1.00014377, "epoch": 0.1968376119761919, "flos": 23842291078080.0, "grad_norm": 1.5776317145957086, "language_loss": 0.77607763, "learning_rate": 3.715194354570169e-06, "loss": 0.79859203, "num_input_tokens_seen": 34698755, "step": 1637, "time_per_iteration": 2.7621827125549316 }, { "auxiliary_loss_clip": 0.01365102, "auxiliary_loss_mlp": 0.01199878, "balance_loss_clip": 1.01171541, "balance_loss_mlp": 1.00113201, "epoch": 0.196957854866831, "flos": 18113927247360.0, "grad_norm": 1.8578366074531767, "language_loss": 0.83298117, "learning_rate": 3.714793582256809e-06, "loss": 0.85863096, "num_input_tokens_seen": 34715820, "step": 1638, "time_per_iteration": 3.5444698333740234 }, { "auxiliary_loss_clip": 0.0139017, "auxiliary_loss_mlp": 0.01199772, "balance_loss_clip": 1.01270437, "balance_loss_mlp": 1.00121748, "epoch": 0.1970780977574701, "flos": 21653140168800.0, "grad_norm": 2.370407804282917, "language_loss": 0.85308081, "learning_rate": 3.7143925498106253e-06, "loss": 0.87898028, "num_input_tokens_seen": 34734360, "step": 1639, "time_per_iteration": 2.705587148666382 }, { "auxiliary_loss_clip": 0.01366107, "auxiliary_loss_mlp": 0.01200876, "balance_loss_clip": 1.01186597, "balance_loss_mlp": 1.00174904, "epoch": 0.19719834064810918, "flos": 20811824003520.0, "grad_norm": 1.7843939049353912, "language_loss": 0.79337859, "learning_rate": 3.7139912572924558e-06, "loss": 0.8190484, "num_input_tokens_seen": 34753390, "step": 1640, "time_per_iteration": 2.748168706893921 }, { "auxiliary_loss_clip": 0.01377177, "auxiliary_loss_mlp": 0.01200788, "balance_loss_clip": 1.01189542, "balance_loss_mlp": 1.00127888, "epoch": 0.19731858353874826, "flos": 23434810381440.0, "grad_norm": 2.6397182365468885, "language_loss": 0.80812794, "learning_rate": 3.7135897047631744e-06, "loss": 0.8339076, "num_input_tokens_seen": 34771275, "step": 1641, "time_per_iteration": 2.979870080947876 }, { "auxiliary_loss_clip": 0.01345975, "auxiliary_loss_mlp": 0.01200312, "balance_loss_clip": 1.01066828, "balance_loss_mlp": 1.00137568, "epoch": 0.19743882642938737, "flos": 23988196153440.0, "grad_norm": 2.4106545447095713, "language_loss": 0.76178002, "learning_rate": 3.713187892283698e-06, "loss": 0.78724289, "num_input_tokens_seen": 34790885, "step": 1642, "time_per_iteration": 2.8006789684295654 }, { "auxiliary_loss_clip": 0.01315749, "auxiliary_loss_mlp": 0.01200323, "balance_loss_clip": 1.01047206, "balance_loss_mlp": 1.00119615, "epoch": 0.19755906932002645, "flos": 15004346724000.0, "grad_norm": 2.022887577069056, "language_loss": 0.87307036, "learning_rate": 3.71278581991498e-06, "loss": 0.89823115, "num_input_tokens_seen": 34806745, "step": 1643, "time_per_iteration": 2.7629876136779785 }, { "auxiliary_loss_clip": 0.01320117, "auxiliary_loss_mlp": 0.00873285, "balance_loss_clip": 1.01123548, "balance_loss_mlp": 1.00020909, "epoch": 0.19767931221066554, "flos": 19494475254720.0, "grad_norm": 4.047149125849116, "language_loss": 0.78788161, "learning_rate": 3.712383487718015e-06, "loss": 0.80981565, "num_input_tokens_seen": 34824985, "step": 1644, "time_per_iteration": 2.8540749549865723 }, { "auxiliary_loss_clip": 0.01293241, "auxiliary_loss_mlp": 0.0120031, "balance_loss_clip": 1.00999522, "balance_loss_mlp": 1.001755, "epoch": 0.19779955510130465, "flos": 25737907500000.0, "grad_norm": 2.0671912274749067, "language_loss": 0.86585534, "learning_rate": 3.7119808957538365e-06, "loss": 0.89079082, "num_input_tokens_seen": 34843980, "step": 1645, "time_per_iteration": 2.9023516178131104 }, { "auxiliary_loss_clip": 0.013668, "auxiliary_loss_mlp": 0.01200477, "balance_loss_clip": 1.01242054, "balance_loss_mlp": 1.00134945, "epoch": 0.19791979799194373, "flos": 20777709716640.0, "grad_norm": 1.9693179018809424, "language_loss": 0.8016662, "learning_rate": 3.711578044083517e-06, "loss": 0.82733899, "num_input_tokens_seen": 34860780, "step": 1646, "time_per_iteration": 2.757550001144409 }, { "auxiliary_loss_clip": 0.01352843, "auxiliary_loss_mlp": 0.01199994, "balance_loss_clip": 1.01142645, "balance_loss_mlp": 1.00143945, "epoch": 0.1980400408825828, "flos": 25589020759200.0, "grad_norm": 1.8282174412134293, "language_loss": 0.74879766, "learning_rate": 3.7111749327681698e-06, "loss": 0.77432597, "num_input_tokens_seen": 34880815, "step": 1647, "time_per_iteration": 2.791891098022461 }, { "auxiliary_loss_clip": 0.0136919, "auxiliary_loss_mlp": 0.01199852, "balance_loss_clip": 1.01199603, "balance_loss_mlp": 1.00091577, "epoch": 0.1981602837732219, "flos": 23513851982880.0, "grad_norm": 2.0890914187224863, "language_loss": 0.86554682, "learning_rate": 3.7107715618689455e-06, "loss": 0.89123726, "num_input_tokens_seen": 34899790, "step": 1648, "time_per_iteration": 2.7543466091156006 }, { "auxiliary_loss_clip": 0.0136289, "auxiliary_loss_mlp": 0.01200302, "balance_loss_clip": 1.01129353, "balance_loss_mlp": 1.00117457, "epoch": 0.198280526663861, "flos": 23185376964000.0, "grad_norm": 1.4427503947269926, "language_loss": 0.83639109, "learning_rate": 3.710367931447035e-06, "loss": 0.862023, "num_input_tokens_seen": 34921570, "step": 1649, "time_per_iteration": 2.740628957748413 }, { "auxiliary_loss_clip": 0.01377888, "auxiliary_loss_mlp": 0.01200651, "balance_loss_clip": 1.01241946, "balance_loss_mlp": 1.00152361, "epoch": 0.1984007695545001, "flos": 21689481723840.0, "grad_norm": 2.5718477005765967, "language_loss": 0.86808646, "learning_rate": 3.70996404156367e-06, "loss": 0.8938719, "num_input_tokens_seen": 34941205, "step": 1650, "time_per_iteration": 2.7380144596099854 }, { "auxiliary_loss_clip": 0.01317433, "auxiliary_loss_mlp": 0.01199793, "balance_loss_clip": 1.01087379, "balance_loss_mlp": 1.00123811, "epoch": 0.19852101244513917, "flos": 36064023503040.0, "grad_norm": 1.7854895770625008, "language_loss": 0.72743881, "learning_rate": 3.7095598922801187e-06, "loss": 0.7526111, "num_input_tokens_seen": 34963280, "step": 1651, "time_per_iteration": 2.8865699768066406 }, { "auxiliary_loss_clip": 0.01389731, "auxiliary_loss_mlp": 0.0120027, "balance_loss_clip": 1.012079, "balance_loss_mlp": 1.00152421, "epoch": 0.19864125533577828, "flos": 23105904278400.0, "grad_norm": 2.846024273886077, "language_loss": 0.76014298, "learning_rate": 3.7091554836576914e-06, "loss": 0.78604299, "num_input_tokens_seen": 34979955, "step": 1652, "time_per_iteration": 2.620821237564087 }, { "auxiliary_loss_clip": 0.01364525, "auxiliary_loss_mlp": 0.00873207, "balance_loss_clip": 1.01224232, "balance_loss_mlp": 1.00017118, "epoch": 0.19876149822641737, "flos": 24608517246720.0, "grad_norm": 1.85746205799233, "language_loss": 0.82951111, "learning_rate": 3.708750815757736e-06, "loss": 0.85188842, "num_input_tokens_seen": 35000725, "step": 1653, "time_per_iteration": 2.714822769165039 }, { "auxiliary_loss_clip": 0.01364358, "auxiliary_loss_mlp": 0.0120017, "balance_loss_clip": 1.01166761, "balance_loss_mlp": 1.00123382, "epoch": 0.19888174111705645, "flos": 32196658875840.0, "grad_norm": 2.9468129297554007, "language_loss": 0.73208666, "learning_rate": 3.7083458886416407e-06, "loss": 0.75773203, "num_input_tokens_seen": 35019920, "step": 1654, "time_per_iteration": 2.769667863845825 }, { "auxiliary_loss_clip": 0.01282402, "auxiliary_loss_mlp": 0.01199937, "balance_loss_clip": 1.00948095, "balance_loss_mlp": 1.00119114, "epoch": 0.19900198400769553, "flos": 24608481323040.0, "grad_norm": 1.8699597147881444, "language_loss": 0.88118583, "learning_rate": 3.707940702370832e-06, "loss": 0.9060092, "num_input_tokens_seen": 35040765, "step": 1655, "time_per_iteration": 2.861628770828247 }, { "auxiliary_loss_clip": 0.01346118, "auxiliary_loss_mlp": 0.01196909, "balance_loss_clip": 1.00775146, "balance_loss_mlp": 1.00007081, "epoch": 0.19912222689833464, "flos": 67915839371040.0, "grad_norm": 0.7673804903373975, "language_loss": 0.58254135, "learning_rate": 3.707535257006777e-06, "loss": 0.60797155, "num_input_tokens_seen": 35106390, "step": 1656, "time_per_iteration": 3.338402509689331 }, { "auxiliary_loss_clip": 0.01351652, "auxiliary_loss_mlp": 0.01200178, "balance_loss_clip": 1.01195335, "balance_loss_mlp": 1.00124192, "epoch": 0.19924246978897373, "flos": 15742350089280.0, "grad_norm": 2.6240718034630706, "language_loss": 0.88236916, "learning_rate": 3.707129552610981e-06, "loss": 0.9078874, "num_input_tokens_seen": 35125040, "step": 1657, "time_per_iteration": 2.733928918838501 }, { "auxiliary_loss_clip": 0.01352829, "auxiliary_loss_mlp": 0.01199849, "balance_loss_clip": 1.01239955, "balance_loss_mlp": 1.00091267, "epoch": 0.1993627126796128, "flos": 17566576653600.0, "grad_norm": 1.826389527511032, "language_loss": 0.73911309, "learning_rate": 3.70672358924499e-06, "loss": 0.76463985, "num_input_tokens_seen": 35144280, "step": 1658, "time_per_iteration": 2.724133014678955 }, { "auxiliary_loss_clip": 0.01317705, "auxiliary_loss_mlp": 0.01200477, "balance_loss_clip": 1.01040936, "balance_loss_mlp": 1.00134945, "epoch": 0.19948295557025192, "flos": 40843842687360.0, "grad_norm": 1.8722041393496753, "language_loss": 0.7845487, "learning_rate": 3.706317366970386e-06, "loss": 0.80973047, "num_input_tokens_seen": 35165280, "step": 1659, "time_per_iteration": 2.945774793624878 }, { "auxiliary_loss_clip": 0.01389748, "auxiliary_loss_mlp": 0.0087328, "balance_loss_clip": 1.01187551, "balance_loss_mlp": 1.00016618, "epoch": 0.199603198460891, "flos": 25082430333120.0, "grad_norm": 2.094271007966585, "language_loss": 0.83584297, "learning_rate": 3.705910885848795e-06, "loss": 0.85847324, "num_input_tokens_seen": 35183655, "step": 1660, "time_per_iteration": 3.6962239742279053 }, { "auxiliary_loss_clip": 0.01365054, "auxiliary_loss_mlp": 0.01199676, "balance_loss_clip": 1.01162481, "balance_loss_mlp": 1.00093031, "epoch": 0.19972344135153008, "flos": 20084130734400.0, "grad_norm": 2.146954151649213, "language_loss": 0.84409565, "learning_rate": 3.705504145941879e-06, "loss": 0.86974299, "num_input_tokens_seen": 35201825, "step": 1661, "time_per_iteration": 2.657060146331787 }, { "auxiliary_loss_clip": 0.01389447, "auxiliary_loss_mlp": 0.0120008, "balance_loss_clip": 1.01212764, "balance_loss_mlp": 1.00114322, "epoch": 0.1998436842421692, "flos": 23727482700480.0, "grad_norm": 1.8044882775779372, "language_loss": 0.78869927, "learning_rate": 3.7050971473113403e-06, "loss": 0.81459457, "num_input_tokens_seen": 35221600, "step": 1662, "time_per_iteration": 4.614501476287842 }, { "auxiliary_loss_clip": 0.01377628, "auxiliary_loss_mlp": 0.00873174, "balance_loss_clip": 1.01203251, "balance_loss_mlp": 1.00016832, "epoch": 0.19996392713280828, "flos": 36102376784160.0, "grad_norm": 1.6489465584621528, "language_loss": 0.79930031, "learning_rate": 3.7046898900189196e-06, "loss": 0.82180834, "num_input_tokens_seen": 35245935, "step": 1663, "time_per_iteration": 3.78901743888855 }, { "auxiliary_loss_clip": 0.01327323, "auxiliary_loss_mlp": 0.01199883, "balance_loss_clip": 1.01136982, "balance_loss_mlp": 1.00113761, "epoch": 0.20008417002344736, "flos": 23657673484800.0, "grad_norm": 1.5662256842339712, "language_loss": 0.82695854, "learning_rate": 3.704282374126398e-06, "loss": 0.85223061, "num_input_tokens_seen": 35265615, "step": 1664, "time_per_iteration": 2.862769365310669 }, { "auxiliary_loss_clip": 0.01335846, "auxiliary_loss_mlp": 0.01200801, "balance_loss_clip": 1.01123834, "balance_loss_mlp": 1.00129259, "epoch": 0.20020441291408644, "flos": 21872087591040.0, "grad_norm": 1.8043778476711732, "language_loss": 0.87650758, "learning_rate": 3.7038745996955954e-06, "loss": 0.90187407, "num_input_tokens_seen": 35284960, "step": 1665, "time_per_iteration": 302.97244596481323 }, { "auxiliary_loss_clip": 0.01341466, "auxiliary_loss_mlp": 0.01200922, "balance_loss_clip": 1.012604, "balance_loss_mlp": 1.00141323, "epoch": 0.20032465580472555, "flos": 23179701022560.0, "grad_norm": 5.208399576950813, "language_loss": 0.7194351, "learning_rate": 3.703466566788371e-06, "loss": 0.74485898, "num_input_tokens_seen": 35304090, "step": 1666, "time_per_iteration": 3.0048623085021973 }, { "auxiliary_loss_clip": 0.01352279, "auxiliary_loss_mlp": 0.01200201, "balance_loss_clip": 1.01235163, "balance_loss_mlp": 1.00126481, "epoch": 0.20044489869536464, "flos": 23873531470560.0, "grad_norm": 1.9987793018226456, "language_loss": 0.74402189, "learning_rate": 3.703058275466622e-06, "loss": 0.76954669, "num_input_tokens_seen": 35323325, "step": 1667, "time_per_iteration": 2.9121384620666504 }, { "auxiliary_loss_clip": 0.01348599, "auxiliary_loss_mlp": 0.01199879, "balance_loss_clip": 1.0116508, "balance_loss_mlp": 1.00113368, "epoch": 0.20056514158600372, "flos": 21945237708960.0, "grad_norm": 1.8002901666408797, "language_loss": 0.77634901, "learning_rate": 3.7026497257922877e-06, "loss": 0.80183381, "num_input_tokens_seen": 35343635, "step": 1668, "time_per_iteration": 2.879005193710327 }, { "auxiliary_loss_clip": 0.01317391, "auxiliary_loss_mlp": 0.01200099, "balance_loss_clip": 1.01057959, "balance_loss_mlp": 1.00116277, "epoch": 0.20068538447664283, "flos": 23879171488320.0, "grad_norm": 2.118457403558475, "language_loss": 0.85315388, "learning_rate": 3.7022409178273436e-06, "loss": 0.8783288, "num_input_tokens_seen": 35364615, "step": 1669, "time_per_iteration": 2.980567455291748 }, { "auxiliary_loss_clip": 0.01363854, "auxiliary_loss_mlp": 0.01199964, "balance_loss_clip": 1.01180756, "balance_loss_mlp": 1.00083709, "epoch": 0.2008056273672819, "flos": 18442833350400.0, "grad_norm": 1.7582151541614417, "language_loss": 0.78430676, "learning_rate": 3.7018318516338054e-06, "loss": 0.80994499, "num_input_tokens_seen": 35383775, "step": 1670, "time_per_iteration": 2.7078447341918945 }, { "auxiliary_loss_clip": 0.01373549, "auxiliary_loss_mlp": 0.01199877, "balance_loss_clip": 1.01158357, "balance_loss_mlp": 1.00132203, "epoch": 0.200925870257921, "flos": 23659541516160.0, "grad_norm": 3.46933626598271, "language_loss": 0.81161261, "learning_rate": 3.7014225272737284e-06, "loss": 0.83734691, "num_input_tokens_seen": 35403000, "step": 1671, "time_per_iteration": 2.8076677322387695 }, { "auxiliary_loss_clip": 0.01377497, "auxiliary_loss_mlp": 0.01199898, "balance_loss_clip": 1.01202464, "balance_loss_mlp": 1.00115275, "epoch": 0.20104611314856008, "flos": 16217125344000.0, "grad_norm": 2.5609951133406685, "language_loss": 0.73823804, "learning_rate": 3.701012944809207e-06, "loss": 0.76401198, "num_input_tokens_seen": 35420115, "step": 1672, "time_per_iteration": 2.8626930713653564 }, { "auxiliary_loss_clip": 0.01343022, "auxiliary_loss_mlp": 0.00873211, "balance_loss_clip": 1.0109849, "balance_loss_mlp": 1.00010777, "epoch": 0.2011663560391992, "flos": 21397384183680.0, "grad_norm": 2.093146026582285, "language_loss": 0.79166543, "learning_rate": 3.700603104302374e-06, "loss": 0.81382775, "num_input_tokens_seen": 35439925, "step": 1673, "time_per_iteration": 2.843520402908325 }, { "auxiliary_loss_clip": 0.01296603, "auxiliary_loss_mlp": 0.01196893, "balance_loss_clip": 1.00919247, "balance_loss_mlp": 1.00005436, "epoch": 0.20128659892983827, "flos": 62229493036800.0, "grad_norm": 0.8948361958581379, "language_loss": 0.55927342, "learning_rate": 3.7001930058154027e-06, "loss": 0.58420837, "num_input_tokens_seen": 35504885, "step": 1674, "time_per_iteration": 3.4694323539733887 }, { "auxiliary_loss_clip": 0.01328221, "auxiliary_loss_mlp": 0.01200795, "balance_loss_clip": 1.01125145, "balance_loss_mlp": 1.00147676, "epoch": 0.20140684182047736, "flos": 28438749997920.0, "grad_norm": 3.3605404689697425, "language_loss": 0.79878485, "learning_rate": 3.6997826494105037e-06, "loss": 0.82407498, "num_input_tokens_seen": 35525330, "step": 1675, "time_per_iteration": 2.926342725753784 }, { "auxiliary_loss_clip": 0.01344965, "auxiliary_loss_mlp": 0.01200606, "balance_loss_clip": 1.01119661, "balance_loss_mlp": 1.00147939, "epoch": 0.20152708471111647, "flos": 28074077118720.0, "grad_norm": 2.1439939109443076, "language_loss": 0.68894559, "learning_rate": 3.6993720351499286e-06, "loss": 0.71440136, "num_input_tokens_seen": 35546455, "step": 1676, "time_per_iteration": 2.8886702060699463 }, { "auxiliary_loss_clip": 0.01337707, "auxiliary_loss_mlp": 0.01199624, "balance_loss_clip": 1.01129293, "balance_loss_mlp": 1.00125945, "epoch": 0.20164732760175555, "flos": 23549762453760.0, "grad_norm": 2.061972712454077, "language_loss": 0.77036309, "learning_rate": 3.6989611630959666e-06, "loss": 0.79573643, "num_input_tokens_seen": 35565010, "step": 1677, "time_per_iteration": 2.966515302658081 }, { "auxiliary_loss_clip": 0.01349705, "auxiliary_loss_mlp": 0.01196968, "balance_loss_clip": 1.01028371, "balance_loss_mlp": 1.0001297, "epoch": 0.20176757049239463, "flos": 71100653585760.0, "grad_norm": 0.6815385746077406, "language_loss": 0.58271492, "learning_rate": 3.6985500333109474e-06, "loss": 0.60818166, "num_input_tokens_seen": 35633340, "step": 1678, "time_per_iteration": 3.422569513320923 }, { "auxiliary_loss_clip": 0.01339944, "auxiliary_loss_mlp": 0.01199857, "balance_loss_clip": 1.01175249, "balance_loss_mlp": 1.00130236, "epoch": 0.20188781338303372, "flos": 21430169294400.0, "grad_norm": 2.3110402758341007, "language_loss": 0.76499176, "learning_rate": 3.6981386458572385e-06, "loss": 0.79038978, "num_input_tokens_seen": 35651315, "step": 1679, "time_per_iteration": 2.9420883655548096 }, { "auxiliary_loss_clip": 0.01331113, "auxiliary_loss_mlp": 0.01201443, "balance_loss_clip": 1.01055348, "balance_loss_mlp": 1.00212491, "epoch": 0.20200805627367283, "flos": 11546223053760.0, "grad_norm": 2.255453472276198, "language_loss": 0.76081741, "learning_rate": 3.6977270007972468e-06, "loss": 0.78614295, "num_input_tokens_seen": 35668850, "step": 1680, "time_per_iteration": 2.8910634517669678 }, { "auxiliary_loss_clip": 0.01350149, "auxiliary_loss_mlp": 0.0120091, "balance_loss_clip": 1.01132667, "balance_loss_mlp": 1.0015918, "epoch": 0.2021282991643119, "flos": 28545403700160.0, "grad_norm": 3.2498792842558206, "language_loss": 0.72287107, "learning_rate": 3.6973150981934196e-06, "loss": 0.74838161, "num_input_tokens_seen": 35690080, "step": 1681, "time_per_iteration": 2.861664056777954 }, { "auxiliary_loss_clip": 0.0138975, "auxiliary_loss_mlp": 0.01200589, "balance_loss_clip": 1.01226783, "balance_loss_mlp": 1.00146222, "epoch": 0.202248542054951, "flos": 17923454094240.0, "grad_norm": 2.5094119652144684, "language_loss": 0.83511293, "learning_rate": 3.6969029381082415e-06, "loss": 0.86101633, "num_input_tokens_seen": 35706075, "step": 1682, "time_per_iteration": 2.722299098968506 }, { "auxiliary_loss_clip": 0.01350409, "auxiliary_loss_mlp": 0.01199943, "balance_loss_clip": 1.01133144, "balance_loss_mlp": 1.00138807, "epoch": 0.2023687849455901, "flos": 19864644456960.0, "grad_norm": 1.8535734283004792, "language_loss": 0.79319715, "learning_rate": 3.696490520604237e-06, "loss": 0.81870073, "num_input_tokens_seen": 35724765, "step": 1683, "time_per_iteration": 2.9304070472717285 }, { "auxiliary_loss_clip": 0.01365499, "auxiliary_loss_mlp": 0.01199704, "balance_loss_clip": 1.01187468, "balance_loss_mlp": 1.00133967, "epoch": 0.20248902783622919, "flos": 22564732557600.0, "grad_norm": 1.7072669370164613, "language_loss": 0.80867207, "learning_rate": 3.696077845743968e-06, "loss": 0.83432412, "num_input_tokens_seen": 35744355, "step": 1684, "time_per_iteration": 2.8485589027404785 }, { "auxiliary_loss_clip": 0.01388813, "auxiliary_loss_mlp": 0.0120019, "balance_loss_clip": 1.01167035, "balance_loss_mlp": 1.00125396, "epoch": 0.20260927072686827, "flos": 22709739540960.0, "grad_norm": 3.2205306729138066, "language_loss": 0.72908109, "learning_rate": 3.69566491359004e-06, "loss": 0.75497115, "num_input_tokens_seen": 35761000, "step": 1685, "time_per_iteration": 2.8620455265045166 }, { "auxiliary_loss_clip": 0.01361662, "auxiliary_loss_mlp": 0.01200167, "balance_loss_clip": 1.01218677, "balance_loss_mlp": 1.00123096, "epoch": 0.20272951361750738, "flos": 51024017691360.0, "grad_norm": 29.0540193013849, "language_loss": 0.69318748, "learning_rate": 3.695251724205092e-06, "loss": 0.71880579, "num_input_tokens_seen": 35785360, "step": 1686, "time_per_iteration": 4.144987106323242 }, { "auxiliary_loss_clip": 0.01389108, "auxiliary_loss_mlp": 0.01200151, "balance_loss_clip": 1.01231861, "balance_loss_mlp": 1.0012145, "epoch": 0.20284975650814646, "flos": 26578145954880.0, "grad_norm": 1.6659222291678362, "language_loss": 0.86849821, "learning_rate": 3.6948382776518054e-06, "loss": 0.89439082, "num_input_tokens_seen": 35806065, "step": 1687, "time_per_iteration": 3.7307443618774414 }, { "auxiliary_loss_clip": 0.01338193, "auxiliary_loss_mlp": 0.01200719, "balance_loss_clip": 1.01126289, "balance_loss_mlp": 1.00178266, "epoch": 0.20296999939878554, "flos": 16034232087360.0, "grad_norm": 2.3130013853740103, "language_loss": 0.79375827, "learning_rate": 3.6944245739929e-06, "loss": 0.81914747, "num_input_tokens_seen": 35822225, "step": 1688, "time_per_iteration": 3.769861936569214 }, { "auxiliary_loss_clip": 0.01363571, "auxiliary_loss_mlp": 0.0120063, "balance_loss_clip": 1.01128709, "balance_loss_mlp": 1.00131166, "epoch": 0.20309024228942463, "flos": 19203383577600.0, "grad_norm": 2.9437292649645648, "language_loss": 0.71712065, "learning_rate": 3.6940106132911332e-06, "loss": 0.74276268, "num_input_tokens_seen": 35839410, "step": 1689, "time_per_iteration": 4.373710870742798 }, { "auxiliary_loss_clip": 0.01365904, "auxiliary_loss_mlp": 0.01199847, "balance_loss_clip": 1.01142895, "balance_loss_mlp": 1.00129259, "epoch": 0.20321048518006374, "flos": 22821099245280.0, "grad_norm": 1.8914510369036845, "language_loss": 0.89035916, "learning_rate": 3.6935963956093037e-06, "loss": 0.9160167, "num_input_tokens_seen": 35859495, "step": 1690, "time_per_iteration": 2.8557286262512207 }, { "auxiliary_loss_clip": 0.01377311, "auxiliary_loss_mlp": 0.01199829, "balance_loss_clip": 1.01283288, "balance_loss_mlp": 1.00127399, "epoch": 0.20333072807070282, "flos": 19096406562240.0, "grad_norm": 1.7728266000131991, "language_loss": 0.68997747, "learning_rate": 3.6931819210102474e-06, "loss": 0.71574885, "num_input_tokens_seen": 35878890, "step": 1691, "time_per_iteration": 2.854729413986206 }, { "auxiliary_loss_clip": 0.01389299, "auxiliary_loss_mlp": 0.0120012, "balance_loss_clip": 1.0121491, "balance_loss_mlp": 1.00137424, "epoch": 0.2034509709613419, "flos": 18180970339680.0, "grad_norm": 1.7751017522948214, "language_loss": 0.84427214, "learning_rate": 3.6927671895568402e-06, "loss": 0.8701663, "num_input_tokens_seen": 35897950, "step": 1692, "time_per_iteration": 2.8487701416015625 }, { "auxiliary_loss_clip": 0.01389451, "auxiliary_loss_mlp": 0.0120036, "balance_loss_clip": 1.01252902, "balance_loss_mlp": 1.00142312, "epoch": 0.20357121385198101, "flos": 22923909113760.0, "grad_norm": 4.180234746916203, "language_loss": 0.86692417, "learning_rate": 3.692352201311996e-06, "loss": 0.89282227, "num_input_tokens_seen": 35916800, "step": 1693, "time_per_iteration": 2.8812031745910645 }, { "auxiliary_loss_clip": 0.01326973, "auxiliary_loss_mlp": 0.01200518, "balance_loss_clip": 1.01100206, "balance_loss_mlp": 1.00158143, "epoch": 0.2036914567426201, "flos": 20922141921120.0, "grad_norm": 2.1772413648317963, "language_loss": 0.7628293, "learning_rate": 3.6919369563386687e-06, "loss": 0.7881043, "num_input_tokens_seen": 35936600, "step": 1694, "time_per_iteration": 3.06186842918396 }, { "auxiliary_loss_clip": 0.01339487, "auxiliary_loss_mlp": 0.01200127, "balance_loss_clip": 1.01125252, "balance_loss_mlp": 1.0011909, "epoch": 0.20381169963325918, "flos": 15519163672800.0, "grad_norm": 2.114758356660841, "language_loss": 0.78987718, "learning_rate": 3.69152145469985e-06, "loss": 0.81527328, "num_input_tokens_seen": 35953645, "step": 1695, "time_per_iteration": 2.8542239665985107 }, { "auxiliary_loss_clip": 0.01311589, "auxiliary_loss_mlp": 0.01200735, "balance_loss_clip": 1.01037824, "balance_loss_mlp": 1.00141668, "epoch": 0.20393194252389826, "flos": 28833154475040.0, "grad_norm": 1.791400486955152, "language_loss": 0.82212675, "learning_rate": 3.691105696458572e-06, "loss": 0.84724998, "num_input_tokens_seen": 35970940, "step": 1696, "time_per_iteration": 2.983912706375122 }, { "auxiliary_loss_clip": 0.01388666, "auxiliary_loss_mlp": 0.01200048, "balance_loss_clip": 1.01229262, "balance_loss_mlp": 1.00130177, "epoch": 0.20405218541453737, "flos": 22488564850560.0, "grad_norm": 2.443562149299013, "language_loss": 0.68192416, "learning_rate": 3.690689681677904e-06, "loss": 0.70781124, "num_input_tokens_seen": 35989410, "step": 1697, "time_per_iteration": 2.7362453937530518 }, { "auxiliary_loss_clip": 0.01360556, "auxiliary_loss_mlp": 0.01199929, "balance_loss_clip": 1.01132655, "balance_loss_mlp": 1.00137448, "epoch": 0.20417242830517646, "flos": 25374420102240.0, "grad_norm": 2.721536401302222, "language_loss": 0.88592702, "learning_rate": 3.690273410420956e-06, "loss": 0.91153187, "num_input_tokens_seen": 36009175, "step": 1698, "time_per_iteration": 2.8506762981414795 }, { "auxiliary_loss_clip": 0.01363127, "auxiliary_loss_mlp": 0.01200321, "balance_loss_clip": 1.01117826, "balance_loss_mlp": 1.00138474, "epoch": 0.20429267119581554, "flos": 14793086969280.0, "grad_norm": 2.9267035217668513, "language_loss": 0.76682079, "learning_rate": 3.689856882750875e-06, "loss": 0.7924552, "num_input_tokens_seen": 36024375, "step": 1699, "time_per_iteration": 2.782503604888916 }, { "auxiliary_loss_clip": 0.01363704, "auxiliary_loss_mlp": 0.01199158, "balance_loss_clip": 1.01168478, "balance_loss_mlp": 1.00079393, "epoch": 0.20441291408645465, "flos": 17781859860480.0, "grad_norm": 1.841949506993517, "language_loss": 0.78845286, "learning_rate": 3.6894400987308486e-06, "loss": 0.81408149, "num_input_tokens_seen": 36041895, "step": 1700, "time_per_iteration": 2.836517333984375 }, { "auxiliary_loss_clip": 0.01366611, "auxiliary_loss_mlp": 0.01199894, "balance_loss_clip": 1.01161337, "balance_loss_mlp": 1.00114822, "epoch": 0.20453315697709373, "flos": 16435677605760.0, "grad_norm": 1.8705162010962597, "language_loss": 0.84707701, "learning_rate": 3.6890230584241024e-06, "loss": 0.87274206, "num_input_tokens_seen": 36058825, "step": 1701, "time_per_iteration": 2.749075174331665 }, { "auxiliary_loss_clip": 0.01369134, "auxiliary_loss_mlp": 0.01197032, "balance_loss_clip": 1.00880086, "balance_loss_mlp": 1.00019383, "epoch": 0.20465339986773282, "flos": 66713119381440.0, "grad_norm": 1.0752233018438946, "language_loss": 0.66374981, "learning_rate": 3.6886057618939016e-06, "loss": 0.68941152, "num_input_tokens_seen": 36121645, "step": 1702, "time_per_iteration": 3.388259172439575 }, { "auxiliary_loss_clip": 0.01337874, "auxiliary_loss_mlp": 0.01199646, "balance_loss_clip": 1.01136756, "balance_loss_mlp": 1.001091, "epoch": 0.2047736427583719, "flos": 41974131032640.0, "grad_norm": 1.9083629078058304, "language_loss": 0.69415432, "learning_rate": 3.6881882092035492e-06, "loss": 0.71952951, "num_input_tokens_seen": 36143030, "step": 1703, "time_per_iteration": 3.0392379760742188 }, { "auxiliary_loss_clip": 0.01324501, "auxiliary_loss_mlp": 0.00872697, "balance_loss_clip": 1.00824571, "balance_loss_mlp": 0.99993128, "epoch": 0.204893885649011, "flos": 69940907822880.0, "grad_norm": 1.0075840353101622, "language_loss": 0.6130175, "learning_rate": 3.6877704004163873e-06, "loss": 0.63498944, "num_input_tokens_seen": 36203435, "step": 1704, "time_per_iteration": 3.5103681087493896 }, { "auxiliary_loss_clip": 0.01389735, "auxiliary_loss_mlp": 0.0120062, "balance_loss_clip": 1.01297188, "balance_loss_mlp": 1.00168324, "epoch": 0.2050141285396501, "flos": 22200023754720.0, "grad_norm": 2.002396917725063, "language_loss": 0.7742281, "learning_rate": 3.6873523355957984e-06, "loss": 0.80013168, "num_input_tokens_seen": 36222435, "step": 1705, "time_per_iteration": 2.8643229007720947 }, { "auxiliary_loss_clip": 0.01368874, "auxiliary_loss_mlp": 0.01196993, "balance_loss_clip": 1.00872386, "balance_loss_mlp": 1.00015438, "epoch": 0.20513437143028918, "flos": 46283750478720.0, "grad_norm": 0.9830070805502359, "language_loss": 0.64073741, "learning_rate": 3.686934014805201e-06, "loss": 0.66639608, "num_input_tokens_seen": 36273065, "step": 1706, "time_per_iteration": 3.163412094116211 }, { "auxiliary_loss_clip": 0.01363617, "auxiliary_loss_mlp": 0.01200673, "balance_loss_clip": 1.01144242, "balance_loss_mlp": 1.00173712, "epoch": 0.20525461432092829, "flos": 21904333846560.0, "grad_norm": 1.902967184852814, "language_loss": 0.80848384, "learning_rate": 3.6865154381080552e-06, "loss": 0.83412671, "num_input_tokens_seen": 36293750, "step": 1707, "time_per_iteration": 2.9302687644958496 }, { "auxiliary_loss_clip": 0.012684, "auxiliary_loss_mlp": 0.01199984, "balance_loss_clip": 1.00908411, "balance_loss_mlp": 1.00123787, "epoch": 0.20537485721156737, "flos": 21214275384960.0, "grad_norm": 2.884114747382971, "language_loss": 0.824323, "learning_rate": 3.6860966055678585e-06, "loss": 0.84900689, "num_input_tokens_seen": 36310105, "step": 1708, "time_per_iteration": 2.9488778114318848 }, { "auxiliary_loss_clip": 0.01363487, "auxiliary_loss_mlp": 0.01200227, "balance_loss_clip": 1.01149845, "balance_loss_mlp": 1.00129104, "epoch": 0.20549510010220645, "flos": 20191215520800.0, "grad_norm": 1.5755371785216472, "language_loss": 0.86499393, "learning_rate": 3.685677517248147e-06, "loss": 0.89063114, "num_input_tokens_seen": 36328995, "step": 1709, "time_per_iteration": 2.9754772186279297 }, { "auxiliary_loss_clip": 0.01339926, "auxiliary_loss_mlp": 0.00873261, "balance_loss_clip": 1.01167536, "balance_loss_mlp": 1.00016272, "epoch": 0.20561534299284553, "flos": 17016711402240.0, "grad_norm": 2.049321896723907, "language_loss": 0.80294645, "learning_rate": 3.6852581732124967e-06, "loss": 0.82507837, "num_input_tokens_seen": 36346340, "step": 1710, "time_per_iteration": 2.857006072998047 }, { "auxiliary_loss_clip": 0.01365348, "auxiliary_loss_mlp": 0.01200492, "balance_loss_clip": 1.01192141, "balance_loss_mlp": 1.00117362, "epoch": 0.20573558588348465, "flos": 22890477376800.0, "grad_norm": 1.9410513284469093, "language_loss": 0.76106536, "learning_rate": 3.6848385735245213e-06, "loss": 0.78672379, "num_input_tokens_seen": 36365430, "step": 1711, "time_per_iteration": 2.8065083026885986 }, { "auxiliary_loss_clip": 0.01375839, "auxiliary_loss_mlp": 0.0119962, "balance_loss_clip": 1.0118804, "balance_loss_mlp": 1.00106454, "epoch": 0.20585582877412373, "flos": 24643134465120.0, "grad_norm": 1.8950393322496142, "language_loss": 0.86332285, "learning_rate": 3.6844187182478734e-06, "loss": 0.88907743, "num_input_tokens_seen": 36386285, "step": 1712, "time_per_iteration": 4.3268210887908936 }, { "auxiliary_loss_clip": 0.01350957, "auxiliary_loss_mlp": 0.01200023, "balance_loss_clip": 1.01122773, "balance_loss_mlp": 1.00127745, "epoch": 0.2059760716647628, "flos": 24206963957280.0, "grad_norm": 1.992166077273701, "language_loss": 0.75262737, "learning_rate": 3.683998607446246e-06, "loss": 0.77813721, "num_input_tokens_seen": 36404935, "step": 1713, "time_per_iteration": 3.697187662124634 }, { "auxiliary_loss_clip": 0.01367643, "auxiliary_loss_mlp": 0.01199466, "balance_loss_clip": 1.01177812, "balance_loss_mlp": 1.00148368, "epoch": 0.20609631455540192, "flos": 20229532878240.0, "grad_norm": 2.0368942213325623, "language_loss": 0.74960077, "learning_rate": 3.6835782411833686e-06, "loss": 0.77527189, "num_input_tokens_seen": 36424455, "step": 1714, "time_per_iteration": 3.837578058242798 }, { "auxiliary_loss_clip": 0.01327216, "auxiliary_loss_mlp": 0.01200048, "balance_loss_clip": 1.01087213, "balance_loss_mlp": 1.00149333, "epoch": 0.206216557446041, "flos": 19864967770080.0, "grad_norm": 1.7086254613730252, "language_loss": 0.74405301, "learning_rate": 3.68315761952301e-06, "loss": 0.76932561, "num_input_tokens_seen": 36441685, "step": 1715, "time_per_iteration": 3.9886443614959717 }, { "auxiliary_loss_clip": 0.01389178, "auxiliary_loss_mlp": 0.0120032, "balance_loss_clip": 1.01282227, "balance_loss_mlp": 1.00138354, "epoch": 0.2063368003366801, "flos": 24096322726560.0, "grad_norm": 1.9744031533112345, "language_loss": 0.82909822, "learning_rate": 3.6827367425289797e-06, "loss": 0.85499316, "num_input_tokens_seen": 36461460, "step": 1716, "time_per_iteration": 2.82633638381958 }, { "auxiliary_loss_clip": 0.01340768, "auxiliary_loss_mlp": 0.01199902, "balance_loss_clip": 1.01102924, "balance_loss_mlp": 1.00115609, "epoch": 0.2064570432273192, "flos": 20340174108960.0, "grad_norm": 2.151194472346078, "language_loss": 0.7275486, "learning_rate": 3.6823156102651225e-06, "loss": 0.75295526, "num_input_tokens_seen": 36479615, "step": 1717, "time_per_iteration": 2.856114149093628 }, { "auxiliary_loss_clip": 0.01260859, "auxiliary_loss_mlp": 0.0119999, "balance_loss_clip": 1.00919223, "balance_loss_mlp": 1.00143468, "epoch": 0.20657728611795828, "flos": 20520373089600.0, "grad_norm": 1.6577329383554633, "language_loss": 0.7111991, "learning_rate": 3.6818942227953257e-06, "loss": 0.73580766, "num_input_tokens_seen": 36500160, "step": 1718, "time_per_iteration": 2.979947805404663 }, { "auxiliary_loss_clip": 0.01324371, "auxiliary_loss_mlp": 0.01200152, "balance_loss_clip": 1.01036251, "balance_loss_mlp": 1.00140643, "epoch": 0.20669752900859736, "flos": 21799296709920.0, "grad_norm": 1.8377924873936502, "language_loss": 0.6905272, "learning_rate": 3.681472580183512e-06, "loss": 0.71577239, "num_input_tokens_seen": 36518810, "step": 1719, "time_per_iteration": 2.9061527252197266 }, { "auxiliary_loss_clip": 0.0136302, "auxiliary_loss_mlp": 0.01199554, "balance_loss_clip": 1.01189375, "balance_loss_mlp": 1.0013808, "epoch": 0.20681777189923645, "flos": 15122028996000.0, "grad_norm": 1.8251172811934275, "language_loss": 0.86653602, "learning_rate": 3.6810506824936455e-06, "loss": 0.89216173, "num_input_tokens_seen": 36536890, "step": 1720, "time_per_iteration": 2.7439558506011963 }, { "auxiliary_loss_clip": 0.013309, "auxiliary_loss_mlp": 0.0119694, "balance_loss_clip": 1.00783062, "balance_loss_mlp": 1.00010192, "epoch": 0.20693801478987556, "flos": 56481047868960.0, "grad_norm": 1.052287009019266, "language_loss": 0.6257093, "learning_rate": 3.680628529789726e-06, "loss": 0.65098768, "num_input_tokens_seen": 36589300, "step": 1721, "time_per_iteration": 3.256497383117676 }, { "auxiliary_loss_clip": 0.01390312, "auxiliary_loss_mlp": 0.01200768, "balance_loss_clip": 1.01341343, "balance_loss_mlp": 1.00145006, "epoch": 0.20705825768051464, "flos": 21614212108800.0, "grad_norm": 2.980012514951489, "language_loss": 0.86437404, "learning_rate": 3.680206122135796e-06, "loss": 0.89028478, "num_input_tokens_seen": 36609905, "step": 1722, "time_per_iteration": 2.7218992710113525 }, { "auxiliary_loss_clip": 0.01307358, "auxiliary_loss_mlp": 0.01199742, "balance_loss_clip": 1.01041555, "balance_loss_mlp": 1.00118709, "epoch": 0.20717850057115372, "flos": 25848907967520.0, "grad_norm": 2.0090021979027433, "language_loss": 0.78672636, "learning_rate": 3.6797834595959323e-06, "loss": 0.81179738, "num_input_tokens_seen": 36629805, "step": 1723, "time_per_iteration": 2.922579288482666 }, { "auxiliary_loss_clip": 0.01304183, "auxiliary_loss_mlp": 0.01199783, "balance_loss_clip": 1.00968373, "balance_loss_mlp": 1.0010376, "epoch": 0.20729874346179283, "flos": 29130820185600.0, "grad_norm": 2.743744818889649, "language_loss": 0.77655923, "learning_rate": 3.679360542234254e-06, "loss": 0.80159891, "num_input_tokens_seen": 36649150, "step": 1724, "time_per_iteration": 2.9471707344055176 }, { "auxiliary_loss_clip": 0.01364447, "auxiliary_loss_mlp": 0.00873348, "balance_loss_clip": 1.01262712, "balance_loss_mlp": 1.00022006, "epoch": 0.20741898635243192, "flos": 29023376162400.0, "grad_norm": 1.7030879997406372, "language_loss": 0.72425532, "learning_rate": 3.678937370114916e-06, "loss": 0.74663329, "num_input_tokens_seen": 36668955, "step": 1725, "time_per_iteration": 2.886711597442627 }, { "auxiliary_loss_clip": 0.01339366, "auxiliary_loss_mlp": 0.01199667, "balance_loss_clip": 1.01067781, "balance_loss_mlp": 1.00130296, "epoch": 0.207539229243071, "flos": 15559456832640.0, "grad_norm": 1.866227372102434, "language_loss": 0.78966421, "learning_rate": 3.678513943302114e-06, "loss": 0.81505454, "num_input_tokens_seen": 36685730, "step": 1726, "time_per_iteration": 2.861593723297119 }, { "auxiliary_loss_clip": 0.01388671, "auxiliary_loss_mlp": 0.01199709, "balance_loss_clip": 1.01296937, "balance_loss_mlp": 1.00134516, "epoch": 0.20765947213371008, "flos": 20521091563200.0, "grad_norm": 1.7095905440620627, "language_loss": 0.84992075, "learning_rate": 3.678090261860082e-06, "loss": 0.87580454, "num_input_tokens_seen": 36705460, "step": 1727, "time_per_iteration": 2.7590994834899902 }, { "auxiliary_loss_clip": 0.01338796, "auxiliary_loss_mlp": 0.01199446, "balance_loss_clip": 1.01079941, "balance_loss_mlp": 1.00108123, "epoch": 0.2077797150243492, "flos": 19354425739200.0, "grad_norm": 2.0753860078513107, "language_loss": 0.77870643, "learning_rate": 3.6776663258530906e-06, "loss": 0.80408883, "num_input_tokens_seen": 36724110, "step": 1728, "time_per_iteration": 2.9468038082122803 }, { "auxiliary_loss_clip": 0.01376097, "auxiliary_loss_mlp": 0.01199331, "balance_loss_clip": 1.01261151, "balance_loss_mlp": 1.00115752, "epoch": 0.20789995791498828, "flos": 21829962323520.0, "grad_norm": 1.865159714055912, "language_loss": 0.71299934, "learning_rate": 3.6772421353454516e-06, "loss": 0.73875362, "num_input_tokens_seen": 36742705, "step": 1729, "time_per_iteration": 2.8456640243530273 }, { "auxiliary_loss_clip": 0.01363759, "auxiliary_loss_mlp": 0.01200275, "balance_loss_clip": 1.01265454, "balance_loss_mlp": 1.00152922, "epoch": 0.20802020080562736, "flos": 23148855790560.0, "grad_norm": 1.959953875704139, "language_loss": 0.88304055, "learning_rate": 3.6768176904015153e-06, "loss": 0.90868086, "num_input_tokens_seen": 36762510, "step": 1730, "time_per_iteration": 2.785979747772217 }, { "auxiliary_loss_clip": 0.01366959, "auxiliary_loss_mlp": 0.0120011, "balance_loss_clip": 1.01138544, "balance_loss_mlp": 1.00155544, "epoch": 0.20814044369626647, "flos": 23072688083520.0, "grad_norm": 1.8625412583320529, "language_loss": 0.59860116, "learning_rate": 3.6763929910856674e-06, "loss": 0.62427187, "num_input_tokens_seen": 36780960, "step": 1731, "time_per_iteration": 2.8583528995513916 }, { "auxiliary_loss_clip": 0.01362691, "auxiliary_loss_mlp": 0.01199502, "balance_loss_clip": 1.01166475, "balance_loss_mlp": 1.00113797, "epoch": 0.20826068658690555, "flos": 19608026303520.0, "grad_norm": 2.5875248383623193, "language_loss": 0.77665901, "learning_rate": 3.6759680374623365e-06, "loss": 0.80228096, "num_input_tokens_seen": 36798875, "step": 1732, "time_per_iteration": 2.8006539344787598 }, { "auxiliary_loss_clip": 0.01387067, "auxiliary_loss_mlp": 0.01199692, "balance_loss_clip": 1.01229239, "balance_loss_mlp": 1.00132775, "epoch": 0.20838092947754464, "flos": 25374060865440.0, "grad_norm": 2.480030631935898, "language_loss": 0.75264823, "learning_rate": 3.675542829595986e-06, "loss": 0.77851588, "num_input_tokens_seen": 36818540, "step": 1733, "time_per_iteration": 2.8312411308288574 }, { "auxiliary_loss_clip": 0.01351799, "auxiliary_loss_mlp": 0.01199256, "balance_loss_clip": 1.01194429, "balance_loss_mlp": 1.00108206, "epoch": 0.20850117236818372, "flos": 24061741431840.0, "grad_norm": 1.4790765129367038, "language_loss": 0.79330653, "learning_rate": 3.6751173675511213e-06, "loss": 0.81881714, "num_input_tokens_seen": 36840585, "step": 1734, "time_per_iteration": 2.889030933380127 }, { "auxiliary_loss_clip": 0.01363205, "auxiliary_loss_mlp": 0.01199884, "balance_loss_clip": 1.01231647, "balance_loss_mlp": 1.00113773, "epoch": 0.20862141525882283, "flos": 20077808166720.0, "grad_norm": 2.0328195372458824, "language_loss": 0.87927234, "learning_rate": 3.674691651392283e-06, "loss": 0.90490323, "num_input_tokens_seen": 36858255, "step": 1735, "time_per_iteration": 2.8644683361053467 }, { "auxiliary_loss_clip": 0.01350221, "auxiliary_loss_mlp": 0.01200053, "balance_loss_clip": 1.01189315, "balance_loss_mlp": 1.00149775, "epoch": 0.2087416581494619, "flos": 39015197510400.0, "grad_norm": 2.902330360841315, "language_loss": 0.75798309, "learning_rate": 3.674265681184053e-06, "loss": 0.78348589, "num_input_tokens_seen": 36881515, "step": 1736, "time_per_iteration": 3.026801586151123 }, { "auxiliary_loss_clip": 0.01351577, "auxiliary_loss_mlp": 0.011997, "balance_loss_clip": 1.0120734, "balance_loss_mlp": 1.00114524, "epoch": 0.208861901040101, "flos": 26102005600320.0, "grad_norm": 3.2228581156796414, "language_loss": 0.86649919, "learning_rate": 3.6738394569910504e-06, "loss": 0.892012, "num_input_tokens_seen": 36902055, "step": 1737, "time_per_iteration": 2.8897619247436523 }, { "auxiliary_loss_clip": 0.01363635, "auxiliary_loss_mlp": 0.01200156, "balance_loss_clip": 1.01274729, "balance_loss_mlp": 1.00141001, "epoch": 0.2089821439307401, "flos": 28398744227520.0, "grad_norm": 2.230872480320224, "language_loss": 0.82728523, "learning_rate": 3.6734129788779333e-06, "loss": 0.85292315, "num_input_tokens_seen": 36921230, "step": 1738, "time_per_iteration": 3.9685964584350586 }, { "auxiliary_loss_clip": 0.01326921, "auxiliary_loss_mlp": 0.01199459, "balance_loss_clip": 1.01134777, "balance_loss_mlp": 1.00128508, "epoch": 0.2091023868213792, "flos": 21069627638400.0, "grad_norm": 1.9338139167749322, "language_loss": 0.90742439, "learning_rate": 3.6729862469093976e-06, "loss": 0.93268818, "num_input_tokens_seen": 36940325, "step": 1739, "time_per_iteration": 3.7187650203704834 }, { "auxiliary_loss_clip": 0.01348803, "auxiliary_loss_mlp": 0.01199323, "balance_loss_clip": 1.01181257, "balance_loss_mlp": 1.00114965, "epoch": 0.20922262971201827, "flos": 22455492350400.0, "grad_norm": 3.3258315013195414, "language_loss": 0.82550061, "learning_rate": 3.6725592611501782e-06, "loss": 0.85098183, "num_input_tokens_seen": 36959000, "step": 1740, "time_per_iteration": 2.8456523418426514 }, { "auxiliary_loss_clip": 0.01376441, "auxiliary_loss_mlp": 0.01199877, "balance_loss_clip": 1.01270819, "balance_loss_mlp": 1.00113082, "epoch": 0.20934287260265738, "flos": 27852255802080.0, "grad_norm": 1.8254681620583957, "language_loss": 0.76169634, "learning_rate": 3.6721320216650496e-06, "loss": 0.78745949, "num_input_tokens_seen": 36979615, "step": 1741, "time_per_iteration": 4.887996673583984 }, { "auxiliary_loss_clip": 0.01339511, "auxiliary_loss_mlp": 0.01199118, "balance_loss_clip": 1.01080203, "balance_loss_mlp": 1.00094497, "epoch": 0.20946311549329646, "flos": 16435318368960.0, "grad_norm": 1.6799031687721686, "language_loss": 0.83912843, "learning_rate": 3.6717045285188215e-06, "loss": 0.86451471, "num_input_tokens_seen": 36997310, "step": 1742, "time_per_iteration": 2.8772377967834473 }, { "auxiliary_loss_clip": 0.01314843, "auxiliary_loss_mlp": 0.01199479, "balance_loss_clip": 1.01037574, "balance_loss_mlp": 1.00111449, "epoch": 0.20958335838393555, "flos": 22492731997440.0, "grad_norm": 1.9895370273819672, "language_loss": 0.87190449, "learning_rate": 3.671276781776346e-06, "loss": 0.8970477, "num_input_tokens_seen": 37015965, "step": 1743, "time_per_iteration": 2.8659074306488037 }, { "auxiliary_loss_clip": 0.01350255, "auxiliary_loss_mlp": 0.012, "balance_loss_clip": 1.01235855, "balance_loss_mlp": 1.00144506, "epoch": 0.20970360127457463, "flos": 25224778964160.0, "grad_norm": 1.8689400683300548, "language_loss": 0.67083287, "learning_rate": 3.6708487815025128e-06, "loss": 0.69633543, "num_input_tokens_seen": 37036545, "step": 1744, "time_per_iteration": 2.9740891456604004 }, { "auxiliary_loss_clip": 0.01325332, "auxiliary_loss_mlp": 0.01200002, "balance_loss_clip": 1.01062942, "balance_loss_mlp": 1.00144672, "epoch": 0.20982384416521374, "flos": 18479174905440.0, "grad_norm": 2.3142628444633506, "language_loss": 0.74690998, "learning_rate": 3.6704205277622463e-06, "loss": 0.77216333, "num_input_tokens_seen": 37054985, "step": 1745, "time_per_iteration": 2.8808183670043945 }, { "auxiliary_loss_clip": 0.0135496, "auxiliary_loss_mlp": 0.01199997, "balance_loss_clip": 1.01172161, "balance_loss_mlp": 1.00125134, "epoch": 0.20994408705585282, "flos": 25373557933920.0, "grad_norm": 1.734669836210292, "language_loss": 0.80494994, "learning_rate": 3.6699920206205146e-06, "loss": 0.83049953, "num_input_tokens_seen": 37075725, "step": 1746, "time_per_iteration": 2.9541518688201904 }, { "auxiliary_loss_clip": 0.01374859, "auxiliary_loss_mlp": 0.01199264, "balance_loss_clip": 1.01225269, "balance_loss_mlp": 1.00109029, "epoch": 0.2100643299464919, "flos": 21320965010880.0, "grad_norm": 1.952651059157795, "language_loss": 0.82185119, "learning_rate": 3.669563260142321e-06, "loss": 0.84759247, "num_input_tokens_seen": 37094615, "step": 1747, "time_per_iteration": 2.909301280975342 }, { "auxiliary_loss_clip": 0.01336197, "auxiliary_loss_mlp": 0.01199034, "balance_loss_clip": 1.01095545, "balance_loss_mlp": 1.00105166, "epoch": 0.21018457283713102, "flos": 19354389815520.0, "grad_norm": 2.309277112190666, "language_loss": 0.84175068, "learning_rate": 3.6691342463927083e-06, "loss": 0.86710298, "num_input_tokens_seen": 37113610, "step": 1748, "time_per_iteration": 2.806640148162842 }, { "auxiliary_loss_clip": 0.01337025, "auxiliary_loss_mlp": 0.01199549, "balance_loss_clip": 1.012012, "balance_loss_mlp": 1.00137579, "epoch": 0.2103048157277701, "flos": 28330048645920.0, "grad_norm": 1.5901469209407064, "language_loss": 0.82065588, "learning_rate": 3.668704979436758e-06, "loss": 0.84602165, "num_input_tokens_seen": 37133705, "step": 1749, "time_per_iteration": 2.889089345932007 }, { "auxiliary_loss_clip": 0.01363377, "auxiliary_loss_mlp": 0.01200144, "balance_loss_clip": 1.0121299, "balance_loss_mlp": 1.00158942, "epoch": 0.21042505861840918, "flos": 17457300522720.0, "grad_norm": 1.9150931895215124, "language_loss": 0.78620529, "learning_rate": 3.668275459339588e-06, "loss": 0.81184053, "num_input_tokens_seen": 37152185, "step": 1750, "time_per_iteration": 2.828420639038086 }, { "auxiliary_loss_clip": 0.01387623, "auxiliary_loss_mlp": 0.01199829, "balance_loss_clip": 1.01221108, "balance_loss_mlp": 1.00127447, "epoch": 0.21054530150904827, "flos": 14209825904640.0, "grad_norm": 1.8816097483889695, "language_loss": 0.80129391, "learning_rate": 3.667845686166358e-06, "loss": 0.82716841, "num_input_tokens_seen": 37169110, "step": 1751, "time_per_iteration": 2.794052839279175 }, { "auxiliary_loss_clip": 0.01336971, "auxiliary_loss_mlp": 0.01199806, "balance_loss_clip": 1.0117054, "balance_loss_mlp": 1.00144148, "epoch": 0.21066554439968738, "flos": 18618218557920.0, "grad_norm": 1.7051025315353217, "language_loss": 0.86102176, "learning_rate": 3.6674156599822634e-06, "loss": 0.88638949, "num_input_tokens_seen": 37184905, "step": 1752, "time_per_iteration": 2.934260129928589 }, { "auxiliary_loss_clip": 0.01325448, "auxiliary_loss_mlp": 0.01199993, "balance_loss_clip": 1.01138365, "balance_loss_mlp": 1.00162888, "epoch": 0.21078578729032646, "flos": 23658894889920.0, "grad_norm": 1.8482202828156473, "language_loss": 0.81711996, "learning_rate": 3.666985380852539e-06, "loss": 0.84237432, "num_input_tokens_seen": 37203910, "step": 1753, "time_per_iteration": 2.8824353218078613 }, { "auxiliary_loss_clip": 0.01339028, "auxiliary_loss_mlp": 0.01200297, "balance_loss_clip": 1.01077521, "balance_loss_mlp": 1.00136089, "epoch": 0.21090603018096554, "flos": 29346390781920.0, "grad_norm": 2.141456522167116, "language_loss": 0.74556506, "learning_rate": 3.6665548488424576e-06, "loss": 0.7709583, "num_input_tokens_seen": 37222670, "step": 1754, "time_per_iteration": 2.923661470413208 }, { "auxiliary_loss_clip": 0.01387778, "auxiliary_loss_mlp": 0.01200055, "balance_loss_clip": 1.01268387, "balance_loss_mlp": 1.00169063, "epoch": 0.21102627307160465, "flos": 23261257281600.0, "grad_norm": 1.8267276522233737, "language_loss": 0.88068426, "learning_rate": 3.6661240640173307e-06, "loss": 0.90656263, "num_input_tokens_seen": 37244140, "step": 1755, "time_per_iteration": 2.867332935333252 }, { "auxiliary_loss_clip": 0.01318998, "auxiliary_loss_mlp": 0.011971, "balance_loss_clip": 1.00891316, "balance_loss_mlp": 1.00026131, "epoch": 0.21114651596224374, "flos": 54633484121760.0, "grad_norm": 0.8423494726268649, "language_loss": 0.57905674, "learning_rate": 3.6656930264425085e-06, "loss": 0.60421777, "num_input_tokens_seen": 37308185, "step": 1756, "time_per_iteration": 3.4555327892303467 }, { "auxiliary_loss_clip": 0.01386992, "auxiliary_loss_mlp": 0.01200286, "balance_loss_clip": 1.01224709, "balance_loss_mlp": 1.00154066, "epoch": 0.21126675885288282, "flos": 21543325182720.0, "grad_norm": 1.8472471038019527, "language_loss": 0.75568759, "learning_rate": 3.665261736183378e-06, "loss": 0.7815603, "num_input_tokens_seen": 37328220, "step": 1757, "time_per_iteration": 2.7644011974334717 }, { "auxiliary_loss_clip": 0.01324613, "auxiliary_loss_mlp": 0.01199681, "balance_loss_clip": 1.01133132, "balance_loss_mlp": 1.00112653, "epoch": 0.2113870017435219, "flos": 10961884278720.0, "grad_norm": 2.219275885429932, "language_loss": 0.886913, "learning_rate": 3.664830193305366e-06, "loss": 0.91215599, "num_input_tokens_seen": 37345995, "step": 1758, "time_per_iteration": 2.868067741394043 }, { "auxiliary_loss_clip": 0.01337952, "auxiliary_loss_mlp": 0.01199439, "balance_loss_clip": 1.01103282, "balance_loss_mlp": 1.00107431, "epoch": 0.211507244634161, "flos": 16653834707040.0, "grad_norm": 2.203223576967882, "language_loss": 0.7656523, "learning_rate": 3.6643983978739373e-06, "loss": 0.79102623, "num_input_tokens_seen": 37362610, "step": 1759, "time_per_iteration": 2.9460339546203613 }, { "auxiliary_loss_clip": 0.01350598, "auxiliary_loss_mlp": 0.01199274, "balance_loss_clip": 1.01233971, "balance_loss_mlp": 1.00110006, "epoch": 0.2116274875248001, "flos": 20954100787200.0, "grad_norm": 2.032664509776818, "language_loss": 0.82266641, "learning_rate": 3.663966349954596e-06, "loss": 0.84816504, "num_input_tokens_seen": 37382790, "step": 1760, "time_per_iteration": 2.9438648223876953 }, { "auxiliary_loss_clip": 0.01356703, "auxiliary_loss_mlp": 0.01197041, "balance_loss_clip": 1.01027596, "balance_loss_mlp": 1.00020301, "epoch": 0.21174773041543918, "flos": 68196980188800.0, "grad_norm": 0.7859957203541473, "language_loss": 0.59713817, "learning_rate": 3.6635340496128816e-06, "loss": 0.62267566, "num_input_tokens_seen": 37439720, "step": 1761, "time_per_iteration": 3.3251278400421143 }, { "auxiliary_loss_clip": 0.0131624, "auxiliary_loss_mlp": 0.01199112, "balance_loss_clip": 1.01048326, "balance_loss_mlp": 1.00112975, "epoch": 0.2118679733060783, "flos": 20668325814720.0, "grad_norm": 1.6234812030703758, "language_loss": 0.92770171, "learning_rate": 3.6631014969143747e-06, "loss": 0.95285523, "num_input_tokens_seen": 37459410, "step": 1762, "time_per_iteration": 2.9023032188415527 }, { "auxiliary_loss_clip": 0.01362019, "auxiliary_loss_mlp": 0.01200242, "balance_loss_clip": 1.01169181, "balance_loss_mlp": 1.00168681, "epoch": 0.21198821619671737, "flos": 23223442855680.0, "grad_norm": 1.869679565460029, "language_loss": 0.89049762, "learning_rate": 3.662668691924693e-06, "loss": 0.91612029, "num_input_tokens_seen": 37480460, "step": 1763, "time_per_iteration": 2.83566951751709 }, { "auxiliary_loss_clip": 0.01342317, "auxiliary_loss_mlp": 0.01199807, "balance_loss_clip": 1.01222837, "balance_loss_mlp": 1.00125206, "epoch": 0.21210845908735645, "flos": 24498558565920.0, "grad_norm": 3.257460216568912, "language_loss": 0.71744645, "learning_rate": 3.6622356347094927e-06, "loss": 0.74286771, "num_input_tokens_seen": 37502025, "step": 1764, "time_per_iteration": 3.8565237522125244 }, { "auxiliary_loss_clip": 0.01349084, "auxiliary_loss_mlp": 0.01199902, "balance_loss_clip": 1.01173675, "balance_loss_mlp": 1.00134742, "epoch": 0.21222870197799554, "flos": 27089801619840.0, "grad_norm": 2.1068593603110255, "language_loss": 0.78767359, "learning_rate": 3.6618023253344684e-06, "loss": 0.8131634, "num_input_tokens_seen": 37520885, "step": 1765, "time_per_iteration": 2.90939998626709 }, { "auxiliary_loss_clip": 0.01375356, "auxiliary_loss_mlp": 0.01199768, "balance_loss_clip": 1.01238453, "balance_loss_mlp": 1.00140333, "epoch": 0.21234894486863465, "flos": 16873859839680.0, "grad_norm": 1.6737487994574458, "language_loss": 0.83515793, "learning_rate": 3.6613687638653527e-06, "loss": 0.86090916, "num_input_tokens_seen": 37539055, "step": 1766, "time_per_iteration": 4.71606969833374 }, { "auxiliary_loss_clip": 0.01337344, "auxiliary_loss_mlp": 0.01199536, "balance_loss_clip": 1.01089525, "balance_loss_mlp": 1.00117183, "epoch": 0.21246918775927373, "flos": 23474959846560.0, "grad_norm": 2.4657464097234043, "language_loss": 0.78072, "learning_rate": 3.660934950367916e-06, "loss": 0.80608881, "num_input_tokens_seen": 37558300, "step": 1767, "time_per_iteration": 4.426951169967651 }, { "auxiliary_loss_clip": 0.01366095, "auxiliary_loss_mlp": 0.01199143, "balance_loss_clip": 1.01164329, "balance_loss_mlp": 1.00096977, "epoch": 0.21258943064991281, "flos": 22382306308800.0, "grad_norm": 1.8765575091934363, "language_loss": 0.83859777, "learning_rate": 3.660500884907968e-06, "loss": 0.86425018, "num_input_tokens_seen": 37579040, "step": 1768, "time_per_iteration": 2.9071688652038574 }, { "auxiliary_loss_clip": 0.01299315, "auxiliary_loss_mlp": 0.01197026, "balance_loss_clip": 1.01046598, "balance_loss_mlp": 1.00018775, "epoch": 0.21270967354055192, "flos": 59440196933280.0, "grad_norm": 0.8343816949405906, "language_loss": 0.6006521, "learning_rate": 3.660066567551356e-06, "loss": 0.6256156, "num_input_tokens_seen": 37639185, "step": 1769, "time_per_iteration": 3.3246195316314697 }, { "auxiliary_loss_clip": 0.0136576, "auxiliary_loss_mlp": 0.00873244, "balance_loss_clip": 1.01182365, "balance_loss_mlp": 1.00007927, "epoch": 0.212829916431191, "flos": 21544043656320.0, "grad_norm": 5.877526046613948, "language_loss": 0.84500414, "learning_rate": 3.6596319983639657e-06, "loss": 0.86739421, "num_input_tokens_seen": 37657765, "step": 1770, "time_per_iteration": 2.9427218437194824 }, { "auxiliary_loss_clip": 0.01325015, "auxiliary_loss_mlp": 0.00873318, "balance_loss_clip": 1.0110333, "balance_loss_mlp": 1.00010884, "epoch": 0.2129501593218301, "flos": 28987753080960.0, "grad_norm": 1.5895854931586033, "language_loss": 0.86382461, "learning_rate": 3.6591971774117214e-06, "loss": 0.88580793, "num_input_tokens_seen": 37680740, "step": 1771, "time_per_iteration": 2.96105694770813 }, { "auxiliary_loss_clip": 0.01373807, "auxiliary_loss_mlp": 0.01199666, "balance_loss_clip": 1.0121274, "balance_loss_mlp": 1.00168347, "epoch": 0.2130704022124692, "flos": 18807003298080.0, "grad_norm": 1.9272373118849009, "language_loss": 0.80429244, "learning_rate": 3.6587621047605833e-06, "loss": 0.83002722, "num_input_tokens_seen": 37697910, "step": 1772, "time_per_iteration": 2.766298532485962 }, { "auxiliary_loss_clip": 0.01364518, "auxiliary_loss_mlp": 0.01198411, "balance_loss_clip": 1.011235, "balance_loss_mlp": 1.00080967, "epoch": 0.21319064510310828, "flos": 13918159448640.0, "grad_norm": 2.0115485035112775, "language_loss": 0.86891502, "learning_rate": 3.6583267804765542e-06, "loss": 0.89454424, "num_input_tokens_seen": 37712245, "step": 1773, "time_per_iteration": 2.7973177433013916 }, { "auxiliary_loss_clip": 0.01360084, "auxiliary_loss_mlp": 0.01199206, "balance_loss_clip": 1.01147127, "balance_loss_mlp": 1.00103283, "epoch": 0.21331088799374737, "flos": 20959704881280.0, "grad_norm": 1.7524505278500786, "language_loss": 0.85723329, "learning_rate": 3.6578912046256702e-06, "loss": 0.88282621, "num_input_tokens_seen": 37730765, "step": 1774, "time_per_iteration": 2.7513797283172607 }, { "auxiliary_loss_clip": 0.0133794, "auxiliary_loss_mlp": 0.011994, "balance_loss_clip": 1.01110363, "balance_loss_mlp": 1.00103593, "epoch": 0.21343113088438645, "flos": 18624648896640.0, "grad_norm": 1.9851499098221546, "language_loss": 0.76232767, "learning_rate": 3.6574553772740083e-06, "loss": 0.78770107, "num_input_tokens_seen": 37748695, "step": 1775, "time_per_iteration": 2.9522454738616943 }, { "auxiliary_loss_clip": 0.01345894, "auxiliary_loss_mlp": 0.01195371, "balance_loss_clip": 1.01297045, "balance_loss_mlp": 1.00005841, "epoch": 0.21355137377502556, "flos": 67413164626080.0, "grad_norm": 0.8555650514587171, "language_loss": 0.61827725, "learning_rate": 3.657019298487684e-06, "loss": 0.64368993, "num_input_tokens_seen": 37813705, "step": 1776, "time_per_iteration": 3.464115619659424 }, { "auxiliary_loss_clip": 0.01375158, "auxiliary_loss_mlp": 0.00873298, "balance_loss_clip": 1.01231194, "balance_loss_mlp": 1.00010276, "epoch": 0.21367161666566464, "flos": 34532110020960.0, "grad_norm": 1.5882681336478424, "language_loss": 0.83663201, "learning_rate": 3.6565829683328495e-06, "loss": 0.85911655, "num_input_tokens_seen": 37836330, "step": 1777, "time_per_iteration": 2.9557814598083496 }, { "auxiliary_loss_clip": 0.01360657, "auxiliary_loss_mlp": 0.01199479, "balance_loss_clip": 1.01177394, "balance_loss_mlp": 1.00149655, "epoch": 0.21379185955630373, "flos": 18989357699520.0, "grad_norm": 1.9663571214210325, "language_loss": 0.86004323, "learning_rate": 3.6561463868756965e-06, "loss": 0.88564461, "num_input_tokens_seen": 37855030, "step": 1778, "time_per_iteration": 2.869817018508911 }, { "auxiliary_loss_clip": 0.01362935, "auxiliary_loss_mlp": 0.01199414, "balance_loss_clip": 1.01192379, "balance_loss_mlp": 1.00143147, "epoch": 0.21391210244694284, "flos": 28218509323200.0, "grad_norm": 1.4361474555946068, "language_loss": 0.78212297, "learning_rate": 3.655709554182452e-06, "loss": 0.80774641, "num_input_tokens_seen": 37875370, "step": 1779, "time_per_iteration": 2.8652074337005615 }, { "auxiliary_loss_clip": 0.01373775, "auxiliary_loss_mlp": 0.01199676, "balance_loss_clip": 1.01203942, "balance_loss_mlp": 1.00150228, "epoch": 0.21403234533758192, "flos": 17455073254560.0, "grad_norm": 1.9116413086578556, "language_loss": 0.84366357, "learning_rate": 3.6552724703193855e-06, "loss": 0.86939806, "num_input_tokens_seen": 37892560, "step": 1780, "time_per_iteration": 2.8541388511657715 }, { "auxiliary_loss_clip": 0.01305939, "auxiliary_loss_mlp": 0.01195389, "balance_loss_clip": 1.00964618, "balance_loss_mlp": 1.00007617, "epoch": 0.214152588228221, "flos": 51637634265600.0, "grad_norm": 0.7841667745294428, "language_loss": 0.55939013, "learning_rate": 3.654835135352801e-06, "loss": 0.5844034, "num_input_tokens_seen": 37947370, "step": 1781, "time_per_iteration": 3.338397979736328 }, { "auxiliary_loss_clip": 0.01327486, "auxiliary_loss_mlp": 0.01198807, "balance_loss_clip": 1.01059031, "balance_loss_mlp": 1.00101483, "epoch": 0.21427283111886009, "flos": 19496163667680.0, "grad_norm": 1.7598284627958691, "language_loss": 0.87583542, "learning_rate": 3.654397549349043e-06, "loss": 0.90109837, "num_input_tokens_seen": 37964745, "step": 1782, "time_per_iteration": 2.9679744243621826 }, { "auxiliary_loss_clip": 0.01336615, "auxiliary_loss_mlp": 0.0119929, "balance_loss_clip": 1.0107398, "balance_loss_mlp": 1.00130677, "epoch": 0.2143930740094992, "flos": 20084813284320.0, "grad_norm": 2.002423502295937, "language_loss": 0.75396883, "learning_rate": 3.653959712374491e-06, "loss": 0.77932799, "num_input_tokens_seen": 37982850, "step": 1783, "time_per_iteration": 2.993163824081421 }, { "auxiliary_loss_clip": 0.01311081, "auxiliary_loss_mlp": 0.01199093, "balance_loss_clip": 1.01069021, "balance_loss_mlp": 1.00111032, "epoch": 0.21451331690013828, "flos": 21798614160000.0, "grad_norm": 1.6998875757476573, "language_loss": 0.82659096, "learning_rate": 3.6535216244955663e-06, "loss": 0.85169268, "num_input_tokens_seen": 38002745, "step": 1784, "time_per_iteration": 2.9828908443450928 }, { "auxiliary_loss_clip": 0.01338105, "auxiliary_loss_mlp": 0.01199129, "balance_loss_clip": 1.01009727, "balance_loss_mlp": 1.00114655, "epoch": 0.21463355979077736, "flos": 32853896303040.0, "grad_norm": 1.7745331545330485, "language_loss": 0.7110368, "learning_rate": 3.653083285778726e-06, "loss": 0.73640919, "num_input_tokens_seen": 38024115, "step": 1785, "time_per_iteration": 3.040388584136963 }, { "auxiliary_loss_clip": 0.01371752, "auxiliary_loss_mlp": 0.01198971, "balance_loss_clip": 1.0118885, "balance_loss_mlp": 1.00117874, "epoch": 0.21475380268141647, "flos": 21543828114240.0, "grad_norm": 2.1025577801339526, "language_loss": 0.80906284, "learning_rate": 3.6526446962904653e-06, "loss": 0.83477008, "num_input_tokens_seen": 38042830, "step": 1786, "time_per_iteration": 2.8166353702545166 }, { "auxiliary_loss_clip": 0.01359432, "auxiliary_loss_mlp": 0.01199084, "balance_loss_clip": 1.01161742, "balance_loss_mlp": 1.00129223, "epoch": 0.21487404557205556, "flos": 32159095915680.0, "grad_norm": 1.5202684182315422, "language_loss": 0.74419582, "learning_rate": 3.652205856097318e-06, "loss": 0.76978099, "num_input_tokens_seen": 38066015, "step": 1787, "time_per_iteration": 2.960970878601074 }, { "auxiliary_loss_clip": 0.01336537, "auxiliary_loss_mlp": 0.00873254, "balance_loss_clip": 1.01023984, "balance_loss_mlp": 1.00013113, "epoch": 0.21499428846269464, "flos": 12673098649440.0, "grad_norm": 1.7962587853339553, "language_loss": 0.79074919, "learning_rate": 3.651766765265856e-06, "loss": 0.81284714, "num_input_tokens_seen": 38083025, "step": 1788, "time_per_iteration": 2.9510316848754883 }, { "auxiliary_loss_clip": 0.01349509, "auxiliary_loss_mlp": 0.01198683, "balance_loss_clip": 1.01188707, "balance_loss_mlp": 1.00127268, "epoch": 0.21511453135333372, "flos": 23471583020640.0, "grad_norm": 2.18716918186111, "language_loss": 0.81336325, "learning_rate": 3.65132742386269e-06, "loss": 0.83884513, "num_input_tokens_seen": 38098245, "step": 1789, "time_per_iteration": 2.8517684936523438 }, { "auxiliary_loss_clip": 0.0138621, "auxiliary_loss_mlp": 0.01199183, "balance_loss_clip": 1.01198673, "balance_loss_mlp": 1.00139093, "epoch": 0.21523477424397283, "flos": 26943573231360.0, "grad_norm": 1.8070060318271153, "language_loss": 0.84502482, "learning_rate": 3.6508878319544656e-06, "loss": 0.87087876, "num_input_tokens_seen": 38118460, "step": 1790, "time_per_iteration": 3.9691731929779053 }, { "auxiliary_loss_clip": 0.01351196, "auxiliary_loss_mlp": 0.01199393, "balance_loss_clip": 1.01136041, "balance_loss_mlp": 1.00141072, "epoch": 0.21535501713461191, "flos": 18916171657920.0, "grad_norm": 3.402254276754704, "language_loss": 0.81216973, "learning_rate": 3.65044798960787e-06, "loss": 0.83767563, "num_input_tokens_seen": 38136800, "step": 1791, "time_per_iteration": 2.886789321899414 }, { "auxiliary_loss_clip": 0.01324224, "auxiliary_loss_mlp": 0.01198505, "balance_loss_clip": 1.0110594, "balance_loss_mlp": 1.00109422, "epoch": 0.215475260025251, "flos": 17895123519840.0, "grad_norm": 2.6080913122286002, "language_loss": 0.78077036, "learning_rate": 3.650007896889627e-06, "loss": 0.80599761, "num_input_tokens_seen": 38155380, "step": 1792, "time_per_iteration": 3.7569212913513184 }, { "auxiliary_loss_clip": 0.01385886, "auxiliary_loss_mlp": 0.01198925, "balance_loss_clip": 1.01234627, "balance_loss_mlp": 1.00132394, "epoch": 0.2155955029158901, "flos": 16654301714880.0, "grad_norm": 1.6747912222897114, "language_loss": 0.80501312, "learning_rate": 3.6495675538664974e-06, "loss": 0.83086121, "num_input_tokens_seen": 38174395, "step": 1793, "time_per_iteration": 5.155128002166748 }, { "auxiliary_loss_clip": 0.01360774, "auxiliary_loss_mlp": 0.01198768, "balance_loss_clip": 1.01180577, "balance_loss_mlp": 1.00097632, "epoch": 0.2157157458065292, "flos": 23621224158720.0, "grad_norm": 1.6966431240818727, "language_loss": 0.82505065, "learning_rate": 3.649126960605282e-06, "loss": 0.85064602, "num_input_tokens_seen": 38195380, "step": 1794, "time_per_iteration": 2.9317686557769775 }, { "auxiliary_loss_clip": 0.01335646, "auxiliary_loss_mlp": 0.01199365, "balance_loss_clip": 1.01088846, "balance_loss_mlp": 1.00119185, "epoch": 0.21583598869716827, "flos": 22127089178880.0, "grad_norm": 2.182583589206265, "language_loss": 0.83426464, "learning_rate": 3.6486861171728174e-06, "loss": 0.85961473, "num_input_tokens_seen": 38213775, "step": 1795, "time_per_iteration": 2.8875415325164795 }, { "auxiliary_loss_clip": 0.01340401, "auxiliary_loss_mlp": 0.01199231, "balance_loss_clip": 1.01080716, "balance_loss_mlp": 1.00105762, "epoch": 0.21595623158780738, "flos": 23441240720160.0, "grad_norm": 1.68946835880143, "language_loss": 0.78530574, "learning_rate": 3.6482450236359803e-06, "loss": 0.81070209, "num_input_tokens_seen": 38235630, "step": 1796, "time_per_iteration": 3.0152971744537354 }, { "auxiliary_loss_clip": 0.01361277, "auxiliary_loss_mlp": 0.01198382, "balance_loss_clip": 1.01169443, "balance_loss_mlp": 1.0009712, "epoch": 0.21607647447844647, "flos": 26906513202720.0, "grad_norm": 2.079981183161054, "language_loss": 0.78247416, "learning_rate": 3.647803680061683e-06, "loss": 0.80807072, "num_input_tokens_seen": 38256045, "step": 1797, "time_per_iteration": 2.8716652393341064 }, { "auxiliary_loss_clip": 0.01340134, "auxiliary_loss_mlp": 0.01199101, "balance_loss_clip": 1.01074719, "balance_loss_mlp": 1.00111818, "epoch": 0.21619671736908555, "flos": 14495385335040.0, "grad_norm": 5.240429206423884, "language_loss": 0.74818933, "learning_rate": 3.6473620865168776e-06, "loss": 0.77358168, "num_input_tokens_seen": 38272915, "step": 1798, "time_per_iteration": 2.871581792831421 }, { "auxiliary_loss_clip": 0.01340005, "auxiliary_loss_mlp": 0.01198731, "balance_loss_clip": 1.01067472, "balance_loss_mlp": 1.00112915, "epoch": 0.21631696025972463, "flos": 17931105838080.0, "grad_norm": 2.14373071309421, "language_loss": 0.81549346, "learning_rate": 3.646920243068554e-06, "loss": 0.84088087, "num_input_tokens_seen": 38290810, "step": 1799, "time_per_iteration": 2.9100167751312256 }, { "auxiliary_loss_clip": 0.01348261, "auxiliary_loss_mlp": 0.01199043, "balance_loss_clip": 1.01180696, "balance_loss_mlp": 1.00125098, "epoch": 0.21643720315036374, "flos": 24462396629280.0, "grad_norm": 1.7458335856869394, "language_loss": 0.74852604, "learning_rate": 3.6464781497837384e-06, "loss": 0.77399904, "num_input_tokens_seen": 38312785, "step": 1800, "time_per_iteration": 2.9323716163635254 }, { "auxiliary_loss_clip": 0.01353341, "auxiliary_loss_mlp": 0.0119958, "balance_loss_clip": 1.01118684, "balance_loss_mlp": 1.00121582, "epoch": 0.21655744604100283, "flos": 28474444926720.0, "grad_norm": 1.7286721418860322, "language_loss": 0.72679734, "learning_rate": 3.6460358067294965e-06, "loss": 0.75232661, "num_input_tokens_seen": 38334015, "step": 1801, "time_per_iteration": 2.9299659729003906 }, { "auxiliary_loss_clip": 0.01386523, "auxiliary_loss_mlp": 0.01199633, "balance_loss_clip": 1.01191318, "balance_loss_mlp": 1.00126863, "epoch": 0.2166776889316419, "flos": 20152970010720.0, "grad_norm": 1.9536855083292755, "language_loss": 0.77612489, "learning_rate": 3.645593213972932e-06, "loss": 0.80198646, "num_input_tokens_seen": 38352920, "step": 1802, "time_per_iteration": 2.8420844078063965 }, { "auxiliary_loss_clip": 0.01374516, "auxiliary_loss_mlp": 0.01199109, "balance_loss_clip": 1.01250005, "balance_loss_mlp": 1.00112629, "epoch": 0.21679793182228102, "flos": 15193490700960.0, "grad_norm": 2.473327592916435, "language_loss": 0.79701394, "learning_rate": 3.6451503715811852e-06, "loss": 0.82275015, "num_input_tokens_seen": 38371230, "step": 1803, "time_per_iteration": 2.7964887619018555 }, { "auxiliary_loss_clip": 0.01340213, "auxiliary_loss_mlp": 0.01198486, "balance_loss_clip": 1.01140165, "balance_loss_mlp": 1.00107539, "epoch": 0.2169181747129201, "flos": 17384473717920.0, "grad_norm": 1.852930346929901, "language_loss": 0.80101514, "learning_rate": 3.6447072796214345e-06, "loss": 0.82640213, "num_input_tokens_seen": 38389795, "step": 1804, "time_per_iteration": 2.979881763458252 }, { "auxiliary_loss_clip": 0.01296479, "auxiliary_loss_mlp": 0.01197126, "balance_loss_clip": 1.00774479, "balance_loss_mlp": 1.00028801, "epoch": 0.21703841760355919, "flos": 58760987423040.0, "grad_norm": 0.9163579342435508, "language_loss": 0.63152885, "learning_rate": 3.644263938160898e-06, "loss": 0.65646487, "num_input_tokens_seen": 38445760, "step": 1805, "time_per_iteration": 3.3603780269622803 }, { "auxiliary_loss_clip": 0.01331975, "auxiliary_loss_mlp": 0.01198943, "balance_loss_clip": 1.01063347, "balance_loss_mlp": 1.00115108, "epoch": 0.21715866049419827, "flos": 22418468245440.0, "grad_norm": 1.89916624694571, "language_loss": 0.72468978, "learning_rate": 3.6438203472668293e-06, "loss": 0.74999893, "num_input_tokens_seen": 38465405, "step": 1806, "time_per_iteration": 3.0048019886016846 }, { "auxiliary_loss_clip": 0.01350093, "auxiliary_loss_mlp": 0.01198746, "balance_loss_clip": 1.01149523, "balance_loss_mlp": 1.00095439, "epoch": 0.21727890338483738, "flos": 17237742397920.0, "grad_norm": 1.8472410180117946, "language_loss": 0.82136446, "learning_rate": 3.6433765070065206e-06, "loss": 0.8468529, "num_input_tokens_seen": 38483195, "step": 1807, "time_per_iteration": 2.982538938522339 }, { "auxiliary_loss_clip": 0.01385625, "auxiliary_loss_mlp": 0.01199079, "balance_loss_clip": 1.01220632, "balance_loss_mlp": 1.00109613, "epoch": 0.21739914627547646, "flos": 13434798434400.0, "grad_norm": 3.416954935099118, "language_loss": 0.87515891, "learning_rate": 3.6429324174473025e-06, "loss": 0.90100598, "num_input_tokens_seen": 38496735, "step": 1808, "time_per_iteration": 2.7523903846740723 }, { "auxiliary_loss_clip": 0.01372367, "auxiliary_loss_mlp": 0.01198499, "balance_loss_clip": 1.0115943, "balance_loss_mlp": 1.000898, "epoch": 0.21751938916611555, "flos": 20959525262880.0, "grad_norm": 2.0347158296365975, "language_loss": 0.85054886, "learning_rate": 3.6424880786565425e-06, "loss": 0.87625748, "num_input_tokens_seen": 38512880, "step": 1809, "time_per_iteration": 2.8265390396118164 }, { "auxiliary_loss_clip": 0.01287627, "auxiliary_loss_mlp": 0.01199566, "balance_loss_clip": 1.00953436, "balance_loss_mlp": 1.00120175, "epoch": 0.21763963205675466, "flos": 27599948490240.0, "grad_norm": 2.559232086352782, "language_loss": 0.7978965, "learning_rate": 3.6420434907016482e-06, "loss": 0.82276845, "num_input_tokens_seen": 38532570, "step": 1810, "time_per_iteration": 2.9878664016723633 }, { "auxiliary_loss_clip": 0.01363898, "auxiliary_loss_mlp": 0.01198882, "balance_loss_clip": 1.01190972, "balance_loss_mlp": 1.00109029, "epoch": 0.21775987494739374, "flos": 21430420760160.0, "grad_norm": 1.57530674374096, "language_loss": 0.81199503, "learning_rate": 3.6415986536500606e-06, "loss": 0.83762282, "num_input_tokens_seen": 38550900, "step": 1811, "time_per_iteration": 2.8808836936950684 }, { "auxiliary_loss_clip": 0.0128438, "auxiliary_loss_mlp": 0.011984, "balance_loss_clip": 1.00896418, "balance_loss_mlp": 1.0009892, "epoch": 0.21788011783803282, "flos": 18332982440640.0, "grad_norm": 1.7419563914621425, "language_loss": 0.80723482, "learning_rate": 3.641153567569263e-06, "loss": 0.83206266, "num_input_tokens_seen": 38569215, "step": 1812, "time_per_iteration": 2.9081740379333496 }, { "auxiliary_loss_clip": 0.0136115, "auxiliary_loss_mlp": 0.01197893, "balance_loss_clip": 1.01117325, "balance_loss_mlp": 1.00086379, "epoch": 0.2180003607286719, "flos": 30262761020160.0, "grad_norm": 1.9657860015156383, "language_loss": 0.95588928, "learning_rate": 3.640708232526774e-06, "loss": 0.9814797, "num_input_tokens_seen": 38587870, "step": 1813, "time_per_iteration": 2.9071238040924072 }, { "auxiliary_loss_clip": 0.01300217, "auxiliary_loss_mlp": 0.01199135, "balance_loss_clip": 1.01133299, "balance_loss_mlp": 1.00096154, "epoch": 0.21812060361931102, "flos": 25480283483520.0, "grad_norm": 1.6534149677183456, "language_loss": 0.78521484, "learning_rate": 3.6402626485901504e-06, "loss": 0.81020832, "num_input_tokens_seen": 38606965, "step": 1814, "time_per_iteration": 2.9536988735198975 }, { "auxiliary_loss_clip": 0.01361312, "auxiliary_loss_mlp": 0.01198141, "balance_loss_clip": 1.01256871, "balance_loss_mlp": 1.00092077, "epoch": 0.2182408465099501, "flos": 21908177680320.0, "grad_norm": 2.1324262702242422, "language_loss": 0.78122264, "learning_rate": 3.639816815826988e-06, "loss": 0.80681717, "num_input_tokens_seen": 38626290, "step": 1815, "time_per_iteration": 2.781061887741089 }, { "auxiliary_loss_clip": 0.01339808, "auxiliary_loss_mlp": 0.01198527, "balance_loss_clip": 1.01046419, "balance_loss_mlp": 1.00111604, "epoch": 0.21836108940058918, "flos": 23657350171680.0, "grad_norm": 1.7773143179534905, "language_loss": 0.78038871, "learning_rate": 3.6393707343049176e-06, "loss": 0.80577207, "num_input_tokens_seen": 38646620, "step": 1816, "time_per_iteration": 3.9367356300354004 }, { "auxiliary_loss_clip": 0.01372126, "auxiliary_loss_mlp": 0.01198749, "balance_loss_clip": 1.01154566, "balance_loss_mlp": 1.00114727, "epoch": 0.2184813322912283, "flos": 24681020738400.0, "grad_norm": 2.341540365699128, "language_loss": 0.73565292, "learning_rate": 3.6389244040916104e-06, "loss": 0.76136172, "num_input_tokens_seen": 38665695, "step": 1817, "time_per_iteration": 3.7550196647644043 }, { "auxiliary_loss_clip": 0.01348533, "auxiliary_loss_mlp": 0.00873126, "balance_loss_clip": 1.01203489, "balance_loss_mlp": 1.00015879, "epoch": 0.21860157518186737, "flos": 26574661357920.0, "grad_norm": 2.0090298863687264, "language_loss": 0.79229712, "learning_rate": 3.6384778252547747e-06, "loss": 0.81451374, "num_input_tokens_seen": 38681575, "step": 1818, "time_per_iteration": 2.865180253982544 }, { "auxiliary_loss_clip": 0.01337732, "auxiliary_loss_mlp": 0.00873132, "balance_loss_clip": 1.01177478, "balance_loss_mlp": 1.00012517, "epoch": 0.21872181807250646, "flos": 20886303297600.0, "grad_norm": 2.9541423109978777, "language_loss": 0.77629089, "learning_rate": 3.638030997862155e-06, "loss": 0.79839945, "num_input_tokens_seen": 38700510, "step": 1819, "time_per_iteration": 4.065143585205078 }, { "auxiliary_loss_clip": 0.01322484, "auxiliary_loss_mlp": 0.01195458, "balance_loss_clip": 1.01021814, "balance_loss_mlp": 1.00014591, "epoch": 0.21884206096314554, "flos": 61209486685440.0, "grad_norm": 1.0687504305666446, "language_loss": 0.59397566, "learning_rate": 3.6375839219815356e-06, "loss": 0.61915505, "num_input_tokens_seen": 38758310, "step": 1820, "time_per_iteration": 3.351442337036133 }, { "auxiliary_loss_clip": 0.01386032, "auxiliary_loss_mlp": 0.0119929, "balance_loss_clip": 1.01212955, "balance_loss_mlp": 1.00111604, "epoch": 0.21896230385378465, "flos": 23473846212480.0, "grad_norm": 2.1748302189740985, "language_loss": 0.82840264, "learning_rate": 3.6371365976807375e-06, "loss": 0.85425586, "num_input_tokens_seen": 38778705, "step": 1821, "time_per_iteration": 2.772529125213623 }, { "auxiliary_loss_clip": 0.01306303, "auxiliary_loss_mlp": 0.01199399, "balance_loss_clip": 1.01020098, "balance_loss_mlp": 1.00122583, "epoch": 0.21908254674442373, "flos": 25081927401600.0, "grad_norm": 2.430210657240721, "language_loss": 0.83401752, "learning_rate": 3.6366890250276185e-06, "loss": 0.85907459, "num_input_tokens_seen": 38799660, "step": 1822, "time_per_iteration": 3.003798246383667 }, { "auxiliary_loss_clip": 0.01385489, "auxiliary_loss_mlp": 0.01198926, "balance_loss_clip": 1.01212215, "balance_loss_mlp": 1.00113416, "epoch": 0.21920278963506282, "flos": 23513780135520.0, "grad_norm": 2.003403397402656, "language_loss": 0.89651364, "learning_rate": 3.6362412040900764e-06, "loss": 0.9223578, "num_input_tokens_seen": 38819450, "step": 1823, "time_per_iteration": 2.857584238052368 }, { "auxiliary_loss_clip": 0.01372991, "auxiliary_loss_mlp": 0.01198892, "balance_loss_clip": 1.01224315, "balance_loss_mlp": 1.00090933, "epoch": 0.21932303252570193, "flos": 29242251737280.0, "grad_norm": 2.28671102964875, "language_loss": 0.80145442, "learning_rate": 3.635793134936044e-06, "loss": 0.82717335, "num_input_tokens_seen": 38840460, "step": 1824, "time_per_iteration": 2.930690288543701 }, { "auxiliary_loss_clip": 0.01362957, "auxiliary_loss_mlp": 0.01199082, "balance_loss_clip": 1.01132214, "balance_loss_mlp": 1.00129056, "epoch": 0.219443275416341, "flos": 20806866535680.0, "grad_norm": 1.7189800865806184, "language_loss": 0.73477471, "learning_rate": 3.635344817633494e-06, "loss": 0.76039517, "num_input_tokens_seen": 38859775, "step": 1825, "time_per_iteration": 2.9150569438934326 }, { "auxiliary_loss_clip": 0.01359963, "auxiliary_loss_mlp": 0.01198421, "balance_loss_clip": 1.01115048, "balance_loss_mlp": 1.00082016, "epoch": 0.2195635183069801, "flos": 14501564208000.0, "grad_norm": 2.110099849359205, "language_loss": 0.7555446, "learning_rate": 3.634896252250436e-06, "loss": 0.78112841, "num_input_tokens_seen": 38876540, "step": 1826, "time_per_iteration": 2.795846939086914 }, { "auxiliary_loss_clip": 0.01386512, "auxiliary_loss_mlp": 0.01198966, "balance_loss_clip": 1.01224041, "balance_loss_mlp": 1.00098372, "epoch": 0.2196837611976192, "flos": 24243485130720.0, "grad_norm": 2.001447190717125, "language_loss": 0.82429862, "learning_rate": 3.6344474388549157e-06, "loss": 0.85015339, "num_input_tokens_seen": 38896195, "step": 1827, "time_per_iteration": 2.7645132541656494 }, { "auxiliary_loss_clip": 0.0136105, "auxiliary_loss_mlp": 0.01198885, "balance_loss_clip": 1.01157749, "balance_loss_mlp": 1.00109315, "epoch": 0.2198040040882583, "flos": 18074532179520.0, "grad_norm": 1.9562393688362676, "language_loss": 0.79971552, "learning_rate": 3.6339983775150183e-06, "loss": 0.82531488, "num_input_tokens_seen": 38912755, "step": 1828, "time_per_iteration": 2.7041268348693848 }, { "auxiliary_loss_clip": 0.01362383, "auxiliary_loss_mlp": 0.01198974, "balance_loss_clip": 1.01177454, "balance_loss_mlp": 1.00118208, "epoch": 0.21992424697889737, "flos": 17784194899680.0, "grad_norm": 2.613807914099848, "language_loss": 0.84392583, "learning_rate": 3.6335490682988664e-06, "loss": 0.86953938, "num_input_tokens_seen": 38928365, "step": 1829, "time_per_iteration": 2.8249335289001465 }, { "auxiliary_loss_clip": 0.01300456, "auxiliary_loss_mlp": 0.01199149, "balance_loss_clip": 1.01091015, "balance_loss_mlp": 1.00116622, "epoch": 0.22004448986953645, "flos": 17638505366400.0, "grad_norm": 2.0613696708859446, "language_loss": 0.83060032, "learning_rate": 3.63309951127462e-06, "loss": 0.85559636, "num_input_tokens_seen": 38945275, "step": 1830, "time_per_iteration": 2.9072484970092773 }, { "auxiliary_loss_clip": 0.01314685, "auxiliary_loss_mlp": 0.01199157, "balance_loss_clip": 1.00971365, "balance_loss_mlp": 1.00098348, "epoch": 0.22016473276017556, "flos": 22275544835520.0, "grad_norm": 1.8082211585062076, "language_loss": 0.75037438, "learning_rate": 3.6326497065104757e-06, "loss": 0.77551281, "num_input_tokens_seen": 38965740, "step": 1831, "time_per_iteration": 2.9218599796295166 }, { "auxiliary_loss_clip": 0.01373192, "auxiliary_loss_mlp": 0.01198936, "balance_loss_clip": 1.01203537, "balance_loss_mlp": 1.00114417, "epoch": 0.22028497565081465, "flos": 25556271572160.0, "grad_norm": 3.875162925707595, "language_loss": 0.78044987, "learning_rate": 3.6321996540746697e-06, "loss": 0.80617112, "num_input_tokens_seen": 38984815, "step": 1832, "time_per_iteration": 2.832253932952881 }, { "auxiliary_loss_clip": 0.01322044, "auxiliary_loss_mlp": 0.01198956, "balance_loss_clip": 1.01035357, "balance_loss_mlp": 1.00116372, "epoch": 0.22040521854145373, "flos": 36247347843840.0, "grad_norm": 1.9653375126958421, "language_loss": 0.80576873, "learning_rate": 3.6317493540354733e-06, "loss": 0.83097869, "num_input_tokens_seen": 39008230, "step": 1833, "time_per_iteration": 2.9295384883880615 }, { "auxiliary_loss_clip": 0.01374283, "auxiliary_loss_mlp": 0.01199381, "balance_loss_clip": 1.0123266, "balance_loss_mlp": 1.00139809, "epoch": 0.22052546143209284, "flos": 11838428364960.0, "grad_norm": 1.938564477807765, "language_loss": 0.76709712, "learning_rate": 3.6312988064611976e-06, "loss": 0.79283375, "num_input_tokens_seen": 39026540, "step": 1834, "time_per_iteration": 2.796654224395752 }, { "auxiliary_loss_clip": 0.01336233, "auxiliary_loss_mlp": 0.01198905, "balance_loss_clip": 1.01082969, "balance_loss_mlp": 1.00092185, "epoch": 0.22064570432273192, "flos": 24209263072800.0, "grad_norm": 1.6950285937927312, "language_loss": 0.81417227, "learning_rate": 3.6308480114201896e-06, "loss": 0.83952367, "num_input_tokens_seen": 39048460, "step": 1835, "time_per_iteration": 2.866487741470337 }, { "auxiliary_loss_clip": 0.01386112, "auxiliary_loss_mlp": 0.01198803, "balance_loss_clip": 1.01276922, "balance_loss_mlp": 1.00120115, "epoch": 0.220765947213371, "flos": 17931357303840.0, "grad_norm": 1.7736300378072938, "language_loss": 0.76604712, "learning_rate": 3.630396968980835e-06, "loss": 0.79189628, "num_input_tokens_seen": 39066335, "step": 1836, "time_per_iteration": 2.796396017074585 }, { "auxiliary_loss_clip": 0.01348536, "auxiliary_loss_mlp": 0.01198869, "balance_loss_clip": 1.01128495, "balance_loss_mlp": 1.00088668, "epoch": 0.2208861901040101, "flos": 26757051683040.0, "grad_norm": 2.19929008925509, "language_loss": 0.83636063, "learning_rate": 3.6299456792115575e-06, "loss": 0.8618347, "num_input_tokens_seen": 39087590, "step": 1837, "time_per_iteration": 3.028726100921631 }, { "auxiliary_loss_clip": 0.01287817, "auxiliary_loss_mlp": 0.011983, "balance_loss_clip": 1.01052761, "balance_loss_mlp": 1.0008893, "epoch": 0.2210064329946492, "flos": 17817985873440.0, "grad_norm": 1.8330563332651815, "language_loss": 0.8092134, "learning_rate": 3.629494142180815e-06, "loss": 0.83407462, "num_input_tokens_seen": 39106335, "step": 1838, "time_per_iteration": 3.042517900466919 }, { "auxiliary_loss_clip": 0.01385416, "auxiliary_loss_mlp": 0.01198347, "balance_loss_clip": 1.01259136, "balance_loss_mlp": 1.00112724, "epoch": 0.22112667588528828, "flos": 17967411469440.0, "grad_norm": 2.3574266173910052, "language_loss": 0.85328829, "learning_rate": 3.6290423579571075e-06, "loss": 0.87912595, "num_input_tokens_seen": 39122875, "step": 1839, "time_per_iteration": 2.8106319904327393 }, { "auxiliary_loss_clip": 0.01360628, "auxiliary_loss_mlp": 0.0119891, "balance_loss_clip": 1.012393, "balance_loss_mlp": 1.00111794, "epoch": 0.22124691877592736, "flos": 18369216224640.0, "grad_norm": 1.6745672287964077, "language_loss": 0.80263209, "learning_rate": 3.6285903266089694e-06, "loss": 0.82822752, "num_input_tokens_seen": 39142150, "step": 1840, "time_per_iteration": 2.722651481628418 }, { "auxiliary_loss_clip": 0.01339845, "auxiliary_loss_mlp": 0.0119898, "balance_loss_clip": 1.01053154, "balance_loss_mlp": 1.00137854, "epoch": 0.22136716166656648, "flos": 20813296874400.0, "grad_norm": 1.8226275912121668, "language_loss": 0.77118993, "learning_rate": 3.628138048204974e-06, "loss": 0.79657817, "num_input_tokens_seen": 39162835, "step": 1841, "time_per_iteration": 2.819061040878296 }, { "auxiliary_loss_clip": 0.01300849, "auxiliary_loss_mlp": 0.0119888, "balance_loss_clip": 1.00977409, "balance_loss_mlp": 1.00089729, "epoch": 0.22148740455720556, "flos": 17675709089760.0, "grad_norm": 1.6996385457460115, "language_loss": 0.76576126, "learning_rate": 3.6276855228137304e-06, "loss": 0.79075849, "num_input_tokens_seen": 39181040, "step": 1842, "time_per_iteration": 2.8279099464416504 }, { "auxiliary_loss_clip": 0.01385618, "auxiliary_loss_mlp": 0.00873177, "balance_loss_clip": 1.01204586, "balance_loss_mlp": 1.00011992, "epoch": 0.22160764744784464, "flos": 21726721370880.0, "grad_norm": 2.131999830367179, "language_loss": 0.82036859, "learning_rate": 3.6272327505038874e-06, "loss": 0.84295654, "num_input_tokens_seen": 39197505, "step": 1843, "time_per_iteration": 4.710585594177246 }, { "auxiliary_loss_clip": 0.0131239, "auxiliary_loss_mlp": 0.01198245, "balance_loss_clip": 1.00995576, "balance_loss_mlp": 1.00102544, "epoch": 0.22172789033848372, "flos": 23764722347520.0, "grad_norm": 1.908269305529856, "language_loss": 0.78147143, "learning_rate": 3.626779731344131e-06, "loss": 0.8065778, "num_input_tokens_seen": 39217295, "step": 1844, "time_per_iteration": 2.8346917629241943 }, { "auxiliary_loss_clip": 0.01384537, "auxiliary_loss_mlp": 0.01198867, "balance_loss_clip": 1.01197314, "balance_loss_mlp": 1.00107503, "epoch": 0.22184813322912283, "flos": 16982309725920.0, "grad_norm": 2.1318510084432427, "language_loss": 0.85259104, "learning_rate": 3.6263264654031814e-06, "loss": 0.87842506, "num_input_tokens_seen": 39234195, "step": 1845, "time_per_iteration": 3.7283883094787598 }, { "auxiliary_loss_clip": 0.01308085, "auxiliary_loss_mlp": 0.01195318, "balance_loss_clip": 1.0084157, "balance_loss_mlp": 1.00000501, "epoch": 0.22196837611976192, "flos": 61823772600480.0, "grad_norm": 0.6942594594939132, "language_loss": 0.59177452, "learning_rate": 3.6258729527498008e-06, "loss": 0.61680853, "num_input_tokens_seen": 39295040, "step": 1846, "time_per_iteration": 4.365764617919922 }, { "auxiliary_loss_clip": 0.01346923, "auxiliary_loss_mlp": 0.01198736, "balance_loss_clip": 1.01105833, "balance_loss_mlp": 1.00113511, "epoch": 0.222088619010401, "flos": 25558031832480.0, "grad_norm": 2.7326887795986305, "language_loss": 0.65145683, "learning_rate": 3.6254191934527854e-06, "loss": 0.67691338, "num_input_tokens_seen": 39314395, "step": 1847, "time_per_iteration": 2.9073843955993652 }, { "auxiliary_loss_clip": 0.01313772, "auxiliary_loss_mlp": 0.01199283, "balance_loss_clip": 1.01115394, "balance_loss_mlp": 1.00110996, "epoch": 0.2222088619010401, "flos": 19318623039360.0, "grad_norm": 2.4042760228687303, "language_loss": 0.64445978, "learning_rate": 3.6249651875809715e-06, "loss": 0.66959035, "num_input_tokens_seen": 39334275, "step": 1848, "time_per_iteration": 2.8351502418518066 }, { "auxiliary_loss_clip": 0.01334316, "auxiliary_loss_mlp": 0.01198714, "balance_loss_clip": 1.01140296, "balance_loss_mlp": 1.00111234, "epoch": 0.2223291047916792, "flos": 19099352304000.0, "grad_norm": 1.8413771055528176, "language_loss": 0.89513576, "learning_rate": 3.62451093520323e-06, "loss": 0.92046607, "num_input_tokens_seen": 39352180, "step": 1849, "time_per_iteration": 2.87906551361084 }, { "auxiliary_loss_clip": 0.01309995, "auxiliary_loss_mlp": 0.01198703, "balance_loss_clip": 1.01078582, "balance_loss_mlp": 1.00091076, "epoch": 0.22244934768231828, "flos": 20850428750400.0, "grad_norm": 2.0493684169784276, "language_loss": 0.90389562, "learning_rate": 3.6240564363884714e-06, "loss": 0.92898256, "num_input_tokens_seen": 39372125, "step": 1850, "time_per_iteration": 2.9555323123931885 }, { "auxiliary_loss_clip": 0.01372509, "auxiliary_loss_mlp": 0.01199083, "balance_loss_clip": 1.01153719, "balance_loss_mlp": 1.00110066, "epoch": 0.2225695905729574, "flos": 15632930263680.0, "grad_norm": 1.8255595976365224, "language_loss": 0.70545983, "learning_rate": 3.623601691205643e-06, "loss": 0.73117578, "num_input_tokens_seen": 39391200, "step": 1851, "time_per_iteration": 2.8033647537231445 }, { "auxiliary_loss_clip": 0.01372878, "auxiliary_loss_mlp": 0.01199233, "balance_loss_clip": 1.01222348, "balance_loss_mlp": 1.00125074, "epoch": 0.22268983346359647, "flos": 25373593857600.0, "grad_norm": 2.1248042440183204, "language_loss": 0.81769276, "learning_rate": 3.623146699723729e-06, "loss": 0.84341389, "num_input_tokens_seen": 39410660, "step": 1852, "time_per_iteration": 2.86008620262146 }, { "auxiliary_loss_clip": 0.01337278, "auxiliary_loss_mlp": 0.01199274, "balance_loss_clip": 1.01102304, "balance_loss_mlp": 1.00129104, "epoch": 0.22281007635423555, "flos": 13261460876640.0, "grad_norm": 1.9346555868434694, "language_loss": 0.77676839, "learning_rate": 3.6226914620117507e-06, "loss": 0.80213392, "num_input_tokens_seen": 39429280, "step": 1853, "time_per_iteration": 2.9578654766082764 }, { "auxiliary_loss_clip": 0.01349008, "auxiliary_loss_mlp": 0.01198204, "balance_loss_clip": 1.01175094, "balance_loss_mlp": 1.00098395, "epoch": 0.22293031924487464, "flos": 15340545334080.0, "grad_norm": 2.358317622465624, "language_loss": 0.81121117, "learning_rate": 3.622235978138768e-06, "loss": 0.83668327, "num_input_tokens_seen": 39446905, "step": 1854, "time_per_iteration": 2.799251079559326 }, { "auxiliary_loss_clip": 0.01359562, "auxiliary_loss_mlp": 0.01199007, "balance_loss_clip": 1.01163661, "balance_loss_mlp": 1.00121462, "epoch": 0.22305056213551375, "flos": 22564660710240.0, "grad_norm": 1.9432441516461534, "language_loss": 0.81354219, "learning_rate": 3.621780248173877e-06, "loss": 0.83912778, "num_input_tokens_seen": 39465105, "step": 1855, "time_per_iteration": 2.811070203781128 }, { "auxiliary_loss_clip": 0.01352826, "auxiliary_loss_mlp": 0.01195413, "balance_loss_clip": 1.01033115, "balance_loss_mlp": 1.00010002, "epoch": 0.22317080502615283, "flos": 64880450752320.0, "grad_norm": 0.9531739434374394, "language_loss": 0.60979825, "learning_rate": 3.6213242721862125e-06, "loss": 0.63528061, "num_input_tokens_seen": 39523560, "step": 1856, "time_per_iteration": 3.3147661685943604 }, { "auxiliary_loss_clip": 0.01351959, "auxiliary_loss_mlp": 0.01198574, "balance_loss_clip": 1.01172948, "balance_loss_mlp": 1.00116348, "epoch": 0.2232910479167919, "flos": 25776009315360.0, "grad_norm": 1.4908417382837726, "language_loss": 0.75141442, "learning_rate": 3.620868050244945e-06, "loss": 0.77691978, "num_input_tokens_seen": 39544040, "step": 1857, "time_per_iteration": 2.8792951107025146 }, { "auxiliary_loss_clip": 0.01352976, "auxiliary_loss_mlp": 0.01199617, "balance_loss_clip": 1.01147401, "balance_loss_mlp": 1.00144386, "epoch": 0.22341129080743102, "flos": 23251809353760.0, "grad_norm": 1.982404531465199, "language_loss": 0.77773297, "learning_rate": 3.6204115824192817e-06, "loss": 0.8032589, "num_input_tokens_seen": 39561515, "step": 1858, "time_per_iteration": 2.8627769947052 }, { "auxiliary_loss_clip": 0.01360672, "auxiliary_loss_mlp": 0.01198822, "balance_loss_clip": 1.0120815, "balance_loss_mlp": 1.00102973, "epoch": 0.2235315336980701, "flos": 21214562774400.0, "grad_norm": 2.379299093989626, "language_loss": 0.77311081, "learning_rate": 3.619954868778471e-06, "loss": 0.79870582, "num_input_tokens_seen": 39578210, "step": 1859, "time_per_iteration": 2.746286392211914 }, { "auxiliary_loss_clip": 0.01360451, "auxiliary_loss_mlp": 0.01198752, "balance_loss_clip": 1.01236713, "balance_loss_mlp": 1.00115097, "epoch": 0.2236517765887092, "flos": 19901955951360.0, "grad_norm": 1.806560635160594, "language_loss": 0.82909614, "learning_rate": 3.6194979093917944e-06, "loss": 0.85468817, "num_input_tokens_seen": 39597625, "step": 1860, "time_per_iteration": 2.809293746948242 }, { "auxiliary_loss_clip": 0.01347358, "auxiliary_loss_mlp": 0.01198152, "balance_loss_clip": 1.01093268, "balance_loss_mlp": 1.00093246, "epoch": 0.22377201947934827, "flos": 23214857096160.0, "grad_norm": 1.8408497695592396, "language_loss": 0.86786312, "learning_rate": 3.6190407043285724e-06, "loss": 0.89331818, "num_input_tokens_seen": 39615360, "step": 1861, "time_per_iteration": 2.7195935249328613 }, { "auxiliary_loss_clip": 0.01384972, "auxiliary_loss_mlp": 0.01199177, "balance_loss_clip": 1.01215744, "balance_loss_mlp": 1.00119448, "epoch": 0.22389226236998738, "flos": 26794255406400.0, "grad_norm": 2.161958162605114, "language_loss": 0.76068723, "learning_rate": 3.618583253658163e-06, "loss": 0.78652871, "num_input_tokens_seen": 39635460, "step": 1862, "time_per_iteration": 2.7098846435546875 }, { "auxiliary_loss_clip": 0.01308871, "auxiliary_loss_mlp": 0.00873109, "balance_loss_clip": 1.01154995, "balance_loss_mlp": 1.00014448, "epoch": 0.22401250526062647, "flos": 24170370936480.0, "grad_norm": 1.9951898798257008, "language_loss": 0.86655533, "learning_rate": 3.618125557449961e-06, "loss": 0.88837516, "num_input_tokens_seen": 39653515, "step": 1863, "time_per_iteration": 2.884664535522461 }, { "auxiliary_loss_clip": 0.01359386, "auxiliary_loss_mlp": 0.01199084, "balance_loss_clip": 1.01168656, "balance_loss_mlp": 1.0011009, "epoch": 0.22413274815126555, "flos": 16759769935680.0, "grad_norm": 1.911582150329858, "language_loss": 0.83317327, "learning_rate": 3.6176676157733983e-06, "loss": 0.85875797, "num_input_tokens_seen": 39668525, "step": 1864, "time_per_iteration": 2.7154669761657715 }, { "auxiliary_loss_clip": 0.01335543, "auxiliary_loss_mlp": 0.01198616, "balance_loss_clip": 1.01124918, "balance_loss_mlp": 1.00101423, "epoch": 0.22425299104190466, "flos": 21360216384000.0, "grad_norm": 2.0119159829457547, "language_loss": 0.76076508, "learning_rate": 3.6172094286979443e-06, "loss": 0.78610671, "num_input_tokens_seen": 39685895, "step": 1865, "time_per_iteration": 2.9105026721954346 }, { "auxiliary_loss_clip": 0.01350845, "auxiliary_loss_mlp": 0.01198, "balance_loss_clip": 1.01079988, "balance_loss_mlp": 1.00078011, "epoch": 0.22437323393254374, "flos": 32165562178080.0, "grad_norm": 1.4154847607309915, "language_loss": 0.81547803, "learning_rate": 3.6167509962931064e-06, "loss": 0.84096646, "num_input_tokens_seen": 39711595, "step": 1866, "time_per_iteration": 3.0230486392974854 }, { "auxiliary_loss_clip": 0.01302049, "auxiliary_loss_mlp": 0.01199025, "balance_loss_clip": 1.00919688, "balance_loss_mlp": 1.00123334, "epoch": 0.22449347682318282, "flos": 18002819008800.0, "grad_norm": 16.46218998984296, "language_loss": 0.76723051, "learning_rate": 3.6162923186284276e-06, "loss": 0.79224128, "num_input_tokens_seen": 39727555, "step": 1867, "time_per_iteration": 2.9680306911468506 }, { "auxiliary_loss_clip": 0.01359939, "auxiliary_loss_mlp": 0.01199408, "balance_loss_clip": 1.01188159, "balance_loss_mlp": 1.0014255, "epoch": 0.2246137197138219, "flos": 18697296083040.0, "grad_norm": 2.0628919849505563, "language_loss": 0.8594141, "learning_rate": 3.6158333957734888e-06, "loss": 0.88500762, "num_input_tokens_seen": 39746145, "step": 1868, "time_per_iteration": 2.759645700454712 }, { "auxiliary_loss_clip": 0.01337343, "auxiliary_loss_mlp": 0.01198228, "balance_loss_clip": 1.01120996, "balance_loss_mlp": 1.00119877, "epoch": 0.22473396260446102, "flos": 15590661301440.0, "grad_norm": 1.9930732035890173, "language_loss": 0.82981634, "learning_rate": 3.6153742277979088e-06, "loss": 0.85517204, "num_input_tokens_seen": 39763575, "step": 1869, "time_per_iteration": 3.7944083213806152 }, { "auxiliary_loss_clip": 0.01348132, "auxiliary_loss_mlp": 0.01198562, "balance_loss_clip": 1.01068926, "balance_loss_mlp": 1.00096083, "epoch": 0.2248542054951001, "flos": 14465510042400.0, "grad_norm": 2.115534861812456, "language_loss": 0.7823205, "learning_rate": 3.6149148147713434e-06, "loss": 0.80778742, "num_input_tokens_seen": 39781810, "step": 1870, "time_per_iteration": 3.6787030696868896 }, { "auxiliary_loss_clip": 0.01369174, "auxiliary_loss_mlp": 0.01199057, "balance_loss_clip": 1.01205683, "balance_loss_mlp": 1.00164664, "epoch": 0.22497444838573918, "flos": 19243892279520.0, "grad_norm": 1.7375744599537999, "language_loss": 0.86370146, "learning_rate": 3.614455156763484e-06, "loss": 0.88938379, "num_input_tokens_seen": 39800115, "step": 1871, "time_per_iteration": 3.7007007598876953 }, { "auxiliary_loss_clip": 0.01324448, "auxiliary_loss_mlp": 0.01198667, "balance_loss_clip": 1.01063919, "balance_loss_mlp": 1.00106597, "epoch": 0.2250946912763783, "flos": 16910309165760.0, "grad_norm": 2.104657367898188, "language_loss": 0.71426868, "learning_rate": 3.613995253844061e-06, "loss": 0.73949981, "num_input_tokens_seen": 39817795, "step": 1872, "time_per_iteration": 4.379636764526367 }, { "auxiliary_loss_clip": 0.01359143, "auxiliary_loss_mlp": 0.01197868, "balance_loss_clip": 1.01180089, "balance_loss_mlp": 1.0006479, "epoch": 0.22521493416701738, "flos": 24681379975200.0, "grad_norm": 1.7431992662540976, "language_loss": 0.80936891, "learning_rate": 3.6135351060828414e-06, "loss": 0.834939, "num_input_tokens_seen": 39838270, "step": 1873, "time_per_iteration": 2.834629774093628 }, { "auxiliary_loss_clip": 0.01385475, "auxiliary_loss_mlp": 0.0119886, "balance_loss_clip": 1.01233029, "balance_loss_mlp": 1.00106764, "epoch": 0.22533517705765646, "flos": 17821973401920.0, "grad_norm": 1.8269501083463913, "language_loss": 0.6959641, "learning_rate": 3.6130747135496285e-06, "loss": 0.72180742, "num_input_tokens_seen": 39857270, "step": 1874, "time_per_iteration": 2.7586116790771484 }, { "auxiliary_loss_clip": 0.01384449, "auxiliary_loss_mlp": 0.0119878, "balance_loss_clip": 1.01212561, "balance_loss_mlp": 1.00079703, "epoch": 0.22545541994829554, "flos": 33691404558240.0, "grad_norm": 1.795385359862803, "language_loss": 0.65728301, "learning_rate": 3.6126140763142646e-06, "loss": 0.68311524, "num_input_tokens_seen": 39882300, "step": 1875, "time_per_iteration": 2.822455883026123 }, { "auxiliary_loss_clip": 0.01384647, "auxiliary_loss_mlp": 0.01198969, "balance_loss_clip": 1.01190877, "balance_loss_mlp": 1.00136733, "epoch": 0.22557566283893465, "flos": 19171604329920.0, "grad_norm": 2.3677627797989658, "language_loss": 0.85940742, "learning_rate": 3.6121531944466275e-06, "loss": 0.88524354, "num_input_tokens_seen": 39899625, "step": 1876, "time_per_iteration": 2.775515079498291 }, { "auxiliary_loss_clip": 0.01358653, "auxiliary_loss_mlp": 0.01197871, "balance_loss_clip": 1.01112509, "balance_loss_mlp": 1.00084233, "epoch": 0.22569590572957374, "flos": 20773291104000.0, "grad_norm": 2.0837128033734404, "language_loss": 0.7809211, "learning_rate": 3.611692068016633e-06, "loss": 0.80648637, "num_input_tokens_seen": 39915955, "step": 1877, "time_per_iteration": 2.768787145614624 }, { "auxiliary_loss_clip": 0.01339684, "auxiliary_loss_mlp": 0.01199162, "balance_loss_clip": 1.01124442, "balance_loss_mlp": 1.00117898, "epoch": 0.22581614862021282, "flos": 18442725579360.0, "grad_norm": 2.1779960887051217, "language_loss": 0.75117171, "learning_rate": 3.611230697094233e-06, "loss": 0.77656019, "num_input_tokens_seen": 39932655, "step": 1878, "time_per_iteration": 2.905515193939209 }, { "auxiliary_loss_clip": 0.01349914, "auxiliary_loss_mlp": 0.01198126, "balance_loss_clip": 1.01099253, "balance_loss_mlp": 1.00109649, "epoch": 0.22593639151085193, "flos": 20048399881920.0, "grad_norm": 1.866554628713024, "language_loss": 0.87479913, "learning_rate": 3.6107690817494173e-06, "loss": 0.90027958, "num_input_tokens_seen": 39952875, "step": 1879, "time_per_iteration": 2.9639716148376465 }, { "auxiliary_loss_clip": 0.01311116, "auxiliary_loss_mlp": 0.011986, "balance_loss_clip": 1.01000309, "balance_loss_mlp": 1.00118923, "epoch": 0.226056634401491, "flos": 13115124717120.0, "grad_norm": 3.727894003890252, "language_loss": 0.70752454, "learning_rate": 3.6103072220522117e-06, "loss": 0.73262167, "num_input_tokens_seen": 39968405, "step": 1880, "time_per_iteration": 2.9760043621063232 }, { "auxiliary_loss_clip": 0.01333877, "auxiliary_loss_mlp": 0.01198736, "balance_loss_clip": 1.01097727, "balance_loss_mlp": 1.0009439, "epoch": 0.2261768772921301, "flos": 18988387760160.0, "grad_norm": 1.7732428193129586, "language_loss": 0.92126262, "learning_rate": 3.609845118072682e-06, "loss": 0.94658875, "num_input_tokens_seen": 39987075, "step": 1881, "time_per_iteration": 2.8532521724700928 }, { "auxiliary_loss_clip": 0.01371157, "auxiliary_loss_mlp": 0.00873194, "balance_loss_clip": 1.01184952, "balance_loss_mlp": 1.00025535, "epoch": 0.2262971201827692, "flos": 19974064282560.0, "grad_norm": 1.756744652351244, "language_loss": 0.80012429, "learning_rate": 3.6093827698809276e-06, "loss": 0.82256782, "num_input_tokens_seen": 40006175, "step": 1882, "time_per_iteration": 2.759612798690796 }, { "auxiliary_loss_clip": 0.01371159, "auxiliary_loss_mlp": 0.01198252, "balance_loss_clip": 1.0113095, "balance_loss_mlp": 1.00103235, "epoch": 0.2264173630734083, "flos": 16654553180640.0, "grad_norm": 4.397267898402275, "language_loss": 0.84884453, "learning_rate": 3.6089201775470864e-06, "loss": 0.87453866, "num_input_tokens_seen": 40021630, "step": 1883, "time_per_iteration": 2.7414543628692627 }, { "auxiliary_loss_clip": 0.01321206, "auxiliary_loss_mlp": 0.01198628, "balance_loss_clip": 1.00987101, "balance_loss_mlp": 1.00121725, "epoch": 0.22653760596404737, "flos": 24389821290240.0, "grad_norm": 1.2965404592619785, "language_loss": 0.77727735, "learning_rate": 3.6084573411413334e-06, "loss": 0.80247569, "num_input_tokens_seen": 40041025, "step": 1884, "time_per_iteration": 2.9527037143707275 }, { "auxiliary_loss_clip": 0.01335842, "auxiliary_loss_mlp": 0.01199585, "balance_loss_clip": 1.01135039, "balance_loss_mlp": 1.00122082, "epoch": 0.22665784885468646, "flos": 18332551356480.0, "grad_norm": 2.315528006483346, "language_loss": 0.80853921, "learning_rate": 3.607994260733881e-06, "loss": 0.83389342, "num_input_tokens_seen": 40060265, "step": 1885, "time_per_iteration": 2.8272483348846436 }, { "auxiliary_loss_clip": 0.01371069, "auxiliary_loss_mlp": 0.01198484, "balance_loss_clip": 1.01135564, "balance_loss_mlp": 1.00107336, "epoch": 0.22677809174532557, "flos": 24058113140160.0, "grad_norm": 1.5713182882280448, "language_loss": 0.75116658, "learning_rate": 3.6075309363949776e-06, "loss": 0.77686214, "num_input_tokens_seen": 40079435, "step": 1886, "time_per_iteration": 2.7593674659729004 }, { "auxiliary_loss_clip": 0.01384354, "auxiliary_loss_mlp": 0.01198403, "balance_loss_clip": 1.01194429, "balance_loss_mlp": 1.00099289, "epoch": 0.22689833463596465, "flos": 20374252472160.0, "grad_norm": 1.7385239862352126, "language_loss": 0.81451094, "learning_rate": 3.6070673681949094e-06, "loss": 0.84033853, "num_input_tokens_seen": 40097800, "step": 1887, "time_per_iteration": 2.670877456665039 }, { "auxiliary_loss_clip": 0.01348494, "auxiliary_loss_mlp": 0.00873089, "balance_loss_clip": 1.01126862, "balance_loss_mlp": 1.0002408, "epoch": 0.22701857752660373, "flos": 30120412389120.0, "grad_norm": 1.8035432992148106, "language_loss": 0.81073302, "learning_rate": 3.606603556203999e-06, "loss": 0.8329488, "num_input_tokens_seen": 40122745, "step": 1888, "time_per_iteration": 2.783384323120117 }, { "auxiliary_loss_clip": 0.01371159, "auxiliary_loss_mlp": 0.01198935, "balance_loss_clip": 1.01132441, "balance_loss_mlp": 1.00114286, "epoch": 0.22713882041724284, "flos": 22492193142240.0, "grad_norm": 1.7449688052315016, "language_loss": 0.83776522, "learning_rate": 3.6061395004926066e-06, "loss": 0.86346614, "num_input_tokens_seen": 40141680, "step": 1889, "time_per_iteration": 2.7708182334899902 }, { "auxiliary_loss_clip": 0.01384229, "auxiliary_loss_mlp": 0.01198735, "balance_loss_clip": 1.01178765, "balance_loss_mlp": 1.00113416, "epoch": 0.22725906330788193, "flos": 20521558571040.0, "grad_norm": 2.350049788540077, "language_loss": 0.8459183, "learning_rate": 3.605675201131129e-06, "loss": 0.87174791, "num_input_tokens_seen": 40160140, "step": 1890, "time_per_iteration": 2.7981441020965576 }, { "auxiliary_loss_clip": 0.01372788, "auxiliary_loss_mlp": 0.01198663, "balance_loss_clip": 1.01217365, "balance_loss_mlp": 1.00087142, "epoch": 0.227379306198521, "flos": 18989932478400.0, "grad_norm": 2.0622368714462316, "language_loss": 0.79532778, "learning_rate": 3.60521065819e-06, "loss": 0.8210423, "num_input_tokens_seen": 40177450, "step": 1891, "time_per_iteration": 2.774228572845459 }, { "auxiliary_loss_clip": 0.01351257, "auxiliary_loss_mlp": 0.01198498, "balance_loss_clip": 1.01069427, "balance_loss_mlp": 1.00108707, "epoch": 0.2274995490891601, "flos": 21798362694240.0, "grad_norm": 1.7350648035809386, "language_loss": 0.87485313, "learning_rate": 3.60474587173969e-06, "loss": 0.90035069, "num_input_tokens_seen": 40195935, "step": 1892, "time_per_iteration": 2.8305513858795166 }, { "auxiliary_loss_clip": 0.01359071, "auxiliary_loss_mlp": 0.01198592, "balance_loss_clip": 1.01186383, "balance_loss_mlp": 1.00099039, "epoch": 0.2276197919797992, "flos": 19058663983680.0, "grad_norm": 2.008078699501494, "language_loss": 0.84534311, "learning_rate": 3.6042808418507084e-06, "loss": 0.8709197, "num_input_tokens_seen": 40213620, "step": 1893, "time_per_iteration": 2.7109060287475586 }, { "auxiliary_loss_clip": 0.01359918, "auxiliary_loss_mlp": 0.01198831, "balance_loss_clip": 1.01115811, "balance_loss_mlp": 1.00122952, "epoch": 0.22774003487043828, "flos": 18806787756000.0, "grad_norm": 2.4486668973744843, "language_loss": 0.77351701, "learning_rate": 3.6038155685935976e-06, "loss": 0.79910457, "num_input_tokens_seen": 40230190, "step": 1894, "time_per_iteration": 2.774480104446411 }, { "auxiliary_loss_clip": 0.01358302, "auxiliary_loss_mlp": 0.01198662, "balance_loss_clip": 1.01070619, "balance_loss_mlp": 1.00086975, "epoch": 0.22786027776107737, "flos": 23002555554720.0, "grad_norm": 1.972871961276039, "language_loss": 0.70932209, "learning_rate": 3.6033500520389404e-06, "loss": 0.73489177, "num_input_tokens_seen": 40246860, "step": 1895, "time_per_iteration": 3.672640085220337 }, { "auxiliary_loss_clip": 0.01304498, "auxiliary_loss_mlp": 0.01195406, "balance_loss_clip": 1.00989473, "balance_loss_mlp": 1.0000931, "epoch": 0.22798052065171648, "flos": 66706904584800.0, "grad_norm": 0.7894005630361915, "language_loss": 0.648431, "learning_rate": 3.6028842922573553e-06, "loss": 0.67343003, "num_input_tokens_seen": 40311005, "step": 1896, "time_per_iteration": 4.384974718093872 }, { "auxiliary_loss_clip": 0.01328976, "auxiliary_loss_mlp": 0.00872389, "balance_loss_clip": 1.01044261, "balance_loss_mlp": 0.99994051, "epoch": 0.22810076354235556, "flos": 62080929609120.0, "grad_norm": 0.8567420431754758, "language_loss": 0.62974137, "learning_rate": 3.602418289319497e-06, "loss": 0.65175503, "num_input_tokens_seen": 40369560, "step": 1897, "time_per_iteration": 4.271141529083252 }, { "auxiliary_loss_clip": 0.01324592, "auxiliary_loss_mlp": 0.01198938, "balance_loss_clip": 1.01109827, "balance_loss_mlp": 1.00114608, "epoch": 0.22822100643299464, "flos": 23876369441280.0, "grad_norm": 1.6817031592202047, "language_loss": 0.73158073, "learning_rate": 3.601952043296059e-06, "loss": 0.75681603, "num_input_tokens_seen": 40389555, "step": 1898, "time_per_iteration": 3.931034564971924 }, { "auxiliary_loss_clip": 0.01359605, "auxiliary_loss_mlp": 0.01198631, "balance_loss_clip": 1.0113771, "balance_loss_mlp": 1.00083852, "epoch": 0.22834124932363373, "flos": 20991340434240.0, "grad_norm": 1.90626732926283, "language_loss": 0.80724663, "learning_rate": 3.6014855542577696e-06, "loss": 0.832829, "num_input_tokens_seen": 40406765, "step": 1899, "time_per_iteration": 2.7950918674468994 }, { "auxiliary_loss_clip": 0.01351277, "auxiliary_loss_mlp": 0.01199056, "balance_loss_clip": 1.01146698, "balance_loss_mlp": 1.00145471, "epoch": 0.22846149221427284, "flos": 24901584726240.0, "grad_norm": 2.589271919612104, "language_loss": 0.84401143, "learning_rate": 3.6010188222753943e-06, "loss": 0.8695147, "num_input_tokens_seen": 40427535, "step": 1900, "time_per_iteration": 2.9610915184020996 }, { "auxiliary_loss_clip": 0.01327778, "auxiliary_loss_mlp": 0.01195412, "balance_loss_clip": 1.00817513, "balance_loss_mlp": 1.00009942, "epoch": 0.22858173510491192, "flos": 56132325103680.0, "grad_norm": 0.9107002669335589, "language_loss": 0.64296484, "learning_rate": 3.6005518474197372e-06, "loss": 0.66819674, "num_input_tokens_seen": 40479580, "step": 1901, "time_per_iteration": 3.200284004211426 }, { "auxiliary_loss_clip": 0.0135834, "auxiliary_loss_mlp": 0.01199537, "balance_loss_clip": 1.01120126, "balance_loss_mlp": 1.00136352, "epoch": 0.228701977995551, "flos": 24170837944320.0, "grad_norm": 1.76343683907046, "language_loss": 0.78369236, "learning_rate": 3.6000846297616373e-06, "loss": 0.8092711, "num_input_tokens_seen": 40497880, "step": 1902, "time_per_iteration": 2.778982162475586 }, { "auxiliary_loss_clip": 0.01384087, "auxiliary_loss_mlp": 0.01199632, "balance_loss_clip": 1.01215208, "balance_loss_mlp": 1.00145841, "epoch": 0.22882222088619011, "flos": 21387900332160.0, "grad_norm": 2.215877228085187, "language_loss": 0.72722554, "learning_rate": 3.5996171693719717e-06, "loss": 0.75306278, "num_input_tokens_seen": 40513975, "step": 1903, "time_per_iteration": 2.7377994060516357 }, { "auxiliary_loss_clip": 0.01351129, "auxiliary_loss_mlp": 0.0119549, "balance_loss_clip": 1.00954008, "balance_loss_mlp": 1.00017738, "epoch": 0.2289424637768292, "flos": 64589646464640.0, "grad_norm": 0.8747999735005766, "language_loss": 0.64857924, "learning_rate": 3.5991494663216528e-06, "loss": 0.67404544, "num_input_tokens_seen": 40576960, "step": 1904, "time_per_iteration": 3.4137563705444336 }, { "auxiliary_loss_clip": 0.01384077, "auxiliary_loss_mlp": 0.01198555, "balance_loss_clip": 1.01223588, "balance_loss_mlp": 1.0009532, "epoch": 0.22906270666746828, "flos": 22163430733920.0, "grad_norm": 1.8071582509687858, "language_loss": 0.87463981, "learning_rate": 3.5986815206816314e-06, "loss": 0.90046614, "num_input_tokens_seen": 40595780, "step": 1905, "time_per_iteration": 2.8693370819091797 }, { "auxiliary_loss_clip": 0.01384024, "auxiliary_loss_mlp": 0.01199078, "balance_loss_clip": 1.01167464, "balance_loss_mlp": 1.00128555, "epoch": 0.2291829495581074, "flos": 25772345100000.0, "grad_norm": 1.6921287525985462, "language_loss": 0.75035429, "learning_rate": 3.598213332522895e-06, "loss": 0.77618527, "num_input_tokens_seen": 40615810, "step": 1906, "time_per_iteration": 2.8352537155151367 }, { "auxiliary_loss_clip": 0.01371264, "auxiliary_loss_mlp": 0.01198571, "balance_loss_clip": 1.01195991, "balance_loss_mlp": 1.00135088, "epoch": 0.22930319244874647, "flos": 31172772767040.0, "grad_norm": 1.7712135314335227, "language_loss": 0.77454132, "learning_rate": 3.597744901916466e-06, "loss": 0.80023968, "num_input_tokens_seen": 40637095, "step": 1907, "time_per_iteration": 2.8595240116119385 }, { "auxiliary_loss_clip": 0.01384308, "auxiliary_loss_mlp": 0.0119911, "balance_loss_clip": 1.01143026, "balance_loss_mlp": 1.00112689, "epoch": 0.22942343533938556, "flos": 23254108469280.0, "grad_norm": 1.9118875888530478, "language_loss": 0.77180827, "learning_rate": 3.5972762289334058e-06, "loss": 0.79764241, "num_input_tokens_seen": 40656725, "step": 1908, "time_per_iteration": 2.7538371086120605 }, { "auxiliary_loss_clip": 0.01270334, "auxiliary_loss_mlp": 0.01198767, "balance_loss_clip": 1.00831223, "balance_loss_mlp": 1.00116515, "epoch": 0.22954367823002464, "flos": 14610912186240.0, "grad_norm": 2.3090532797933707, "language_loss": 0.84993035, "learning_rate": 3.5968073136448116e-06, "loss": 0.87462139, "num_input_tokens_seen": 40674745, "step": 1909, "time_per_iteration": 2.8842966556549072 }, { "auxiliary_loss_clip": 0.01370698, "auxiliary_loss_mlp": 0.01198925, "balance_loss_clip": 1.01173496, "balance_loss_mlp": 1.00132418, "epoch": 0.22966392112066375, "flos": 16763613769440.0, "grad_norm": 1.714720353447255, "language_loss": 0.91334099, "learning_rate": 3.596338156121818e-06, "loss": 0.9390372, "num_input_tokens_seen": 40693630, "step": 1910, "time_per_iteration": 2.81977915763855 }, { "auxiliary_loss_clip": 0.01330235, "auxiliary_loss_mlp": 0.01195571, "balance_loss_clip": 1.00858808, "balance_loss_mlp": 1.00025868, "epoch": 0.22978416401130283, "flos": 67474280311200.0, "grad_norm": 0.7428519968285701, "language_loss": 0.59330064, "learning_rate": 3.595868756435595e-06, "loss": 0.61855865, "num_input_tokens_seen": 40761310, "step": 1911, "time_per_iteration": 3.4326791763305664 }, { "auxiliary_loss_clip": 0.01316528, "auxiliary_loss_mlp": 0.011993, "balance_loss_clip": 1.01036024, "balance_loss_mlp": 1.00131774, "epoch": 0.22990440690194192, "flos": 19865147388480.0, "grad_norm": 2.200618704437238, "language_loss": 0.80593455, "learning_rate": 3.5953991146573504e-06, "loss": 0.83109283, "num_input_tokens_seen": 40779955, "step": 1912, "time_per_iteration": 2.8213720321655273 }, { "auxiliary_loss_clip": 0.01371573, "auxiliary_loss_mlp": 0.01199124, "balance_loss_clip": 1.01144075, "balance_loss_mlp": 1.00133181, "epoch": 0.23002464979258103, "flos": 13289252595840.0, "grad_norm": 3.111928504014201, "language_loss": 0.83272517, "learning_rate": 3.5949292308583294e-06, "loss": 0.85843211, "num_input_tokens_seen": 40793200, "step": 1913, "time_per_iteration": 2.766186475753784 }, { "auxiliary_loss_clip": 0.01383523, "auxiliary_loss_mlp": 0.01198957, "balance_loss_clip": 1.01176476, "balance_loss_mlp": 1.00135565, "epoch": 0.2301448926832201, "flos": 22163789970720.0, "grad_norm": 1.8299044530114927, "language_loss": 0.809358, "learning_rate": 3.594459105109811e-06, "loss": 0.83518279, "num_input_tokens_seen": 40812380, "step": 1914, "time_per_iteration": 2.706834316253662 }, { "auxiliary_loss_clip": 0.01367653, "auxiliary_loss_mlp": 0.01198401, "balance_loss_clip": 1.01134372, "balance_loss_mlp": 1.00137186, "epoch": 0.2302651355738592, "flos": 20704487751360.0, "grad_norm": 1.9197306500476135, "language_loss": 0.81587851, "learning_rate": 3.593988737483115e-06, "loss": 0.84153908, "num_input_tokens_seen": 40832320, "step": 1915, "time_per_iteration": 2.7572121620178223 }, { "auxiliary_loss_clip": 0.0134466, "auxiliary_loss_mlp": 0.01198801, "balance_loss_clip": 1.01155043, "balance_loss_mlp": 1.0012002, "epoch": 0.23038537846449827, "flos": 18588953967840.0, "grad_norm": 3.649267124188363, "language_loss": 0.78484762, "learning_rate": 3.5935181280495947e-06, "loss": 0.81028223, "num_input_tokens_seen": 40850900, "step": 1916, "time_per_iteration": 2.742985248565674 }, { "auxiliary_loss_clip": 0.01324605, "auxiliary_loss_mlp": 0.01195484, "balance_loss_clip": 1.00876927, "balance_loss_mlp": 1.00017202, "epoch": 0.23050562135513739, "flos": 64224291035520.0, "grad_norm": 0.8108892993579863, "language_loss": 0.54245222, "learning_rate": 3.5930472768806412e-06, "loss": 0.56765306, "num_input_tokens_seen": 40909570, "step": 1917, "time_per_iteration": 3.2962076663970947 }, { "auxiliary_loss_clip": 0.01383136, "auxiliary_loss_mlp": 0.01199053, "balance_loss_clip": 1.01191616, "balance_loss_mlp": 1.00126076, "epoch": 0.23062586424577647, "flos": 17313407173440.0, "grad_norm": 1.790715350428133, "language_loss": 0.77187097, "learning_rate": 3.5925761840476826e-06, "loss": 0.79769284, "num_input_tokens_seen": 40928180, "step": 1918, "time_per_iteration": 2.7117931842803955 }, { "auxiliary_loss_clip": 0.0133367, "auxiliary_loss_mlp": 0.01198793, "balance_loss_clip": 1.01140702, "balance_loss_mlp": 1.00138223, "epoch": 0.23074610713641555, "flos": 27855991864800.0, "grad_norm": 3.017054891783476, "language_loss": 0.81350255, "learning_rate": 3.592104849622183e-06, "loss": 0.83882719, "num_input_tokens_seen": 40950435, "step": 1919, "time_per_iteration": 2.8919155597686768 }, { "auxiliary_loss_clip": 0.01300961, "auxiliary_loss_mlp": 0.01198775, "balance_loss_clip": 1.00929976, "balance_loss_mlp": 1.000983, "epoch": 0.23086635002705466, "flos": 28841812081920.0, "grad_norm": 2.0115573347393725, "language_loss": 0.73516154, "learning_rate": 3.591633273675644e-06, "loss": 0.7601589, "num_input_tokens_seen": 40972670, "step": 1920, "time_per_iteration": 2.974849224090576 }, { "auxiliary_loss_clip": 0.01291792, "auxiliary_loss_mlp": 0.01195347, "balance_loss_clip": 1.01202965, "balance_loss_mlp": 1.00003457, "epoch": 0.23098659291769374, "flos": 62923705436160.0, "grad_norm": 0.9053339037623304, "language_loss": 0.58173347, "learning_rate": 3.591161456279602e-06, "loss": 0.60660487, "num_input_tokens_seen": 41018215, "step": 1921, "time_per_iteration": 4.322242021560669 }, { "auxiliary_loss_clip": 0.0134864, "auxiliary_loss_mlp": 0.01199116, "balance_loss_clip": 1.01082373, "balance_loss_mlp": 1.0013237, "epoch": 0.23110683580833283, "flos": 23476827877920.0, "grad_norm": 1.5525114740791293, "language_loss": 0.803581, "learning_rate": 3.590689397505633e-06, "loss": 0.82905859, "num_input_tokens_seen": 41039125, "step": 1922, "time_per_iteration": 3.708165407180786 }, { "auxiliary_loss_clip": 0.01382769, "auxiliary_loss_mlp": 0.01198084, "balance_loss_clip": 1.01191533, "balance_loss_mlp": 1.00105441, "epoch": 0.2312270786989719, "flos": 27271078310880.0, "grad_norm": 1.7098302411725657, "language_loss": 0.86926997, "learning_rate": 3.590217097425347e-06, "loss": 0.89507842, "num_input_tokens_seen": 41059025, "step": 1923, "time_per_iteration": 3.760204553604126 }, { "auxiliary_loss_clip": 0.0138385, "auxiliary_loss_mlp": 0.01198682, "balance_loss_clip": 1.01208115, "balance_loss_mlp": 1.00108051, "epoch": 0.23134732158961102, "flos": 13261353105600.0, "grad_norm": 2.0576393545436966, "language_loss": 0.71281993, "learning_rate": 3.589744556110391e-06, "loss": 0.73864526, "num_input_tokens_seen": 41077015, "step": 1924, "time_per_iteration": 3.632999897003174 }, { "auxiliary_loss_clip": 0.01356006, "auxiliary_loss_mlp": 0.01199259, "balance_loss_clip": 1.01119685, "balance_loss_mlp": 1.00127673, "epoch": 0.2314675644802501, "flos": 36977663541600.0, "grad_norm": 1.662870927800945, "language_loss": 0.84195942, "learning_rate": 3.58927177363245e-06, "loss": 0.86751205, "num_input_tokens_seen": 41099840, "step": 1925, "time_per_iteration": 2.8650429248809814 }, { "auxiliary_loss_clip": 0.01335372, "auxiliary_loss_mlp": 0.01199031, "balance_loss_clip": 1.01106775, "balance_loss_mlp": 1.00104785, "epoch": 0.2315878073708892, "flos": 23842219230720.0, "grad_norm": 2.1683337089698407, "language_loss": 0.72625291, "learning_rate": 3.5887987500632447e-06, "loss": 0.75159693, "num_input_tokens_seen": 41117845, "step": 1926, "time_per_iteration": 2.8379077911376953 }, { "auxiliary_loss_clip": 0.01336084, "auxiliary_loss_mlp": 0.01198158, "balance_loss_clip": 1.01085353, "balance_loss_mlp": 1.00093782, "epoch": 0.2317080502615283, "flos": 23039435964960.0, "grad_norm": 1.7282538046769589, "language_loss": 0.84394813, "learning_rate": 3.5883254854745325e-06, "loss": 0.86929047, "num_input_tokens_seen": 41136235, "step": 1927, "time_per_iteration": 2.8076088428497314 }, { "auxiliary_loss_clip": 0.01371013, "auxiliary_loss_mlp": 0.01198289, "balance_loss_clip": 1.01153743, "balance_loss_mlp": 1.00087881, "epoch": 0.23182829315216738, "flos": 11254664368800.0, "grad_norm": 1.8217255986891718, "language_loss": 0.75238621, "learning_rate": 3.587851979938107e-06, "loss": 0.77807921, "num_input_tokens_seen": 41153125, "step": 1928, "time_per_iteration": 2.787806510925293 }, { "auxiliary_loss_clip": 0.01357699, "auxiliary_loss_mlp": 0.01198156, "balance_loss_clip": 1.01076829, "balance_loss_mlp": 1.00093603, "epoch": 0.23194853604280646, "flos": 19828949528160.0, "grad_norm": 1.9897067065409895, "language_loss": 0.7753607, "learning_rate": 3.5873782335257985e-06, "loss": 0.80091929, "num_input_tokens_seen": 41171290, "step": 1929, "time_per_iteration": 2.7870631217956543 }, { "auxiliary_loss_clip": 0.01310786, "auxiliary_loss_mlp": 0.01198607, "balance_loss_clip": 1.00997448, "balance_loss_mlp": 1.00100577, "epoch": 0.23206877893344555, "flos": 15305030023680.0, "grad_norm": 2.099152257309817, "language_loss": 0.78571212, "learning_rate": 3.5869042463094744e-06, "loss": 0.8108061, "num_input_tokens_seen": 41189005, "step": 1930, "time_per_iteration": 2.794628143310547 }, { "auxiliary_loss_clip": 0.01311963, "auxiliary_loss_mlp": 0.01198923, "balance_loss_clip": 1.00987434, "balance_loss_mlp": 1.00132191, "epoch": 0.23218902182408466, "flos": 22711499801280.0, "grad_norm": 1.8979759607942852, "language_loss": 0.77016085, "learning_rate": 3.586430018361038e-06, "loss": 0.79526973, "num_input_tokens_seen": 41208775, "step": 1931, "time_per_iteration": 2.982538938522339 }, { "auxiliary_loss_clip": 0.01359091, "auxiliary_loss_mlp": 0.01198438, "balance_loss_clip": 1.01185584, "balance_loss_mlp": 1.00102758, "epoch": 0.23230926471472374, "flos": 22710745404000.0, "grad_norm": 2.347290831375278, "language_loss": 0.76333529, "learning_rate": 3.5859555497524283e-06, "loss": 0.78891057, "num_input_tokens_seen": 41226010, "step": 1932, "time_per_iteration": 2.9144155979156494 }, { "auxiliary_loss_clip": 0.01361143, "auxiliary_loss_mlp": 0.0119833, "balance_loss_clip": 1.01102924, "balance_loss_mlp": 1.00111055, "epoch": 0.23242950760536282, "flos": 20375509800960.0, "grad_norm": 1.799166055558646, "language_loss": 0.91893774, "learning_rate": 3.5854808405556237e-06, "loss": 0.94453251, "num_input_tokens_seen": 41245245, "step": 1933, "time_per_iteration": 2.7969541549682617 }, { "auxiliary_loss_clip": 0.01323782, "auxiliary_loss_mlp": 0.01198473, "balance_loss_clip": 1.01017153, "balance_loss_mlp": 1.0012536, "epoch": 0.23254975049600193, "flos": 16908333363360.0, "grad_norm": 2.3736153228095747, "language_loss": 0.75660717, "learning_rate": 3.5850058908426355e-06, "loss": 0.78182971, "num_input_tokens_seen": 41263795, "step": 1934, "time_per_iteration": 2.876169443130493 }, { "auxiliary_loss_clip": 0.01358769, "auxiliary_loss_mlp": 0.01198182, "balance_loss_clip": 1.01173019, "balance_loss_mlp": 1.00096178, "epoch": 0.23266999338664102, "flos": 23294832713280.0, "grad_norm": 1.7342323876224375, "language_loss": 0.8563503, "learning_rate": 3.584530700685514e-06, "loss": 0.8819198, "num_input_tokens_seen": 41284055, "step": 1935, "time_per_iteration": 2.8550925254821777 }, { "auxiliary_loss_clip": 0.01332474, "auxiliary_loss_mlp": 0.01198459, "balance_loss_clip": 1.01054335, "balance_loss_mlp": 1.00104856, "epoch": 0.2327902362772801, "flos": 19569996335520.0, "grad_norm": 2.6287608571820047, "language_loss": 0.89182425, "learning_rate": 3.5840552701563448e-06, "loss": 0.91713357, "num_input_tokens_seen": 41300255, "step": 1936, "time_per_iteration": 2.8668999671936035 }, { "auxiliary_loss_clip": 0.01381961, "auxiliary_loss_mlp": 0.01198713, "balance_loss_clip": 1.011796, "balance_loss_mlp": 1.00130212, "epoch": 0.2329104791679192, "flos": 16727523680160.0, "grad_norm": 2.09779851264328, "language_loss": 0.81615734, "learning_rate": 3.5835795993272513e-06, "loss": 0.84196407, "num_input_tokens_seen": 41318540, "step": 1937, "time_per_iteration": 2.7812983989715576 }, { "auxiliary_loss_clip": 0.01253271, "auxiliary_loss_mlp": 0.01198806, "balance_loss_clip": 1.00899458, "balance_loss_mlp": 1.00101423, "epoch": 0.2330307220585583, "flos": 22163754047040.0, "grad_norm": 1.9602000600131728, "language_loss": 0.71434605, "learning_rate": 3.583103688270391e-06, "loss": 0.73886681, "num_input_tokens_seen": 41338320, "step": 1938, "time_per_iteration": 3.1140029430389404 }, { "auxiliary_loss_clip": 0.01358838, "auxiliary_loss_mlp": 0.01198759, "balance_loss_clip": 1.01188362, "balance_loss_mlp": 1.001158, "epoch": 0.23315096494919738, "flos": 19317329786880.0, "grad_norm": 2.154716797314215, "language_loss": 0.89214301, "learning_rate": 3.58262753705796e-06, "loss": 0.91771895, "num_input_tokens_seen": 41353210, "step": 1939, "time_per_iteration": 3.695767402648926 }, { "auxiliary_loss_clip": 0.01315327, "auxiliary_loss_mlp": 0.01195391, "balance_loss_clip": 1.00793171, "balance_loss_mlp": 1.00007808, "epoch": 0.23327120783983646, "flos": 53031078874080.0, "grad_norm": 0.7538897212673066, "language_loss": 0.55505788, "learning_rate": 3.5821511457621902e-06, "loss": 0.58016503, "num_input_tokens_seen": 41410510, "step": 1940, "time_per_iteration": 3.412980318069458 }, { "auxiliary_loss_clip": 0.01346375, "auxiliary_loss_mlp": 0.01199263, "balance_loss_clip": 1.01137972, "balance_loss_mlp": 1.00128007, "epoch": 0.23339145073047557, "flos": 17126993396160.0, "grad_norm": 2.779499449570906, "language_loss": 0.81593353, "learning_rate": 3.5816745144553497e-06, "loss": 0.84138989, "num_input_tokens_seen": 41425830, "step": 1941, "time_per_iteration": 2.847891092300415 }, { "auxiliary_loss_clip": 0.01283562, "auxiliary_loss_mlp": 0.01198387, "balance_loss_clip": 1.00827408, "balance_loss_mlp": 1.00097656, "epoch": 0.23351169362111465, "flos": 13078926856800.0, "grad_norm": 2.216786800912838, "language_loss": 0.75666165, "learning_rate": 3.5811976432097424e-06, "loss": 0.78148115, "num_input_tokens_seen": 41443500, "step": 1942, "time_per_iteration": 2.8823142051696777 }, { "auxiliary_loss_clip": 0.0136013, "auxiliary_loss_mlp": 0.00873263, "balance_loss_clip": 1.01180577, "balance_loss_mlp": 1.00031233, "epoch": 0.23363193651175373, "flos": 15851266983360.0, "grad_norm": 2.134167266518663, "language_loss": 0.8472662, "learning_rate": 3.58072053209771e-06, "loss": 0.86960012, "num_input_tokens_seen": 41460055, "step": 1943, "time_per_iteration": 2.7827770709991455 }, { "auxiliary_loss_clip": 0.01359082, "auxiliary_loss_mlp": 0.01198634, "balance_loss_clip": 1.01143122, "balance_loss_mlp": 1.00103307, "epoch": 0.23375217940239285, "flos": 21025778034240.0, "grad_norm": 1.9787334281786748, "language_loss": 0.7908752, "learning_rate": 3.5802431811916296e-06, "loss": 0.81645238, "num_input_tokens_seen": 41476665, "step": 1944, "time_per_iteration": 2.908336639404297 }, { "auxiliary_loss_clip": 0.01334658, "auxiliary_loss_mlp": 0.01198281, "balance_loss_clip": 1.01038277, "balance_loss_mlp": 1.00106156, "epoch": 0.23387242229303193, "flos": 20594708688960.0, "grad_norm": 1.5996837897795722, "language_loss": 0.80694401, "learning_rate": 3.579765590563916e-06, "loss": 0.83227342, "num_input_tokens_seen": 41496065, "step": 1945, "time_per_iteration": 2.7814817428588867 }, { "auxiliary_loss_clip": 0.01370562, "auxiliary_loss_mlp": 0.01198635, "balance_loss_clip": 1.0118562, "balance_loss_mlp": 1.00141501, "epoch": 0.233992665183671, "flos": 24279503372640.0, "grad_norm": 3.0237584325453946, "language_loss": 0.81943542, "learning_rate": 3.579287760287017e-06, "loss": 0.8451274, "num_input_tokens_seen": 41516815, "step": 1946, "time_per_iteration": 2.7693521976470947 }, { "auxiliary_loss_clip": 0.01358185, "auxiliary_loss_mlp": 0.01198241, "balance_loss_clip": 1.01104903, "balance_loss_mlp": 1.00102115, "epoch": 0.2341129080743101, "flos": 30154634447040.0, "grad_norm": 1.7550615671173335, "language_loss": 0.72740018, "learning_rate": 3.578809690433421e-06, "loss": 0.75296444, "num_input_tokens_seen": 41538525, "step": 1947, "time_per_iteration": 3.1868627071380615 }, { "auxiliary_loss_clip": 0.0138226, "auxiliary_loss_mlp": 0.01199309, "balance_loss_clip": 1.01163602, "balance_loss_mlp": 1.00113511, "epoch": 0.2342331509649492, "flos": 22784146987680.0, "grad_norm": 3.3436231790205526, "language_loss": 0.8119638, "learning_rate": 3.578331381075651e-06, "loss": 0.83777952, "num_input_tokens_seen": 41559025, "step": 1948, "time_per_iteration": 4.688729763031006 }, { "auxiliary_loss_clip": 0.01370638, "auxiliary_loss_mlp": 0.01198939, "balance_loss_clip": 1.01178956, "balance_loss_mlp": 1.00114739, "epoch": 0.2343533938555883, "flos": 23623164037440.0, "grad_norm": 2.169221693531493, "language_loss": 0.69784093, "learning_rate": 3.5778528322862646e-06, "loss": 0.72353673, "num_input_tokens_seen": 41577845, "step": 1949, "time_per_iteration": 3.7086710929870605 }, { "auxiliary_loss_clip": 0.01370305, "auxiliary_loss_mlp": 0.01198409, "balance_loss_clip": 1.01158214, "balance_loss_mlp": 1.00118935, "epoch": 0.23447363674622737, "flos": 24570343584000.0, "grad_norm": 1.528282909762179, "language_loss": 0.86639327, "learning_rate": 3.5773740441378585e-06, "loss": 0.89208043, "num_input_tokens_seen": 41598600, "step": 1950, "time_per_iteration": 3.9353487491607666 }, { "auxiliary_loss_clip": 0.01356909, "auxiliary_loss_mlp": 0.01198106, "balance_loss_clip": 1.01106226, "balance_loss_mlp": 1.00088632, "epoch": 0.23459387963686648, "flos": 53140341795840.0, "grad_norm": 4.05957276737499, "language_loss": 0.74214315, "learning_rate": 3.5768950167030633e-06, "loss": 0.76769328, "num_input_tokens_seen": 41623300, "step": 1951, "time_per_iteration": 3.0604915618896484 }, { "auxiliary_loss_clip": 0.01357988, "auxiliary_loss_mlp": 0.01197719, "balance_loss_clip": 1.01176739, "balance_loss_mlp": 1.00068974, "epoch": 0.23471412252750556, "flos": 23951423514240.0, "grad_norm": 1.709102494587674, "language_loss": 0.78267634, "learning_rate": 3.576415750054548e-06, "loss": 0.80823338, "num_input_tokens_seen": 41643420, "step": 1952, "time_per_iteration": 2.875852346420288 }, { "auxiliary_loss_clip": 0.01345038, "auxiliary_loss_mlp": 0.01199094, "balance_loss_clip": 1.01144814, "balance_loss_mlp": 1.00130224, "epoch": 0.23483436541814465, "flos": 15706583313120.0, "grad_norm": 1.7718367968128095, "language_loss": 0.85682565, "learning_rate": 3.5759362442650172e-06, "loss": 0.88226694, "num_input_tokens_seen": 41660170, "step": 1953, "time_per_iteration": 2.813080310821533 }, { "auxiliary_loss_clip": 0.01357241, "auxiliary_loss_mlp": 0.01198733, "balance_loss_clip": 1.01178193, "balance_loss_mlp": 1.00151324, "epoch": 0.23495460830878373, "flos": 24936273792000.0, "grad_norm": 1.9706481970089467, "language_loss": 0.85260129, "learning_rate": 3.5754564994072113e-06, "loss": 0.87816107, "num_input_tokens_seen": 41679010, "step": 1954, "time_per_iteration": 2.791346788406372 }, { "auxiliary_loss_clip": 0.01355274, "auxiliary_loss_mlp": 0.01198778, "balance_loss_clip": 1.01154661, "balance_loss_mlp": 1.00098562, "epoch": 0.23507485119942284, "flos": 30482678381760.0, "grad_norm": 2.0415086789101053, "language_loss": 0.59677434, "learning_rate": 3.5749765155539067e-06, "loss": 0.62231481, "num_input_tokens_seen": 41699495, "step": 1955, "time_per_iteration": 3.044353485107422 }, { "auxiliary_loss_clip": 0.0132271, "auxiliary_loss_mlp": 0.01198954, "balance_loss_clip": 1.01022911, "balance_loss_mlp": 1.00135231, "epoch": 0.23519509409006192, "flos": 18329138606880.0, "grad_norm": 2.0026131511353182, "language_loss": 0.92306137, "learning_rate": 3.574496292777917e-06, "loss": 0.94827795, "num_input_tokens_seen": 41717705, "step": 1956, "time_per_iteration": 2.7529611587524414 }, { "auxiliary_loss_clip": 0.01338817, "auxiliary_loss_mlp": 0.011993, "balance_loss_clip": 1.01007295, "balance_loss_mlp": 1.00131738, "epoch": 0.235315336980701, "flos": 29643230247840.0, "grad_norm": 2.0863859359924843, "language_loss": 0.71870935, "learning_rate": 3.574015831152092e-06, "loss": 0.7440905, "num_input_tokens_seen": 41738120, "step": 1957, "time_per_iteration": 2.8497416973114014 }, { "auxiliary_loss_clip": 0.01344572, "auxiliary_loss_mlp": 0.01198715, "balance_loss_clip": 1.01150417, "balance_loss_mlp": 1.00111365, "epoch": 0.23543557987134012, "flos": 18551714320800.0, "grad_norm": 1.9613984773452997, "language_loss": 0.83245295, "learning_rate": 3.573535130749316e-06, "loss": 0.85788584, "num_input_tokens_seen": 41756070, "step": 1958, "time_per_iteration": 2.754457712173462 }, { "auxiliary_loss_clip": 0.01345327, "auxiliary_loss_mlp": 0.01197923, "balance_loss_clip": 1.01192892, "balance_loss_mlp": 1.00089407, "epoch": 0.2355558227619792, "flos": 24679044936000.0, "grad_norm": 1.958810050297828, "language_loss": 0.73961651, "learning_rate": 3.5730541916425127e-06, "loss": 0.76504904, "num_input_tokens_seen": 41777550, "step": 1959, "time_per_iteration": 3.0246872901916504 }, { "auxiliary_loss_clip": 0.01321422, "auxiliary_loss_mlp": 0.0119858, "balance_loss_clip": 1.01030636, "balance_loss_mlp": 1.00097871, "epoch": 0.23567606565261828, "flos": 21944806624800.0, "grad_norm": 1.828096422926383, "language_loss": 0.8569994, "learning_rate": 3.572573013904639e-06, "loss": 0.88219941, "num_input_tokens_seen": 41797460, "step": 1960, "time_per_iteration": 2.9564669132232666 }, { "auxiliary_loss_clip": 0.01382673, "auxiliary_loss_mlp": 0.01198591, "balance_loss_clip": 1.01183987, "balance_loss_mlp": 1.00118077, "epoch": 0.2357963085432574, "flos": 13589361116640.0, "grad_norm": 2.432344926201864, "language_loss": 0.92219859, "learning_rate": 3.572091597608689e-06, "loss": 0.94801122, "num_input_tokens_seen": 41815585, "step": 1961, "time_per_iteration": 2.868303060531616 }, { "auxiliary_loss_clip": 0.01342558, "auxiliary_loss_mlp": 0.01199146, "balance_loss_clip": 1.01062059, "balance_loss_mlp": 1.00097263, "epoch": 0.23591655143389648, "flos": 22088699974080.0, "grad_norm": 23.34296008318961, "language_loss": 0.73125446, "learning_rate": 3.571609942827694e-06, "loss": 0.75667149, "num_input_tokens_seen": 41834700, "step": 1962, "time_per_iteration": 2.9160232543945312 }, { "auxiliary_loss_clip": 0.01345382, "auxiliary_loss_mlp": 0.01198099, "balance_loss_clip": 1.01115203, "balance_loss_mlp": 1.00106978, "epoch": 0.23603679432453556, "flos": 17017358028480.0, "grad_norm": 1.6831791828992984, "language_loss": 0.88171673, "learning_rate": 3.57112804963472e-06, "loss": 0.90715158, "num_input_tokens_seen": 41852915, "step": 1963, "time_per_iteration": 2.858151435852051 }, { "auxiliary_loss_clip": 0.01306582, "auxiliary_loss_mlp": 0.01198098, "balance_loss_clip": 1.00953031, "balance_loss_mlp": 1.00087821, "epoch": 0.23615703721517464, "flos": 19171316940480.0, "grad_norm": 1.767023083892025, "language_loss": 0.76310456, "learning_rate": 3.57064591810287e-06, "loss": 0.78815132, "num_input_tokens_seen": 41870415, "step": 1964, "time_per_iteration": 2.851203203201294 }, { "auxiliary_loss_clip": 0.01382305, "auxiliary_loss_mlp": 0.00873112, "balance_loss_clip": 1.01215935, "balance_loss_mlp": 1.0003438, "epoch": 0.23627728010581375, "flos": 19098813448800.0, "grad_norm": 2.2674229119384877, "language_loss": 0.80643392, "learning_rate": 3.570163548305284e-06, "loss": 0.82898808, "num_input_tokens_seen": 41889345, "step": 1965, "time_per_iteration": 2.7105727195739746 }, { "auxiliary_loss_clip": 0.01334852, "auxiliary_loss_mlp": 0.01198642, "balance_loss_clip": 1.01070416, "balance_loss_mlp": 1.00104046, "epoch": 0.23639752299645284, "flos": 14282221625280.0, "grad_norm": 2.0632841554054564, "language_loss": 0.70395368, "learning_rate": 3.569680940315135e-06, "loss": 0.72928858, "num_input_tokens_seen": 41905745, "step": 1966, "time_per_iteration": 2.776426076889038 }, { "auxiliary_loss_clip": 0.01332252, "auxiliary_loss_mlp": 0.01198767, "balance_loss_clip": 1.01144052, "balance_loss_mlp": 1.00116539, "epoch": 0.23651776588709192, "flos": 22893423118560.0, "grad_norm": 1.8320266848735445, "language_loss": 0.81895572, "learning_rate": 3.5691980942056356e-06, "loss": 0.84426594, "num_input_tokens_seen": 41925115, "step": 1967, "time_per_iteration": 2.925642251968384 }, { "auxiliary_loss_clip": 0.01370337, "auxiliary_loss_mlp": 0.01198279, "balance_loss_clip": 1.01179528, "balance_loss_mlp": 1.00105941, "epoch": 0.23663800877773103, "flos": 18624541125600.0, "grad_norm": 1.6067294289340768, "language_loss": 0.7989043, "learning_rate": 3.5687150100500332e-06, "loss": 0.82459044, "num_input_tokens_seen": 41944815, "step": 1968, "time_per_iteration": 2.8108861446380615 }, { "auxiliary_loss_clip": 0.01362077, "auxiliary_loss_mlp": 0.01198852, "balance_loss_clip": 1.01114738, "balance_loss_mlp": 1.00125051, "epoch": 0.2367582516683701, "flos": 25555840488000.0, "grad_norm": 1.890826050876592, "language_loss": 0.74421775, "learning_rate": 3.568231687921611e-06, "loss": 0.76982707, "num_input_tokens_seen": 41964990, "step": 1969, "time_per_iteration": 2.8376705646514893 }, { "auxiliary_loss_clip": 0.01382391, "auxiliary_loss_mlp": 0.01198375, "balance_loss_clip": 1.01194191, "balance_loss_mlp": 1.00096464, "epoch": 0.2368784945590092, "flos": 23295084179040.0, "grad_norm": 1.4720128963255508, "language_loss": 0.80524033, "learning_rate": 3.5677481278936883e-06, "loss": 0.83104801, "num_input_tokens_seen": 41984570, "step": 1970, "time_per_iteration": 2.7138450145721436 }, { "auxiliary_loss_clip": 0.01325642, "auxiliary_loss_mlp": 0.01195348, "balance_loss_clip": 1.0113771, "balance_loss_mlp": 1.00003576, "epoch": 0.23699873744964828, "flos": 69859328849280.0, "grad_norm": 0.8309521929563681, "language_loss": 0.57783842, "learning_rate": 3.5672643300396214e-06, "loss": 0.60304832, "num_input_tokens_seen": 42053715, "step": 1971, "time_per_iteration": 3.460470676422119 }, { "auxiliary_loss_clip": 0.01320375, "auxiliary_loss_mlp": 0.0119777, "balance_loss_clip": 1.01022625, "balance_loss_mlp": 1.00093174, "epoch": 0.2371189803402874, "flos": 21835063486080.0, "grad_norm": 2.679284947233026, "language_loss": 0.6774509, "learning_rate": 3.566780294432802e-06, "loss": 0.70263231, "num_input_tokens_seen": 42070890, "step": 1972, "time_per_iteration": 2.8423986434936523 }, { "auxiliary_loss_clip": 0.01381413, "auxiliary_loss_mlp": 0.01198494, "balance_loss_clip": 1.0119524, "balance_loss_mlp": 1.00108361, "epoch": 0.23723922323092647, "flos": 21908500993440.0, "grad_norm": 2.174218206199664, "language_loss": 0.74574256, "learning_rate": 3.566296021146657e-06, "loss": 0.7715416, "num_input_tokens_seen": 42090270, "step": 1973, "time_per_iteration": 3.8153505325317383 }, { "auxiliary_loss_clip": 0.01383763, "auxiliary_loss_mlp": 0.01198488, "balance_loss_clip": 1.01308024, "balance_loss_mlp": 1.00107718, "epoch": 0.23735946612156555, "flos": 32708817472320.0, "grad_norm": 1.5924896885326483, "language_loss": 0.73272932, "learning_rate": 3.565811510254652e-06, "loss": 0.75855178, "num_input_tokens_seen": 42111150, "step": 1974, "time_per_iteration": 3.7512106895446777 }, { "auxiliary_loss_clip": 0.01340755, "auxiliary_loss_mlp": 0.01194675, "balance_loss_clip": 1.01365542, "balance_loss_mlp": 1.00012553, "epoch": 0.23747970901220466, "flos": 70546980424320.0, "grad_norm": 0.8360457802615276, "language_loss": 0.5823257, "learning_rate": 3.5653267618302845e-06, "loss": 0.60767996, "num_input_tokens_seen": 42178730, "step": 1975, "time_per_iteration": 4.33915638923645 }, { "auxiliary_loss_clip": 0.01381241, "auxiliary_loss_mlp": 0.01197978, "balance_loss_clip": 1.01140404, "balance_loss_mlp": 1.00094879, "epoch": 0.23759995190284375, "flos": 20849818047840.0, "grad_norm": 1.680149529043359, "language_loss": 0.85388654, "learning_rate": 3.564841775947093e-06, "loss": 0.87967873, "num_input_tokens_seen": 42199620, "step": 1976, "time_per_iteration": 4.161528825759888 }, { "auxiliary_loss_clip": 0.01338238, "auxiliary_loss_mlp": 0.01198175, "balance_loss_clip": 1.01171684, "balance_loss_mlp": 1.00114608, "epoch": 0.23772019479348283, "flos": 32921657868960.0, "grad_norm": 1.997037607470778, "language_loss": 0.7620368, "learning_rate": 3.5643565526786475e-06, "loss": 0.78740084, "num_input_tokens_seen": 42219560, "step": 1977, "time_per_iteration": 2.99015212059021 }, { "auxiliary_loss_clip": 0.01382725, "auxiliary_loss_mlp": 0.01198128, "balance_loss_clip": 1.01223648, "balance_loss_mlp": 1.00090837, "epoch": 0.2378404376841219, "flos": 32342779493280.0, "grad_norm": 1.6316322104269787, "language_loss": 0.77380097, "learning_rate": 3.5638710920985574e-06, "loss": 0.79960942, "num_input_tokens_seen": 42241020, "step": 1978, "time_per_iteration": 2.794187545776367 }, { "auxiliary_loss_clip": 0.0136927, "auxiliary_loss_mlp": 0.00873346, "balance_loss_clip": 1.01120687, "balance_loss_mlp": 1.00046623, "epoch": 0.23796068057476102, "flos": 22997634010560.0, "grad_norm": 1.7859103282973705, "language_loss": 0.81769133, "learning_rate": 3.5633853942804655e-06, "loss": 0.84011745, "num_input_tokens_seen": 42259345, "step": 1979, "time_per_iteration": 2.7838151454925537 }, { "auxiliary_loss_clip": 0.01335206, "auxiliary_loss_mlp": 0.01198843, "balance_loss_clip": 1.01118255, "balance_loss_mlp": 1.00124168, "epoch": 0.2380809234654001, "flos": 13480947154080.0, "grad_norm": 2.1952117542330627, "language_loss": 0.76417804, "learning_rate": 3.5628994592980527e-06, "loss": 0.78951848, "num_input_tokens_seen": 42277250, "step": 1980, "time_per_iteration": 2.8364436626434326 }, { "auxiliary_loss_clip": 0.01382074, "auxiliary_loss_mlp": 0.01198194, "balance_loss_clip": 1.01176083, "balance_loss_mlp": 1.00097382, "epoch": 0.2382011663560392, "flos": 16871812189920.0, "grad_norm": 1.92434441096622, "language_loss": 0.70360398, "learning_rate": 3.562413287225034e-06, "loss": 0.72940671, "num_input_tokens_seen": 42295360, "step": 1981, "time_per_iteration": 2.758441209793091 }, { "auxiliary_loss_clip": 0.01370695, "auxiliary_loss_mlp": 0.01198256, "balance_loss_clip": 1.012676, "balance_loss_mlp": 1.00103593, "epoch": 0.2383214092466783, "flos": 18441144937440.0, "grad_norm": 2.021384890004412, "language_loss": 0.89343345, "learning_rate": 3.5619268781351623e-06, "loss": 0.91912293, "num_input_tokens_seen": 42313430, "step": 1982, "time_per_iteration": 2.7845406532287598 }, { "auxiliary_loss_clip": 0.01331042, "auxiliary_loss_mlp": 0.01197916, "balance_loss_clip": 1.01081944, "balance_loss_mlp": 1.00107729, "epoch": 0.23844165213731738, "flos": 19755727562880.0, "grad_norm": 1.8610299955490954, "language_loss": 0.77051377, "learning_rate": 3.5614402321022256e-06, "loss": 0.79580331, "num_input_tokens_seen": 42331260, "step": 1983, "time_per_iteration": 2.872044801712036 }, { "auxiliary_loss_clip": 0.01307413, "auxiliary_loss_mlp": 0.01198176, "balance_loss_clip": 1.01027679, "balance_loss_mlp": 1.00095606, "epoch": 0.23856189502795647, "flos": 23367372128640.0, "grad_norm": 1.6611831907726453, "language_loss": 0.87102163, "learning_rate": 3.5609533492000463e-06, "loss": 0.89607751, "num_input_tokens_seen": 42350150, "step": 1984, "time_per_iteration": 2.8801369667053223 }, { "auxiliary_loss_clip": 0.01330695, "auxiliary_loss_mlp": 0.01198534, "balance_loss_clip": 1.01049829, "balance_loss_mlp": 1.00131369, "epoch": 0.23868213791859555, "flos": 23475067617600.0, "grad_norm": 2.0029846653607417, "language_loss": 0.78446859, "learning_rate": 3.560466229502485e-06, "loss": 0.80976093, "num_input_tokens_seen": 42369495, "step": 1985, "time_per_iteration": 2.808418035507202 }, { "auxiliary_loss_clip": 0.01330359, "auxiliary_loss_mlp": 0.00873225, "balance_loss_clip": 1.01074266, "balance_loss_mlp": 1.00034809, "epoch": 0.23880238080923466, "flos": 16617349457280.0, "grad_norm": 1.9822443099265885, "language_loss": 0.9004941, "learning_rate": 3.5599788730834384e-06, "loss": 0.92252988, "num_input_tokens_seen": 42387455, "step": 1986, "time_per_iteration": 2.7362565994262695 }, { "auxiliary_loss_clip": 0.01366083, "auxiliary_loss_mlp": 0.01198589, "balance_loss_clip": 1.01119757, "balance_loss_mlp": 1.00098777, "epoch": 0.23892262369987374, "flos": 17348419552320.0, "grad_norm": 3.45478226761876, "language_loss": 0.78425395, "learning_rate": 3.559491280016836e-06, "loss": 0.80990064, "num_input_tokens_seen": 42405400, "step": 1987, "time_per_iteration": 2.7559001445770264 }, { "auxiliary_loss_clip": 0.0133504, "auxiliary_loss_mlp": 0.01198363, "balance_loss_clip": 1.01081967, "balance_loss_mlp": 1.00114274, "epoch": 0.23904286659051283, "flos": 22309910588160.0, "grad_norm": 1.7161521616253679, "language_loss": 0.71317524, "learning_rate": 3.5590034503766465e-06, "loss": 0.7385093, "num_input_tokens_seen": 42425065, "step": 1988, "time_per_iteration": 2.890704393386841 }, { "auxiliary_loss_clip": 0.0138114, "auxiliary_loss_mlp": 0.01198107, "balance_loss_clip": 1.01177859, "balance_loss_mlp": 1.00107765, "epoch": 0.23916310948115194, "flos": 21178257143040.0, "grad_norm": 2.0914215371352913, "language_loss": 0.81186312, "learning_rate": 3.558515384236874e-06, "loss": 0.8376556, "num_input_tokens_seen": 42442495, "step": 1989, "time_per_iteration": 2.79075288772583 }, { "auxiliary_loss_clip": 0.01318413, "auxiliary_loss_mlp": 0.00873337, "balance_loss_clip": 1.01041257, "balance_loss_mlp": 1.00050855, "epoch": 0.23928335237179102, "flos": 14137358336640.0, "grad_norm": 2.0366205211878277, "language_loss": 0.83738291, "learning_rate": 3.558027081671556e-06, "loss": 0.85930043, "num_input_tokens_seen": 42459480, "step": 1990, "time_per_iteration": 2.8411624431610107 }, { "auxiliary_loss_clip": 0.01369815, "auxiliary_loss_mlp": 0.01198179, "balance_loss_clip": 1.01170921, "balance_loss_mlp": 1.0009594, "epoch": 0.2394035952624301, "flos": 23769607968000.0, "grad_norm": 1.5719282079182793, "language_loss": 0.68663239, "learning_rate": 3.557538542754769e-06, "loss": 0.71231234, "num_input_tokens_seen": 42479175, "step": 1991, "time_per_iteration": 2.8821003437042236 }, { "auxiliary_loss_clip": 0.01380815, "auxiliary_loss_mlp": 0.01198472, "balance_loss_clip": 1.01173425, "balance_loss_mlp": 1.00106144, "epoch": 0.2395238381530692, "flos": 24206209560000.0, "grad_norm": 2.0885615910333386, "language_loss": 0.66769028, "learning_rate": 3.557049767560623e-06, "loss": 0.69348311, "num_input_tokens_seen": 42498090, "step": 1992, "time_per_iteration": 2.7765018939971924 }, { "auxiliary_loss_clip": 0.01290478, "auxiliary_loss_mlp": 0.01198334, "balance_loss_clip": 1.0085516, "balance_loss_mlp": 1.00111413, "epoch": 0.2396440810437083, "flos": 25295773661280.0, "grad_norm": 2.199785298248644, "language_loss": 0.85788846, "learning_rate": 3.5565607561632655e-06, "loss": 0.88277662, "num_input_tokens_seen": 42516930, "step": 1993, "time_per_iteration": 2.848870277404785 }, { "auxiliary_loss_clip": 0.01330834, "auxiliary_loss_mlp": 0.01198382, "balance_loss_clip": 1.01012301, "balance_loss_mlp": 1.00097096, "epoch": 0.23976432393434738, "flos": 28543104584640.0, "grad_norm": 2.1916176631914883, "language_loss": 0.79290557, "learning_rate": 3.5560715086368787e-06, "loss": 0.81819767, "num_input_tokens_seen": 42534800, "step": 1994, "time_per_iteration": 2.841148614883423 }, { "auxiliary_loss_clip": 0.01330302, "auxiliary_loss_mlp": 0.01197989, "balance_loss_clip": 1.01089418, "balance_loss_mlp": 1.00096011, "epoch": 0.23988456682498646, "flos": 19494367483680.0, "grad_norm": 1.9006815181161179, "language_loss": 0.82311749, "learning_rate": 3.5555820250556816e-06, "loss": 0.84840041, "num_input_tokens_seen": 42552000, "step": 1995, "time_per_iteration": 2.727454662322998 }, { "auxiliary_loss_clip": 0.01345275, "auxiliary_loss_mlp": 0.01198823, "balance_loss_clip": 1.01149225, "balance_loss_mlp": 1.00141203, "epoch": 0.24000480971562557, "flos": 20266341441120.0, "grad_norm": 2.2088864952085916, "language_loss": 0.69525337, "learning_rate": 3.5550923054939278e-06, "loss": 0.72069436, "num_input_tokens_seen": 42571455, "step": 1996, "time_per_iteration": 2.9097864627838135 }, { "auxiliary_loss_clip": 0.01316786, "auxiliary_loss_mlp": 0.01198074, "balance_loss_clip": 1.01117229, "balance_loss_mlp": 1.00104487, "epoch": 0.24012505260626466, "flos": 25443187531200.0, "grad_norm": 2.0542652298429824, "language_loss": 0.744268, "learning_rate": 3.5546023500259083e-06, "loss": 0.76941657, "num_input_tokens_seen": 42592550, "step": 1997, "time_per_iteration": 2.921365737915039 }, { "auxiliary_loss_clip": 0.01310425, "auxiliary_loss_mlp": 0.0119795, "balance_loss_clip": 1.01002908, "balance_loss_mlp": 1.00111187, "epoch": 0.24024529549690374, "flos": 15553349807040.0, "grad_norm": 1.9304074566813023, "language_loss": 0.80778819, "learning_rate": 3.5541121587259477e-06, "loss": 0.83287191, "num_input_tokens_seen": 42610385, "step": 1998, "time_per_iteration": 2.8540666103363037 }, { "auxiliary_loss_clip": 0.01334349, "auxiliary_loss_mlp": 0.01195436, "balance_loss_clip": 1.01002145, "balance_loss_mlp": 1.00012326, "epoch": 0.24036553838754285, "flos": 57122384315040.0, "grad_norm": 0.82473937920465, "language_loss": 0.57914793, "learning_rate": 3.553621731668408e-06, "loss": 0.60444582, "num_input_tokens_seen": 42673595, "step": 1999, "time_per_iteration": 4.372134208679199 }, { "auxiliary_loss_clip": 0.01368705, "auxiliary_loss_mlp": 0.01197908, "balance_loss_clip": 1.01113629, "balance_loss_mlp": 1.0010699, "epoch": 0.24048578127818193, "flos": 24969956994720.0, "grad_norm": 1.5998851164340844, "language_loss": 0.83608353, "learning_rate": 3.553131068927688e-06, "loss": 0.86174965, "num_input_tokens_seen": 42692000, "step": 2000, "time_per_iteration": 3.5944876670837402 }, { "auxiliary_loss_clip": 0.01320493, "auxiliary_loss_mlp": 0.01198195, "balance_loss_clip": 1.01088047, "balance_loss_mlp": 1.00116539, "epoch": 0.24060602416882101, "flos": 23330958726240.0, "grad_norm": 1.601465925715712, "language_loss": 0.80704045, "learning_rate": 3.552640170578219e-06, "loss": 0.83222735, "num_input_tokens_seen": 42712250, "step": 2001, "time_per_iteration": 3.7721850872039795 }, { "auxiliary_loss_clip": 0.0134216, "auxiliary_loss_mlp": 0.01198345, "balance_loss_clip": 1.01069522, "balance_loss_mlp": 1.00112498, "epoch": 0.2407262670594601, "flos": 14173268807520.0, "grad_norm": 2.0077566171190315, "language_loss": 0.78024328, "learning_rate": 3.5521490366944703e-06, "loss": 0.80564833, "num_input_tokens_seen": 42729900, "step": 2002, "time_per_iteration": 3.7736241817474365 }, { "auxiliary_loss_clip": 0.01321088, "auxiliary_loss_mlp": 0.01197932, "balance_loss_clip": 1.01065469, "balance_loss_mlp": 1.00109327, "epoch": 0.2408465099500992, "flos": 13663121937120.0, "grad_norm": 2.1848865454959667, "language_loss": 0.80174643, "learning_rate": 3.5516576673509474e-06, "loss": 0.8269366, "num_input_tokens_seen": 42747900, "step": 2003, "time_per_iteration": 2.824429750442505 }, { "auxiliary_loss_clip": 0.01381388, "auxiliary_loss_mlp": 0.01198278, "balance_loss_clip": 1.01193142, "balance_loss_mlp": 1.00105846, "epoch": 0.2409667528407383, "flos": 31248042382080.0, "grad_norm": 2.0301274542868293, "language_loss": 0.86222035, "learning_rate": 3.5511660626221896e-06, "loss": 0.888017, "num_input_tokens_seen": 42768540, "step": 2004, "time_per_iteration": 2.7612881660461426 }, { "auxiliary_loss_clip": 0.01331664, "auxiliary_loss_mlp": 0.00873245, "balance_loss_clip": 1.01044345, "balance_loss_mlp": 1.00046051, "epoch": 0.24108699573137737, "flos": 22199951907360.0, "grad_norm": 3.3486064641458486, "language_loss": 0.89223409, "learning_rate": 3.5506742225827744e-06, "loss": 0.91428322, "num_input_tokens_seen": 42785395, "step": 2005, "time_per_iteration": 2.9610846042633057 }, { "auxiliary_loss_clip": 0.01320951, "auxiliary_loss_mlp": 0.01198705, "balance_loss_clip": 1.01064134, "balance_loss_mlp": 1.00129473, "epoch": 0.24120723862201648, "flos": 26103047387040.0, "grad_norm": 1.9863198580720474, "language_loss": 0.90152037, "learning_rate": 3.5501821473073116e-06, "loss": 0.92671692, "num_input_tokens_seen": 42801980, "step": 2006, "time_per_iteration": 2.908160924911499 }, { "auxiliary_loss_clip": 0.0131896, "auxiliary_loss_mlp": 0.01198237, "balance_loss_clip": 1.01131964, "balance_loss_mlp": 1.00120831, "epoch": 0.24132748151265557, "flos": 18624936286080.0, "grad_norm": 1.9651404447213021, "language_loss": 0.86896414, "learning_rate": 3.54968983687045e-06, "loss": 0.89413613, "num_input_tokens_seen": 42818850, "step": 2007, "time_per_iteration": 2.793023109436035 }, { "auxiliary_loss_clip": 0.01344232, "auxiliary_loss_mlp": 0.01198124, "balance_loss_clip": 1.01182723, "balance_loss_mlp": 1.00109529, "epoch": 0.24144772440329465, "flos": 15267682605600.0, "grad_norm": 2.2528846525139694, "language_loss": 0.89385211, "learning_rate": 3.549197291346872e-06, "loss": 0.9192757, "num_input_tokens_seen": 42835375, "step": 2008, "time_per_iteration": 2.7915470600128174 }, { "auxiliary_loss_clip": 0.01368535, "auxiliary_loss_mlp": 0.01198713, "balance_loss_clip": 1.01184726, "balance_loss_mlp": 1.00130224, "epoch": 0.24156796729393373, "flos": 24024286242720.0, "grad_norm": 2.121811793991985, "language_loss": 0.7945587, "learning_rate": 3.548704510811297e-06, "loss": 0.82023114, "num_input_tokens_seen": 42854570, "step": 2009, "time_per_iteration": 2.781207323074341 }, { "auxiliary_loss_clip": 0.01306724, "auxiliary_loss_mlp": 0.01197982, "balance_loss_clip": 1.00961637, "balance_loss_mlp": 1.00114393, "epoch": 0.24168821018457284, "flos": 26286802812000.0, "grad_norm": 2.1559629898466586, "language_loss": 0.74537361, "learning_rate": 3.5482114953384787e-06, "loss": 0.77042067, "num_input_tokens_seen": 42873800, "step": 2010, "time_per_iteration": 2.910219192504883 }, { "auxiliary_loss_clip": 0.01367081, "auxiliary_loss_mlp": 0.0119858, "balance_loss_clip": 1.0114032, "balance_loss_mlp": 1.00136065, "epoch": 0.24180845307521193, "flos": 18223203378240.0, "grad_norm": 2.048757071865527, "language_loss": 0.84779763, "learning_rate": 3.5477182450032077e-06, "loss": 0.87345421, "num_input_tokens_seen": 42892400, "step": 2011, "time_per_iteration": 2.7142276763916016 }, { "auxiliary_loss_clip": 0.01355945, "auxiliary_loss_mlp": 0.01198328, "balance_loss_clip": 1.01068795, "balance_loss_mlp": 1.00110769, "epoch": 0.241928695965851, "flos": 20449270621440.0, "grad_norm": 2.049217762559181, "language_loss": 0.83298814, "learning_rate": 3.5472247598803097e-06, "loss": 0.85853088, "num_input_tokens_seen": 42911745, "step": 2012, "time_per_iteration": 2.7096526622772217 }, { "auxiliary_loss_clip": 0.01381085, "auxiliary_loss_mlp": 0.0119882, "balance_loss_clip": 1.01169348, "balance_loss_mlp": 1.00160015, "epoch": 0.24204893885649012, "flos": 25556487114240.0, "grad_norm": 2.8544665927030466, "language_loss": 0.85255671, "learning_rate": 3.546731040044645e-06, "loss": 0.87835574, "num_input_tokens_seen": 42926915, "step": 2013, "time_per_iteration": 2.6557817459106445 }, { "auxiliary_loss_clip": 0.01380916, "auxiliary_loss_mlp": 0.01198111, "balance_loss_clip": 1.0117172, "balance_loss_mlp": 1.00108194, "epoch": 0.2421691817471292, "flos": 30660219010080.0, "grad_norm": 1.7455066128849672, "language_loss": 0.75108826, "learning_rate": 3.546237085571112e-06, "loss": 0.77687848, "num_input_tokens_seen": 42945350, "step": 2014, "time_per_iteration": 2.9099221229553223 }, { "auxiliary_loss_clip": 0.01359178, "auxiliary_loss_mlp": 0.01198159, "balance_loss_clip": 1.01120794, "balance_loss_mlp": 1.00093937, "epoch": 0.24228942463776829, "flos": 21945022166880.0, "grad_norm": 1.9129560855559888, "language_loss": 0.72558075, "learning_rate": 3.5457428965346425e-06, "loss": 0.75115412, "num_input_tokens_seen": 42964290, "step": 2015, "time_per_iteration": 2.711012601852417 }, { "auxiliary_loss_clip": 0.01294509, "auxiliary_loss_mlp": 0.01198283, "balance_loss_clip": 1.00988567, "balance_loss_mlp": 1.00144506, "epoch": 0.2424096675284074, "flos": 33984507961440.0, "grad_norm": 1.674197899157284, "language_loss": 0.74666935, "learning_rate": 3.545248473010205e-06, "loss": 0.77159727, "num_input_tokens_seen": 42987095, "step": 2016, "time_per_iteration": 2.9930617809295654 }, { "auxiliary_loss_clip": 0.0138277, "auxiliary_loss_mlp": 0.0087338, "balance_loss_clip": 1.0120914, "balance_loss_mlp": 1.000561, "epoch": 0.24252991041904648, "flos": 21653427558240.0, "grad_norm": 1.8431831692047491, "language_loss": 0.87855399, "learning_rate": 3.544753815072802e-06, "loss": 0.90111542, "num_input_tokens_seen": 43005750, "step": 2017, "time_per_iteration": 2.7107410430908203 }, { "auxiliary_loss_clip": 0.01273251, "auxiliary_loss_mlp": 0.01198148, "balance_loss_clip": 1.0099802, "balance_loss_mlp": 1.00092828, "epoch": 0.24265015330968556, "flos": 21870075864960.0, "grad_norm": 1.8751536819939965, "language_loss": 0.88158447, "learning_rate": 3.544258922797474e-06, "loss": 0.90629852, "num_input_tokens_seen": 43023870, "step": 2018, "time_per_iteration": 2.810720443725586 }, { "auxiliary_loss_clip": 0.01381011, "auxiliary_loss_mlp": 0.01198477, "balance_loss_clip": 1.01134205, "balance_loss_mlp": 1.00125766, "epoch": 0.24277039620032465, "flos": 25628272132320.0, "grad_norm": 1.9018643529783548, "language_loss": 0.78233373, "learning_rate": 3.543763796259295e-06, "loss": 0.8081286, "num_input_tokens_seen": 43043825, "step": 2019, "time_per_iteration": 2.752624273300171 }, { "auxiliary_loss_clip": 0.01356546, "auxiliary_loss_mlp": 0.01198634, "balance_loss_clip": 1.01040864, "balance_loss_mlp": 1.00160468, "epoch": 0.24289063909096376, "flos": 26286587269920.0, "grad_norm": 1.7684702599394841, "language_loss": 0.91020668, "learning_rate": 3.5432684355333754e-06, "loss": 0.93575847, "num_input_tokens_seen": 43062480, "step": 2020, "time_per_iteration": 2.7508842945098877 }, { "auxiliary_loss_clip": 0.0136788, "auxiliary_loss_mlp": 0.01197966, "balance_loss_clip": 1.01099205, "balance_loss_mlp": 1.00112748, "epoch": 0.24301088198160284, "flos": 25075065978720.0, "grad_norm": 2.020144081972215, "language_loss": 0.76624501, "learning_rate": 3.5427728406948613e-06, "loss": 0.7919035, "num_input_tokens_seen": 43081595, "step": 2021, "time_per_iteration": 2.7506415843963623 }, { "auxiliary_loss_clip": 0.01325379, "auxiliary_loss_mlp": 0.01195344, "balance_loss_clip": 1.00982392, "balance_loss_mlp": 1.00003147, "epoch": 0.24313112487224192, "flos": 69900987108960.0, "grad_norm": 0.7662834864148202, "language_loss": 0.57930279, "learning_rate": 3.542277011818934e-06, "loss": 0.60451001, "num_input_tokens_seen": 43145430, "step": 2022, "time_per_iteration": 3.5227766036987305 }, { "auxiliary_loss_clip": 0.01344402, "auxiliary_loss_mlp": 0.01198149, "balance_loss_clip": 1.01160061, "balance_loss_mlp": 1.00131106, "epoch": 0.24325136776288103, "flos": 40662350454240.0, "grad_norm": 1.848535590902112, "language_loss": 0.73552179, "learning_rate": 3.5417809489808104e-06, "loss": 0.76094729, "num_input_tokens_seen": 43167040, "step": 2023, "time_per_iteration": 2.8868675231933594 }, { "auxiliary_loss_clip": 0.01365828, "auxiliary_loss_mlp": 0.01198296, "balance_loss_clip": 1.01133025, "balance_loss_mlp": 1.00107574, "epoch": 0.24337161065352012, "flos": 25046412091200.0, "grad_norm": 1.7951898417867977, "language_loss": 0.72787821, "learning_rate": 3.5412846522557422e-06, "loss": 0.75351954, "num_input_tokens_seen": 43187930, "step": 2024, "time_per_iteration": 2.788769006729126 }, { "auxiliary_loss_clip": 0.01381292, "auxiliary_loss_mlp": 0.01197997, "balance_loss_clip": 1.01193416, "balance_loss_mlp": 1.0009675, "epoch": 0.2434918535441592, "flos": 18661170070080.0, "grad_norm": 2.1195601461846914, "language_loss": 0.74203157, "learning_rate": 3.540788121719018e-06, "loss": 0.76782447, "num_input_tokens_seen": 43206350, "step": 2025, "time_per_iteration": 2.6279168128967285 }, { "auxiliary_loss_clip": 0.01305869, "auxiliary_loss_mlp": 0.01198223, "balance_loss_clip": 1.00978494, "balance_loss_mlp": 1.00100303, "epoch": 0.24361209643479828, "flos": 23915153806560.0, "grad_norm": 2.0426460564665483, "language_loss": 0.82236493, "learning_rate": 3.5402913574459604e-06, "loss": 0.84740585, "num_input_tokens_seen": 43226255, "step": 2026, "time_per_iteration": 3.7526376247406006 }, { "auxiliary_loss_clip": 0.01293993, "auxiliary_loss_mlp": 0.01198024, "balance_loss_clip": 1.00959921, "balance_loss_mlp": 1.00099516, "epoch": 0.2437323393254374, "flos": 28657517801760.0, "grad_norm": 1.5536663740129462, "language_loss": 0.86603224, "learning_rate": 3.5397943595119297e-06, "loss": 0.89095241, "num_input_tokens_seen": 43247675, "step": 2027, "time_per_iteration": 5.792986869812012 }, { "auxiliary_loss_clip": 0.01330325, "auxiliary_loss_mlp": 0.0119817, "balance_loss_clip": 1.0103991, "balance_loss_mlp": 1.00114095, "epoch": 0.24385258221607647, "flos": 23550337232640.0, "grad_norm": 2.2801431739633857, "language_loss": 0.77078891, "learning_rate": 3.5392971279923177e-06, "loss": 0.79607391, "num_input_tokens_seen": 43265895, "step": 2028, "time_per_iteration": 3.005741596221924 }, { "auxiliary_loss_clip": 0.01332801, "auxiliary_loss_mlp": 0.011985, "balance_loss_clip": 1.01122963, "balance_loss_mlp": 1.00127983, "epoch": 0.24397282510671556, "flos": 25336102744800.0, "grad_norm": 2.3928776556946136, "language_loss": 0.82813978, "learning_rate": 3.5387996629625557e-06, "loss": 0.8534528, "num_input_tokens_seen": 43283485, "step": 2029, "time_per_iteration": 2.8405771255493164 }, { "auxiliary_loss_clip": 0.01362578, "auxiliary_loss_mlp": 0.01195382, "balance_loss_clip": 1.01151252, "balance_loss_mlp": 1.00006914, "epoch": 0.24409306799735467, "flos": 65187456619680.0, "grad_norm": 0.8014076355746992, "language_loss": 0.54983306, "learning_rate": 3.5383019644981083e-06, "loss": 0.57541263, "num_input_tokens_seen": 43347180, "step": 2030, "time_per_iteration": 3.3108971118927 }, { "auxiliary_loss_clip": 0.01331585, "auxiliary_loss_mlp": 0.01198501, "balance_loss_clip": 1.01068878, "balance_loss_mlp": 1.00109029, "epoch": 0.24421331088799375, "flos": 19537103453760.0, "grad_norm": 2.0445742268988774, "language_loss": 0.73151982, "learning_rate": 3.5378040326744763e-06, "loss": 0.75682068, "num_input_tokens_seen": 43366665, "step": 2031, "time_per_iteration": 2.817505359649658 }, { "auxiliary_loss_clip": 0.01313133, "auxiliary_loss_mlp": 0.01197963, "balance_loss_clip": 1.00978172, "balance_loss_mlp": 1.00093424, "epoch": 0.24433355377863283, "flos": 21068585851680.0, "grad_norm": 2.5096862609421673, "language_loss": 0.85760492, "learning_rate": 3.5373058675671946e-06, "loss": 0.88271594, "num_input_tokens_seen": 43384670, "step": 2032, "time_per_iteration": 2.900080919265747 }, { "auxiliary_loss_clip": 0.0130461, "auxiliary_loss_mlp": 0.01198382, "balance_loss_clip": 1.01008272, "balance_loss_mlp": 1.00116229, "epoch": 0.24445379666927192, "flos": 22637200125600.0, "grad_norm": 1.9360518325997342, "language_loss": 0.72637326, "learning_rate": 3.536807469251836e-06, "loss": 0.75140321, "num_input_tokens_seen": 43403825, "step": 2033, "time_per_iteration": 2.8753092288970947 }, { "auxiliary_loss_clip": 0.0133221, "auxiliary_loss_mlp": 0.01198062, "balance_loss_clip": 1.01027918, "balance_loss_mlp": 1.00103307, "epoch": 0.24457403955991103, "flos": 21251622803040.0, "grad_norm": 1.7106908517979367, "language_loss": 0.82631963, "learning_rate": 3.5363088378040055e-06, "loss": 0.85162234, "num_input_tokens_seen": 43422715, "step": 2034, "time_per_iteration": 2.820483922958374 }, { "auxiliary_loss_clip": 0.01362672, "auxiliary_loss_mlp": 0.00872489, "balance_loss_clip": 1.01150787, "balance_loss_mlp": 1.00011146, "epoch": 0.2446942824505501, "flos": 66997852567200.0, "grad_norm": 0.7496355791538704, "language_loss": 0.64369333, "learning_rate": 3.5358099732993463e-06, "loss": 0.66604483, "num_input_tokens_seen": 43481825, "step": 2035, "time_per_iteration": 3.249882221221924 }, { "auxiliary_loss_clip": 0.01352458, "auxiliary_loss_mlp": 0.01198243, "balance_loss_clip": 1.01117074, "balance_loss_mlp": 1.00102353, "epoch": 0.2448145253411892, "flos": 20411132882400.0, "grad_norm": 3.4892570508013367, "language_loss": 0.89740515, "learning_rate": 3.535310875813535e-06, "loss": 0.92291212, "num_input_tokens_seen": 43500220, "step": 2036, "time_per_iteration": 2.785916328430176 }, { "auxiliary_loss_clip": 0.0135669, "auxiliary_loss_mlp": 0.01198437, "balance_loss_clip": 1.01048326, "balance_loss_mlp": 1.00121737, "epoch": 0.2449347682318283, "flos": 28804752053280.0, "grad_norm": 2.116580270589734, "language_loss": 0.81589103, "learning_rate": 3.5348115454222843e-06, "loss": 0.84144235, "num_input_tokens_seen": 43522805, "step": 2037, "time_per_iteration": 2.8568551540374756 }, { "auxiliary_loss_clip": 0.01353037, "auxiliary_loss_mlp": 0.01198325, "balance_loss_clip": 1.01115417, "balance_loss_mlp": 1.00110567, "epoch": 0.2450550111224674, "flos": 22529001705120.0, "grad_norm": 1.8781144503616252, "language_loss": 0.86464268, "learning_rate": 3.5343119822013425e-06, "loss": 0.89015633, "num_input_tokens_seen": 43541915, "step": 2038, "time_per_iteration": 2.8181252479553223 }, { "auxiliary_loss_clip": 0.01369735, "auxiliary_loss_mlp": 0.01198807, "balance_loss_clip": 1.0118835, "balance_loss_mlp": 1.00120544, "epoch": 0.24517525401310647, "flos": 21759147244800.0, "grad_norm": 1.7339416081793373, "language_loss": 0.776057, "learning_rate": 3.533812186226493e-06, "loss": 0.80174243, "num_input_tokens_seen": 43562625, "step": 2039, "time_per_iteration": 2.849256753921509 }, { "auxiliary_loss_clip": 0.01380182, "auxiliary_loss_mlp": 0.01197772, "balance_loss_clip": 1.01102173, "balance_loss_mlp": 1.00093341, "epoch": 0.24529549690374555, "flos": 25043322654720.0, "grad_norm": 1.7181119659468922, "language_loss": 0.75816333, "learning_rate": 3.5333121575735545e-06, "loss": 0.78394282, "num_input_tokens_seen": 43582265, "step": 2040, "time_per_iteration": 2.770620584487915 }, { "auxiliary_loss_clip": 0.01335612, "auxiliary_loss_mlp": 0.01198518, "balance_loss_clip": 1.01043224, "balance_loss_mlp": 1.00129855, "epoch": 0.24541573979438466, "flos": 32123652452640.0, "grad_norm": 2.229037823201567, "language_loss": 0.75353873, "learning_rate": 3.532811896318381e-06, "loss": 0.77888, "num_input_tokens_seen": 43604335, "step": 2041, "time_per_iteration": 2.8884804248809814 }, { "auxiliary_loss_clip": 0.01320385, "auxiliary_loss_mlp": 0.01198047, "balance_loss_clip": 1.01006544, "balance_loss_mlp": 1.00082755, "epoch": 0.24553598268502375, "flos": 31357569978720.0, "grad_norm": 2.0548990806517473, "language_loss": 0.81789881, "learning_rate": 3.5323114025368615e-06, "loss": 0.84308314, "num_input_tokens_seen": 43619400, "step": 2042, "time_per_iteration": 2.8894245624542236 }, { "auxiliary_loss_clip": 0.01369498, "auxiliary_loss_mlp": 0.01197996, "balance_loss_clip": 1.01152396, "balance_loss_mlp": 1.00096655, "epoch": 0.24565622557566283, "flos": 14027471503200.0, "grad_norm": 2.438950909458398, "language_loss": 0.81722248, "learning_rate": 3.53181067630492e-06, "loss": 0.84289742, "num_input_tokens_seen": 43636870, "step": 2043, "time_per_iteration": 2.6808536052703857 }, { "auxiliary_loss_clip": 0.0134423, "auxiliary_loss_mlp": 0.01197903, "balance_loss_clip": 1.0108974, "balance_loss_mlp": 1.00087392, "epoch": 0.24577646846630194, "flos": 16581474910080.0, "grad_norm": 1.6594529673571214, "language_loss": 0.76144969, "learning_rate": 3.5313097176985175e-06, "loss": 0.78687108, "num_input_tokens_seen": 43655180, "step": 2044, "time_per_iteration": 2.8745245933532715 }, { "auxiliary_loss_clip": 0.01358462, "auxiliary_loss_mlp": 0.01198013, "balance_loss_clip": 1.01059294, "balance_loss_mlp": 1.00117457, "epoch": 0.24589671135694102, "flos": 18807434382240.0, "grad_norm": 1.7173568671705588, "language_loss": 0.80882108, "learning_rate": 3.5308085267936482e-06, "loss": 0.83438581, "num_input_tokens_seen": 43672895, "step": 2045, "time_per_iteration": 2.770035982131958 }, { "auxiliary_loss_clip": 0.01279793, "auxiliary_loss_mlp": 0.0087321, "balance_loss_clip": 1.00940669, "balance_loss_mlp": 1.0005157, "epoch": 0.2460169542475801, "flos": 19938548972160.0, "grad_norm": 1.620441390600048, "language_loss": 0.90183163, "learning_rate": 3.530307103666342e-06, "loss": 0.92336166, "num_input_tokens_seen": 43691975, "step": 2046, "time_per_iteration": 2.9149887561798096 }, { "auxiliary_loss_clip": 0.0132444, "auxiliary_loss_mlp": 0.01198324, "balance_loss_clip": 1.01043785, "balance_loss_mlp": 1.00091362, "epoch": 0.24613719713821922, "flos": 24171233104800.0, "grad_norm": 1.6054426974509486, "language_loss": 0.80162251, "learning_rate": 3.5298054483926658e-06, "loss": 0.82685018, "num_input_tokens_seen": 43712670, "step": 2047, "time_per_iteration": 2.872560977935791 }, { "auxiliary_loss_clip": 0.01368737, "auxiliary_loss_mlp": 0.01198775, "balance_loss_clip": 1.01176918, "balance_loss_mlp": 1.00117409, "epoch": 0.2462574400288583, "flos": 30221066836800.0, "grad_norm": 2.4695280585570765, "language_loss": 0.82648927, "learning_rate": 3.5293035610487187e-06, "loss": 0.85216439, "num_input_tokens_seen": 43732035, "step": 2048, "time_per_iteration": 2.868119478225708 }, { "auxiliary_loss_clip": 0.01323281, "auxiliary_loss_mlp": 0.01195408, "balance_loss_clip": 1.01070619, "balance_loss_mlp": 1.00009561, "epoch": 0.24637768291949738, "flos": 68943066382080.0, "grad_norm": 0.7215925364717624, "language_loss": 0.62026119, "learning_rate": 3.5288014417106374e-06, "loss": 0.64544809, "num_input_tokens_seen": 43798055, "step": 2049, "time_per_iteration": 3.375168561935425 }, { "auxiliary_loss_clip": 0.01315238, "auxiliary_loss_mlp": 0.01198147, "balance_loss_clip": 1.01035154, "balance_loss_mlp": 1.00111759, "epoch": 0.24649792581013646, "flos": 34384013601120.0, "grad_norm": 1.831915923601295, "language_loss": 0.75461644, "learning_rate": 3.528299090454593e-06, "loss": 0.77975035, "num_input_tokens_seen": 43818590, "step": 2050, "time_per_iteration": 2.8909354209899902 }, { "auxiliary_loss_clip": 0.01368078, "auxiliary_loss_mlp": 0.01198289, "balance_loss_clip": 1.01122618, "balance_loss_mlp": 1.00106955, "epoch": 0.24661816870077558, "flos": 19680457947840.0, "grad_norm": 2.08507648639999, "language_loss": 0.82397777, "learning_rate": 3.527796507356792e-06, "loss": 0.84964144, "num_input_tokens_seen": 43832480, "step": 2051, "time_per_iteration": 3.696549892425537 }, { "auxiliary_loss_clip": 0.01369013, "auxiliary_loss_mlp": 0.01197877, "balance_loss_clip": 1.0116117, "balance_loss_mlp": 1.00103903, "epoch": 0.24673841159141466, "flos": 20002287085920.0, "grad_norm": 2.638080611498367, "language_loss": 0.90005213, "learning_rate": 3.527293692493475e-06, "loss": 0.92572093, "num_input_tokens_seen": 43848345, "step": 2052, "time_per_iteration": 2.704747438430786 }, { "auxiliary_loss_clip": 0.01369691, "auxiliary_loss_mlp": 0.01198538, "balance_loss_clip": 1.01187515, "balance_loss_mlp": 1.00093651, "epoch": 0.24685865448205374, "flos": 21646602059040.0, "grad_norm": 2.2801306769722895, "language_loss": 0.73538113, "learning_rate": 3.52679064594092e-06, "loss": 0.7610634, "num_input_tokens_seen": 43865685, "step": 2053, "time_per_iteration": 4.777032375335693 }, { "auxiliary_loss_clip": 0.01331488, "auxiliary_loss_mlp": 0.01197906, "balance_loss_clip": 1.01077652, "balance_loss_mlp": 1.00087714, "epoch": 0.24697889737269285, "flos": 17960478199200.0, "grad_norm": 1.8903117276548707, "language_loss": 0.74628806, "learning_rate": 3.5262873677754375e-06, "loss": 0.77158195, "num_input_tokens_seen": 43883690, "step": 2054, "time_per_iteration": 2.715484380722046 }, { "auxiliary_loss_clip": 0.01381147, "auxiliary_loss_mlp": 0.01198037, "balance_loss_clip": 1.01161265, "balance_loss_mlp": 1.00100827, "epoch": 0.24709914026333193, "flos": 27344623589280.0, "grad_norm": 1.6618521863056932, "language_loss": 0.80462778, "learning_rate": 3.5257838580733745e-06, "loss": 0.8304196, "num_input_tokens_seen": 43903295, "step": 2055, "time_per_iteration": 2.6472651958465576 }, { "auxiliary_loss_clip": 0.0136864, "auxiliary_loss_mlp": 0.01198111, "balance_loss_clip": 1.01179874, "balance_loss_mlp": 1.00089133, "epoch": 0.24721938315397102, "flos": 19275527832480.0, "grad_norm": 1.9385909248043611, "language_loss": 0.87167507, "learning_rate": 3.5252801169111138e-06, "loss": 0.89734262, "num_input_tokens_seen": 43920960, "step": 2056, "time_per_iteration": 2.6611392498016357 }, { "auxiliary_loss_clip": 0.01332872, "auxiliary_loss_mlp": 0.01198077, "balance_loss_clip": 1.01070547, "balance_loss_mlp": 1.00123835, "epoch": 0.2473396260446101, "flos": 23185808048160.0, "grad_norm": 1.7731857196283838, "language_loss": 0.80058485, "learning_rate": 3.524776144365072e-06, "loss": 0.82589436, "num_input_tokens_seen": 43939415, "step": 2057, "time_per_iteration": 2.757037878036499 }, { "auxiliary_loss_clip": 0.01330354, "auxiliary_loss_mlp": 0.01197879, "balance_loss_clip": 1.01104867, "balance_loss_mlp": 1.00104046, "epoch": 0.2474598689352492, "flos": 21142454443200.0, "grad_norm": 2.1801855935675243, "language_loss": 0.7922529, "learning_rate": 3.5242719405117016e-06, "loss": 0.81753522, "num_input_tokens_seen": 43959220, "step": 2058, "time_per_iteration": 2.7834603786468506 }, { "auxiliary_loss_clip": 0.01342543, "auxiliary_loss_mlp": 0.00873142, "balance_loss_clip": 1.01135516, "balance_loss_mlp": 1.00045729, "epoch": 0.2475801118258883, "flos": 21648362319360.0, "grad_norm": 2.4375707976984864, "language_loss": 0.74976408, "learning_rate": 3.5237675054274893e-06, "loss": 0.77192086, "num_input_tokens_seen": 43978420, "step": 2059, "time_per_iteration": 2.762509822845459 }, { "auxiliary_loss_clip": 0.01369521, "auxiliary_loss_mlp": 0.01198333, "balance_loss_clip": 1.01193345, "balance_loss_mlp": 1.00111318, "epoch": 0.24770035471652738, "flos": 22674511620000.0, "grad_norm": 1.7851146730834793, "language_loss": 0.80212581, "learning_rate": 3.5232628391889584e-06, "loss": 0.82780439, "num_input_tokens_seen": 43996710, "step": 2060, "time_per_iteration": 2.7498178482055664 }, { "auxiliary_loss_clip": 0.01296139, "auxiliary_loss_mlp": 0.01197726, "balance_loss_clip": 1.00959194, "balance_loss_mlp": 1.00088763, "epoch": 0.2478205976071665, "flos": 22163825894400.0, "grad_norm": 2.1397329444355866, "language_loss": 0.64414221, "learning_rate": 3.522757941872666e-06, "loss": 0.66908085, "num_input_tokens_seen": 44014865, "step": 2061, "time_per_iteration": 2.777998685836792 }, { "auxiliary_loss_clip": 0.01380552, "auxiliary_loss_mlp": 0.00873186, "balance_loss_clip": 1.01151907, "balance_loss_mlp": 1.0005399, "epoch": 0.24794084049780557, "flos": 24973118278560.0, "grad_norm": 1.5042415400038118, "language_loss": 0.82655913, "learning_rate": 3.5222528135552042e-06, "loss": 0.84909654, "num_input_tokens_seen": 44036325, "step": 2062, "time_per_iteration": 2.6715891361236572 }, { "auxiliary_loss_clip": 0.01354792, "auxiliary_loss_mlp": 0.01198364, "balance_loss_clip": 1.01067734, "balance_loss_mlp": 1.00114393, "epoch": 0.24806108338844465, "flos": 18296389419840.0, "grad_norm": 1.7239534573992294, "language_loss": 0.80236131, "learning_rate": 3.521747454313201e-06, "loss": 0.82789284, "num_input_tokens_seen": 44055005, "step": 2063, "time_per_iteration": 2.6515190601348877 }, { "auxiliary_loss_clip": 0.01331149, "auxiliary_loss_mlp": 0.0119794, "balance_loss_clip": 1.01051736, "balance_loss_mlp": 1.00091124, "epoch": 0.24818132627908374, "flos": 19282173713280.0, "grad_norm": 3.7426101975864334, "language_loss": 0.66994584, "learning_rate": 3.521241864223319e-06, "loss": 0.69523674, "num_input_tokens_seen": 44073965, "step": 2064, "time_per_iteration": 2.7568631172180176 }, { "auxiliary_loss_clip": 0.01334521, "auxiliary_loss_mlp": 0.01194665, "balance_loss_clip": 1.01010776, "balance_loss_mlp": 1.00011575, "epoch": 0.24830156916972285, "flos": 70285871810880.0, "grad_norm": 0.7890299263323565, "language_loss": 0.61970192, "learning_rate": 3.5207360433622552e-06, "loss": 0.64499378, "num_input_tokens_seen": 44135965, "step": 2065, "time_per_iteration": 3.3294472694396973 }, { "auxiliary_loss_clip": 0.01343579, "auxiliary_loss_mlp": 0.01197714, "balance_loss_clip": 1.01102364, "balance_loss_mlp": 1.00087595, "epoch": 0.24842181206036193, "flos": 40409145050400.0, "grad_norm": 1.6451390407888735, "language_loss": 0.74609101, "learning_rate": 3.5202299918067437e-06, "loss": 0.77150393, "num_input_tokens_seen": 44159560, "step": 2066, "time_per_iteration": 2.9540748596191406 }, { "auxiliary_loss_clip": 0.01359185, "auxiliary_loss_mlp": 0.01197791, "balance_loss_clip": 1.01073146, "balance_loss_mlp": 1.00095296, "epoch": 0.248542054951001, "flos": 20082442321440.0, "grad_norm": 2.5278847835552414, "language_loss": 0.69278455, "learning_rate": 3.519723709633551e-06, "loss": 0.71835428, "num_input_tokens_seen": 44178320, "step": 2067, "time_per_iteration": 2.727260112762451 }, { "auxiliary_loss_clip": 0.01345056, "auxiliary_loss_mlp": 0.01197749, "balance_loss_clip": 1.01116788, "balance_loss_mlp": 1.00091052, "epoch": 0.24866229784164012, "flos": 23513959753920.0, "grad_norm": 1.8371315363261222, "language_loss": 0.83500063, "learning_rate": 3.519217196919479e-06, "loss": 0.86042863, "num_input_tokens_seen": 44197305, "step": 2068, "time_per_iteration": 2.7367424964904785 }, { "auxiliary_loss_clip": 0.01341718, "auxiliary_loss_mlp": 0.01197989, "balance_loss_clip": 1.01067686, "balance_loss_mlp": 1.0009594, "epoch": 0.2487825407322792, "flos": 19865111464800.0, "grad_norm": 1.6264435930028467, "language_loss": 0.73155856, "learning_rate": 3.518710453741367e-06, "loss": 0.75695562, "num_input_tokens_seen": 44216505, "step": 2069, "time_per_iteration": 2.850971221923828 }, { "auxiliary_loss_clip": 0.01344488, "auxiliary_loss_mlp": 0.00873156, "balance_loss_clip": 1.01075244, "balance_loss_mlp": 1.00040126, "epoch": 0.2489027836229183, "flos": 22017633429600.0, "grad_norm": 2.3953152898915198, "language_loss": 0.67739236, "learning_rate": 3.518203480176086e-06, "loss": 0.69956881, "num_input_tokens_seen": 44235435, "step": 2070, "time_per_iteration": 2.7544736862182617 }, { "auxiliary_loss_clip": 0.01282381, "auxiliary_loss_mlp": 0.011982, "balance_loss_clip": 1.00926483, "balance_loss_mlp": 1.00117052, "epoch": 0.2490230265135574, "flos": 23294365705440.0, "grad_norm": 1.7305006797608422, "language_loss": 0.80600685, "learning_rate": 3.517696276300545e-06, "loss": 0.83081263, "num_input_tokens_seen": 44256975, "step": 2071, "time_per_iteration": 2.9826128482818604 }, { "auxiliary_loss_clip": 0.0135463, "auxiliary_loss_mlp": 0.01198355, "balance_loss_clip": 1.01110768, "balance_loss_mlp": 1.0009439, "epoch": 0.24914326940419648, "flos": 19826794107360.0, "grad_norm": 2.509367392262262, "language_loss": 0.68973446, "learning_rate": 3.517188842191685e-06, "loss": 0.71526426, "num_input_tokens_seen": 44275125, "step": 2072, "time_per_iteration": 2.840731382369995 }, { "auxiliary_loss_clip": 0.01368072, "auxiliary_loss_mlp": 0.01198302, "balance_loss_clip": 1.01142001, "balance_loss_mlp": 1.00108218, "epoch": 0.24926351229483557, "flos": 20229281412480.0, "grad_norm": 1.6437099096831176, "language_loss": 0.73968184, "learning_rate": 3.5166811779264837e-06, "loss": 0.76534557, "num_input_tokens_seen": 44295445, "step": 2073, "time_per_iteration": 2.7316417694091797 }, { "auxiliary_loss_clip": 0.01380166, "auxiliary_loss_mlp": 0.01197816, "balance_loss_clip": 1.01116455, "balance_loss_mlp": 1.00097728, "epoch": 0.24938375518547465, "flos": 23294581247520.0, "grad_norm": 1.7841478336486214, "language_loss": 0.78006268, "learning_rate": 3.5161732835819545e-06, "loss": 0.80584246, "num_input_tokens_seen": 44314755, "step": 2074, "time_per_iteration": 2.6951756477355957 }, { "auxiliary_loss_clip": 0.01380086, "auxiliary_loss_mlp": 0.01198355, "balance_loss_clip": 1.01159179, "balance_loss_mlp": 1.0013262, "epoch": 0.24950399807611376, "flos": 17311682836800.0, "grad_norm": 1.7907537585386561, "language_loss": 0.83294684, "learning_rate": 3.515665159235143e-06, "loss": 0.85873127, "num_input_tokens_seen": 44333640, "step": 2075, "time_per_iteration": 2.686516523361206 }, { "auxiliary_loss_clip": 0.01356215, "auxiliary_loss_mlp": 0.011982, "balance_loss_clip": 1.01161373, "balance_loss_mlp": 1.00136137, "epoch": 0.24962424096675284, "flos": 19024873009920.0, "grad_norm": 1.5917056155127478, "language_loss": 0.74974453, "learning_rate": 3.5151568049631318e-06, "loss": 0.7752887, "num_input_tokens_seen": 44352355, "step": 2076, "time_per_iteration": 2.7439615726470947 }, { "auxiliary_loss_clip": 0.01380777, "auxiliary_loss_mlp": 0.01197986, "balance_loss_clip": 1.01158512, "balance_loss_mlp": 1.00095677, "epoch": 0.24974448385739192, "flos": 33398804086560.0, "grad_norm": 1.6696382910894836, "language_loss": 0.80297565, "learning_rate": 3.5146482208430385e-06, "loss": 0.82876325, "num_input_tokens_seen": 44374185, "step": 2077, "time_per_iteration": 2.775977373123169 }, { "auxiliary_loss_clip": 0.01294367, "auxiliary_loss_mlp": 0.01198289, "balance_loss_clip": 1.00921869, "balance_loss_mlp": 1.00106943, "epoch": 0.24986472674803104, "flos": 30007292424480.0, "grad_norm": 6.7000073049076665, "language_loss": 0.68271899, "learning_rate": 3.514139406952014e-06, "loss": 0.70764554, "num_input_tokens_seen": 44396210, "step": 2078, "time_per_iteration": 4.142930269241333 }, { "auxiliary_loss_clip": 0.01357448, "auxiliary_loss_mlp": 0.01197854, "balance_loss_clip": 1.01052141, "balance_loss_mlp": 1.00101566, "epoch": 0.24998496963867012, "flos": 26613086486400.0, "grad_norm": 1.7892123152020059, "language_loss": 0.83492124, "learning_rate": 3.5136303633672454e-06, "loss": 0.86047429, "num_input_tokens_seen": 44416340, "step": 2079, "time_per_iteration": 4.6555609703063965 }, { "auxiliary_loss_clip": 0.01327262, "auxiliary_loss_mlp": 0.00873166, "balance_loss_clip": 1.01048827, "balance_loss_mlp": 1.00047731, "epoch": 0.25010521252930923, "flos": 23554001448000.0, "grad_norm": 1.6570496717689456, "language_loss": 0.74847424, "learning_rate": 3.5131210901659544e-06, "loss": 0.77047855, "num_input_tokens_seen": 44438095, "step": 2080, "time_per_iteration": 2.829864978790283 }, { "auxiliary_loss_clip": 0.01321491, "auxiliary_loss_mlp": 0.01197742, "balance_loss_clip": 1.00938821, "balance_loss_mlp": 1.00090337, "epoch": 0.2502254554199483, "flos": 23441204796480.0, "grad_norm": 2.234397244322541, "language_loss": 0.82531059, "learning_rate": 3.5126115874253967e-06, "loss": 0.85050291, "num_input_tokens_seen": 44457650, "step": 2081, "time_per_iteration": 2.7845399379730225 }, { "auxiliary_loss_clip": 0.01311394, "auxiliary_loss_mlp": 0.01198261, "balance_loss_clip": 1.0098176, "balance_loss_mlp": 1.00104141, "epoch": 0.2503456983105874, "flos": 28761692770080.0, "grad_norm": 1.8547203866863093, "language_loss": 0.80882156, "learning_rate": 3.5121018552228644e-06, "loss": 0.83391804, "num_input_tokens_seen": 44476155, "step": 2082, "time_per_iteration": 2.8170552253723145 }, { "auxiliary_loss_clip": 0.01319552, "auxiliary_loss_mlp": 0.01197873, "balance_loss_clip": 1.00958323, "balance_loss_mlp": 1.00103498, "epoch": 0.2504659412012265, "flos": 18770266582560.0, "grad_norm": 1.9487010330205425, "language_loss": 0.76218116, "learning_rate": 3.5115918936356827e-06, "loss": 0.78735542, "num_input_tokens_seen": 44492910, "step": 2083, "time_per_iteration": 2.7285196781158447 }, { "auxiliary_loss_clip": 0.01331462, "auxiliary_loss_mlp": 0.01197666, "balance_loss_clip": 1.01136672, "balance_loss_mlp": 1.00082827, "epoch": 0.25058618409186556, "flos": 16873392831840.0, "grad_norm": 1.8075779387851147, "language_loss": 0.78828084, "learning_rate": 3.5110817027412123e-06, "loss": 0.81357217, "num_input_tokens_seen": 44512000, "step": 2084, "time_per_iteration": 2.860013008117676 }, { "auxiliary_loss_clip": 0.01329783, "auxiliary_loss_mlp": 0.01197604, "balance_loss_clip": 1.01037025, "balance_loss_mlp": 1.00076556, "epoch": 0.25070642698250467, "flos": 24425552142720.0, "grad_norm": 1.7898961526318224, "language_loss": 0.6890682, "learning_rate": 3.5105712826168493e-06, "loss": 0.71434206, "num_input_tokens_seen": 44531650, "step": 2085, "time_per_iteration": 2.799191474914551 }, { "auxiliary_loss_clip": 0.01366867, "auxiliary_loss_mlp": 0.00873108, "balance_loss_clip": 1.01133704, "balance_loss_mlp": 1.00056458, "epoch": 0.2508266698731437, "flos": 20260952889120.0, "grad_norm": 1.8249598081684673, "language_loss": 0.7063849, "learning_rate": 3.5100606333400235e-06, "loss": 0.72878468, "num_input_tokens_seen": 44548785, "step": 2086, "time_per_iteration": 2.752960681915283 }, { "auxiliary_loss_clip": 0.01355414, "auxiliary_loss_mlp": 0.01197658, "balance_loss_clip": 1.01132107, "balance_loss_mlp": 1.00101089, "epoch": 0.25094691276378284, "flos": 19245329226720.0, "grad_norm": 2.0797377656697815, "language_loss": 0.76913345, "learning_rate": 3.5095497549882006e-06, "loss": 0.7946642, "num_input_tokens_seen": 44567230, "step": 2087, "time_per_iteration": 2.8866732120513916 }, { "auxiliary_loss_clip": 0.01356483, "auxiliary_loss_mlp": 0.01198251, "balance_loss_clip": 1.01120448, "balance_loss_mlp": 1.00103152, "epoch": 0.25106715565442195, "flos": 26943249918240.0, "grad_norm": 2.7066885288079856, "language_loss": 0.72626001, "learning_rate": 3.50903864763888e-06, "loss": 0.75180739, "num_input_tokens_seen": 44588020, "step": 2088, "time_per_iteration": 2.776203155517578 }, { "auxiliary_loss_clip": 0.01367236, "auxiliary_loss_mlp": 0.01198466, "balance_loss_clip": 1.01107597, "balance_loss_mlp": 1.001055, "epoch": 0.251187398545061, "flos": 48359588595840.0, "grad_norm": 2.076063889543218, "language_loss": 0.75962055, "learning_rate": 3.5085273113695965e-06, "loss": 0.78527755, "num_input_tokens_seen": 44612590, "step": 2089, "time_per_iteration": 2.9292049407958984 }, { "auxiliary_loss_clip": 0.01379933, "auxiliary_loss_mlp": 0.01197662, "balance_loss_clip": 1.01148248, "balance_loss_mlp": 1.00082397, "epoch": 0.2513076414357001, "flos": 27016112646720.0, "grad_norm": 1.8752610600216795, "language_loss": 0.78680146, "learning_rate": 3.508015746257919e-06, "loss": 0.81257737, "num_input_tokens_seen": 44631630, "step": 2090, "time_per_iteration": 2.8061318397521973 }, { "auxiliary_loss_clip": 0.01322216, "auxiliary_loss_mlp": 0.01197689, "balance_loss_clip": 1.00993156, "balance_loss_mlp": 1.00085044, "epoch": 0.2514278843263392, "flos": 19463630022720.0, "grad_norm": 1.8539810706196982, "language_loss": 0.83314985, "learning_rate": 3.5075039523814518e-06, "loss": 0.85834885, "num_input_tokens_seen": 44650820, "step": 2091, "time_per_iteration": 2.8414931297302246 }, { "auxiliary_loss_clip": 0.01369081, "auxiliary_loss_mlp": 0.01198005, "balance_loss_clip": 1.01191425, "balance_loss_mlp": 1.00097609, "epoch": 0.2515481272169783, "flos": 16866100324800.0, "grad_norm": 1.9298527124236078, "language_loss": 0.82017356, "learning_rate": 3.506991929817834e-06, "loss": 0.84584451, "num_input_tokens_seen": 44667540, "step": 2092, "time_per_iteration": 2.7383415699005127 }, { "auxiliary_loss_clip": 0.01378739, "auxiliary_loss_mlp": 0.01197388, "balance_loss_clip": 1.0116837, "balance_loss_mlp": 1.00074053, "epoch": 0.2516683701076174, "flos": 23732476092000.0, "grad_norm": 1.8593539510900157, "language_loss": 0.82931757, "learning_rate": 3.506479678644738e-06, "loss": 0.85507882, "num_input_tokens_seen": 44687935, "step": 2093, "time_per_iteration": 2.8227458000183105 }, { "auxiliary_loss_clip": 0.0129307, "auxiliary_loss_mlp": 0.01197794, "balance_loss_clip": 1.00908756, "balance_loss_mlp": 1.00095582, "epoch": 0.2517886129982565, "flos": 27635966732160.0, "grad_norm": 2.825753286522511, "language_loss": 0.738783, "learning_rate": 3.505967198939873e-06, "loss": 0.76369154, "num_input_tokens_seen": 44704975, "step": 2094, "time_per_iteration": 3.139629602432251 }, { "auxiliary_loss_clip": 0.01341974, "auxiliary_loss_mlp": 0.01197556, "balance_loss_clip": 1.01060033, "balance_loss_mlp": 1.00090802, "epoch": 0.25190885588889556, "flos": 38104610984640.0, "grad_norm": 1.8392537393659736, "language_loss": 0.78151351, "learning_rate": 3.5054544907809813e-06, "loss": 0.80690885, "num_input_tokens_seen": 44725475, "step": 2095, "time_per_iteration": 2.926079750061035 }, { "auxiliary_loss_clip": 0.01329663, "auxiliary_loss_mlp": 0.00873224, "balance_loss_clip": 1.01009536, "balance_loss_mlp": 1.00062001, "epoch": 0.25202909877953467, "flos": 22269904817760.0, "grad_norm": 2.067362690501323, "language_loss": 0.80894029, "learning_rate": 3.50494155424584e-06, "loss": 0.83096915, "num_input_tokens_seen": 44744380, "step": 2096, "time_per_iteration": 2.9487664699554443 }, { "auxiliary_loss_clip": 0.0136726, "auxiliary_loss_mlp": 0.01197716, "balance_loss_clip": 1.01147676, "balance_loss_mlp": 1.00087762, "epoch": 0.2521493416701738, "flos": 21761769673440.0, "grad_norm": 1.552456452908815, "language_loss": 0.83158755, "learning_rate": 3.504428389412262e-06, "loss": 0.85723728, "num_input_tokens_seen": 44765190, "step": 2097, "time_per_iteration": 2.801116943359375 }, { "auxiliary_loss_clip": 0.01368482, "auxiliary_loss_mlp": 0.01198219, "balance_loss_clip": 1.01204419, "balance_loss_mlp": 1.00099897, "epoch": 0.25226958456081283, "flos": 27746751657600.0, "grad_norm": 2.2043515148370205, "language_loss": 0.73085588, "learning_rate": 3.5039149963580927e-06, "loss": 0.75652289, "num_input_tokens_seen": 44785210, "step": 2098, "time_per_iteration": 2.8060436248779297 }, { "auxiliary_loss_clip": 0.0132793, "auxiliary_loss_mlp": 0.01198244, "balance_loss_clip": 1.01047349, "balance_loss_mlp": 1.00102448, "epoch": 0.25238982745145194, "flos": 30732183646560.0, "grad_norm": 3.3918714268090753, "language_loss": 0.70481288, "learning_rate": 3.503401375161215e-06, "loss": 0.73007464, "num_input_tokens_seen": 44804955, "step": 2099, "time_per_iteration": 2.8292593955993652 }, { "auxiliary_loss_clip": 0.01378975, "auxiliary_loss_mlp": 0.01197906, "balance_loss_clip": 1.01126456, "balance_loss_mlp": 1.001068, "epoch": 0.252510070342091, "flos": 20266341441120.0, "grad_norm": 1.553556374190144, "language_loss": 0.8364293, "learning_rate": 3.502887525899544e-06, "loss": 0.86219811, "num_input_tokens_seen": 44823935, "step": 2100, "time_per_iteration": 2.7042489051818848 }, { "auxiliary_loss_clip": 0.01345783, "auxiliary_loss_mlp": 0.01198128, "balance_loss_clip": 1.01094508, "balance_loss_mlp": 1.0009079, "epoch": 0.2526303132327301, "flos": 22747410272160.0, "grad_norm": 1.6587664373327753, "language_loss": 0.82774013, "learning_rate": 3.50237344865103e-06, "loss": 0.85317922, "num_input_tokens_seen": 44844935, "step": 2101, "time_per_iteration": 2.845136880874634 }, { "auxiliary_loss_clip": 0.0137995, "auxiliary_loss_mlp": 0.01197821, "balance_loss_clip": 1.01150131, "balance_loss_mlp": 1.00098276, "epoch": 0.2527505561233692, "flos": 30263479493760.0, "grad_norm": 2.3394421172336743, "language_loss": 0.76184726, "learning_rate": 3.501859143493658e-06, "loss": 0.78762496, "num_input_tokens_seen": 44865565, "step": 2102, "time_per_iteration": 2.7758355140686035 }, { "auxiliary_loss_clip": 0.01360627, "auxiliary_loss_mlp": 0.01194652, "balance_loss_clip": 1.01128578, "balance_loss_mlp": 1.00010228, "epoch": 0.2528707990140083, "flos": 58492945539360.0, "grad_norm": 0.9384554790707255, "language_loss": 0.60565484, "learning_rate": 3.5013446105054488e-06, "loss": 0.63120759, "num_input_tokens_seen": 44918485, "step": 2103, "time_per_iteration": 4.028244495391846 }, { "auxiliary_loss_clip": 0.01315466, "auxiliary_loss_mlp": 0.01197778, "balance_loss_clip": 1.00998068, "balance_loss_mlp": 1.00093985, "epoch": 0.2529910419046474, "flos": 24645146191200.0, "grad_norm": 2.0718617766307306, "language_loss": 0.75008333, "learning_rate": 3.5008298497644555e-06, "loss": 0.77521574, "num_input_tokens_seen": 44937530, "step": 2104, "time_per_iteration": 3.7531049251556396 }, { "auxiliary_loss_clip": 0.0132052, "auxiliary_loss_mlp": 0.01198013, "balance_loss_clip": 1.01026368, "balance_loss_mlp": 1.00098372, "epoch": 0.2531112847952865, "flos": 23842147383360.0, "grad_norm": 1.550523893500985, "language_loss": 0.87952262, "learning_rate": 3.500314861348767e-06, "loss": 0.90470797, "num_input_tokens_seen": 44958165, "step": 2105, "time_per_iteration": 4.7618088722229 }, { "auxiliary_loss_clip": 0.01318519, "auxiliary_loss_mlp": 0.01197453, "balance_loss_clip": 1.01020467, "balance_loss_mlp": 1.00080585, "epoch": 0.25323152768592555, "flos": 16143831531360.0, "grad_norm": 2.097416871147343, "language_loss": 0.76829529, "learning_rate": 3.499799645336507e-06, "loss": 0.793455, "num_input_tokens_seen": 44975060, "step": 2106, "time_per_iteration": 2.8321452140808105 }, { "auxiliary_loss_clip": 0.01365033, "auxiliary_loss_mlp": 0.01197593, "balance_loss_clip": 1.01100159, "balance_loss_mlp": 1.00075495, "epoch": 0.25335177057656466, "flos": 28405174566240.0, "grad_norm": 1.461460232696422, "language_loss": 0.86963123, "learning_rate": 3.4992842018058336e-06, "loss": 0.89525747, "num_input_tokens_seen": 44997960, "step": 2107, "time_per_iteration": 2.8256943225860596 }, { "auxiliary_loss_clip": 0.01329939, "auxiliary_loss_mlp": 0.0119774, "balance_loss_clip": 1.01073098, "balance_loss_mlp": 1.00109184, "epoch": 0.25347201346720377, "flos": 18799674867360.0, "grad_norm": 2.251083528640981, "language_loss": 0.88603479, "learning_rate": 3.4987685308349384e-06, "loss": 0.91131157, "num_input_tokens_seen": 45015690, "step": 2108, "time_per_iteration": 2.7798244953155518 }, { "auxiliary_loss_clip": 0.01342379, "auxiliary_loss_mlp": 0.01197721, "balance_loss_clip": 1.01077366, "balance_loss_mlp": 1.00088286, "epoch": 0.2535922563578428, "flos": 15815500207200.0, "grad_norm": 2.009314468609221, "language_loss": 0.61520839, "learning_rate": 3.4982526325020497e-06, "loss": 0.64060944, "num_input_tokens_seen": 45032660, "step": 2109, "time_per_iteration": 2.799731969833374 }, { "auxiliary_loss_clip": 0.01342009, "auxiliary_loss_mlp": 0.0119763, "balance_loss_clip": 1.01045156, "balance_loss_mlp": 1.00079179, "epoch": 0.25371249924848194, "flos": 16318929349440.0, "grad_norm": 2.063408605165073, "language_loss": 0.8190468, "learning_rate": 3.4977365068854273e-06, "loss": 0.8444432, "num_input_tokens_seen": 45048280, "step": 2110, "time_per_iteration": 2.6887366771698 }, { "auxiliary_loss_clip": 0.01341549, "auxiliary_loss_mlp": 0.01197733, "balance_loss_clip": 1.01025987, "balance_loss_mlp": 1.00108504, "epoch": 0.25383274213912105, "flos": 21761625978720.0, "grad_norm": 4.189007329011586, "language_loss": 0.73638451, "learning_rate": 3.4972201540633676e-06, "loss": 0.7617774, "num_input_tokens_seen": 45067635, "step": 2111, "time_per_iteration": 2.8152551651000977 }, { "auxiliary_loss_clip": 0.01344042, "auxiliary_loss_mlp": 0.01197718, "balance_loss_clip": 1.01091695, "balance_loss_mlp": 1.00087941, "epoch": 0.2539529850297601, "flos": 21396881252160.0, "grad_norm": 1.802805258427218, "language_loss": 0.8510167, "learning_rate": 3.4967035741142008e-06, "loss": 0.87643427, "num_input_tokens_seen": 45086455, "step": 2112, "time_per_iteration": 2.7562367916107178 }, { "auxiliary_loss_clip": 0.01327456, "auxiliary_loss_mlp": 0.01196965, "balance_loss_clip": 1.01077175, "balance_loss_mlp": 1.00088978, "epoch": 0.2540732279203992, "flos": 25228479103200.0, "grad_norm": 1.8898552503831212, "language_loss": 0.82276845, "learning_rate": 3.4961867671162917e-06, "loss": 0.84801269, "num_input_tokens_seen": 45106385, "step": 2113, "time_per_iteration": 2.748420000076294 }, { "auxiliary_loss_clip": 0.01379398, "auxiliary_loss_mlp": 0.01197626, "balance_loss_clip": 1.01119113, "balance_loss_mlp": 1.00078809, "epoch": 0.2541934708110383, "flos": 19427396238720.0, "grad_norm": 11.601726149184577, "language_loss": 0.77222359, "learning_rate": 3.4956697331480402e-06, "loss": 0.79799384, "num_input_tokens_seen": 45124955, "step": 2114, "time_per_iteration": 2.702547073364258 }, { "auxiliary_loss_clip": 0.01331266, "auxiliary_loss_mlp": 0.01197516, "balance_loss_clip": 1.0104301, "balance_loss_mlp": 1.00086856, "epoch": 0.2543137137016774, "flos": 23949447711840.0, "grad_norm": 1.9958342163160456, "language_loss": 0.80088431, "learning_rate": 3.495152472287879e-06, "loss": 0.82617211, "num_input_tokens_seen": 45145665, "step": 2115, "time_per_iteration": 2.850390911102295 }, { "auxiliary_loss_clip": 0.01314025, "auxiliary_loss_mlp": 0.01197786, "balance_loss_clip": 1.00967169, "balance_loss_mlp": 1.00094748, "epoch": 0.2544339565923165, "flos": 25593295677120.0, "grad_norm": 1.7847544537524755, "language_loss": 0.74405825, "learning_rate": 3.4946349846142766e-06, "loss": 0.76917642, "num_input_tokens_seen": 45164805, "step": 2116, "time_per_iteration": 2.8058547973632812 }, { "auxiliary_loss_clip": 0.01379583, "auxiliary_loss_mlp": 0.01197431, "balance_loss_clip": 1.01121259, "balance_loss_mlp": 1.00059247, "epoch": 0.25455419948295555, "flos": 21689481723840.0, "grad_norm": 2.18774239315228, "language_loss": 0.75866604, "learning_rate": 3.4941172702057353e-06, "loss": 0.78443623, "num_input_tokens_seen": 45184865, "step": 2117, "time_per_iteration": 2.7445836067199707 }, { "auxiliary_loss_clip": 0.01335851, "auxiliary_loss_mlp": 0.01198048, "balance_loss_clip": 1.01014555, "balance_loss_mlp": 1.00120986, "epoch": 0.25467444237359466, "flos": 26250353485920.0, "grad_norm": 1.7540930947708822, "language_loss": 0.80609113, "learning_rate": 3.4935993291407924e-06, "loss": 0.83143014, "num_input_tokens_seen": 45203690, "step": 2118, "time_per_iteration": 2.7869114875793457 }, { "auxiliary_loss_clip": 0.01348137, "auxiliary_loss_mlp": 0.01197984, "balance_loss_clip": 1.01118016, "balance_loss_mlp": 1.00095439, "epoch": 0.25479468526423377, "flos": 26979699244320.0, "grad_norm": 2.136336354318054, "language_loss": 0.71507591, "learning_rate": 3.4930811614980183e-06, "loss": 0.74053705, "num_input_tokens_seen": 45225385, "step": 2119, "time_per_iteration": 2.747647762298584 }, { "auxiliary_loss_clip": 0.01367471, "auxiliary_loss_mlp": 0.01197821, "balance_loss_clip": 1.01179969, "balance_loss_mlp": 1.0009824, "epoch": 0.2549149281548728, "flos": 23475822014880.0, "grad_norm": 1.657569032771901, "language_loss": 0.79548657, "learning_rate": 3.4925627673560198e-06, "loss": 0.82113945, "num_input_tokens_seen": 45246045, "step": 2120, "time_per_iteration": 2.7638533115386963 }, { "auxiliary_loss_clip": 0.01317601, "auxiliary_loss_mlp": 0.01197661, "balance_loss_clip": 1.01013756, "balance_loss_mlp": 1.00082278, "epoch": 0.25503517104551193, "flos": 25812314946720.0, "grad_norm": 1.733574933249715, "language_loss": 0.88358903, "learning_rate": 3.4920441467934357e-06, "loss": 0.90874159, "num_input_tokens_seen": 45266560, "step": 2121, "time_per_iteration": 2.8223981857299805 }, { "auxiliary_loss_clip": 0.01316165, "auxiliary_loss_mlp": 0.01197611, "balance_loss_clip": 1.0103159, "balance_loss_mlp": 1.00096321, "epoch": 0.25515541393615104, "flos": 26645117199840.0, "grad_norm": 2.035115019426683, "language_loss": 0.83099008, "learning_rate": 3.491525299888941e-06, "loss": 0.8561278, "num_input_tokens_seen": 45285405, "step": 2122, "time_per_iteration": 2.8313820362091064 }, { "auxiliary_loss_clip": 0.01311985, "auxiliary_loss_mlp": 0.00872399, "balance_loss_clip": 1.01347816, "balance_loss_mlp": 1.00019395, "epoch": 0.2552756568267901, "flos": 65955981903840.0, "grad_norm": 1.1381923990806315, "language_loss": 0.62725377, "learning_rate": 3.491006226721244e-06, "loss": 0.64909762, "num_input_tokens_seen": 45349615, "step": 2123, "time_per_iteration": 3.48187518119812 }, { "auxiliary_loss_clip": 0.01341219, "auxiliary_loss_mlp": 0.00873004, "balance_loss_clip": 1.01067162, "balance_loss_mlp": 1.000525, "epoch": 0.2553958997174292, "flos": 17931105838080.0, "grad_norm": 1.971253711796353, "language_loss": 0.7755903, "learning_rate": 3.4904869273690882e-06, "loss": 0.79773247, "num_input_tokens_seen": 45367505, "step": 2124, "time_per_iteration": 2.7891476154327393 }, { "auxiliary_loss_clip": 0.01366877, "auxiliary_loss_mlp": 0.01197783, "balance_loss_clip": 1.01139617, "balance_loss_mlp": 1.00094509, "epoch": 0.2555161426080683, "flos": 23367803212800.0, "grad_norm": 1.8541461655731994, "language_loss": 0.89014256, "learning_rate": 3.489967401911251e-06, "loss": 0.91578919, "num_input_tokens_seen": 45386805, "step": 2125, "time_per_iteration": 2.803515672683716 }, { "auxiliary_loss_clip": 0.01379795, "auxiliary_loss_mlp": 0.0119787, "balance_loss_clip": 1.01164293, "balance_loss_mlp": 1.00084138, "epoch": 0.2556363854987074, "flos": 40625146730880.0, "grad_norm": 2.460690081660367, "language_loss": 0.69584817, "learning_rate": 3.4894476504265428e-06, "loss": 0.72162479, "num_input_tokens_seen": 45411045, "step": 2126, "time_per_iteration": 2.8459157943725586 }, { "auxiliary_loss_clip": 0.0133737, "auxiliary_loss_mlp": 0.01194659, "balance_loss_clip": 1.01136589, "balance_loss_mlp": 1.0001092, "epoch": 0.2557566283893465, "flos": 68019475484160.0, "grad_norm": 0.7344482327338092, "language_loss": 0.54451871, "learning_rate": 3.4889276729938104e-06, "loss": 0.56983894, "num_input_tokens_seen": 45469575, "step": 2127, "time_per_iteration": 3.229541540145874 }, { "auxiliary_loss_clip": 0.0132873, "auxiliary_loss_mlp": 0.0119767, "balance_loss_clip": 1.01026821, "balance_loss_mlp": 1.00083184, "epoch": 0.2558768712799856, "flos": 22635655407360.0, "grad_norm": 1.8227498117629695, "language_loss": 0.80735052, "learning_rate": 3.488407469691934e-06, "loss": 0.83261454, "num_input_tokens_seen": 45490270, "step": 2128, "time_per_iteration": 2.8221848011016846 }, { "auxiliary_loss_clip": 0.01355192, "auxiliary_loss_mlp": 0.01197942, "balance_loss_clip": 1.01148212, "balance_loss_mlp": 1.001104, "epoch": 0.25599711417062465, "flos": 26396366332320.0, "grad_norm": 1.9647601850355545, "language_loss": 0.80934173, "learning_rate": 3.487887040599828e-06, "loss": 0.83487308, "num_input_tokens_seen": 45510070, "step": 2129, "time_per_iteration": 3.7852864265441895 }, { "auxiliary_loss_clip": 0.01379158, "auxiliary_loss_mlp": 0.01198062, "balance_loss_clip": 1.01166618, "balance_loss_mlp": 1.00103307, "epoch": 0.25611735706126376, "flos": 22852052248320.0, "grad_norm": 2.2071201441858634, "language_loss": 0.76264155, "learning_rate": 3.4873663857964407e-06, "loss": 0.78841376, "num_input_tokens_seen": 45527285, "step": 2130, "time_per_iteration": 2.775716543197632 }, { "auxiliary_loss_clip": 0.01293776, "auxiliary_loss_mlp": 0.0119782, "balance_loss_clip": 1.01005173, "balance_loss_mlp": 1.00098157, "epoch": 0.2562375999519028, "flos": 23367875060160.0, "grad_norm": 2.1279400187612088, "language_loss": 0.66749918, "learning_rate": 3.4868455053607556e-06, "loss": 0.69241524, "num_input_tokens_seen": 45546900, "step": 2131, "time_per_iteration": 4.74087119102478 }, { "auxiliary_loss_clip": 0.01367662, "auxiliary_loss_mlp": 0.01197691, "balance_loss_clip": 1.01167393, "balance_loss_mlp": 1.00104308, "epoch": 0.2563578428425419, "flos": 22856973792480.0, "grad_norm": 2.107887919869775, "language_loss": 0.71300256, "learning_rate": 3.486324399371789e-06, "loss": 0.73865616, "num_input_tokens_seen": 45566200, "step": 2132, "time_per_iteration": 4.253636121749878 }, { "auxiliary_loss_clip": 0.01316465, "auxiliary_loss_mlp": 0.01197749, "balance_loss_clip": 1.01079452, "balance_loss_mlp": 1.0009104, "epoch": 0.25647808573318104, "flos": 21653894566080.0, "grad_norm": 1.8464908720504791, "language_loss": 0.782915, "learning_rate": 3.485803067908593e-06, "loss": 0.80805713, "num_input_tokens_seen": 45585710, "step": 2133, "time_per_iteration": 2.8720409870147705 }, { "auxiliary_loss_clip": 0.01295449, "auxiliary_loss_mlp": 0.01197635, "balance_loss_clip": 1.0103296, "balance_loss_mlp": 1.00079656, "epoch": 0.2565983286238201, "flos": 33730584084000.0, "grad_norm": 1.8688931912099493, "language_loss": 0.79696167, "learning_rate": 3.485281511050253e-06, "loss": 0.8218925, "num_input_tokens_seen": 45607845, "step": 2134, "time_per_iteration": 3.150195598602295 }, { "auxiliary_loss_clip": 0.01366711, "auxiliary_loss_mlp": 0.01197445, "balance_loss_clip": 1.01134288, "balance_loss_mlp": 1.00079763, "epoch": 0.2567185715144592, "flos": 16216011709920.0, "grad_norm": 2.1879601413211565, "language_loss": 0.90245116, "learning_rate": 3.484759728875889e-06, "loss": 0.92809272, "num_input_tokens_seen": 45623210, "step": 2135, "time_per_iteration": 3.038835048675537 }, { "auxiliary_loss_clip": 0.01301646, "auxiliary_loss_mlp": 0.01197886, "balance_loss_clip": 1.00999403, "balance_loss_mlp": 1.00104761, "epoch": 0.2568388144050983, "flos": 17458486004160.0, "grad_norm": 1.7106341792772626, "language_loss": 0.80710149, "learning_rate": 3.4842377214646543e-06, "loss": 0.83209682, "num_input_tokens_seen": 45641505, "step": 2136, "time_per_iteration": 2.9203081130981445 }, { "auxiliary_loss_clip": 0.01378326, "auxiliary_loss_mlp": 0.01197651, "balance_loss_clip": 1.01118243, "balance_loss_mlp": 1.00081229, "epoch": 0.25695905729573737, "flos": 20887452855360.0, "grad_norm": 1.8327762324277328, "language_loss": 0.66289872, "learning_rate": 3.483715488895737e-06, "loss": 0.68865848, "num_input_tokens_seen": 45661835, "step": 2137, "time_per_iteration": 2.772294521331787 }, { "auxiliary_loss_clip": 0.01303522, "auxiliary_loss_mlp": 0.01197359, "balance_loss_clip": 1.00990009, "balance_loss_mlp": 1.00071156, "epoch": 0.2570793001863765, "flos": 24717290446080.0, "grad_norm": 1.8896042487561293, "language_loss": 0.78559804, "learning_rate": 3.48319303124836e-06, "loss": 0.81060696, "num_input_tokens_seen": 45682215, "step": 2138, "time_per_iteration": 3.0071144104003906 }, { "auxiliary_loss_clip": 0.01328322, "auxiliary_loss_mlp": 0.0119757, "balance_loss_clip": 1.01028013, "balance_loss_mlp": 1.00092268, "epoch": 0.2571995430770156, "flos": 26906908363200.0, "grad_norm": 2.16389215810541, "language_loss": 0.67392337, "learning_rate": 3.4826703486017798e-06, "loss": 0.69918227, "num_input_tokens_seen": 45701840, "step": 2139, "time_per_iteration": 2.918370485305786 }, { "auxiliary_loss_clip": 0.01354314, "auxiliary_loss_mlp": 0.01197742, "balance_loss_clip": 1.01043284, "balance_loss_mlp": 1.00090396, "epoch": 0.25731978596765465, "flos": 19792572049440.0, "grad_norm": 1.5606160614748155, "language_loss": 0.76993334, "learning_rate": 3.4821474410352867e-06, "loss": 0.79545391, "num_input_tokens_seen": 45720500, "step": 2140, "time_per_iteration": 2.7996606826782227 }, { "auxiliary_loss_clip": 0.01305682, "auxiliary_loss_mlp": 0.01194566, "balance_loss_clip": 1.01136994, "balance_loss_mlp": 1.00001669, "epoch": 0.25744002885829376, "flos": 70565000902560.0, "grad_norm": 0.9049350281980427, "language_loss": 0.62785703, "learning_rate": 3.481624308628205e-06, "loss": 0.65285951, "num_input_tokens_seen": 45781870, "step": 2141, "time_per_iteration": 3.534980535507202 }, { "auxiliary_loss_clip": 0.0134232, "auxiliary_loss_mlp": 0.01197703, "balance_loss_clip": 1.01069069, "balance_loss_mlp": 1.00086474, "epoch": 0.25756027174893287, "flos": 18038190624480.0, "grad_norm": 2.944387422518158, "language_loss": 1.00198197, "learning_rate": 3.481100951459893e-06, "loss": 1.02738225, "num_input_tokens_seen": 45794890, "step": 2142, "time_per_iteration": 2.8793554306030273 }, { "auxiliary_loss_clip": 0.01354282, "auxiliary_loss_mlp": 0.01197729, "balance_loss_clip": 1.01031172, "balance_loss_mlp": 1.00089085, "epoch": 0.2576805146395719, "flos": 22674080535840.0, "grad_norm": 1.530697411590408, "language_loss": 0.78835869, "learning_rate": 3.4805773696097453e-06, "loss": 0.81387877, "num_input_tokens_seen": 45815780, "step": 2143, "time_per_iteration": 2.815049171447754 }, { "auxiliary_loss_clip": 0.01327791, "auxiliary_loss_mlp": 0.01197428, "balance_loss_clip": 1.01098919, "balance_loss_mlp": 1.00078058, "epoch": 0.25780075753021103, "flos": 16472234702880.0, "grad_norm": 1.9437949893971678, "language_loss": 0.87642384, "learning_rate": 3.4800535631571874e-06, "loss": 0.90167606, "num_input_tokens_seen": 45831310, "step": 2144, "time_per_iteration": 2.840891122817993 }, { "auxiliary_loss_clip": 0.01342426, "auxiliary_loss_mlp": 0.01198043, "balance_loss_clip": 1.01023161, "balance_loss_mlp": 1.00120449, "epoch": 0.25792100042085014, "flos": 22820308924320.0, "grad_norm": 2.155070495651602, "language_loss": 0.76548135, "learning_rate": 3.4795295321816804e-06, "loss": 0.79088604, "num_input_tokens_seen": 45850135, "step": 2145, "time_per_iteration": 3.0117270946502686 }, { "auxiliary_loss_clip": 0.01342134, "auxiliary_loss_mlp": 0.01197629, "balance_loss_clip": 1.01040947, "balance_loss_mlp": 1.00079072, "epoch": 0.2580412433114892, "flos": 18697296083040.0, "grad_norm": 2.1256714991186216, "language_loss": 0.91176248, "learning_rate": 3.47900527676272e-06, "loss": 0.93716007, "num_input_tokens_seen": 45868470, "step": 2146, "time_per_iteration": 2.8965375423431396 }, { "auxiliary_loss_clip": 0.01379346, "auxiliary_loss_mlp": 0.01197657, "balance_loss_clip": 1.01219583, "balance_loss_mlp": 1.00081849, "epoch": 0.2581614862021283, "flos": 14283155640960.0, "grad_norm": 1.8930388932763793, "language_loss": 0.88398898, "learning_rate": 3.478480796979835e-06, "loss": 0.90975904, "num_input_tokens_seen": 45886355, "step": 2147, "time_per_iteration": 2.7321434020996094 }, { "auxiliary_loss_clip": 0.01329431, "auxiliary_loss_mlp": 0.01197648, "balance_loss_clip": 1.00943911, "balance_loss_mlp": 1.00080967, "epoch": 0.25828172909276736, "flos": 29498295111840.0, "grad_norm": 1.568930888812678, "language_loss": 0.77765155, "learning_rate": 3.4779560929125894e-06, "loss": 0.80292237, "num_input_tokens_seen": 45907900, "step": 2148, "time_per_iteration": 2.8954715728759766 }, { "auxiliary_loss_clip": 0.0129849, "auxiliary_loss_mlp": 0.01194708, "balance_loss_clip": 1.01121509, "balance_loss_mlp": 1.00015831, "epoch": 0.2584019719834065, "flos": 67114421205120.0, "grad_norm": 0.665425883880195, "language_loss": 0.56933612, "learning_rate": 3.4774311646405783e-06, "loss": 0.59426808, "num_input_tokens_seen": 45977805, "step": 2149, "time_per_iteration": 3.4535531997680664 }, { "auxiliary_loss_clip": 0.01329756, "auxiliary_loss_mlp": 0.01197677, "balance_loss_clip": 1.01076365, "balance_loss_mlp": 1.00083852, "epoch": 0.2585222148740456, "flos": 22893566813280.0, "grad_norm": 2.1597150969202703, "language_loss": 0.83398694, "learning_rate": 3.476906012243435e-06, "loss": 0.85926127, "num_input_tokens_seen": 45996715, "step": 2150, "time_per_iteration": 2.760258197784424 }, { "auxiliary_loss_clip": 0.01367102, "auxiliary_loss_mlp": 0.01198236, "balance_loss_clip": 1.01190281, "balance_loss_mlp": 1.00120699, "epoch": 0.25864245776468464, "flos": 28909214411040.0, "grad_norm": 1.6392553948015995, "language_loss": 0.81085873, "learning_rate": 3.476380635800824e-06, "loss": 0.83651209, "num_input_tokens_seen": 46017915, "step": 2151, "time_per_iteration": 2.8027749061584473 }, { "auxiliary_loss_clip": 0.01330137, "auxiliary_loss_mlp": 0.01197619, "balance_loss_clip": 1.01001811, "balance_loss_mlp": 1.00097167, "epoch": 0.25876270065532375, "flos": 14793194740320.0, "grad_norm": 1.9459488670217513, "language_loss": 0.86050045, "learning_rate": 3.475855035392444e-06, "loss": 0.88577807, "num_input_tokens_seen": 46033235, "step": 2152, "time_per_iteration": 2.7102322578430176 }, { "auxiliary_loss_clip": 0.01269798, "auxiliary_loss_mlp": 0.01197966, "balance_loss_clip": 1.0085665, "balance_loss_mlp": 1.00093734, "epoch": 0.25888294354596286, "flos": 60467841819360.0, "grad_norm": 1.813882256079225, "language_loss": 0.71070457, "learning_rate": 3.475329211098029e-06, "loss": 0.7353822, "num_input_tokens_seen": 46056390, "step": 2153, "time_per_iteration": 3.3118209838867188 }, { "auxiliary_loss_clip": 0.01297562, "auxiliary_loss_mlp": 0.01198149, "balance_loss_clip": 1.00925303, "balance_loss_mlp": 1.00111961, "epoch": 0.2590031864366019, "flos": 27851178091680.0, "grad_norm": 1.6047027842403554, "language_loss": 0.82483292, "learning_rate": 3.4748031629973453e-06, "loss": 0.8497901, "num_input_tokens_seen": 46077120, "step": 2154, "time_per_iteration": 2.9620649814605713 }, { "auxiliary_loss_clip": 0.01279713, "auxiliary_loss_mlp": 0.01194603, "balance_loss_clip": 1.0100621, "balance_loss_mlp": 1.00005305, "epoch": 0.25912342932724103, "flos": 62422480847520.0, "grad_norm": 0.9088101811597885, "language_loss": 0.56663626, "learning_rate": 3.4742768911701944e-06, "loss": 0.59137946, "num_input_tokens_seen": 46139815, "step": 2155, "time_per_iteration": 4.517101526260376 }, { "auxiliary_loss_clip": 0.01355273, "auxiliary_loss_mlp": 0.01198384, "balance_loss_clip": 1.01079035, "balance_loss_mlp": 1.00135505, "epoch": 0.25924367221788014, "flos": 12378845688480.0, "grad_norm": 2.8497480197422256, "language_loss": 0.7023896, "learning_rate": 3.4737503956964113e-06, "loss": 0.72792614, "num_input_tokens_seen": 46152120, "step": 2156, "time_per_iteration": 2.7500035762786865 }, { "auxiliary_loss_clip": 0.01341711, "auxiliary_loss_mlp": 0.01197932, "balance_loss_clip": 1.01096249, "balance_loss_mlp": 1.00090289, "epoch": 0.2593639151085192, "flos": 14575217257440.0, "grad_norm": 1.9338648113353942, "language_loss": 0.67227292, "learning_rate": 3.473223676655865e-06, "loss": 0.69766933, "num_input_tokens_seen": 46170120, "step": 2157, "time_per_iteration": 4.956044435501099 }, { "auxiliary_loss_clip": 0.01354861, "auxiliary_loss_mlp": 0.01197915, "balance_loss_clip": 1.01138353, "balance_loss_mlp": 1.0010767, "epoch": 0.2594841579991583, "flos": 15230945890080.0, "grad_norm": 1.6797674789064418, "language_loss": 0.79671359, "learning_rate": 3.472696734128459e-06, "loss": 0.82224137, "num_input_tokens_seen": 46187985, "step": 2158, "time_per_iteration": 3.8817808628082275 }, { "auxiliary_loss_clip": 0.01355434, "auxiliary_loss_mlp": 0.01197855, "balance_loss_clip": 1.01053333, "balance_loss_mlp": 1.00082564, "epoch": 0.2596044008897974, "flos": 23623595121600.0, "grad_norm": 2.5449935504622236, "language_loss": 0.75531912, "learning_rate": 3.4721695681941286e-06, "loss": 0.78085202, "num_input_tokens_seen": 46207025, "step": 2159, "time_per_iteration": 2.7261340618133545 }, { "auxiliary_loss_clip": 0.01340886, "auxiliary_loss_mlp": 0.00872975, "balance_loss_clip": 1.01050663, "balance_loss_mlp": 1.00024569, "epoch": 0.25972464378043647, "flos": 13772290296960.0, "grad_norm": 2.1950535843242363, "language_loss": 0.82659537, "learning_rate": 3.471642178932845e-06, "loss": 0.84873402, "num_input_tokens_seen": 46225670, "step": 2160, "time_per_iteration": 2.7391879558563232 }, { "auxiliary_loss_clip": 0.0134129, "auxiliary_loss_mlp": 0.01197672, "balance_loss_clip": 1.01045215, "balance_loss_mlp": 1.00083399, "epoch": 0.2598448866710756, "flos": 19573588703520.0, "grad_norm": 2.039565444366398, "language_loss": 0.89963293, "learning_rate": 3.471114566424613e-06, "loss": 0.9250226, "num_input_tokens_seen": 46244130, "step": 2161, "time_per_iteration": 2.786407947540283 }, { "auxiliary_loss_clip": 0.01332057, "auxiliary_loss_mlp": 0.01197794, "balance_loss_clip": 1.01027858, "balance_loss_mlp": 1.00095582, "epoch": 0.25996512956171464, "flos": 21653247939840.0, "grad_norm": 1.8280153668456873, "language_loss": 0.75744605, "learning_rate": 3.4705867307494715e-06, "loss": 0.78274453, "num_input_tokens_seen": 46263200, "step": 2162, "time_per_iteration": 2.797800064086914 }, { "auxiliary_loss_clip": 0.01366752, "auxiliary_loss_mlp": 0.0119816, "balance_loss_clip": 1.01143837, "balance_loss_mlp": 1.0011307, "epoch": 0.26008537245235375, "flos": 18223490767680.0, "grad_norm": 2.2184602737406993, "language_loss": 0.84525955, "learning_rate": 3.470058671987492e-06, "loss": 0.87090868, "num_input_tokens_seen": 46281465, "step": 2163, "time_per_iteration": 2.717478036880493 }, { "auxiliary_loss_clip": 0.01367536, "auxiliary_loss_mlp": 0.0119801, "balance_loss_clip": 1.0117259, "balance_loss_mlp": 1.00098121, "epoch": 0.26020561534299286, "flos": 24645397656960.0, "grad_norm": 1.8232000847027447, "language_loss": 0.84402454, "learning_rate": 3.4695303902187805e-06, "loss": 0.86967999, "num_input_tokens_seen": 46301020, "step": 2164, "time_per_iteration": 2.7947704792022705 }, { "auxiliary_loss_clip": 0.01341945, "auxiliary_loss_mlp": 0.01197757, "balance_loss_clip": 1.01106048, "balance_loss_mlp": 1.0009191, "epoch": 0.2603258582336319, "flos": 25773674276160.0, "grad_norm": 1.96666188965171, "language_loss": 0.78490925, "learning_rate": 3.469001885523478e-06, "loss": 0.81030631, "num_input_tokens_seen": 46321740, "step": 2165, "time_per_iteration": 2.9250502586364746 }, { "auxiliary_loss_clip": 0.01378095, "auxiliary_loss_mlp": 0.01197862, "balance_loss_clip": 1.01094103, "balance_loss_mlp": 1.00102377, "epoch": 0.260446101124271, "flos": 28766326924800.0, "grad_norm": 1.543323066506392, "language_loss": 0.81026161, "learning_rate": 3.4684731579817568e-06, "loss": 0.83602124, "num_input_tokens_seen": 46342730, "step": 2166, "time_per_iteration": 2.859128475189209 }, { "auxiliary_loss_clip": 0.01268779, "auxiliary_loss_mlp": 0.01197728, "balance_loss_clip": 1.00854301, "balance_loss_mlp": 1.00088978, "epoch": 0.26056634401491013, "flos": 25666769108160.0, "grad_norm": 1.5277733095815904, "language_loss": 0.76556629, "learning_rate": 3.4679442076738247e-06, "loss": 0.79023135, "num_input_tokens_seen": 46362445, "step": 2167, "time_per_iteration": 2.953551769256592 }, { "auxiliary_loss_clip": 0.01378683, "auxiliary_loss_mlp": 0.01198278, "balance_loss_clip": 1.01128507, "balance_loss_mlp": 1.00105786, "epoch": 0.2606865869055492, "flos": 27052777514880.0, "grad_norm": 2.3316155734502235, "language_loss": 0.83617532, "learning_rate": 3.4674150346799245e-06, "loss": 0.86194491, "num_input_tokens_seen": 46382145, "step": 2168, "time_per_iteration": 2.721843957901001 }, { "auxiliary_loss_clip": 0.01332172, "auxiliary_loss_mlp": 0.01197316, "balance_loss_clip": 1.0096997, "balance_loss_mlp": 1.00066781, "epoch": 0.2608068297961883, "flos": 17712625423680.0, "grad_norm": 2.274705800990871, "language_loss": 0.79925805, "learning_rate": 3.4668856390803295e-06, "loss": 0.82455289, "num_input_tokens_seen": 46400025, "step": 2169, "time_per_iteration": 2.77852463722229 }, { "auxiliary_loss_clip": 0.01364478, "auxiliary_loss_mlp": 0.01197635, "balance_loss_clip": 1.01063144, "balance_loss_mlp": 1.00079679, "epoch": 0.2609270726868274, "flos": 18551642473440.0, "grad_norm": 1.8539966366122818, "language_loss": 0.89903188, "learning_rate": 3.4663560209553495e-06, "loss": 0.92465305, "num_input_tokens_seen": 46418090, "step": 2170, "time_per_iteration": 2.692530632019043 }, { "auxiliary_loss_clip": 0.01345059, "auxiliary_loss_mlp": 0.01197714, "balance_loss_clip": 1.01012087, "balance_loss_mlp": 1.00087571, "epoch": 0.26104731557746647, "flos": 21835710112320.0, "grad_norm": 1.6796224306550256, "language_loss": 0.7915349, "learning_rate": 3.4658261803853267e-06, "loss": 0.81696266, "num_input_tokens_seen": 46436015, "step": 2171, "time_per_iteration": 3.06817889213562 }, { "auxiliary_loss_clip": 0.01327705, "auxiliary_loss_mlp": 0.01197658, "balance_loss_clip": 1.01050484, "balance_loss_mlp": 1.00081968, "epoch": 0.2611675584681056, "flos": 21689661342240.0, "grad_norm": 2.330247113983471, "language_loss": 0.80945706, "learning_rate": 3.4652961174506383e-06, "loss": 0.83471066, "num_input_tokens_seen": 46455885, "step": 2172, "time_per_iteration": 2.8079543113708496 }, { "auxiliary_loss_clip": 0.01333003, "auxiliary_loss_mlp": 0.01194771, "balance_loss_clip": 1.0115819, "balance_loss_mlp": 1.00022149, "epoch": 0.2612878013587447, "flos": 71862137828640.0, "grad_norm": 1.2452701472165433, "language_loss": 0.58120036, "learning_rate": 3.464765832231694e-06, "loss": 0.6064781, "num_input_tokens_seen": 46510050, "step": 2173, "time_per_iteration": 3.3056859970092773 }, { "auxiliary_loss_clip": 0.013554, "auxiliary_loss_mlp": 0.01197706, "balance_loss_clip": 1.01060247, "balance_loss_mlp": 1.00086737, "epoch": 0.26140804424938374, "flos": 20227521152160.0, "grad_norm": 1.9293595716424448, "language_loss": 0.70515805, "learning_rate": 3.4642353248089373e-06, "loss": 0.73068911, "num_input_tokens_seen": 46528810, "step": 2174, "time_per_iteration": 2.7605974674224854 }, { "auxiliary_loss_clip": 0.01342557, "auxiliary_loss_mlp": 0.01197567, "balance_loss_clip": 1.01028705, "balance_loss_mlp": 1.00072908, "epoch": 0.26152828714002285, "flos": 25557097816800.0, "grad_norm": 1.743316732139226, "language_loss": 0.80258149, "learning_rate": 3.463704595262846e-06, "loss": 0.82798278, "num_input_tokens_seen": 46549690, "step": 2175, "time_per_iteration": 2.93420147895813 }, { "auxiliary_loss_clip": 0.01314923, "auxiliary_loss_mlp": 0.01197891, "balance_loss_clip": 1.01002026, "balance_loss_mlp": 1.00105286, "epoch": 0.26164853003066196, "flos": 25446528433440.0, "grad_norm": 1.7564840748778014, "language_loss": 0.71033704, "learning_rate": 3.463173643673931e-06, "loss": 0.73546517, "num_input_tokens_seen": 46572215, "step": 2176, "time_per_iteration": 2.8756606578826904 }, { "auxiliary_loss_clip": 0.01346801, "auxiliary_loss_mlp": 0.01194719, "balance_loss_clip": 1.01104081, "balance_loss_mlp": 1.00016928, "epoch": 0.261768772921301, "flos": 53944611141600.0, "grad_norm": 0.9086691774349486, "language_loss": 0.63482589, "learning_rate": 3.4626424701227387e-06, "loss": 0.66024113, "num_input_tokens_seen": 46627275, "step": 2177, "time_per_iteration": 3.271196126937866 }, { "auxiliary_loss_clip": 0.01358671, "auxiliary_loss_mlp": 0.01194734, "balance_loss_clip": 1.01106763, "balance_loss_mlp": 1.00018454, "epoch": 0.26188901581194013, "flos": 70687640642400.0, "grad_norm": 0.8141865500731832, "language_loss": 0.55785549, "learning_rate": 3.4621110746898452e-06, "loss": 0.58338952, "num_input_tokens_seen": 46695135, "step": 2178, "time_per_iteration": 3.3576407432556152 }, { "auxiliary_loss_clip": 0.01364938, "auxiliary_loss_mlp": 0.0119754, "balance_loss_clip": 1.01113236, "balance_loss_mlp": 1.00089216, "epoch": 0.2620092587025792, "flos": 21069591714720.0, "grad_norm": 1.4989220262113299, "language_loss": 0.74493575, "learning_rate": 3.4615794574558654e-06, "loss": 0.77056056, "num_input_tokens_seen": 46714145, "step": 2179, "time_per_iteration": 2.81235671043396 }, { "auxiliary_loss_clip": 0.01340358, "auxiliary_loss_mlp": 0.01197724, "balance_loss_clip": 1.01087928, "balance_loss_mlp": 1.00088561, "epoch": 0.2621295015932183, "flos": 18369611385120.0, "grad_norm": 2.4500069158326694, "language_loss": 0.84092706, "learning_rate": 3.4610476185014436e-06, "loss": 0.86630785, "num_input_tokens_seen": 46731405, "step": 2180, "time_per_iteration": 2.7369189262390137 }, { "auxiliary_loss_clip": 0.0137819, "auxiliary_loss_mlp": 0.01197428, "balance_loss_clip": 1.01094198, "balance_loss_mlp": 1.00058985, "epoch": 0.2622497444838574, "flos": 23660008524000.0, "grad_norm": 1.7820840993210343, "language_loss": 0.79276371, "learning_rate": 3.4605155579072597e-06, "loss": 0.81851995, "num_input_tokens_seen": 46751260, "step": 2181, "time_per_iteration": 3.7001190185546875 }, { "auxiliary_loss_clip": 0.01307007, "auxiliary_loss_mlp": 0.01197607, "balance_loss_clip": 1.00996482, "balance_loss_mlp": 1.0007689, "epoch": 0.26236998737449646, "flos": 22123820124000.0, "grad_norm": 1.7386029416787088, "language_loss": 0.71441758, "learning_rate": 3.459983275754027e-06, "loss": 0.73946375, "num_input_tokens_seen": 46770155, "step": 2182, "time_per_iteration": 2.8785464763641357 }, { "auxiliary_loss_clip": 0.01377932, "auxiliary_loss_mlp": 0.01197699, "balance_loss_clip": 1.01127303, "balance_loss_mlp": 1.0006696, "epoch": 0.26249023026513557, "flos": 17895195367200.0, "grad_norm": 2.3791600882087276, "language_loss": 0.80029106, "learning_rate": 3.4594507721224918e-06, "loss": 0.82604736, "num_input_tokens_seen": 46788805, "step": 2183, "time_per_iteration": 5.4710705280303955 }, { "auxiliary_loss_clip": 0.01352015, "auxiliary_loss_mlp": 0.01197659, "balance_loss_clip": 1.01051557, "balance_loss_mlp": 1.00082111, "epoch": 0.2626104731557747, "flos": 18332946516960.0, "grad_norm": 1.6955285822603428, "language_loss": 0.81697452, "learning_rate": 3.4589180470934353e-06, "loss": 0.8424713, "num_input_tokens_seen": 46808670, "step": 2184, "time_per_iteration": 2.853785753250122 }, { "auxiliary_loss_clip": 0.01366057, "auxiliary_loss_mlp": 0.011978, "balance_loss_clip": 1.01062834, "balance_loss_mlp": 1.0009613, "epoch": 0.26273071604641374, "flos": 19317724947360.0, "grad_norm": 1.8645405816161384, "language_loss": 0.76544696, "learning_rate": 3.4583851007476713e-06, "loss": 0.79108548, "num_input_tokens_seen": 46827140, "step": 2185, "time_per_iteration": 2.7140183448791504 }, { "auxiliary_loss_clip": 0.01315819, "auxiliary_loss_mlp": 0.01197732, "balance_loss_clip": 1.00998092, "balance_loss_mlp": 1.00070274, "epoch": 0.26285095893705285, "flos": 18327486117600.0, "grad_norm": 1.9818730500173172, "language_loss": 0.68380642, "learning_rate": 3.4578519331660464e-06, "loss": 0.70894194, "num_input_tokens_seen": 46844135, "step": 2186, "time_per_iteration": 2.7792134284973145 }, { "auxiliary_loss_clip": 0.01352903, "auxiliary_loss_mlp": 0.01197616, "balance_loss_clip": 1.01072848, "balance_loss_mlp": 1.00077784, "epoch": 0.26297120182769196, "flos": 20193837949440.0, "grad_norm": 1.871058268823132, "language_loss": 0.81864226, "learning_rate": 3.4573185444294426e-06, "loss": 0.84414744, "num_input_tokens_seen": 46862500, "step": 2187, "time_per_iteration": 2.673707962036133 }, { "auxiliary_loss_clip": 0.01346149, "auxiliary_loss_mlp": 0.0087302, "balance_loss_clip": 1.01049948, "balance_loss_mlp": 1.00016665, "epoch": 0.263091444718331, "flos": 22418432321760.0, "grad_norm": 1.6619687213256733, "language_loss": 0.78985256, "learning_rate": 3.456784934618774e-06, "loss": 0.81204414, "num_input_tokens_seen": 46883665, "step": 2188, "time_per_iteration": 2.8262975215911865 }, { "auxiliary_loss_clip": 0.0135072, "auxiliary_loss_mlp": 0.01197775, "balance_loss_clip": 1.01038289, "balance_loss_mlp": 1.00093699, "epoch": 0.2632116876089701, "flos": 19024837086240.0, "grad_norm": 2.1837903006620674, "language_loss": 0.7983942, "learning_rate": 3.4562511038149897e-06, "loss": 0.82387912, "num_input_tokens_seen": 46899160, "step": 2189, "time_per_iteration": 2.73164701461792 }, { "auxiliary_loss_clip": 0.01281071, "auxiliary_loss_mlp": 0.01194678, "balance_loss_clip": 1.00853062, "balance_loss_mlp": 1.00012803, "epoch": 0.26333193049960923, "flos": 67308091565760.0, "grad_norm": 0.8625348131792174, "language_loss": 0.57806611, "learning_rate": 3.4557170520990705e-06, "loss": 0.60282362, "num_input_tokens_seen": 46959835, "step": 2190, "time_per_iteration": 3.471560001373291 }, { "auxiliary_loss_clip": 0.01352013, "auxiliary_loss_mlp": 0.01197499, "balance_loss_clip": 1.01035547, "balance_loss_mlp": 1.00085163, "epoch": 0.2634521733902483, "flos": 25048818977760.0, "grad_norm": 1.6213551529284622, "language_loss": 0.86544096, "learning_rate": 3.4551827795520324e-06, "loss": 0.89093614, "num_input_tokens_seen": 46982720, "step": 2191, "time_per_iteration": 2.7935454845428467 }, { "auxiliary_loss_clip": 0.0136517, "auxiliary_loss_mlp": 0.0119775, "balance_loss_clip": 1.01087344, "balance_loss_mlp": 1.00091147, "epoch": 0.2635724162808874, "flos": 20594996078400.0, "grad_norm": 1.6306983789907878, "language_loss": 0.84889007, "learning_rate": 3.4546482862549226e-06, "loss": 0.87451923, "num_input_tokens_seen": 47003035, "step": 2192, "time_per_iteration": 2.7922072410583496 }, { "auxiliary_loss_clip": 0.01327607, "auxiliary_loss_mlp": 0.01197572, "balance_loss_clip": 1.01037765, "balance_loss_mlp": 1.00092387, "epoch": 0.2636926591715265, "flos": 19244646676800.0, "grad_norm": 2.122561480637602, "language_loss": 0.7932241, "learning_rate": 3.4541135722888253e-06, "loss": 0.81847584, "num_input_tokens_seen": 47019625, "step": 2193, "time_per_iteration": 2.765953779220581 }, { "auxiliary_loss_clip": 0.01377526, "auxiliary_loss_mlp": 0.011974, "balance_loss_clip": 1.01067543, "balance_loss_mlp": 1.00075221, "epoch": 0.26381290206216557, "flos": 28804895748000.0, "grad_norm": 2.2207096292790522, "language_loss": 0.80040741, "learning_rate": 3.453578637734854e-06, "loss": 0.82615674, "num_input_tokens_seen": 47040815, "step": 2194, "time_per_iteration": 2.739084482192993 }, { "auxiliary_loss_clip": 0.01378401, "auxiliary_loss_mlp": 0.01197832, "balance_loss_clip": 1.01158929, "balance_loss_mlp": 1.00099373, "epoch": 0.2639331449528047, "flos": 25008920978400.0, "grad_norm": 1.6970692050433667, "language_loss": 0.78458524, "learning_rate": 3.4530434826741605e-06, "loss": 0.81034756, "num_input_tokens_seen": 47061755, "step": 2195, "time_per_iteration": 2.805600166320801 }, { "auxiliary_loss_clip": 0.01328437, "auxiliary_loss_mlp": 0.01197572, "balance_loss_clip": 1.00956345, "balance_loss_mlp": 1.00073361, "epoch": 0.26405338784344373, "flos": 46535793115680.0, "grad_norm": 1.8737012740682781, "language_loss": 0.68934846, "learning_rate": 3.452508107187926e-06, "loss": 0.71460855, "num_input_tokens_seen": 47085130, "step": 2196, "time_per_iteration": 2.9427719116210938 }, { "auxiliary_loss_clip": 0.01285769, "auxiliary_loss_mlp": 0.01197959, "balance_loss_clip": 1.00931323, "balance_loss_mlp": 1.00112069, "epoch": 0.26417363073408284, "flos": 21179478548160.0, "grad_norm": 2.5102622933674996, "language_loss": 0.77225238, "learning_rate": 3.451972511357366e-06, "loss": 0.7970897, "num_input_tokens_seen": 47104675, "step": 2197, "time_per_iteration": 2.9091176986694336 }, { "auxiliary_loss_clip": 0.0135227, "auxiliary_loss_mlp": 0.01197656, "balance_loss_clip": 1.01012075, "balance_loss_mlp": 1.00081754, "epoch": 0.26429387362472195, "flos": 22674763085760.0, "grad_norm": 1.8176800892251708, "language_loss": 0.85037893, "learning_rate": 3.45143669526373e-06, "loss": 0.87587821, "num_input_tokens_seen": 47124435, "step": 2198, "time_per_iteration": 2.8668437004089355 }, { "auxiliary_loss_clip": 0.01331527, "auxiliary_loss_mlp": 0.01194702, "balance_loss_clip": 1.01001513, "balance_loss_mlp": 1.00015283, "epoch": 0.264414116515361, "flos": 67180566205440.0, "grad_norm": 0.7818866896923293, "language_loss": 0.63221025, "learning_rate": 3.450900658988302e-06, "loss": 0.65747255, "num_input_tokens_seen": 47185985, "step": 2199, "time_per_iteration": 3.2697627544403076 }, { "auxiliary_loss_clip": 0.01341691, "auxiliary_loss_mlp": 0.01197598, "balance_loss_clip": 1.01069319, "balance_loss_mlp": 1.00075924, "epoch": 0.2645343594060001, "flos": 25664721458400.0, "grad_norm": 1.9104334411929607, "language_loss": 0.77626616, "learning_rate": 3.450364402612397e-06, "loss": 0.80165911, "num_input_tokens_seen": 47203140, "step": 2200, "time_per_iteration": 2.7721457481384277 }, { "auxiliary_loss_clip": 0.01327876, "auxiliary_loss_mlp": 0.01197633, "balance_loss_clip": 1.00951231, "balance_loss_mlp": 1.00079429, "epoch": 0.26465460229663923, "flos": 22491833905440.0, "grad_norm": 1.954354484594906, "language_loss": 0.83784652, "learning_rate": 3.449827926217366e-06, "loss": 0.8631016, "num_input_tokens_seen": 47222575, "step": 2201, "time_per_iteration": 2.735194683074951 }, { "auxiliary_loss_clip": 0.01345379, "auxiliary_loss_mlp": 0.011978, "balance_loss_clip": 1.0097158, "balance_loss_mlp": 1.00077069, "epoch": 0.2647748451872783, "flos": 29388049041600.0, "grad_norm": 1.8373926848108677, "language_loss": 0.8048532, "learning_rate": 3.449291229884591e-06, "loss": 0.83028507, "num_input_tokens_seen": 47243815, "step": 2202, "time_per_iteration": 2.882692575454712 }, { "auxiliary_loss_clip": 0.01325958, "auxiliary_loss_mlp": 0.01197736, "balance_loss_clip": 1.00990403, "balance_loss_mlp": 1.00089753, "epoch": 0.2648950880779174, "flos": 26797811850720.0, "grad_norm": 3.4692937011491836, "language_loss": 0.86760855, "learning_rate": 3.4487543136954887e-06, "loss": 0.89284551, "num_input_tokens_seen": 47263435, "step": 2203, "time_per_iteration": 2.7947943210601807 }, { "auxiliary_loss_clip": 0.01313791, "auxiliary_loss_mlp": 0.01197642, "balance_loss_clip": 1.00902033, "balance_loss_mlp": 1.00080395, "epoch": 0.2650153309685565, "flos": 28841021760960.0, "grad_norm": 1.6092392218697873, "language_loss": 0.91088521, "learning_rate": 3.448217177731509e-06, "loss": 0.93599957, "num_input_tokens_seen": 47283920, "step": 2204, "time_per_iteration": 2.8888351917266846 }, { "auxiliary_loss_clip": 0.01326603, "auxiliary_loss_mlp": 0.01197556, "balance_loss_clip": 1.01007271, "balance_loss_mlp": 1.00090837, "epoch": 0.26513557385919556, "flos": 20303257775040.0, "grad_norm": 2.251019745048885, "language_loss": 0.78660315, "learning_rate": 3.4476798220741348e-06, "loss": 0.81184477, "num_input_tokens_seen": 47302800, "step": 2205, "time_per_iteration": 2.7105519771575928 }, { "auxiliary_loss_clip": 0.01377802, "auxiliary_loss_mlp": 0.01197604, "balance_loss_clip": 1.01150417, "balance_loss_mlp": 1.00076568, "epoch": 0.26525581674983467, "flos": 17676283868640.0, "grad_norm": 2.0385377448317215, "language_loss": 0.78426278, "learning_rate": 3.4471422468048826e-06, "loss": 0.81001675, "num_input_tokens_seen": 47321525, "step": 2206, "time_per_iteration": 2.6776843070983887 }, { "auxiliary_loss_clip": 0.01366114, "auxiliary_loss_mlp": 0.0119779, "balance_loss_clip": 1.011572, "balance_loss_mlp": 1.00095129, "epoch": 0.2653760596404738, "flos": 26833758245280.0, "grad_norm": 2.1639835473305373, "language_loss": 0.73342633, "learning_rate": 3.4466044520053022e-06, "loss": 0.75906539, "num_input_tokens_seen": 47340530, "step": 2207, "time_per_iteration": 3.7959156036376953 }, { "auxiliary_loss_clip": 0.01353396, "auxiliary_loss_mlp": 0.01197248, "balance_loss_clip": 1.01084173, "balance_loss_mlp": 1.00079119, "epoch": 0.26549630253111284, "flos": 22782171185280.0, "grad_norm": 1.7857400885755939, "language_loss": 0.60471338, "learning_rate": 3.446066437756977e-06, "loss": 0.63021982, "num_input_tokens_seen": 47359735, "step": 2208, "time_per_iteration": 2.728262424468994 }, { "auxiliary_loss_clip": 0.0132997, "auxiliary_loss_mlp": 0.01197663, "balance_loss_clip": 1.00935686, "balance_loss_mlp": 1.00082505, "epoch": 0.26561654542175195, "flos": 23550013919520.0, "grad_norm": 2.1079544819577176, "language_loss": 0.75276595, "learning_rate": 3.4455282041415224e-06, "loss": 0.77804232, "num_input_tokens_seen": 47378945, "step": 2209, "time_per_iteration": 4.651750087738037 }, { "auxiliary_loss_clip": 0.01314451, "auxiliary_loss_mlp": 0.01197498, "balance_loss_clip": 1.00857687, "balance_loss_mlp": 1.00084996, "epoch": 0.265736788312391, "flos": 26906692821120.0, "grad_norm": 2.2317900650943576, "language_loss": 0.87186557, "learning_rate": 3.4449897512405894e-06, "loss": 0.89698505, "num_input_tokens_seen": 47398095, "step": 2210, "time_per_iteration": 3.8046884536743164 }, { "auxiliary_loss_clip": 0.01292584, "auxiliary_loss_mlp": 0.0087296, "balance_loss_clip": 1.00960064, "balance_loss_mlp": 1.0002315, "epoch": 0.2658570312030301, "flos": 23477151191040.0, "grad_norm": 2.328714708220541, "language_loss": 0.75586987, "learning_rate": 3.444451079135859e-06, "loss": 0.77752531, "num_input_tokens_seen": 47417605, "step": 2211, "time_per_iteration": 2.933204412460327 }, { "auxiliary_loss_clip": 0.01303381, "auxiliary_loss_mlp": 0.00872989, "balance_loss_clip": 1.00899792, "balance_loss_mlp": 1.00018239, "epoch": 0.2659772740936692, "flos": 21866411649600.0, "grad_norm": 2.265940713496039, "language_loss": 0.74287546, "learning_rate": 3.4439121879090493e-06, "loss": 0.76463914, "num_input_tokens_seen": 47435385, "step": 2212, "time_per_iteration": 2.911763906478882 }, { "auxiliary_loss_clip": 0.01345835, "auxiliary_loss_mlp": 0.01197709, "balance_loss_clip": 1.01088691, "balance_loss_mlp": 1.00087082, "epoch": 0.2660975169843083, "flos": 19793110904640.0, "grad_norm": 1.8231866385006799, "language_loss": 0.83038759, "learning_rate": 3.4433730776419082e-06, "loss": 0.85582304, "num_input_tokens_seen": 47454310, "step": 2213, "time_per_iteration": 2.7938003540039062 }, { "auxiliary_loss_clip": 0.01365416, "auxiliary_loss_mlp": 0.00873014, "balance_loss_clip": 1.01106644, "balance_loss_mlp": 1.00029802, "epoch": 0.2662177598749474, "flos": 29018993473440.0, "grad_norm": 2.0982924198346686, "language_loss": 0.80338079, "learning_rate": 3.4428337484162183e-06, "loss": 0.82576513, "num_input_tokens_seen": 47475120, "step": 2214, "time_per_iteration": 2.84546160697937 }, { "auxiliary_loss_clip": 0.01352599, "auxiliary_loss_mlp": 0.01197659, "balance_loss_clip": 1.01089478, "balance_loss_mlp": 1.00082076, "epoch": 0.2663380027655865, "flos": 21762559994400.0, "grad_norm": 1.8597539748225884, "language_loss": 0.84120357, "learning_rate": 3.442294200313797e-06, "loss": 0.86670619, "num_input_tokens_seen": 47493150, "step": 2215, "time_per_iteration": 2.8245182037353516 }, { "auxiliary_loss_clip": 0.01358011, "auxiliary_loss_mlp": 0.01194912, "balance_loss_clip": 1.01033652, "balance_loss_mlp": 1.00036228, "epoch": 0.26645824565622556, "flos": 66980370944160.0, "grad_norm": 0.8244370802909773, "language_loss": 0.52710271, "learning_rate": 3.4417544334164916e-06, "loss": 0.55263197, "num_input_tokens_seen": 47557295, "step": 2216, "time_per_iteration": 3.318509817123413 }, { "auxiliary_loss_clip": 0.01308255, "auxiliary_loss_mlp": 0.0119795, "balance_loss_clip": 1.00900662, "balance_loss_mlp": 1.00111127, "epoch": 0.26657848854686467, "flos": 25264209955680.0, "grad_norm": 1.6762074218210554, "language_loss": 0.77503192, "learning_rate": 3.4412144478061854e-06, "loss": 0.80009395, "num_input_tokens_seen": 47579705, "step": 2217, "time_per_iteration": 2.839380979537964 }, { "auxiliary_loss_clip": 0.01236449, "auxiliary_loss_mlp": 0.01198012, "balance_loss_clip": 1.00769305, "balance_loss_mlp": 1.00098336, "epoch": 0.2666987314375038, "flos": 23696781163200.0, "grad_norm": 1.7580344549413556, "language_loss": 0.7567116, "learning_rate": 3.4406742435647925e-06, "loss": 0.78105623, "num_input_tokens_seen": 47599770, "step": 2218, "time_per_iteration": 3.0287983417510986 }, { "auxiliary_loss_clip": 0.01352885, "auxiliary_loss_mlp": 0.01197951, "balance_loss_clip": 1.00997829, "balance_loss_mlp": 1.00111294, "epoch": 0.26681897432814283, "flos": 27048969604800.0, "grad_norm": 1.8455917563912048, "language_loss": 0.79035509, "learning_rate": 3.440133820774263e-06, "loss": 0.81586343, "num_input_tokens_seen": 47619580, "step": 2219, "time_per_iteration": 2.905620574951172 }, { "auxiliary_loss_clip": 0.01340117, "auxiliary_loss_mlp": 0.01197516, "balance_loss_clip": 1.01059449, "balance_loss_mlp": 1.00086868, "epoch": 0.26693921721878194, "flos": 28985956896960.0, "grad_norm": 1.9950223008140446, "language_loss": 0.81912088, "learning_rate": 3.439593179516578e-06, "loss": 0.84449726, "num_input_tokens_seen": 47639490, "step": 2220, "time_per_iteration": 2.7908263206481934 }, { "auxiliary_loss_clip": 0.01341712, "auxiliary_loss_mlp": 0.01197713, "balance_loss_clip": 1.01037085, "balance_loss_mlp": 1.00087452, "epoch": 0.26705946010942105, "flos": 21507845796000.0, "grad_norm": 1.9891683102081776, "language_loss": 0.8076759, "learning_rate": 3.4390523198737524e-06, "loss": 0.83307016, "num_input_tokens_seen": 47658650, "step": 2221, "time_per_iteration": 2.758378267288208 }, { "auxiliary_loss_clip": 0.01377362, "auxiliary_loss_mlp": 0.00872984, "balance_loss_clip": 1.01089978, "balance_loss_mlp": 1.00030184, "epoch": 0.2671797030000601, "flos": 21471288698880.0, "grad_norm": 1.621286128705205, "language_loss": 0.7366814, "learning_rate": 3.4385112419278333e-06, "loss": 0.75918484, "num_input_tokens_seen": 47679875, "step": 2222, "time_per_iteration": 2.778589963912964 }, { "auxiliary_loss_clip": 0.01345712, "auxiliary_loss_mlp": 0.0119489, "balance_loss_clip": 1.01047421, "balance_loss_mlp": 1.00034046, "epoch": 0.2672999458906992, "flos": 64189961206560.0, "grad_norm": 0.7879967917364894, "language_loss": 0.64837527, "learning_rate": 3.4379699457609033e-06, "loss": 0.67378128, "num_input_tokens_seen": 47737700, "step": 2223, "time_per_iteration": 3.1880476474761963 }, { "auxiliary_loss_clip": 0.01342552, "auxiliary_loss_mlp": 0.01197717, "balance_loss_clip": 1.01035786, "balance_loss_mlp": 1.00087833, "epoch": 0.26742018878133833, "flos": 16909051836960.0, "grad_norm": 2.0624672012412293, "language_loss": 0.90240246, "learning_rate": 3.4374284314550755e-06, "loss": 0.92780519, "num_input_tokens_seen": 47756740, "step": 2224, "time_per_iteration": 2.7474124431610107 }, { "auxiliary_loss_clip": 0.01376819, "auxiliary_loss_mlp": 0.0119766, "balance_loss_clip": 1.01082373, "balance_loss_mlp": 1.00082207, "epoch": 0.2675404316719774, "flos": 20667571417440.0, "grad_norm": 1.89064598590356, "language_loss": 0.80815804, "learning_rate": 3.436886699092498e-06, "loss": 0.83390284, "num_input_tokens_seen": 47775255, "step": 2225, "time_per_iteration": 2.728459119796753 }, { "auxiliary_loss_clip": 0.01378023, "auxiliary_loss_mlp": 0.01198447, "balance_loss_clip": 1.01112652, "balance_loss_mlp": 1.0012275, "epoch": 0.2676606745626165, "flos": 17485020394560.0, "grad_norm": 4.34881966250911, "language_loss": 0.71469748, "learning_rate": 3.4363447487553502e-06, "loss": 0.74046218, "num_input_tokens_seen": 47788570, "step": 2226, "time_per_iteration": 2.574718713760376 }, { "auxiliary_loss_clip": 0.01327388, "auxiliary_loss_mlp": 0.01197316, "balance_loss_clip": 1.00987363, "balance_loss_mlp": 1.00066829, "epoch": 0.26778091745325555, "flos": 27852686886240.0, "grad_norm": 1.7339714398752282, "language_loss": 0.78136933, "learning_rate": 3.4358025805258455e-06, "loss": 0.80661631, "num_input_tokens_seen": 47808275, "step": 2227, "time_per_iteration": 2.86008358001709 }, { "auxiliary_loss_clip": 0.01304413, "auxiliary_loss_mlp": 0.01197733, "balance_loss_clip": 1.00958419, "balance_loss_mlp": 1.00089431, "epoch": 0.26790116034389466, "flos": 20956004742240.0, "grad_norm": 2.0695325614181743, "language_loss": 0.83371723, "learning_rate": 3.435260194486232e-06, "loss": 0.85873866, "num_input_tokens_seen": 47826245, "step": 2228, "time_per_iteration": 2.800870656967163 }, { "auxiliary_loss_clip": 0.01331063, "auxiliary_loss_mlp": 0.01197754, "balance_loss_clip": 1.00991499, "balance_loss_mlp": 1.00091565, "epoch": 0.2680214032345338, "flos": 18040669358400.0, "grad_norm": 2.1308479240214453, "language_loss": 0.82183433, "learning_rate": 3.4347175907187875e-06, "loss": 0.84712243, "num_input_tokens_seen": 47843235, "step": 2229, "time_per_iteration": 2.794731855392456 }, { "auxiliary_loss_clip": 0.01352747, "auxiliary_loss_mlp": 0.01197762, "balance_loss_clip": 1.00981152, "balance_loss_mlp": 1.00111461, "epoch": 0.26814164612517283, "flos": 22419438184800.0, "grad_norm": 1.9220863905379024, "language_loss": 0.87710547, "learning_rate": 3.4341747693058254e-06, "loss": 0.9026106, "num_input_tokens_seen": 47861710, "step": 2230, "time_per_iteration": 2.7770345211029053 }, { "auxiliary_loss_clip": 0.01241938, "auxiliary_loss_mlp": 0.01197579, "balance_loss_clip": 1.00757051, "balance_loss_mlp": 1.00093138, "epoch": 0.26826188901581194, "flos": 35627385987360.0, "grad_norm": 1.8812222439975714, "language_loss": 0.76997936, "learning_rate": 3.4336317303296916e-06, "loss": 0.79437459, "num_input_tokens_seen": 47882685, "step": 2231, "time_per_iteration": 3.162644863128662 }, { "auxiliary_loss_clip": 0.01352041, "auxiliary_loss_mlp": 0.01197597, "balance_loss_clip": 1.0102973, "balance_loss_mlp": 1.00075841, "epoch": 0.26838213190645105, "flos": 17639798618880.0, "grad_norm": 1.8674863033207085, "language_loss": 0.74913889, "learning_rate": 3.4330884738727635e-06, "loss": 0.7746352, "num_input_tokens_seen": 47900860, "step": 2232, "time_per_iteration": 3.5756921768188477 }, { "auxiliary_loss_clip": 0.01297282, "auxiliary_loss_mlp": 0.01197519, "balance_loss_clip": 1.00904882, "balance_loss_mlp": 1.0008713, "epoch": 0.2685023747970901, "flos": 22674834933120.0, "grad_norm": 1.8135130963791453, "language_loss": 0.71080643, "learning_rate": 3.4325450000174535e-06, "loss": 0.73575443, "num_input_tokens_seen": 47917500, "step": 2233, "time_per_iteration": 4.063469648361206 }, { "auxiliary_loss_clip": 0.01295226, "auxiliary_loss_mlp": 0.0119767, "balance_loss_clip": 1.00908613, "balance_loss_mlp": 1.0010221, "epoch": 0.2686226176877292, "flos": 20120544136800.0, "grad_norm": 1.6954304467107504, "language_loss": 0.74043345, "learning_rate": 3.4320013088462067e-06, "loss": 0.76536244, "num_input_tokens_seen": 47934860, "step": 2234, "time_per_iteration": 2.8581228256225586 }, { "auxiliary_loss_clip": 0.01329757, "auxiliary_loss_mlp": 0.01197409, "balance_loss_clip": 1.01001191, "balance_loss_mlp": 1.00095201, "epoch": 0.2687428605783683, "flos": 21872051667360.0, "grad_norm": 1.3918990815430554, "language_loss": 0.81815076, "learning_rate": 3.431457400441499e-06, "loss": 0.84342241, "num_input_tokens_seen": 47955255, "step": 2235, "time_per_iteration": 3.6953160762786865 }, { "auxiliary_loss_clip": 0.01252765, "auxiliary_loss_mlp": 0.0119487, "balance_loss_clip": 1.00837362, "balance_loss_mlp": 1.00031996, "epoch": 0.2688631034690074, "flos": 69943350633120.0, "grad_norm": 0.9088721420402027, "language_loss": 0.60908484, "learning_rate": 3.4309132748858424e-06, "loss": 0.63356125, "num_input_tokens_seen": 48016245, "step": 2236, "time_per_iteration": 4.495317697525024 }, { "auxiliary_loss_clip": 0.01352049, "auxiliary_loss_mlp": 0.01197713, "balance_loss_clip": 1.01036739, "balance_loss_mlp": 1.00087452, "epoch": 0.2689833463596465, "flos": 22856650479360.0, "grad_norm": 1.5841191879162995, "language_loss": 0.83553296, "learning_rate": 3.430368932261779e-06, "loss": 0.86103058, "num_input_tokens_seen": 48036600, "step": 2237, "time_per_iteration": 3.1628360748291016 }, { "auxiliary_loss_clip": 0.01328455, "auxiliary_loss_mlp": 0.01197272, "balance_loss_clip": 1.00917816, "balance_loss_mlp": 1.00062394, "epoch": 0.2691035892502856, "flos": 17200251285120.0, "grad_norm": 1.8733627827174892, "language_loss": 0.74960804, "learning_rate": 3.429824372651886e-06, "loss": 0.77486533, "num_input_tokens_seen": 48054750, "step": 2238, "time_per_iteration": 2.7201547622680664 }, { "auxiliary_loss_clip": 0.0129985, "auxiliary_loss_mlp": 0.01198189, "balance_loss_clip": 1.00882387, "balance_loss_mlp": 1.00115955, "epoch": 0.26922383214092466, "flos": 17747494107840.0, "grad_norm": 2.00383604729827, "language_loss": 0.83463621, "learning_rate": 3.4292795961387732e-06, "loss": 0.85961658, "num_input_tokens_seen": 48072650, "step": 2239, "time_per_iteration": 2.802680015563965 }, { "auxiliary_loss_clip": 0.01376721, "auxiliary_loss_mlp": 0.01197649, "balance_loss_clip": 1.01061833, "balance_loss_mlp": 1.00081038, "epoch": 0.26934407503156377, "flos": 16173383510880.0, "grad_norm": 2.0917663290435726, "language_loss": 0.8771041, "learning_rate": 3.4287346028050818e-06, "loss": 0.90284783, "num_input_tokens_seen": 48088720, "step": 2240, "time_per_iteration": 2.696925401687622 }, { "auxiliary_loss_clip": 0.01339184, "auxiliary_loss_mlp": 0.01196835, "balance_loss_clip": 1.01009607, "balance_loss_mlp": 1.00075912, "epoch": 0.2694643179222028, "flos": 23732907176160.0, "grad_norm": 1.5918791434415145, "language_loss": 0.79510379, "learning_rate": 3.4281893927334866e-06, "loss": 0.82046396, "num_input_tokens_seen": 48108630, "step": 2241, "time_per_iteration": 2.816909074783325 }, { "auxiliary_loss_clip": 0.01355102, "auxiliary_loss_mlp": 0.011972, "balance_loss_clip": 1.01022255, "balance_loss_mlp": 1.00074363, "epoch": 0.26958456081284193, "flos": 24718152614400.0, "grad_norm": 1.8983619145124344, "language_loss": 0.75026596, "learning_rate": 3.4276439660066963e-06, "loss": 0.77578902, "num_input_tokens_seen": 48128330, "step": 2242, "time_per_iteration": 2.781735897064209 }, { "auxiliary_loss_clip": 0.01376942, "auxiliary_loss_mlp": 0.01197086, "balance_loss_clip": 1.01082277, "balance_loss_mlp": 1.00081944, "epoch": 0.26970480370348104, "flos": 18112598071200.0, "grad_norm": 1.9254886147096668, "language_loss": 0.84280872, "learning_rate": 3.427098322707452e-06, "loss": 0.86854899, "num_input_tokens_seen": 48144295, "step": 2243, "time_per_iteration": 2.7160098552703857 }, { "auxiliary_loss_clip": 0.01352029, "auxiliary_loss_mlp": 0.01197637, "balance_loss_clip": 1.01121318, "balance_loss_mlp": 1.00098896, "epoch": 0.2698250465941201, "flos": 10816553982240.0, "grad_norm": 2.1256485512467824, "language_loss": 0.89615017, "learning_rate": 3.426552462918526e-06, "loss": 0.92164683, "num_input_tokens_seen": 48162230, "step": 2244, "time_per_iteration": 2.7438700199127197 }, { "auxiliary_loss_clip": 0.01376323, "auxiliary_loss_mlp": 0.01197631, "balance_loss_clip": 1.01086998, "balance_loss_mlp": 1.00098324, "epoch": 0.2699452894847592, "flos": 17308126392480.0, "grad_norm": 2.2437977020691227, "language_loss": 0.73128039, "learning_rate": 3.426006386722726e-06, "loss": 0.75702, "num_input_tokens_seen": 48180290, "step": 2245, "time_per_iteration": 2.7040083408355713 }, { "auxiliary_loss_clip": 0.01313766, "auxiliary_loss_mlp": 0.01197533, "balance_loss_clip": 1.00985503, "balance_loss_mlp": 1.00088501, "epoch": 0.2700655323753983, "flos": 18078160471200.0, "grad_norm": 2.2178947075447244, "language_loss": 0.92206705, "learning_rate": 3.4254600942028914e-06, "loss": 0.94718003, "num_input_tokens_seen": 48198165, "step": 2246, "time_per_iteration": 2.7650504112243652 }, { "auxiliary_loss_clip": 0.01327567, "auxiliary_loss_mlp": 0.01196752, "balance_loss_clip": 1.00948703, "balance_loss_mlp": 1.00067687, "epoch": 0.2701857752660374, "flos": 18186646281120.0, "grad_norm": 1.9539473929366062, "language_loss": 0.83119035, "learning_rate": 3.424913585441893e-06, "loss": 0.85643357, "num_input_tokens_seen": 48216000, "step": 2247, "time_per_iteration": 2.755506992340088 }, { "auxiliary_loss_clip": 0.01351506, "auxiliary_loss_mlp": 0.01197555, "balance_loss_clip": 1.01089025, "balance_loss_mlp": 1.0009079, "epoch": 0.2703060181566765, "flos": 16319504128320.0, "grad_norm": 1.8609685906920594, "language_loss": 0.87297076, "learning_rate": 3.4243668605226374e-06, "loss": 0.89846134, "num_input_tokens_seen": 48233025, "step": 2248, "time_per_iteration": 2.6976232528686523 }, { "auxiliary_loss_clip": 0.01303537, "auxiliary_loss_mlp": 0.01197896, "balance_loss_clip": 1.00907648, "balance_loss_mlp": 1.00086689, "epoch": 0.2704262610473156, "flos": 19572367298400.0, "grad_norm": 2.1087836242229594, "language_loss": 0.82579714, "learning_rate": 3.423819919528061e-06, "loss": 0.85081148, "num_input_tokens_seen": 48251110, "step": 2249, "time_per_iteration": 2.828059673309326 }, { "auxiliary_loss_clip": 0.0131426, "auxiliary_loss_mlp": 0.01197626, "balance_loss_clip": 1.0102632, "balance_loss_mlp": 1.00078726, "epoch": 0.27054650393795465, "flos": 20740757459040.0, "grad_norm": 1.679466971319799, "language_loss": 0.78434074, "learning_rate": 3.4232727625411355e-06, "loss": 0.80945957, "num_input_tokens_seen": 48270215, "step": 2250, "time_per_iteration": 2.9530036449432373 }, { "auxiliary_loss_clip": 0.01268959, "auxiliary_loss_mlp": 0.01197133, "balance_loss_clip": 1.00784791, "balance_loss_mlp": 1.00086701, "epoch": 0.27066674682859376, "flos": 18658332099360.0, "grad_norm": 2.8179991245955756, "language_loss": 0.86493766, "learning_rate": 3.4227253896448626e-06, "loss": 0.88959861, "num_input_tokens_seen": 48288075, "step": 2251, "time_per_iteration": 2.7584011554718018 }, { "auxiliary_loss_clip": 0.01375866, "auxiliary_loss_mlp": 0.0119734, "balance_loss_clip": 1.0102365, "balance_loss_mlp": 1.00088334, "epoch": 0.2707869897192329, "flos": 23002771096800.0, "grad_norm": 2.252612770293606, "language_loss": 0.82386196, "learning_rate": 3.42217780092228e-06, "loss": 0.849594, "num_input_tokens_seen": 48306415, "step": 2252, "time_per_iteration": 2.81280517578125 }, { "auxiliary_loss_clip": 0.01318901, "auxiliary_loss_mlp": 0.01195062, "balance_loss_clip": 1.0098002, "balance_loss_mlp": 1.00051236, "epoch": 0.27090723260987193, "flos": 58323272196960.0, "grad_norm": 1.0351103063850782, "language_loss": 0.60367763, "learning_rate": 3.421629996456456e-06, "loss": 0.62881732, "num_input_tokens_seen": 48365035, "step": 2253, "time_per_iteration": 3.261807441711426 }, { "auxiliary_loss_clip": 0.01364203, "auxiliary_loss_mlp": 0.0119752, "balance_loss_clip": 1.01050305, "balance_loss_mlp": 1.00087202, "epoch": 0.27102747550051104, "flos": 11984548982400.0, "grad_norm": 1.813344832605844, "language_loss": 0.82567906, "learning_rate": 3.421081976330491e-06, "loss": 0.85129631, "num_input_tokens_seen": 48383550, "step": 2254, "time_per_iteration": 2.778372049331665 }, { "auxiliary_loss_clip": 0.01341953, "auxiliary_loss_mlp": 0.01197794, "balance_loss_clip": 1.01008916, "balance_loss_mlp": 1.00095582, "epoch": 0.27114771839115015, "flos": 19900411233120.0, "grad_norm": 1.7057770693572636, "language_loss": 0.87977576, "learning_rate": 3.4205337406275207e-06, "loss": 0.90517318, "num_input_tokens_seen": 48403670, "step": 2255, "time_per_iteration": 2.7954344749450684 }, { "auxiliary_loss_clip": 0.01376106, "auxiliary_loss_mlp": 0.01197553, "balance_loss_clip": 1.01029384, "balance_loss_mlp": 1.00071454, "epoch": 0.2712679612817892, "flos": 18331976577600.0, "grad_norm": 2.582440541618349, "language_loss": 0.7547791, "learning_rate": 3.4199852894307114e-06, "loss": 0.78051567, "num_input_tokens_seen": 48420420, "step": 2256, "time_per_iteration": 2.699118137359619 }, { "auxiliary_loss_clip": 0.01270266, "auxiliary_loss_mlp": 0.01197731, "balance_loss_clip": 1.00821137, "balance_loss_mlp": 1.0010829, "epoch": 0.2713882041724283, "flos": 24460313055840.0, "grad_norm": 1.8233307156845107, "language_loss": 0.78609109, "learning_rate": 3.419436622823262e-06, "loss": 0.81077111, "num_input_tokens_seen": 48441140, "step": 2257, "time_per_iteration": 2.978144884109497 }, { "auxiliary_loss_clip": 0.01342843, "auxiliary_loss_mlp": 0.01197682, "balance_loss_clip": 1.01036549, "balance_loss_mlp": 1.0010339, "epoch": 0.27150844706306737, "flos": 23039328193920.0, "grad_norm": 1.665538416828085, "language_loss": 0.74392974, "learning_rate": 3.4188877408884063e-06, "loss": 0.76933497, "num_input_tokens_seen": 48461845, "step": 2258, "time_per_iteration": 2.834324598312378 }, { "auxiliary_loss_clip": 0.01341413, "auxiliary_loss_mlp": 0.01197255, "balance_loss_clip": 1.01025188, "balance_loss_mlp": 1.00060773, "epoch": 0.2716286899537065, "flos": 22563654847200.0, "grad_norm": 5.253506397663873, "language_loss": 0.65683675, "learning_rate": 3.4183386437094088e-06, "loss": 0.68222344, "num_input_tokens_seen": 48478510, "step": 2259, "time_per_iteration": 3.7940165996551514 }, { "auxiliary_loss_clip": 0.01339062, "auxiliary_loss_mlp": 0.01197585, "balance_loss_clip": 1.00942159, "balance_loss_mlp": 1.0009371, "epoch": 0.2717489328443456, "flos": 13115052869760.0, "grad_norm": 2.2614353305486974, "language_loss": 0.82516503, "learning_rate": 3.417789331369565e-06, "loss": 0.85053158, "num_input_tokens_seen": 48494300, "step": 2260, "time_per_iteration": 2.8090670108795166 }, { "auxiliary_loss_clip": 0.01377018, "auxiliary_loss_mlp": 0.01197699, "balance_loss_clip": 1.01075983, "balance_loss_mlp": 1.00086105, "epoch": 0.27186917573498465, "flos": 29278701063360.0, "grad_norm": 1.9303139477745603, "language_loss": 0.90991545, "learning_rate": 3.4172398039522088e-06, "loss": 0.93566263, "num_input_tokens_seen": 48515585, "step": 2261, "time_per_iteration": 3.607583522796631 }, { "auxiliary_loss_clip": 0.01366179, "auxiliary_loss_mlp": 0.01197812, "balance_loss_clip": 1.01091802, "balance_loss_mlp": 1.0009737, "epoch": 0.27198941862562376, "flos": 26032232308320.0, "grad_norm": 1.5500843018660722, "language_loss": 0.79949373, "learning_rate": 3.4166900615407e-06, "loss": 0.82513356, "num_input_tokens_seen": 48533500, "step": 2262, "time_per_iteration": 3.6758081912994385 }, { "auxiliary_loss_clip": 0.01351409, "auxiliary_loss_mlp": 0.0119742, "balance_loss_clip": 1.00978303, "balance_loss_mlp": 1.00077248, "epoch": 0.27210966151626287, "flos": 32780997650880.0, "grad_norm": 2.081749139852048, "language_loss": 0.7527355, "learning_rate": 3.416140104218436e-06, "loss": 0.77822381, "num_input_tokens_seen": 48552865, "step": 2263, "time_per_iteration": 2.845005989074707 }, { "auxiliary_loss_clip": 0.01322258, "auxiliary_loss_mlp": 0.00872461, "balance_loss_clip": 1.01023412, "balance_loss_mlp": 0.99988097, "epoch": 0.2722299044069019, "flos": 65471076171360.0, "grad_norm": 0.841618590120014, "language_loss": 0.69686085, "learning_rate": 3.4155899320688437e-06, "loss": 0.71880805, "num_input_tokens_seen": 48618940, "step": 2264, "time_per_iteration": 3.3933467864990234 }, { "auxiliary_loss_clip": 0.01268805, "auxiliary_loss_mlp": 0.01197582, "balance_loss_clip": 1.00814748, "balance_loss_mlp": 1.0007441, "epoch": 0.27235014729754103, "flos": 15334150919040.0, "grad_norm": 2.002613859875992, "language_loss": 0.73960412, "learning_rate": 3.415039545175384e-06, "loss": 0.76426804, "num_input_tokens_seen": 48634665, "step": 2265, "time_per_iteration": 2.848361015319824 }, { "auxiliary_loss_clip": 0.01356938, "auxiliary_loss_mlp": 0.01197134, "balance_loss_clip": 1.01022506, "balance_loss_mlp": 1.00086737, "epoch": 0.27247039018818014, "flos": 21872374980480.0, "grad_norm": 2.953841665756714, "language_loss": 0.64915484, "learning_rate": 3.414488943621551e-06, "loss": 0.67469555, "num_input_tokens_seen": 48653330, "step": 2266, "time_per_iteration": 2.728256940841675 }, { "auxiliary_loss_clip": 0.01351224, "auxiliary_loss_mlp": 0.01197766, "balance_loss_clip": 1.01064134, "balance_loss_mlp": 1.00092804, "epoch": 0.2725906330788192, "flos": 18695499899040.0, "grad_norm": 1.8201943046844602, "language_loss": 0.73981279, "learning_rate": 3.41393812749087e-06, "loss": 0.76530266, "num_input_tokens_seen": 48671375, "step": 2267, "time_per_iteration": 2.7248008251190186 }, { "auxiliary_loss_clip": 0.01327963, "auxiliary_loss_mlp": 0.01197588, "balance_loss_clip": 1.00996804, "balance_loss_mlp": 1.00074959, "epoch": 0.2727108759694583, "flos": 17886106676160.0, "grad_norm": 2.234284795404064, "language_loss": 0.71535563, "learning_rate": 3.4133870968668984e-06, "loss": 0.74061108, "num_input_tokens_seen": 48686175, "step": 2268, "time_per_iteration": 2.7231082916259766 }, { "auxiliary_loss_clip": 0.01341656, "auxiliary_loss_mlp": 0.0119772, "balance_loss_clip": 1.01036739, "balance_loss_mlp": 1.00088203, "epoch": 0.2728311188600974, "flos": 24461678155680.0, "grad_norm": 1.7683748157632717, "language_loss": 0.78504485, "learning_rate": 3.412835851833229e-06, "loss": 0.81043869, "num_input_tokens_seen": 48708370, "step": 2269, "time_per_iteration": 2.8311827182769775 }, { "auxiliary_loss_clip": 0.01351287, "auxiliary_loss_mlp": 0.01197224, "balance_loss_clip": 1.01083589, "balance_loss_mlp": 1.00076699, "epoch": 0.2729513617507365, "flos": 30993328183680.0, "grad_norm": 1.7155321384373026, "language_loss": 0.77496642, "learning_rate": 3.4122843924734834e-06, "loss": 0.80045152, "num_input_tokens_seen": 48730670, "step": 2270, "time_per_iteration": 2.8556127548217773 }, { "auxiliary_loss_clip": 0.01341913, "auxiliary_loss_mlp": 0.01197509, "balance_loss_clip": 1.00999534, "balance_loss_mlp": 1.00086141, "epoch": 0.2730716046413756, "flos": 19094646301920.0, "grad_norm": 2.278838731203801, "language_loss": 0.88364756, "learning_rate": 3.411732718871319e-06, "loss": 0.90904176, "num_input_tokens_seen": 48746510, "step": 2271, "time_per_iteration": 2.6988935470581055 }, { "auxiliary_loss_clip": 0.0137587, "auxiliary_loss_mlp": 0.01197138, "balance_loss_clip": 1.01114106, "balance_loss_mlp": 1.0008719, "epoch": 0.27319184753201464, "flos": 26944579094400.0, "grad_norm": 1.548305109738417, "language_loss": 0.78731966, "learning_rate": 3.4111808311104227e-06, "loss": 0.81304979, "num_input_tokens_seen": 48768825, "step": 2272, "time_per_iteration": 2.731818437576294 }, { "auxiliary_loss_clip": 0.01352356, "auxiliary_loss_mlp": 0.01197358, "balance_loss_clip": 1.01080298, "balance_loss_mlp": 1.00071025, "epoch": 0.27331209042265375, "flos": 31759841741760.0, "grad_norm": 1.684409104132906, "language_loss": 0.69566453, "learning_rate": 3.410628729274517e-06, "loss": 0.72116166, "num_input_tokens_seen": 48790345, "step": 2273, "time_per_iteration": 2.8516011238098145 }, { "auxiliary_loss_clip": 0.0132614, "auxiliary_loss_mlp": 0.00872919, "balance_loss_clip": 1.00998545, "balance_loss_mlp": 1.00022316, "epoch": 0.27343233331329286, "flos": 25739092981440.0, "grad_norm": 1.737592760966566, "language_loss": 0.81997275, "learning_rate": 3.4100764134473546e-06, "loss": 0.84196329, "num_input_tokens_seen": 48809630, "step": 2274, "time_per_iteration": 2.7835819721221924 }, { "auxiliary_loss_clip": 0.01374982, "auxiliary_loss_mlp": 0.01197587, "balance_loss_clip": 1.01056194, "balance_loss_mlp": 1.00093949, "epoch": 0.2735525762039319, "flos": 24389426129760.0, "grad_norm": 3.644343592890833, "language_loss": 0.85034168, "learning_rate": 3.4095238837127215e-06, "loss": 0.8760674, "num_input_tokens_seen": 48828770, "step": 2275, "time_per_iteration": 2.642831563949585 }, { "auxiliary_loss_clip": 0.01314949, "auxiliary_loss_mlp": 0.01197249, "balance_loss_clip": 1.00909209, "balance_loss_mlp": 1.00079179, "epoch": 0.27367281909457103, "flos": 14465366347680.0, "grad_norm": 2.0061029696216317, "language_loss": 0.79243362, "learning_rate": 3.4089711401544355e-06, "loss": 0.81755561, "num_input_tokens_seen": 48846365, "step": 2276, "time_per_iteration": 2.7740302085876465 }, { "auxiliary_loss_clip": 0.01364447, "auxiliary_loss_mlp": 0.01197841, "balance_loss_clip": 1.01088262, "balance_loss_mlp": 1.00100231, "epoch": 0.27379306198521014, "flos": 23477007496320.0, "grad_norm": 2.2166110065526823, "language_loss": 0.67883641, "learning_rate": 3.4084181828563486e-06, "loss": 0.70445925, "num_input_tokens_seen": 48863085, "step": 2277, "time_per_iteration": 2.740150213241577 }, { "auxiliary_loss_clip": 0.01303901, "auxiliary_loss_mlp": 0.01198115, "balance_loss_clip": 1.01021266, "balance_loss_mlp": 1.00108576, "epoch": 0.2739133048758492, "flos": 17458162691040.0, "grad_norm": 1.7538593565415106, "language_loss": 0.70948529, "learning_rate": 3.4078650119023428e-06, "loss": 0.73450541, "num_input_tokens_seen": 48881400, "step": 2278, "time_per_iteration": 2.7807559967041016 }, { "auxiliary_loss_clip": 0.01308209, "auxiliary_loss_mlp": 0.01197884, "balance_loss_clip": 1.01026201, "balance_loss_mlp": 1.00104547, "epoch": 0.2740335477664883, "flos": 19273120945920.0, "grad_norm": 2.1244271410734967, "language_loss": 0.74031252, "learning_rate": 3.4073116273763337e-06, "loss": 0.76537347, "num_input_tokens_seen": 48895845, "step": 2279, "time_per_iteration": 2.8595378398895264 }, { "auxiliary_loss_clip": 0.01341776, "auxiliary_loss_mlp": 0.01197515, "balance_loss_clip": 1.01035929, "balance_loss_mlp": 1.00086772, "epoch": 0.2741537906571274, "flos": 26104735800000.0, "grad_norm": 1.8278988032550731, "language_loss": 0.81494987, "learning_rate": 3.40675802936227e-06, "loss": 0.84034276, "num_input_tokens_seen": 48916630, "step": 2280, "time_per_iteration": 2.727726697921753 }, { "auxiliary_loss_clip": 0.01342496, "auxiliary_loss_mlp": 0.01197388, "balance_loss_clip": 1.01032853, "balance_loss_mlp": 1.00074053, "epoch": 0.27427403354776647, "flos": 34164204010560.0, "grad_norm": 1.7941956233118772, "language_loss": 0.72035897, "learning_rate": 3.4062042179441318e-06, "loss": 0.74575782, "num_input_tokens_seen": 48937100, "step": 2281, "time_per_iteration": 2.8526360988616943 }, { "auxiliary_loss_clip": 0.01350333, "auxiliary_loss_mlp": 0.01197244, "balance_loss_clip": 1.00999939, "balance_loss_mlp": 1.00078702, "epoch": 0.2743942764384056, "flos": 18766997527680.0, "grad_norm": 1.7688484596814076, "language_loss": 0.80396068, "learning_rate": 3.4056501932059314e-06, "loss": 0.82943642, "num_input_tokens_seen": 48955175, "step": 2282, "time_per_iteration": 2.6991055011749268 }, { "auxiliary_loss_clip": 0.01356554, "auxiliary_loss_mlp": 0.01194813, "balance_loss_clip": 1.00996804, "balance_loss_mlp": 1.00026369, "epoch": 0.2745145193290447, "flos": 64904075324640.0, "grad_norm": 0.7695259598776464, "language_loss": 0.58113444, "learning_rate": 3.405095955231715e-06, "loss": 0.60664809, "num_input_tokens_seen": 49006830, "step": 2283, "time_per_iteration": 3.2201218605041504 }, { "auxiliary_loss_clip": 0.01361191, "auxiliary_loss_mlp": 0.01197433, "balance_loss_clip": 1.0100863, "balance_loss_mlp": 1.00078523, "epoch": 0.27463476221968375, "flos": 16136934184800.0, "grad_norm": 3.129512826780351, "language_loss": 0.94775969, "learning_rate": 3.4045415041055585e-06, "loss": 0.97334594, "num_input_tokens_seen": 49022470, "step": 2284, "time_per_iteration": 2.6614508628845215 }, { "auxiliary_loss_clip": 0.01335922, "auxiliary_loss_mlp": 0.0119754, "balance_loss_clip": 1.00957584, "balance_loss_mlp": 1.0007019, "epoch": 0.27475500511032286, "flos": 10376719259040.0, "grad_norm": 2.2713095729654014, "language_loss": 0.78561604, "learning_rate": 3.4039868399115728e-06, "loss": 0.81095064, "num_input_tokens_seen": 49037110, "step": 2285, "time_per_iteration": 3.7421867847442627 }, { "auxiliary_loss_clip": 0.01272978, "auxiliary_loss_mlp": 0.01197395, "balance_loss_clip": 1.00823462, "balance_loss_mlp": 1.0007478, "epoch": 0.27487524800096197, "flos": 17311072134240.0, "grad_norm": 2.0636209751428245, "language_loss": 0.80598509, "learning_rate": 3.4034319627339003e-06, "loss": 0.83068883, "num_input_tokens_seen": 49053975, "step": 2286, "time_per_iteration": 2.8809964656829834 }, { "auxiliary_loss_clip": 0.01335834, "auxiliary_loss_mlp": 0.01197548, "balance_loss_clip": 1.00984955, "balance_loss_mlp": 1.00090003, "epoch": 0.274995490891601, "flos": 27120215767680.0, "grad_norm": 2.389750944837681, "language_loss": 0.70014393, "learning_rate": 3.402876872656715e-06, "loss": 0.72547781, "num_input_tokens_seen": 49072295, "step": 2287, "time_per_iteration": 3.6589787006378174 }, { "auxiliary_loss_clip": 0.01327773, "auxiliary_loss_mlp": 0.01197472, "balance_loss_clip": 1.0098505, "balance_loss_mlp": 1.00101471, "epoch": 0.27511573378224013, "flos": 23436103633920.0, "grad_norm": 4.05872748965735, "language_loss": 0.89732325, "learning_rate": 3.402321569764223e-06, "loss": 0.92257571, "num_input_tokens_seen": 49091600, "step": 2288, "time_per_iteration": 4.680979490280151 }, { "auxiliary_loss_clip": 0.01291804, "auxiliary_loss_mlp": 0.00872898, "balance_loss_clip": 1.00828218, "balance_loss_mlp": 1.0003866, "epoch": 0.2752359766728792, "flos": 16722027357120.0, "grad_norm": 1.7252969474336266, "language_loss": 0.83668518, "learning_rate": 3.4017660541406635e-06, "loss": 0.85833216, "num_input_tokens_seen": 49107665, "step": 2289, "time_per_iteration": 2.893692970275879 }, { "auxiliary_loss_clip": 0.01352069, "auxiliary_loss_mlp": 0.01197531, "balance_loss_clip": 1.01114321, "balance_loss_mlp": 1.00088298, "epoch": 0.2753562195635183, "flos": 25297749463680.0, "grad_norm": 2.183641994844732, "language_loss": 0.74262315, "learning_rate": 3.4012103258703092e-06, "loss": 0.76811916, "num_input_tokens_seen": 49126420, "step": 2290, "time_per_iteration": 2.812246322631836 }, { "auxiliary_loss_clip": 0.01320675, "auxiliary_loss_mlp": 0.01197753, "balance_loss_clip": 1.01000357, "balance_loss_mlp": 1.00091434, "epoch": 0.2754764624541574, "flos": 27338983571520.0, "grad_norm": 1.8686383596829779, "language_loss": 0.82744992, "learning_rate": 3.4006543850374616e-06, "loss": 0.85263425, "num_input_tokens_seen": 49141470, "step": 2291, "time_per_iteration": 2.8609509468078613 }, { "auxiliary_loss_clip": 0.01363363, "auxiliary_loss_mlp": 0.01197277, "balance_loss_clip": 1.01088905, "balance_loss_mlp": 1.0008198, "epoch": 0.27559670534479647, "flos": 17238389024160.0, "grad_norm": 1.965006174006391, "language_loss": 0.74879789, "learning_rate": 3.400098231726458e-06, "loss": 0.77440429, "num_input_tokens_seen": 49158570, "step": 2292, "time_per_iteration": 2.708958864212036 }, { "auxiliary_loss_clip": 0.0132734, "auxiliary_loss_mlp": 0.01197453, "balance_loss_clip": 1.01039004, "balance_loss_mlp": 1.00099564, "epoch": 0.2757169482354356, "flos": 21939094759680.0, "grad_norm": 1.9651894486546069, "language_loss": 0.87024707, "learning_rate": 3.3995418660216657e-06, "loss": 0.89549494, "num_input_tokens_seen": 49176025, "step": 2293, "time_per_iteration": 2.7532756328582764 }, { "auxiliary_loss_clip": 0.01376991, "auxiliary_loss_mlp": 0.01197746, "balance_loss_clip": 1.01076818, "balance_loss_mlp": 1.00090778, "epoch": 0.2758371911260747, "flos": 20850680216160.0, "grad_norm": 2.6712059446253114, "language_loss": 0.80228692, "learning_rate": 3.3989852880074848e-06, "loss": 0.82803428, "num_input_tokens_seen": 49197455, "step": 2294, "time_per_iteration": 2.857975959777832 }, { "auxiliary_loss_clip": 0.01321131, "auxiliary_loss_mlp": 0.01194618, "balance_loss_clip": 1.01241732, "balance_loss_mlp": 1.00006855, "epoch": 0.27595743401671374, "flos": 69269098590720.0, "grad_norm": 0.737383732199259, "language_loss": 0.60634649, "learning_rate": 3.398428497768348e-06, "loss": 0.631504, "num_input_tokens_seen": 49262625, "step": 2295, "time_per_iteration": 3.413301944732666 }, { "auxiliary_loss_clip": 0.01331797, "auxiliary_loss_mlp": 0.011975, "balance_loss_clip": 1.00961661, "balance_loss_mlp": 1.0008527, "epoch": 0.27607767690735285, "flos": 21215029782240.0, "grad_norm": 2.214484581771097, "language_loss": 0.72054327, "learning_rate": 3.3978714953887205e-06, "loss": 0.74583626, "num_input_tokens_seen": 49282380, "step": 2296, "time_per_iteration": 2.7851200103759766 }, { "auxiliary_loss_clip": 0.0130338, "auxiliary_loss_mlp": 0.01197116, "balance_loss_clip": 1.0094595, "balance_loss_mlp": 1.00104022, "epoch": 0.27619791979799196, "flos": 24825345171840.0, "grad_norm": 2.2432302481605335, "language_loss": 0.85993719, "learning_rate": 3.397314280953098e-06, "loss": 0.88494217, "num_input_tokens_seen": 49303205, "step": 2297, "time_per_iteration": 2.8441030979156494 }, { "auxiliary_loss_clip": 0.01339693, "auxiliary_loss_mlp": 0.01196761, "balance_loss_clip": 1.01039624, "balance_loss_mlp": 1.00068593, "epoch": 0.276318162688631, "flos": 24753560153760.0, "grad_norm": 1.859759482573877, "language_loss": 0.80305541, "learning_rate": 3.3967568545460108e-06, "loss": 0.82841992, "num_input_tokens_seen": 49322745, "step": 2298, "time_per_iteration": 2.917287826538086 }, { "auxiliary_loss_clip": 0.0135034, "auxiliary_loss_mlp": 0.01197597, "balance_loss_clip": 1.01057839, "balance_loss_mlp": 1.0009495, "epoch": 0.27643840557927013, "flos": 18150017336640.0, "grad_norm": 1.8046977482410533, "language_loss": 0.80669153, "learning_rate": 3.3961992162520185e-06, "loss": 0.8321709, "num_input_tokens_seen": 49341370, "step": 2299, "time_per_iteration": 2.781853675842285 }, { "auxiliary_loss_clip": 0.01353101, "auxiliary_loss_mlp": 0.01197223, "balance_loss_clip": 1.00979853, "balance_loss_mlp": 1.00076604, "epoch": 0.27655864846990924, "flos": 24823944148320.0, "grad_norm": 2.4317305662405713, "language_loss": 0.7203806, "learning_rate": 3.3956413661557156e-06, "loss": 0.74588382, "num_input_tokens_seen": 49361545, "step": 2300, "time_per_iteration": 2.7707290649414062 }, { "auxiliary_loss_clip": 0.0132938, "auxiliary_loss_mlp": 0.01197179, "balance_loss_clip": 1.00979233, "balance_loss_mlp": 1.00072193, "epoch": 0.2766788913605483, "flos": 20266592906880.0, "grad_norm": 2.828149021718558, "language_loss": 0.66221625, "learning_rate": 3.3950833043417273e-06, "loss": 0.68748182, "num_input_tokens_seen": 49379690, "step": 2301, "time_per_iteration": 2.8370633125305176 }, { "auxiliary_loss_clip": 0.01352407, "auxiliary_loss_mlp": 0.01197503, "balance_loss_clip": 1.01071405, "balance_loss_mlp": 1.00085509, "epoch": 0.2767991342511874, "flos": 21470282835840.0, "grad_norm": 9.338431421267352, "language_loss": 0.73640418, "learning_rate": 3.3945250308947105e-06, "loss": 0.76190329, "num_input_tokens_seen": 49395995, "step": 2302, "time_per_iteration": 2.713582992553711 }, { "auxiliary_loss_clip": 0.01341852, "auxiliary_loss_mlp": 0.01194872, "balance_loss_clip": 1.0094105, "balance_loss_mlp": 1.00032222, "epoch": 0.2769193771418265, "flos": 66002656269600.0, "grad_norm": 1.2891533868965457, "language_loss": 0.68356484, "learning_rate": 3.3939665458993556e-06, "loss": 0.70893204, "num_input_tokens_seen": 49450415, "step": 2303, "time_per_iteration": 3.2137033939361572 }, { "auxiliary_loss_clip": 0.01326926, "auxiliary_loss_mlp": 0.0119758, "balance_loss_clip": 1.00956392, "balance_loss_mlp": 1.00093246, "epoch": 0.27703962003246557, "flos": 20704451827680.0, "grad_norm": 1.813225860729811, "language_loss": 0.77256811, "learning_rate": 3.3934078494403843e-06, "loss": 0.79781318, "num_input_tokens_seen": 49469990, "step": 2304, "time_per_iteration": 2.758878231048584 }, { "auxiliary_loss_clip": 0.01266257, "auxiliary_loss_mlp": 0.00872905, "balance_loss_clip": 1.00948429, "balance_loss_mlp": 1.000319, "epoch": 0.2771598629231047, "flos": 22929908368320.0, "grad_norm": 1.7282659893821637, "language_loss": 0.81424761, "learning_rate": 3.3928489416025495e-06, "loss": 0.83563918, "num_input_tokens_seen": 49490835, "step": 2305, "time_per_iteration": 3.006953239440918 }, { "auxiliary_loss_clip": 0.01325945, "auxiliary_loss_mlp": 0.01197478, "balance_loss_clip": 1.00930691, "balance_loss_mlp": 1.00083005, "epoch": 0.27728010581374374, "flos": 18369467690400.0, "grad_norm": 2.275514015915012, "language_loss": 0.7849378, "learning_rate": 3.392289822470638e-06, "loss": 0.81017208, "num_input_tokens_seen": 49508815, "step": 2306, "time_per_iteration": 3.143669366836548 }, { "auxiliary_loss_clip": 0.01339039, "auxiliary_loss_mlp": 0.0119708, "balance_loss_clip": 1.01004219, "balance_loss_mlp": 1.00062358, "epoch": 0.27740034870438285, "flos": 19427647704480.0, "grad_norm": 1.9222957267185334, "language_loss": 0.75987834, "learning_rate": 3.3917304921294674e-06, "loss": 0.78523958, "num_input_tokens_seen": 49526980, "step": 2307, "time_per_iteration": 2.7456815242767334 }, { "auxiliary_loss_clip": 0.01351285, "auxiliary_loss_mlp": 0.01197379, "balance_loss_clip": 1.00964653, "balance_loss_mlp": 1.00054026, "epoch": 0.27752059159502196, "flos": 21614786887680.0, "grad_norm": 1.5621859047879465, "language_loss": 0.80689561, "learning_rate": 3.3911709506638876e-06, "loss": 0.8323822, "num_input_tokens_seen": 49546290, "step": 2308, "time_per_iteration": 2.8010501861572266 }, { "auxiliary_loss_clip": 0.01328366, "auxiliary_loss_mlp": 0.00872973, "balance_loss_clip": 1.00935316, "balance_loss_mlp": 1.00037265, "epoch": 0.277640834485661, "flos": 26608021247520.0, "grad_norm": 2.1297730316151107, "language_loss": 0.8109231, "learning_rate": 3.390611198158781e-06, "loss": 0.83293653, "num_input_tokens_seen": 49564165, "step": 2309, "time_per_iteration": 2.8236563205718994 }, { "auxiliary_loss_clip": 0.013758, "auxiliary_loss_mlp": 0.01196984, "balance_loss_clip": 1.01052177, "balance_loss_mlp": 1.00090826, "epoch": 0.2777610773763001, "flos": 19492822765440.0, "grad_norm": 2.081791380228798, "language_loss": 0.89723343, "learning_rate": 3.3900512346990612e-06, "loss": 0.92296129, "num_input_tokens_seen": 49580155, "step": 2310, "time_per_iteration": 2.6515698432922363 }, { "auxiliary_loss_clip": 0.01315395, "auxiliary_loss_mlp": 0.01197196, "balance_loss_clip": 1.01019835, "balance_loss_mlp": 1.0007391, "epoch": 0.27788132026693924, "flos": 38290665525120.0, "grad_norm": 1.7459900897479548, "language_loss": 0.65813255, "learning_rate": 3.389491060369674e-06, "loss": 0.68325841, "num_input_tokens_seen": 49605830, "step": 2311, "time_per_iteration": 4.12514591217041 }, { "auxiliary_loss_clip": 0.01297663, "auxiliary_loss_mlp": 0.01197093, "balance_loss_clip": 1.00875056, "balance_loss_mlp": 1.0008266, "epoch": 0.2780015631575783, "flos": 22382557774560.0, "grad_norm": 1.8162039246836612, "language_loss": 0.89224064, "learning_rate": 3.388930675255598e-06, "loss": 0.91718817, "num_input_tokens_seen": 49625680, "step": 2312, "time_per_iteration": 2.9694314002990723 }, { "auxiliary_loss_clip": 0.01340055, "auxiliary_loss_mlp": 0.01197666, "balance_loss_clip": 1.00958729, "balance_loss_mlp": 1.00101805, "epoch": 0.2781218060482174, "flos": 12203209015200.0, "grad_norm": 2.2162655401904887, "language_loss": 0.79508626, "learning_rate": 3.388370079441843e-06, "loss": 0.82046354, "num_input_tokens_seen": 49641195, "step": 2313, "time_per_iteration": 2.763532876968384 }, { "auxiliary_loss_clip": 0.01314258, "auxiliary_loss_mlp": 0.01197774, "balance_loss_clip": 1.0095551, "balance_loss_mlp": 1.00093532, "epoch": 0.2782420489388565, "flos": 18107640603360.0, "grad_norm": 1.9902756837440354, "language_loss": 0.92630529, "learning_rate": 3.3878092730134505e-06, "loss": 0.95142567, "num_input_tokens_seen": 49659180, "step": 2314, "time_per_iteration": 4.727071285247803 }, { "auxiliary_loss_clip": 0.01364963, "auxiliary_loss_mlp": 0.01197102, "balance_loss_clip": 1.01110768, "balance_loss_mlp": 1.00083637, "epoch": 0.27836229182949557, "flos": 18514762063200.0, "grad_norm": 1.8872680838880496, "language_loss": 0.80604655, "learning_rate": 3.3872482560554947e-06, "loss": 0.83166718, "num_input_tokens_seen": 49677955, "step": 2315, "time_per_iteration": 2.682825803756714 }, { "auxiliary_loss_clip": 0.01344308, "auxiliary_loss_mlp": 0.01194979, "balance_loss_clip": 1.01014197, "balance_loss_mlp": 1.00042903, "epoch": 0.2784825347201347, "flos": 67079265131520.0, "grad_norm": 0.7934317909465995, "language_loss": 0.56985819, "learning_rate": 3.386687028653082e-06, "loss": 0.59525096, "num_input_tokens_seen": 49740800, "step": 2316, "time_per_iteration": 3.424865961074829 }, { "auxiliary_loss_clip": 0.01292024, "auxiliary_loss_mlp": 0.01197286, "balance_loss_clip": 1.00891805, "balance_loss_mlp": 1.00082886, "epoch": 0.2786027776107738, "flos": 22631129023680.0, "grad_norm": 1.6840790473213565, "language_loss": 0.85127395, "learning_rate": 3.386125590891349e-06, "loss": 0.87616706, "num_input_tokens_seen": 49757675, "step": 2317, "time_per_iteration": 2.887754440307617 }, { "auxiliary_loss_clip": 0.01351814, "auxiliary_loss_mlp": 0.01196903, "balance_loss_clip": 1.01079035, "balance_loss_mlp": 1.00082803, "epoch": 0.27872302050141284, "flos": 15778835339040.0, "grad_norm": 1.903251590936619, "language_loss": 0.8315078, "learning_rate": 3.3855639428554657e-06, "loss": 0.85699499, "num_input_tokens_seen": 49775205, "step": 2318, "time_per_iteration": 2.713510751724243 }, { "auxiliary_loss_clip": 0.01311705, "auxiliary_loss_mlp": 0.01197477, "balance_loss_clip": 1.00953996, "balance_loss_mlp": 1.0008297, "epoch": 0.27884326339205195, "flos": 22126981407840.0, "grad_norm": 1.8293905613918602, "language_loss": 0.80405509, "learning_rate": 3.385002084630635e-06, "loss": 0.82914692, "num_input_tokens_seen": 49794175, "step": 2319, "time_per_iteration": 2.8606388568878174 }, { "auxiliary_loss_clip": 0.01363708, "auxiliary_loss_mlp": 0.01197439, "balance_loss_clip": 1.01093721, "balance_loss_mlp": 1.00079119, "epoch": 0.278963506282691, "flos": 20558726370720.0, "grad_norm": 2.3168119551176534, "language_loss": 0.84916586, "learning_rate": 3.384440016302088e-06, "loss": 0.87477732, "num_input_tokens_seen": 49812850, "step": 2320, "time_per_iteration": 2.6996688842773438 }, { "auxiliary_loss_clip": 0.01349796, "auxiliary_loss_mlp": 0.01197152, "balance_loss_clip": 1.01005113, "balance_loss_mlp": 1.00088549, "epoch": 0.2790837491733301, "flos": 21942938593440.0, "grad_norm": 2.712717655444666, "language_loss": 0.61839378, "learning_rate": 3.3838777379550923e-06, "loss": 0.64386332, "num_input_tokens_seen": 49832295, "step": 2321, "time_per_iteration": 2.845376968383789 }, { "auxiliary_loss_clip": 0.01339822, "auxiliary_loss_mlp": 0.01196593, "balance_loss_clip": 1.01013851, "balance_loss_mlp": 1.00070858, "epoch": 0.27920399206396923, "flos": 26286802812000.0, "grad_norm": 1.948236400162674, "language_loss": 0.77846104, "learning_rate": 3.383315249674944e-06, "loss": 0.8038252, "num_input_tokens_seen": 49850860, "step": 2322, "time_per_iteration": 2.916191577911377 }, { "auxiliary_loss_clip": 0.0131742, "auxiliary_loss_mlp": 0.01197279, "balance_loss_clip": 1.00930154, "balance_loss_mlp": 1.00063145, "epoch": 0.2793242349546083, "flos": 25400990416320.0, "grad_norm": 2.0945088446189195, "language_loss": 0.85825884, "learning_rate": 3.3827525515469715e-06, "loss": 0.8834058, "num_input_tokens_seen": 49865765, "step": 2323, "time_per_iteration": 2.8442330360412598 }, { "auxiliary_loss_clip": 0.01326554, "auxiliary_loss_mlp": 0.01197281, "balance_loss_clip": 1.01041567, "balance_loss_mlp": 1.00082362, "epoch": 0.2794444778452474, "flos": 20850356903040.0, "grad_norm": 1.8884815548364433, "language_loss": 0.71181047, "learning_rate": 3.3821896436565367e-06, "loss": 0.7370488, "num_input_tokens_seen": 49885425, "step": 2324, "time_per_iteration": 2.926021099090576 }, { "auxiliary_loss_clip": 0.0135597, "auxiliary_loss_mlp": 0.01197497, "balance_loss_clip": 1.0105195, "balance_loss_mlp": 1.00103986, "epoch": 0.2795647207358865, "flos": 21576253988160.0, "grad_norm": 1.7078968837800708, "language_loss": 0.70349699, "learning_rate": 3.381626526089032e-06, "loss": 0.72903168, "num_input_tokens_seen": 49904990, "step": 2325, "time_per_iteration": 2.7545156478881836 }, { "auxiliary_loss_clip": 0.01348794, "auxiliary_loss_mlp": 0.01197667, "balance_loss_clip": 1.01028204, "balance_loss_mlp": 1.00101948, "epoch": 0.27968496362652556, "flos": 21471755706720.0, "grad_norm": 1.8812187818663846, "language_loss": 0.79155004, "learning_rate": 3.3810631989298815e-06, "loss": 0.81701469, "num_input_tokens_seen": 49924600, "step": 2326, "time_per_iteration": 2.7865612506866455 }, { "auxiliary_loss_clip": 0.01293985, "auxiliary_loss_mlp": 0.01197556, "balance_loss_clip": 1.00834274, "balance_loss_mlp": 1.00071764, "epoch": 0.2798052065171647, "flos": 23258706700320.0, "grad_norm": 2.935248778034196, "language_loss": 0.8476212, "learning_rate": 3.3804996622645423e-06, "loss": 0.8725366, "num_input_tokens_seen": 49942600, "step": 2327, "time_per_iteration": 2.9748284816741943 }, { "auxiliary_loss_clip": 0.01375714, "auxiliary_loss_mlp": 0.01197189, "balance_loss_clip": 1.0108484, "balance_loss_mlp": 1.00073254, "epoch": 0.2799254494078038, "flos": 21539337654240.0, "grad_norm": 1.8256861990413664, "language_loss": 0.89208496, "learning_rate": 3.3799359161785015e-06, "loss": 0.91781402, "num_input_tokens_seen": 49962250, "step": 2328, "time_per_iteration": 2.790256977081299 }, { "auxiliary_loss_clip": 0.01350753, "auxiliary_loss_mlp": 0.0119772, "balance_loss_clip": 1.01023722, "balance_loss_mlp": 1.00088203, "epoch": 0.28004569229844284, "flos": 26393923522080.0, "grad_norm": 2.00460444036657, "language_loss": 0.85673946, "learning_rate": 3.3793719607572798e-06, "loss": 0.8822242, "num_input_tokens_seen": 49983215, "step": 2329, "time_per_iteration": 2.776843309402466 }, { "auxiliary_loss_clip": 0.01352199, "auxiliary_loss_mlp": 0.01197268, "balance_loss_clip": 1.01097751, "balance_loss_mlp": 1.0010016, "epoch": 0.28016593518908195, "flos": 33547690827360.0, "grad_norm": 2.6235683680676187, "language_loss": 0.77121294, "learning_rate": 3.378807796086428e-06, "loss": 0.79670763, "num_input_tokens_seen": 50006075, "step": 2330, "time_per_iteration": 2.879467487335205 }, { "auxiliary_loss_clip": 0.01376241, "auxiliary_loss_mlp": 0.01197283, "balance_loss_clip": 1.011235, "balance_loss_mlp": 1.00082624, "epoch": 0.28028617807972106, "flos": 15340832723520.0, "grad_norm": 2.031378308721662, "language_loss": 0.77299732, "learning_rate": 3.37824342225153e-06, "loss": 0.79873258, "num_input_tokens_seen": 50022495, "step": 2331, "time_per_iteration": 2.7038724422454834 }, { "auxiliary_loss_clip": 0.01294698, "auxiliary_loss_mlp": 0.01197245, "balance_loss_clip": 1.00914693, "balance_loss_mlp": 1.00078809, "epoch": 0.2804064209703601, "flos": 25520289253920.0, "grad_norm": 2.188575397704968, "language_loss": 0.77552372, "learning_rate": 3.3776788393382006e-06, "loss": 0.80044317, "num_input_tokens_seen": 50041975, "step": 2332, "time_per_iteration": 2.8966948986053467 }, { "auxiliary_loss_clip": 0.01375342, "auxiliary_loss_mlp": 0.01197513, "balance_loss_clip": 1.01058841, "balance_loss_mlp": 1.00086558, "epoch": 0.2805266638609992, "flos": 29351779333920.0, "grad_norm": 2.6349092709370256, "language_loss": 0.76752949, "learning_rate": 3.3771140474320872e-06, "loss": 0.79325807, "num_input_tokens_seen": 50061925, "step": 2333, "time_per_iteration": 2.860353946685791 }, { "auxiliary_loss_clip": 0.01325208, "auxiliary_loss_mlp": 0.01197874, "balance_loss_clip": 1.00982869, "balance_loss_mlp": 1.00103521, "epoch": 0.28064690675163834, "flos": 21463744726080.0, "grad_norm": 1.7909869068214668, "language_loss": 0.79928076, "learning_rate": 3.3765490466188664e-06, "loss": 0.82451159, "num_input_tokens_seen": 50079325, "step": 2334, "time_per_iteration": 2.8521461486816406 }, { "auxiliary_loss_clip": 0.0132591, "auxiliary_loss_mlp": 0.01197437, "balance_loss_clip": 1.01018631, "balance_loss_mlp": 1.0007894, "epoch": 0.2807671496422774, "flos": 20995651275840.0, "grad_norm": 2.823646333235192, "language_loss": 0.73274767, "learning_rate": 3.3759838369842508e-06, "loss": 0.75798118, "num_input_tokens_seen": 50097400, "step": 2335, "time_per_iteration": 2.836225748062134 }, { "auxiliary_loss_clip": 0.01314241, "auxiliary_loss_mlp": 0.01197468, "balance_loss_clip": 1.00982666, "balance_loss_mlp": 1.00101089, "epoch": 0.2808873925329165, "flos": 21506588467200.0, "grad_norm": 2.0785025025195707, "language_loss": 0.7316975, "learning_rate": 3.375418418613981e-06, "loss": 0.7568146, "num_input_tokens_seen": 50116425, "step": 2336, "time_per_iteration": 2.7905712127685547 }, { "auxiliary_loss_clip": 0.01332062, "auxiliary_loss_mlp": 0.0119719, "balance_loss_clip": 1.00941384, "balance_loss_mlp": 1.00073326, "epoch": 0.28100763542355556, "flos": 16070825108160.0, "grad_norm": 3.2435294498066503, "language_loss": 0.83797336, "learning_rate": 3.374852791593831e-06, "loss": 0.86326587, "num_input_tokens_seen": 50132625, "step": 2337, "time_per_iteration": 3.7380642890930176 }, { "auxiliary_loss_clip": 0.01314802, "auxiliary_loss_mlp": 0.01197424, "balance_loss_clip": 1.01040769, "balance_loss_mlp": 1.00077617, "epoch": 0.28112787831419467, "flos": 19062615588480.0, "grad_norm": 2.349664017538881, "language_loss": 0.5372324, "learning_rate": 3.374286956009605e-06, "loss": 0.56235468, "num_input_tokens_seen": 50151190, "step": 2338, "time_per_iteration": 2.838512659072876 }, { "auxiliary_loss_clip": 0.01351266, "auxiliary_loss_mlp": 0.01197698, "balance_loss_clip": 1.01091027, "balance_loss_mlp": 1.00105, "epoch": 0.2812481212048338, "flos": 12823637879520.0, "grad_norm": 1.8425367514168811, "language_loss": 0.75304204, "learning_rate": 3.3737209119471405e-06, "loss": 0.77853167, "num_input_tokens_seen": 50167700, "step": 2339, "time_per_iteration": 2.707205057144165 }, { "auxiliary_loss_clip": 0.01361942, "auxiliary_loss_mlp": 0.01197887, "balance_loss_clip": 1.01093626, "balance_loss_mlp": 1.00123966, "epoch": 0.28136836409547283, "flos": 15633073958400.0, "grad_norm": 2.415427538391207, "language_loss": 0.6397568, "learning_rate": 3.373154659492306e-06, "loss": 0.66535509, "num_input_tokens_seen": 50185840, "step": 2340, "time_per_iteration": 4.738910913467407 }, { "auxiliary_loss_clip": 0.0134138, "auxiliary_loss_mlp": 0.01197289, "balance_loss_clip": 1.01026261, "balance_loss_mlp": 1.00083232, "epoch": 0.28148860698611194, "flos": 19933735199040.0, "grad_norm": 2.2270449309453175, "language_loss": 0.85268795, "learning_rate": 3.3725881987310016e-06, "loss": 0.87807465, "num_input_tokens_seen": 50203375, "step": 2341, "time_per_iteration": 2.807861804962158 }, { "auxiliary_loss_clip": 0.01330217, "auxiliary_loss_mlp": 0.01197299, "balance_loss_clip": 1.00924563, "balance_loss_mlp": 1.00103283, "epoch": 0.28160884987675106, "flos": 17457228675360.0, "grad_norm": 1.8059246388602297, "language_loss": 0.87646472, "learning_rate": 3.372021529749159e-06, "loss": 0.90173984, "num_input_tokens_seen": 50222435, "step": 2342, "time_per_iteration": 2.7793567180633545 }, { "auxiliary_loss_clip": 0.01274346, "auxiliary_loss_mlp": 0.0119752, "balance_loss_clip": 1.00896752, "balance_loss_mlp": 1.00106359, "epoch": 0.2817290927673901, "flos": 16834752161280.0, "grad_norm": 1.8405927313931094, "language_loss": 0.92538673, "learning_rate": 3.3714546526327405e-06, "loss": 0.95010543, "num_input_tokens_seen": 50240435, "step": 2343, "time_per_iteration": 2.930446147918701 }, { "auxiliary_loss_clip": 0.01325531, "auxiliary_loss_mlp": 0.01197133, "balance_loss_clip": 1.00938129, "balance_loss_mlp": 1.0006758, "epoch": 0.2818493356580292, "flos": 15414090612480.0, "grad_norm": 2.374842514459578, "language_loss": 0.87888241, "learning_rate": 3.3708875674677423e-06, "loss": 0.904109, "num_input_tokens_seen": 50258410, "step": 2344, "time_per_iteration": 2.8450636863708496 }, { "auxiliary_loss_clip": 0.01344062, "auxiliary_loss_mlp": 0.01197553, "balance_loss_clip": 1.01071239, "balance_loss_mlp": 1.00090528, "epoch": 0.28196957854866833, "flos": 20412462058560.0, "grad_norm": 1.976618482304991, "language_loss": 0.83946913, "learning_rate": 3.37032027434019e-06, "loss": 0.86488527, "num_input_tokens_seen": 50277930, "step": 2345, "time_per_iteration": 2.781592845916748 }, { "auxiliary_loss_clip": 0.01365081, "auxiliary_loss_mlp": 0.01197767, "balance_loss_clip": 1.0109973, "balance_loss_mlp": 1.00092912, "epoch": 0.2820898214393074, "flos": 19973130266880.0, "grad_norm": 1.8773161644298282, "language_loss": 0.82826704, "learning_rate": 3.369752773336141e-06, "loss": 0.85389555, "num_input_tokens_seen": 50297410, "step": 2346, "time_per_iteration": 2.7879436016082764 }, { "auxiliary_loss_clip": 0.0134117, "auxiliary_loss_mlp": 0.01197584, "balance_loss_clip": 1.01047945, "balance_loss_mlp": 1.00093651, "epoch": 0.2822100643299465, "flos": 22528319155200.0, "grad_norm": 1.552002555111326, "language_loss": 0.78350246, "learning_rate": 3.3691850645416864e-06, "loss": 0.8088901, "num_input_tokens_seen": 50317120, "step": 2347, "time_per_iteration": 2.7549984455108643 }, { "auxiliary_loss_clip": 0.0136285, "auxiliary_loss_mlp": 0.01197515, "balance_loss_clip": 1.01031458, "balance_loss_mlp": 1.00086713, "epoch": 0.2823303072205856, "flos": 11546690061600.0, "grad_norm": 2.654267895733553, "language_loss": 0.83056766, "learning_rate": 3.368617148042945e-06, "loss": 0.85617131, "num_input_tokens_seen": 50334790, "step": 2348, "time_per_iteration": 2.7302520275115967 }, { "auxiliary_loss_clip": 0.01351501, "auxiliary_loss_mlp": 0.01197421, "balance_loss_clip": 1.01059866, "balance_loss_mlp": 1.00077355, "epoch": 0.28245055011122466, "flos": 18259904170080.0, "grad_norm": 1.7784700164044132, "language_loss": 0.84204221, "learning_rate": 3.368049023926071e-06, "loss": 0.86753142, "num_input_tokens_seen": 50353785, "step": 2349, "time_per_iteration": 2.7865288257598877 }, { "auxiliary_loss_clip": 0.01353587, "auxiliary_loss_mlp": 0.01196366, "balance_loss_clip": 1.01023054, "balance_loss_mlp": 1.00067151, "epoch": 0.2825707930018638, "flos": 24608122086240.0, "grad_norm": 1.526412922307524, "language_loss": 0.83788687, "learning_rate": 3.3674806922772476e-06, "loss": 0.86338639, "num_input_tokens_seen": 50374670, "step": 2350, "time_per_iteration": 2.7745461463928223 }, { "auxiliary_loss_clip": 0.01326758, "auxiliary_loss_mlp": 0.01197417, "balance_loss_clip": 1.00972199, "balance_loss_mlp": 1.00076926, "epoch": 0.28269103589250283, "flos": 25227006232320.0, "grad_norm": 1.7412839683017192, "language_loss": 0.74689049, "learning_rate": 3.3669121531826904e-06, "loss": 0.77213228, "num_input_tokens_seen": 50395650, "step": 2351, "time_per_iteration": 2.8429157733917236 }, { "auxiliary_loss_clip": 0.01300501, "auxiliary_loss_mlp": 0.01197565, "balance_loss_clip": 1.00934935, "balance_loss_mlp": 1.00091696, "epoch": 0.28281127878314194, "flos": 19281563010720.0, "grad_norm": 1.9283579769440071, "language_loss": 0.82618147, "learning_rate": 3.366343406728647e-06, "loss": 0.85116208, "num_input_tokens_seen": 50415100, "step": 2352, "time_per_iteration": 2.8724570274353027 }, { "auxiliary_loss_clip": 0.01363347, "auxiliary_loss_mlp": 0.011972, "balance_loss_clip": 1.01025748, "balance_loss_mlp": 1.00093341, "epoch": 0.28293152167378105, "flos": 23878417091040.0, "grad_norm": 1.7212328531852028, "language_loss": 0.68063563, "learning_rate": 3.3657744530013946e-06, "loss": 0.70624107, "num_input_tokens_seen": 50434335, "step": 2353, "time_per_iteration": 2.7281315326690674 }, { "auxiliary_loss_clip": 0.01361496, "auxiliary_loss_mlp": 0.01197422, "balance_loss_clip": 1.01073003, "balance_loss_mlp": 1.00096512, "epoch": 0.2830517645644201, "flos": 43866981331200.0, "grad_norm": 2.144434763354363, "language_loss": 0.71172863, "learning_rate": 3.3652052920872437e-06, "loss": 0.7373178, "num_input_tokens_seen": 50457200, "step": 2354, "time_per_iteration": 2.9538800716400146 }, { "auxiliary_loss_clip": 0.0133877, "auxiliary_loss_mlp": 0.01197414, "balance_loss_clip": 1.01017654, "balance_loss_mlp": 1.00095701, "epoch": 0.2831720074550592, "flos": 26651763080640.0, "grad_norm": 2.332290609319057, "language_loss": 0.85431659, "learning_rate": 3.3646359240725355e-06, "loss": 0.87967849, "num_input_tokens_seen": 50476390, "step": 2355, "time_per_iteration": 2.9059970378875732 }, { "auxiliary_loss_clip": 0.01362992, "auxiliary_loss_mlp": 0.00872905, "balance_loss_clip": 1.01085865, "balance_loss_mlp": 1.00044322, "epoch": 0.2832922503456983, "flos": 31029993051840.0, "grad_norm": 1.9669931562696674, "language_loss": 0.67709047, "learning_rate": 3.364066349043643e-06, "loss": 0.69944948, "num_input_tokens_seen": 50497595, "step": 2356, "time_per_iteration": 2.9149744510650635 }, { "auxiliary_loss_clip": 0.01335661, "auxiliary_loss_mlp": 0.01196959, "balance_loss_clip": 1.0094825, "balance_loss_mlp": 1.0008837, "epoch": 0.2834124932363374, "flos": 20405708406720.0, "grad_norm": 1.6630659739894054, "language_loss": 0.82281297, "learning_rate": 3.363496567086969e-06, "loss": 0.84813917, "num_input_tokens_seen": 50514690, "step": 2357, "time_per_iteration": 2.847257614135742 }, { "auxiliary_loss_clip": 0.01375241, "auxiliary_loss_mlp": 0.01197421, "balance_loss_clip": 1.01075554, "balance_loss_mlp": 1.00096452, "epoch": 0.2835327361269765, "flos": 39384863781120.0, "grad_norm": 2.011188328859333, "language_loss": 0.75966001, "learning_rate": 3.3629265782889506e-06, "loss": 0.78538656, "num_input_tokens_seen": 50536515, "step": 2358, "time_per_iteration": 2.849841833114624 }, { "auxiliary_loss_clip": 0.01328223, "auxiliary_loss_mlp": 0.01197538, "balance_loss_clip": 1.01002836, "balance_loss_mlp": 1.00089037, "epoch": 0.2836529790176156, "flos": 30261611462400.0, "grad_norm": 1.8601930708554792, "language_loss": 0.72038591, "learning_rate": 3.362356382736054e-06, "loss": 0.7456435, "num_input_tokens_seen": 50557120, "step": 2359, "time_per_iteration": 2.857846736907959 }, { "auxiliary_loss_clip": 0.01330421, "auxiliary_loss_mlp": 0.0119705, "balance_loss_clip": 1.0097158, "balance_loss_mlp": 1.00078344, "epoch": 0.28377322190825466, "flos": 12677804651520.0, "grad_norm": 1.924996383065737, "language_loss": 0.91119981, "learning_rate": 3.361785980514777e-06, "loss": 0.9364745, "num_input_tokens_seen": 50573320, "step": 2360, "time_per_iteration": 2.7863810062408447 }, { "auxiliary_loss_clip": 0.01266089, "auxiliary_loss_mlp": 0.01197437, "balance_loss_clip": 1.00842285, "balance_loss_mlp": 1.0007894, "epoch": 0.28389346479889377, "flos": 18296676809280.0, "grad_norm": 1.85554964335067, "language_loss": 0.76661837, "learning_rate": 3.361215371711649e-06, "loss": 0.79125369, "num_input_tokens_seen": 50592415, "step": 2361, "time_per_iteration": 2.8389079570770264 }, { "auxiliary_loss_clip": 0.01302129, "auxiliary_loss_mlp": 0.01196128, "balance_loss_clip": 1.00817025, "balance_loss_mlp": 1.00062442, "epoch": 0.2840137076895329, "flos": 20407001659200.0, "grad_norm": 1.7567636139877973, "language_loss": 0.83304799, "learning_rate": 3.3606445564132326e-06, "loss": 0.85803056, "num_input_tokens_seen": 50609710, "step": 2362, "time_per_iteration": 2.812213182449341 }, { "auxiliary_loss_clip": 0.01375211, "auxiliary_loss_mlp": 0.00872753, "balance_loss_clip": 1.01072752, "balance_loss_mlp": 1.00037599, "epoch": 0.28413395058017193, "flos": 20048040645120.0, "grad_norm": 2.809757057232627, "language_loss": 0.82213748, "learning_rate": 3.360073534706118e-06, "loss": 0.84461707, "num_input_tokens_seen": 50626865, "step": 2363, "time_per_iteration": 3.620760917663574 }, { "auxiliary_loss_clip": 0.01330633, "auxiliary_loss_mlp": 0.01197447, "balance_loss_clip": 1.00985932, "balance_loss_mlp": 1.00079966, "epoch": 0.28425419347081105, "flos": 37663626703680.0, "grad_norm": 2.040142559200189, "language_loss": 0.75709009, "learning_rate": 3.35950230667693e-06, "loss": 0.78237087, "num_input_tokens_seen": 50648560, "step": 2364, "time_per_iteration": 2.8849802017211914 }, { "auxiliary_loss_clip": 0.01355709, "auxiliary_loss_mlp": 0.01197351, "balance_loss_clip": 1.01014197, "balance_loss_mlp": 1.0007031, "epoch": 0.28437443636145016, "flos": 13845081178080.0, "grad_norm": 2.200121089370102, "language_loss": 0.86366749, "learning_rate": 3.358930872412323e-06, "loss": 0.88919812, "num_input_tokens_seen": 50665725, "step": 2365, "time_per_iteration": 2.761120080947876 }, { "auxiliary_loss_clip": 0.01351087, "auxiliary_loss_mlp": 0.01197514, "balance_loss_clip": 1.01009107, "balance_loss_mlp": 1.00105727, "epoch": 0.2844946792520892, "flos": 22747805432640.0, "grad_norm": 1.9087615290598248, "language_loss": 0.80896962, "learning_rate": 3.3583592319989825e-06, "loss": 0.83445561, "num_input_tokens_seen": 50685095, "step": 2366, "time_per_iteration": 3.7332355976104736 }, { "auxiliary_loss_clip": 0.01363984, "auxiliary_loss_mlp": 0.01197791, "balance_loss_clip": 1.01102209, "balance_loss_mlp": 1.00114346, "epoch": 0.2846149221427283, "flos": 32416001458560.0, "grad_norm": 2.0702894437841652, "language_loss": 0.68852973, "learning_rate": 3.357787385523627e-06, "loss": 0.71414745, "num_input_tokens_seen": 50706500, "step": 2367, "time_per_iteration": 2.9054408073425293 }, { "auxiliary_loss_clip": 0.01284168, "auxiliary_loss_mlp": 0.01197612, "balance_loss_clip": 1.00868058, "balance_loss_mlp": 1.00096405, "epoch": 0.2847351650333674, "flos": 28475989644960.0, "grad_norm": 2.1472225265209897, "language_loss": 0.82661474, "learning_rate": 3.3572153330730048e-06, "loss": 0.85143256, "num_input_tokens_seen": 50727595, "step": 2368, "time_per_iteration": 2.9474852085113525 }, { "auxiliary_loss_clip": 0.0130578, "auxiliary_loss_mlp": 0.01194699, "balance_loss_clip": 1.00675821, "balance_loss_mlp": 1.00014997, "epoch": 0.2848554079240065, "flos": 55753426447200.0, "grad_norm": 0.8252869527895186, "language_loss": 0.64713997, "learning_rate": 3.3566430747338956e-06, "loss": 0.67214477, "num_input_tokens_seen": 50782800, "step": 2369, "time_per_iteration": 3.223787546157837 }, { "auxiliary_loss_clip": 0.01356325, "auxiliary_loss_mlp": 0.01197775, "balance_loss_clip": 1.01028824, "balance_loss_mlp": 1.00112724, "epoch": 0.2849756508146456, "flos": 11836883646720.0, "grad_norm": 2.1166179729305394, "language_loss": 0.864622, "learning_rate": 3.35607061059311e-06, "loss": 0.89016294, "num_input_tokens_seen": 50797730, "step": 2370, "time_per_iteration": 2.7342145442962646 }, { "auxiliary_loss_clip": 0.0137437, "auxiliary_loss_mlp": 0.01197205, "balance_loss_clip": 1.01105046, "balance_loss_mlp": 1.00074804, "epoch": 0.28509589370528465, "flos": 25155221214240.0, "grad_norm": 2.5175546979235786, "language_loss": 0.7477597, "learning_rate": 3.3554979407374917e-06, "loss": 0.77347541, "num_input_tokens_seen": 50819840, "step": 2371, "time_per_iteration": 2.7401349544525146 }, { "auxiliary_loss_clip": 0.0136238, "auxiliary_loss_mlp": 0.01197436, "balance_loss_clip": 1.01068008, "balance_loss_mlp": 1.00097883, "epoch": 0.28521613659592376, "flos": 19974818679840.0, "grad_norm": 1.5612677714287646, "language_loss": 0.73361522, "learning_rate": 3.3549250652539134e-06, "loss": 0.75921339, "num_input_tokens_seen": 50838935, "step": 2372, "time_per_iteration": 2.9023046493530273 }, { "auxiliary_loss_clip": 0.01343963, "auxiliary_loss_mlp": 0.01197736, "balance_loss_clip": 1.01023459, "balance_loss_mlp": 1.000898, "epoch": 0.2853363794865629, "flos": 23367982831200.0, "grad_norm": 1.6945606049653064, "language_loss": 0.81740236, "learning_rate": 3.3543519842292794e-06, "loss": 0.84281933, "num_input_tokens_seen": 50858590, "step": 2373, "time_per_iteration": 2.772479295730591 }, { "auxiliary_loss_clip": 0.01374809, "auxiliary_loss_mlp": 0.00872809, "balance_loss_clip": 1.0107615, "balance_loss_mlp": 1.0003916, "epoch": 0.28545662237720193, "flos": 19861950180960.0, "grad_norm": 1.927454143617725, "language_loss": 0.83591622, "learning_rate": 3.353778697750527e-06, "loss": 0.85839248, "num_input_tokens_seen": 50876995, "step": 2374, "time_per_iteration": 2.7388529777526855 }, { "auxiliary_loss_clip": 0.01337083, "auxiliary_loss_mlp": 0.01197541, "balance_loss_clip": 1.01047635, "balance_loss_mlp": 1.00089359, "epoch": 0.28557686526784104, "flos": 23879027793600.0, "grad_norm": 1.579186269308948, "language_loss": 0.89389461, "learning_rate": 3.353205205904622e-06, "loss": 0.91924083, "num_input_tokens_seen": 50896105, "step": 2375, "time_per_iteration": 2.776925563812256 }, { "auxiliary_loss_clip": 0.01328981, "auxiliary_loss_mlp": 0.01197392, "balance_loss_clip": 1.00961947, "balance_loss_mlp": 1.00074399, "epoch": 0.28569710815848015, "flos": 44890400432160.0, "grad_norm": 1.997980155234147, "language_loss": 0.7168566, "learning_rate": 3.3526315087785637e-06, "loss": 0.74212033, "num_input_tokens_seen": 50917220, "step": 2376, "time_per_iteration": 2.9199843406677246 }, { "auxiliary_loss_clip": 0.01287428, "auxiliary_loss_mlp": 0.01196527, "balance_loss_clip": 1.00963211, "balance_loss_mlp": 1.00064278, "epoch": 0.2858173510491192, "flos": 26829770716800.0, "grad_norm": 2.07065734620495, "language_loss": 0.80758619, "learning_rate": 3.3520576064593805e-06, "loss": 0.83242577, "num_input_tokens_seen": 50937175, "step": 2377, "time_per_iteration": 2.8730521202087402 }, { "auxiliary_loss_clip": 0.01360591, "auxiliary_loss_mlp": 0.01197175, "balance_loss_clip": 1.01061845, "balance_loss_mlp": 1.00090933, "epoch": 0.2859375939397583, "flos": 23148927637920.0, "grad_norm": 1.6874866433280027, "language_loss": 0.81602979, "learning_rate": 3.3514834990341337e-06, "loss": 0.84160745, "num_input_tokens_seen": 50957500, "step": 2378, "time_per_iteration": 2.7959532737731934 }, { "auxiliary_loss_clip": 0.01347333, "auxiliary_loss_mlp": 0.01197469, "balance_loss_clip": 1.010324, "balance_loss_mlp": 1.00082183, "epoch": 0.2860578368303974, "flos": 12129807431520.0, "grad_norm": 2.326847595389002, "language_loss": 0.92896599, "learning_rate": 3.3509091865899144e-06, "loss": 0.95441401, "num_input_tokens_seen": 50972690, "step": 2379, "time_per_iteration": 2.8002095222473145 }, { "auxiliary_loss_clip": 0.01375057, "auxiliary_loss_mlp": 0.01197435, "balance_loss_clip": 1.01091933, "balance_loss_mlp": 1.00078702, "epoch": 0.2861780797210365, "flos": 19938046040640.0, "grad_norm": 2.187111845933532, "language_loss": 0.70626593, "learning_rate": 3.350334669213846e-06, "loss": 0.73199081, "num_input_tokens_seen": 50990095, "step": 2380, "time_per_iteration": 2.7130801677703857 }, { "auxiliary_loss_clip": 0.01350952, "auxiliary_loss_mlp": 0.01197312, "balance_loss_clip": 1.01060116, "balance_loss_mlp": 1.00085485, "epoch": 0.2862983226116756, "flos": 27563139927360.0, "grad_norm": 1.9754205946204757, "language_loss": 0.75431585, "learning_rate": 3.3497599469930816e-06, "loss": 0.77979851, "num_input_tokens_seen": 51008305, "step": 2381, "time_per_iteration": 2.819239616394043 }, { "auxiliary_loss_clip": 0.01375552, "auxiliary_loss_mlp": 0.01197664, "balance_loss_clip": 1.01101661, "balance_loss_mlp": 1.00101686, "epoch": 0.28641856550231465, "flos": 22053974984640.0, "grad_norm": 3.2534327313737488, "language_loss": 0.8289901, "learning_rate": 3.349185020014807e-06, "loss": 0.85472226, "num_input_tokens_seen": 51025570, "step": 2382, "time_per_iteration": 2.6466050148010254 }, { "auxiliary_loss_clip": 0.01362589, "auxiliary_loss_mlp": 0.0119738, "balance_loss_clip": 1.01089883, "balance_loss_mlp": 1.00092328, "epoch": 0.28653880839295376, "flos": 22378785788160.0, "grad_norm": 1.7369422099419605, "language_loss": 0.74307901, "learning_rate": 3.348609888366237e-06, "loss": 0.76867867, "num_input_tokens_seen": 51044585, "step": 2383, "time_per_iteration": 2.8345251083374023 }, { "auxiliary_loss_clip": 0.01281968, "auxiliary_loss_mlp": 0.01197393, "balance_loss_clip": 1.00954032, "balance_loss_mlp": 1.00093651, "epoch": 0.28665905128359287, "flos": 23367982831200.0, "grad_norm": 2.274426766942891, "language_loss": 0.62613893, "learning_rate": 3.348034552134619e-06, "loss": 0.65093255, "num_input_tokens_seen": 51063990, "step": 2384, "time_per_iteration": 2.8735733032226562 }, { "auxiliary_loss_clip": 0.01291085, "auxiliary_loss_mlp": 0.01197127, "balance_loss_clip": 1.00938344, "balance_loss_mlp": 1.00067043, "epoch": 0.2867792941742319, "flos": 20881705066560.0, "grad_norm": 2.9174126061488628, "language_loss": 0.84417051, "learning_rate": 3.3474590114072316e-06, "loss": 0.86905265, "num_input_tokens_seen": 51081990, "step": 2385, "time_per_iteration": 2.801973581314087 }, { "auxiliary_loss_clip": 0.01301568, "auxiliary_loss_mlp": 0.01197398, "balance_loss_clip": 1.00914359, "balance_loss_mlp": 1.00094092, "epoch": 0.28689953706487104, "flos": 20664014973120.0, "grad_norm": 1.7315621852149337, "language_loss": 0.82994574, "learning_rate": 3.3468832662713836e-06, "loss": 0.85493541, "num_input_tokens_seen": 51100235, "step": 2386, "time_per_iteration": 2.9122707843780518 }, { "auxiliary_loss_clip": 0.01305568, "auxiliary_loss_mlp": 0.01197517, "balance_loss_clip": 1.00862646, "balance_loss_mlp": 1.00086951, "epoch": 0.28701977995551015, "flos": 12675541459680.0, "grad_norm": 2.010059800548757, "language_loss": 0.83865523, "learning_rate": 3.346307316814415e-06, "loss": 0.86368614, "num_input_tokens_seen": 51115405, "step": 2387, "time_per_iteration": 2.908039093017578 }, { "auxiliary_loss_clip": 0.01348786, "auxiliary_loss_mlp": 0.01197925, "balance_loss_clip": 1.01040864, "balance_loss_mlp": 1.00108719, "epoch": 0.2871400228461492, "flos": 21252377200320.0, "grad_norm": 1.9680967136072236, "language_loss": 0.75522947, "learning_rate": 3.3457311631236965e-06, "loss": 0.78069657, "num_input_tokens_seen": 51136390, "step": 2388, "time_per_iteration": 2.744983434677124 }, { "auxiliary_loss_clip": 0.01338076, "auxiliary_loss_mlp": 0.01197604, "balance_loss_clip": 1.0105052, "balance_loss_mlp": 1.00076556, "epoch": 0.2872602657367883, "flos": 25119274819680.0, "grad_norm": 1.832458223112394, "language_loss": 0.84406459, "learning_rate": 3.345154805286631e-06, "loss": 0.86942136, "num_input_tokens_seen": 51156650, "step": 2389, "time_per_iteration": 3.868417739868164 }, { "auxiliary_loss_clip": 0.01363463, "auxiliary_loss_mlp": 0.01197181, "balance_loss_clip": 1.01107931, "balance_loss_mlp": 1.00053358, "epoch": 0.2873805086274274, "flos": 16646614047360.0, "grad_norm": 2.747279975023943, "language_loss": 0.76219028, "learning_rate": 3.344578243390651e-06, "loss": 0.78779674, "num_input_tokens_seen": 51172210, "step": 2390, "time_per_iteration": 2.7409579753875732 }, { "auxiliary_loss_clip": 0.01327786, "auxiliary_loss_mlp": 0.01197486, "balance_loss_clip": 1.00981581, "balance_loss_mlp": 1.00083852, "epoch": 0.2875007515180665, "flos": 17420132723040.0, "grad_norm": 2.1388035075533094, "language_loss": 0.78413093, "learning_rate": 3.3440014775232206e-06, "loss": 0.80938357, "num_input_tokens_seen": 51190265, "step": 2391, "time_per_iteration": 2.756878137588501 }, { "auxiliary_loss_clip": 0.01324024, "auxiliary_loss_mlp": 0.01197199, "balance_loss_clip": 1.01052213, "balance_loss_mlp": 1.00055146, "epoch": 0.2876209944087056, "flos": 23434199678880.0, "grad_norm": 1.9931163310928164, "language_loss": 0.71162283, "learning_rate": 3.343424507771834e-06, "loss": 0.73683512, "num_input_tokens_seen": 51208475, "step": 2392, "time_per_iteration": 4.83510422706604 }, { "auxiliary_loss_clip": 0.01312229, "auxiliary_loss_mlp": 0.01196874, "balance_loss_clip": 1.00961804, "balance_loss_mlp": 1.00060749, "epoch": 0.2877412372993447, "flos": 13735517657760.0, "grad_norm": 1.8716660836823105, "language_loss": 0.8638556, "learning_rate": 3.342847334224018e-06, "loss": 0.88894665, "num_input_tokens_seen": 51225875, "step": 2393, "time_per_iteration": 2.7675440311431885 }, { "auxiliary_loss_clip": 0.0133287, "auxiliary_loss_mlp": 0.01194572, "balance_loss_clip": 1.00740552, "balance_loss_mlp": 1.00002229, "epoch": 0.28786148018998375, "flos": 58079730263040.0, "grad_norm": 0.9420128977734363, "language_loss": 0.62433481, "learning_rate": 3.342269956967329e-06, "loss": 0.64960921, "num_input_tokens_seen": 51287780, "step": 2394, "time_per_iteration": 3.368061065673828 }, { "auxiliary_loss_clip": 0.01352334, "auxiliary_loss_mlp": 0.01197612, "balance_loss_clip": 1.00993097, "balance_loss_mlp": 1.00077415, "epoch": 0.28798172308062286, "flos": 23435061847200.0, "grad_norm": 2.885291386394471, "language_loss": 0.71319377, "learning_rate": 3.341692376089355e-06, "loss": 0.73869324, "num_input_tokens_seen": 51303335, "step": 2395, "time_per_iteration": 2.7501940727233887 }, { "auxiliary_loss_clip": 0.01349869, "auxiliary_loss_mlp": 0.01197409, "balance_loss_clip": 1.01045763, "balance_loss_mlp": 1.00095201, "epoch": 0.288101965971262, "flos": 25110042433920.0, "grad_norm": 3.610716686111075, "language_loss": 0.83671081, "learning_rate": 3.3411145916777146e-06, "loss": 0.86218357, "num_input_tokens_seen": 51317495, "step": 2396, "time_per_iteration": 2.7800450325012207 }, { "auxiliary_loss_clip": 0.01350982, "auxiliary_loss_mlp": 0.01197648, "balance_loss_clip": 1.01137042, "balance_loss_mlp": 1.00080991, "epoch": 0.28822220886190103, "flos": 16252568807040.0, "grad_norm": 2.636949026021026, "language_loss": 0.90869355, "learning_rate": 3.3405366038200566e-06, "loss": 0.9341799, "num_input_tokens_seen": 51336430, "step": 2397, "time_per_iteration": 2.8183834552764893 }, { "auxiliary_loss_clip": 0.01330867, "auxiliary_loss_mlp": 0.01197999, "balance_loss_clip": 1.01058805, "balance_loss_mlp": 1.00097013, "epoch": 0.28834245175254014, "flos": 24535654518240.0, "grad_norm": 2.111365621473494, "language_loss": 0.85205382, "learning_rate": 3.3399584126040617e-06, "loss": 0.87734252, "num_input_tokens_seen": 51355930, "step": 2398, "time_per_iteration": 2.843275547027588 }, { "auxiliary_loss_clip": 0.0137492, "auxiliary_loss_mlp": 0.0087276, "balance_loss_clip": 1.01090598, "balance_loss_mlp": 1.00028443, "epoch": 0.2884626946431792, "flos": 24571457218080.0, "grad_norm": 2.4615250045244923, "language_loss": 0.90915692, "learning_rate": 3.339380018117441e-06, "loss": 0.93163371, "num_input_tokens_seen": 51376765, "step": 2399, "time_per_iteration": 2.7708230018615723 }, { "auxiliary_loss_clip": 0.01348475, "auxiliary_loss_mlp": 0.01197126, "balance_loss_clip": 1.01051641, "balance_loss_mlp": 1.0008601, "epoch": 0.2885829375338183, "flos": 16544666347200.0, "grad_norm": 3.2362149144139387, "language_loss": 0.77996659, "learning_rate": 3.3388014204479366e-06, "loss": 0.80542266, "num_input_tokens_seen": 51394570, "step": 2400, "time_per_iteration": 2.711756467819214 }, { "auxiliary_loss_clip": 0.01375721, "auxiliary_loss_mlp": 0.01197361, "balance_loss_clip": 1.01113462, "balance_loss_mlp": 1.00071383, "epoch": 0.2887031804244574, "flos": 24061238500320.0, "grad_norm": 1.977814019279046, "language_loss": 0.91642624, "learning_rate": 3.338222619683321e-06, "loss": 0.94215703, "num_input_tokens_seen": 51414535, "step": 2401, "time_per_iteration": 2.834813117980957 }, { "auxiliary_loss_clip": 0.01332143, "auxiliary_loss_mlp": 0.01197035, "balance_loss_clip": 1.00973344, "balance_loss_mlp": 1.00057781, "epoch": 0.2888234233150965, "flos": 23330707260480.0, "grad_norm": 2.4772434901744194, "language_loss": 0.73630321, "learning_rate": 3.337643615911398e-06, "loss": 0.76159501, "num_input_tokens_seen": 51434160, "step": 2402, "time_per_iteration": 2.7836883068084717 }, { "auxiliary_loss_clip": 0.01354828, "auxiliary_loss_mlp": 0.01197326, "balance_loss_clip": 1.01036716, "balance_loss_mlp": 1.0008688, "epoch": 0.2889436662057356, "flos": 22272778712160.0, "grad_norm": 1.8274546636722513, "language_loss": 0.78670532, "learning_rate": 3.3370644092200026e-06, "loss": 0.81222689, "num_input_tokens_seen": 51451435, "step": 2403, "time_per_iteration": 2.8173563480377197 }, { "auxiliary_loss_clip": 0.01325703, "auxiliary_loss_mlp": 0.01197547, "balance_loss_clip": 1.00987458, "balance_loss_mlp": 1.00070834, "epoch": 0.2890639090963747, "flos": 21616942308480.0, "grad_norm": 1.8984726854495617, "language_loss": 0.78342426, "learning_rate": 3.3364849996969985e-06, "loss": 0.80865681, "num_input_tokens_seen": 51471455, "step": 2404, "time_per_iteration": 2.7879416942596436 }, { "auxiliary_loss_clip": 0.01350364, "auxiliary_loss_mlp": 0.01196982, "balance_loss_clip": 1.01004565, "balance_loss_mlp": 1.00090623, "epoch": 0.28918415198701375, "flos": 28585553165280.0, "grad_norm": 2.0143690312980076, "language_loss": 0.8533144, "learning_rate": 3.335905387430283e-06, "loss": 0.87878782, "num_input_tokens_seen": 51492890, "step": 2405, "time_per_iteration": 2.766627550125122 }, { "auxiliary_loss_clip": 0.01342425, "auxiliary_loss_mlp": 0.0119731, "balance_loss_clip": 1.00979638, "balance_loss_mlp": 1.00066257, "epoch": 0.28930439487765286, "flos": 21944698853760.0, "grad_norm": 1.9978488451160845, "language_loss": 0.83004522, "learning_rate": 3.335325572507782e-06, "loss": 0.85544258, "num_input_tokens_seen": 51513390, "step": 2406, "time_per_iteration": 2.8414621353149414 }, { "auxiliary_loss_clip": 0.01374846, "auxiliary_loss_mlp": 0.00872734, "balance_loss_clip": 1.01134562, "balance_loss_mlp": 1.00032473, "epoch": 0.28942463776829197, "flos": 19281922247520.0, "grad_norm": 1.5212266529808733, "language_loss": 0.73867893, "learning_rate": 3.3347455550174537e-06, "loss": 0.76115465, "num_input_tokens_seen": 51532730, "step": 2407, "time_per_iteration": 2.7445762157440186 }, { "auxiliary_loss_clip": 0.01326614, "auxiliary_loss_mlp": 0.01197143, "balance_loss_clip": 1.00973463, "balance_loss_mlp": 1.00068557, "epoch": 0.289544880658931, "flos": 14645709023040.0, "grad_norm": 2.971487383758936, "language_loss": 0.68179524, "learning_rate": 3.3341653350472864e-06, "loss": 0.7070328, "num_input_tokens_seen": 51549560, "step": 2408, "time_per_iteration": 2.8247756958007812 }, { "auxiliary_loss_clip": 0.01377775, "auxiliary_loss_mlp": 0.01198129, "balance_loss_clip": 1.01193643, "balance_loss_mlp": 1.00110006, "epoch": 0.28966512354957014, "flos": 28621894720320.0, "grad_norm": 2.3931322801886594, "language_loss": 0.69693363, "learning_rate": 3.333584912685298e-06, "loss": 0.72269273, "num_input_tokens_seen": 51568180, "step": 2409, "time_per_iteration": 2.9219582080841064 }, { "auxiliary_loss_clip": 0.01293721, "auxiliary_loss_mlp": 0.01194755, "balance_loss_clip": 1.00606346, "balance_loss_mlp": 1.00020587, "epoch": 0.28978536644020925, "flos": 64711819196640.0, "grad_norm": 0.8707146508453452, "language_loss": 0.5551855, "learning_rate": 3.3330042880195385e-06, "loss": 0.5800702, "num_input_tokens_seen": 51622530, "step": 2410, "time_per_iteration": 3.334975004196167 }, { "auxiliary_loss_clip": 0.01337538, "auxiliary_loss_mlp": 0.01197345, "balance_loss_clip": 1.01059175, "balance_loss_mlp": 1.00069761, "epoch": 0.2899056093308483, "flos": 18624648896640.0, "grad_norm": 1.6277153720948518, "language_loss": 0.78303915, "learning_rate": 3.3324234611380888e-06, "loss": 0.80838799, "num_input_tokens_seen": 51641260, "step": 2411, "time_per_iteration": 2.7571792602539062 }, { "auxiliary_loss_clip": 0.01311691, "auxiliary_loss_mlp": 0.01196799, "balance_loss_clip": 1.00999641, "balance_loss_mlp": 1.00072396, "epoch": 0.2900258522214874, "flos": 22893746431680.0, "grad_norm": 1.6777761450106026, "language_loss": 0.81670088, "learning_rate": 3.3318424321290596e-06, "loss": 0.84178579, "num_input_tokens_seen": 51660975, "step": 2412, "time_per_iteration": 2.882455587387085 }, { "auxiliary_loss_clip": 0.0129938, "auxiliary_loss_mlp": 0.01194582, "balance_loss_clip": 1.00658441, "balance_loss_mlp": 1.00003278, "epoch": 0.2901460951121265, "flos": 71106064852320.0, "grad_norm": 0.8214263997488997, "language_loss": 0.59993351, "learning_rate": 3.3312612010805917e-06, "loss": 0.62487316, "num_input_tokens_seen": 51720550, "step": 2413, "time_per_iteration": 3.4158778190612793 }, { "auxiliary_loss_clip": 0.01338007, "auxiliary_loss_mlp": 0.01197628, "balance_loss_clip": 1.01078534, "balance_loss_mlp": 1.0009805, "epoch": 0.2902663380027656, "flos": 32160999870720.0, "grad_norm": 1.6209801713762126, "language_loss": 0.69947493, "learning_rate": 3.330679768080858e-06, "loss": 0.72483128, "num_input_tokens_seen": 51744435, "step": 2414, "time_per_iteration": 3.007305383682251 }, { "auxiliary_loss_clip": 0.01350021, "auxiliary_loss_mlp": 0.01197409, "balance_loss_clip": 1.01066017, "balance_loss_mlp": 1.00095201, "epoch": 0.2903865808934047, "flos": 29351671562880.0, "grad_norm": 2.2103916959754386, "language_loss": 0.83438253, "learning_rate": 3.3300981332180627e-06, "loss": 0.85985684, "num_input_tokens_seen": 51763640, "step": 2415, "time_per_iteration": 3.7817792892456055 }, { "auxiliary_loss_clip": 0.01322734, "auxiliary_loss_mlp": 0.01197324, "balance_loss_clip": 1.00935245, "balance_loss_mlp": 1.00086749, "epoch": 0.29050682378404374, "flos": 17089035275520.0, "grad_norm": 2.0259067588114883, "language_loss": 0.79514945, "learning_rate": 3.3295162965804373e-06, "loss": 0.82035005, "num_input_tokens_seen": 51782135, "step": 2416, "time_per_iteration": 2.8173277378082275 }, { "auxiliary_loss_clip": 0.01305258, "auxiliary_loss_mlp": 0.01196928, "balance_loss_clip": 1.00902867, "balance_loss_mlp": 1.00066209, "epoch": 0.29062706667468285, "flos": 17858243109600.0, "grad_norm": 1.9907134633488095, "language_loss": 0.7818538, "learning_rate": 3.328934258256247e-06, "loss": 0.80687571, "num_input_tokens_seen": 51800200, "step": 2417, "time_per_iteration": 2.7596945762634277 }, { "auxiliary_loss_clip": 0.01350304, "auxiliary_loss_mlp": 0.01197424, "balance_loss_clip": 1.00981975, "balance_loss_mlp": 1.00077629, "epoch": 0.29074730956532197, "flos": 24279826685760.0, "grad_norm": 2.151102831847073, "language_loss": 0.67201608, "learning_rate": 3.3283520183337856e-06, "loss": 0.69749337, "num_input_tokens_seen": 51819905, "step": 2418, "time_per_iteration": 3.8060426712036133 }, { "auxiliary_loss_clip": 0.01332979, "auxiliary_loss_mlp": 0.01196808, "balance_loss_clip": 1.0100286, "balance_loss_mlp": 1.00073242, "epoch": 0.290867552455961, "flos": 22340971362240.0, "grad_norm": 1.7938898440150883, "language_loss": 0.69134605, "learning_rate": 3.3277695769013797e-06, "loss": 0.71664387, "num_input_tokens_seen": 51839350, "step": 2419, "time_per_iteration": 4.020708084106445 }, { "auxiliary_loss_clip": 0.01350586, "auxiliary_loss_mlp": 0.01197633, "balance_loss_clip": 1.01069772, "balance_loss_mlp": 1.00079477, "epoch": 0.29098779534660013, "flos": 23186167284960.0, "grad_norm": 2.068450629912976, "language_loss": 0.77348477, "learning_rate": 3.327186934047385e-06, "loss": 0.798967, "num_input_tokens_seen": 51858045, "step": 2420, "time_per_iteration": 2.7524120807647705 }, { "auxiliary_loss_clip": 0.01349628, "auxiliary_loss_mlp": 0.011969, "balance_loss_clip": 1.01053488, "balance_loss_mlp": 1.00082433, "epoch": 0.29110803823723924, "flos": 15304203779040.0, "grad_norm": 1.7833529876166843, "language_loss": 0.65422022, "learning_rate": 3.3266040898601877e-06, "loss": 0.67968559, "num_input_tokens_seen": 51875880, "step": 2421, "time_per_iteration": 2.8001556396484375 }, { "auxiliary_loss_clip": 0.0132513, "auxiliary_loss_mlp": 0.01197058, "balance_loss_clip": 1.01099181, "balance_loss_mlp": 1.00079179, "epoch": 0.2912282811278783, "flos": 22595362247520.0, "grad_norm": 1.8964749450830392, "language_loss": 0.78126514, "learning_rate": 3.3260210444282045e-06, "loss": 0.80648708, "num_input_tokens_seen": 51893835, "step": 2422, "time_per_iteration": 2.881375312805176 }, { "auxiliary_loss_clip": 0.0134865, "auxiliary_loss_mlp": 0.01197484, "balance_loss_clip": 1.01066518, "balance_loss_mlp": 1.00083685, "epoch": 0.2913485240185174, "flos": 24497911939680.0, "grad_norm": 2.007882480638671, "language_loss": 0.73077536, "learning_rate": 3.325437797839883e-06, "loss": 0.75623667, "num_input_tokens_seen": 51912205, "step": 2423, "time_per_iteration": 2.7370920181274414 }, { "auxiliary_loss_clip": 0.01374334, "auxiliary_loss_mlp": 0.01197477, "balance_loss_clip": 1.0106504, "balance_loss_mlp": 1.0008291, "epoch": 0.2914687669091565, "flos": 17931033990720.0, "grad_norm": 2.2081660447197784, "language_loss": 0.74671137, "learning_rate": 3.3248543501837015e-06, "loss": 0.77242947, "num_input_tokens_seen": 51929410, "step": 2424, "time_per_iteration": 2.712751626968384 }, { "auxiliary_loss_clip": 0.01295786, "auxiliary_loss_mlp": 0.01197549, "balance_loss_clip": 1.00925946, "balance_loss_mlp": 1.00090086, "epoch": 0.2915890097997956, "flos": 22529325018240.0, "grad_norm": 2.0409203399735483, "language_loss": 0.77173698, "learning_rate": 3.3242707015481684e-06, "loss": 0.79667032, "num_input_tokens_seen": 51949345, "step": 2425, "time_per_iteration": 2.886415958404541 }, { "auxiliary_loss_clip": 0.01348599, "auxiliary_loss_mlp": 0.01197139, "balance_loss_clip": 1.01024127, "balance_loss_mlp": 1.00087273, "epoch": 0.2917092526904347, "flos": 13845224872800.0, "grad_norm": 1.6407457199783002, "language_loss": 0.80467206, "learning_rate": 3.323686852021823e-06, "loss": 0.83012944, "num_input_tokens_seen": 51966855, "step": 2426, "time_per_iteration": 2.8082737922668457 }, { "auxiliary_loss_clip": 0.01328515, "auxiliary_loss_mlp": 0.01196977, "balance_loss_clip": 1.01005387, "balance_loss_mlp": 1.00071144, "epoch": 0.2918294955810738, "flos": 22674870856800.0, "grad_norm": 1.8806950224800614, "language_loss": 0.79756594, "learning_rate": 3.323102801693235e-06, "loss": 0.8228209, "num_input_tokens_seen": 51985620, "step": 2427, "time_per_iteration": 2.8004870414733887 }, { "auxiliary_loss_clip": 0.01361521, "auxiliary_loss_mlp": 0.01196954, "balance_loss_clip": 1.0106864, "balance_loss_mlp": 1.00049686, "epoch": 0.29194973847171285, "flos": 23438294978400.0, "grad_norm": 2.0321993125726636, "language_loss": 0.8075949, "learning_rate": 3.322518550651003e-06, "loss": 0.83317971, "num_input_tokens_seen": 52004930, "step": 2428, "time_per_iteration": 2.825230121612549 }, { "auxiliary_loss_clip": 0.01348985, "auxiliary_loss_mlp": 0.0119727, "balance_loss_clip": 1.01059139, "balance_loss_mlp": 1.00081301, "epoch": 0.29206998136235196, "flos": 21909075772320.0, "grad_norm": 1.6845520027690954, "language_loss": 0.81164122, "learning_rate": 3.3219340989837586e-06, "loss": 0.83710372, "num_input_tokens_seen": 52024920, "step": 2429, "time_per_iteration": 2.802966356277466 }, { "auxiliary_loss_clip": 0.01336861, "auxiliary_loss_mlp": 0.01196931, "balance_loss_clip": 1.01041317, "balance_loss_mlp": 1.00066471, "epoch": 0.292190224252991, "flos": 23215934806560.0, "grad_norm": 2.2129990323063096, "language_loss": 0.80393469, "learning_rate": 3.3213494467801625e-06, "loss": 0.82927263, "num_input_tokens_seen": 52044095, "step": 2430, "time_per_iteration": 2.8024842739105225 }, { "auxiliary_loss_clip": 0.01278618, "auxiliary_loss_mlp": 0.01197205, "balance_loss_clip": 1.0098871, "balance_loss_mlp": 1.00074852, "epoch": 0.2923104671436301, "flos": 20740829306400.0, "grad_norm": 1.9692912316821614, "language_loss": 0.71562874, "learning_rate": 3.3207645941289063e-06, "loss": 0.74038696, "num_input_tokens_seen": 52062440, "step": 2431, "time_per_iteration": 2.9610257148742676 }, { "auxiliary_loss_clip": 0.01349463, "auxiliary_loss_mlp": 0.00872697, "balance_loss_clip": 1.01007342, "balance_loss_mlp": 1.00012004, "epoch": 0.29243071003426924, "flos": 35809129686240.0, "grad_norm": 1.8482360082799696, "language_loss": 0.800713, "learning_rate": 3.320179541118711e-06, "loss": 0.82293463, "num_input_tokens_seen": 52084940, "step": 2432, "time_per_iteration": 3.0193068981170654 }, { "auxiliary_loss_clip": 0.01336412, "auxiliary_loss_mlp": 0.01194554, "balance_loss_clip": 1.00647855, "balance_loss_mlp": 1.00000429, "epoch": 0.2925509529249083, "flos": 58081634218080.0, "grad_norm": 0.9908755235554063, "language_loss": 0.60283917, "learning_rate": 3.3195942878383293e-06, "loss": 0.62814885, "num_input_tokens_seen": 52141040, "step": 2433, "time_per_iteration": 3.32578444480896 }, { "auxiliary_loss_clip": 0.01351071, "auxiliary_loss_mlp": 0.0119732, "balance_loss_clip": 1.0100131, "balance_loss_mlp": 1.00067258, "epoch": 0.2926711958155474, "flos": 21397132717920.0, "grad_norm": 1.7813322657991872, "language_loss": 0.77820837, "learning_rate": 3.319008834376543e-06, "loss": 0.80369234, "num_input_tokens_seen": 52160730, "step": 2434, "time_per_iteration": 2.7197318077087402 }, { "auxiliary_loss_clip": 0.01334545, "auxiliary_loss_mlp": 0.01197372, "balance_loss_clip": 1.00983524, "balance_loss_mlp": 1.00091481, "epoch": 0.2927914387061865, "flos": 23185808048160.0, "grad_norm": 2.452933750206372, "language_loss": 0.88706577, "learning_rate": 3.3184231808221654e-06, "loss": 0.91238487, "num_input_tokens_seen": 52175055, "step": 2435, "time_per_iteration": 2.854990005493164 }, { "auxiliary_loss_clip": 0.01314892, "auxiliary_loss_mlp": 0.01196879, "balance_loss_clip": 1.00981188, "balance_loss_mlp": 1.00080371, "epoch": 0.29291168159682557, "flos": 22455564197760.0, "grad_norm": 2.0501797691335093, "language_loss": 0.63176155, "learning_rate": 3.3178373272640394e-06, "loss": 0.65687931, "num_input_tokens_seen": 52194150, "step": 2436, "time_per_iteration": 2.837186574935913 }, { "auxiliary_loss_clip": 0.01373203, "auxiliary_loss_mlp": 0.01196933, "balance_loss_clip": 1.01036394, "balance_loss_mlp": 1.0006671, "epoch": 0.2930319244874647, "flos": 21170641322880.0, "grad_norm": 2.2185733626588346, "language_loss": 0.84786522, "learning_rate": 3.3172512737910387e-06, "loss": 0.87356663, "num_input_tokens_seen": 52211660, "step": 2437, "time_per_iteration": 2.66296124458313 }, { "auxiliary_loss_clip": 0.01361663, "auxiliary_loss_mlp": 0.01196902, "balance_loss_clip": 1.01067424, "balance_loss_mlp": 1.00082636, "epoch": 0.2931521673781038, "flos": 31357857368160.0, "grad_norm": 1.9226617168345206, "language_loss": 0.88421077, "learning_rate": 3.3166650204920674e-06, "loss": 0.90979636, "num_input_tokens_seen": 52232830, "step": 2438, "time_per_iteration": 2.845126152038574 }, { "auxiliary_loss_clip": 0.01349882, "auxiliary_loss_mlp": 0.0119749, "balance_loss_clip": 1.01039505, "balance_loss_mlp": 1.00084281, "epoch": 0.29327241026874284, "flos": 24201000626400.0, "grad_norm": 1.6769596675768945, "language_loss": 0.81746137, "learning_rate": 3.316078567456059e-06, "loss": 0.84293514, "num_input_tokens_seen": 52250670, "step": 2439, "time_per_iteration": 2.80265736579895 }, { "auxiliary_loss_clip": 0.01279965, "auxiliary_loss_mlp": 0.01197162, "balance_loss_clip": 1.00854373, "balance_loss_mlp": 1.00089574, "epoch": 0.29339265315938196, "flos": 24242622962400.0, "grad_norm": 3.44117703305863, "language_loss": 0.75972927, "learning_rate": 3.3154919147719786e-06, "loss": 0.78450048, "num_input_tokens_seen": 52271685, "step": 2440, "time_per_iteration": 3.002190113067627 }, { "auxiliary_loss_clip": 0.0135165, "auxiliary_loss_mlp": 0.01197073, "balance_loss_clip": 1.00966895, "balance_loss_mlp": 1.00061619, "epoch": 0.29351289605002107, "flos": 16946650720800.0, "grad_norm": 1.985128386205245, "language_loss": 0.86526752, "learning_rate": 3.31490506252882e-06, "loss": 0.8907547, "num_input_tokens_seen": 52291065, "step": 2441, "time_per_iteration": 4.1025190353393555 }, { "auxiliary_loss_clip": 0.01323295, "auxiliary_loss_mlp": 0.01196737, "balance_loss_clip": 1.01028681, "balance_loss_mlp": 1.00066185, "epoch": 0.2936331389406601, "flos": 19829093222880.0, "grad_norm": 1.6803570936810945, "language_loss": 0.8436321, "learning_rate": 3.31431801081561e-06, "loss": 0.86883247, "num_input_tokens_seen": 52310000, "step": 2442, "time_per_iteration": 2.9150402545928955 }, { "auxiliary_loss_clip": 0.01313202, "auxiliary_loss_mlp": 0.01194588, "balance_loss_clip": 1.00724745, "balance_loss_mlp": 1.00003862, "epoch": 0.29375338183129923, "flos": 71416878629760.0, "grad_norm": 0.9287356738188312, "language_loss": 0.67903632, "learning_rate": 3.313730759721402e-06, "loss": 0.70411426, "num_input_tokens_seen": 52372930, "step": 2443, "time_per_iteration": 3.444983959197998 }, { "auxiliary_loss_clip": 0.01327784, "auxiliary_loss_mlp": 0.01196654, "balance_loss_clip": 1.00963271, "balance_loss_mlp": 1.00076938, "epoch": 0.29387362472193834, "flos": 22054513839840.0, "grad_norm": 2.261788540397868, "language_loss": 0.86900431, "learning_rate": 3.313143309335282e-06, "loss": 0.89424866, "num_input_tokens_seen": 52391420, "step": 2444, "time_per_iteration": 2.759086847305298 }, { "auxiliary_loss_clip": 0.01310826, "auxiliary_loss_mlp": 0.01197245, "balance_loss_clip": 1.00980842, "balance_loss_mlp": 1.0011692, "epoch": 0.2939938676125774, "flos": 22966429541760.0, "grad_norm": 1.7313767456184403, "language_loss": 0.84732628, "learning_rate": 3.3125556597463665e-06, "loss": 0.87240702, "num_input_tokens_seen": 52410725, "step": 2445, "time_per_iteration": 4.769291162490845 }, { "auxiliary_loss_clip": 0.01349906, "auxiliary_loss_mlp": 0.0119709, "balance_loss_clip": 1.01039362, "balance_loss_mlp": 1.0008235, "epoch": 0.2941141105032165, "flos": 31358719536480.0, "grad_norm": 1.4697335638051667, "language_loss": 0.66343623, "learning_rate": 3.311967811043801e-06, "loss": 0.68890619, "num_input_tokens_seen": 52432645, "step": 2446, "time_per_iteration": 2.8511087894439697 }, { "auxiliary_loss_clip": 0.01349955, "auxiliary_loss_mlp": 0.0119734, "balance_loss_clip": 1.01001096, "balance_loss_mlp": 1.0010736, "epoch": 0.29423435339385556, "flos": 23222149603200.0, "grad_norm": 2.10356038721514, "language_loss": 0.81885475, "learning_rate": 3.3113797633167617e-06, "loss": 0.84432769, "num_input_tokens_seen": 52450940, "step": 2447, "time_per_iteration": 2.7664496898651123 }, { "auxiliary_loss_clip": 0.01373469, "auxiliary_loss_mlp": 0.01197059, "balance_loss_clip": 1.01049876, "balance_loss_mlp": 1.0007931, "epoch": 0.2943545962844947, "flos": 26864064622080.0, "grad_norm": 2.23094997129622, "language_loss": 0.69499063, "learning_rate": 3.310791516654455e-06, "loss": 0.72069597, "num_input_tokens_seen": 52468000, "step": 2448, "time_per_iteration": 2.7137091159820557 }, { "auxiliary_loss_clip": 0.01326665, "auxiliary_loss_mlp": 0.01196904, "balance_loss_clip": 1.0101912, "balance_loss_mlp": 1.00063801, "epoch": 0.2944748391751338, "flos": 20231688299040.0, "grad_norm": 2.2611168500820047, "language_loss": 0.79576075, "learning_rate": 3.3102030711461177e-06, "loss": 0.82099646, "num_input_tokens_seen": 52487575, "step": 2449, "time_per_iteration": 2.764620780944824 }, { "auxiliary_loss_clip": 0.01312167, "auxiliary_loss_mlp": 0.01196829, "balance_loss_clip": 1.00926614, "balance_loss_mlp": 1.0007534, "epoch": 0.29459508206577284, "flos": 15960974198400.0, "grad_norm": 1.8631613756183083, "language_loss": 0.68044055, "learning_rate": 3.3096144268810156e-06, "loss": 0.70553052, "num_input_tokens_seen": 52506335, "step": 2450, "time_per_iteration": 2.8934481143951416 }, { "auxiliary_loss_clip": 0.01361475, "auxiliary_loss_mlp": 0.01197516, "balance_loss_clip": 1.01042819, "balance_loss_mlp": 1.00105929, "epoch": 0.29471532495641195, "flos": 20412893142720.0, "grad_norm": 2.130202253947248, "language_loss": 0.72991681, "learning_rate": 3.3090255839484462e-06, "loss": 0.75550675, "num_input_tokens_seen": 52524330, "step": 2451, "time_per_iteration": 2.7610881328582764 }, { "auxiliary_loss_clip": 0.01335434, "auxiliary_loss_mlp": 0.01197471, "balance_loss_clip": 1.01028717, "balance_loss_mlp": 1.00082302, "epoch": 0.29483556784705106, "flos": 20376587511360.0, "grad_norm": 1.8824190499650075, "language_loss": 0.85251403, "learning_rate": 3.3084365424377366e-06, "loss": 0.87784302, "num_input_tokens_seen": 52543095, "step": 2452, "time_per_iteration": 2.8232083320617676 }, { "auxiliary_loss_clip": 0.01285643, "auxiliary_loss_mlp": 0.01194581, "balance_loss_clip": 1.00956964, "balance_loss_mlp": 1.00003135, "epoch": 0.2949558107376901, "flos": 68555689737120.0, "grad_norm": 0.7242315543561161, "language_loss": 0.55980527, "learning_rate": 3.307847302438245e-06, "loss": 0.58460748, "num_input_tokens_seen": 52597075, "step": 2453, "time_per_iteration": 3.2895686626434326 }, { "auxiliary_loss_clip": 0.01311984, "auxiliary_loss_mlp": 0.01197165, "balance_loss_clip": 1.00979066, "balance_loss_mlp": 1.00089896, "epoch": 0.2950760536283292, "flos": 16107094815840.0, "grad_norm": 1.8841869670326905, "language_loss": 0.77705646, "learning_rate": 3.3072578640393562e-06, "loss": 0.80214798, "num_input_tokens_seen": 52614410, "step": 2454, "time_per_iteration": 2.8414108753204346 }, { "auxiliary_loss_clip": 0.01336727, "auxiliary_loss_mlp": 0.01197202, "balance_loss_clip": 1.01057696, "balance_loss_mlp": 1.00074482, "epoch": 0.29519629651896834, "flos": 20483636374080.0, "grad_norm": 2.324000079774733, "language_loss": 0.79910856, "learning_rate": 3.3066682273304886e-06, "loss": 0.82444787, "num_input_tokens_seen": 52632055, "step": 2455, "time_per_iteration": 2.801173210144043 }, { "auxiliary_loss_clip": 0.0135333, "auxiliary_loss_mlp": 0.00872857, "balance_loss_clip": 1.01045251, "balance_loss_mlp": 1.00022578, "epoch": 0.2953165394096074, "flos": 18916494971040.0, "grad_norm": 2.0783979093586207, "language_loss": 0.78968143, "learning_rate": 3.3060783924010904e-06, "loss": 0.81194335, "num_input_tokens_seen": 52649980, "step": 2456, "time_per_iteration": 2.738522529602051 }, { "auxiliary_loss_clip": 0.0131072, "auxiliary_loss_mlp": 0.01197318, "balance_loss_clip": 1.00898194, "balance_loss_mlp": 1.00105214, "epoch": 0.2954367823002465, "flos": 20624476210560.0, "grad_norm": 2.0859203765876084, "language_loss": 0.84572017, "learning_rate": 3.3054883593406387e-06, "loss": 0.8708005, "num_input_tokens_seen": 52664730, "step": 2457, "time_per_iteration": 2.8185596466064453 }, { "auxiliary_loss_clip": 0.0134205, "auxiliary_loss_mlp": 0.01196992, "balance_loss_clip": 1.01013136, "balance_loss_mlp": 1.00091696, "epoch": 0.2955570251908856, "flos": 31175538890400.0, "grad_norm": 2.082744587281225, "language_loss": 0.64791471, "learning_rate": 3.3048981282386404e-06, "loss": 0.67330509, "num_input_tokens_seen": 52686040, "step": 2458, "time_per_iteration": 2.8719308376312256 }, { "auxiliary_loss_clip": 0.0131391, "auxiliary_loss_mlp": 0.01197313, "balance_loss_clip": 1.00962925, "balance_loss_mlp": 1.00085592, "epoch": 0.29567726808152467, "flos": 21650338121760.0, "grad_norm": 1.834569856327062, "language_loss": 0.82952571, "learning_rate": 3.304307699184634e-06, "loss": 0.85463798, "num_input_tokens_seen": 52704630, "step": 2459, "time_per_iteration": 2.8464787006378174 }, { "auxiliary_loss_clip": 0.01335343, "auxiliary_loss_mlp": 0.01196673, "balance_loss_clip": 1.01011992, "balance_loss_mlp": 1.00078869, "epoch": 0.2957975109721638, "flos": 24243880291200.0, "grad_norm": 1.5679979041065095, "language_loss": 0.78510314, "learning_rate": 3.3037170722681866e-06, "loss": 0.81042325, "num_input_tokens_seen": 52725465, "step": 2460, "time_per_iteration": 2.8670992851257324 }, { "auxiliary_loss_clip": 0.01312097, "auxiliary_loss_mlp": 0.01196726, "balance_loss_clip": 1.00914216, "balance_loss_mlp": 1.00065017, "epoch": 0.29591775386280283, "flos": 13479725748960.0, "grad_norm": 1.7901574889868976, "language_loss": 0.68351567, "learning_rate": 3.3031262475788956e-06, "loss": 0.70860386, "num_input_tokens_seen": 52742405, "step": 2461, "time_per_iteration": 2.802025318145752 }, { "auxiliary_loss_clip": 0.01339026, "auxiliary_loss_mlp": 0.01196931, "balance_loss_clip": 1.01021636, "balance_loss_mlp": 1.00066495, "epoch": 0.29603799675344195, "flos": 17749793223360.0, "grad_norm": 3.153601477581788, "language_loss": 0.73145962, "learning_rate": 3.3025352252063897e-06, "loss": 0.75681913, "num_input_tokens_seen": 52761100, "step": 2462, "time_per_iteration": 2.7899725437164307 }, { "auxiliary_loss_clip": 0.0134878, "auxiliary_loss_mlp": 0.01196789, "balance_loss_clip": 1.01035631, "balance_loss_mlp": 1.00071335, "epoch": 0.29615823964408106, "flos": 22783931445600.0, "grad_norm": 1.6258538816210122, "language_loss": 0.75170535, "learning_rate": 3.3019440052403252e-06, "loss": 0.77716106, "num_input_tokens_seen": 52780965, "step": 2463, "time_per_iteration": 2.710984945297241 }, { "auxiliary_loss_clip": 0.0133772, "auxiliary_loss_mlp": 0.01196729, "balance_loss_clip": 1.00991929, "balance_loss_mlp": 1.00065398, "epoch": 0.2962784825347201, "flos": 23514211219680.0, "grad_norm": 1.6942884446855448, "language_loss": 0.70843136, "learning_rate": 3.30135258777039e-06, "loss": 0.73377585, "num_input_tokens_seen": 52800335, "step": 2464, "time_per_iteration": 3.1810390949249268 }, { "auxiliary_loss_clip": 0.01360988, "auxiliary_loss_mlp": 0.00872808, "balance_loss_clip": 1.01030195, "balance_loss_mlp": 1.00026202, "epoch": 0.2963987254253592, "flos": 16362778953600.0, "grad_norm": 1.8551909688867934, "language_loss": 0.70464468, "learning_rate": 3.3007609728863024e-06, "loss": 0.72698271, "num_input_tokens_seen": 52818425, "step": 2465, "time_per_iteration": 2.8258564472198486 }, { "auxiliary_loss_clip": 0.01251813, "auxiliary_loss_mlp": 0.0119682, "balance_loss_clip": 1.00732923, "balance_loss_mlp": 1.00074482, "epoch": 0.29651896831599833, "flos": 33472277517600.0, "grad_norm": 2.4940526466953505, "language_loss": 0.73137379, "learning_rate": 3.300169160677809e-06, "loss": 0.75586009, "num_input_tokens_seen": 52842340, "step": 2466, "time_per_iteration": 3.000555992126465 }, { "auxiliary_loss_clip": 0.01319868, "auxiliary_loss_mlp": 0.01197428, "balance_loss_clip": 1.00991273, "balance_loss_mlp": 1.00078034, "epoch": 0.2966392112066374, "flos": 23805374744160.0, "grad_norm": 3.257578318087712, "language_loss": 0.78160703, "learning_rate": 3.2995771512346878e-06, "loss": 0.80677998, "num_input_tokens_seen": 52860690, "step": 2467, "time_per_iteration": 4.064103126525879 }, { "auxiliary_loss_clip": 0.01374157, "auxiliary_loss_mlp": 0.00872774, "balance_loss_clip": 1.01082683, "balance_loss_mlp": 1.00028741, "epoch": 0.2967594540972765, "flos": 19938477124800.0, "grad_norm": 2.2720409658672627, "language_loss": 0.73455065, "learning_rate": 3.298984944646746e-06, "loss": 0.75702, "num_input_tokens_seen": 52879370, "step": 2468, "time_per_iteration": 2.7591567039489746 }, { "auxiliary_loss_clip": 0.01361006, "auxiliary_loss_mlp": 0.00872642, "balance_loss_clip": 1.01107025, "balance_loss_mlp": 1.00010669, "epoch": 0.2968796969879156, "flos": 23732835328800.0, "grad_norm": 1.7915884754241567, "language_loss": 0.81605136, "learning_rate": 3.298392541003822e-06, "loss": 0.83838785, "num_input_tokens_seen": 52898775, "step": 2469, "time_per_iteration": 2.834484815597534 }, { "auxiliary_loss_clip": 0.01332968, "auxiliary_loss_mlp": 0.01196534, "balance_loss_clip": 1.00986791, "balance_loss_mlp": 1.00064945, "epoch": 0.29699993987855466, "flos": 22893710508000.0, "grad_norm": 1.5815508017845048, "language_loss": 0.89929461, "learning_rate": 3.2977999403957806e-06, "loss": 0.92458963, "num_input_tokens_seen": 52917535, "step": 2470, "time_per_iteration": 3.891552209854126 }, { "auxiliary_loss_clip": 0.01373294, "auxiliary_loss_mlp": 0.01197047, "balance_loss_clip": 1.01078296, "balance_loss_mlp": 1.00078082, "epoch": 0.2971201827691938, "flos": 33832567707840.0, "grad_norm": 1.8983244944269275, "language_loss": 0.66894186, "learning_rate": 3.2972071429125207e-06, "loss": 0.69464529, "num_input_tokens_seen": 52938755, "step": 2471, "time_per_iteration": 4.143686056137085 }, { "auxiliary_loss_clip": 0.01301335, "auxiliary_loss_mlp": 0.01197183, "balance_loss_clip": 1.00838184, "balance_loss_mlp": 1.00072587, "epoch": 0.2972404256598329, "flos": 22054370145120.0, "grad_norm": 2.012868026168934, "language_loss": 0.88645041, "learning_rate": 3.2966141486439682e-06, "loss": 0.91143554, "num_input_tokens_seen": 52957945, "step": 2472, "time_per_iteration": 2.7602834701538086 }, { "auxiliary_loss_clip": 0.01292679, "auxiliary_loss_mlp": 0.01197096, "balance_loss_clip": 1.00961745, "balance_loss_mlp": 1.00082958, "epoch": 0.29736066855047194, "flos": 31978609545600.0, "grad_norm": 2.1743381311647902, "language_loss": 0.64956617, "learning_rate": 3.29602095768008e-06, "loss": 0.67446393, "num_input_tokens_seen": 52978460, "step": 2473, "time_per_iteration": 2.908090591430664 }, { "auxiliary_loss_clip": 0.01323414, "auxiliary_loss_mlp": 0.01196489, "balance_loss_clip": 1.00944996, "balance_loss_mlp": 1.00069976, "epoch": 0.29748091144111105, "flos": 33510415256640.0, "grad_norm": 2.2735935066541684, "language_loss": 0.63441771, "learning_rate": 3.2954275701108437e-06, "loss": 0.65961677, "num_input_tokens_seen": 52999640, "step": 2474, "time_per_iteration": 2.923797607421875 }, { "auxiliary_loss_clip": 0.01299329, "auxiliary_loss_mlp": 0.01196447, "balance_loss_clip": 1.00886059, "balance_loss_mlp": 1.00056243, "epoch": 0.29760115433175016, "flos": 41283390021120.0, "grad_norm": 1.7472999031245255, "language_loss": 0.68439603, "learning_rate": 3.294833986026275e-06, "loss": 0.70935374, "num_input_tokens_seen": 53022880, "step": 2475, "time_per_iteration": 3.0299060344696045 }, { "auxiliary_loss_clip": 0.01317057, "auxiliary_loss_mlp": 0.01197165, "balance_loss_clip": 1.00967169, "balance_loss_mlp": 1.00070834, "epoch": 0.2977213972223892, "flos": 24493349632320.0, "grad_norm": 1.9379604955986138, "language_loss": 0.85131419, "learning_rate": 3.29424020551642e-06, "loss": 0.87645638, "num_input_tokens_seen": 53041515, "step": 2476, "time_per_iteration": 2.884361505508423 }, { "auxiliary_loss_clip": 0.01374545, "auxiliary_loss_mlp": 0.01197646, "balance_loss_clip": 1.01127279, "balance_loss_mlp": 1.00080752, "epoch": 0.2978416401130283, "flos": 21285126387360.0, "grad_norm": 6.730443338658277, "language_loss": 0.72110176, "learning_rate": 3.2936462286713546e-06, "loss": 0.74682367, "num_input_tokens_seen": 53059865, "step": 2477, "time_per_iteration": 2.706996202468872 }, { "auxiliary_loss_clip": 0.01351457, "auxiliary_loss_mlp": 0.01196821, "balance_loss_clip": 1.01021838, "balance_loss_mlp": 1.00074589, "epoch": 0.2979618830036674, "flos": 25772165481600.0, "grad_norm": 1.8608373560140752, "language_loss": 0.77133232, "learning_rate": 3.2930520555811846e-06, "loss": 0.79681516, "num_input_tokens_seen": 53079490, "step": 2478, "time_per_iteration": 2.811241865158081 }, { "auxiliary_loss_clip": 0.01248557, "auxiliary_loss_mlp": 0.00872828, "balance_loss_clip": 1.0086931, "balance_loss_mlp": 1.00011563, "epoch": 0.2980821258943065, "flos": 23477007496320.0, "grad_norm": 2.0488634754858905, "language_loss": 0.7991854, "learning_rate": 3.292457686336046e-06, "loss": 0.82039922, "num_input_tokens_seen": 53098810, "step": 2479, "time_per_iteration": 2.904222249984741 }, { "auxiliary_loss_clip": 0.01324432, "auxiliary_loss_mlp": 0.01194636, "balance_loss_clip": 1.00656748, "balance_loss_mlp": 1.00008607, "epoch": 0.2982023687849456, "flos": 69752351833920.0, "grad_norm": 0.8801602440294224, "language_loss": 0.61258042, "learning_rate": 3.291863121026105e-06, "loss": 0.63777113, "num_input_tokens_seen": 53162590, "step": 2480, "time_per_iteration": 3.4423866271972656 }, { "auxiliary_loss_clip": 0.01349564, "auxiliary_loss_mlp": 0.01197141, "balance_loss_clip": 1.00989068, "balance_loss_mlp": 1.0006845, "epoch": 0.29832261167558466, "flos": 29825943886080.0, "grad_norm": 2.2226744370165887, "language_loss": 0.76657462, "learning_rate": 3.2912683597415547e-06, "loss": 0.79204166, "num_input_tokens_seen": 53186675, "step": 2481, "time_per_iteration": 2.8713977336883545 }, { "auxiliary_loss_clip": 0.01316491, "auxiliary_loss_mlp": 0.01197165, "balance_loss_clip": 1.00987446, "balance_loss_mlp": 1.00070775, "epoch": 0.29844285456622377, "flos": 33910172362080.0, "grad_norm": 1.9507716148138847, "language_loss": 0.78284079, "learning_rate": 3.2906734025726213e-06, "loss": 0.80797732, "num_input_tokens_seen": 53205940, "step": 2482, "time_per_iteration": 2.9350273609161377 }, { "auxiliary_loss_clip": 0.01360409, "auxiliary_loss_mlp": 0.01197357, "balance_loss_clip": 1.01075685, "balance_loss_mlp": 1.00090063, "epoch": 0.2985630974568629, "flos": 23876944220160.0, "grad_norm": 6.412134148988989, "language_loss": 0.87889177, "learning_rate": 3.290078249609559e-06, "loss": 0.90446943, "num_input_tokens_seen": 53225360, "step": 2483, "time_per_iteration": 2.801290273666382 }, { "auxiliary_loss_clip": 0.01348148, "auxiliary_loss_mlp": 0.01196964, "balance_loss_clip": 1.01070547, "balance_loss_mlp": 1.00069737, "epoch": 0.29868334034750194, "flos": 21799117091520.0, "grad_norm": 2.9762922898471906, "language_loss": 0.87841094, "learning_rate": 3.2894829009426514e-06, "loss": 0.90386212, "num_input_tokens_seen": 53243195, "step": 2484, "time_per_iteration": 2.7322375774383545 }, { "auxiliary_loss_clip": 0.01349397, "auxiliary_loss_mlp": 0.01196951, "balance_loss_clip": 1.0103147, "balance_loss_mlp": 1.00087607, "epoch": 0.29880358323814105, "flos": 25666661337120.0, "grad_norm": 1.8133360704537165, "language_loss": 0.77739459, "learning_rate": 3.288887356662213e-06, "loss": 0.80285811, "num_input_tokens_seen": 53264530, "step": 2485, "time_per_iteration": 2.78296160697937 }, { "auxiliary_loss_clip": 0.01324463, "auxiliary_loss_mlp": 0.01194603, "balance_loss_clip": 1.00561786, "balance_loss_mlp": 1.00005305, "epoch": 0.29892382612878016, "flos": 71005877412480.0, "grad_norm": 0.7820625504613392, "language_loss": 0.597188, "learning_rate": 3.288291616858588e-06, "loss": 0.62237871, "num_input_tokens_seen": 53319920, "step": 2486, "time_per_iteration": 3.2336783409118652 }, { "auxiliary_loss_clip": 0.01278767, "auxiliary_loss_mlp": 0.01197024, "balance_loss_clip": 1.00874305, "balance_loss_mlp": 1.00094819, "epoch": 0.2990440690194192, "flos": 25481145651840.0, "grad_norm": 1.6883895025968634, "language_loss": 0.7677238, "learning_rate": 3.287695681622149e-06, "loss": 0.79248166, "num_input_tokens_seen": 53339270, "step": 2487, "time_per_iteration": 2.8447322845458984 }, { "auxiliary_loss_clip": 0.01338208, "auxiliary_loss_mlp": 0.01197275, "balance_loss_clip": 1.00929105, "balance_loss_mlp": 1.00081825, "epoch": 0.2991643119100583, "flos": 23732368320960.0, "grad_norm": 1.7073817855024462, "language_loss": 0.80592412, "learning_rate": 3.2870995510432982e-06, "loss": 0.83127892, "num_input_tokens_seen": 53357750, "step": 2488, "time_per_iteration": 2.872041940689087 }, { "auxiliary_loss_clip": 0.01345959, "auxiliary_loss_mlp": 0.01196259, "balance_loss_clip": 1.01000714, "balance_loss_mlp": 1.00056493, "epoch": 0.29928455480069743, "flos": 27417558165120.0, "grad_norm": 1.817328344654462, "language_loss": 0.76879919, "learning_rate": 3.2865032252124697e-06, "loss": 0.7942214, "num_input_tokens_seen": 53378265, "step": 2489, "time_per_iteration": 2.772446393966675 }, { "auxiliary_loss_clip": 0.01349044, "auxiliary_loss_mlp": 0.0119653, "balance_loss_clip": 1.01083255, "balance_loss_mlp": 1.0008359, "epoch": 0.2994047976913365, "flos": 33692949276480.0, "grad_norm": 1.4333503397482104, "language_loss": 0.77656615, "learning_rate": 3.2859067042201243e-06, "loss": 0.80202186, "num_input_tokens_seen": 53400305, "step": 2490, "time_per_iteration": 2.885843515396118 }, { "auxiliary_loss_clip": 0.01274918, "auxiliary_loss_mlp": 0.01196211, "balance_loss_clip": 1.00830793, "balance_loss_mlp": 1.00051689, "epoch": 0.2995250405819756, "flos": 16763973006240.0, "grad_norm": 2.0467213902279195, "language_loss": 0.78166467, "learning_rate": 3.2853099881567544e-06, "loss": 0.80637598, "num_input_tokens_seen": 53418705, "step": 2491, "time_per_iteration": 2.859269142150879 }, { "auxiliary_loss_clip": 0.01371713, "auxiliary_loss_mlp": 0.01196327, "balance_loss_clip": 1.01044869, "balance_loss_mlp": 1.00063324, "epoch": 0.29964528347261465, "flos": 22963986731520.0, "grad_norm": 1.9108966127265061, "language_loss": 0.79051, "learning_rate": 3.284713077112881e-06, "loss": 0.81619036, "num_input_tokens_seen": 53438135, "step": 2492, "time_per_iteration": 2.7329795360565186 }, { "auxiliary_loss_clip": 0.01316188, "auxiliary_loss_mlp": 0.01197441, "balance_loss_clip": 1.0101918, "balance_loss_mlp": 1.00079298, "epoch": 0.29976552636325376, "flos": 16938029037600.0, "grad_norm": 2.6165890226759263, "language_loss": 0.86867583, "learning_rate": 3.284115971179056e-06, "loss": 0.89381206, "num_input_tokens_seen": 53452165, "step": 2493, "time_per_iteration": 3.8703596591949463 }, { "auxiliary_loss_clip": 0.01278836, "auxiliary_loss_mlp": 0.0119674, "balance_loss_clip": 1.00953627, "balance_loss_mlp": 1.00085568, "epoch": 0.2998857692538929, "flos": 17056465706880.0, "grad_norm": 1.7373933488877955, "language_loss": 0.7870428, "learning_rate": 3.283518670445859e-06, "loss": 0.81179857, "num_input_tokens_seen": 53470075, "step": 2494, "time_per_iteration": 2.9378859996795654 }, { "auxiliary_loss_clip": 0.01301823, "auxiliary_loss_mlp": 0.00871929, "balance_loss_clip": 1.00616598, "balance_loss_mlp": 1.00001991, "epoch": 0.30000601214453193, "flos": 68831562983040.0, "grad_norm": 0.6827755630763529, "language_loss": 0.54306704, "learning_rate": 3.2829211750038995e-06, "loss": 0.56480455, "num_input_tokens_seen": 53538705, "step": 2495, "time_per_iteration": 3.3803117275238037 }, { "auxiliary_loss_clip": 0.01311301, "auxiliary_loss_mlp": 0.01196682, "balance_loss_clip": 1.00971222, "balance_loss_mlp": 1.00060678, "epoch": 0.30012625503517104, "flos": 17603277445440.0, "grad_norm": 1.823862267525557, "language_loss": 0.8930791, "learning_rate": 3.2823234849438183e-06, "loss": 0.91815901, "num_input_tokens_seen": 53556740, "step": 2496, "time_per_iteration": 4.829876899719238 }, { "auxiliary_loss_clip": 0.01337719, "auxiliary_loss_mlp": 0.01196853, "balance_loss_clip": 1.01005936, "balance_loss_mlp": 1.00077724, "epoch": 0.30024649792581015, "flos": 21252592742400.0, "grad_norm": 1.9542982768078463, "language_loss": 0.75721228, "learning_rate": 3.2817256003562836e-06, "loss": 0.78255796, "num_input_tokens_seen": 53577115, "step": 2497, "time_per_iteration": 2.855100393295288 }, { "auxiliary_loss_clip": 0.01276554, "auxiliary_loss_mlp": 0.01197773, "balance_loss_clip": 1.00901854, "balance_loss_mlp": 1.00093448, "epoch": 0.3003667408164492, "flos": 23003274028320.0, "grad_norm": 1.7388240668645414, "language_loss": 0.6598171, "learning_rate": 3.281127521331995e-06, "loss": 0.68456042, "num_input_tokens_seen": 53598295, "step": 2498, "time_per_iteration": 2.9026334285736084 }, { "auxiliary_loss_clip": 0.01347286, "auxiliary_loss_mlp": 0.0119459, "balance_loss_clip": 1.00570047, "balance_loss_mlp": 1.00004089, "epoch": 0.3004869837070883, "flos": 64232373863520.0, "grad_norm": 0.8820720585654597, "language_loss": 0.6071285, "learning_rate": 3.2805292479616798e-06, "loss": 0.63254726, "num_input_tokens_seen": 53657160, "step": 2499, "time_per_iteration": 3.1535799503326416 }, { "auxiliary_loss_clip": 0.01338048, "auxiliary_loss_mlp": 0.0119693, "balance_loss_clip": 1.0102793, "balance_loss_mlp": 1.00085497, "epoch": 0.30060722659772743, "flos": 26248665072960.0, "grad_norm": 2.2652732422144384, "language_loss": 0.9228816, "learning_rate": 3.2799307803360955e-06, "loss": 0.94823134, "num_input_tokens_seen": 53673090, "step": 2500, "time_per_iteration": 2.7832720279693604 }, { "auxiliary_loss_clip": 0.01372275, "auxiliary_loss_mlp": 0.01196694, "balance_loss_clip": 1.01061559, "balance_loss_mlp": 1.00080967, "epoch": 0.3007274694883665, "flos": 24970891010400.0, "grad_norm": 1.4271238960626518, "language_loss": 0.81576288, "learning_rate": 3.27933211854603e-06, "loss": 0.84145254, "num_input_tokens_seen": 53692145, "step": 2501, "time_per_iteration": 2.764158010482788 }, { "auxiliary_loss_clip": 0.01333429, "auxiliary_loss_mlp": 0.01196772, "balance_loss_clip": 1.01029789, "balance_loss_mlp": 1.00050592, "epoch": 0.3008477123790056, "flos": 17055855004320.0, "grad_norm": 1.5835890675256636, "language_loss": 0.87169528, "learning_rate": 3.278733262682299e-06, "loss": 0.89699727, "num_input_tokens_seen": 53710000, "step": 2502, "time_per_iteration": 2.768808603286743 }, { "auxiliary_loss_clip": 0.01372562, "auxiliary_loss_mlp": 0.01197131, "balance_loss_clip": 1.01024222, "balance_loss_mlp": 1.00067413, "epoch": 0.3009679552696447, "flos": 21506408848800.0, "grad_norm": 2.4268660029771687, "language_loss": 0.82211649, "learning_rate": 3.2781342128357484e-06, "loss": 0.84781337, "num_input_tokens_seen": 53729355, "step": 2503, "time_per_iteration": 2.7249908447265625 }, { "auxiliary_loss_clip": 0.01325298, "auxiliary_loss_mlp": 0.01196931, "balance_loss_clip": 1.01010656, "balance_loss_mlp": 1.00085592, "epoch": 0.30108819816028376, "flos": 21134012378400.0, "grad_norm": 2.2942783202490475, "language_loss": 0.80261528, "learning_rate": 3.2775349690972547e-06, "loss": 0.82783759, "num_input_tokens_seen": 53743505, "step": 2504, "time_per_iteration": 2.8210105895996094 }, { "auxiliary_loss_clip": 0.01322883, "auxiliary_loss_mlp": 0.01194606, "balance_loss_clip": 1.00654209, "balance_loss_mlp": 1.00005615, "epoch": 0.30120844105092287, "flos": 71126469502560.0, "grad_norm": 0.7759019067510897, "language_loss": 0.5180763, "learning_rate": 3.276935531557722e-06, "loss": 0.54325122, "num_input_tokens_seen": 53808725, "step": 2505, "time_per_iteration": 3.4149913787841797 }, { "auxiliary_loss_clip": 0.01299133, "auxiliary_loss_mlp": 0.01197174, "balance_loss_clip": 1.00887454, "balance_loss_mlp": 1.0009079, "epoch": 0.301328683941562, "flos": 20264581180800.0, "grad_norm": 2.040599914310821, "language_loss": 0.79560626, "learning_rate": 3.2763359003080837e-06, "loss": 0.82056934, "num_input_tokens_seen": 53825680, "step": 2506, "time_per_iteration": 2.871037244796753 }, { "auxiliary_loss_clip": 0.01314498, "auxiliary_loss_mlp": 0.01194631, "balance_loss_clip": 1.00497198, "balance_loss_mlp": 1.00008154, "epoch": 0.30144892683220104, "flos": 70648245574560.0, "grad_norm": 0.7974582466015643, "language_loss": 0.62485838, "learning_rate": 3.2757360754393047e-06, "loss": 0.64994967, "num_input_tokens_seen": 53889750, "step": 2507, "time_per_iteration": 3.449896812438965 }, { "auxiliary_loss_clip": 0.01348159, "auxiliary_loss_mlp": 0.01197125, "balance_loss_clip": 1.00971711, "balance_loss_mlp": 1.0008589, "epoch": 0.30156916972284015, "flos": 22820560390080.0, "grad_norm": 2.66762801063292, "language_loss": 0.63711786, "learning_rate": 3.2751360570423767e-06, "loss": 0.66257071, "num_input_tokens_seen": 53908135, "step": 2508, "time_per_iteration": 2.7895846366882324 }, { "auxiliary_loss_clip": 0.01323627, "auxiliary_loss_mlp": 0.01197377, "balance_loss_clip": 1.00914741, "balance_loss_mlp": 1.00072956, "epoch": 0.3016894126134792, "flos": 29899201775040.0, "grad_norm": 2.0046955510268134, "language_loss": 0.75859118, "learning_rate": 3.2745358452083236e-06, "loss": 0.7838012, "num_input_tokens_seen": 53931035, "step": 2509, "time_per_iteration": 2.9538495540618896 }, { "auxiliary_loss_clip": 0.01353202, "auxiliary_loss_mlp": 0.0119633, "balance_loss_clip": 1.01023245, "balance_loss_mlp": 1.00063622, "epoch": 0.3018096555041183, "flos": 21546342771840.0, "grad_norm": 1.3419719472559375, "language_loss": 0.82318664, "learning_rate": 3.2739354400281955e-06, "loss": 0.84868193, "num_input_tokens_seen": 53952255, "step": 2510, "time_per_iteration": 2.781712770462036 }, { "auxiliary_loss_clip": 0.01299691, "auxiliary_loss_mlp": 0.00872003, "balance_loss_clip": 1.00482154, "balance_loss_mlp": 0.99994487, "epoch": 0.3019298983947574, "flos": 59136329635200.0, "grad_norm": 0.8621411195937397, "language_loss": 0.6369828, "learning_rate": 3.2733348415930744e-06, "loss": 0.65869975, "num_input_tokens_seen": 54014125, "step": 2511, "time_per_iteration": 3.401370048522949 }, { "auxiliary_loss_clip": 0.0130006, "auxiliary_loss_mlp": 0.01196641, "balance_loss_clip": 1.00883317, "balance_loss_mlp": 1.0007565, "epoch": 0.3020501412853965, "flos": 34423085355840.0, "grad_norm": 2.118905423839145, "language_loss": 0.80835128, "learning_rate": 3.27273404999407e-06, "loss": 0.83331829, "num_input_tokens_seen": 54036345, "step": 2512, "time_per_iteration": 2.921107292175293 }, { "auxiliary_loss_clip": 0.01314611, "auxiliary_loss_mlp": 0.01194557, "balance_loss_clip": 1.00528836, "balance_loss_mlp": 1.00000751, "epoch": 0.3021703841760356, "flos": 71008320222720.0, "grad_norm": 0.7919508249741855, "language_loss": 0.60472071, "learning_rate": 3.272133065322322e-06, "loss": 0.62981236, "num_input_tokens_seen": 54094615, "step": 2513, "time_per_iteration": 3.372758388519287 }, { "auxiliary_loss_clip": 0.0137234, "auxiliary_loss_mlp": 0.0119655, "balance_loss_clip": 1.01012349, "balance_loss_mlp": 1.00066555, "epoch": 0.3022906270666747, "flos": 21510540072000.0, "grad_norm": 1.541055099286403, "language_loss": 0.7959072, "learning_rate": 3.271531887669e-06, "loss": 0.82159609, "num_input_tokens_seen": 54114675, "step": 2514, "time_per_iteration": 2.7885890007019043 }, { "auxiliary_loss_clip": 0.01296646, "auxiliary_loss_mlp": 0.01196639, "balance_loss_clip": 1.00836205, "balance_loss_mlp": 1.00075459, "epoch": 0.30241086995731375, "flos": 31132012599360.0, "grad_norm": 2.2529279829984876, "language_loss": 0.63016617, "learning_rate": 3.2709305171253015e-06, "loss": 0.65509903, "num_input_tokens_seen": 54134795, "step": 2515, "time_per_iteration": 2.860581159591675 }, { "auxiliary_loss_clip": 0.01348753, "auxiliary_loss_mlp": 0.01196628, "balance_loss_clip": 1.01034522, "balance_loss_mlp": 1.00074291, "epoch": 0.30253111284795287, "flos": 23511552867360.0, "grad_norm": 1.9758298457057355, "language_loss": 0.77641082, "learning_rate": 3.2703289537824536e-06, "loss": 0.80186462, "num_input_tokens_seen": 54154595, "step": 2516, "time_per_iteration": 2.7916417121887207 }, { "auxiliary_loss_clip": 0.01303724, "auxiliary_loss_mlp": 0.01197164, "balance_loss_clip": 1.00947475, "balance_loss_mlp": 1.00108838, "epoch": 0.302651355738592, "flos": 18725375191680.0, "grad_norm": 2.4961721588018446, "language_loss": 0.78997624, "learning_rate": 3.269727197731714e-06, "loss": 0.8149851, "num_input_tokens_seen": 54167360, "step": 2517, "time_per_iteration": 2.7799534797668457 }, { "auxiliary_loss_clip": 0.0127579, "auxiliary_loss_mlp": 0.01196711, "balance_loss_clip": 1.00796139, "balance_loss_mlp": 1.00082648, "epoch": 0.30277159862923103, "flos": 22418899329600.0, "grad_norm": 1.5620465474418521, "language_loss": 0.77328461, "learning_rate": 3.269125249064367e-06, "loss": 0.79800957, "num_input_tokens_seen": 54187055, "step": 2518, "time_per_iteration": 2.835939645767212 }, { "auxiliary_loss_clip": 0.01374197, "auxiliary_loss_mlp": 0.01197029, "balance_loss_clip": 1.01103055, "balance_loss_mlp": 1.00095367, "epoch": 0.30289184151987014, "flos": 22273137948960.0, "grad_norm": 1.5547176966945837, "language_loss": 0.83170283, "learning_rate": 3.2685231078717297e-06, "loss": 0.85741508, "num_input_tokens_seen": 54207245, "step": 2519, "time_per_iteration": 3.744833469390869 }, { "auxiliary_loss_clip": 0.01310426, "auxiliary_loss_mlp": 0.00872781, "balance_loss_clip": 1.00934911, "balance_loss_mlp": 1.0001955, "epoch": 0.30301208441050925, "flos": 25225605208800.0, "grad_norm": 2.2704646871205103, "language_loss": 0.75558072, "learning_rate": 3.267920774245145e-06, "loss": 0.77741277, "num_input_tokens_seen": 54226650, "step": 2520, "time_per_iteration": 2.858015298843384 }, { "auxiliary_loss_clip": 0.01349678, "auxiliary_loss_mlp": 0.01197312, "balance_loss_clip": 1.01015329, "balance_loss_mlp": 1.0010457, "epoch": 0.3031323273011483, "flos": 23039256346560.0, "grad_norm": 1.775050248268715, "language_loss": 0.8484413, "learning_rate": 3.2673182482759876e-06, "loss": 0.87391126, "num_input_tokens_seen": 54245765, "step": 2521, "time_per_iteration": 2.854586124420166 }, { "auxiliary_loss_clip": 0.01348689, "auxiliary_loss_mlp": 0.01197027, "balance_loss_clip": 1.00990236, "balance_loss_mlp": 1.00095117, "epoch": 0.3032525701917874, "flos": 18876704742720.0, "grad_norm": 1.7529792894802463, "language_loss": 0.66108978, "learning_rate": 3.266715530055659e-06, "loss": 0.68654692, "num_input_tokens_seen": 54263915, "step": 2522, "time_per_iteration": 4.745522737503052 }, { "auxiliary_loss_clip": 0.01360887, "auxiliary_loss_mlp": 0.01196817, "balance_loss_clip": 1.01041293, "balance_loss_mlp": 1.00074148, "epoch": 0.30337281308242653, "flos": 17782650181440.0, "grad_norm": 1.5577965653610648, "language_loss": 0.80232149, "learning_rate": 3.2661126196755927e-06, "loss": 0.8278985, "num_input_tokens_seen": 54283025, "step": 2523, "time_per_iteration": 2.870335578918457 }, { "auxiliary_loss_clip": 0.01345625, "auxiliary_loss_mlp": 0.01194586, "balance_loss_clip": 1.00495791, "balance_loss_mlp": 1.00003624, "epoch": 0.3034930559730656, "flos": 57824333514720.0, "grad_norm": 0.7972452441630199, "language_loss": 0.56013787, "learning_rate": 3.265509517227248e-06, "loss": 0.58554006, "num_input_tokens_seen": 54339840, "step": 2524, "time_per_iteration": 3.3423728942871094 }, { "auxiliary_loss_clip": 0.01336649, "auxiliary_loss_mlp": 0.01196426, "balance_loss_clip": 1.00934124, "balance_loss_mlp": 1.00063682, "epoch": 0.3036132988637047, "flos": 14755595856480.0, "grad_norm": 2.844238898067128, "language_loss": 0.80772692, "learning_rate": 3.264906222802115e-06, "loss": 0.83305764, "num_input_tokens_seen": 54357690, "step": 2525, "time_per_iteration": 2.735478162765503 }, { "auxiliary_loss_clip": 0.01372535, "auxiliary_loss_mlp": 0.01196585, "balance_loss_clip": 1.010337, "balance_loss_mlp": 1.00070071, "epoch": 0.30373354175434375, "flos": 21033214236000.0, "grad_norm": 2.5158653195395386, "language_loss": 0.78570342, "learning_rate": 3.264302736491715e-06, "loss": 0.81139469, "num_input_tokens_seen": 54377810, "step": 2526, "time_per_iteration": 2.7315289974212646 }, { "auxiliary_loss_clip": 0.01346848, "auxiliary_loss_mlp": 0.01196719, "balance_loss_clip": 1.01035666, "balance_loss_mlp": 1.0008347, "epoch": 0.30385378464498286, "flos": 21143244764160.0, "grad_norm": 1.761861488307147, "language_loss": 0.87188083, "learning_rate": 3.263699058387594e-06, "loss": 0.89731646, "num_input_tokens_seen": 54395245, "step": 2527, "time_per_iteration": 2.7661731243133545 }, { "auxiliary_loss_clip": 0.01328246, "auxiliary_loss_mlp": 0.01196892, "balance_loss_clip": 1.00955808, "balance_loss_mlp": 1.00081658, "epoch": 0.30397402753562197, "flos": 20629254060000.0, "grad_norm": 3.0806758661860494, "language_loss": 0.90188736, "learning_rate": 3.2630951885813315e-06, "loss": 0.92713875, "num_input_tokens_seen": 54412640, "step": 2528, "time_per_iteration": 2.7990689277648926 }, { "auxiliary_loss_clip": 0.01338531, "auxiliary_loss_mlp": 0.01196527, "balance_loss_clip": 1.00927949, "balance_loss_mlp": 1.00064266, "epoch": 0.304094270426261, "flos": 15085687440960.0, "grad_norm": 2.0232751819360484, "language_loss": 0.78315514, "learning_rate": 3.262491127164533e-06, "loss": 0.80850577, "num_input_tokens_seen": 54431455, "step": 2529, "time_per_iteration": 2.735384464263916 }, { "auxiliary_loss_clip": 0.01336749, "auxiliary_loss_mlp": 0.00872696, "balance_loss_clip": 1.00977063, "balance_loss_mlp": 1.00032938, "epoch": 0.30421451331690014, "flos": 13845224872800.0, "grad_norm": 2.275095360648184, "language_loss": 0.80287349, "learning_rate": 3.2618868742288337e-06, "loss": 0.82496798, "num_input_tokens_seen": 54448380, "step": 2530, "time_per_iteration": 2.7424957752227783 }, { "auxiliary_loss_clip": 0.01349051, "auxiliary_loss_mlp": 0.01196745, "balance_loss_clip": 1.00963354, "balance_loss_mlp": 1.00085986, "epoch": 0.30433475620753925, "flos": 17384222252160.0, "grad_norm": 1.7409419393498482, "language_loss": 0.72476476, "learning_rate": 3.261282429865899e-06, "loss": 0.75022268, "num_input_tokens_seen": 54466385, "step": 2531, "time_per_iteration": 2.7338414192199707 }, { "auxiliary_loss_clip": 0.01345366, "auxiliary_loss_mlp": 0.00872648, "balance_loss_clip": 1.01015151, "balance_loss_mlp": 1.00013244, "epoch": 0.3044549990981783, "flos": 18916961978880.0, "grad_norm": 1.5441766631533054, "language_loss": 0.72381914, "learning_rate": 3.2606777941674225e-06, "loss": 0.74599922, "num_input_tokens_seen": 54485040, "step": 2532, "time_per_iteration": 2.756176710128784 }, { "auxiliary_loss_clip": 0.01273573, "auxiliary_loss_mlp": 0.01196947, "balance_loss_clip": 1.00804806, "balance_loss_mlp": 1.00087166, "epoch": 0.3045752419888174, "flos": 21068442156960.0, "grad_norm": 2.0116689377948, "language_loss": 0.85043359, "learning_rate": 3.2600729672251276e-06, "loss": 0.87513876, "num_input_tokens_seen": 54502755, "step": 2533, "time_per_iteration": 2.8585257530212402 }, { "auxiliary_loss_clip": 0.01371926, "auxiliary_loss_mlp": 0.00872679, "balance_loss_clip": 1.01038933, "balance_loss_mlp": 1.00013196, "epoch": 0.3046954848794565, "flos": 29096418509280.0, "grad_norm": 1.9267474397867514, "language_loss": 0.65142399, "learning_rate": 3.259467949130765e-06, "loss": 0.67386997, "num_input_tokens_seen": 54524165, "step": 2534, "time_per_iteration": 2.8461761474609375 }, { "auxiliary_loss_clip": 0.0132849, "auxiliary_loss_mlp": 0.01197036, "balance_loss_clip": 1.00956488, "balance_loss_mlp": 1.00096011, "epoch": 0.3048157277700956, "flos": 20295354565440.0, "grad_norm": 2.2842757588499776, "language_loss": 0.82607257, "learning_rate": 3.2588627399761164e-06, "loss": 0.85132778, "num_input_tokens_seen": 54540160, "step": 2535, "time_per_iteration": 2.773768186569214 }, { "auxiliary_loss_clip": 0.01334617, "auxiliary_loss_mlp": 0.011971, "balance_loss_clip": 1.00990665, "balance_loss_mlp": 1.00102472, "epoch": 0.3049359706607347, "flos": 22739938146720.0, "grad_norm": 1.591344280292403, "language_loss": 0.70762789, "learning_rate": 3.2582573398529903e-06, "loss": 0.73294508, "num_input_tokens_seen": 54557515, "step": 2536, "time_per_iteration": 2.873321533203125 }, { "auxiliary_loss_clip": 0.01336058, "auxiliary_loss_mlp": 0.01197197, "balance_loss_clip": 1.01037121, "balance_loss_mlp": 1.00093031, "epoch": 0.3050562135513738, "flos": 18434642751360.0, "grad_norm": 2.4937404368991967, "language_loss": 0.73979789, "learning_rate": 3.2576517488532265e-06, "loss": 0.7651304, "num_input_tokens_seen": 54573865, "step": 2537, "time_per_iteration": 2.8712844848632812 }, { "auxiliary_loss_clip": 0.01358829, "auxiliary_loss_mlp": 0.01196484, "balance_loss_clip": 1.00987399, "balance_loss_mlp": 1.00098109, "epoch": 0.30517645644201286, "flos": 20370336791040.0, "grad_norm": 1.9354898810028738, "language_loss": 0.87713563, "learning_rate": 3.257045967068692e-06, "loss": 0.9026888, "num_input_tokens_seen": 54593120, "step": 2538, "time_per_iteration": 2.8321900367736816 }, { "auxiliary_loss_clip": 0.01372746, "auxiliary_loss_mlp": 0.01196913, "balance_loss_clip": 1.01052237, "balance_loss_mlp": 1.00083792, "epoch": 0.30529669933265197, "flos": 21945129937920.0, "grad_norm": 1.5685410681211736, "language_loss": 0.82472408, "learning_rate": 3.2564399945912848e-06, "loss": 0.85042065, "num_input_tokens_seen": 54612910, "step": 2539, "time_per_iteration": 2.7793257236480713 }, { "auxiliary_loss_clip": 0.01313937, "auxiliary_loss_mlp": 0.01196278, "balance_loss_clip": 1.00965369, "balance_loss_mlp": 1.00077415, "epoch": 0.305416942223291, "flos": 21835422722880.0, "grad_norm": 2.02950648862805, "language_loss": 0.82261312, "learning_rate": 3.2558338315129287e-06, "loss": 0.84771526, "num_input_tokens_seen": 54631055, "step": 2540, "time_per_iteration": 2.851386785507202 }, { "auxiliary_loss_clip": 0.01359326, "auxiliary_loss_mlp": 0.01196801, "balance_loss_clip": 1.01024079, "balance_loss_mlp": 1.0009166, "epoch": 0.30553718511393013, "flos": 33911824851360.0, "grad_norm": 2.0221099004138052, "language_loss": 0.76512676, "learning_rate": 3.2552274779255785e-06, "loss": 0.7906881, "num_input_tokens_seen": 54651985, "step": 2541, "time_per_iteration": 2.820643663406372 }, { "auxiliary_loss_clip": 0.01346919, "auxiliary_loss_mlp": 0.01197261, "balance_loss_clip": 1.00952995, "balance_loss_mlp": 1.00099444, "epoch": 0.30565742800456924, "flos": 22268539717920.0, "grad_norm": 2.0508934287984633, "language_loss": 0.77131635, "learning_rate": 3.2546209339212184e-06, "loss": 0.79675812, "num_input_tokens_seen": 54671005, "step": 2542, "time_per_iteration": 2.7590301036834717 }, { "auxiliary_loss_clip": 0.01339766, "auxiliary_loss_mlp": 0.01196548, "balance_loss_clip": 1.00943804, "balance_loss_mlp": 1.00066352, "epoch": 0.3057776708952083, "flos": 22565055870720.0, "grad_norm": 1.4859314329326734, "language_loss": 0.77385592, "learning_rate": 3.25401419959186e-06, "loss": 0.79921901, "num_input_tokens_seen": 54691615, "step": 2543, "time_per_iteration": 2.864989995956421 }, { "auxiliary_loss_clip": 0.01347015, "auxiliary_loss_mlp": 0.01197116, "balance_loss_clip": 1.01088738, "balance_loss_mlp": 1.0010407, "epoch": 0.3058979137858474, "flos": 21799224862560.0, "grad_norm": 1.817966328747697, "language_loss": 0.76297641, "learning_rate": 3.253407275029545e-06, "loss": 0.7884177, "num_input_tokens_seen": 54710520, "step": 2544, "time_per_iteration": 2.792288303375244 }, { "auxiliary_loss_clip": 0.01305121, "auxiliary_loss_mlp": 0.01197417, "balance_loss_clip": 1.00873637, "balance_loss_mlp": 1.00095999, "epoch": 0.3060181566764865, "flos": 26979447778560.0, "grad_norm": 1.7340220514050744, "language_loss": 0.80105257, "learning_rate": 3.2528001603263425e-06, "loss": 0.826078, "num_input_tokens_seen": 54732590, "step": 2545, "time_per_iteration": 3.781195878982544 }, { "auxiliary_loss_clip": 0.01347805, "auxiliary_loss_mlp": 0.01197173, "balance_loss_clip": 1.01066327, "balance_loss_mlp": 1.00090706, "epoch": 0.3061383995671256, "flos": 19865111464800.0, "grad_norm": 1.8135021797462507, "language_loss": 0.81497669, "learning_rate": 3.2521928555743514e-06, "loss": 0.84042645, "num_input_tokens_seen": 54749935, "step": 2546, "time_per_iteration": 2.7541134357452393 }, { "auxiliary_loss_clip": 0.01337229, "auxiliary_loss_mlp": 0.00872695, "balance_loss_clip": 1.00949526, "balance_loss_mlp": 1.00013733, "epoch": 0.3062586424577647, "flos": 22127520263040.0, "grad_norm": 1.7845092273971772, "language_loss": 0.67570055, "learning_rate": 3.2515853608657e-06, "loss": 0.69779974, "num_input_tokens_seen": 54767935, "step": 2547, "time_per_iteration": 2.8828561305999756 }, { "auxiliary_loss_clip": 0.01360281, "auxiliary_loss_mlp": 0.01196509, "balance_loss_clip": 1.01051688, "balance_loss_mlp": 1.0008148, "epoch": 0.3063788853484038, "flos": 20845507206240.0, "grad_norm": 4.097718732821252, "language_loss": 0.7452904, "learning_rate": 3.250977676292545e-06, "loss": 0.77085829, "num_input_tokens_seen": 54786175, "step": 2548, "time_per_iteration": 4.714258670806885 }, { "auxiliary_loss_clip": 0.01335943, "auxiliary_loss_mlp": 0.011972, "balance_loss_clip": 1.00946665, "balance_loss_mlp": 1.00074279, "epoch": 0.30649912823904285, "flos": 16209725065920.0, "grad_norm": 2.6737719771004724, "language_loss": 0.79459369, "learning_rate": 3.2503698019470712e-06, "loss": 0.81992519, "num_input_tokens_seen": 54801945, "step": 2549, "time_per_iteration": 2.788701057434082 }, { "auxiliary_loss_clip": 0.01359406, "auxiliary_loss_mlp": 0.01197014, "balance_loss_clip": 1.01012969, "balance_loss_mlp": 1.00074756, "epoch": 0.30661937112968196, "flos": 18617823397440.0, "grad_norm": 2.0393801220728762, "language_loss": 0.78242421, "learning_rate": 3.249761737921492e-06, "loss": 0.80798841, "num_input_tokens_seen": 54818475, "step": 2550, "time_per_iteration": 2.792989730834961 }, { "auxiliary_loss_clip": 0.01325987, "auxiliary_loss_mlp": 0.01196518, "balance_loss_clip": 1.00929403, "balance_loss_mlp": 1.00063372, "epoch": 0.30673961402032107, "flos": 31390822097280.0, "grad_norm": 1.9040408430389795, "language_loss": 0.73898679, "learning_rate": 3.249153484308051e-06, "loss": 0.76421183, "num_input_tokens_seen": 54837090, "step": 2551, "time_per_iteration": 2.8610951900482178 }, { "auxiliary_loss_clip": 0.01286812, "auxiliary_loss_mlp": 0.01196818, "balance_loss_clip": 1.00780225, "balance_loss_mlp": 1.00074291, "epoch": 0.3068598569109601, "flos": 20229820267680.0, "grad_norm": 3.2827841108652223, "language_loss": 0.77212065, "learning_rate": 3.2485450411990194e-06, "loss": 0.79695702, "num_input_tokens_seen": 54856445, "step": 2552, "time_per_iteration": 2.7795541286468506 }, { "auxiliary_loss_clip": 0.01371883, "auxiliary_loss_mlp": 0.01197625, "balance_loss_clip": 1.00975347, "balance_loss_mlp": 1.00097716, "epoch": 0.30698009980159924, "flos": 29602003072320.0, "grad_norm": 2.4529229924368225, "language_loss": 0.82118881, "learning_rate": 3.2479364086866983e-06, "loss": 0.84688383, "num_input_tokens_seen": 54876700, "step": 2553, "time_per_iteration": 2.842151403427124 }, { "auxiliary_loss_clip": 0.01334766, "auxiliary_loss_mlp": 0.00872721, "balance_loss_clip": 1.01043558, "balance_loss_mlp": 1.00010228, "epoch": 0.30710034269223835, "flos": 23842434772800.0, "grad_norm": 1.6737057951136554, "language_loss": 0.81258351, "learning_rate": 3.247327586863416e-06, "loss": 0.83465838, "num_input_tokens_seen": 54897580, "step": 2554, "time_per_iteration": 2.7840466499328613 }, { "auxiliary_loss_clip": 0.013151, "auxiliary_loss_mlp": 0.01197326, "balance_loss_clip": 1.00866199, "balance_loss_mlp": 1.00086927, "epoch": 0.3072205855828774, "flos": 25884998056800.0, "grad_norm": 2.0878115218380042, "language_loss": 0.77292079, "learning_rate": 3.2467185758215304e-06, "loss": 0.79804504, "num_input_tokens_seen": 54917320, "step": 2555, "time_per_iteration": 2.817817211151123 }, { "auxiliary_loss_clip": 0.01318604, "auxiliary_loss_mlp": 0.00872655, "balance_loss_clip": 1.01010847, "balance_loss_mlp": 1.00010145, "epoch": 0.3073408284735165, "flos": 22236401233440.0, "grad_norm": 2.3522560524936864, "language_loss": 0.85112005, "learning_rate": 3.246109375653428e-06, "loss": 0.87303269, "num_input_tokens_seen": 54934085, "step": 2556, "time_per_iteration": 2.8437488079071045 }, { "auxiliary_loss_clip": 0.01371062, "auxiliary_loss_mlp": 0.01197163, "balance_loss_clip": 1.00974357, "balance_loss_mlp": 1.00089681, "epoch": 0.30746107136415557, "flos": 19500294890880.0, "grad_norm": 1.7536375308506311, "language_loss": 0.78194189, "learning_rate": 3.2454999864515243e-06, "loss": 0.8076241, "num_input_tokens_seen": 54953460, "step": 2557, "time_per_iteration": 2.6490540504455566 }, { "auxiliary_loss_clip": 0.0132066, "auxiliary_loss_mlp": 0.00872627, "balance_loss_clip": 1.00905931, "balance_loss_mlp": 1.00007939, "epoch": 0.3075813142547947, "flos": 21724817415840.0, "grad_norm": 1.718232048101095, "language_loss": 0.69605339, "learning_rate": 3.244890408308263e-06, "loss": 0.71798623, "num_input_tokens_seen": 54974165, "step": 2558, "time_per_iteration": 2.783292293548584 }, { "auxiliary_loss_clip": 0.01293662, "auxiliary_loss_mlp": 0.01196882, "balance_loss_clip": 1.00818348, "balance_loss_mlp": 1.00061619, "epoch": 0.3077015571454338, "flos": 24097975215840.0, "grad_norm": 2.3427074196704223, "language_loss": 0.61712956, "learning_rate": 3.2442806413161165e-06, "loss": 0.64203501, "num_input_tokens_seen": 54993810, "step": 2559, "time_per_iteration": 2.8257381916046143 }, { "auxiliary_loss_clip": 0.01295936, "auxiliary_loss_mlp": 0.01197535, "balance_loss_clip": 1.00930285, "balance_loss_mlp": 1.00107837, "epoch": 0.30782180003607285, "flos": 18405485932320.0, "grad_norm": 1.8920542960995628, "language_loss": 0.75678557, "learning_rate": 3.243670685567586e-06, "loss": 0.78172028, "num_input_tokens_seen": 55011210, "step": 2560, "time_per_iteration": 2.810515880584717 }, { "auxiliary_loss_clip": 0.01328572, "auxiliary_loss_mlp": 0.0087256, "balance_loss_clip": 1.00875223, "balance_loss_mlp": 1.00005674, "epoch": 0.30794204292671196, "flos": 23878560785760.0, "grad_norm": 2.644559763187119, "language_loss": 0.80302858, "learning_rate": 3.2430605411552012e-06, "loss": 0.82503992, "num_input_tokens_seen": 55031325, "step": 2561, "time_per_iteration": 2.803617000579834 }, { "auxiliary_loss_clip": 0.01300765, "auxiliary_loss_mlp": 0.01194604, "balance_loss_clip": 1.00360751, "balance_loss_mlp": 1.00005424, "epoch": 0.30806228581735107, "flos": 67927837880160.0, "grad_norm": 0.8881863082607625, "language_loss": 0.7060703, "learning_rate": 3.2424502081715205e-06, "loss": 0.73102397, "num_input_tokens_seen": 55094440, "step": 2562, "time_per_iteration": 3.4117202758789062 }, { "auxiliary_loss_clip": 0.01332512, "auxiliary_loss_mlp": 0.01197289, "balance_loss_clip": 1.00941205, "balance_loss_mlp": 1.00083256, "epoch": 0.3081825287079901, "flos": 23843225093760.0, "grad_norm": 2.1233468410004432, "language_loss": 0.78075016, "learning_rate": 3.241839686709132e-06, "loss": 0.80604821, "num_input_tokens_seen": 55115375, "step": 2563, "time_per_iteration": 2.8972902297973633 }, { "auxiliary_loss_clip": 0.01360108, "auxiliary_loss_mlp": 0.01197484, "balance_loss_clip": 1.01029181, "balance_loss_mlp": 1.00083661, "epoch": 0.30830277159862923, "flos": 16209976531680.0, "grad_norm": 2.8021447594903415, "language_loss": 0.82025146, "learning_rate": 3.2412289768606495e-06, "loss": 0.8458274, "num_input_tokens_seen": 55131945, "step": 2564, "time_per_iteration": 2.806443452835083 }, { "auxiliary_loss_clip": 0.01358349, "auxiliary_loss_mlp": 0.01197099, "balance_loss_clip": 1.00991344, "balance_loss_mlp": 1.00083232, "epoch": 0.30842301448926834, "flos": 29349516142080.0, "grad_norm": 1.891782224756564, "language_loss": 0.82706314, "learning_rate": 3.240618078718718e-06, "loss": 0.85261762, "num_input_tokens_seen": 55153405, "step": 2565, "time_per_iteration": 2.9136579036712646 }, { "auxiliary_loss_clip": 0.0132347, "auxiliary_loss_mlp": 0.01197217, "balance_loss_clip": 1.01039171, "balance_loss_mlp": 1.00076056, "epoch": 0.3085432573799074, "flos": 21945201785280.0, "grad_norm": 1.8668694257996783, "language_loss": 0.74031311, "learning_rate": 3.240006992376011e-06, "loss": 0.76551998, "num_input_tokens_seen": 55173030, "step": 2566, "time_per_iteration": 2.8028218746185303 }, { "auxiliary_loss_clip": 0.01338893, "auxiliary_loss_mlp": 0.01196663, "balance_loss_clip": 1.00969172, "balance_loss_mlp": 1.00077808, "epoch": 0.3086635002705465, "flos": 22054729381920.0, "grad_norm": 2.103787319857064, "language_loss": 0.75897515, "learning_rate": 3.2393957179252284e-06, "loss": 0.78433073, "num_input_tokens_seen": 55189565, "step": 2567, "time_per_iteration": 2.8887219429016113 }, { "auxiliary_loss_clip": 0.01371965, "auxiliary_loss_mlp": 0.01197202, "balance_loss_clip": 1.01036179, "balance_loss_mlp": 1.00093603, "epoch": 0.3087837431611856, "flos": 32665937807520.0, "grad_norm": 1.8709454184717107, "language_loss": 0.80681694, "learning_rate": 3.2387842554591016e-06, "loss": 0.83250856, "num_input_tokens_seen": 55210380, "step": 2568, "time_per_iteration": 2.7861363887786865 }, { "auxiliary_loss_clip": 0.01371702, "auxiliary_loss_mlp": 0.01196965, "balance_loss_clip": 1.01040292, "balance_loss_mlp": 1.00088942, "epoch": 0.3089039860518247, "flos": 17599253993280.0, "grad_norm": 2.084882941493386, "language_loss": 0.87614608, "learning_rate": 3.238172605070388e-06, "loss": 0.9018327, "num_input_tokens_seen": 55225795, "step": 2569, "time_per_iteration": 2.8370413780212402 }, { "auxiliary_loss_clip": 0.01345888, "auxiliary_loss_mlp": 0.00872771, "balance_loss_clip": 1.00981498, "balance_loss_mlp": 1.00015378, "epoch": 0.3090242289424638, "flos": 14383846012320.0, "grad_norm": 7.7415683720987865, "language_loss": 0.78180391, "learning_rate": 3.2375607668518745e-06, "loss": 0.80399048, "num_input_tokens_seen": 55238830, "step": 2570, "time_per_iteration": 2.7033090591430664 }, { "auxiliary_loss_clip": 0.01334568, "auxiliary_loss_mlp": 0.01197355, "balance_loss_clip": 1.00989866, "balance_loss_mlp": 1.00070727, "epoch": 0.30914447183310284, "flos": 16068633763680.0, "grad_norm": 2.4759484891474846, "language_loss": 0.89958364, "learning_rate": 3.236948740896377e-06, "loss": 0.92490292, "num_input_tokens_seen": 55253630, "step": 2571, "time_per_iteration": 2.784740686416626 }, { "auxiliary_loss_clip": 0.01347631, "auxiliary_loss_mlp": 0.01197241, "balance_loss_clip": 1.00945842, "balance_loss_mlp": 1.0007844, "epoch": 0.30926471472374195, "flos": 32230234307520.0, "grad_norm": 1.4857131730847861, "language_loss": 0.8438229, "learning_rate": 3.2363365272967384e-06, "loss": 0.86927164, "num_input_tokens_seen": 55276200, "step": 2572, "time_per_iteration": 3.870349168777466 }, { "auxiliary_loss_clip": 0.01347007, "auxiliary_loss_mlp": 0.01196835, "balance_loss_clip": 1.01046133, "balance_loss_mlp": 1.00076008, "epoch": 0.30938495761438106, "flos": 20370731951520.0, "grad_norm": 1.870430758414538, "language_loss": 0.81178445, "learning_rate": 3.235724126145832e-06, "loss": 0.83722287, "num_input_tokens_seen": 55292235, "step": 2573, "time_per_iteration": 2.7119181156158447 }, { "auxiliary_loss_clip": 0.01360018, "auxiliary_loss_mlp": 0.01197083, "balance_loss_clip": 1.01034236, "balance_loss_mlp": 1.00081718, "epoch": 0.3095052005050201, "flos": 24061166652960.0, "grad_norm": 1.530757512389563, "language_loss": 0.77475893, "learning_rate": 3.235111537536558e-06, "loss": 0.80032992, "num_input_tokens_seen": 55313050, "step": 2574, "time_per_iteration": 4.822031497955322 }, { "auxiliary_loss_clip": 0.0135145, "auxiliary_loss_mlp": 0.01196833, "balance_loss_clip": 1.00939262, "balance_loss_mlp": 1.00075769, "epoch": 0.30962544339565923, "flos": 23401558262880.0, "grad_norm": 1.9497793563362733, "language_loss": 0.83041728, "learning_rate": 3.2344987615618456e-06, "loss": 0.85590011, "num_input_tokens_seen": 55332885, "step": 2575, "time_per_iteration": 2.861895799636841 }, { "auxiliary_loss_clip": 0.0130686, "auxiliary_loss_mlp": 0.01197143, "balance_loss_clip": 1.00916672, "balance_loss_mlp": 1.00087702, "epoch": 0.30974568628629834, "flos": 33799998139200.0, "grad_norm": 1.6097927163500048, "language_loss": 0.78452659, "learning_rate": 3.2338857983146533e-06, "loss": 0.80956656, "num_input_tokens_seen": 55354385, "step": 2576, "time_per_iteration": 2.969644784927368 }, { "auxiliary_loss_clip": 0.01337326, "auxiliary_loss_mlp": 0.01197198, "balance_loss_clip": 1.010656, "balance_loss_mlp": 1.0007416, "epoch": 0.3098659291769374, "flos": 20229604725600.0, "grad_norm": 1.7371381125846155, "language_loss": 0.76535988, "learning_rate": 3.233272647887966e-06, "loss": 0.79070508, "num_input_tokens_seen": 55373275, "step": 2577, "time_per_iteration": 2.8459534645080566 }, { "auxiliary_loss_clip": 0.01372114, "auxiliary_loss_mlp": 0.01197098, "balance_loss_clip": 1.01050591, "balance_loss_mlp": 1.00083184, "epoch": 0.3099861720675765, "flos": 24748566762240.0, "grad_norm": 1.633820322705964, "language_loss": 0.90351808, "learning_rate": 3.2326593103747985e-06, "loss": 0.92921019, "num_input_tokens_seen": 55392290, "step": 2578, "time_per_iteration": 2.80232834815979 }, { "auxiliary_loss_clip": 0.01346948, "auxiliary_loss_mlp": 0.01196939, "balance_loss_clip": 1.01016581, "balance_loss_mlp": 1.00067282, "epoch": 0.3101064149582156, "flos": 11765493789120.0, "grad_norm": 1.8699126793935077, "language_loss": 0.85048628, "learning_rate": 3.2320457858681936e-06, "loss": 0.87592518, "num_input_tokens_seen": 55410680, "step": 2579, "time_per_iteration": 2.650381326675415 }, { "auxiliary_loss_clip": 0.01336508, "auxiliary_loss_mlp": 0.01196559, "balance_loss_clip": 1.0097239, "balance_loss_mlp": 1.00067449, "epoch": 0.31022665784885467, "flos": 23033257092000.0, "grad_norm": 2.1161521546847686, "language_loss": 0.85660791, "learning_rate": 3.2314320744612228e-06, "loss": 0.88193858, "num_input_tokens_seen": 55425980, "step": 2580, "time_per_iteration": 2.8268277645111084 }, { "auxiliary_loss_clip": 0.0134785, "auxiliary_loss_mlp": 0.01196088, "balance_loss_clip": 1.0094403, "balance_loss_mlp": 1.00058484, "epoch": 0.3103469007394938, "flos": 16289197751520.0, "grad_norm": 1.7047160588918728, "language_loss": 0.76157904, "learning_rate": 3.2308181762469854e-06, "loss": 0.78701842, "num_input_tokens_seen": 55443925, "step": 2581, "time_per_iteration": 2.818171501159668 }, { "auxiliary_loss_clip": 0.01372769, "auxiliary_loss_mlp": 0.01197491, "balance_loss_clip": 1.01053381, "balance_loss_mlp": 1.00084352, "epoch": 0.3104671436301329, "flos": 30515283874080.0, "grad_norm": 2.0379169274842655, "language_loss": 0.78487611, "learning_rate": 3.230204091318609e-06, "loss": 0.8105787, "num_input_tokens_seen": 55464465, "step": 2582, "time_per_iteration": 2.843968391418457 }, { "auxiliary_loss_clip": 0.01371334, "auxiliary_loss_mlp": 0.0087259, "balance_loss_clip": 1.01016331, "balance_loss_mlp": 1.0000515, "epoch": 0.31058738652077195, "flos": 20047250324160.0, "grad_norm": 3.708643347797927, "language_loss": 0.84831822, "learning_rate": 3.2295898197692503e-06, "loss": 0.87075746, "num_input_tokens_seen": 55483425, "step": 2583, "time_per_iteration": 2.72165584564209 }, { "auxiliary_loss_clip": 0.01371489, "auxiliary_loss_mlp": 0.01196909, "balance_loss_clip": 1.01043856, "balance_loss_mlp": 1.00064254, "epoch": 0.31070762941141106, "flos": 28074651897600.0, "grad_norm": 1.7010158213765767, "language_loss": 0.79188979, "learning_rate": 3.228975361692094e-06, "loss": 0.81757379, "num_input_tokens_seen": 55504445, "step": 2584, "time_per_iteration": 2.716865301132202 }, { "auxiliary_loss_clip": 0.01359482, "auxiliary_loss_mlp": 0.00872749, "balance_loss_clip": 1.01049328, "balance_loss_mlp": 1.00015879, "epoch": 0.31082787230205017, "flos": 20521917807840.0, "grad_norm": 1.973923458065514, "language_loss": 0.79927313, "learning_rate": 3.228360717180352e-06, "loss": 0.82159543, "num_input_tokens_seen": 55521970, "step": 2585, "time_per_iteration": 2.7701187133789062 }, { "auxiliary_loss_clip": 0.01343466, "auxiliary_loss_mlp": 0.0087214, "balance_loss_clip": 1.00350595, "balance_loss_mlp": 1.0001725, "epoch": 0.3109481151926892, "flos": 62445961725120.0, "grad_norm": 0.8127464935030001, "language_loss": 0.59410512, "learning_rate": 3.227745886327266e-06, "loss": 0.61626112, "num_input_tokens_seen": 55580665, "step": 2586, "time_per_iteration": 3.250237464904785 }, { "auxiliary_loss_clip": 0.01343468, "auxiliary_loss_mlp": 0.01194617, "balance_loss_clip": 1.00350797, "balance_loss_mlp": 1.00006735, "epoch": 0.31106835808332833, "flos": 44746771757760.0, "grad_norm": 0.8164953496862584, "language_loss": 0.55862564, "learning_rate": 3.227130869226105e-06, "loss": 0.58400649, "num_input_tokens_seen": 55637825, "step": 2587, "time_per_iteration": 3.300185203552246 }, { "auxiliary_loss_clip": 0.01351526, "auxiliary_loss_mlp": 0.01196898, "balance_loss_clip": 1.00931311, "balance_loss_mlp": 1.00082254, "epoch": 0.3111886009739674, "flos": 23403067057440.0, "grad_norm": 2.462169924933722, "language_loss": 0.82463849, "learning_rate": 3.226515665970167e-06, "loss": 0.85012281, "num_input_tokens_seen": 55655365, "step": 2588, "time_per_iteration": 2.7967796325683594 }, { "auxiliary_loss_clip": 0.01345858, "auxiliary_loss_mlp": 0.01197221, "balance_loss_clip": 1.00926936, "balance_loss_mlp": 1.00076389, "epoch": 0.3113088438646065, "flos": 17530738030080.0, "grad_norm": 5.731607627404757, "language_loss": 0.86165631, "learning_rate": 3.225900276652777e-06, "loss": 0.88708711, "num_input_tokens_seen": 55672140, "step": 2589, "time_per_iteration": 2.7386646270751953 }, { "auxiliary_loss_clip": 0.01343209, "auxiliary_loss_mlp": 0.01197181, "balance_loss_clip": 1.00931978, "balance_loss_mlp": 1.00072372, "epoch": 0.3114290867552456, "flos": 28365815422080.0, "grad_norm": 1.522313930905249, "language_loss": 0.75333631, "learning_rate": 3.2252847013672906e-06, "loss": 0.77874017, "num_input_tokens_seen": 55694800, "step": 2590, "time_per_iteration": 2.7745676040649414 }, { "auxiliary_loss_clip": 0.01317566, "auxiliary_loss_mlp": 0.01196591, "balance_loss_clip": 1.00888371, "balance_loss_mlp": 1.00070608, "epoch": 0.31154932964588467, "flos": 27379168960320.0, "grad_norm": 2.134628144204949, "language_loss": 0.7548933, "learning_rate": 3.224668940207089e-06, "loss": 0.7800349, "num_input_tokens_seen": 55713785, "step": 2591, "time_per_iteration": 2.9446966648101807 }, { "auxiliary_loss_clip": 0.01310706, "auxiliary_loss_mlp": 0.01197294, "balance_loss_clip": 1.00962067, "balance_loss_mlp": 1.00102782, "epoch": 0.3116695725365238, "flos": 26541876247200.0, "grad_norm": 2.312300296134534, "language_loss": 0.86829507, "learning_rate": 3.2240529932655828e-06, "loss": 0.89337504, "num_input_tokens_seen": 55733050, "step": 2592, "time_per_iteration": 2.933920383453369 }, { "auxiliary_loss_clip": 0.01322026, "auxiliary_loss_mlp": 0.01197291, "balance_loss_clip": 1.00910008, "balance_loss_mlp": 1.00083447, "epoch": 0.3117898154271629, "flos": 21177610516800.0, "grad_norm": 2.9182187848610237, "language_loss": 0.88635051, "learning_rate": 3.223436860636211e-06, "loss": 0.91154367, "num_input_tokens_seen": 55748685, "step": 2593, "time_per_iteration": 2.708160161972046 }, { "auxiliary_loss_clip": 0.01371061, "auxiliary_loss_mlp": 0.01196352, "balance_loss_clip": 1.01002514, "balance_loss_mlp": 1.00046778, "epoch": 0.31191005831780194, "flos": 27272443410720.0, "grad_norm": 1.8910080829808886, "language_loss": 0.74453896, "learning_rate": 3.2228205424124403e-06, "loss": 0.77021307, "num_input_tokens_seen": 55771840, "step": 2594, "time_per_iteration": 2.783799648284912 }, { "auxiliary_loss_clip": 0.01333904, "auxiliary_loss_mlp": 0.01196804, "balance_loss_clip": 1.00951123, "balance_loss_mlp": 1.00053763, "epoch": 0.31203030120844105, "flos": 12963507776640.0, "grad_norm": 2.2873838897203025, "language_loss": 0.74761271, "learning_rate": 3.222204038687765e-06, "loss": 0.77291989, "num_input_tokens_seen": 55784975, "step": 2595, "time_per_iteration": 2.7378973960876465 }, { "auxiliary_loss_clip": 0.01346056, "auxiliary_loss_mlp": 0.01196708, "balance_loss_clip": 1.0094769, "balance_loss_mlp": 1.0008235, "epoch": 0.31215054409908016, "flos": 27562026293280.0, "grad_norm": 1.555468689354244, "language_loss": 0.88026726, "learning_rate": 3.221587349555709e-06, "loss": 0.90569484, "num_input_tokens_seen": 55805235, "step": 2596, "time_per_iteration": 2.827207565307617 }, { "auxiliary_loss_clip": 0.0133824, "auxiliary_loss_mlp": 0.01196956, "balance_loss_clip": 1.00970328, "balance_loss_mlp": 1.00069022, "epoch": 0.3122707869897192, "flos": 21506337001440.0, "grad_norm": 2.1520009053305635, "language_loss": 0.69547057, "learning_rate": 3.2209704751098236e-06, "loss": 0.72082257, "num_input_tokens_seen": 55824265, "step": 2597, "time_per_iteration": 3.8115367889404297 }, { "auxiliary_loss_clip": 0.01329248, "auxiliary_loss_mlp": 0.01197665, "balance_loss_clip": 1.00927901, "balance_loss_mlp": 1.00082684, "epoch": 0.31239102988035833, "flos": 15187024438560.0, "grad_norm": 2.006610458048833, "language_loss": 0.82587093, "learning_rate": 3.2203534154436875e-06, "loss": 0.85114002, "num_input_tokens_seen": 55838620, "step": 2598, "time_per_iteration": 2.7554588317871094 }, { "auxiliary_loss_clip": 0.01279986, "auxiliary_loss_mlp": 0.01196201, "balance_loss_clip": 1.00829864, "balance_loss_mlp": 1.00069785, "epoch": 0.31251127277099744, "flos": 22053723518880.0, "grad_norm": 1.948207699692254, "language_loss": 0.75405729, "learning_rate": 3.2197361706509084e-06, "loss": 0.7788192, "num_input_tokens_seen": 55859375, "step": 2599, "time_per_iteration": 2.9339001178741455 }, { "auxiliary_loss_clip": 0.01371991, "auxiliary_loss_mlp": 0.01197418, "balance_loss_clip": 1.01045513, "balance_loss_mlp": 1.00077021, "epoch": 0.3126315156616365, "flos": 15193993632480.0, "grad_norm": 2.600915486946674, "language_loss": 0.83978271, "learning_rate": 3.2191187408251228e-06, "loss": 0.86547685, "num_input_tokens_seen": 55876535, "step": 2600, "time_per_iteration": 4.768860340118408 }, { "auxiliary_loss_clip": 0.01357957, "auxiliary_loss_mlp": 0.01197303, "balance_loss_clip": 1.01027989, "balance_loss_mlp": 1.00084662, "epoch": 0.3127517585522756, "flos": 18145347258240.0, "grad_norm": 2.2470300411718003, "language_loss": 0.78818077, "learning_rate": 3.218501126059993e-06, "loss": 0.8137334, "num_input_tokens_seen": 55891930, "step": 2601, "time_per_iteration": 2.7374818325042725 }, { "auxiliary_loss_clip": 0.01359504, "auxiliary_loss_mlp": 0.01197036, "balance_loss_clip": 1.01029491, "balance_loss_mlp": 1.00076938, "epoch": 0.31287200144291466, "flos": 21908644688160.0, "grad_norm": 1.8929816627194969, "language_loss": 0.81315833, "learning_rate": 3.2178833264492116e-06, "loss": 0.83872378, "num_input_tokens_seen": 55910635, "step": 2602, "time_per_iteration": 2.770921468734741 }, { "auxiliary_loss_clip": 0.01357598, "auxiliary_loss_mlp": 0.01197733, "balance_loss_clip": 1.00994325, "balance_loss_mlp": 1.00089514, "epoch": 0.31299224433355377, "flos": 29897010430560.0, "grad_norm": 1.8449443992332768, "language_loss": 0.75926197, "learning_rate": 3.217265342086498e-06, "loss": 0.78481525, "num_input_tokens_seen": 55931125, "step": 2603, "time_per_iteration": 2.7433600425720215 }, { "auxiliary_loss_clip": 0.01311227, "auxiliary_loss_mlp": 0.00872805, "balance_loss_clip": 1.00965786, "balance_loss_mlp": 1.00022495, "epoch": 0.3131124872241929, "flos": 11655894345120.0, "grad_norm": 2.5661360822526817, "language_loss": 0.72816575, "learning_rate": 3.216647173065599e-06, "loss": 0.75000608, "num_input_tokens_seen": 55946590, "step": 2604, "time_per_iteration": 2.8562259674072266 }, { "auxiliary_loss_clip": 0.01324372, "auxiliary_loss_mlp": 0.01196713, "balance_loss_clip": 1.00929713, "balance_loss_mlp": 1.00063705, "epoch": 0.31323273011483194, "flos": 49848802031520.0, "grad_norm": 1.7880765988632636, "language_loss": 0.73957974, "learning_rate": 3.216028819480292e-06, "loss": 0.76479059, "num_input_tokens_seen": 55967930, "step": 2605, "time_per_iteration": 2.9620676040649414 }, { "auxiliary_loss_clip": 0.01335089, "auxiliary_loss_mlp": 0.01197145, "balance_loss_clip": 1.01032197, "balance_loss_mlp": 1.00087857, "epoch": 0.31335297300547105, "flos": 22601289654720.0, "grad_norm": 2.3106891201758843, "language_loss": 0.75592256, "learning_rate": 3.2154102814243793e-06, "loss": 0.78124493, "num_input_tokens_seen": 55987070, "step": 2606, "time_per_iteration": 2.7400505542755127 }, { "auxiliary_loss_clip": 0.01318141, "auxiliary_loss_mlp": 0.01197295, "balance_loss_clip": 1.00964618, "balance_loss_mlp": 1.0008378, "epoch": 0.31347321589611016, "flos": 34710872054400.0, "grad_norm": 1.8992949345622019, "language_loss": 0.66704428, "learning_rate": 3.2147915589916937e-06, "loss": 0.69219863, "num_input_tokens_seen": 56008630, "step": 2607, "time_per_iteration": 2.913922071456909 }, { "auxiliary_loss_clip": 0.01348178, "auxiliary_loss_mlp": 0.01196989, "balance_loss_clip": 1.01083565, "balance_loss_mlp": 1.00072312, "epoch": 0.3135934587867492, "flos": 19755799410240.0, "grad_norm": 1.7331484955584608, "language_loss": 0.82530558, "learning_rate": 3.2141726522760938e-06, "loss": 0.85075724, "num_input_tokens_seen": 56026690, "step": 2608, "time_per_iteration": 2.737154960632324 }, { "auxiliary_loss_clip": 0.0132315, "auxiliary_loss_mlp": 0.01194678, "balance_loss_clip": 1.00367498, "balance_loss_mlp": 1.00012803, "epoch": 0.3137137016773883, "flos": 65815573151520.0, "grad_norm": 0.7025916409145179, "language_loss": 0.52654153, "learning_rate": 3.213553561371469e-06, "loss": 0.55171978, "num_input_tokens_seen": 56090425, "step": 2609, "time_per_iteration": 3.446143627166748 }, { "auxiliary_loss_clip": 0.01283946, "auxiliary_loss_mlp": 0.01196648, "balance_loss_clip": 1.00762355, "balance_loss_mlp": 1.00076342, "epoch": 0.31383394456802743, "flos": 16252748425440.0, "grad_norm": 2.0392521640542762, "language_loss": 0.9556008, "learning_rate": 3.212934286371733e-06, "loss": 0.98040676, "num_input_tokens_seen": 56107135, "step": 2610, "time_per_iteration": 2.7573044300079346 }, { "auxiliary_loss_clip": 0.01347427, "auxiliary_loss_mlp": 0.01197212, "balance_loss_clip": 1.01109219, "balance_loss_mlp": 1.00075495, "epoch": 0.3139541874586665, "flos": 38795531614560.0, "grad_norm": 2.450283066635076, "language_loss": 0.83290589, "learning_rate": 3.2123148273708304e-06, "loss": 0.8583523, "num_input_tokens_seen": 56127325, "step": 2611, "time_per_iteration": 2.901961326599121 }, { "auxiliary_loss_clip": 0.01370884, "auxiliary_loss_mlp": 0.01196563, "balance_loss_clip": 1.0104239, "balance_loss_mlp": 1.00067806, "epoch": 0.3140744303493056, "flos": 25046340243840.0, "grad_norm": 1.784620726492548, "language_loss": 0.76612121, "learning_rate": 3.211695184462733e-06, "loss": 0.79179573, "num_input_tokens_seen": 56148500, "step": 2612, "time_per_iteration": 2.798766613006592 }, { "auxiliary_loss_clip": 0.01292866, "auxiliary_loss_mlp": 0.01194648, "balance_loss_clip": 1.00371397, "balance_loss_mlp": 1.00009823, "epoch": 0.3141946732399447, "flos": 72504287310240.0, "grad_norm": 0.878648210107357, "language_loss": 0.60452044, "learning_rate": 3.2110753577414383e-06, "loss": 0.6293956, "num_input_tokens_seen": 56210080, "step": 2613, "time_per_iteration": 3.32523512840271 }, { "auxiliary_loss_clip": 0.01335599, "auxiliary_loss_mlp": 0.01197071, "balance_loss_clip": 1.00904727, "balance_loss_mlp": 1.00080526, "epoch": 0.31431491613058377, "flos": 19239796980000.0, "grad_norm": 2.0642188166353295, "language_loss": 0.7892378, "learning_rate": 3.2104553473009757e-06, "loss": 0.81456447, "num_input_tokens_seen": 56228200, "step": 2614, "time_per_iteration": 2.822598457336426 }, { "auxiliary_loss_clip": 0.01279265, "auxiliary_loss_mlp": 0.01196786, "balance_loss_clip": 1.0075711, "balance_loss_mlp": 1.00071084, "epoch": 0.3144351590212229, "flos": 36210611128320.0, "grad_norm": 1.8327276927543814, "language_loss": 0.6719262, "learning_rate": 3.209835153235399e-06, "loss": 0.69668674, "num_input_tokens_seen": 56249755, "step": 2615, "time_per_iteration": 2.8721425533294678 }, { "auxiliary_loss_clip": 0.0133256, "auxiliary_loss_mlp": 0.0119689, "balance_loss_clip": 1.01045966, "balance_loss_mlp": 1.0008142, "epoch": 0.314555401911862, "flos": 18551750244480.0, "grad_norm": 1.7505954267281576, "language_loss": 0.67846012, "learning_rate": 3.2092147756387916e-06, "loss": 0.70375466, "num_input_tokens_seen": 56270080, "step": 2616, "time_per_iteration": 2.7736926078796387 }, { "auxiliary_loss_clip": 0.0134737, "auxiliary_loss_mlp": 0.0119662, "balance_loss_clip": 1.01053929, "balance_loss_mlp": 1.00073552, "epoch": 0.31467564480250104, "flos": 16362886724640.0, "grad_norm": 1.8908950428649756, "language_loss": 0.83502424, "learning_rate": 3.208594214605264e-06, "loss": 0.8604641, "num_input_tokens_seen": 56288625, "step": 2617, "time_per_iteration": 2.735996723175049 }, { "auxiliary_loss_clip": 0.01334716, "auxiliary_loss_mlp": 0.01195949, "balance_loss_clip": 1.00976062, "balance_loss_mlp": 1.00073147, "epoch": 0.31479588769314015, "flos": 21652385771520.0, "grad_norm": 2.0268203889840937, "language_loss": 0.77293491, "learning_rate": 3.2079734702289553e-06, "loss": 0.79824162, "num_input_tokens_seen": 56307520, "step": 2618, "time_per_iteration": 2.7381772994995117 }, { "auxiliary_loss_clip": 0.01319801, "auxiliary_loss_mlp": 0.0087209, "balance_loss_clip": 1.00379872, "balance_loss_mlp": 1.00022018, "epoch": 0.3149161305837792, "flos": 66051104104800.0, "grad_norm": 0.8129279644796972, "language_loss": 0.60377955, "learning_rate": 3.207352542604031e-06, "loss": 0.62569845, "num_input_tokens_seen": 56369855, "step": 2619, "time_per_iteration": 3.34263277053833 }, { "auxiliary_loss_clip": 0.01309587, "auxiliary_loss_mlp": 0.01196077, "balance_loss_clip": 1.00894284, "balance_loss_mlp": 1.00057387, "epoch": 0.3150363734744183, "flos": 28987214225760.0, "grad_norm": 1.6242814862439832, "language_loss": 0.7831831, "learning_rate": 3.2067314318246864e-06, "loss": 0.80823976, "num_input_tokens_seen": 56390570, "step": 2620, "time_per_iteration": 2.8985092639923096 }, { "auxiliary_loss_clip": 0.0130276, "auxiliary_loss_mlp": 0.01197375, "balance_loss_clip": 1.00874138, "balance_loss_mlp": 1.00091815, "epoch": 0.31515661636505743, "flos": 27636613358400.0, "grad_norm": 1.695871770524042, "language_loss": 0.77851021, "learning_rate": 3.206110137985143e-06, "loss": 0.8035115, "num_input_tokens_seen": 56410775, "step": 2621, "time_per_iteration": 2.8019235134124756 }, { "auxiliary_loss_clip": 0.01308685, "auxiliary_loss_mlp": 0.01196442, "balance_loss_clip": 1.00901389, "balance_loss_mlp": 1.00055754, "epoch": 0.3152768592556965, "flos": 24605643352320.0, "grad_norm": 2.0612239453147323, "language_loss": 0.91867292, "learning_rate": 3.2054886611796505e-06, "loss": 0.94372427, "num_input_tokens_seen": 56429770, "step": 2622, "time_per_iteration": 2.8056907653808594 }, { "auxiliary_loss_clip": 0.01343003, "auxiliary_loss_mlp": 0.01194607, "balance_loss_clip": 1.00431406, "balance_loss_mlp": 1.00005782, "epoch": 0.3153971021463356, "flos": 68476935525120.0, "grad_norm": 0.8859965225856061, "language_loss": 0.63572431, "learning_rate": 3.204867001502487e-06, "loss": 0.66110039, "num_input_tokens_seen": 56488425, "step": 2623, "time_per_iteration": 4.234644651412964 }, { "auxiliary_loss_clip": 0.01371904, "auxiliary_loss_mlp": 0.01196676, "balance_loss_clip": 1.01090145, "balance_loss_mlp": 1.00060081, "epoch": 0.3155173450369747, "flos": 25593726761280.0, "grad_norm": 2.1426985547540376, "language_loss": 0.80670774, "learning_rate": 3.2042451590479567e-06, "loss": 0.83239353, "num_input_tokens_seen": 56508940, "step": 2624, "time_per_iteration": 2.8164174556732178 }, { "auxiliary_loss_clip": 0.0136967, "auxiliary_loss_mlp": 0.01196437, "balance_loss_clip": 1.01014006, "balance_loss_mlp": 1.00083852, "epoch": 0.31563758792761376, "flos": 24309342741600.0, "grad_norm": 1.5553626667676355, "language_loss": 0.86712271, "learning_rate": 3.203623133910394e-06, "loss": 0.89278376, "num_input_tokens_seen": 56527245, "step": 2625, "time_per_iteration": 2.6975018978118896 }, { "auxiliary_loss_clip": 0.01274151, "auxiliary_loss_mlp": 0.01196835, "balance_loss_clip": 1.00894868, "balance_loss_mlp": 1.00075996, "epoch": 0.31575783081825287, "flos": 31903878785760.0, "grad_norm": 2.553355676488198, "language_loss": 0.76899719, "learning_rate": 3.203000926184158e-06, "loss": 0.79370713, "num_input_tokens_seen": 56546170, "step": 2626, "time_per_iteration": 3.9326248168945312 }, { "auxiliary_loss_clip": 0.01370918, "auxiliary_loss_mlp": 0.01196136, "balance_loss_clip": 1.01038122, "balance_loss_mlp": 1.0005374, "epoch": 0.315878073708892, "flos": 30810973782240.0, "grad_norm": 1.716133829202625, "language_loss": 0.77795875, "learning_rate": 3.202378535963639e-06, "loss": 0.80362928, "num_input_tokens_seen": 56567085, "step": 2627, "time_per_iteration": 2.7143402099609375 }, { "auxiliary_loss_clip": 0.01336327, "auxiliary_loss_mlp": 0.00872787, "balance_loss_clip": 1.00977004, "balance_loss_mlp": 1.00033236, "epoch": 0.31599831659953104, "flos": 22200275220480.0, "grad_norm": 1.7319233419507738, "language_loss": 0.83925295, "learning_rate": 3.2017559633432516e-06, "loss": 0.8613441, "num_input_tokens_seen": 56586715, "step": 2628, "time_per_iteration": 2.730381727218628 }, { "auxiliary_loss_clip": 0.01344474, "auxiliary_loss_mlp": 0.01196937, "balance_loss_clip": 1.01045108, "balance_loss_mlp": 1.00086164, "epoch": 0.31611855949017015, "flos": 25593475295520.0, "grad_norm": 1.8234872960999153, "language_loss": 0.66348577, "learning_rate": 3.2011332084174398e-06, "loss": 0.68889987, "num_input_tokens_seen": 56607585, "step": 2629, "time_per_iteration": 2.7790398597717285 }, { "auxiliary_loss_clip": 0.01345077, "auxiliary_loss_mlp": 0.01197157, "balance_loss_clip": 1.00960815, "balance_loss_mlp": 1.00069976, "epoch": 0.31623880238080926, "flos": 20594097986400.0, "grad_norm": 1.5383972482226014, "language_loss": 0.89141279, "learning_rate": 3.2005102712806756e-06, "loss": 0.91683519, "num_input_tokens_seen": 56626415, "step": 2630, "time_per_iteration": 2.7104976177215576 }, { "auxiliary_loss_clip": 0.01358395, "auxiliary_loss_mlp": 0.01196481, "balance_loss_clip": 1.01021051, "balance_loss_mlp": 1.00078762, "epoch": 0.3163590452714483, "flos": 12784925361600.0, "grad_norm": 2.2164669781828623, "language_loss": 0.72939646, "learning_rate": 3.1998871520274575e-06, "loss": 0.75494516, "num_input_tokens_seen": 56641750, "step": 2631, "time_per_iteration": 2.7473061084747314 }, { "auxiliary_loss_clip": 0.01346407, "auxiliary_loss_mlp": 0.01196713, "balance_loss_clip": 1.01003098, "balance_loss_mlp": 1.00063777, "epoch": 0.3164792881620874, "flos": 23041303996320.0, "grad_norm": 1.6540445647428423, "language_loss": 0.84727526, "learning_rate": 3.199263850752312e-06, "loss": 0.87270653, "num_input_tokens_seen": 56662585, "step": 2632, "time_per_iteration": 2.7469253540039062 }, { "auxiliary_loss_clip": 0.01349742, "auxiliary_loss_mlp": 0.01196728, "balance_loss_clip": 1.00977826, "balance_loss_mlp": 1.00065207, "epoch": 0.31659953105272653, "flos": 18296281648800.0, "grad_norm": 2.409357103447374, "language_loss": 0.86032909, "learning_rate": 3.198640367549795e-06, "loss": 0.88579381, "num_input_tokens_seen": 56681480, "step": 2633, "time_per_iteration": 2.687955379486084 }, { "auxiliary_loss_clip": 0.01355493, "auxiliary_loss_mlp": 0.00872598, "balance_loss_clip": 1.0097301, "balance_loss_mlp": 1.00014734, "epoch": 0.3167197739433656, "flos": 25703433976320.0, "grad_norm": 1.8609704303401584, "language_loss": 0.86003673, "learning_rate": 3.198016702514487e-06, "loss": 0.88231766, "num_input_tokens_seen": 56701760, "step": 2634, "time_per_iteration": 2.7563297748565674 }, { "auxiliary_loss_clip": 0.01370342, "auxiliary_loss_mlp": 0.01196009, "balance_loss_clip": 1.01043892, "balance_loss_mlp": 1.00050592, "epoch": 0.3168400168340047, "flos": 23546026391040.0, "grad_norm": 1.5432319272948578, "language_loss": 0.84383726, "learning_rate": 3.1973928557409972e-06, "loss": 0.86950076, "num_input_tokens_seen": 56719800, "step": 2635, "time_per_iteration": 2.684112310409546 }, { "auxiliary_loss_clip": 0.01369403, "auxiliary_loss_mlp": 0.0119607, "balance_loss_clip": 1.01007426, "balance_loss_mlp": 1.00047135, "epoch": 0.31696025972464376, "flos": 28366462048320.0, "grad_norm": 2.3471762389566053, "language_loss": 0.70642459, "learning_rate": 3.1967688273239636e-06, "loss": 0.73207933, "num_input_tokens_seen": 56739605, "step": 2636, "time_per_iteration": 2.772077798843384 }, { "auxiliary_loss_clip": 0.01307782, "auxiliary_loss_mlp": 0.01196372, "balance_loss_clip": 1.00900578, "balance_loss_mlp": 1.00058246, "epoch": 0.31708050261528287, "flos": 16399120508640.0, "grad_norm": 1.6705036890093932, "language_loss": 0.82173145, "learning_rate": 3.1961446173580503e-06, "loss": 0.84677303, "num_input_tokens_seen": 56756545, "step": 2637, "time_per_iteration": 2.736511468887329 }, { "auxiliary_loss_clip": 0.01333276, "auxiliary_loss_mlp": 0.01196254, "balance_loss_clip": 1.01014245, "balance_loss_mlp": 1.00075042, "epoch": 0.317200745505922, "flos": 26212359441600.0, "grad_norm": 1.5820641000766742, "language_loss": 0.77151847, "learning_rate": 3.1955202259379502e-06, "loss": 0.79681379, "num_input_tokens_seen": 56778275, "step": 2638, "time_per_iteration": 2.770707130432129 }, { "auxiliary_loss_clip": 0.01357997, "auxiliary_loss_mlp": 0.01196373, "balance_loss_clip": 1.01003408, "balance_loss_mlp": 1.00067866, "epoch": 0.31732098839656103, "flos": 31350888174240.0, "grad_norm": 1.857609501928152, "language_loss": 0.83160031, "learning_rate": 3.194895653158381e-06, "loss": 0.857144, "num_input_tokens_seen": 56797215, "step": 2639, "time_per_iteration": 2.758899450302124 }, { "auxiliary_loss_clip": 0.01341628, "auxiliary_loss_mlp": 0.01194634, "balance_loss_clip": 1.00358927, "balance_loss_mlp": 1.00008416, "epoch": 0.31744123128720014, "flos": 58989059460000.0, "grad_norm": 0.7621971548395011, "language_loss": 0.55564153, "learning_rate": 3.194270899114093e-06, "loss": 0.58100414, "num_input_tokens_seen": 56863010, "step": 2640, "time_per_iteration": 3.351869583129883 }, { "auxiliary_loss_clip": 0.01359272, "auxiliary_loss_mlp": 0.01196747, "balance_loss_clip": 1.01079273, "balance_loss_mlp": 1.00086236, "epoch": 0.31756147417783925, "flos": 17417582141760.0, "grad_norm": 2.01346338606648, "language_loss": 0.82311374, "learning_rate": 3.193645963899858e-06, "loss": 0.84867394, "num_input_tokens_seen": 56880625, "step": 2641, "time_per_iteration": 2.743069887161255 }, { "auxiliary_loss_clip": 0.01332769, "auxiliary_loss_mlp": 0.01196743, "balance_loss_clip": 1.01007891, "balance_loss_mlp": 1.00066805, "epoch": 0.3176817170684783, "flos": 25481684507040.0, "grad_norm": 2.1089305584228644, "language_loss": 0.84197623, "learning_rate": 3.193020847610479e-06, "loss": 0.86727136, "num_input_tokens_seen": 56900945, "step": 2642, "time_per_iteration": 2.8261892795562744 }, { "auxiliary_loss_clip": 0.0133715, "auxiliary_loss_mlp": 0.01196742, "balance_loss_clip": 1.00974941, "balance_loss_mlp": 1.00066662, "epoch": 0.3178019599591174, "flos": 24972615347040.0, "grad_norm": 2.6316898334186893, "language_loss": 0.71364093, "learning_rate": 3.192395550340787e-06, "loss": 0.73897988, "num_input_tokens_seen": 56918895, "step": 2643, "time_per_iteration": 2.7880187034606934 }, { "auxiliary_loss_clip": 0.01346548, "auxiliary_loss_mlp": 0.01196146, "balance_loss_clip": 1.00939846, "balance_loss_mlp": 1.00064278, "epoch": 0.31792220284975653, "flos": 12422228284800.0, "grad_norm": 1.9377977167440954, "language_loss": 0.76792037, "learning_rate": 3.191770072185638e-06, "loss": 0.7933473, "num_input_tokens_seen": 56935890, "step": 2644, "time_per_iteration": 2.7169337272644043 }, { "auxiliary_loss_clip": 0.01344887, "auxiliary_loss_mlp": 0.01196486, "balance_loss_clip": 1.00974452, "balance_loss_mlp": 1.00079203, "epoch": 0.3180424457403956, "flos": 15485767859520.0, "grad_norm": 2.438086313350908, "language_loss": 0.72816682, "learning_rate": 3.191144413239916e-06, "loss": 0.75358057, "num_input_tokens_seen": 56952460, "step": 2645, "time_per_iteration": 2.707902669906616 }, { "auxiliary_loss_clip": 0.0133254, "auxiliary_loss_mlp": 0.01196924, "balance_loss_clip": 1.00990391, "balance_loss_mlp": 1.00065756, "epoch": 0.3181626886310347, "flos": 26174976099840.0, "grad_norm": 1.961139879423735, "language_loss": 0.8830061, "learning_rate": 3.190518573598534e-06, "loss": 0.90830076, "num_input_tokens_seen": 56969065, "step": 2646, "time_per_iteration": 2.754180908203125 }, { "auxiliary_loss_clip": 0.01331345, "auxiliary_loss_mlp": 0.0119662, "balance_loss_clip": 1.01018465, "balance_loss_mlp": 1.00073493, "epoch": 0.3182829315216738, "flos": 25483121454240.0, "grad_norm": 1.4214388080715663, "language_loss": 0.7740944, "learning_rate": 3.1898925533564308e-06, "loss": 0.79937404, "num_input_tokens_seen": 56990535, "step": 2647, "time_per_iteration": 2.8534395694732666 }, { "auxiliary_loss_clip": 0.0132051, "auxiliary_loss_mlp": 0.01195959, "balance_loss_clip": 1.0101378, "balance_loss_mlp": 1.00064683, "epoch": 0.31840317441231286, "flos": 18113711705280.0, "grad_norm": 1.952415441010494, "language_loss": 0.64320493, "learning_rate": 3.1892663526085733e-06, "loss": 0.66836965, "num_input_tokens_seen": 57008910, "step": 2648, "time_per_iteration": 2.7653591632843018 }, { "auxiliary_loss_clip": 0.01341305, "auxiliary_loss_mlp": 0.01194621, "balance_loss_clip": 1.0032382, "balance_loss_mlp": 1.00007153, "epoch": 0.31852341730295197, "flos": 64741981878720.0, "grad_norm": 0.7581097986295517, "language_loss": 0.56968391, "learning_rate": 3.188639971449956e-06, "loss": 0.59504318, "num_input_tokens_seen": 57074960, "step": 2649, "time_per_iteration": 4.19230055809021 }, { "auxiliary_loss_clip": 0.01370645, "auxiliary_loss_mlp": 0.01197067, "balance_loss_clip": 1.01030421, "balance_loss_mlp": 1.00080109, "epoch": 0.318643660193591, "flos": 20668146196320.0, "grad_norm": 2.5515439726722446, "language_loss": 0.72820485, "learning_rate": 3.1880134099756e-06, "loss": 0.75388193, "num_input_tokens_seen": 57094595, "step": 2650, "time_per_iteration": 2.7019639015197754 }, { "auxiliary_loss_clip": 0.0135799, "auxiliary_loss_mlp": 0.0119619, "balance_loss_clip": 1.01013672, "balance_loss_mlp": 1.00068712, "epoch": 0.31876390308423014, "flos": 26943357689280.0, "grad_norm": 1.9662307112381037, "language_loss": 0.69832534, "learning_rate": 3.1873866682805535e-06, "loss": 0.72386718, "num_input_tokens_seen": 57115290, "step": 2651, "time_per_iteration": 2.7539920806884766 }, { "auxiliary_loss_clip": 0.01332911, "auxiliary_loss_mlp": 0.01196317, "balance_loss_clip": 1.00961733, "balance_loss_mlp": 1.00062299, "epoch": 0.31888414597486925, "flos": 18041926687200.0, "grad_norm": 1.9928341488120984, "language_loss": 0.88412505, "learning_rate": 3.186759746459894e-06, "loss": 0.90941739, "num_input_tokens_seen": 57134400, "step": 2652, "time_per_iteration": 3.8592231273651123 }, { "auxiliary_loss_clip": 0.01333242, "auxiliary_loss_mlp": 0.01196465, "balance_loss_clip": 1.01038861, "balance_loss_mlp": 1.00077116, "epoch": 0.3190043888655083, "flos": 25149329730720.0, "grad_norm": 1.7982335393628306, "language_loss": 0.79475641, "learning_rate": 3.1861326446087246e-06, "loss": 0.82005352, "num_input_tokens_seen": 57153140, "step": 2653, "time_per_iteration": 3.754784345626831 }, { "auxiliary_loss_clip": 0.01350883, "auxiliary_loss_mlp": 0.01196602, "balance_loss_clip": 1.00933647, "balance_loss_mlp": 1.00090766, "epoch": 0.3191246317561474, "flos": 22053903137280.0, "grad_norm": 2.3748515532313212, "language_loss": 0.72496092, "learning_rate": 3.1855053628221763e-06, "loss": 0.75043577, "num_input_tokens_seen": 57172395, "step": 2654, "time_per_iteration": 2.728459119796753 }, { "auxiliary_loss_clip": 0.01324, "auxiliary_loss_mlp": 0.01196684, "balance_loss_clip": 1.00998449, "balance_loss_mlp": 1.00060833, "epoch": 0.3192448746467865, "flos": 14901824244960.0, "grad_norm": 2.3051793907525964, "language_loss": 0.89410675, "learning_rate": 3.184877901195407e-06, "loss": 0.91931355, "num_input_tokens_seen": 57189090, "step": 2655, "time_per_iteration": 2.795036792755127 }, { "auxiliary_loss_clip": 0.01308696, "auxiliary_loss_mlp": 0.01194557, "balance_loss_clip": 1.00736487, "balance_loss_mlp": 1.00000787, "epoch": 0.3193651175374256, "flos": 67234869600480.0, "grad_norm": 0.7907277246771963, "language_loss": 0.62820768, "learning_rate": 3.184250259823602e-06, "loss": 0.6532402, "num_input_tokens_seen": 57251620, "step": 2656, "time_per_iteration": 3.3858911991119385 }, { "auxiliary_loss_clip": 0.01310361, "auxiliary_loss_mlp": 0.01196532, "balance_loss_clip": 1.0090301, "balance_loss_mlp": 1.00064778, "epoch": 0.3194853604280647, "flos": 12233084307840.0, "grad_norm": 2.0955752819968736, "language_loss": 0.81345767, "learning_rate": 3.183622438801974e-06, "loss": 0.83852661, "num_input_tokens_seen": 57266910, "step": 2657, "time_per_iteration": 2.7874860763549805 }, { "auxiliary_loss_clip": 0.01370145, "auxiliary_loss_mlp": 0.01196388, "balance_loss_clip": 1.01061547, "balance_loss_mlp": 1.00069439, "epoch": 0.3196056033187038, "flos": 14939926060320.0, "grad_norm": 1.8949066353294644, "language_loss": 0.753295, "learning_rate": 3.1829944382257637e-06, "loss": 0.77896035, "num_input_tokens_seen": 57285040, "step": 2658, "time_per_iteration": 2.795806407928467 }, { "auxiliary_loss_clip": 0.01344657, "auxiliary_loss_mlp": 0.01196121, "balance_loss_clip": 1.00955713, "balance_loss_mlp": 1.00052226, "epoch": 0.31972584620934286, "flos": 23768889494400.0, "grad_norm": 1.9864686579199362, "language_loss": 0.81503588, "learning_rate": 3.1823662581902373e-06, "loss": 0.84044367, "num_input_tokens_seen": 57302725, "step": 2659, "time_per_iteration": 2.7259132862091064 }, { "auxiliary_loss_clip": 0.01334382, "auxiliary_loss_mlp": 0.01196332, "balance_loss_clip": 1.00987554, "balance_loss_mlp": 1.00082922, "epoch": 0.31984608909998197, "flos": 21251550955680.0, "grad_norm": 2.920172372536209, "language_loss": 0.74750459, "learning_rate": 3.1817378987906896e-06, "loss": 0.77281177, "num_input_tokens_seen": 57322230, "step": 2660, "time_per_iteration": 2.7920448780059814 }, { "auxiliary_loss_clip": 0.01284881, "auxiliary_loss_mlp": 0.01196804, "balance_loss_clip": 1.01007318, "balance_loss_mlp": 1.00072885, "epoch": 0.3199663319906211, "flos": 18296245725120.0, "grad_norm": 2.2970036703252816, "language_loss": 0.79396617, "learning_rate": 3.181109360122442e-06, "loss": 0.81878304, "num_input_tokens_seen": 57339820, "step": 2661, "time_per_iteration": 2.797553062438965 }, { "auxiliary_loss_clip": 0.01313429, "auxiliary_loss_mlp": 0.01196832, "balance_loss_clip": 1.00954342, "balance_loss_mlp": 1.0007565, "epoch": 0.32008657488126013, "flos": 18733637638080.0, "grad_norm": 1.940934926188615, "language_loss": 0.78102285, "learning_rate": 3.1804806422808445e-06, "loss": 0.8061254, "num_input_tokens_seen": 57356955, "step": 2662, "time_per_iteration": 2.798058271408081 }, { "auxiliary_loss_clip": 0.01331852, "auxiliary_loss_mlp": 0.01196739, "balance_loss_clip": 1.00973856, "balance_loss_mlp": 1.00085473, "epoch": 0.32020681777189924, "flos": 20595355315200.0, "grad_norm": 1.5321303062397673, "language_loss": 0.73260033, "learning_rate": 3.1798517453612714e-06, "loss": 0.75788623, "num_input_tokens_seen": 57376760, "step": 2663, "time_per_iteration": 2.7520315647125244 }, { "auxiliary_loss_clip": 0.01345603, "auxiliary_loss_mlp": 0.01196358, "balance_loss_clip": 1.01045942, "balance_loss_mlp": 1.00066364, "epoch": 0.32032706066253835, "flos": 35261707245120.0, "grad_norm": 1.683325262531929, "language_loss": 0.75318897, "learning_rate": 3.1792226694591265e-06, "loss": 0.77860856, "num_input_tokens_seen": 57398145, "step": 2664, "time_per_iteration": 2.8472988605499268 }, { "auxiliary_loss_clip": 0.01304344, "auxiliary_loss_mlp": 0.011961, "balance_loss_clip": 1.00874376, "balance_loss_mlp": 1.00069213, "epoch": 0.3204473035531774, "flos": 15304239702720.0, "grad_norm": 1.764580285606366, "language_loss": 0.80235112, "learning_rate": 3.178593414669841e-06, "loss": 0.82735562, "num_input_tokens_seen": 57416730, "step": 2665, "time_per_iteration": 2.7887892723083496 }, { "auxiliary_loss_clip": 0.0134646, "auxiliary_loss_mlp": 0.01197433, "balance_loss_clip": 1.0104593, "balance_loss_mlp": 1.00097644, "epoch": 0.3205675464438165, "flos": 24462576247680.0, "grad_norm": 2.209584330844928, "language_loss": 0.70237237, "learning_rate": 3.1779639810888707e-06, "loss": 0.72781134, "num_input_tokens_seen": 57436325, "step": 2666, "time_per_iteration": 2.69608736038208 }, { "auxiliary_loss_clip": 0.01344746, "auxiliary_loss_mlp": 0.01196697, "balance_loss_clip": 1.00982845, "balance_loss_mlp": 1.00081217, "epoch": 0.3206877893344556, "flos": 22456246747680.0, "grad_norm": 1.7042164788914191, "language_loss": 0.7580058, "learning_rate": 3.1773343688117013e-06, "loss": 0.78342021, "num_input_tokens_seen": 57457235, "step": 2667, "time_per_iteration": 2.847592353820801 }, { "auxiliary_loss_clip": 0.01345735, "auxiliary_loss_mlp": 0.00872752, "balance_loss_clip": 1.01030576, "balance_loss_mlp": 1.00043654, "epoch": 0.3208080322250947, "flos": 20412246516480.0, "grad_norm": 2.8990515657323686, "language_loss": 0.84362942, "learning_rate": 3.1767045779338445e-06, "loss": 0.86581433, "num_input_tokens_seen": 57474895, "step": 2668, "time_per_iteration": 2.7344589233398438 }, { "auxiliary_loss_clip": 0.0135867, "auxiliary_loss_mlp": 0.01196316, "balance_loss_clip": 1.01033854, "balance_loss_mlp": 1.00062191, "epoch": 0.3209282751157338, "flos": 21762128910240.0, "grad_norm": 1.9199009783635026, "language_loss": 0.91070104, "learning_rate": 3.176074608550839e-06, "loss": 0.93625087, "num_input_tokens_seen": 57490715, "step": 2669, "time_per_iteration": 2.705763101577759 }, { "auxiliary_loss_clip": 0.0127889, "auxiliary_loss_mlp": 0.01197028, "balance_loss_clip": 1.00871003, "balance_loss_mlp": 1.00095248, "epoch": 0.32104851800637285, "flos": 22055052695040.0, "grad_norm": 2.149837198215759, "language_loss": 0.82402593, "learning_rate": 3.17544446075825e-06, "loss": 0.8487851, "num_input_tokens_seen": 57509880, "step": 2670, "time_per_iteration": 2.8555541038513184 }, { "auxiliary_loss_clip": 0.01342905, "auxiliary_loss_mlp": 0.01196434, "balance_loss_clip": 1.00968051, "balance_loss_mlp": 1.00093091, "epoch": 0.32116876089701196, "flos": 37012316683680.0, "grad_norm": 1.603496732434062, "language_loss": 0.71053946, "learning_rate": 3.174814134651671e-06, "loss": 0.73593283, "num_input_tokens_seen": 57532430, "step": 2671, "time_per_iteration": 2.920583963394165 }, { "auxiliary_loss_clip": 0.01369234, "auxiliary_loss_mlp": 0.01196084, "balance_loss_clip": 1.01028311, "balance_loss_mlp": 1.00067604, "epoch": 0.3212890037876511, "flos": 21979244224800.0, "grad_norm": 1.5709245430724914, "language_loss": 0.80402809, "learning_rate": 3.1741836303267215e-06, "loss": 0.82968128, "num_input_tokens_seen": 57551965, "step": 2672, "time_per_iteration": 2.63948655128479 }, { "auxiliary_loss_clip": 0.01369555, "auxiliary_loss_mlp": 0.01196193, "balance_loss_clip": 1.01048779, "balance_loss_mlp": 1.00059402, "epoch": 0.32140924667829013, "flos": 10342353506400.0, "grad_norm": 1.801514920656967, "language_loss": 0.75194407, "learning_rate": 3.1735529478790496e-06, "loss": 0.77760154, "num_input_tokens_seen": 57569955, "step": 2673, "time_per_iteration": 2.723705768585205 }, { "auxiliary_loss_clip": 0.01345956, "auxiliary_loss_mlp": 0.01196604, "balance_loss_clip": 1.00955403, "balance_loss_mlp": 1.00081456, "epoch": 0.32152948956892924, "flos": 50798927319840.0, "grad_norm": 1.7335327775917417, "language_loss": 0.79319096, "learning_rate": 3.172922087404328e-06, "loss": 0.81861657, "num_input_tokens_seen": 57592215, "step": 2674, "time_per_iteration": 2.9735090732574463 }, { "auxiliary_loss_clip": 0.01340533, "auxiliary_loss_mlp": 0.01194639, "balance_loss_clip": 1.00334239, "balance_loss_mlp": 1.00008965, "epoch": 0.32164973245956835, "flos": 63863282371680.0, "grad_norm": 0.8618265564479137, "language_loss": 0.55228484, "learning_rate": 3.1722910489982586e-06, "loss": 0.57763648, "num_input_tokens_seen": 57652575, "step": 2675, "time_per_iteration": 4.64821457862854 }, { "auxiliary_loss_clip": 0.01345562, "auxiliary_loss_mlp": 0.01196511, "balance_loss_clip": 1.01010895, "balance_loss_mlp": 1.0006268, "epoch": 0.3217699753502074, "flos": 23513959753920.0, "grad_norm": 1.492650030861378, "language_loss": 0.79993653, "learning_rate": 3.1716598327565694e-06, "loss": 0.82535732, "num_input_tokens_seen": 57672215, "step": 2676, "time_per_iteration": 2.808035135269165 }, { "auxiliary_loss_clip": 0.01369827, "auxiliary_loss_mlp": 0.01195733, "balance_loss_clip": 1.01015735, "balance_loss_mlp": 1.00042057, "epoch": 0.3218902182408465, "flos": 19062543741120.0, "grad_norm": 1.4969529675726019, "language_loss": 0.84289408, "learning_rate": 3.171028438775015e-06, "loss": 0.8685497, "num_input_tokens_seen": 57691410, "step": 2677, "time_per_iteration": 2.729640007019043 }, { "auxiliary_loss_clip": 0.01369883, "auxiliary_loss_mlp": 0.01195946, "balance_loss_clip": 1.01026678, "balance_loss_mlp": 1.00044298, "epoch": 0.3220104611314856, "flos": 20375581648320.0, "grad_norm": 1.8660070407401608, "language_loss": 0.84309298, "learning_rate": 3.170396867149377e-06, "loss": 0.86875129, "num_input_tokens_seen": 57709415, "step": 2678, "time_per_iteration": 2.669428825378418 }, { "auxiliary_loss_clip": 0.01290199, "auxiliary_loss_mlp": 0.01196624, "balance_loss_clip": 1.0092206, "balance_loss_mlp": 1.0007391, "epoch": 0.3221307040221247, "flos": 20117023616160.0, "grad_norm": 1.727144335756467, "language_loss": 0.86562282, "learning_rate": 3.1697651179754653e-06, "loss": 0.89049101, "num_input_tokens_seen": 57728075, "step": 2679, "time_per_iteration": 4.680688858032227 }, { "auxiliary_loss_clip": 0.0130824, "auxiliary_loss_mlp": 0.01196264, "balance_loss_clip": 1.00983381, "balance_loss_mlp": 1.00066555, "epoch": 0.3222509469127638, "flos": 23987800992960.0, "grad_norm": 1.6285624586050087, "language_loss": 0.73099464, "learning_rate": 3.1691331913491153e-06, "loss": 0.75603974, "num_input_tokens_seen": 57750645, "step": 2680, "time_per_iteration": 3.851186752319336 }, { "auxiliary_loss_clip": 0.01369165, "auxiliary_loss_mlp": 0.01196096, "balance_loss_clip": 1.00966203, "balance_loss_mlp": 1.00059283, "epoch": 0.32237118980340285, "flos": 17675745013440.0, "grad_norm": 2.0220633445944522, "language_loss": 0.8508023, "learning_rate": 3.1685010873661898e-06, "loss": 0.87645495, "num_input_tokens_seen": 57769820, "step": 2681, "time_per_iteration": 2.66263484954834 }, { "auxiliary_loss_clip": 0.01358317, "auxiliary_loss_mlp": 0.01196386, "balance_loss_clip": 1.01012814, "balance_loss_mlp": 1.00069201, "epoch": 0.32249143269404196, "flos": 23147993622240.0, "grad_norm": 1.8018445893011388, "language_loss": 0.79447389, "learning_rate": 3.167868806122578e-06, "loss": 0.82002091, "num_input_tokens_seen": 57788870, "step": 2682, "time_per_iteration": 2.7731025218963623 }, { "auxiliary_loss_clip": 0.01326312, "auxiliary_loss_mlp": 0.01196503, "balance_loss_clip": 1.00897384, "balance_loss_mlp": 1.00090456, "epoch": 0.32261167558468107, "flos": 24422319011520.0, "grad_norm": 1.8633629012203954, "language_loss": 0.6666072, "learning_rate": 3.1672363477141968e-06, "loss": 0.6918354, "num_input_tokens_seen": 57808165, "step": 2683, "time_per_iteration": 2.819265604019165 }, { "auxiliary_loss_clip": 0.0133133, "auxiliary_loss_mlp": 0.01196482, "balance_loss_clip": 1.00941133, "balance_loss_mlp": 1.00059724, "epoch": 0.3227319184753201, "flos": 30367187454240.0, "grad_norm": 2.059885956020765, "language_loss": 0.85225952, "learning_rate": 3.1666037122369903e-06, "loss": 0.87753761, "num_input_tokens_seen": 57828825, "step": 2684, "time_per_iteration": 2.9050040245056152 }, { "auxiliary_loss_clip": 0.01357558, "auxiliary_loss_mlp": 0.01196476, "balance_loss_clip": 1.00996208, "balance_loss_mlp": 1.00078201, "epoch": 0.32285216136595923, "flos": 16946183712960.0, "grad_norm": 2.116872153934223, "language_loss": 0.86524469, "learning_rate": 3.165970899786928e-06, "loss": 0.8907851, "num_input_tokens_seen": 57846740, "step": 2685, "time_per_iteration": 2.763462543487549 }, { "auxiliary_loss_clip": 0.01312727, "auxiliary_loss_mlp": 0.01197106, "balance_loss_clip": 1.00938356, "balance_loss_mlp": 1.00103021, "epoch": 0.32297240425659834, "flos": 21981543340320.0, "grad_norm": 1.6758629134904182, "language_loss": 0.75401902, "learning_rate": 3.1653379104600067e-06, "loss": 0.77911729, "num_input_tokens_seen": 57866885, "step": 2686, "time_per_iteration": 2.799654483795166 }, { "auxiliary_loss_clip": 0.01357627, "auxiliary_loss_mlp": 0.01196522, "balance_loss_clip": 1.00992715, "balance_loss_mlp": 1.00073266, "epoch": 0.3230926471472374, "flos": 22748056898400.0, "grad_norm": 1.4265695657607755, "language_loss": 0.69128388, "learning_rate": 3.164704744352251e-06, "loss": 0.71682537, "num_input_tokens_seen": 57887690, "step": 2687, "time_per_iteration": 2.766958475112915 }, { "auxiliary_loss_clip": 0.01346705, "auxiliary_loss_mlp": 0.0119608, "balance_loss_clip": 1.0093708, "balance_loss_mlp": 1.00067246, "epoch": 0.3232128900378765, "flos": 16942986505440.0, "grad_norm": 1.6220507870055059, "language_loss": 0.80778122, "learning_rate": 3.164071401559713e-06, "loss": 0.83320904, "num_input_tokens_seen": 57905090, "step": 2688, "time_per_iteration": 2.6656720638275146 }, { "auxiliary_loss_clip": 0.01333019, "auxiliary_loss_mlp": 0.01196486, "balance_loss_clip": 1.00920749, "balance_loss_mlp": 1.0007925, "epoch": 0.3233331329285156, "flos": 24023747387520.0, "grad_norm": 1.6709156975341595, "language_loss": 0.71515405, "learning_rate": 3.1634378821784674e-06, "loss": 0.74044919, "num_input_tokens_seen": 57925305, "step": 2689, "time_per_iteration": 2.8126468658447266 }, { "auxiliary_loss_clip": 0.0130669, "auxiliary_loss_mlp": 0.01196528, "balance_loss_clip": 1.00962603, "balance_loss_mlp": 1.00083447, "epoch": 0.3234533758191547, "flos": 18113855400000.0, "grad_norm": 1.9912358475316303, "language_loss": 0.73853028, "learning_rate": 3.1628041863046208e-06, "loss": 0.76356244, "num_input_tokens_seen": 57942720, "step": 2690, "time_per_iteration": 2.783576250076294 }, { "auxiliary_loss_clip": 0.01370581, "auxiliary_loss_mlp": 0.01196431, "balance_loss_clip": 1.01003027, "balance_loss_mlp": 1.00073707, "epoch": 0.3235736187097938, "flos": 16946147789280.0, "grad_norm": 1.9745320711831167, "language_loss": 0.91109669, "learning_rate": 3.162170314034304e-06, "loss": 0.93676686, "num_input_tokens_seen": 57960135, "step": 2691, "time_per_iteration": 2.6606626510620117 }, { "auxiliary_loss_clip": 0.01370179, "auxiliary_loss_mlp": 0.01196321, "balance_loss_clip": 1.01015627, "balance_loss_mlp": 1.0008173, "epoch": 0.3236938616004329, "flos": 22127161026240.0, "grad_norm": 1.5171526368257433, "language_loss": 0.80749243, "learning_rate": 3.1615362654636738e-06, "loss": 0.83315742, "num_input_tokens_seen": 57980875, "step": 2692, "time_per_iteration": 2.703397274017334 }, { "auxiliary_loss_clip": 0.01296446, "auxiliary_loss_mlp": 0.01196036, "balance_loss_clip": 1.0080533, "balance_loss_mlp": 1.00062847, "epoch": 0.32381410449107195, "flos": 17164628203680.0, "grad_norm": 1.6448834725490693, "language_loss": 0.86929846, "learning_rate": 3.1609020406889163e-06, "loss": 0.89422327, "num_input_tokens_seen": 57998310, "step": 2693, "time_per_iteration": 2.7519776821136475 }, { "auxiliary_loss_clip": 0.01345427, "auxiliary_loss_mlp": 0.01196372, "balance_loss_clip": 1.00999713, "balance_loss_mlp": 1.00077367, "epoch": 0.32393434738171106, "flos": 16578134007840.0, "grad_norm": 1.6561691901376134, "language_loss": 0.84982026, "learning_rate": 3.1602676398062416e-06, "loss": 0.87523824, "num_input_tokens_seen": 58017220, "step": 2694, "time_per_iteration": 2.719547986984253 }, { "auxiliary_loss_clip": 0.01343946, "auxiliary_loss_mlp": 0.01196265, "balance_loss_clip": 1.00957346, "balance_loss_mlp": 1.00057089, "epoch": 0.3240545902723502, "flos": 25483624385760.0, "grad_norm": 2.179345229127118, "language_loss": 0.61566055, "learning_rate": 3.1596330629118886e-06, "loss": 0.64106262, "num_input_tokens_seen": 58037190, "step": 2695, "time_per_iteration": 2.7122576236724854 }, { "auxiliary_loss_clip": 0.01289717, "auxiliary_loss_mlp": 0.01195888, "balance_loss_clip": 1.00790477, "balance_loss_mlp": 1.0005759, "epoch": 0.32417483316298923, "flos": 35845866401760.0, "grad_norm": 1.8235228854424785, "language_loss": 0.73316801, "learning_rate": 3.1589983101021223e-06, "loss": 0.75802404, "num_input_tokens_seen": 58055820, "step": 2696, "time_per_iteration": 3.0564446449279785 }, { "auxiliary_loss_clip": 0.01329687, "auxiliary_loss_mlp": 0.01196378, "balance_loss_clip": 1.00942159, "balance_loss_mlp": 1.00087535, "epoch": 0.32429507605362834, "flos": 30080514389760.0, "grad_norm": 1.8896104667088036, "language_loss": 0.8489821, "learning_rate": 3.1583633814732337e-06, "loss": 0.87424278, "num_input_tokens_seen": 58075340, "step": 2697, "time_per_iteration": 2.7573330402374268 }, { "auxiliary_loss_clip": 0.013696, "auxiliary_loss_mlp": 0.01196006, "balance_loss_clip": 1.00972581, "balance_loss_mlp": 1.00059879, "epoch": 0.3244153189442674, "flos": 18223275225600.0, "grad_norm": 2.1744287206131037, "language_loss": 0.71699661, "learning_rate": 3.157728277121541e-06, "loss": 0.74265265, "num_input_tokens_seen": 58093515, "step": 2698, "time_per_iteration": 2.719914197921753 }, { "auxiliary_loss_clip": 0.01369113, "auxiliary_loss_mlp": 0.01196606, "balance_loss_clip": 1.00956202, "balance_loss_mlp": 1.00081658, "epoch": 0.3245355618349065, "flos": 17710326308160.0, "grad_norm": 2.2011442129093335, "language_loss": 0.78110653, "learning_rate": 3.1570929971433897e-06, "loss": 0.80676365, "num_input_tokens_seen": 58109300, "step": 2699, "time_per_iteration": 2.6790411472320557 }, { "auxiliary_loss_clip": 0.0134338, "auxiliary_loss_mlp": 0.01196521, "balance_loss_clip": 1.00932384, "balance_loss_mlp": 1.00092268, "epoch": 0.3246558047255456, "flos": 23440809636000.0, "grad_norm": 1.8320942149018398, "language_loss": 0.84231573, "learning_rate": 3.1564575416351504e-06, "loss": 0.86771476, "num_input_tokens_seen": 58128000, "step": 2700, "time_per_iteration": 2.7278854846954346 }, { "auxiliary_loss_clip": 0.01369619, "auxiliary_loss_mlp": 0.01196158, "balance_loss_clip": 1.00996685, "balance_loss_mlp": 1.0006547, "epoch": 0.32477604761618467, "flos": 21760871581440.0, "grad_norm": 1.8479412048321244, "language_loss": 0.74275804, "learning_rate": 3.155821910693221e-06, "loss": 0.76841581, "num_input_tokens_seen": 58147415, "step": 2701, "time_per_iteration": 3.6307878494262695 }, { "auxiliary_loss_clip": 0.0133265, "auxiliary_loss_mlp": 0.01196391, "balance_loss_clip": 1.00887513, "balance_loss_mlp": 1.0006969, "epoch": 0.3248962905068238, "flos": 19828338825600.0, "grad_norm": 1.8664099887952905, "language_loss": 0.86136347, "learning_rate": 3.1551861044140275e-06, "loss": 0.88665384, "num_input_tokens_seen": 58167050, "step": 2702, "time_per_iteration": 2.7060773372650146 }, { "auxiliary_loss_clip": 0.01297916, "auxiliary_loss_mlp": 0.01196632, "balance_loss_clip": 1.00868762, "balance_loss_mlp": 1.00074744, "epoch": 0.3250165333974629, "flos": 23948226306720.0, "grad_norm": 2.13462289608784, "language_loss": 0.77605134, "learning_rate": 3.15455012289402e-06, "loss": 0.80099678, "num_input_tokens_seen": 58186695, "step": 2703, "time_per_iteration": 2.859553337097168 }, { "auxiliary_loss_clip": 0.01345133, "auxiliary_loss_mlp": 0.01196938, "balance_loss_clip": 1.00977266, "balance_loss_mlp": 1.0008626, "epoch": 0.32513677628810195, "flos": 23989345711200.0, "grad_norm": 1.7617410026352018, "language_loss": 0.84436953, "learning_rate": 3.153913966229677e-06, "loss": 0.8697902, "num_input_tokens_seen": 58205815, "step": 2704, "time_per_iteration": 3.6601624488830566 }, { "auxiliary_loss_clip": 0.01322599, "auxiliary_loss_mlp": 0.01194556, "balance_loss_clip": 1.00292587, "balance_loss_mlp": 1.00000632, "epoch": 0.32525701917874106, "flos": 70655825471040.0, "grad_norm": 0.6400615792103407, "language_loss": 0.50275308, "learning_rate": 3.1532776345175027e-06, "loss": 0.5279246, "num_input_tokens_seen": 58270960, "step": 2705, "time_per_iteration": 4.2714478969573975 }, { "auxiliary_loss_clip": 0.01369333, "auxiliary_loss_mlp": 0.01196091, "balance_loss_clip": 1.01004887, "balance_loss_mlp": 1.00068319, "epoch": 0.32537726206938017, "flos": 19682649292320.0, "grad_norm": 3.0824395592641447, "language_loss": 0.78483486, "learning_rate": 3.1526411278540285e-06, "loss": 0.81048912, "num_input_tokens_seen": 58289390, "step": 2706, "time_per_iteration": 2.7416305541992188 }, { "auxiliary_loss_clip": 0.01344714, "auxiliary_loss_mlp": 0.01196137, "balance_loss_clip": 1.00986063, "balance_loss_mlp": 1.00063336, "epoch": 0.3254975049600192, "flos": 28761010220160.0, "grad_norm": 2.41563958616899, "language_loss": 0.81396747, "learning_rate": 3.1520044463358116e-06, "loss": 0.83937597, "num_input_tokens_seen": 58306120, "step": 2707, "time_per_iteration": 2.7431771755218506 }, { "auxiliary_loss_clip": 0.01345043, "auxiliary_loss_mlp": 0.01195966, "balance_loss_clip": 1.00887334, "balance_loss_mlp": 1.00065374, "epoch": 0.32561774785065833, "flos": 18877387292640.0, "grad_norm": 1.484943447461118, "language_loss": 0.80396831, "learning_rate": 3.151367590059436e-06, "loss": 0.82937843, "num_input_tokens_seen": 58324545, "step": 2708, "time_per_iteration": 2.7115256786346436 }, { "auxiliary_loss_clip": 0.01369131, "auxiliary_loss_mlp": 0.00872808, "balance_loss_clip": 1.00998616, "balance_loss_mlp": 1.0005784, "epoch": 0.32573799074129745, "flos": 23112119075040.0, "grad_norm": 1.9249454303218025, "language_loss": 0.87013638, "learning_rate": 3.1507305591215117e-06, "loss": 0.89255583, "num_input_tokens_seen": 58342455, "step": 2709, "time_per_iteration": 2.7003090381622314 }, { "auxiliary_loss_clip": 0.01320613, "auxiliary_loss_mlp": 0.01194575, "balance_loss_clip": 1.003304, "balance_loss_mlp": 1.00002503, "epoch": 0.3258582336319365, "flos": 71237685512160.0, "grad_norm": 0.6658144952372987, "language_loss": 0.55779898, "learning_rate": 3.150093353618677e-06, "loss": 0.58295089, "num_input_tokens_seen": 58407185, "step": 2710, "time_per_iteration": 3.2921602725982666 }, { "auxiliary_loss_clip": 0.01358467, "auxiliary_loss_mlp": 0.01196588, "balance_loss_clip": 1.00998425, "balance_loss_mlp": 1.00070357, "epoch": 0.3259784765225756, "flos": 22456031205600.0, "grad_norm": 2.2738927630271397, "language_loss": 0.88089293, "learning_rate": 3.149455973647596e-06, "loss": 0.90644348, "num_input_tokens_seen": 58425245, "step": 2711, "time_per_iteration": 2.7455453872680664 }, { "auxiliary_loss_clip": 0.01322372, "auxiliary_loss_mlp": 0.01196417, "balance_loss_clip": 1.00875962, "balance_loss_mlp": 1.000723, "epoch": 0.32609871941321467, "flos": 20484821855520.0, "grad_norm": 1.8304990864567414, "language_loss": 0.77066028, "learning_rate": 3.1488184193049563e-06, "loss": 0.79584819, "num_input_tokens_seen": 58444780, "step": 2712, "time_per_iteration": 2.7389113903045654 }, { "auxiliary_loss_clip": 0.01369235, "auxiliary_loss_mlp": 0.01195865, "balance_loss_clip": 1.01003182, "balance_loss_mlp": 1.00064778, "epoch": 0.3262189623038538, "flos": 22416815756160.0, "grad_norm": 1.508230361650898, "language_loss": 0.72536993, "learning_rate": 3.1481806906874767e-06, "loss": 0.75102097, "num_input_tokens_seen": 58466090, "step": 2713, "time_per_iteration": 2.763493061065674 }, { "auxiliary_loss_clip": 0.01369152, "auxiliary_loss_mlp": 0.01195837, "balance_loss_clip": 1.00973511, "balance_loss_mlp": 1.00071502, "epoch": 0.3263392051944929, "flos": 20923507020960.0, "grad_norm": 1.5473606457582516, "language_loss": 0.87843138, "learning_rate": 3.147542787891899e-06, "loss": 0.90408123, "num_input_tokens_seen": 58485435, "step": 2714, "time_per_iteration": 2.7273330688476562 }, { "auxiliary_loss_clip": 0.01329115, "auxiliary_loss_mlp": 0.01196146, "balance_loss_clip": 1.00995731, "balance_loss_mlp": 1.00073826, "epoch": 0.32645944808513194, "flos": 24025184334720.0, "grad_norm": 2.395534500550522, "language_loss": 0.75258112, "learning_rate": 3.1469047110149926e-06, "loss": 0.77783376, "num_input_tokens_seen": 58504175, "step": 2715, "time_per_iteration": 2.901353359222412 }, { "auxiliary_loss_clip": 0.01283765, "auxiliary_loss_mlp": 0.01196619, "balance_loss_clip": 1.00872993, "balance_loss_mlp": 1.00073409, "epoch": 0.32657969097577105, "flos": 21032423915040.0, "grad_norm": 1.7761249102125023, "language_loss": 0.85251862, "learning_rate": 3.146266460153554e-06, "loss": 0.87732249, "num_input_tokens_seen": 58523885, "step": 2716, "time_per_iteration": 2.822558879852295 }, { "auxiliary_loss_clip": 0.0133529, "auxiliary_loss_mlp": 0.0087278, "balance_loss_clip": 1.00991845, "balance_loss_mlp": 1.00050199, "epoch": 0.32669993386641016, "flos": 22710278396160.0, "grad_norm": 2.488095697173088, "language_loss": 0.79979563, "learning_rate": 3.145628035404404e-06, "loss": 0.82187629, "num_input_tokens_seen": 58543085, "step": 2717, "time_per_iteration": 2.8214097023010254 }, { "auxiliary_loss_clip": 0.01320585, "auxiliary_loss_mlp": 0.01194594, "balance_loss_clip": 1.00335169, "balance_loss_mlp": 1.00004447, "epoch": 0.3268201767570492, "flos": 72105751609920.0, "grad_norm": 0.9075039740740956, "language_loss": 0.5756588, "learning_rate": 3.1449894368643922e-06, "loss": 0.60081065, "num_input_tokens_seen": 58605400, "step": 2718, "time_per_iteration": 3.355964422225952 }, { "auxiliary_loss_clip": 0.01308045, "auxiliary_loss_mlp": 0.01196019, "balance_loss_clip": 1.00928819, "balance_loss_mlp": 1.00061107, "epoch": 0.32694041964768833, "flos": 24535187510400.0, "grad_norm": 1.5211996492322337, "language_loss": 0.71510118, "learning_rate": 3.1443506646303934e-06, "loss": 0.74014187, "num_input_tokens_seen": 58626700, "step": 2719, "time_per_iteration": 2.7717530727386475 }, { "auxiliary_loss_clip": 0.01356905, "auxiliary_loss_mlp": 0.0119611, "balance_loss_clip": 1.00986779, "balance_loss_mlp": 1.00060725, "epoch": 0.32706066253832744, "flos": 33183017948160.0, "grad_norm": 2.164498846133548, "language_loss": 0.66937488, "learning_rate": 3.1437117187993086e-06, "loss": 0.69490504, "num_input_tokens_seen": 58649020, "step": 2720, "time_per_iteration": 2.8475303649902344 }, { "auxiliary_loss_clip": 0.01319814, "auxiliary_loss_mlp": 0.01195964, "balance_loss_clip": 1.00975239, "balance_loss_mlp": 1.00055563, "epoch": 0.3271809054289665, "flos": 24061633660800.0, "grad_norm": 1.5437416167737614, "language_loss": 0.80123603, "learning_rate": 3.143072599468065e-06, "loss": 0.82639384, "num_input_tokens_seen": 58668845, "step": 2721, "time_per_iteration": 2.9964687824249268 }, { "auxiliary_loss_clip": 0.01324191, "auxiliary_loss_mlp": 0.01195894, "balance_loss_clip": 1.00880015, "balance_loss_mlp": 1.00058198, "epoch": 0.3273011483196056, "flos": 38253785114880.0, "grad_norm": 1.3860328814521117, "language_loss": 0.75155771, "learning_rate": 3.1424333067336174e-06, "loss": 0.77675855, "num_input_tokens_seen": 58691610, "step": 2722, "time_per_iteration": 2.8247287273406982 }, { "auxiliary_loss_clip": 0.01357219, "auxiliary_loss_mlp": 0.01196417, "balance_loss_clip": 1.00967693, "balance_loss_mlp": 1.00053227, "epoch": 0.3274213912102447, "flos": 29054401012800.0, "grad_norm": 2.067534608462341, "language_loss": 0.78268886, "learning_rate": 3.141793840692945e-06, "loss": 0.80822515, "num_input_tokens_seen": 58712360, "step": 2723, "time_per_iteration": 2.678903579711914 }, { "auxiliary_loss_clip": 0.01333548, "auxiliary_loss_mlp": 0.01196058, "balance_loss_clip": 1.00908422, "balance_loss_mlp": 1.00074565, "epoch": 0.32754163410088377, "flos": 29133262995840.0, "grad_norm": 3.5367055433541394, "language_loss": 0.61453402, "learning_rate": 3.1411542014430553e-06, "loss": 0.63983005, "num_input_tokens_seen": 58733440, "step": 2724, "time_per_iteration": 2.681931495666504 }, { "auxiliary_loss_clip": 0.01323301, "auxiliary_loss_mlp": 0.01195757, "balance_loss_clip": 1.00888515, "balance_loss_mlp": 1.00063491, "epoch": 0.3276618769915229, "flos": 20631086167680.0, "grad_norm": 1.6906540505811782, "language_loss": 0.81846124, "learning_rate": 3.1405143890809804e-06, "loss": 0.84365183, "num_input_tokens_seen": 58752735, "step": 2725, "time_per_iteration": 2.76107120513916 }, { "auxiliary_loss_clip": 0.01332904, "auxiliary_loss_mlp": 0.01196217, "balance_loss_clip": 1.00908625, "balance_loss_mlp": 1.00071406, "epoch": 0.327782119882162, "flos": 18657434007360.0, "grad_norm": 23.513149703553648, "language_loss": 0.70141768, "learning_rate": 3.1398744037037796e-06, "loss": 0.72670889, "num_input_tokens_seen": 58772070, "step": 2726, "time_per_iteration": 2.7258493900299072 }, { "auxiliary_loss_clip": 0.01323268, "auxiliary_loss_mlp": 0.01196147, "balance_loss_clip": 1.00881886, "balance_loss_mlp": 1.00064349, "epoch": 0.32790236277280105, "flos": 21795812112960.0, "grad_norm": 1.896370211711056, "language_loss": 0.84171933, "learning_rate": 3.139234245408538e-06, "loss": 0.86691344, "num_input_tokens_seen": 58790950, "step": 2727, "time_per_iteration": 3.740217447280884 }, { "auxiliary_loss_clip": 0.01311337, "auxiliary_loss_mlp": 0.0087279, "balance_loss_clip": 1.00912786, "balance_loss_mlp": 1.00061798, "epoch": 0.32802260566344016, "flos": 23331425734080.0, "grad_norm": 1.2961082947428033, "language_loss": 0.76077515, "learning_rate": 3.1385939142923666e-06, "loss": 0.78261644, "num_input_tokens_seen": 58813340, "step": 2728, "time_per_iteration": 2.81841778755188 }, { "auxiliary_loss_clip": 0.01331863, "auxiliary_loss_mlp": 0.01196852, "balance_loss_clip": 1.00906169, "balance_loss_mlp": 1.00096726, "epoch": 0.3281428485540792, "flos": 24206999880960.0, "grad_norm": 3.0901648299519855, "language_loss": 0.78496349, "learning_rate": 3.137953410452405e-06, "loss": 0.81025064, "num_input_tokens_seen": 58833610, "step": 2729, "time_per_iteration": 2.736272096633911 }, { "auxiliary_loss_clip": 0.01332519, "auxiliary_loss_mlp": 0.01196105, "balance_loss_clip": 1.00905335, "balance_loss_mlp": 1.00069737, "epoch": 0.3282630914447183, "flos": 34128976089600.0, "grad_norm": 1.5398537073513703, "language_loss": 0.74669158, "learning_rate": 3.1373127339858146e-06, "loss": 0.77197778, "num_input_tokens_seen": 58856210, "step": 2730, "time_per_iteration": 3.7267916202545166 }, { "auxiliary_loss_clip": 0.01322417, "auxiliary_loss_mlp": 0.01195992, "balance_loss_clip": 1.00964332, "balance_loss_mlp": 1.00058424, "epoch": 0.32838333433535744, "flos": 27600738811200.0, "grad_norm": 1.813705488358194, "language_loss": 0.74549437, "learning_rate": 3.136671884989787e-06, "loss": 0.77067846, "num_input_tokens_seen": 58876120, "step": 2731, "time_per_iteration": 3.80464768409729 }, { "auxiliary_loss_clip": 0.01271836, "auxiliary_loss_mlp": 0.01196277, "balance_loss_clip": 1.00801325, "balance_loss_mlp": 1.00077379, "epoch": 0.3285035772259965, "flos": 12349509251040.0, "grad_norm": 2.054772051009569, "language_loss": 0.8698051, "learning_rate": 3.1360308635615383e-06, "loss": 0.89448619, "num_input_tokens_seen": 58894660, "step": 2732, "time_per_iteration": 3.8846170902252197 }, { "auxiliary_loss_clip": 0.01333188, "auxiliary_loss_mlp": 0.01196581, "balance_loss_clip": 1.00925088, "balance_loss_mlp": 1.00088763, "epoch": 0.3286238201166356, "flos": 24316096393440.0, "grad_norm": 1.942014221782031, "language_loss": 0.78628516, "learning_rate": 3.135389669798311e-06, "loss": 0.81158292, "num_input_tokens_seen": 58912720, "step": 2733, "time_per_iteration": 2.995598793029785 }, { "auxiliary_loss_clip": 0.01357338, "auxiliary_loss_mlp": 0.00872796, "balance_loss_clip": 1.00966728, "balance_loss_mlp": 1.00056303, "epoch": 0.3287440630072747, "flos": 21392821876320.0, "grad_norm": 1.798824130496271, "language_loss": 0.80051029, "learning_rate": 3.134748303797373e-06, "loss": 0.8228116, "num_input_tokens_seen": 58930090, "step": 2734, "time_per_iteration": 2.761094093322754 }, { "auxiliary_loss_clip": 0.01296971, "auxiliary_loss_mlp": 0.01196356, "balance_loss_clip": 1.0090065, "balance_loss_mlp": 1.00066233, "epoch": 0.32886430589791377, "flos": 23732547939360.0, "grad_norm": 2.080327522291562, "language_loss": 0.80744618, "learning_rate": 3.1341067656560203e-06, "loss": 0.83237946, "num_input_tokens_seen": 58947935, "step": 2735, "time_per_iteration": 2.824434757232666 }, { "auxiliary_loss_clip": 0.01342415, "auxiliary_loss_mlp": 0.01196658, "balance_loss_clip": 1.00927961, "balance_loss_mlp": 1.00077367, "epoch": 0.3289845487885529, "flos": 22418719711200.0, "grad_norm": 1.8340697967703599, "language_loss": 0.86754262, "learning_rate": 3.133465055471572e-06, "loss": 0.89293331, "num_input_tokens_seen": 58967720, "step": 2736, "time_per_iteration": 2.795182943344116 }, { "auxiliary_loss_clip": 0.01309949, "auxiliary_loss_mlp": 0.01196027, "balance_loss_clip": 1.00893092, "balance_loss_mlp": 1.00061929, "epoch": 0.329104791679192, "flos": 19682613368640.0, "grad_norm": 2.704003576149342, "language_loss": 0.66529673, "learning_rate": 3.1328231733413767e-06, "loss": 0.69035649, "num_input_tokens_seen": 58984360, "step": 2737, "time_per_iteration": 2.9181149005889893 }, { "auxiliary_loss_clip": 0.01357453, "auxiliary_loss_mlp": 0.0119563, "balance_loss_clip": 1.01013303, "balance_loss_mlp": 1.00050819, "epoch": 0.32922503456983104, "flos": 15997243906080.0, "grad_norm": 1.935925704534256, "language_loss": 0.90791416, "learning_rate": 3.1321811193628067e-06, "loss": 0.93344498, "num_input_tokens_seen": 59002505, "step": 2738, "time_per_iteration": 2.7123398780822754 }, { "auxiliary_loss_clip": 0.01345248, "auxiliary_loss_mlp": 0.00872868, "balance_loss_clip": 1.0097599, "balance_loss_mlp": 1.00043249, "epoch": 0.32934527746047015, "flos": 26834081558400.0, "grad_norm": 2.0962142528414676, "language_loss": 0.69675744, "learning_rate": 3.131538893633261e-06, "loss": 0.71893853, "num_input_tokens_seen": 59022065, "step": 2739, "time_per_iteration": 2.717259168624878 }, { "auxiliary_loss_clip": 0.01369334, "auxiliary_loss_mlp": 0.01196045, "balance_loss_clip": 1.01028955, "balance_loss_mlp": 1.00063753, "epoch": 0.32946552035110926, "flos": 23403785531040.0, "grad_norm": 1.785496402181119, "language_loss": 0.77703881, "learning_rate": 3.130896496250165e-06, "loss": 0.80269253, "num_input_tokens_seen": 59041890, "step": 2740, "time_per_iteration": 2.81923770904541 }, { "auxiliary_loss_clip": 0.01369282, "auxiliary_loss_mlp": 0.01196203, "balance_loss_clip": 1.00943971, "balance_loss_mlp": 1.00070024, "epoch": 0.3295857632417483, "flos": 14172478486560.0, "grad_norm": 2.3775583720101485, "language_loss": 0.86844778, "learning_rate": 3.1302539273109693e-06, "loss": 0.89410263, "num_input_tokens_seen": 59058715, "step": 2741, "time_per_iteration": 2.70639967918396 }, { "auxiliary_loss_clip": 0.01335379, "auxiliary_loss_mlp": 0.0119638, "balance_loss_clip": 1.00965297, "balance_loss_mlp": 1.00087714, "epoch": 0.32970600613238743, "flos": 22196718776160.0, "grad_norm": 1.6331546889341648, "language_loss": 0.80500954, "learning_rate": 3.1296111869131513e-06, "loss": 0.83032715, "num_input_tokens_seen": 59076140, "step": 2742, "time_per_iteration": 2.7977712154388428 }, { "auxiliary_loss_clip": 0.013689, "auxiliary_loss_mlp": 0.0119565, "balance_loss_clip": 1.00959086, "balance_loss_mlp": 1.00052822, "epoch": 0.32982624902302654, "flos": 22053795366240.0, "grad_norm": 1.7172268074240422, "language_loss": 0.85672724, "learning_rate": 3.1289682751542153e-06, "loss": 0.88237274, "num_input_tokens_seen": 59095700, "step": 2743, "time_per_iteration": 2.7605719566345215 }, { "auxiliary_loss_clip": 0.01357503, "auxiliary_loss_mlp": 0.0119588, "balance_loss_clip": 1.01002777, "balance_loss_mlp": 1.00066304, "epoch": 0.3299464919136656, "flos": 18661637077920.0, "grad_norm": 1.978877488748519, "language_loss": 0.71302205, "learning_rate": 3.1283251921316883e-06, "loss": 0.73855585, "num_input_tokens_seen": 59113445, "step": 2744, "time_per_iteration": 2.698699712753296 }, { "auxiliary_loss_clip": 0.01275202, "auxiliary_loss_mlp": 0.01195867, "balance_loss_clip": 1.00736856, "balance_loss_mlp": 1.00074553, "epoch": 0.3300667348043047, "flos": 13407365952000.0, "grad_norm": 1.9126351068690393, "language_loss": 0.80587041, "learning_rate": 3.1276819379431277e-06, "loss": 0.83058107, "num_input_tokens_seen": 59131535, "step": 2745, "time_per_iteration": 2.813002347946167 }, { "auxiliary_loss_clip": 0.01334422, "auxiliary_loss_mlp": 0.00872971, "balance_loss_clip": 1.00953758, "balance_loss_mlp": 1.00075591, "epoch": 0.33018697769494376, "flos": 15742565631360.0, "grad_norm": 1.8687443785074755, "language_loss": 0.7503286, "learning_rate": 3.1270385126861134e-06, "loss": 0.77240247, "num_input_tokens_seen": 59149520, "step": 2746, "time_per_iteration": 2.716592311859131 }, { "auxiliary_loss_clip": 0.01369754, "auxiliary_loss_mlp": 0.01196298, "balance_loss_clip": 1.01002312, "balance_loss_mlp": 1.00060439, "epoch": 0.3303072205855829, "flos": 18258610917600.0, "grad_norm": 2.01754150108553, "language_loss": 0.81952459, "learning_rate": 3.1263949164582533e-06, "loss": 0.8451851, "num_input_tokens_seen": 59169170, "step": 2747, "time_per_iteration": 2.709205389022827 }, { "auxiliary_loss_clip": 0.01369451, "auxiliary_loss_mlp": 0.01196477, "balance_loss_clip": 1.00943398, "balance_loss_mlp": 1.00087833, "epoch": 0.330427463476222, "flos": 17749433986560.0, "grad_norm": 2.0313974434719726, "language_loss": 0.77970123, "learning_rate": 3.1257511493571797e-06, "loss": 0.8053605, "num_input_tokens_seen": 59187675, "step": 2748, "time_per_iteration": 2.7858235836029053 }, { "auxiliary_loss_clip": 0.01310128, "auxiliary_loss_mlp": 0.01195731, "balance_loss_clip": 1.00869298, "balance_loss_mlp": 1.00060964, "epoch": 0.33054770636686104, "flos": 27162592500960.0, "grad_norm": 1.675650020872492, "language_loss": 0.77820599, "learning_rate": 3.125107211480552e-06, "loss": 0.8032645, "num_input_tokens_seen": 59207610, "step": 2749, "time_per_iteration": 2.85626482963562 }, { "auxiliary_loss_clip": 0.01282237, "auxiliary_loss_mlp": 0.01196092, "balance_loss_clip": 1.00935698, "balance_loss_mlp": 1.00077951, "epoch": 0.33066794925750015, "flos": 20117203234560.0, "grad_norm": 1.5462205948185612, "language_loss": 0.80083191, "learning_rate": 3.124463102926054e-06, "loss": 0.82561517, "num_input_tokens_seen": 59226945, "step": 2750, "time_per_iteration": 2.932985305786133 }, { "auxiliary_loss_clip": 0.01316759, "auxiliary_loss_mlp": 0.0119462, "balance_loss_clip": 1.00470018, "balance_loss_mlp": 1.00007057, "epoch": 0.33078819214813926, "flos": 70642641480480.0, "grad_norm": 0.7581041769253717, "language_loss": 0.61635447, "learning_rate": 3.1238188237913984e-06, "loss": 0.64146829, "num_input_tokens_seen": 59291485, "step": 2751, "time_per_iteration": 3.5405571460723877 }, { "auxiliary_loss_clip": 0.01371107, "auxiliary_loss_mlp": 0.01196589, "balance_loss_clip": 1.01004267, "balance_loss_mlp": 1.00089502, "epoch": 0.3309084350387783, "flos": 21141951511680.0, "grad_norm": 2.0647617095313633, "language_loss": 0.75905317, "learning_rate": 3.1231743741743202e-06, "loss": 0.78473014, "num_input_tokens_seen": 59310990, "step": 2752, "time_per_iteration": 2.7512826919555664 }, { "auxiliary_loss_clip": 0.01356503, "auxiliary_loss_mlp": 0.01196091, "balance_loss_clip": 1.00961959, "balance_loss_mlp": 1.00068331, "epoch": 0.3310286779294174, "flos": 14209358896800.0, "grad_norm": 2.317225890164763, "language_loss": 0.83387518, "learning_rate": 3.122529754172582e-06, "loss": 0.85940111, "num_input_tokens_seen": 59327875, "step": 2753, "time_per_iteration": 2.6935760974884033 }, { "auxiliary_loss_clip": 0.01345908, "auxiliary_loss_mlp": 0.01195989, "balance_loss_clip": 1.0093441, "balance_loss_mlp": 1.00067627, "epoch": 0.33114892082005654, "flos": 20778140800800.0, "grad_norm": 2.9115062320702187, "language_loss": 0.7269733, "learning_rate": 3.1218849638839736e-06, "loss": 0.75239229, "num_input_tokens_seen": 59347135, "step": 2754, "time_per_iteration": 3.7415177822113037 }, { "auxiliary_loss_clip": 0.01333947, "auxiliary_loss_mlp": 0.01196126, "balance_loss_clip": 1.0100534, "balance_loss_mlp": 1.00062227, "epoch": 0.3312691637106956, "flos": 17090759612160.0, "grad_norm": 1.6884553665278155, "language_loss": 0.78373426, "learning_rate": 3.121240003406307e-06, "loss": 0.809035, "num_input_tokens_seen": 59365985, "step": 2755, "time_per_iteration": 2.8401732444763184 }, { "auxiliary_loss_clip": 0.01308914, "auxiliary_loss_mlp": 0.01196376, "balance_loss_clip": 1.00855136, "balance_loss_mlp": 1.00077713, "epoch": 0.3313894066013347, "flos": 29456241691680.0, "grad_norm": 2.0985673964815907, "language_loss": 0.72418767, "learning_rate": 3.120594872837425e-06, "loss": 0.74924052, "num_input_tokens_seen": 59384655, "step": 2756, "time_per_iteration": 2.84346342086792 }, { "auxiliary_loss_clip": 0.01320868, "auxiliary_loss_mlp": 0.00872047, "balance_loss_clip": 1.0031538, "balance_loss_mlp": 1.00024724, "epoch": 0.3315096494919738, "flos": 61419273569280.0, "grad_norm": 0.8283690673533032, "language_loss": 0.62389505, "learning_rate": 3.1199495722751906e-06, "loss": 0.64582419, "num_input_tokens_seen": 59444185, "step": 2757, "time_per_iteration": 4.2434492111206055 }, { "auxiliary_loss_clip": 0.01297318, "auxiliary_loss_mlp": 0.0119614, "balance_loss_clip": 1.00874448, "balance_loss_mlp": 1.00063682, "epoch": 0.33162989238261287, "flos": 21653068321440.0, "grad_norm": 1.747185834647153, "language_loss": 0.8390255, "learning_rate": 3.1193041018174972e-06, "loss": 0.86396003, "num_input_tokens_seen": 59464900, "step": 2758, "time_per_iteration": 4.04626989364624 }, { "auxiliary_loss_clip": 0.01349468, "auxiliary_loss_mlp": 0.01196235, "balance_loss_clip": 1.00959074, "balance_loss_mlp": 1.00073159, "epoch": 0.331750135273252, "flos": 22674799009440.0, "grad_norm": 1.9014013524050677, "language_loss": 0.94751167, "learning_rate": 3.118658461562261e-06, "loss": 0.9729687, "num_input_tokens_seen": 59481000, "step": 2759, "time_per_iteration": 2.768425226211548 }, { "auxiliary_loss_clip": 0.01324232, "auxiliary_loss_mlp": 0.01196067, "balance_loss_clip": 1.00964165, "balance_loss_mlp": 1.00075424, "epoch": 0.33187037816389103, "flos": 22746907340640.0, "grad_norm": 1.840994360725891, "language_loss": 0.85013056, "learning_rate": 3.118012651607426e-06, "loss": 0.87533361, "num_input_tokens_seen": 59502605, "step": 2760, "time_per_iteration": 2.75101375579834 }, { "auxiliary_loss_clip": 0.01369046, "auxiliary_loss_mlp": 0.01196338, "balance_loss_clip": 1.00985694, "balance_loss_mlp": 1.00064373, "epoch": 0.33199062105453014, "flos": 19203778738080.0, "grad_norm": 2.018490220902125, "language_loss": 0.8364315, "learning_rate": 3.1173666720509603e-06, "loss": 0.86208534, "num_input_tokens_seen": 59519540, "step": 2761, "time_per_iteration": 2.6487584114074707 }, { "auxiliary_loss_clip": 0.01337047, "auxiliary_loss_mlp": 0.0119621, "balance_loss_clip": 1.00921404, "balance_loss_mlp": 1.00080264, "epoch": 0.33211086394516925, "flos": 31577020332480.0, "grad_norm": 1.7680484657083135, "language_loss": 0.68451506, "learning_rate": 3.116720522990859e-06, "loss": 0.70984757, "num_input_tokens_seen": 59540415, "step": 2762, "time_per_iteration": 2.8279340267181396 }, { "auxiliary_loss_clip": 0.01260629, "auxiliary_loss_mlp": 0.01196372, "balance_loss_clip": 1.00679135, "balance_loss_mlp": 1.00077343, "epoch": 0.3322311068358083, "flos": 17932506861600.0, "grad_norm": 2.1054089703066383, "language_loss": 0.62451744, "learning_rate": 3.116074204525142e-06, "loss": 0.64908743, "num_input_tokens_seen": 59558590, "step": 2763, "time_per_iteration": 2.8498549461364746 }, { "auxiliary_loss_clip": 0.01357032, "auxiliary_loss_mlp": 0.01196341, "balance_loss_clip": 1.01039791, "balance_loss_mlp": 1.00093341, "epoch": 0.3323513497264474, "flos": 32269844917440.0, "grad_norm": 1.4011032285671767, "language_loss": 0.83557719, "learning_rate": 3.1154277167518553e-06, "loss": 0.86111093, "num_input_tokens_seen": 59580205, "step": 2764, "time_per_iteration": 2.757256031036377 }, { "auxiliary_loss_clip": 0.0129762, "auxiliary_loss_mlp": 0.01194587, "balance_loss_clip": 1.00277364, "balance_loss_mlp": 1.00003707, "epoch": 0.33247159261708653, "flos": 52668705110400.0, "grad_norm": 0.7767571205364249, "language_loss": 0.59539115, "learning_rate": 3.114781059769072e-06, "loss": 0.62031317, "num_input_tokens_seen": 59631530, "step": 2765, "time_per_iteration": 3.1828205585479736 }, { "auxiliary_loss_clip": 0.0133209, "auxiliary_loss_mlp": 0.01195919, "balance_loss_clip": 1.00967705, "balance_loss_mlp": 1.00060678, "epoch": 0.3325918355077256, "flos": 27125245082880.0, "grad_norm": 2.4169746607255536, "language_loss": 0.67170382, "learning_rate": 3.1141342336748874e-06, "loss": 0.69698393, "num_input_tokens_seen": 59651090, "step": 2766, "time_per_iteration": 2.7633423805236816 }, { "auxiliary_loss_clip": 0.01345358, "auxiliary_loss_mlp": 0.01195926, "balance_loss_clip": 1.00934935, "balance_loss_mlp": 1.00061321, "epoch": 0.3327120783983647, "flos": 23664427136640.0, "grad_norm": 1.4143321851021078, "language_loss": 0.82220876, "learning_rate": 3.1134872385674253e-06, "loss": 0.84762168, "num_input_tokens_seen": 59675245, "step": 2767, "time_per_iteration": 2.8070285320281982 }, { "auxiliary_loss_clip": 0.01337198, "auxiliary_loss_mlp": 0.01196122, "balance_loss_clip": 1.0091536, "balance_loss_mlp": 1.00080943, "epoch": 0.3328323212890038, "flos": 19171388787840.0, "grad_norm": 2.1251068327459124, "language_loss": 0.85698688, "learning_rate": 3.1128400745448353e-06, "loss": 0.88232011, "num_input_tokens_seen": 59694625, "step": 2768, "time_per_iteration": 2.7523703575134277 }, { "auxiliary_loss_clip": 0.01347601, "auxiliary_loss_mlp": 0.01196088, "balance_loss_clip": 1.00924683, "balance_loss_mlp": 1.00067973, "epoch": 0.33295256417964286, "flos": 37706362673760.0, "grad_norm": 2.549469611295051, "language_loss": 0.62866431, "learning_rate": 3.11219274170529e-06, "loss": 0.65410125, "num_input_tokens_seen": 59716435, "step": 2769, "time_per_iteration": 2.927931308746338 }, { "auxiliary_loss_clip": 0.01330953, "auxiliary_loss_mlp": 0.01195751, "balance_loss_clip": 1.00939965, "balance_loss_mlp": 1.00072432, "epoch": 0.333072807070282, "flos": 26506001700000.0, "grad_norm": 1.8401358026101822, "language_loss": 0.81974339, "learning_rate": 3.1115452401469903e-06, "loss": 0.84501052, "num_input_tokens_seen": 59736835, "step": 2770, "time_per_iteration": 2.8318302631378174 }, { "auxiliary_loss_clip": 0.01307933, "auxiliary_loss_mlp": 0.0119578, "balance_loss_clip": 1.009305, "balance_loss_mlp": 1.00056338, "epoch": 0.3331930499609211, "flos": 21430923691680.0, "grad_norm": 1.8268905133026636, "language_loss": 0.86647356, "learning_rate": 3.1108975699681613e-06, "loss": 0.89151073, "num_input_tokens_seen": 59754230, "step": 2771, "time_per_iteration": 2.821897268295288 }, { "auxiliary_loss_clip": 0.01310569, "auxiliary_loss_mlp": 0.01195356, "balance_loss_clip": 1.00859618, "balance_loss_mlp": 1.00052023, "epoch": 0.33331329285156014, "flos": 20659955597280.0, "grad_norm": 1.9995798460892507, "language_loss": 0.71473557, "learning_rate": 3.1102497312670542e-06, "loss": 0.73979479, "num_input_tokens_seen": 59772235, "step": 2772, "time_per_iteration": 2.797898530960083 }, { "auxiliary_loss_clip": 0.01330233, "auxiliary_loss_mlp": 0.01195844, "balance_loss_clip": 1.00922179, "balance_loss_mlp": 1.00072229, "epoch": 0.33343353574219925, "flos": 28001609550720.0, "grad_norm": 1.887710155053493, "language_loss": 0.80427873, "learning_rate": 3.109601724141946e-06, "loss": 0.82953954, "num_input_tokens_seen": 59791230, "step": 2773, "time_per_iteration": 2.7538444995880127 }, { "auxiliary_loss_clip": 0.01342133, "auxiliary_loss_mlp": 0.01196161, "balance_loss_clip": 1.0104028, "balance_loss_mlp": 1.00065804, "epoch": 0.33355377863283836, "flos": 23764973813280.0, "grad_norm": 1.6113805933071688, "language_loss": 0.68128693, "learning_rate": 3.108953548691138e-06, "loss": 0.70666993, "num_input_tokens_seen": 59811315, "step": 2774, "time_per_iteration": 2.8232154846191406 }, { "auxiliary_loss_clip": 0.0136872, "auxiliary_loss_mlp": 0.01195789, "balance_loss_clip": 1.00970614, "balance_loss_mlp": 1.00057149, "epoch": 0.3336740215234774, "flos": 37779692410080.0, "grad_norm": 2.603657119210942, "language_loss": 0.7260434, "learning_rate": 3.108305205012959e-06, "loss": 0.75168848, "num_input_tokens_seen": 59832010, "step": 2775, "time_per_iteration": 2.7750329971313477 }, { "auxiliary_loss_clip": 0.01324063, "auxiliary_loss_mlp": 0.01196043, "balance_loss_clip": 1.00878286, "balance_loss_mlp": 1.00063539, "epoch": 0.3337942644141165, "flos": 25519067848800.0, "grad_norm": 1.9326799786400668, "language_loss": 0.87361866, "learning_rate": 3.107656693205761e-06, "loss": 0.89881974, "num_input_tokens_seen": 59851450, "step": 2776, "time_per_iteration": 2.819148302078247 }, { "auxiliary_loss_clip": 0.01369516, "auxiliary_loss_mlp": 0.01196634, "balance_loss_clip": 1.01012361, "balance_loss_mlp": 1.00074923, "epoch": 0.3339145073047556, "flos": 25989855575040.0, "grad_norm": 2.570128217578996, "language_loss": 0.70123386, "learning_rate": 3.107008013367924e-06, "loss": 0.72689539, "num_input_tokens_seen": 59870245, "step": 2777, "time_per_iteration": 2.6988420486450195 }, { "auxiliary_loss_clip": 0.01308553, "auxiliary_loss_mlp": 0.01196169, "balance_loss_clip": 1.00828433, "balance_loss_mlp": 1.00076139, "epoch": 0.3340347501953947, "flos": 19062579664800.0, "grad_norm": 2.437445779129511, "language_loss": 0.86622572, "learning_rate": 3.1063591655978507e-06, "loss": 0.8912729, "num_input_tokens_seen": 59886195, "step": 2778, "time_per_iteration": 2.7580790519714355 }, { "auxiliary_loss_clip": 0.01304155, "auxiliary_loss_mlp": 0.01196073, "balance_loss_clip": 1.00906396, "balance_loss_mlp": 1.00076056, "epoch": 0.3341549930860338, "flos": 18109724176800.0, "grad_norm": 1.6662127447536992, "language_loss": 0.79267848, "learning_rate": 3.105710149993972e-06, "loss": 0.81768072, "num_input_tokens_seen": 59905525, "step": 2779, "time_per_iteration": 2.8265256881713867 }, { "auxiliary_loss_clip": 0.0136961, "auxiliary_loss_mlp": 0.0119625, "balance_loss_clip": 1.00978673, "balance_loss_mlp": 1.00074708, "epoch": 0.33427523597667286, "flos": 22674978627840.0, "grad_norm": 1.9251750743662703, "language_loss": 0.85151166, "learning_rate": 3.1050609666547427e-06, "loss": 0.87717021, "num_input_tokens_seen": 59925085, "step": 2780, "time_per_iteration": 3.8013179302215576 }, { "auxiliary_loss_clip": 0.01318464, "auxiliary_loss_mlp": 0.01195559, "balance_loss_clip": 1.00864637, "balance_loss_mlp": 1.00043726, "epoch": 0.33439547886731197, "flos": 22638349683360.0, "grad_norm": 1.705457520525441, "language_loss": 0.77336955, "learning_rate": 3.104411615678644e-06, "loss": 0.79850984, "num_input_tokens_seen": 59943935, "step": 2781, "time_per_iteration": 2.808820962905884 }, { "auxiliary_loss_clip": 0.01331794, "auxiliary_loss_mlp": 0.01196648, "balance_loss_clip": 1.00979316, "balance_loss_mlp": 1.00076342, "epoch": 0.3345157217579511, "flos": 24096394573920.0, "grad_norm": 2.2479168330823516, "language_loss": 0.73402417, "learning_rate": 3.1037620971641803e-06, "loss": 0.75930858, "num_input_tokens_seen": 59963725, "step": 2782, "time_per_iteration": 2.7934343814849854 }, { "auxiliary_loss_clip": 0.01368731, "auxiliary_loss_mlp": 0.01196196, "balance_loss_clip": 1.00962138, "balance_loss_mlp": 1.00078785, "epoch": 0.33463596464859013, "flos": 18989501394240.0, "grad_norm": 2.6358198734717115, "language_loss": 0.64956808, "learning_rate": 3.1031124112098844e-06, "loss": 0.67521739, "num_input_tokens_seen": 59981935, "step": 2783, "time_per_iteration": 3.6245219707489014 }, { "auxiliary_loss_clip": 0.0133381, "auxiliary_loss_mlp": 0.01195966, "balance_loss_clip": 1.00948215, "balance_loss_mlp": 1.00065374, "epoch": 0.33475620753922924, "flos": 20375617572000.0, "grad_norm": 1.7892386561705482, "language_loss": 0.72043049, "learning_rate": 3.1024625579143127e-06, "loss": 0.74572825, "num_input_tokens_seen": 59999455, "step": 2784, "time_per_iteration": 3.7919044494628906 }, { "auxiliary_loss_clip": 0.0136839, "auxiliary_loss_mlp": 0.01195935, "balance_loss_clip": 1.00955749, "balance_loss_mlp": 1.00062275, "epoch": 0.33487645042986836, "flos": 18182586905280.0, "grad_norm": 1.8770303782051458, "language_loss": 0.72993279, "learning_rate": 3.101812537376048e-06, "loss": 0.75557601, "num_input_tokens_seen": 60018475, "step": 2785, "time_per_iteration": 2.652864694595337 }, { "auxiliary_loss_clip": 0.01331839, "auxiliary_loss_mlp": 0.0087277, "balance_loss_clip": 1.0092032, "balance_loss_mlp": 1.00065649, "epoch": 0.3349966933205074, "flos": 25848836120160.0, "grad_norm": 2.000982891452437, "language_loss": 0.84444034, "learning_rate": 3.1011623496936973e-06, "loss": 0.86648643, "num_input_tokens_seen": 60036770, "step": 2786, "time_per_iteration": 2.764477252960205 }, { "auxiliary_loss_clip": 0.01367884, "auxiliary_loss_mlp": 0.01196086, "balance_loss_clip": 1.00977015, "balance_loss_mlp": 1.0006783, "epoch": 0.3351169362111465, "flos": 28111460460480.0, "grad_norm": 1.67686717640284, "language_loss": 0.69634044, "learning_rate": 3.100511994965893e-06, "loss": 0.72198009, "num_input_tokens_seen": 60056725, "step": 2787, "time_per_iteration": 2.7511179447174072 }, { "auxiliary_loss_clip": 0.01341863, "auxiliary_loss_mlp": 0.01195636, "balance_loss_clip": 1.00916195, "balance_loss_mlp": 1.00060976, "epoch": 0.33523717910178563, "flos": 22673326138560.0, "grad_norm": 1.6688879991084995, "language_loss": 0.84465963, "learning_rate": 3.0998614732912947e-06, "loss": 0.87003458, "num_input_tokens_seen": 60076100, "step": 2788, "time_per_iteration": 2.718503713607788 }, { "auxiliary_loss_clip": 0.01344613, "auxiliary_loss_mlp": 0.01196099, "balance_loss_clip": 1.0095799, "balance_loss_mlp": 1.00069118, "epoch": 0.3353574219924247, "flos": 15669810673920.0, "grad_norm": 1.7229539283593058, "language_loss": 0.67715609, "learning_rate": 3.0992107847685855e-06, "loss": 0.70256323, "num_input_tokens_seen": 60093815, "step": 2789, "time_per_iteration": 2.795300245285034 }, { "auxiliary_loss_clip": 0.01321379, "auxiliary_loss_mlp": 0.01196091, "balance_loss_clip": 1.00883412, "balance_loss_mlp": 1.00068283, "epoch": 0.3354776648830638, "flos": 24790656106080.0, "grad_norm": 1.5731977342351255, "language_loss": 0.79013228, "learning_rate": 3.0985599294964736e-06, "loss": 0.81530696, "num_input_tokens_seen": 60113370, "step": 2790, "time_per_iteration": 2.8118784427642822 }, { "auxiliary_loss_clip": 0.01322127, "auxiliary_loss_mlp": 0.01196117, "balance_loss_clip": 1.00916946, "balance_loss_mlp": 1.0007087, "epoch": 0.33559790777370285, "flos": 28694865219840.0, "grad_norm": 1.9379786843865923, "language_loss": 0.69503772, "learning_rate": 3.097908907573695e-06, "loss": 0.72022021, "num_input_tokens_seen": 60131350, "step": 2791, "time_per_iteration": 2.96830677986145 }, { "auxiliary_loss_clip": 0.01259877, "auxiliary_loss_mlp": 0.01196122, "balance_loss_clip": 1.0067184, "balance_loss_mlp": 1.00071383, "epoch": 0.33571815066434196, "flos": 22235790530880.0, "grad_norm": 1.968594651887918, "language_loss": 0.89423478, "learning_rate": 3.0972577190990067e-06, "loss": 0.91879475, "num_input_tokens_seen": 60149830, "step": 2792, "time_per_iteration": 2.829716444015503 }, { "auxiliary_loss_clip": 0.01314502, "auxiliary_loss_mlp": 0.01195991, "balance_loss_clip": 1.00852466, "balance_loss_mlp": 1.00067878, "epoch": 0.3358383935549811, "flos": 23842291078080.0, "grad_norm": 1.7402932348925564, "language_loss": 0.79988009, "learning_rate": 3.096606364171196e-06, "loss": 0.82498503, "num_input_tokens_seen": 60169620, "step": 2793, "time_per_iteration": 2.7868499755859375 }, { "auxiliary_loss_clip": 0.01319038, "auxiliary_loss_mlp": 0.01196009, "balance_loss_clip": 1.00982571, "balance_loss_mlp": 1.00088775, "epoch": 0.33595863644562013, "flos": 22267318312800.0, "grad_norm": 1.9287264365920072, "language_loss": 0.85239744, "learning_rate": 3.0959548428890703e-06, "loss": 0.87754792, "num_input_tokens_seen": 60188490, "step": 2794, "time_per_iteration": 2.8142263889312744 }, { "auxiliary_loss_clip": 0.01343074, "auxiliary_loss_mlp": 0.0119581, "balance_loss_clip": 1.00958502, "balance_loss_mlp": 1.00049758, "epoch": 0.33607887933625924, "flos": 20119789739520.0, "grad_norm": 1.7353696091098154, "language_loss": 0.84151387, "learning_rate": 3.095303155351468e-06, "loss": 0.86690271, "num_input_tokens_seen": 60208695, "step": 2795, "time_per_iteration": 2.7421090602874756 }, { "auxiliary_loss_clip": 0.01288745, "auxiliary_loss_mlp": 0.01195941, "balance_loss_clip": 1.00819457, "balance_loss_mlp": 1.00072372, "epoch": 0.33619912222689835, "flos": 19318120107840.0, "grad_norm": 9.183712847374721, "language_loss": 0.78462529, "learning_rate": 3.0946513016572464e-06, "loss": 0.80947214, "num_input_tokens_seen": 60227600, "step": 2796, "time_per_iteration": 2.8293206691741943 }, { "auxiliary_loss_clip": 0.01345695, "auxiliary_loss_mlp": 0.01195989, "balance_loss_clip": 1.00929499, "balance_loss_mlp": 1.00067687, "epoch": 0.3363193651175374, "flos": 16800673798080.0, "grad_norm": 2.1058531913502945, "language_loss": 0.77393723, "learning_rate": 3.0939992819052938e-06, "loss": 0.79935408, "num_input_tokens_seen": 60245110, "step": 2797, "time_per_iteration": 2.6834018230438232 }, { "auxiliary_loss_clip": 0.01326142, "auxiliary_loss_mlp": 0.01196159, "balance_loss_clip": 1.00894248, "balance_loss_mlp": 1.00065613, "epoch": 0.3364396080081765, "flos": 23550301308960.0, "grad_norm": 1.92845778978438, "language_loss": 0.80591846, "learning_rate": 3.0933470961945193e-06, "loss": 0.83114147, "num_input_tokens_seen": 60263405, "step": 2798, "time_per_iteration": 2.8138091564178467 }, { "auxiliary_loss_clip": 0.01320798, "auxiliary_loss_mlp": 0.01195744, "balance_loss_clip": 1.00860476, "balance_loss_mlp": 1.00062203, "epoch": 0.3365598508988156, "flos": 28037915182080.0, "grad_norm": 1.6435022010580003, "language_loss": 0.68275952, "learning_rate": 3.0926947446238597e-06, "loss": 0.70792496, "num_input_tokens_seen": 60282975, "step": 2799, "time_per_iteration": 2.8034234046936035 }, { "auxiliary_loss_clip": 0.01357315, "auxiliary_loss_mlp": 0.01196043, "balance_loss_clip": 1.00957584, "balance_loss_mlp": 1.00063515, "epoch": 0.3366800937894547, "flos": 16982776733760.0, "grad_norm": 2.2070779639266394, "language_loss": 0.82718718, "learning_rate": 3.092042227292276e-06, "loss": 0.8527208, "num_input_tokens_seen": 60299810, "step": 2800, "time_per_iteration": 2.7148516178131104 }, { "auxiliary_loss_clip": 0.01368085, "auxiliary_loss_mlp": 0.01195411, "balance_loss_clip": 1.00950825, "balance_loss_mlp": 1.00067091, "epoch": 0.3368003366800938, "flos": 23915333424960.0, "grad_norm": 1.638213191261751, "language_loss": 0.88439929, "learning_rate": 3.0913895442987557e-06, "loss": 0.9100343, "num_input_tokens_seen": 60320775, "step": 2801, "time_per_iteration": 2.7221109867095947 }, { "auxiliary_loss_clip": 0.01300753, "auxiliary_loss_mlp": 0.00872957, "balance_loss_clip": 1.00800705, "balance_loss_mlp": 1.00077116, "epoch": 0.3369205795707329, "flos": 24791230884960.0, "grad_norm": 1.5793180662505846, "language_loss": 0.85841483, "learning_rate": 3.090736695742308e-06, "loss": 0.88015193, "num_input_tokens_seen": 60341905, "step": 2802, "time_per_iteration": 2.8232429027557373 }, { "auxiliary_loss_clip": 0.01287643, "auxiliary_loss_mlp": 0.01195813, "balance_loss_clip": 1.00814021, "balance_loss_mlp": 1.00059593, "epoch": 0.33704082246137196, "flos": 17931105838080.0, "grad_norm": 2.2818538049477923, "language_loss": 0.52079272, "learning_rate": 3.0900836817219713e-06, "loss": 0.54562724, "num_input_tokens_seen": 60358335, "step": 2803, "time_per_iteration": 2.863863468170166 }, { "auxiliary_loss_clip": 0.01368063, "auxiliary_loss_mlp": 0.01195862, "balance_loss_clip": 1.00926387, "balance_loss_mlp": 1.00073993, "epoch": 0.33716106535201107, "flos": 21286527410880.0, "grad_norm": 1.6014221807983735, "language_loss": 0.83658838, "learning_rate": 3.089430502336807e-06, "loss": 0.86222762, "num_input_tokens_seen": 60378305, "step": 2804, "time_per_iteration": 2.7074244022369385 }, { "auxiliary_loss_clip": 0.01349668, "auxiliary_loss_mlp": 0.01195966, "balance_loss_clip": 1.00922728, "balance_loss_mlp": 1.00074899, "epoch": 0.3372813082426502, "flos": 18402971274720.0, "grad_norm": 2.3467135461828246, "language_loss": 0.90477008, "learning_rate": 3.088777157685902e-06, "loss": 0.9302265, "num_input_tokens_seen": 60393895, "step": 2805, "time_per_iteration": 2.74070143699646 }, { "auxiliary_loss_clip": 0.01320384, "auxiliary_loss_mlp": 0.01195693, "balance_loss_clip": 1.00883818, "balance_loss_mlp": 1.00057077, "epoch": 0.33740155113328923, "flos": 17201400842880.0, "grad_norm": 2.035005267185914, "language_loss": 0.85386837, "learning_rate": 3.088123647868367e-06, "loss": 0.87902915, "num_input_tokens_seen": 60410445, "step": 2806, "time_per_iteration": 3.7475037574768066 }, { "auxiliary_loss_clip": 0.01356935, "auxiliary_loss_mlp": 0.01196044, "balance_loss_clip": 1.00946951, "balance_loss_mlp": 1.00063586, "epoch": 0.33752179402392835, "flos": 29058963320160.0, "grad_norm": 1.8451033083097161, "language_loss": 0.81163329, "learning_rate": 3.0874699729833405e-06, "loss": 0.83716309, "num_input_tokens_seen": 60431815, "step": 2807, "time_per_iteration": 2.7891550064086914 }, { "auxiliary_loss_clip": 0.01318117, "auxiliary_loss_mlp": 0.01195888, "balance_loss_clip": 1.00831318, "balance_loss_mlp": 1.00067091, "epoch": 0.3376420369145674, "flos": 25080741920160.0, "grad_norm": 1.7272310814487837, "language_loss": 0.79842257, "learning_rate": 3.086816133129983e-06, "loss": 0.82356262, "num_input_tokens_seen": 60452075, "step": 2808, "time_per_iteration": 2.8645360469818115 }, { "auxiliary_loss_clip": 0.01369537, "auxiliary_loss_mlp": 0.0119596, "balance_loss_clip": 1.01033175, "balance_loss_mlp": 1.00055218, "epoch": 0.3377622798052065, "flos": 27490636435680.0, "grad_norm": 1.8071936985148302, "language_loss": 0.75982374, "learning_rate": 3.0861621284074826e-06, "loss": 0.78547871, "num_input_tokens_seen": 60472600, "step": 2809, "time_per_iteration": 3.6283578872680664 }, { "auxiliary_loss_clip": 0.01338428, "auxiliary_loss_mlp": 0.01196015, "balance_loss_clip": 1.00969696, "balance_loss_mlp": 1.00079763, "epoch": 0.3378825226958456, "flos": 21975220772640.0, "grad_norm": 1.509789900985658, "language_loss": 0.72942597, "learning_rate": 3.085507958915051e-06, "loss": 0.7547704, "num_input_tokens_seen": 60491030, "step": 2810, "time_per_iteration": 3.759518623352051 }, { "auxiliary_loss_clip": 0.01318811, "auxiliary_loss_mlp": 0.0119628, "balance_loss_clip": 1.00852036, "balance_loss_mlp": 1.00077677, "epoch": 0.3380027655864847, "flos": 42523205963040.0, "grad_norm": 1.7496198066713737, "language_loss": 0.71345979, "learning_rate": 3.084853624751925e-06, "loss": 0.73861074, "num_input_tokens_seen": 60512615, "step": 2811, "time_per_iteration": 3.9375123977661133 }, { "auxiliary_loss_clip": 0.01307278, "auxiliary_loss_mlp": 0.0119597, "balance_loss_clip": 1.00841641, "balance_loss_mlp": 1.00056195, "epoch": 0.3381230084771238, "flos": 26725092816960.0, "grad_norm": 1.5913045920303361, "language_loss": 0.85620666, "learning_rate": 3.0841991260173668e-06, "loss": 0.88123918, "num_input_tokens_seen": 60532520, "step": 2812, "time_per_iteration": 2.831329822540283 }, { "auxiliary_loss_clip": 0.01369334, "auxiliary_loss_mlp": 0.01196053, "balance_loss_clip": 1.00964141, "balance_loss_mlp": 1.00074017, "epoch": 0.3382432513677629, "flos": 22710386167200.0, "grad_norm": 1.890031346170745, "language_loss": 0.8023119, "learning_rate": 3.0835444628106634e-06, "loss": 0.82796574, "num_input_tokens_seen": 60551500, "step": 2813, "time_per_iteration": 2.6923575401306152 }, { "auxiliary_loss_clip": 0.0136872, "auxiliary_loss_mlp": 0.00872894, "balance_loss_clip": 1.00969541, "balance_loss_mlp": 1.00048041, "epoch": 0.33836349425840195, "flos": 22122419100480.0, "grad_norm": 1.773125940358741, "language_loss": 0.83158219, "learning_rate": 3.082889635231126e-06, "loss": 0.8539983, "num_input_tokens_seen": 60570160, "step": 2814, "time_per_iteration": 2.6475276947021484 }, { "auxiliary_loss_clip": 0.01331644, "auxiliary_loss_mlp": 0.0119617, "balance_loss_clip": 1.00901198, "balance_loss_mlp": 1.00057125, "epoch": 0.33848373714904106, "flos": 27308102415840.0, "grad_norm": 2.3057434075199246, "language_loss": 0.76066184, "learning_rate": 3.0822346433780925e-06, "loss": 0.78593993, "num_input_tokens_seen": 60590885, "step": 2815, "time_per_iteration": 2.7464230060577393 }, { "auxiliary_loss_clip": 0.01357511, "auxiliary_loss_mlp": 0.01196098, "balance_loss_clip": 1.0099268, "balance_loss_mlp": 1.00069046, "epoch": 0.3386039800396802, "flos": 25848728349120.0, "grad_norm": 1.8172940293916349, "language_loss": 0.87261724, "learning_rate": 3.0815794873509237e-06, "loss": 0.89815331, "num_input_tokens_seen": 60609170, "step": 2816, "time_per_iteration": 2.7499642372131348 }, { "auxiliary_loss_clip": 0.01368746, "auxiliary_loss_mlp": 0.01195823, "balance_loss_clip": 1.00950241, "balance_loss_mlp": 1.00051045, "epoch": 0.33872422293031923, "flos": 18880656347520.0, "grad_norm": 1.69251426106551, "language_loss": 0.72394198, "learning_rate": 3.0809241672490066e-06, "loss": 0.74958766, "num_input_tokens_seen": 60627340, "step": 2817, "time_per_iteration": 2.71025013923645 }, { "auxiliary_loss_clip": 0.01326661, "auxiliary_loss_mlp": 0.01195783, "balance_loss_clip": 1.00871277, "balance_loss_mlp": 1.00056577, "epoch": 0.33884446582095834, "flos": 23146987759200.0, "grad_norm": 1.633956411935962, "language_loss": 0.85163784, "learning_rate": 3.080268683171753e-06, "loss": 0.87686229, "num_input_tokens_seen": 60647630, "step": 2818, "time_per_iteration": 2.8170664310455322 }, { "auxiliary_loss_clip": 0.01356224, "auxiliary_loss_mlp": 0.01195858, "balance_loss_clip": 1.00964642, "balance_loss_mlp": 1.00064087, "epoch": 0.33896470871159745, "flos": 15997351677120.0, "grad_norm": 2.215457541190154, "language_loss": 0.89310455, "learning_rate": 3.0796130352185985e-06, "loss": 0.91862535, "num_input_tokens_seen": 60664485, "step": 2819, "time_per_iteration": 2.657308340072632 }, { "auxiliary_loss_clip": 0.01345535, "auxiliary_loss_mlp": 0.00872956, "balance_loss_clip": 1.00994563, "balance_loss_mlp": 1.00056314, "epoch": 0.3390849516022365, "flos": 34495768465920.0, "grad_norm": 1.6662783695652248, "language_loss": 0.6649251, "learning_rate": 3.0789572234890057e-06, "loss": 0.68711001, "num_input_tokens_seen": 60686125, "step": 2820, "time_per_iteration": 2.874141216278076 }, { "auxiliary_loss_clip": 0.01322245, "auxiliary_loss_mlp": 0.01196091, "balance_loss_clip": 1.00895548, "balance_loss_mlp": 1.00068307, "epoch": 0.3392051944928756, "flos": 16180316781120.0, "grad_norm": 1.52110636552909, "language_loss": 0.77326035, "learning_rate": 3.0783012480824596e-06, "loss": 0.79844368, "num_input_tokens_seen": 60705270, "step": 2821, "time_per_iteration": 2.7141408920288086 }, { "auxiliary_loss_clip": 0.01367923, "auxiliary_loss_mlp": 0.01196161, "balance_loss_clip": 1.00939476, "balance_loss_mlp": 1.00065827, "epoch": 0.33932543738351467, "flos": 17086664312640.0, "grad_norm": 2.029368946248931, "language_loss": 0.74249655, "learning_rate": 3.077645109098471e-06, "loss": 0.7681374, "num_input_tokens_seen": 60721540, "step": 2822, "time_per_iteration": 2.671424388885498 }, { "auxiliary_loss_clip": 0.01308492, "auxiliary_loss_mlp": 0.01196041, "balance_loss_clip": 1.00830579, "balance_loss_mlp": 1.00072813, "epoch": 0.3394456802741538, "flos": 22126981407840.0, "grad_norm": 1.9595835351890525, "language_loss": 0.7227087, "learning_rate": 3.076988806636577e-06, "loss": 0.74775404, "num_input_tokens_seen": 60739300, "step": 2823, "time_per_iteration": 2.7775752544403076 }, { "auxiliary_loss_clip": 0.01325248, "auxiliary_loss_mlp": 0.00873054, "balance_loss_clip": 1.00910664, "balance_loss_mlp": 1.00074363, "epoch": 0.3395659231647929, "flos": 25226898461280.0, "grad_norm": 1.8780175334471985, "language_loss": 0.88345349, "learning_rate": 3.0763323407963377e-06, "loss": 0.90543652, "num_input_tokens_seen": 60758910, "step": 2824, "time_per_iteration": 2.891383409500122 }, { "auxiliary_loss_clip": 0.01356809, "auxiliary_loss_mlp": 0.01195978, "balance_loss_clip": 1.00969768, "balance_loss_mlp": 1.00066602, "epoch": 0.33968616605543195, "flos": 29096490356640.0, "grad_norm": 1.61578041952856, "language_loss": 0.79976892, "learning_rate": 3.075675711677337e-06, "loss": 0.82529676, "num_input_tokens_seen": 60779005, "step": 2825, "time_per_iteration": 2.7456912994384766 }, { "auxiliary_loss_clip": 0.01318848, "auxiliary_loss_mlp": 0.01195833, "balance_loss_clip": 1.00893641, "balance_loss_mlp": 1.00052035, "epoch": 0.33980640894607106, "flos": 21433977204480.0, "grad_norm": 2.0844981193961605, "language_loss": 0.7805326, "learning_rate": 3.0750189193791865e-06, "loss": 0.80567944, "num_input_tokens_seen": 60798590, "step": 2826, "time_per_iteration": 2.7806506156921387 }, { "auxiliary_loss_clip": 0.01356244, "auxiliary_loss_mlp": 0.0119634, "balance_loss_clip": 1.00946188, "balance_loss_mlp": 1.00093222, "epoch": 0.33992665183671017, "flos": 32490049668480.0, "grad_norm": 1.8062588280646048, "language_loss": 0.70195651, "learning_rate": 3.0743619640015203e-06, "loss": 0.72748232, "num_input_tokens_seen": 60818840, "step": 2827, "time_per_iteration": 2.8773672580718994 }, { "auxiliary_loss_clip": 0.01344501, "auxiliary_loss_mlp": 0.01196053, "balance_loss_clip": 1.00951886, "balance_loss_mlp": 1.00074053, "epoch": 0.3400468947273492, "flos": 17055423920160.0, "grad_norm": 2.0538031277231106, "language_loss": 0.92864096, "learning_rate": 3.073704845643999e-06, "loss": 0.95404649, "num_input_tokens_seen": 60835965, "step": 2828, "time_per_iteration": 2.7246816158294678 }, { "auxiliary_loss_clip": 0.01356778, "auxiliary_loss_mlp": 0.01196252, "balance_loss_clip": 1.00953805, "balance_loss_mlp": 1.00074911, "epoch": 0.34016713761798834, "flos": 16872997671360.0, "grad_norm": 2.6533457829506526, "language_loss": 0.78010273, "learning_rate": 3.0730475644063063e-06, "loss": 0.80563307, "num_input_tokens_seen": 60851065, "step": 2829, "time_per_iteration": 2.708200216293335 }, { "auxiliary_loss_clip": 0.01340297, "auxiliary_loss_mlp": 0.00872873, "balance_loss_clip": 1.00934839, "balance_loss_mlp": 1.0006969, "epoch": 0.34028738050862745, "flos": 21907171817280.0, "grad_norm": 1.6197294229992583, "language_loss": 0.65183008, "learning_rate": 3.072390120388151e-06, "loss": 0.67396176, "num_input_tokens_seen": 60869390, "step": 2830, "time_per_iteration": 2.729728937149048 }, { "auxiliary_loss_clip": 0.01345108, "auxiliary_loss_mlp": 0.01196064, "balance_loss_clip": 1.0090754, "balance_loss_mlp": 1.00065589, "epoch": 0.3404076233992665, "flos": 22746045172320.0, "grad_norm": 2.6868313435378686, "language_loss": 0.71215916, "learning_rate": 3.071732513689267e-06, "loss": 0.73757088, "num_input_tokens_seen": 60887925, "step": 2831, "time_per_iteration": 2.722676992416382 }, { "auxiliary_loss_clip": 0.01345239, "auxiliary_loss_mlp": 0.0119634, "balance_loss_clip": 1.01067972, "balance_loss_mlp": 1.00083625, "epoch": 0.3405278662899056, "flos": 17052370407360.0, "grad_norm": 2.7406457492098317, "language_loss": 0.67467904, "learning_rate": 3.0710747444094134e-06, "loss": 0.70009482, "num_input_tokens_seen": 60905955, "step": 2832, "time_per_iteration": 2.710780620574951 }, { "auxiliary_loss_clip": 0.01325977, "auxiliary_loss_mlp": 0.01196368, "balance_loss_clip": 1.00854349, "balance_loss_mlp": 1.00076985, "epoch": 0.3406481091805447, "flos": 42813148082400.0, "grad_norm": 1.7048475939835648, "language_loss": 0.65382349, "learning_rate": 3.070416812648372e-06, "loss": 0.67904699, "num_input_tokens_seen": 60929405, "step": 2833, "time_per_iteration": 3.886333703994751 }, { "auxiliary_loss_clip": 0.01332153, "auxiliary_loss_mlp": 0.01196046, "balance_loss_clip": 1.00994408, "balance_loss_mlp": 1.00082934, "epoch": 0.3407683520711838, "flos": 26761470295680.0, "grad_norm": 1.8856643657289736, "language_loss": 0.65249693, "learning_rate": 3.069758718505951e-06, "loss": 0.6777789, "num_input_tokens_seen": 60951145, "step": 2834, "time_per_iteration": 2.894892692565918 }, { "auxiliary_loss_clip": 0.01367981, "auxiliary_loss_mlp": 0.0119587, "balance_loss_clip": 1.00982535, "balance_loss_mlp": 1.00065327, "epoch": 0.3408885949618229, "flos": 28767656100960.0, "grad_norm": 2.8403586930006397, "language_loss": 0.80026847, "learning_rate": 3.0691004620819836e-06, "loss": 0.82590699, "num_input_tokens_seen": 60971275, "step": 2835, "time_per_iteration": 3.670576572418213 }, { "auxiliary_loss_clip": 0.01265481, "auxiliary_loss_mlp": 0.01194665, "balance_loss_clip": 1.00478005, "balance_loss_mlp": 1.00011587, "epoch": 0.341008837852462, "flos": 63576285994080.0, "grad_norm": 0.8553345471819925, "language_loss": 0.60244805, "learning_rate": 3.0684420434763254e-06, "loss": 0.62704951, "num_input_tokens_seen": 61037460, "step": 2836, "time_per_iteration": 4.400849342346191 }, { "auxiliary_loss_clip": 0.01305618, "auxiliary_loss_mlp": 0.01195882, "balance_loss_clip": 1.00886798, "balance_loss_mlp": 1.00066447, "epoch": 0.34112908074310105, "flos": 20812183240320.0, "grad_norm": 1.6978943168002198, "language_loss": 0.76706052, "learning_rate": 3.06778346278886e-06, "loss": 0.79207546, "num_input_tokens_seen": 61056295, "step": 2837, "time_per_iteration": 3.691455841064453 }, { "auxiliary_loss_clip": 0.0136817, "auxiliary_loss_mlp": 0.01195791, "balance_loss_clip": 1.01008153, "balance_loss_mlp": 1.00057387, "epoch": 0.34124932363374016, "flos": 24976459180800.0, "grad_norm": 2.4532084987473946, "language_loss": 0.79069376, "learning_rate": 3.0671247201194906e-06, "loss": 0.81633329, "num_input_tokens_seen": 61078430, "step": 2838, "time_per_iteration": 2.767890691757202 }, { "auxiliary_loss_clip": 0.01309591, "auxiliary_loss_mlp": 0.0119601, "balance_loss_clip": 1.0084281, "balance_loss_mlp": 1.00050712, "epoch": 0.3413695665243792, "flos": 28402983221760.0, "grad_norm": 2.7243142110363157, "language_loss": 0.75259519, "learning_rate": 3.066465815568151e-06, "loss": 0.77765119, "num_input_tokens_seen": 61099260, "step": 2839, "time_per_iteration": 2.879241943359375 }, { "auxiliary_loss_clip": 0.01356637, "auxiliary_loss_mlp": 0.01195594, "balance_loss_clip": 1.00952315, "balance_loss_mlp": 1.00056779, "epoch": 0.34148980941501833, "flos": 25302024381600.0, "grad_norm": 1.9083794256202649, "language_loss": 0.68670356, "learning_rate": 3.0658067492347947e-06, "loss": 0.71222585, "num_input_tokens_seen": 61121900, "step": 2840, "time_per_iteration": 2.853226661682129 }, { "auxiliary_loss_clip": 0.01251613, "auxiliary_loss_mlp": 0.01196632, "balance_loss_clip": 1.00768375, "balance_loss_mlp": 1.00084233, "epoch": 0.34161005230565744, "flos": 17530091403840.0, "grad_norm": 2.2912799845525442, "language_loss": 0.66907114, "learning_rate": 3.065147521219402e-06, "loss": 0.69355357, "num_input_tokens_seen": 61141155, "step": 2841, "time_per_iteration": 3.025380849838257 }, { "auxiliary_loss_clip": 0.01331513, "auxiliary_loss_mlp": 0.01195977, "balance_loss_clip": 1.00967312, "balance_loss_mlp": 1.0009501, "epoch": 0.3417302951962965, "flos": 43650117482400.0, "grad_norm": 1.4849946979447561, "language_loss": 0.74661911, "learning_rate": 3.064488131621977e-06, "loss": 0.77189398, "num_input_tokens_seen": 61164480, "step": 2842, "time_per_iteration": 3.0646188259124756 }, { "auxiliary_loss_clip": 0.01355901, "auxiliary_loss_mlp": 0.01195776, "balance_loss_clip": 1.00935245, "balance_loss_mlp": 1.00065434, "epoch": 0.3418505380869356, "flos": 30882219945120.0, "grad_norm": 1.7180568304031718, "language_loss": 0.73777521, "learning_rate": 3.063828580542549e-06, "loss": 0.76329195, "num_input_tokens_seen": 61185675, "step": 2843, "time_per_iteration": 2.791862964630127 }, { "auxiliary_loss_clip": 0.01331231, "auxiliary_loss_mlp": 0.01195476, "balance_loss_clip": 1.00878704, "balance_loss_mlp": 1.00054455, "epoch": 0.3419707809775747, "flos": 19463881488480.0, "grad_norm": 1.7131773139154278, "language_loss": 0.73514605, "learning_rate": 3.0631688680811706e-06, "loss": 0.76041305, "num_input_tokens_seen": 61205300, "step": 2844, "time_per_iteration": 2.7929623126983643 }, { "auxiliary_loss_clip": 0.01368806, "auxiliary_loss_mlp": 0.0119596, "balance_loss_clip": 1.00952518, "balance_loss_mlp": 1.0006479, "epoch": 0.3420910238682138, "flos": 28727829948960.0, "grad_norm": 1.9166665857378424, "language_loss": 0.75269026, "learning_rate": 3.062508994337921e-06, "loss": 0.77833796, "num_input_tokens_seen": 61224905, "step": 2845, "time_per_iteration": 2.803802967071533 }, { "auxiliary_loss_clip": 0.01345764, "auxiliary_loss_mlp": 0.0119614, "balance_loss_clip": 1.00888324, "balance_loss_mlp": 1.00063658, "epoch": 0.3422112667588529, "flos": 21397276412640.0, "grad_norm": 2.1839182658029452, "language_loss": 0.79778302, "learning_rate": 3.0618489594129013e-06, "loss": 0.82320207, "num_input_tokens_seen": 61243045, "step": 2846, "time_per_iteration": 2.7182369232177734 }, { "auxiliary_loss_clip": 0.01311, "auxiliary_loss_mlp": 0.01196023, "balance_loss_clip": 1.00881803, "balance_loss_mlp": 1.00071096, "epoch": 0.342331509649492, "flos": 13881458656800.0, "grad_norm": 1.902765573238903, "language_loss": 0.70950598, "learning_rate": 3.061188763406239e-06, "loss": 0.73457623, "num_input_tokens_seen": 61259190, "step": 2847, "time_per_iteration": 2.783177375793457 }, { "auxiliary_loss_clip": 0.01335469, "auxiliary_loss_mlp": 0.01196295, "balance_loss_clip": 1.00932932, "balance_loss_mlp": 1.00079203, "epoch": 0.34245175254013105, "flos": 28621463636160.0, "grad_norm": 2.0081557005785946, "language_loss": 0.8222537, "learning_rate": 3.060528406418085e-06, "loss": 0.84757137, "num_input_tokens_seen": 61279040, "step": 2848, "time_per_iteration": 2.7817752361297607 }, { "auxiliary_loss_clip": 0.0133734, "auxiliary_loss_mlp": 0.01195798, "balance_loss_clip": 1.00931096, "balance_loss_mlp": 1.00067663, "epoch": 0.34257199543077016, "flos": 34127072134560.0, "grad_norm": 2.613777613523175, "language_loss": 0.61700833, "learning_rate": 3.0598678885486145e-06, "loss": 0.64233977, "num_input_tokens_seen": 61301580, "step": 2849, "time_per_iteration": 2.9420344829559326 }, { "auxiliary_loss_clip": 0.01326673, "auxiliary_loss_mlp": 0.00873135, "balance_loss_clip": 1.00911903, "balance_loss_mlp": 1.00117874, "epoch": 0.34269223832140927, "flos": 19974028358880.0, "grad_norm": 2.1542440960150824, "language_loss": 0.74276394, "learning_rate": 3.0592072098980282e-06, "loss": 0.76476204, "num_input_tokens_seen": 61321240, "step": 2850, "time_per_iteration": 2.719930410385132 }, { "auxiliary_loss_clip": 0.01332122, "auxiliary_loss_mlp": 0.01195858, "balance_loss_clip": 1.0099318, "balance_loss_mlp": 1.00073671, "epoch": 0.3428124812120483, "flos": 27235670771520.0, "grad_norm": 2.1180740640207687, "language_loss": 0.73225272, "learning_rate": 3.0585463705665514e-06, "loss": 0.75753248, "num_input_tokens_seen": 61341615, "step": 2851, "time_per_iteration": 2.8372585773468018 }, { "auxiliary_loss_clip": 0.01322067, "auxiliary_loss_mlp": 0.01195709, "balance_loss_clip": 1.00861216, "balance_loss_mlp": 1.00068283, "epoch": 0.34293272410268744, "flos": 24570882439200.0, "grad_norm": 2.512352719674874, "language_loss": 0.70497978, "learning_rate": 3.0578853706544304e-06, "loss": 0.73015749, "num_input_tokens_seen": 61359005, "step": 2852, "time_per_iteration": 2.8249459266662598 }, { "auxiliary_loss_clip": 0.01319884, "auxiliary_loss_mlp": 0.00873055, "balance_loss_clip": 1.01017952, "balance_loss_mlp": 1.0009954, "epoch": 0.34305296699332655, "flos": 21506875856640.0, "grad_norm": 1.9283025453362517, "language_loss": 0.65149975, "learning_rate": 3.0572242102619404e-06, "loss": 0.67342913, "num_input_tokens_seen": 61376160, "step": 2853, "time_per_iteration": 3.297806739807129 }, { "auxiliary_loss_clip": 0.01324497, "auxiliary_loss_mlp": 0.01195707, "balance_loss_clip": 1.00888932, "balance_loss_mlp": 1.00058544, "epoch": 0.3431732098839656, "flos": 24056676192960.0, "grad_norm": 1.7634066860306, "language_loss": 0.80462909, "learning_rate": 3.0565628894893784e-06, "loss": 0.82983112, "num_input_tokens_seen": 61396795, "step": 2854, "time_per_iteration": 2.786482334136963 }, { "auxiliary_loss_clip": 0.01357488, "auxiliary_loss_mlp": 0.01195722, "balance_loss_clip": 1.00999343, "balance_loss_mlp": 1.00069535, "epoch": 0.3432934527746047, "flos": 16800889340160.0, "grad_norm": 1.7270177049553659, "language_loss": 0.74632972, "learning_rate": 3.0559014084370655e-06, "loss": 0.77186179, "num_input_tokens_seen": 61415320, "step": 2855, "time_per_iteration": 2.695681571960449 }, { "auxiliary_loss_clip": 0.01332999, "auxiliary_loss_mlp": 0.01196197, "balance_loss_clip": 1.0084691, "balance_loss_mlp": 1.00059795, "epoch": 0.34341369566524377, "flos": 23439731925600.0, "grad_norm": 1.645466684507048, "language_loss": 0.7813046, "learning_rate": 3.055239767205349e-06, "loss": 0.80659658, "num_input_tokens_seen": 61437070, "step": 2856, "time_per_iteration": 2.8878729343414307 }, { "auxiliary_loss_clip": 0.01343466, "auxiliary_loss_mlp": 0.0119617, "balance_loss_clip": 1.00943553, "balance_loss_mlp": 1.00076175, "epoch": 0.3435339385558829, "flos": 17267473995840.0, "grad_norm": 2.0507523649264474, "language_loss": 0.78472096, "learning_rate": 3.054577965894599e-06, "loss": 0.81011736, "num_input_tokens_seen": 61453215, "step": 2857, "time_per_iteration": 2.6701157093048096 }, { "auxiliary_loss_clip": 0.01328486, "auxiliary_loss_mlp": 0.01196241, "balance_loss_clip": 1.00959635, "balance_loss_mlp": 1.00083327, "epoch": 0.343654181446522, "flos": 22199377128480.0, "grad_norm": 1.8202500374466701, "language_loss": 0.70328587, "learning_rate": 3.0539160046052094e-06, "loss": 0.72853315, "num_input_tokens_seen": 61472915, "step": 2858, "time_per_iteration": 2.7608070373535156 }, { "auxiliary_loss_clip": 0.01333244, "auxiliary_loss_mlp": 0.01196539, "balance_loss_clip": 1.00922692, "balance_loss_mlp": 1.00074971, "epoch": 0.34377442433716104, "flos": 19901812256640.0, "grad_norm": 2.3988398050793585, "language_loss": 0.70306689, "learning_rate": 3.0532538834376003e-06, "loss": 0.72836471, "num_input_tokens_seen": 61492475, "step": 2859, "time_per_iteration": 3.7683937549591064 }, { "auxiliary_loss_clip": 0.01356221, "auxiliary_loss_mlp": 0.01195707, "balance_loss_clip": 1.00963664, "balance_loss_mlp": 1.00058556, "epoch": 0.34389466722780015, "flos": 22197688715520.0, "grad_norm": 1.7450358916769944, "language_loss": 0.78012002, "learning_rate": 3.0525916024922143e-06, "loss": 0.80563939, "num_input_tokens_seen": 61511660, "step": 2860, "time_per_iteration": 2.7265000343322754 }, { "auxiliary_loss_clip": 0.01343902, "auxiliary_loss_mlp": 0.01195923, "balance_loss_clip": 1.009709, "balance_loss_mlp": 1.0007056, "epoch": 0.34401491011843927, "flos": 18624577049280.0, "grad_norm": 2.5038172346890137, "language_loss": 0.8405416, "learning_rate": 3.0519291618695193e-06, "loss": 0.86593986, "num_input_tokens_seen": 61529060, "step": 2861, "time_per_iteration": 3.748439073562622 }, { "auxiliary_loss_clip": 0.01319876, "auxiliary_loss_mlp": 0.01195579, "balance_loss_clip": 1.00889099, "balance_loss_mlp": 1.00055265, "epoch": 0.3441351530090783, "flos": 17858207185920.0, "grad_norm": 1.8573031815421162, "language_loss": 0.7557019, "learning_rate": 3.0512665616700065e-06, "loss": 0.78085649, "num_input_tokens_seen": 61548125, "step": 2862, "time_per_iteration": 3.889638662338257 }, { "auxiliary_loss_clip": 0.01298844, "auxiliary_loss_mlp": 0.01195645, "balance_loss_clip": 1.00804687, "balance_loss_mlp": 1.00052321, "epoch": 0.34425539589971743, "flos": 23112765701280.0, "grad_norm": 1.9352771412980383, "language_loss": 0.89388907, "learning_rate": 3.0506038019941933e-06, "loss": 0.91883397, "num_input_tokens_seen": 61568135, "step": 2863, "time_per_iteration": 4.725840330123901 }, { "auxiliary_loss_clip": 0.01300728, "auxiliary_loss_mlp": 0.01196308, "balance_loss_clip": 1.007797, "balance_loss_mlp": 1.00080419, "epoch": 0.34437563879035654, "flos": 21907710672480.0, "grad_norm": 2.140428774776077, "language_loss": 0.67618787, "learning_rate": 3.049940882942617e-06, "loss": 0.70115817, "num_input_tokens_seen": 61586920, "step": 2864, "time_per_iteration": 2.7734265327453613 }, { "auxiliary_loss_clip": 0.01368197, "auxiliary_loss_mlp": 0.01195891, "balance_loss_clip": 1.00982904, "balance_loss_mlp": 1.00067425, "epoch": 0.3444958816809956, "flos": 23076927077760.0, "grad_norm": 2.21625054213089, "language_loss": 0.80200255, "learning_rate": 3.0492778046158448e-06, "loss": 0.82764339, "num_input_tokens_seen": 61608340, "step": 2865, "time_per_iteration": 2.7326712608337402 }, { "auxiliary_loss_clip": 0.0134444, "auxiliary_loss_mlp": 0.01195803, "balance_loss_clip": 1.00940776, "balance_loss_mlp": 1.0006814, "epoch": 0.3446161245716347, "flos": 21908644688160.0, "grad_norm": 1.8482720560262103, "language_loss": 0.76687682, "learning_rate": 3.0486145671144633e-06, "loss": 0.7922793, "num_input_tokens_seen": 61628130, "step": 2866, "time_per_iteration": 2.7853550910949707 }, { "auxiliary_loss_clip": 0.01281666, "auxiliary_loss_mlp": 0.01196135, "balance_loss_clip": 1.00958109, "balance_loss_mlp": 1.00072718, "epoch": 0.3447363674622738, "flos": 25112844480960.0, "grad_norm": 2.8035073615093054, "language_loss": 0.77242458, "learning_rate": 3.047951170539086e-06, "loss": 0.79720271, "num_input_tokens_seen": 61647755, "step": 2867, "time_per_iteration": 2.8950467109680176 }, { "auxiliary_loss_clip": 0.01302429, "auxiliary_loss_mlp": 0.01195486, "balance_loss_clip": 1.00877154, "balance_loss_mlp": 1.00064981, "epoch": 0.3448566103529129, "flos": 11984692677120.0, "grad_norm": 1.831741908573835, "language_loss": 0.83893728, "learning_rate": 3.047287614990349e-06, "loss": 0.8639164, "num_input_tokens_seen": 61665675, "step": 2868, "time_per_iteration": 2.720881223678589 }, { "auxiliary_loss_clip": 0.01332205, "auxiliary_loss_mlp": 0.01196123, "balance_loss_clip": 1.00927997, "balance_loss_mlp": 1.00081015, "epoch": 0.344976853243552, "flos": 40187898512640.0, "grad_norm": 2.9835264247373576, "language_loss": 0.61961752, "learning_rate": 3.046623900568914e-06, "loss": 0.6449008, "num_input_tokens_seen": 61688240, "step": 2869, "time_per_iteration": 2.9253108501434326 }, { "auxiliary_loss_clip": 0.01332392, "auxiliary_loss_mlp": 0.0119682, "balance_loss_clip": 1.01004434, "balance_loss_mlp": 1.00074458, "epoch": 0.34509709613419104, "flos": 28723662802080.0, "grad_norm": 2.502314165480009, "language_loss": 0.70010549, "learning_rate": 3.045960027375465e-06, "loss": 0.72539759, "num_input_tokens_seen": 61706075, "step": 2870, "time_per_iteration": 2.828622341156006 }, { "auxiliary_loss_clip": 0.01356189, "auxiliary_loss_mlp": 0.01196201, "balance_loss_clip": 1.00964403, "balance_loss_mlp": 1.00060225, "epoch": 0.34521733902483015, "flos": 29967609967200.0, "grad_norm": 2.4519360409685453, "language_loss": 0.82673049, "learning_rate": 3.045295995510711e-06, "loss": 0.85225439, "num_input_tokens_seen": 61723045, "step": 2871, "time_per_iteration": 2.784177541732788 }, { "auxiliary_loss_clip": 0.01330398, "auxiliary_loss_mlp": 0.01195678, "balance_loss_clip": 1.00978827, "balance_loss_mlp": 1.00065124, "epoch": 0.34533758191546926, "flos": 27923070880800.0, "grad_norm": 1.8536411649713185, "language_loss": 0.73376483, "learning_rate": 3.0446318050753865e-06, "loss": 0.75902557, "num_input_tokens_seen": 61743525, "step": 2872, "time_per_iteration": 2.818936347961426 }, { "auxiliary_loss_clip": 0.01355536, "auxiliary_loss_mlp": 0.01195319, "balance_loss_clip": 1.00955629, "balance_loss_mlp": 1.00048375, "epoch": 0.3454578248061083, "flos": 27125891709120.0, "grad_norm": 1.9431953486645868, "language_loss": 0.77233249, "learning_rate": 3.0439674561702474e-06, "loss": 0.79784101, "num_input_tokens_seen": 61763025, "step": 2873, "time_per_iteration": 2.7175567150115967 }, { "auxiliary_loss_clip": 0.0134437, "auxiliary_loss_mlp": 0.01195765, "balance_loss_clip": 1.0089581, "balance_loss_mlp": 1.00073838, "epoch": 0.3455780676967474, "flos": 19024908933600.0, "grad_norm": 10.558915858840566, "language_loss": 0.87991786, "learning_rate": 3.043302948896076e-06, "loss": 0.90531921, "num_input_tokens_seen": 61781630, "step": 2874, "time_per_iteration": 2.87382435798645 }, { "auxiliary_loss_clip": 0.01287045, "auxiliary_loss_mlp": 0.01196146, "balance_loss_clip": 1.0077951, "balance_loss_mlp": 1.00073767, "epoch": 0.34569831058738654, "flos": 34496019931680.0, "grad_norm": 1.72314724702129, "language_loss": 0.60674393, "learning_rate": 3.0426382833536756e-06, "loss": 0.63157582, "num_input_tokens_seen": 61804985, "step": 2875, "time_per_iteration": 2.869091272354126 }, { "auxiliary_loss_clip": 0.01321728, "auxiliary_loss_mlp": 0.01195795, "balance_loss_clip": 1.00945401, "balance_loss_mlp": 1.0005784, "epoch": 0.3458185534780256, "flos": 31138694403840.0, "grad_norm": 2.69013192773742, "language_loss": 0.77731627, "learning_rate": 3.041973459643877e-06, "loss": 0.80249155, "num_input_tokens_seen": 61824440, "step": 2876, "time_per_iteration": 2.9055287837982178 }, { "auxiliary_loss_clip": 0.01305572, "auxiliary_loss_mlp": 0.01196008, "balance_loss_clip": 1.00849211, "balance_loss_mlp": 1.00069523, "epoch": 0.3459387963686647, "flos": 32452522632000.0, "grad_norm": 2.093459666493627, "language_loss": 0.66912889, "learning_rate": 3.0413084778675334e-06, "loss": 0.69414461, "num_input_tokens_seen": 61845690, "step": 2877, "time_per_iteration": 2.9672210216522217 }, { "auxiliary_loss_clip": 0.01343329, "auxiliary_loss_mlp": 0.00873078, "balance_loss_clip": 1.00939512, "balance_loss_mlp": 1.00114441, "epoch": 0.3460590392593038, "flos": 24675668110080.0, "grad_norm": 2.0705709899229485, "language_loss": 0.83938026, "learning_rate": 3.0406433381255214e-06, "loss": 0.86154437, "num_input_tokens_seen": 61863725, "step": 2878, "time_per_iteration": 2.764577865600586 }, { "auxiliary_loss_clip": 0.01342748, "auxiliary_loss_mlp": 0.01195896, "balance_loss_clip": 1.00950909, "balance_loss_mlp": 1.00067878, "epoch": 0.34617928214994287, "flos": 18807326611200.0, "grad_norm": 3.0939618231089003, "language_loss": 0.82387763, "learning_rate": 3.0399780405187425e-06, "loss": 0.84926403, "num_input_tokens_seen": 61882720, "step": 2879, "time_per_iteration": 2.731924057006836 }, { "auxiliary_loss_clip": 0.01355318, "auxiliary_loss_mlp": 0.01195853, "balance_loss_clip": 1.00943708, "balance_loss_mlp": 1.00073123, "epoch": 0.346299525040582, "flos": 24857663274720.0, "grad_norm": 1.745729240382644, "language_loss": 0.78707796, "learning_rate": 3.0393125851481216e-06, "loss": 0.81258965, "num_input_tokens_seen": 61902595, "step": 2880, "time_per_iteration": 2.759922742843628 }, { "auxiliary_loss_clip": 0.01310832, "auxiliary_loss_mlp": 0.01195864, "balance_loss_clip": 1.00822306, "balance_loss_mlp": 1.00074196, "epoch": 0.3464197679312211, "flos": 16434923208480.0, "grad_norm": 2.1721177495625037, "language_loss": 0.86310315, "learning_rate": 3.038646972114608e-06, "loss": 0.88817006, "num_input_tokens_seen": 61918920, "step": 2881, "time_per_iteration": 2.7762365341186523 }, { "auxiliary_loss_clip": 0.01304881, "auxiliary_loss_mlp": 0.01195906, "balance_loss_clip": 1.00948536, "balance_loss_mlp": 1.00068843, "epoch": 0.34654001082186014, "flos": 22382485927200.0, "grad_norm": 1.6203180163110655, "language_loss": 0.67357767, "learning_rate": 3.037981201519174e-06, "loss": 0.69858551, "num_input_tokens_seen": 61939520, "step": 2882, "time_per_iteration": 2.7898402214050293 }, { "auxiliary_loss_clip": 0.0134343, "auxiliary_loss_mlp": 0.01195691, "balance_loss_clip": 1.00900316, "balance_loss_mlp": 1.00056875, "epoch": 0.34666025371249926, "flos": 19573912016640.0, "grad_norm": 1.8778949528416837, "language_loss": 0.71201158, "learning_rate": 3.0373152734628175e-06, "loss": 0.7374028, "num_input_tokens_seen": 61957800, "step": 2883, "time_per_iteration": 2.74587345123291 }, { "auxiliary_loss_clip": 0.01356279, "auxiliary_loss_mlp": 0.01196063, "balance_loss_clip": 1.00953758, "balance_loss_mlp": 1.00065494, "epoch": 0.34678049660313837, "flos": 15267646681920.0, "grad_norm": 3.576506942294615, "language_loss": 0.76073152, "learning_rate": 3.0366491880465584e-06, "loss": 0.78625494, "num_input_tokens_seen": 61975820, "step": 2884, "time_per_iteration": 2.6839377880096436 }, { "auxiliary_loss_clip": 0.01369222, "auxiliary_loss_mlp": 0.01196425, "balance_loss_clip": 1.01048374, "balance_loss_mlp": 1.00101757, "epoch": 0.3469007394937774, "flos": 21181562121600.0, "grad_norm": 1.4381248206988297, "language_loss": 0.82192987, "learning_rate": 3.035982945371443e-06, "loss": 0.84758633, "num_input_tokens_seen": 61997515, "step": 2885, "time_per_iteration": 3.7112748622894287 }, { "auxiliary_loss_clip": 0.01340838, "auxiliary_loss_mlp": 0.01196077, "balance_loss_clip": 1.0095861, "balance_loss_mlp": 1.0008595, "epoch": 0.34702098238441653, "flos": 22375480809600.0, "grad_norm": 2.1347273748367432, "language_loss": 0.85710144, "learning_rate": 3.035316545538537e-06, "loss": 0.88247055, "num_input_tokens_seen": 62016310, "step": 2886, "time_per_iteration": 2.754328727722168 }, { "auxiliary_loss_clip": 0.01320368, "auxiliary_loss_mlp": 0.01195607, "balance_loss_clip": 1.00976086, "balance_loss_mlp": 1.00058067, "epoch": 0.3471412252750556, "flos": 22929441360480.0, "grad_norm": 2.0399930547476472, "language_loss": 0.79168302, "learning_rate": 3.034649988648935e-06, "loss": 0.81684279, "num_input_tokens_seen": 62036075, "step": 2887, "time_per_iteration": 3.671323299407959 }, { "auxiliary_loss_clip": 0.01329827, "auxiliary_loss_mlp": 0.01195623, "balance_loss_clip": 1.00832248, "balance_loss_mlp": 1.00050116, "epoch": 0.3472614681656947, "flos": 21324269989440.0, "grad_norm": 1.776155772120668, "language_loss": 0.80747569, "learning_rate": 3.033983274803752e-06, "loss": 0.83273017, "num_input_tokens_seen": 62055865, "step": 2888, "time_per_iteration": 5.326783895492554 }, { "auxiliary_loss_clip": 0.01342953, "auxiliary_loss_mlp": 0.01195783, "balance_loss_clip": 1.009642, "balance_loss_mlp": 1.00056553, "epoch": 0.3473817110563338, "flos": 23475750167520.0, "grad_norm": 2.0206676385428954, "language_loss": 0.72657925, "learning_rate": 3.0333164041041283e-06, "loss": 0.7519666, "num_input_tokens_seen": 62072180, "step": 2889, "time_per_iteration": 2.7528717517852783 }, { "auxiliary_loss_clip": 0.01279893, "auxiliary_loss_mlp": 0.01195944, "balance_loss_clip": 1.00771499, "balance_loss_mlp": 1.00072718, "epoch": 0.34750195394697286, "flos": 22346036601120.0, "grad_norm": 1.8133728850176036, "language_loss": 0.71908414, "learning_rate": 3.032649376651228e-06, "loss": 0.74384248, "num_input_tokens_seen": 62091600, "step": 2890, "time_per_iteration": 2.8999035358428955 }, { "auxiliary_loss_clip": 0.01306231, "auxiliary_loss_mlp": 0.01195927, "balance_loss_clip": 1.0080961, "balance_loss_mlp": 1.00051868, "epoch": 0.347622196837612, "flos": 29095017485760.0, "grad_norm": 1.671996679760499, "language_loss": 0.7604112, "learning_rate": 3.031982192546238e-06, "loss": 0.78543276, "num_input_tokens_seen": 62114695, "step": 2891, "time_per_iteration": 2.8758649826049805 }, { "auxiliary_loss_clip": 0.01355578, "auxiliary_loss_mlp": 0.01195951, "balance_loss_clip": 1.00979114, "balance_loss_mlp": 1.00073409, "epoch": 0.3477424397282511, "flos": 22455743816160.0, "grad_norm": 3.941087022649192, "language_loss": 0.94564587, "learning_rate": 3.0313148518903696e-06, "loss": 0.97116119, "num_input_tokens_seen": 62134520, "step": 2892, "time_per_iteration": 3.041316270828247 }, { "auxiliary_loss_clip": 0.01331174, "auxiliary_loss_mlp": 0.01196017, "balance_loss_clip": 1.00985098, "balance_loss_mlp": 1.00080013, "epoch": 0.34786268261889014, "flos": 15778799415360.0, "grad_norm": 2.9558489867804223, "language_loss": 0.81003845, "learning_rate": 3.030647354784859e-06, "loss": 0.83531034, "num_input_tokens_seen": 62151560, "step": 2893, "time_per_iteration": 2.6809206008911133 }, { "auxiliary_loss_clip": 0.01306845, "auxiliary_loss_mlp": 0.0119584, "balance_loss_clip": 1.00797415, "balance_loss_mlp": 1.00062311, "epoch": 0.34798292550952925, "flos": 20777637869280.0, "grad_norm": 1.6852130407342818, "language_loss": 0.77289307, "learning_rate": 3.029979701330964e-06, "loss": 0.79791999, "num_input_tokens_seen": 62170985, "step": 2894, "time_per_iteration": 2.8215882778167725 }, { "auxiliary_loss_clip": 0.01340443, "auxiliary_loss_mlp": 0.01196023, "balance_loss_clip": 1.00949597, "balance_loss_mlp": 1.00071037, "epoch": 0.34810316840016836, "flos": 19937830498560.0, "grad_norm": 2.061655207001365, "language_loss": 0.80406559, "learning_rate": 3.029311891629966e-06, "loss": 0.82943028, "num_input_tokens_seen": 62189440, "step": 2895, "time_per_iteration": 2.7691683769226074 }, { "auxiliary_loss_clip": 0.01322694, "auxiliary_loss_mlp": 0.01195923, "balance_loss_clip": 1.00827634, "balance_loss_mlp": 1.0007056, "epoch": 0.3482234112908074, "flos": 23623307732160.0, "grad_norm": 1.722451282014761, "language_loss": 0.74503714, "learning_rate": 3.0286439257831744e-06, "loss": 0.77022326, "num_input_tokens_seen": 62208910, "step": 2896, "time_per_iteration": 2.7120349407196045 }, { "auxiliary_loss_clip": 0.01368197, "auxiliary_loss_mlp": 0.01196356, "balance_loss_clip": 1.00961697, "balance_loss_mlp": 1.00085258, "epoch": 0.3483436541814465, "flos": 23986723282560.0, "grad_norm": 2.0938154921215304, "language_loss": 0.71477485, "learning_rate": 3.0279758038919156e-06, "loss": 0.74042034, "num_input_tokens_seen": 62227135, "step": 2897, "time_per_iteration": 2.723689556121826 }, { "auxiliary_loss_clip": 0.01345321, "auxiliary_loss_mlp": 0.01196702, "balance_loss_clip": 1.00927687, "balance_loss_mlp": 1.0008173, "epoch": 0.34846389707208564, "flos": 22638349683360.0, "grad_norm": 1.91480198681049, "language_loss": 0.78284109, "learning_rate": 3.0273075260575455e-06, "loss": 0.80826128, "num_input_tokens_seen": 62246035, "step": 2898, "time_per_iteration": 2.720674514770508 }, { "auxiliary_loss_clip": 0.01336899, "auxiliary_loss_mlp": 0.01196256, "balance_loss_clip": 1.00968552, "balance_loss_mlp": 1.00075245, "epoch": 0.3485841399627247, "flos": 21792866371200.0, "grad_norm": 1.7499662022929872, "language_loss": 0.81018448, "learning_rate": 3.0266390923814396e-06, "loss": 0.83551604, "num_input_tokens_seen": 62264095, "step": 2899, "time_per_iteration": 2.7219836711883545 }, { "auxiliary_loss_clip": 0.01330439, "auxiliary_loss_mlp": 0.01195953, "balance_loss_clip": 1.01012325, "balance_loss_mlp": 1.000736, "epoch": 0.3487043828533638, "flos": 17019046441440.0, "grad_norm": 1.8168203655165047, "language_loss": 0.82003337, "learning_rate": 3.0259705029650008e-06, "loss": 0.84529734, "num_input_tokens_seen": 62282025, "step": 2900, "time_per_iteration": 2.714294910430908 }, { "auxiliary_loss_clip": 0.01354802, "auxiliary_loss_mlp": 0.01195841, "balance_loss_clip": 1.00954485, "balance_loss_mlp": 1.00062358, "epoch": 0.34882462574400286, "flos": 22601145960000.0, "grad_norm": 2.005430488728367, "language_loss": 0.7289865, "learning_rate": 3.025301757909652e-06, "loss": 0.75449288, "num_input_tokens_seen": 62302220, "step": 2901, "time_per_iteration": 2.7619636058807373 }, { "auxiliary_loss_clip": 0.01302749, "auxiliary_loss_mlp": 0.00873374, "balance_loss_clip": 1.00769806, "balance_loss_mlp": 1.00149918, "epoch": 0.34894486863464197, "flos": 29861531043840.0, "grad_norm": 1.6099553401437565, "language_loss": 0.80806398, "learning_rate": 3.024632857316842e-06, "loss": 0.82982528, "num_input_tokens_seen": 62323535, "step": 2902, "time_per_iteration": 2.81544828414917 }, { "auxiliary_loss_clip": 0.01345776, "auxiliary_loss_mlp": 0.01195933, "balance_loss_clip": 1.00944328, "balance_loss_mlp": 1.00081098, "epoch": 0.3490651115252811, "flos": 22122275405760.0, "grad_norm": 1.652348117523513, "language_loss": 0.77508676, "learning_rate": 3.0239638012880412e-06, "loss": 0.80050385, "num_input_tokens_seen": 62343430, "step": 2903, "time_per_iteration": 2.6893503665924072 }, { "auxiliary_loss_clip": 0.01290342, "auxiliary_loss_mlp": 0.01196581, "balance_loss_clip": 1.00808835, "balance_loss_mlp": 1.0008868, "epoch": 0.34918535441592014, "flos": 12676691017440.0, "grad_norm": 2.1228848793196184, "language_loss": 0.81383944, "learning_rate": 3.0232945899247466e-06, "loss": 0.83870864, "num_input_tokens_seen": 62360365, "step": 2904, "time_per_iteration": 2.7740650177001953 }, { "auxiliary_loss_clip": 0.01356318, "auxiliary_loss_mlp": 0.01195708, "balance_loss_clip": 1.00981736, "balance_loss_mlp": 1.00068176, "epoch": 0.34930559730655925, "flos": 23185628429760.0, "grad_norm": 1.8203701655714408, "language_loss": 0.77313769, "learning_rate": 3.022625223328476e-06, "loss": 0.79865789, "num_input_tokens_seen": 62382105, "step": 2905, "time_per_iteration": 2.8152353763580322 }, { "auxiliary_loss_clip": 0.01351484, "auxiliary_loss_mlp": 0.01196456, "balance_loss_clip": 1.00992656, "balance_loss_mlp": 1.0009532, "epoch": 0.34942584019719836, "flos": 22855033913760.0, "grad_norm": 1.368861273502754, "language_loss": 0.68979657, "learning_rate": 3.0219557016007723e-06, "loss": 0.71527588, "num_input_tokens_seen": 62402235, "step": 2906, "time_per_iteration": 2.7468597888946533 }, { "auxiliary_loss_clip": 0.01342286, "auxiliary_loss_mlp": 0.01195891, "balance_loss_clip": 1.00974584, "balance_loss_mlp": 1.00067437, "epoch": 0.3495460830878374, "flos": 24426055074240.0, "grad_norm": 1.8190095573758205, "language_loss": 0.69406897, "learning_rate": 3.021286024843202e-06, "loss": 0.71945077, "num_input_tokens_seen": 62420430, "step": 2907, "time_per_iteration": 2.694854259490967 }, { "auxiliary_loss_clip": 0.01341986, "auxiliary_loss_mlp": 0.01194659, "balance_loss_clip": 1.00572705, "balance_loss_mlp": 1.00010955, "epoch": 0.3496663259784765, "flos": 70008777159840.0, "grad_norm": 1.0675119701532478, "language_loss": 0.64799404, "learning_rate": 3.0206161931573526e-06, "loss": 0.67336059, "num_input_tokens_seen": 62472980, "step": 2908, "time_per_iteration": 3.16831111907959 }, { "auxiliary_loss_clip": 0.01330978, "auxiliary_loss_mlp": 0.01195707, "balance_loss_clip": 1.00896168, "balance_loss_mlp": 1.00068069, "epoch": 0.34978656886911563, "flos": 28692817570080.0, "grad_norm": 1.8355278884954553, "language_loss": 0.93049508, "learning_rate": 3.0199462066448388e-06, "loss": 0.95576197, "num_input_tokens_seen": 62495175, "step": 2909, "time_per_iteration": 2.8219072818756104 }, { "auxiliary_loss_clip": 0.01346255, "auxiliary_loss_mlp": 0.01195856, "balance_loss_clip": 1.00937176, "balance_loss_mlp": 1.00063872, "epoch": 0.3499068117597547, "flos": 21142167053760.0, "grad_norm": 1.7202936453764401, "language_loss": 0.6904285, "learning_rate": 3.019276065407296e-06, "loss": 0.71584952, "num_input_tokens_seen": 62514295, "step": 2910, "time_per_iteration": 2.733278751373291 }, { "auxiliary_loss_clip": 0.01295141, "auxiliary_loss_mlp": 0.01196511, "balance_loss_clip": 1.00868702, "balance_loss_mlp": 1.00100756, "epoch": 0.3500270546503938, "flos": 22782710040480.0, "grad_norm": 1.8373349369512697, "language_loss": 0.808303, "learning_rate": 3.018605769546385e-06, "loss": 0.83321953, "num_input_tokens_seen": 62534850, "step": 2911, "time_per_iteration": 3.798492908477783 }, { "auxiliary_loss_clip": 0.01355685, "auxiliary_loss_mlp": 0.01196367, "balance_loss_clip": 1.00983012, "balance_loss_mlp": 1.00086367, "epoch": 0.3501472975410329, "flos": 22894069744800.0, "grad_norm": 1.7399831504984011, "language_loss": 0.7966013, "learning_rate": 3.017935319163788e-06, "loss": 0.8221218, "num_input_tokens_seen": 62553810, "step": 2912, "time_per_iteration": 2.7348380088806152 }, { "auxiliary_loss_clip": 0.01344147, "auxiliary_loss_mlp": 0.01196596, "balance_loss_clip": 1.00939977, "balance_loss_mlp": 1.00071096, "epoch": 0.35026754043167196, "flos": 25446600280800.0, "grad_norm": 1.5730354995747735, "language_loss": 0.70321339, "learning_rate": 3.017264714361213e-06, "loss": 0.72862077, "num_input_tokens_seen": 62573460, "step": 2913, "time_per_iteration": 3.6449270248413086 }, { "auxiliary_loss_clip": 0.01323014, "auxiliary_loss_mlp": 0.0087337, "balance_loss_clip": 1.00869179, "balance_loss_mlp": 1.00153518, "epoch": 0.3503877833223111, "flos": 19573768321920.0, "grad_norm": 1.852463892267021, "language_loss": 0.82445425, "learning_rate": 3.016593955240389e-06, "loss": 0.84641814, "num_input_tokens_seen": 62592150, "step": 2914, "time_per_iteration": 3.7642855644226074 }, { "auxiliary_loss_clip": 0.01317302, "auxiliary_loss_mlp": 0.01193925, "balance_loss_clip": 1.0045712, "balance_loss_mlp": 1.00013852, "epoch": 0.3505080262129502, "flos": 65072109386880.0, "grad_norm": 0.8167697123037925, "language_loss": 0.63710815, "learning_rate": 3.015923041903071e-06, "loss": 0.66222042, "num_input_tokens_seen": 62658275, "step": 2915, "time_per_iteration": 4.495916843414307 }, { "auxiliary_loss_clip": 0.01343992, "auxiliary_loss_mlp": 0.01195811, "balance_loss_clip": 1.00980353, "balance_loss_mlp": 1.00059402, "epoch": 0.35062826910358924, "flos": 29314575610560.0, "grad_norm": 1.8954028674861256, "language_loss": 0.83024144, "learning_rate": 3.0152519744510347e-06, "loss": 0.85563946, "num_input_tokens_seen": 62678075, "step": 2916, "time_per_iteration": 2.758620262145996 }, { "auxiliary_loss_clip": 0.0130955, "auxiliary_loss_mlp": 0.01196025, "balance_loss_clip": 1.00829959, "balance_loss_mlp": 1.00080764, "epoch": 0.35074851199422835, "flos": 23987729145600.0, "grad_norm": 1.7605789287178557, "language_loss": 0.82919288, "learning_rate": 3.014580752986081e-06, "loss": 0.85424864, "num_input_tokens_seen": 62696950, "step": 2917, "time_per_iteration": 2.802689790725708 }, { "auxiliary_loss_clip": 0.01288499, "auxiliary_loss_mlp": 0.0119624, "balance_loss_clip": 1.00820065, "balance_loss_mlp": 1.00092721, "epoch": 0.3508687548848674, "flos": 15224443704000.0, "grad_norm": 1.899605544595943, "language_loss": 0.78368163, "learning_rate": 3.0139093776100345e-06, "loss": 0.80852902, "num_input_tokens_seen": 62713540, "step": 2918, "time_per_iteration": 2.773714542388916 }, { "auxiliary_loss_clip": 0.0136675, "auxiliary_loss_mlp": 0.01195419, "balance_loss_clip": 1.00968909, "balance_loss_mlp": 1.0004884, "epoch": 0.3509889977755065, "flos": 21361761102240.0, "grad_norm": 1.6599144884252104, "language_loss": 0.75066859, "learning_rate": 3.013237848424741e-06, "loss": 0.7762903, "num_input_tokens_seen": 62732925, "step": 2919, "time_per_iteration": 2.7283568382263184 }, { "auxiliary_loss_clip": 0.01324663, "auxiliary_loss_mlp": 0.01195812, "balance_loss_clip": 1.00909281, "balance_loss_mlp": 1.00078535, "epoch": 0.35110924066614563, "flos": 19135370545920.0, "grad_norm": 2.1200545707478127, "language_loss": 0.75526041, "learning_rate": 3.012566165532072e-06, "loss": 0.78046519, "num_input_tokens_seen": 62751715, "step": 2920, "time_per_iteration": 2.7491939067840576 }, { "auxiliary_loss_clip": 0.01279193, "auxiliary_loss_mlp": 0.01196121, "balance_loss_clip": 1.00948715, "balance_loss_mlp": 1.00080895, "epoch": 0.3512294835567847, "flos": 21980896714080.0, "grad_norm": 2.3158663929977648, "language_loss": 0.76632726, "learning_rate": 3.0118943290339207e-06, "loss": 0.79108042, "num_input_tokens_seen": 62771925, "step": 2921, "time_per_iteration": 2.8660571575164795 }, { "auxiliary_loss_clip": 0.01320604, "auxiliary_loss_mlp": 0.01195936, "balance_loss_clip": 1.0092901, "balance_loss_mlp": 1.00071919, "epoch": 0.3513497264474238, "flos": 17817303323520.0, "grad_norm": 1.7571319349336558, "language_loss": 0.68036389, "learning_rate": 3.011222339032204e-06, "loss": 0.70552933, "num_input_tokens_seen": 62790075, "step": 2922, "time_per_iteration": 2.753397226333618 }, { "auxiliary_loss_clip": 0.0136721, "auxiliary_loss_mlp": 0.01195771, "balance_loss_clip": 1.01012611, "balance_loss_mlp": 1.00074506, "epoch": 0.3514699693380629, "flos": 26943429536640.0, "grad_norm": 1.6613938704401403, "language_loss": 0.69600379, "learning_rate": 3.0105501956288626e-06, "loss": 0.72163367, "num_input_tokens_seen": 62810545, "step": 2923, "time_per_iteration": 2.757988929748535 }, { "auxiliary_loss_clip": 0.01356173, "auxiliary_loss_mlp": 0.01196396, "balance_loss_clip": 1.01030421, "balance_loss_mlp": 1.00070238, "epoch": 0.35159021222870196, "flos": 15267574834560.0, "grad_norm": 1.7619341668984017, "language_loss": 0.72656322, "learning_rate": 3.0098778989258602e-06, "loss": 0.7520889, "num_input_tokens_seen": 62829155, "step": 2924, "time_per_iteration": 2.7468814849853516 }, { "auxiliary_loss_clip": 0.01311085, "auxiliary_loss_mlp": 0.01195912, "balance_loss_clip": 1.0088805, "balance_loss_mlp": 1.00098133, "epoch": 0.35171045511934107, "flos": 13984160754240.0, "grad_norm": 1.9415849253906725, "language_loss": 0.88152736, "learning_rate": 3.009205449025183e-06, "loss": 0.90659738, "num_input_tokens_seen": 62845350, "step": 2925, "time_per_iteration": 2.7767295837402344 }, { "auxiliary_loss_clip": 0.01325926, "auxiliary_loss_mlp": 0.01195953, "balance_loss_clip": 1.01007235, "balance_loss_mlp": 1.000736, "epoch": 0.3518306980099802, "flos": 14283443030400.0, "grad_norm": 1.7800199776625174, "language_loss": 0.63226497, "learning_rate": 3.008532846028842e-06, "loss": 0.65748382, "num_input_tokens_seen": 62862110, "step": 2926, "time_per_iteration": 2.727543830871582 }, { "auxiliary_loss_clip": 0.01366861, "auxiliary_loss_mlp": 0.01195782, "balance_loss_clip": 1.00990748, "balance_loss_mlp": 1.00065994, "epoch": 0.35195094090061924, "flos": 27052885285920.0, "grad_norm": 3.4528542929846355, "language_loss": 0.71654421, "learning_rate": 3.0078600900388694e-06, "loss": 0.74217063, "num_input_tokens_seen": 62882415, "step": 2927, "time_per_iteration": 2.710810899734497 }, { "auxiliary_loss_clip": 0.01318073, "auxiliary_loss_mlp": 0.01195678, "balance_loss_clip": 1.00893629, "balance_loss_mlp": 1.00055599, "epoch": 0.35207118379125835, "flos": 25629277995360.0, "grad_norm": 1.7288690063288548, "language_loss": 0.73721445, "learning_rate": 3.007187181157323e-06, "loss": 0.76235199, "num_input_tokens_seen": 62902425, "step": 2928, "time_per_iteration": 2.8285670280456543 }, { "auxiliary_loss_clip": 0.01272073, "auxiliary_loss_mlp": 0.01195453, "balance_loss_clip": 1.00862098, "balance_loss_mlp": 1.00052142, "epoch": 0.35219142668189746, "flos": 18004722963840.0, "grad_norm": 2.16313919546281, "language_loss": 0.68220639, "learning_rate": 3.006514119486282e-06, "loss": 0.70688164, "num_input_tokens_seen": 62919255, "step": 2929, "time_per_iteration": 2.7858450412750244 }, { "auxiliary_loss_clip": 0.01310261, "auxiliary_loss_mlp": 0.01195502, "balance_loss_clip": 1.00830138, "balance_loss_mlp": 1.00047529, "epoch": 0.3523116695725365, "flos": 14028118129440.0, "grad_norm": 1.8604057231666409, "language_loss": 0.69784158, "learning_rate": 3.005840905127849e-06, "loss": 0.7228992, "num_input_tokens_seen": 62936160, "step": 2930, "time_per_iteration": 2.8031833171844482 }, { "auxiliary_loss_clip": 0.01367133, "auxiliary_loss_mlp": 0.01195769, "balance_loss_clip": 1.01015997, "balance_loss_mlp": 1.0008378, "epoch": 0.3524319124631756, "flos": 21433977204480.0, "grad_norm": 1.9436575140627506, "language_loss": 0.86799532, "learning_rate": 3.0051675381841516e-06, "loss": 0.89362437, "num_input_tokens_seen": 62953470, "step": 2931, "time_per_iteration": 2.6881401538848877 }, { "auxiliary_loss_clip": 0.01262646, "auxiliary_loss_mlp": 0.00873435, "balance_loss_clip": 1.00848103, "balance_loss_mlp": 1.00177038, "epoch": 0.3525521553538147, "flos": 26322785130240.0, "grad_norm": 1.4882791824226649, "language_loss": 0.76785135, "learning_rate": 3.0044940187573363e-06, "loss": 0.78921211, "num_input_tokens_seen": 62974480, "step": 2932, "time_per_iteration": 2.9246931076049805 }, { "auxiliary_loss_clip": 0.01354781, "auxiliary_loss_mlp": 0.01196188, "balance_loss_clip": 1.0098269, "balance_loss_mlp": 1.00078034, "epoch": 0.3526723982444538, "flos": 21543325182720.0, "grad_norm": 1.8478538912294271, "language_loss": 0.64976501, "learning_rate": 3.003820346949578e-06, "loss": 0.67527473, "num_input_tokens_seen": 62992560, "step": 2933, "time_per_iteration": 2.935253858566284 }, { "auxiliary_loss_clip": 0.01367769, "auxiliary_loss_mlp": 0.01195779, "balance_loss_clip": 1.00998878, "balance_loss_mlp": 1.00065756, "epoch": 0.3527926411350929, "flos": 23733661573440.0, "grad_norm": 1.9035000977009207, "language_loss": 0.80053735, "learning_rate": 3.003146522863071e-06, "loss": 0.82617283, "num_input_tokens_seen": 63013445, "step": 2934, "time_per_iteration": 2.6933553218841553 }, { "auxiliary_loss_clip": 0.01321446, "auxiliary_loss_mlp": 0.01195683, "balance_loss_clip": 1.00908232, "balance_loss_mlp": 1.00065649, "epoch": 0.35291288402573195, "flos": 30445474658400.0, "grad_norm": 2.320754964066873, "language_loss": 0.85824889, "learning_rate": 3.0024725466000345e-06, "loss": 0.88342023, "num_input_tokens_seen": 63033400, "step": 2935, "time_per_iteration": 2.7961413860321045 }, { "auxiliary_loss_clip": 0.01344771, "auxiliary_loss_mlp": 0.01195582, "balance_loss_clip": 1.00977337, "balance_loss_mlp": 1.00065064, "epoch": 0.35303312691637107, "flos": 23112190922400.0, "grad_norm": 1.7521223182742882, "language_loss": 0.7889148, "learning_rate": 3.0017984182627087e-06, "loss": 0.81431836, "num_input_tokens_seen": 63052725, "step": 2936, "time_per_iteration": 2.6988489627838135 }, { "auxiliary_loss_clip": 0.01322832, "auxiliary_loss_mlp": 0.00873349, "balance_loss_clip": 1.00935566, "balance_loss_mlp": 1.00168037, "epoch": 0.3531533698070102, "flos": 21835710112320.0, "grad_norm": 1.9095838055190895, "language_loss": 0.82439458, "learning_rate": 3.00112413795336e-06, "loss": 0.84635645, "num_input_tokens_seen": 63072560, "step": 2937, "time_per_iteration": 3.788508653640747 }, { "auxiliary_loss_clip": 0.01343391, "auxiliary_loss_mlp": 0.01195686, "balance_loss_clip": 1.00998175, "balance_loss_mlp": 1.00056458, "epoch": 0.35327361269764923, "flos": 15778979033760.0, "grad_norm": 1.8256652321672502, "language_loss": 0.80179513, "learning_rate": 3.000449705774275e-06, "loss": 0.82718599, "num_input_tokens_seen": 63090800, "step": 2938, "time_per_iteration": 2.866132974624634 }, { "auxiliary_loss_clip": 0.01346247, "auxiliary_loss_mlp": 0.01195703, "balance_loss_clip": 1.00953174, "balance_loss_mlp": 1.0005815, "epoch": 0.35339385558828834, "flos": 22090424310720.0, "grad_norm": 1.902068517965427, "language_loss": 0.71762842, "learning_rate": 2.9997751218277654e-06, "loss": 0.74304795, "num_input_tokens_seen": 63108955, "step": 2939, "time_per_iteration": 3.7476842403411865 }, { "auxiliary_loss_clip": 0.01367177, "auxiliary_loss_mlp": 0.01196062, "balance_loss_clip": 1.01008558, "balance_loss_mlp": 1.00084496, "epoch": 0.35351409847892745, "flos": 24165018308160.0, "grad_norm": 1.8166424842963784, "language_loss": 0.77678555, "learning_rate": 2.999100386216166e-06, "loss": 0.80241793, "num_input_tokens_seen": 63127895, "step": 2940, "time_per_iteration": 3.709318161010742 }, { "auxiliary_loss_clip": 0.01329014, "auxiliary_loss_mlp": 0.01195654, "balance_loss_clip": 1.00950956, "balance_loss_mlp": 1.00053239, "epoch": 0.3536343413695665, "flos": 27052310507040.0, "grad_norm": 4.069957454353966, "language_loss": 0.74224734, "learning_rate": 2.998425499041831e-06, "loss": 0.76749402, "num_input_tokens_seen": 63148410, "step": 2941, "time_per_iteration": 3.792628049850464 }, { "auxiliary_loss_clip": 0.01321321, "auxiliary_loss_mlp": 0.0119387, "balance_loss_clip": 1.00538659, "balance_loss_mlp": 1.0000838, "epoch": 0.3537545842602056, "flos": 65991102053760.0, "grad_norm": 1.2794762355617328, "language_loss": 0.64519787, "learning_rate": 2.997750460407142e-06, "loss": 0.67034978, "num_input_tokens_seen": 63209765, "step": 2942, "time_per_iteration": 3.3310444355010986 }, { "auxiliary_loss_clip": 0.01315916, "auxiliary_loss_mlp": 0.01195902, "balance_loss_clip": 1.00857687, "balance_loss_mlp": 1.00078011, "epoch": 0.35387482715084473, "flos": 18436905943200.0, "grad_norm": 1.9159032353718042, "language_loss": 0.70160532, "learning_rate": 2.997075270414501e-06, "loss": 0.72672355, "num_input_tokens_seen": 63226980, "step": 2943, "time_per_iteration": 2.807363748550415 }, { "auxiliary_loss_clip": 0.01300937, "auxiliary_loss_mlp": 0.01193869, "balance_loss_clip": 1.00503302, "balance_loss_mlp": 1.00008249, "epoch": 0.3539950700414838, "flos": 65588614748640.0, "grad_norm": 0.7041021997637553, "language_loss": 0.57732606, "learning_rate": 2.9963999291663347e-06, "loss": 0.60227418, "num_input_tokens_seen": 63292760, "step": 2944, "time_per_iteration": 3.3293182849884033 }, { "auxiliary_loss_clip": 0.01277157, "auxiliary_loss_mlp": 0.01195801, "balance_loss_clip": 1.00786841, "balance_loss_mlp": 1.00058377, "epoch": 0.3541153129321229, "flos": 20521666342080.0, "grad_norm": 2.346094841220249, "language_loss": 0.7400884, "learning_rate": 2.9957244367650915e-06, "loss": 0.76481801, "num_input_tokens_seen": 63309005, "step": 2945, "time_per_iteration": 2.820854663848877 }, { "auxiliary_loss_clip": 0.01304827, "auxiliary_loss_mlp": 0.01195652, "balance_loss_clip": 1.01031768, "balance_loss_mlp": 1.00062537, "epoch": 0.354235555822762, "flos": 19573588703520.0, "grad_norm": 1.921741600108769, "language_loss": 0.84006721, "learning_rate": 2.9950487933132425e-06, "loss": 0.86507201, "num_input_tokens_seen": 63326420, "step": 2946, "time_per_iteration": 2.8703253269195557 }, { "auxiliary_loss_clip": 0.01355037, "auxiliary_loss_mlp": 0.01196215, "balance_loss_clip": 1.00989306, "balance_loss_mlp": 1.00080681, "epoch": 0.35435579871340106, "flos": 20777278632480.0, "grad_norm": 2.230174251964147, "language_loss": 0.71134406, "learning_rate": 2.994372998913283e-06, "loss": 0.73685658, "num_input_tokens_seen": 63344925, "step": 2947, "time_per_iteration": 2.7064154148101807 }, { "auxiliary_loss_clip": 0.01323154, "auxiliary_loss_mlp": 0.01196238, "balance_loss_clip": 1.00857997, "balance_loss_mlp": 1.00082994, "epoch": 0.35447604160404017, "flos": 23951818674720.0, "grad_norm": 2.9112181706013827, "language_loss": 0.6228683, "learning_rate": 2.99369705366773e-06, "loss": 0.64806223, "num_input_tokens_seen": 63365170, "step": 2948, "time_per_iteration": 2.7300093173980713 }, { "auxiliary_loss_clip": 0.01317738, "auxiliary_loss_mlp": 0.01195835, "balance_loss_clip": 1.00913239, "balance_loss_mlp": 1.00061786, "epoch": 0.3545962844946792, "flos": 23435672549760.0, "grad_norm": 1.911094274152822, "language_loss": 0.82283211, "learning_rate": 2.9930209576791244e-06, "loss": 0.84796786, "num_input_tokens_seen": 63383645, "step": 2949, "time_per_iteration": 2.7545464038848877 }, { "auxiliary_loss_clip": 0.01343875, "auxiliary_loss_mlp": 0.01195445, "balance_loss_clip": 1.0089339, "balance_loss_mlp": 1.00060892, "epoch": 0.35471652738531834, "flos": 22085143529760.0, "grad_norm": 1.923321090214704, "language_loss": 0.63740945, "learning_rate": 2.9923447110500285e-06, "loss": 0.6628027, "num_input_tokens_seen": 63402390, "step": 2950, "time_per_iteration": 2.7267649173736572 }, { "auxiliary_loss_clip": 0.01355002, "auxiliary_loss_mlp": 0.01195615, "balance_loss_clip": 1.00975513, "balance_loss_mlp": 1.00068426, "epoch": 0.35483677027595745, "flos": 27341893389600.0, "grad_norm": 1.3660678970100844, "language_loss": 0.75430214, "learning_rate": 2.9916683138830295e-06, "loss": 0.77980828, "num_input_tokens_seen": 63423055, "step": 2951, "time_per_iteration": 2.8160324096679688 }, { "auxiliary_loss_clip": 0.01317882, "auxiliary_loss_mlp": 0.01195951, "balance_loss_clip": 1.00825298, "balance_loss_mlp": 1.00073409, "epoch": 0.3549570131665965, "flos": 13516175075040.0, "grad_norm": 2.0504870921032037, "language_loss": 0.80662692, "learning_rate": 2.9909917662807353e-06, "loss": 0.83176529, "num_input_tokens_seen": 63440855, "step": 2952, "time_per_iteration": 2.7126646041870117 }, { "auxiliary_loss_clip": 0.01342495, "auxiliary_loss_mlp": 0.01195713, "balance_loss_clip": 1.00902295, "balance_loss_mlp": 1.00049555, "epoch": 0.3550772560572356, "flos": 20887560626400.0, "grad_norm": 2.313577338476444, "language_loss": 0.69058394, "learning_rate": 2.9903150683457783e-06, "loss": 0.71596605, "num_input_tokens_seen": 63459400, "step": 2953, "time_per_iteration": 2.788271903991699 }, { "auxiliary_loss_clip": 0.01324647, "auxiliary_loss_mlp": 0.01195665, "balance_loss_clip": 1.00850141, "balance_loss_mlp": 1.00054288, "epoch": 0.3551974989478747, "flos": 20194053491520.0, "grad_norm": 2.0364930928485543, "language_loss": 0.65337974, "learning_rate": 2.9896382201808126e-06, "loss": 0.67858291, "num_input_tokens_seen": 63476800, "step": 2954, "time_per_iteration": 2.698538303375244 }, { "auxiliary_loss_clip": 0.0136782, "auxiliary_loss_mlp": 0.01195813, "balance_loss_clip": 1.00981903, "balance_loss_mlp": 1.00059569, "epoch": 0.3553177418385138, "flos": 19828841757120.0, "grad_norm": 2.046866305758894, "language_loss": 0.8076582, "learning_rate": 2.988961221888516e-06, "loss": 0.83329451, "num_input_tokens_seen": 63493475, "step": 2955, "time_per_iteration": 2.6523945331573486 }, { "auxiliary_loss_clip": 0.01311696, "auxiliary_loss_mlp": 0.01195539, "balance_loss_clip": 1.00867379, "balance_loss_mlp": 1.00060797, "epoch": 0.3554379847291529, "flos": 14829141134880.0, "grad_norm": 2.5364521930961685, "language_loss": 0.78797215, "learning_rate": 2.988284073571589e-06, "loss": 0.81304449, "num_input_tokens_seen": 63509560, "step": 2956, "time_per_iteration": 2.741722822189331 }, { "auxiliary_loss_clip": 0.01348222, "auxiliary_loss_mlp": 0.00873257, "balance_loss_clip": 1.00930238, "balance_loss_mlp": 1.00157011, "epoch": 0.355558227619792, "flos": 20485360710720.0, "grad_norm": 2.3359266282251148, "language_loss": 0.72530985, "learning_rate": 2.9876067753327528e-06, "loss": 0.74752462, "num_input_tokens_seen": 63527290, "step": 2957, "time_per_iteration": 2.7640981674194336 }, { "auxiliary_loss_clip": 0.01349687, "auxiliary_loss_mlp": 0.01195914, "balance_loss_clip": 1.00936818, "balance_loss_mlp": 1.00060129, "epoch": 0.35567847051043106, "flos": 37663626703680.0, "grad_norm": 1.8951076840483807, "language_loss": 0.80161852, "learning_rate": 2.986929327274754e-06, "loss": 0.82707453, "num_input_tokens_seen": 63547870, "step": 2958, "time_per_iteration": 2.8275530338287354 }, { "auxiliary_loss_clip": 0.01343841, "auxiliary_loss_mlp": 0.01195697, "balance_loss_clip": 1.00941777, "balance_loss_mlp": 1.00057554, "epoch": 0.35579871340107017, "flos": 26943070299840.0, "grad_norm": 1.6712286062421804, "language_loss": 0.78713346, "learning_rate": 2.9862517295003617e-06, "loss": 0.81252885, "num_input_tokens_seen": 63568285, "step": 2959, "time_per_iteration": 2.7612340450286865 }, { "auxiliary_loss_clip": 0.01318016, "auxiliary_loss_mlp": 0.01195838, "balance_loss_clip": 1.00896549, "balance_loss_mlp": 1.00052559, "epoch": 0.3559189562917093, "flos": 28293347854080.0, "grad_norm": 1.9664600985058536, "language_loss": 0.72575504, "learning_rate": 2.9855739821123654e-06, "loss": 0.75089353, "num_input_tokens_seen": 63589865, "step": 2960, "time_per_iteration": 2.844491720199585 }, { "auxiliary_loss_clip": 0.01341213, "auxiliary_loss_mlp": 0.01195537, "balance_loss_clip": 1.00896096, "balance_loss_mlp": 1.00060618, "epoch": 0.35603919918234833, "flos": 25664074832160.0, "grad_norm": 1.6711409955400145, "language_loss": 0.82201505, "learning_rate": 2.98489608521358e-06, "loss": 0.84738255, "num_input_tokens_seen": 63609805, "step": 2961, "time_per_iteration": 2.7243638038635254 }, { "auxiliary_loss_clip": 0.01354661, "auxiliary_loss_mlp": 0.00873146, "balance_loss_clip": 1.00972652, "balance_loss_mlp": 1.00157595, "epoch": 0.35615944207298744, "flos": 23000867141760.0, "grad_norm": 1.9520839340709, "language_loss": 0.79804635, "learning_rate": 2.9842180389068425e-06, "loss": 0.82032442, "num_input_tokens_seen": 63627115, "step": 2962, "time_per_iteration": 2.7652130126953125 }, { "auxiliary_loss_clip": 0.01283912, "auxiliary_loss_mlp": 0.01193809, "balance_loss_clip": 1.01020277, "balance_loss_mlp": 1.00002217, "epoch": 0.35627968496362655, "flos": 68251319507520.0, "grad_norm": 0.7603799154976404, "language_loss": 0.59272885, "learning_rate": 2.98353984329501e-06, "loss": 0.61750609, "num_input_tokens_seen": 63691460, "step": 2963, "time_per_iteration": 4.279339075088501 }, { "auxiliary_loss_clip": 0.01321425, "auxiliary_loss_mlp": 0.01196126, "balance_loss_clip": 1.00883305, "balance_loss_mlp": 1.00052702, "epoch": 0.3563999278542656, "flos": 22641726509280.0, "grad_norm": 1.4900918152350746, "language_loss": 0.70458782, "learning_rate": 2.982861498480965e-06, "loss": 0.72976333, "num_input_tokens_seen": 63713840, "step": 2964, "time_per_iteration": 2.8213131427764893 }, { "auxiliary_loss_clip": 0.01320654, "auxiliary_loss_mlp": 0.01195654, "balance_loss_clip": 1.00918448, "balance_loss_mlp": 1.00053203, "epoch": 0.3565201707449047, "flos": 25952544080640.0, "grad_norm": 1.613019113224665, "language_loss": 0.82823467, "learning_rate": 2.9821830045676122e-06, "loss": 0.85339773, "num_input_tokens_seen": 63733540, "step": 2965, "time_per_iteration": 3.7239017486572266 }, { "auxiliary_loss_clip": 0.0136729, "auxiliary_loss_mlp": 0.01195909, "balance_loss_clip": 1.00986028, "balance_loss_mlp": 1.00069213, "epoch": 0.3566404136355438, "flos": 28475738179200.0, "grad_norm": 1.6844498620159731, "language_loss": 0.72815192, "learning_rate": 2.9815043616578793e-06, "loss": 0.75378394, "num_input_tokens_seen": 63754335, "step": 2966, "time_per_iteration": 3.7908928394317627 }, { "auxiliary_loss_clip": 0.01319153, "auxiliary_loss_mlp": 0.01196281, "balance_loss_clip": 1.0093689, "balance_loss_mlp": 1.00077796, "epoch": 0.3567606565261829, "flos": 38363133093120.0, "grad_norm": 1.8927698403859214, "language_loss": 0.76857603, "learning_rate": 2.9808255698547145e-06, "loss": 0.79373038, "num_input_tokens_seen": 63777135, "step": 2967, "time_per_iteration": 2.914243698120117 }, { "auxiliary_loss_clip": 0.013442, "auxiliary_loss_mlp": 0.0119565, "balance_loss_clip": 1.00960469, "balance_loss_mlp": 1.00062394, "epoch": 0.356880899416822, "flos": 21981040408800.0, "grad_norm": 2.6691191499050957, "language_loss": 0.79562747, "learning_rate": 2.9801466292610913e-06, "loss": 0.82102597, "num_input_tokens_seen": 63797020, "step": 2968, "time_per_iteration": 2.773209571838379 }, { "auxiliary_loss_clip": 0.01345497, "auxiliary_loss_mlp": 0.01195604, "balance_loss_clip": 1.00929797, "balance_loss_mlp": 1.00048256, "epoch": 0.35700114230746105, "flos": 18989142157440.0, "grad_norm": 2.110425751832147, "language_loss": 0.81103909, "learning_rate": 2.979467539980003e-06, "loss": 0.8364501, "num_input_tokens_seen": 63813810, "step": 2969, "time_per_iteration": 2.67653489112854 }, { "auxiliary_loss_clip": 0.013469, "auxiliary_loss_mlp": 0.01195664, "balance_loss_clip": 1.0094682, "balance_loss_mlp": 1.00054264, "epoch": 0.35712138519810016, "flos": 19756122723360.0, "grad_norm": 1.7378281536956868, "language_loss": 0.76697481, "learning_rate": 2.978788302114468e-06, "loss": 0.79240042, "num_input_tokens_seen": 63830925, "step": 2970, "time_per_iteration": 2.7375314235687256 }, { "auxiliary_loss_clip": 0.01341623, "auxiliary_loss_mlp": 0.01195991, "balance_loss_clip": 1.00961685, "balance_loss_mlp": 1.00067902, "epoch": 0.35724162808873927, "flos": 35183024880480.0, "grad_norm": 2.1228879733067, "language_loss": 0.83619881, "learning_rate": 2.9781089157675255e-06, "loss": 0.86157495, "num_input_tokens_seen": 63849385, "step": 2971, "time_per_iteration": 2.786778211593628 }, { "auxiliary_loss_clip": 0.01341681, "auxiliary_loss_mlp": 0.01195948, "balance_loss_clip": 1.00975704, "balance_loss_mlp": 1.00082624, "epoch": 0.3573618709793783, "flos": 25556738580000.0, "grad_norm": 1.4864740893664672, "language_loss": 0.88452727, "learning_rate": 2.977429381042238e-06, "loss": 0.90990353, "num_input_tokens_seen": 63870060, "step": 2972, "time_per_iteration": 2.983142375946045 }, { "auxiliary_loss_clip": 0.01332325, "auxiliary_loss_mlp": 0.01195908, "balance_loss_clip": 1.00985718, "balance_loss_mlp": 1.00069082, "epoch": 0.35748211387001744, "flos": 29132364903840.0, "grad_norm": 2.1281814159237133, "language_loss": 0.89095342, "learning_rate": 2.9767496980416913e-06, "loss": 0.9162358, "num_input_tokens_seen": 63889355, "step": 2973, "time_per_iteration": 2.7997310161590576 }, { "auxiliary_loss_clip": 0.01329975, "auxiliary_loss_mlp": 0.01195998, "balance_loss_clip": 1.00888252, "balance_loss_mlp": 1.00068581, "epoch": 0.35760235676065655, "flos": 13954177690560.0, "grad_norm": 2.079410266554435, "language_loss": 0.81033587, "learning_rate": 2.9760698668689914e-06, "loss": 0.83559549, "num_input_tokens_seen": 63905580, "step": 2974, "time_per_iteration": 2.7097907066345215 }, { "auxiliary_loss_clip": 0.01347415, "auxiliary_loss_mlp": 0.0119578, "balance_loss_clip": 1.00915384, "balance_loss_mlp": 1.0006578, "epoch": 0.3577225996512956, "flos": 44018706042720.0, "grad_norm": 1.7711965509927745, "language_loss": 0.7165212, "learning_rate": 2.975389887627269e-06, "loss": 0.74195313, "num_input_tokens_seen": 63928180, "step": 2975, "time_per_iteration": 2.96455717086792 }, { "auxiliary_loss_clip": 0.01311351, "auxiliary_loss_mlp": 0.01196072, "balance_loss_clip": 1.00859165, "balance_loss_mlp": 1.00056851, "epoch": 0.3578428425419347, "flos": 17055208378080.0, "grad_norm": 2.050367962174018, "language_loss": 0.90189052, "learning_rate": 2.9747097604196764e-06, "loss": 0.92696476, "num_input_tokens_seen": 63944825, "step": 2976, "time_per_iteration": 2.718675136566162 }, { "auxiliary_loss_clip": 0.01270117, "auxiliary_loss_mlp": 0.01193881, "balance_loss_clip": 1.00568259, "balance_loss_mlp": 1.00009429, "epoch": 0.3579630854325738, "flos": 71676586219680.0, "grad_norm": 1.067775645321667, "language_loss": 0.56675947, "learning_rate": 2.9740294853493875e-06, "loss": 0.59139949, "num_input_tokens_seen": 64016385, "step": 2977, "time_per_iteration": 3.5964856147766113 }, { "auxiliary_loss_clip": 0.01302752, "auxiliary_loss_mlp": 0.01195805, "balance_loss_clip": 1.0092752, "balance_loss_mlp": 1.00068331, "epoch": 0.3580833283232129, "flos": 25046663556960.0, "grad_norm": 2.1989599980972314, "language_loss": 0.67201102, "learning_rate": 2.9733490625196008e-06, "loss": 0.69699657, "num_input_tokens_seen": 64036245, "step": 2978, "time_per_iteration": 2.843761682510376 }, { "auxiliary_loss_clip": 0.01307632, "auxiliary_loss_mlp": 0.01195738, "balance_loss_clip": 1.00904942, "balance_loss_mlp": 1.00071108, "epoch": 0.358203571213852, "flos": 13953135903840.0, "grad_norm": 2.5892695365215084, "language_loss": 0.75613737, "learning_rate": 2.9726684920335353e-06, "loss": 0.78117108, "num_input_tokens_seen": 64054110, "step": 2979, "time_per_iteration": 2.775113821029663 }, { "auxiliary_loss_clip": 0.01367394, "auxiliary_loss_mlp": 0.00873132, "balance_loss_clip": 1.00969863, "balance_loss_mlp": 1.00123179, "epoch": 0.35832381410449105, "flos": 20302467454080.0, "grad_norm": 2.1185868502833602, "language_loss": 0.81564093, "learning_rate": 2.971987773994432e-06, "loss": 0.83804619, "num_input_tokens_seen": 64070295, "step": 2980, "time_per_iteration": 2.740485668182373 }, { "auxiliary_loss_clip": 0.0135471, "auxiliary_loss_mlp": 0.0119549, "balance_loss_clip": 1.01001513, "balance_loss_mlp": 1.00055921, "epoch": 0.35844405699513016, "flos": 16983243741600.0, "grad_norm": 2.3899451309446658, "language_loss": 0.82899022, "learning_rate": 2.9713069085055566e-06, "loss": 0.85449219, "num_input_tokens_seen": 64088605, "step": 2981, "time_per_iteration": 2.705070972442627 }, { "auxiliary_loss_clip": 0.01306648, "auxiliary_loss_mlp": 0.01195649, "balance_loss_clip": 1.00929236, "balance_loss_mlp": 1.00052762, "epoch": 0.35856429988576927, "flos": 23216868822240.0, "grad_norm": 1.5984702611451076, "language_loss": 0.78867292, "learning_rate": 2.9706258956701958e-06, "loss": 0.81369591, "num_input_tokens_seen": 64108595, "step": 2982, "time_per_iteration": 2.7992067337036133 }, { "auxiliary_loss_clip": 0.01348053, "auxiliary_loss_mlp": 0.01196025, "balance_loss_clip": 1.00995469, "balance_loss_mlp": 1.00071275, "epoch": 0.3586845427764083, "flos": 23034586268160.0, "grad_norm": 2.045431631956015, "language_loss": 0.77281046, "learning_rate": 2.9699447355916575e-06, "loss": 0.79825115, "num_input_tokens_seen": 64127405, "step": 2983, "time_per_iteration": 2.7421655654907227 }, { "auxiliary_loss_clip": 0.01366596, "auxiliary_loss_mlp": 0.0087311, "balance_loss_clip": 1.00983012, "balance_loss_mlp": 1.00136328, "epoch": 0.35880478566704743, "flos": 20010693227040.0, "grad_norm": 2.3241914675754107, "language_loss": 0.73842227, "learning_rate": 2.969263428373275e-06, "loss": 0.76081938, "num_input_tokens_seen": 64145755, "step": 2984, "time_per_iteration": 2.737874984741211 }, { "auxiliary_loss_clip": 0.01332894, "auxiliary_loss_mlp": 0.01195636, "balance_loss_clip": 1.00957918, "balance_loss_mlp": 1.00070536, "epoch": 0.35892502855768654, "flos": 13699104255360.0, "grad_norm": 1.896773141555823, "language_loss": 0.79528105, "learning_rate": 2.9685819741184007e-06, "loss": 0.82056636, "num_input_tokens_seen": 64164195, "step": 2985, "time_per_iteration": 2.7212581634521484 }, { "auxiliary_loss_clip": 0.01306513, "auxiliary_loss_mlp": 0.01195721, "balance_loss_clip": 1.00972056, "balance_loss_mlp": 1.00059962, "epoch": 0.3590452714483256, "flos": 18114106865760.0, "grad_norm": 2.582696900601744, "language_loss": 0.6932807, "learning_rate": 2.967900372930411e-06, "loss": 0.71830308, "num_input_tokens_seen": 64182705, "step": 2986, "time_per_iteration": 2.8094635009765625 }, { "auxiliary_loss_clip": 0.0133625, "auxiliary_loss_mlp": 0.01195722, "balance_loss_clip": 1.00960159, "balance_loss_mlp": 1.00069523, "epoch": 0.3591655143389647, "flos": 17749362139200.0, "grad_norm": 2.0075857642754356, "language_loss": 0.79469681, "learning_rate": 2.9672186249127046e-06, "loss": 0.8200165, "num_input_tokens_seen": 64202170, "step": 2987, "time_per_iteration": 2.713836669921875 }, { "auxiliary_loss_clip": 0.01329705, "auxiliary_loss_mlp": 0.01195677, "balance_loss_clip": 1.00966358, "balance_loss_mlp": 1.00065017, "epoch": 0.3592857572296038, "flos": 25224419727360.0, "grad_norm": 2.1921013576943285, "language_loss": 0.78938729, "learning_rate": 2.9665367301687014e-06, "loss": 0.81464112, "num_input_tokens_seen": 64220415, "step": 2988, "time_per_iteration": 2.8078994750976562 }, { "auxiliary_loss_clip": 0.01334863, "auxiliary_loss_mlp": 0.01195487, "balance_loss_clip": 1.00954521, "balance_loss_mlp": 1.00046098, "epoch": 0.3594060001202429, "flos": 29384420749920.0, "grad_norm": 2.3748201908519864, "language_loss": 0.76695967, "learning_rate": 2.965854688801845e-06, "loss": 0.79226321, "num_input_tokens_seen": 64242475, "step": 2989, "time_per_iteration": 3.7970638275146484 }, { "auxiliary_loss_clip": 0.01353898, "auxiliary_loss_mlp": 0.01195905, "balance_loss_clip": 1.00973952, "balance_loss_mlp": 1.00078273, "epoch": 0.359526243010882, "flos": 17052909262560.0, "grad_norm": 2.026847371256295, "language_loss": 0.76170123, "learning_rate": 2.9651725009156005e-06, "loss": 0.78719926, "num_input_tokens_seen": 64260220, "step": 2990, "time_per_iteration": 2.671363592147827 }, { "auxiliary_loss_clip": 0.01330466, "auxiliary_loss_mlp": 0.01196015, "balance_loss_clip": 1.00900555, "balance_loss_mlp": 1.00089288, "epoch": 0.3596464859015211, "flos": 22965100365600.0, "grad_norm": 1.6369849265889664, "language_loss": 0.74044794, "learning_rate": 2.964490166613454e-06, "loss": 0.76571274, "num_input_tokens_seen": 64280145, "step": 2991, "time_per_iteration": 3.681581497192383 }, { "auxiliary_loss_clip": 0.01342636, "auxiliary_loss_mlp": 0.01193816, "balance_loss_clip": 1.00798774, "balance_loss_mlp": 1.00002992, "epoch": 0.35976672879216015, "flos": 54739491197760.0, "grad_norm": 0.7582443944060383, "language_loss": 0.57745963, "learning_rate": 2.963807685998917e-06, "loss": 0.60282415, "num_input_tokens_seen": 64336010, "step": 2992, "time_per_iteration": 3.987166166305542 }, { "auxiliary_loss_clip": 0.01293024, "auxiliary_loss_mlp": 0.01195648, "balance_loss_clip": 1.00808227, "balance_loss_mlp": 1.00062156, "epoch": 0.35988697168279926, "flos": 43139036596320.0, "grad_norm": 1.45104073228606, "language_loss": 0.77879751, "learning_rate": 2.9631250591755196e-06, "loss": 0.80368423, "num_input_tokens_seen": 64358725, "step": 2993, "time_per_iteration": 4.002948045730591 }, { "auxiliary_loss_clip": 0.01318148, "auxiliary_loss_mlp": 0.01195562, "balance_loss_clip": 1.00912106, "balance_loss_mlp": 1.00053561, "epoch": 0.36000721457343837, "flos": 35845614936000.0, "grad_norm": 1.7361816689697445, "language_loss": 0.57825893, "learning_rate": 2.962442286246817e-06, "loss": 0.60339606, "num_input_tokens_seen": 64381555, "step": 2994, "time_per_iteration": 2.9272358417510986 }, { "auxiliary_loss_clip": 0.01326958, "auxiliary_loss_mlp": 0.01195648, "balance_loss_clip": 1.00940514, "balance_loss_mlp": 1.00071728, "epoch": 0.3601274574640774, "flos": 18291108638880.0, "grad_norm": 1.529575098115204, "language_loss": 0.69565916, "learning_rate": 2.9617593673163853e-06, "loss": 0.72088528, "num_input_tokens_seen": 64400375, "step": 2995, "time_per_iteration": 2.7991766929626465 }, { "auxiliary_loss_clip": 0.01341255, "auxiliary_loss_mlp": 0.01195812, "balance_loss_clip": 1.00961041, "balance_loss_mlp": 1.00069022, "epoch": 0.36024770035471654, "flos": 13333964368320.0, "grad_norm": 2.456931523227995, "language_loss": 0.77332354, "learning_rate": 2.9610763024878216e-06, "loss": 0.79869413, "num_input_tokens_seen": 64415880, "step": 2996, "time_per_iteration": 2.7297539710998535 }, { "auxiliary_loss_clip": 0.01340375, "auxiliary_loss_mlp": 0.0119581, "balance_loss_clip": 1.00949764, "balance_loss_mlp": 1.00068831, "epoch": 0.3603679432453556, "flos": 20267023991040.0, "grad_norm": 1.6700397092088373, "language_loss": 0.91747683, "learning_rate": 2.960393091864747e-06, "loss": 0.94283867, "num_input_tokens_seen": 64434260, "step": 2997, "time_per_iteration": 2.8291079998016357 }, { "auxiliary_loss_clip": 0.01331325, "auxiliary_loss_mlp": 0.01195719, "balance_loss_clip": 1.00946903, "balance_loss_mlp": 1.00059724, "epoch": 0.3604881861359947, "flos": 22451145585120.0, "grad_norm": 2.097264555856165, "language_loss": 0.75084031, "learning_rate": 2.959709735550804e-06, "loss": 0.77611077, "num_input_tokens_seen": 64453855, "step": 2998, "time_per_iteration": 2.759566307067871 }, { "auxiliary_loss_clip": 0.01291639, "auxiliary_loss_mlp": 0.01195976, "balance_loss_clip": 1.00770736, "balance_loss_mlp": 1.00085449, "epoch": 0.3606084290266338, "flos": 22054262374080.0, "grad_norm": 1.8962213417896872, "language_loss": 0.75593603, "learning_rate": 2.9590262336496575e-06, "loss": 0.78081226, "num_input_tokens_seen": 64473585, "step": 2999, "time_per_iteration": 2.787706136703491 }, { "auxiliary_loss_clip": 0.01309468, "auxiliary_loss_mlp": 0.01195987, "balance_loss_clip": 1.00918722, "balance_loss_mlp": 1.00076962, "epoch": 0.36072867191727287, "flos": 15632930263680.0, "grad_norm": 1.8301283290939774, "language_loss": 0.85518706, "learning_rate": 2.9583425862649936e-06, "loss": 0.88024157, "num_input_tokens_seen": 64491720, "step": 3000, "time_per_iteration": 2.7767555713653564 }, { "auxiliary_loss_clip": 0.01367371, "auxiliary_loss_mlp": 0.01195717, "balance_loss_clip": 1.01007509, "balance_loss_mlp": 1.0006907, "epoch": 0.360848914807912, "flos": 19677009274560.0, "grad_norm": 2.419743434815335, "language_loss": 0.73935419, "learning_rate": 2.9576587935005215e-06, "loss": 0.76498508, "num_input_tokens_seen": 64509800, "step": 3001, "time_per_iteration": 2.7035138607025146 }, { "auxiliary_loss_clip": 0.01347457, "auxiliary_loss_mlp": 0.01195747, "balance_loss_clip": 1.00933731, "balance_loss_mlp": 1.00062478, "epoch": 0.3609691576985511, "flos": 18877818376800.0, "grad_norm": 2.502875485896719, "language_loss": 0.72442067, "learning_rate": 2.9569748554599713e-06, "loss": 0.74985266, "num_input_tokens_seen": 64525410, "step": 3002, "time_per_iteration": 2.830688953399658 }, { "auxiliary_loss_clip": 0.01331197, "auxiliary_loss_mlp": 0.01195425, "balance_loss_clip": 1.00979769, "balance_loss_mlp": 1.00058973, "epoch": 0.36108940058919015, "flos": 42224103305280.0, "grad_norm": 1.9826117066484397, "language_loss": 0.73045695, "learning_rate": 2.956290772247097e-06, "loss": 0.75572318, "num_input_tokens_seen": 64544085, "step": 3003, "time_per_iteration": 2.9674344062805176 }, { "auxiliary_loss_clip": 0.01282086, "auxiliary_loss_mlp": 0.01195383, "balance_loss_clip": 1.00873208, "balance_loss_mlp": 1.00054729, "epoch": 0.36120964347982926, "flos": 23185161421920.0, "grad_norm": 1.7546678709868366, "language_loss": 0.73390615, "learning_rate": 2.9556065439656724e-06, "loss": 0.75868088, "num_input_tokens_seen": 64563135, "step": 3004, "time_per_iteration": 3.0751235485076904 }, { "auxiliary_loss_clip": 0.01280205, "auxiliary_loss_mlp": 0.01195271, "balance_loss_clip": 1.0081625, "balance_loss_mlp": 1.00053096, "epoch": 0.36132988637046837, "flos": 18113065079040.0, "grad_norm": 1.766795633732566, "language_loss": 0.81462532, "learning_rate": 2.9549221707194952e-06, "loss": 0.83938009, "num_input_tokens_seen": 64581985, "step": 3005, "time_per_iteration": 2.8477113246917725 }, { "auxiliary_loss_clip": 0.01347037, "auxiliary_loss_mlp": 0.0119563, "balance_loss_clip": 1.00940061, "balance_loss_mlp": 1.00060344, "epoch": 0.3614501292611074, "flos": 27813112200000.0, "grad_norm": 1.8517097667895868, "language_loss": 0.72621822, "learning_rate": 2.954237652612384e-06, "loss": 0.75164485, "num_input_tokens_seen": 64601035, "step": 3006, "time_per_iteration": 2.7419416904449463 }, { "auxiliary_loss_clip": 0.01321711, "auxiliary_loss_mlp": 0.01195471, "balance_loss_clip": 1.00828576, "balance_loss_mlp": 1.00053978, "epoch": 0.36157037215174653, "flos": 22634936933760.0, "grad_norm": 2.9667523949190815, "language_loss": 0.8450253, "learning_rate": 2.9535529897481796e-06, "loss": 0.87019718, "num_input_tokens_seen": 64618580, "step": 3007, "time_per_iteration": 2.7958920001983643 }, { "auxiliary_loss_clip": 0.0136653, "auxiliary_loss_mlp": 0.01195616, "balance_loss_clip": 1.00952339, "balance_loss_mlp": 1.00059009, "epoch": 0.36169061504238564, "flos": 12600846623520.0, "grad_norm": 2.074835351685881, "language_loss": 0.76682979, "learning_rate": 2.9528681822307446e-06, "loss": 0.79245126, "num_input_tokens_seen": 64635430, "step": 3008, "time_per_iteration": 2.6961848735809326 }, { "auxiliary_loss_clip": 0.01341204, "auxiliary_loss_mlp": 0.00873007, "balance_loss_clip": 1.00940883, "balance_loss_mlp": 1.00124574, "epoch": 0.3618108579330247, "flos": 26684655962400.0, "grad_norm": 1.806333592634203, "language_loss": 0.82169378, "learning_rate": 2.952183230163964e-06, "loss": 0.84383595, "num_input_tokens_seen": 64655005, "step": 3009, "time_per_iteration": 2.8182122707366943 }, { "auxiliary_loss_clip": 0.01308175, "auxiliary_loss_mlp": 0.01195341, "balance_loss_clip": 1.00819576, "balance_loss_mlp": 1.00060046, "epoch": 0.3619311008236638, "flos": 22817039869440.0, "grad_norm": 1.958563347764721, "language_loss": 0.73471236, "learning_rate": 2.9514981336517448e-06, "loss": 0.75974751, "num_input_tokens_seen": 64674775, "step": 3010, "time_per_iteration": 2.884413242340088 }, { "auxiliary_loss_clip": 0.01340697, "auxiliary_loss_mlp": 0.01195297, "balance_loss_clip": 1.00874984, "balance_loss_mlp": 1.00046146, "epoch": 0.36205134371430286, "flos": 25919615275200.0, "grad_norm": 1.8323246620213796, "language_loss": 0.81951654, "learning_rate": 2.950812892798015e-06, "loss": 0.84487653, "num_input_tokens_seen": 64695670, "step": 3011, "time_per_iteration": 2.811547040939331 }, { "auxiliary_loss_clip": 0.01283951, "auxiliary_loss_mlp": 0.00873043, "balance_loss_clip": 1.00814676, "balance_loss_mlp": 1.00139666, "epoch": 0.362171586604942, "flos": 26139604484160.0, "grad_norm": 1.685143313233553, "language_loss": 0.87030202, "learning_rate": 2.9501275077067256e-06, "loss": 0.89187193, "num_input_tokens_seen": 64716290, "step": 3012, "time_per_iteration": 2.7979536056518555 }, { "auxiliary_loss_clip": 0.01283467, "auxiliary_loss_mlp": 0.01195129, "balance_loss_clip": 1.00861537, "balance_loss_mlp": 1.00048399, "epoch": 0.3622918294955811, "flos": 28074220813440.0, "grad_norm": 1.5566891049341187, "language_loss": 0.88491774, "learning_rate": 2.949441978481848e-06, "loss": 0.90970367, "num_input_tokens_seen": 64737190, "step": 3013, "time_per_iteration": 2.8906774520874023 }, { "auxiliary_loss_clip": 0.01310922, "auxiliary_loss_mlp": 0.01195598, "balance_loss_clip": 1.00878501, "balance_loss_mlp": 1.000476, "epoch": 0.36241207238622014, "flos": 19828015512480.0, "grad_norm": 1.8975390809371213, "language_loss": 0.80024666, "learning_rate": 2.9487563052273778e-06, "loss": 0.82531184, "num_input_tokens_seen": 64753950, "step": 3014, "time_per_iteration": 2.7367308139801025 }, { "auxiliary_loss_clip": 0.01341139, "auxiliary_loss_mlp": 0.01195648, "balance_loss_clip": 1.01001596, "balance_loss_mlp": 1.00062132, "epoch": 0.36253231527685925, "flos": 21397168641600.0, "grad_norm": 1.7838004532902623, "language_loss": 0.85409451, "learning_rate": 2.94807048804733e-06, "loss": 0.87946236, "num_input_tokens_seen": 64773570, "step": 3015, "time_per_iteration": 3.700632333755493 }, { "auxiliary_loss_clip": 0.01323806, "auxiliary_loss_mlp": 0.01195596, "balance_loss_clip": 1.00978613, "balance_loss_mlp": 1.00066483, "epoch": 0.36265255816749836, "flos": 18362893656960.0, "grad_norm": 1.6287214322516144, "language_loss": 0.90146315, "learning_rate": 2.9473845270457434e-06, "loss": 0.9266572, "num_input_tokens_seen": 64790385, "step": 3016, "time_per_iteration": 2.894468307495117 }, { "auxiliary_loss_clip": 0.01327906, "auxiliary_loss_mlp": 0.01195351, "balance_loss_clip": 1.00920141, "balance_loss_mlp": 1.00041986, "epoch": 0.3627728010581374, "flos": 18660056436000.0, "grad_norm": 2.07494195242816, "language_loss": 0.70130706, "learning_rate": 2.946698422326677e-06, "loss": 0.72653967, "num_input_tokens_seen": 64807845, "step": 3017, "time_per_iteration": 3.6291069984436035 }, { "auxiliary_loss_clip": 0.01290668, "auxiliary_loss_mlp": 0.01195703, "balance_loss_clip": 1.00828552, "balance_loss_mlp": 1.00058126, "epoch": 0.36289304394877653, "flos": 27524283714720.0, "grad_norm": 2.00682019681541, "language_loss": 0.7956171, "learning_rate": 2.946012173994213e-06, "loss": 0.82048082, "num_input_tokens_seen": 64827630, "step": 3018, "time_per_iteration": 3.879739761352539 }, { "auxiliary_loss_clip": 0.01339987, "auxiliary_loss_mlp": 0.01195657, "balance_loss_clip": 1.00944293, "balance_loss_mlp": 1.00072622, "epoch": 0.36301328683941564, "flos": 34533259578720.0, "grad_norm": 1.4293867072664237, "language_loss": 0.67968458, "learning_rate": 2.945325782152454e-06, "loss": 0.70504105, "num_input_tokens_seen": 64850665, "step": 3019, "time_per_iteration": 3.8043298721313477 }, { "auxiliary_loss_clip": 0.01340428, "auxiliary_loss_mlp": 0.0119525, "balance_loss_clip": 1.00971484, "balance_loss_mlp": 1.00041413, "epoch": 0.3631335297300547, "flos": 19025986644000.0, "grad_norm": 2.1241052861304253, "language_loss": 0.78625047, "learning_rate": 2.9446392469055257e-06, "loss": 0.8116073, "num_input_tokens_seen": 64868700, "step": 3020, "time_per_iteration": 2.799525737762451 }, { "auxiliary_loss_clip": 0.01296273, "auxiliary_loss_mlp": 0.01195091, "balance_loss_clip": 1.00883758, "balance_loss_mlp": 1.00044608, "epoch": 0.3632537726206938, "flos": 19536780140640.0, "grad_norm": 1.758538302280058, "language_loss": 0.79739296, "learning_rate": 2.9439525683575745e-06, "loss": 0.82230663, "num_input_tokens_seen": 64887620, "step": 3021, "time_per_iteration": 2.707136631011963 }, { "auxiliary_loss_clip": 0.01366621, "auxiliary_loss_mlp": 0.01196114, "balance_loss_clip": 1.00981843, "balance_loss_mlp": 1.00070655, "epoch": 0.3633740155113329, "flos": 21068621775360.0, "grad_norm": 1.8966946943321443, "language_loss": 0.74488974, "learning_rate": 2.9432657466127694e-06, "loss": 0.77051711, "num_input_tokens_seen": 64907190, "step": 3022, "time_per_iteration": 2.7131903171539307 }, { "auxiliary_loss_clip": 0.01281167, "auxiliary_loss_mlp": 0.0119545, "balance_loss_clip": 1.00800014, "balance_loss_mlp": 1.00071013, "epoch": 0.36349425840197197, "flos": 20298731391360.0, "grad_norm": 2.2499960388666405, "language_loss": 0.76632899, "learning_rate": 2.9425787817753007e-06, "loss": 0.79109514, "num_input_tokens_seen": 64925850, "step": 3023, "time_per_iteration": 2.8235044479370117 }, { "auxiliary_loss_clip": 0.01298144, "auxiliary_loss_mlp": 0.01196018, "balance_loss_clip": 1.00786066, "balance_loss_mlp": 1.00089669, "epoch": 0.3636145012926111, "flos": 29716775526240.0, "grad_norm": 1.5074054338685687, "language_loss": 0.71297252, "learning_rate": 2.94189167394938e-06, "loss": 0.73791409, "num_input_tokens_seen": 64948285, "step": 3024, "time_per_iteration": 2.8304734230041504 }, { "auxiliary_loss_clip": 0.01366448, "auxiliary_loss_mlp": 0.01195507, "balance_loss_clip": 1.01020479, "balance_loss_mlp": 1.00057578, "epoch": 0.3637347441832502, "flos": 21431857707360.0, "grad_norm": 1.7634549162706756, "language_loss": 0.81207442, "learning_rate": 2.941204423239241e-06, "loss": 0.83769399, "num_input_tokens_seen": 64967160, "step": 3025, "time_per_iteration": 2.7229015827178955 }, { "auxiliary_loss_clip": 0.0134077, "auxiliary_loss_mlp": 0.01195705, "balance_loss_clip": 1.00857437, "balance_loss_mlp": 1.00067902, "epoch": 0.36385498707388925, "flos": 29533954116960.0, "grad_norm": 1.6583941660790824, "language_loss": 0.76141143, "learning_rate": 2.9405170297491395e-06, "loss": 0.78677619, "num_input_tokens_seen": 64987155, "step": 3026, "time_per_iteration": 2.7627954483032227 }, { "auxiliary_loss_clip": 0.01252337, "auxiliary_loss_mlp": 0.00873128, "balance_loss_clip": 1.00907898, "balance_loss_mlp": 1.00127518, "epoch": 0.36397522996452836, "flos": 22236580851840.0, "grad_norm": 1.8953721679123896, "language_loss": 0.80360818, "learning_rate": 2.939829493583353e-06, "loss": 0.82486284, "num_input_tokens_seen": 65003800, "step": 3027, "time_per_iteration": 2.823742628097534 }, { "auxiliary_loss_clip": 0.01321913, "auxiliary_loss_mlp": 0.01195549, "balance_loss_clip": 1.00886321, "balance_loss_mlp": 1.0006181, "epoch": 0.3640954728551674, "flos": 21506516619840.0, "grad_norm": 3.227705394438074, "language_loss": 0.82618141, "learning_rate": 2.939141814846179e-06, "loss": 0.85135603, "num_input_tokens_seen": 65021215, "step": 3028, "time_per_iteration": 2.824112892150879 }, { "auxiliary_loss_clip": 0.01329557, "auxiliary_loss_mlp": 0.01195263, "balance_loss_clip": 1.00902176, "balance_loss_mlp": 1.00052261, "epoch": 0.3642157157458065, "flos": 17712876889440.0, "grad_norm": 1.610131636592169, "language_loss": 0.82292187, "learning_rate": 2.938453993641938e-06, "loss": 0.8481701, "num_input_tokens_seen": 65039590, "step": 3029, "time_per_iteration": 2.9233505725860596 }, { "auxiliary_loss_clip": 0.01319063, "auxiliary_loss_mlp": 0.01195461, "balance_loss_clip": 1.00873661, "balance_loss_mlp": 1.00053048, "epoch": 0.36433595863644563, "flos": 17639547153120.0, "grad_norm": 1.8613131975449377, "language_loss": 0.70357412, "learning_rate": 2.937766030074973e-06, "loss": 0.72871935, "num_input_tokens_seen": 65056845, "step": 3030, "time_per_iteration": 2.8094494342803955 }, { "auxiliary_loss_clip": 0.01304191, "auxiliary_loss_mlp": 0.01195442, "balance_loss_clip": 1.0085808, "balance_loss_mlp": 1.00060654, "epoch": 0.3644562015270847, "flos": 26833291237440.0, "grad_norm": 2.82864029830694, "language_loss": 0.82481545, "learning_rate": 2.937077924249646e-06, "loss": 0.84981179, "num_input_tokens_seen": 65079435, "step": 3031, "time_per_iteration": 2.893948554992676 }, { "auxiliary_loss_clip": 0.01328315, "auxiliary_loss_mlp": 0.01195477, "balance_loss_clip": 1.00880921, "balance_loss_mlp": 1.00064123, "epoch": 0.3645764444177238, "flos": 14282724556800.0, "grad_norm": 2.0093816687610513, "language_loss": 0.75790918, "learning_rate": 2.9363896762703443e-06, "loss": 0.7831471, "num_input_tokens_seen": 65096500, "step": 3032, "time_per_iteration": 2.7535386085510254 }, { "auxiliary_loss_clip": 0.01365657, "auxiliary_loss_mlp": 0.01195589, "balance_loss_clip": 1.00958884, "balance_loss_mlp": 1.00056255, "epoch": 0.3646966873083629, "flos": 20667499570080.0, "grad_norm": 1.59515447152757, "language_loss": 0.84018815, "learning_rate": 2.9357012862414725e-06, "loss": 0.86580062, "num_input_tokens_seen": 65115860, "step": 3033, "time_per_iteration": 2.690540313720703 }, { "auxiliary_loss_clip": 0.01346565, "auxiliary_loss_mlp": 0.01195232, "balance_loss_clip": 1.00916409, "balance_loss_mlp": 1.00039625, "epoch": 0.36481693019900197, "flos": 27782626204800.0, "grad_norm": 2.0948630827508508, "language_loss": 0.7164613, "learning_rate": 2.9350127542674593e-06, "loss": 0.74187928, "num_input_tokens_seen": 65138070, "step": 3034, "time_per_iteration": 2.7608799934387207 }, { "auxiliary_loss_clip": 0.01325312, "auxiliary_loss_mlp": 0.01195271, "balance_loss_clip": 1.00948095, "balance_loss_mlp": 1.00053108, "epoch": 0.3649371730896411, "flos": 19712596432320.0, "grad_norm": 1.8514426846054837, "language_loss": 0.76416695, "learning_rate": 2.934324080452755e-06, "loss": 0.78937274, "num_input_tokens_seen": 65155860, "step": 3035, "time_per_iteration": 2.700577974319458 }, { "auxiliary_loss_clip": 0.01316083, "auxiliary_loss_mlp": 0.00873187, "balance_loss_clip": 1.0084722, "balance_loss_mlp": 1.00138342, "epoch": 0.3650574159802802, "flos": 24750506640960.0, "grad_norm": 1.968646362149869, "language_loss": 0.7806673, "learning_rate": 2.9336352649018307e-06, "loss": 0.80255997, "num_input_tokens_seen": 65175930, "step": 3036, "time_per_iteration": 2.779144763946533 }, { "auxiliary_loss_clip": 0.01333996, "auxiliary_loss_mlp": 0.01195464, "balance_loss_clip": 1.00939775, "balance_loss_mlp": 1.00062823, "epoch": 0.36517765887091924, "flos": 32853501142560.0, "grad_norm": 1.70215116751931, "language_loss": 0.70058548, "learning_rate": 2.9329463077191783e-06, "loss": 0.72588009, "num_input_tokens_seen": 65199305, "step": 3037, "time_per_iteration": 2.8683669567108154 }, { "auxiliary_loss_clip": 0.01298112, "auxiliary_loss_mlp": 0.01195536, "balance_loss_clip": 1.00834537, "balance_loss_mlp": 1.00060487, "epoch": 0.36529790176155835, "flos": 20120328594720.0, "grad_norm": 3.266008137369464, "language_loss": 0.64137435, "learning_rate": 2.9322572090093135e-06, "loss": 0.66631085, "num_input_tokens_seen": 65218010, "step": 3038, "time_per_iteration": 2.9082372188568115 }, { "auxiliary_loss_clip": 0.0129951, "auxiliary_loss_mlp": 0.01195501, "balance_loss_clip": 1.00828362, "balance_loss_mlp": 1.00066555, "epoch": 0.36541814465219746, "flos": 17639583076800.0, "grad_norm": 2.7507164193486715, "language_loss": 0.76036346, "learning_rate": 2.9315679688767713e-06, "loss": 0.78531361, "num_input_tokens_seen": 65236020, "step": 3039, "time_per_iteration": 2.7736093997955322 }, { "auxiliary_loss_clip": 0.01340239, "auxiliary_loss_mlp": 0.01195144, "balance_loss_clip": 1.00917912, "balance_loss_mlp": 1.0005939, "epoch": 0.3655383875428365, "flos": 22674367925280.0, "grad_norm": 1.4368243287092943, "language_loss": 0.66574436, "learning_rate": 2.9308785874261085e-06, "loss": 0.69109815, "num_input_tokens_seen": 65256210, "step": 3040, "time_per_iteration": 2.7707788944244385 }, { "auxiliary_loss_clip": 0.0136495, "auxiliary_loss_mlp": 0.01195258, "balance_loss_clip": 1.00950801, "balance_loss_mlp": 1.00051737, "epoch": 0.36565863043347563, "flos": 21981184103520.0, "grad_norm": 1.887024001364358, "language_loss": 0.81711435, "learning_rate": 2.9301890647619045e-06, "loss": 0.84271646, "num_input_tokens_seen": 65275505, "step": 3041, "time_per_iteration": 2.639284610748291 }, { "auxiliary_loss_clip": 0.01325192, "auxiliary_loss_mlp": 0.01195486, "balance_loss_clip": 1.00923562, "balance_loss_mlp": 1.00055552, "epoch": 0.36577887332411474, "flos": 24827644287360.0, "grad_norm": 1.768854148167979, "language_loss": 0.80482602, "learning_rate": 2.929499400988759e-06, "loss": 0.83003283, "num_input_tokens_seen": 65296665, "step": 3042, "time_per_iteration": 3.737086057662964 }, { "auxiliary_loss_clip": 0.01341388, "auxiliary_loss_mlp": 0.0119574, "balance_loss_clip": 1.00968289, "balance_loss_mlp": 1.00061822, "epoch": 0.3658991162147538, "flos": 28293204159360.0, "grad_norm": 1.7119851856992632, "language_loss": 0.65229464, "learning_rate": 2.9288095962112927e-06, "loss": 0.67766595, "num_input_tokens_seen": 65317370, "step": 3043, "time_per_iteration": 3.6811423301696777 }, { "auxiliary_loss_clip": 0.01364521, "auxiliary_loss_mlp": 0.01195371, "balance_loss_clip": 1.00934958, "balance_loss_mlp": 1.00053585, "epoch": 0.3660193591053929, "flos": 17785559999520.0, "grad_norm": 2.018942752808388, "language_loss": 0.85191476, "learning_rate": 2.9281196505341503e-06, "loss": 0.87751365, "num_input_tokens_seen": 65334540, "step": 3044, "time_per_iteration": 3.5500361919403076 }, { "auxiliary_loss_clip": 0.01280944, "auxiliary_loss_mlp": 0.00872938, "balance_loss_clip": 1.00803065, "balance_loss_mlp": 1.00096679, "epoch": 0.36613960199603196, "flos": 10342784590560.0, "grad_norm": 1.8692283508492757, "language_loss": 0.78454447, "learning_rate": 2.9274295640619946e-06, "loss": 0.80608332, "num_input_tokens_seen": 65351670, "step": 3045, "time_per_iteration": 3.7279481887817383 }, { "auxiliary_loss_clip": 0.01315734, "auxiliary_loss_mlp": 0.01195215, "balance_loss_clip": 1.0084691, "balance_loss_mlp": 1.0005703, "epoch": 0.36625984488667107, "flos": 19755619791840.0, "grad_norm": 1.861429015007187, "language_loss": 0.78757584, "learning_rate": 2.9267393368995103e-06, "loss": 0.81268537, "num_input_tokens_seen": 65370900, "step": 3046, "time_per_iteration": 2.867321252822876 }, { "auxiliary_loss_clip": 0.01365593, "auxiliary_loss_mlp": 0.01195581, "balance_loss_clip": 1.00934577, "balance_loss_mlp": 1.00064993, "epoch": 0.3663800877773102, "flos": 17674272142560.0, "grad_norm": 2.3755738930976893, "language_loss": 0.74073231, "learning_rate": 2.926048969151407e-06, "loss": 0.76634407, "num_input_tokens_seen": 65388185, "step": 3047, "time_per_iteration": 2.6110048294067383 }, { "auxiliary_loss_clip": 0.01269782, "auxiliary_loss_mlp": 0.01195265, "balance_loss_clip": 1.00714231, "balance_loss_mlp": 1.00042927, "epoch": 0.36650033066794924, "flos": 20303617011840.0, "grad_norm": 1.738668767411774, "language_loss": 0.684057, "learning_rate": 2.92535846092241e-06, "loss": 0.70870739, "num_input_tokens_seen": 65407200, "step": 3048, "time_per_iteration": 2.8746819496154785 }, { "auxiliary_loss_clip": 0.01321917, "auxiliary_loss_mlp": 0.01195506, "balance_loss_clip": 1.00808501, "balance_loss_mlp": 1.00057483, "epoch": 0.36662057355858835, "flos": 24716248659360.0, "grad_norm": 1.6672629727942303, "language_loss": 0.8282057, "learning_rate": 2.9246678123172704e-06, "loss": 0.85337996, "num_input_tokens_seen": 65427290, "step": 3049, "time_per_iteration": 2.771411895751953 }, { "auxiliary_loss_clip": 0.01365597, "auxiliary_loss_mlp": 0.01195654, "balance_loss_clip": 1.00935531, "balance_loss_mlp": 1.00072289, "epoch": 0.36674081644922746, "flos": 12385276027200.0, "grad_norm": 2.25199675531826, "language_loss": 0.74644554, "learning_rate": 2.9239770234407596e-06, "loss": 0.77205801, "num_input_tokens_seen": 65445595, "step": 3050, "time_per_iteration": 2.6865899562835693 }, { "auxiliary_loss_clip": 0.01346229, "auxiliary_loss_mlp": 0.01195578, "balance_loss_clip": 1.00878835, "balance_loss_mlp": 1.00064683, "epoch": 0.3668610593398665, "flos": 21105933269760.0, "grad_norm": 1.658499627821256, "language_loss": 0.68224764, "learning_rate": 2.9232860943976686e-06, "loss": 0.70766568, "num_input_tokens_seen": 65466330, "step": 3051, "time_per_iteration": 2.7171337604522705 }, { "auxiliary_loss_clip": 0.01327747, "auxiliary_loss_mlp": 0.0119502, "balance_loss_clip": 1.00897348, "balance_loss_mlp": 1.00047028, "epoch": 0.3669813022305056, "flos": 26758093469760.0, "grad_norm": 1.6276159013793805, "language_loss": 0.83841908, "learning_rate": 2.9225950252928115e-06, "loss": 0.86364675, "num_input_tokens_seen": 65487180, "step": 3052, "time_per_iteration": 2.785670757293701 }, { "auxiliary_loss_clip": 0.01340215, "auxiliary_loss_mlp": 0.01195394, "balance_loss_clip": 1.00910091, "balance_loss_mlp": 1.00055861, "epoch": 0.36710154512114473, "flos": 19099531922400.0, "grad_norm": 4.139207569952449, "language_loss": 0.81666303, "learning_rate": 2.9219038162310217e-06, "loss": 0.84201908, "num_input_tokens_seen": 65505380, "step": 3053, "time_per_iteration": 2.726954698562622 }, { "auxiliary_loss_clip": 0.01248108, "auxiliary_loss_mlp": 0.00873051, "balance_loss_clip": 1.00845897, "balance_loss_mlp": 1.00094938, "epoch": 0.3672217880117838, "flos": 20812039545600.0, "grad_norm": 1.7733075890474825, "language_loss": 0.8289901, "learning_rate": 2.921212467317157e-06, "loss": 0.85020173, "num_input_tokens_seen": 65524825, "step": 3054, "time_per_iteration": 2.8902995586395264 }, { "auxiliary_loss_clip": 0.01341289, "auxiliary_loss_mlp": 0.01195248, "balance_loss_clip": 1.00914836, "balance_loss_mlp": 1.00050807, "epoch": 0.3673420309024229, "flos": 13590402903360.0, "grad_norm": 1.841882934258317, "language_loss": 0.80394018, "learning_rate": 2.920520978656093e-06, "loss": 0.82930559, "num_input_tokens_seen": 65541790, "step": 3055, "time_per_iteration": 3.0266573429107666 }, { "auxiliary_loss_clip": 0.01364239, "auxiliary_loss_mlp": 0.00872977, "balance_loss_clip": 1.00901663, "balance_loss_mlp": 1.00109935, "epoch": 0.367462273793062, "flos": 28986890912640.0, "grad_norm": 1.9081319996062858, "language_loss": 0.7640484, "learning_rate": 2.919829350352729e-06, "loss": 0.78642052, "num_input_tokens_seen": 65563395, "step": 3056, "time_per_iteration": 2.8131675720214844 }, { "auxiliary_loss_clip": 0.01338842, "auxiliary_loss_mlp": 0.01193947, "balance_loss_clip": 1.00495672, "balance_loss_mlp": 1.0001601, "epoch": 0.36758251668370107, "flos": 62643175320960.0, "grad_norm": 0.7839129450864303, "language_loss": 0.60022682, "learning_rate": 2.919137582511983e-06, "loss": 0.62555462, "num_input_tokens_seen": 65619835, "step": 3057, "time_per_iteration": 3.176968812942505 }, { "auxiliary_loss_clip": 0.01307934, "auxiliary_loss_mlp": 0.01195581, "balance_loss_clip": 1.00886083, "balance_loss_mlp": 1.00064945, "epoch": 0.3677027595743402, "flos": 12713894740800.0, "grad_norm": 1.893892901938313, "language_loss": 0.64070171, "learning_rate": 2.918445675238797e-06, "loss": 0.66573685, "num_input_tokens_seen": 65636760, "step": 3058, "time_per_iteration": 2.85042142868042 }, { "auxiliary_loss_clip": 0.01364962, "auxiliary_loss_mlp": 0.01195676, "balance_loss_clip": 1.00877893, "balance_loss_mlp": 1.00074506, "epoch": 0.36782300246497923, "flos": 25046591709600.0, "grad_norm": 3.451895681414064, "language_loss": 0.69945025, "learning_rate": 2.917753628638132e-06, "loss": 0.72505665, "num_input_tokens_seen": 65657065, "step": 3059, "time_per_iteration": 2.7003402709960938 }, { "auxiliary_loss_clip": 0.01321833, "auxiliary_loss_mlp": 0.0119575, "balance_loss_clip": 1.00844359, "balance_loss_mlp": 1.00072372, "epoch": 0.36794324535561834, "flos": 17419522020480.0, "grad_norm": 2.202239928002034, "language_loss": 0.69838345, "learning_rate": 2.9170614428149716e-06, "loss": 0.72355926, "num_input_tokens_seen": 65675400, "step": 3060, "time_per_iteration": 2.775517702102661 }, { "auxiliary_loss_clip": 0.01306196, "auxiliary_loss_mlp": 0.01195933, "balance_loss_clip": 1.00867784, "balance_loss_mlp": 1.00081098, "epoch": 0.36806348824625745, "flos": 24089137990560.0, "grad_norm": 2.0768435860837062, "language_loss": 0.86360246, "learning_rate": 2.9163691178743195e-06, "loss": 0.88862371, "num_input_tokens_seen": 65694050, "step": 3061, "time_per_iteration": 2.855499505996704 }, { "auxiliary_loss_clip": 0.0134039, "auxiliary_loss_mlp": 0.01195388, "balance_loss_clip": 1.00876093, "balance_loss_mlp": 1.00064814, "epoch": 0.3681837311368965, "flos": 20521881884160.0, "grad_norm": 1.8024389163678787, "language_loss": 0.77113754, "learning_rate": 2.9156766539212006e-06, "loss": 0.79649532, "num_input_tokens_seen": 65711695, "step": 3062, "time_per_iteration": 2.7739057540893555 }, { "auxiliary_loss_clip": 0.01350908, "auxiliary_loss_mlp": 0.01195189, "balance_loss_clip": 1.00859046, "balance_loss_mlp": 1.00054371, "epoch": 0.3683039740275356, "flos": 21466654544160.0, "grad_norm": 2.1233742005972256, "language_loss": 0.71836478, "learning_rate": 2.9149840510606614e-06, "loss": 0.74382573, "num_input_tokens_seen": 65730350, "step": 3063, "time_per_iteration": 2.700718879699707 }, { "auxiliary_loss_clip": 0.01316319, "auxiliary_loss_mlp": 0.00871799, "balance_loss_clip": 1.00482011, "balance_loss_mlp": 1.00028932, "epoch": 0.36842421691817473, "flos": 70381029935520.0, "grad_norm": 1.0220313094347369, "language_loss": 0.64252466, "learning_rate": 2.914291309397769e-06, "loss": 0.66440588, "num_input_tokens_seen": 65787820, "step": 3064, "time_per_iteration": 3.3807284832000732 }, { "auxiliary_loss_clip": 0.01267963, "auxiliary_loss_mlp": 0.01195244, "balance_loss_clip": 1.0075078, "balance_loss_mlp": 1.00050354, "epoch": 0.3685444598088138, "flos": 23331389810400.0, "grad_norm": 2.6494587590787537, "language_loss": 0.78901541, "learning_rate": 2.9135984290376117e-06, "loss": 0.81364745, "num_input_tokens_seen": 65806685, "step": 3065, "time_per_iteration": 2.8098859786987305 }, { "auxiliary_loss_clip": 0.0127883, "auxiliary_loss_mlp": 0.01195632, "balance_loss_clip": 1.00777745, "balance_loss_mlp": 1.00070119, "epoch": 0.3686647026994529, "flos": 23070281196960.0, "grad_norm": 1.639178323104687, "language_loss": 0.82661259, "learning_rate": 2.9129054100853e-06, "loss": 0.85135722, "num_input_tokens_seen": 65825525, "step": 3066, "time_per_iteration": 2.848144292831421 }, { "auxiliary_loss_clip": 0.01322105, "auxiliary_loss_mlp": 0.01195594, "balance_loss_clip": 1.00823164, "balance_loss_mlp": 1.00075889, "epoch": 0.368784945590092, "flos": 25119921445920.0, "grad_norm": 1.6303471811962356, "language_loss": 0.75759077, "learning_rate": 2.912212252645963e-06, "loss": 0.78276777, "num_input_tokens_seen": 65848110, "step": 3067, "time_per_iteration": 2.88662052154541 }, { "auxiliary_loss_clip": 0.01347268, "auxiliary_loss_mlp": 0.01195723, "balance_loss_clip": 1.00898457, "balance_loss_mlp": 1.00069678, "epoch": 0.36890518848073106, "flos": 18442294495200.0, "grad_norm": 2.08173199338483, "language_loss": 0.76045489, "learning_rate": 2.9115189568247523e-06, "loss": 0.78588486, "num_input_tokens_seen": 65865670, "step": 3068, "time_per_iteration": 3.606527090072632 }, { "auxiliary_loss_clip": 0.01264923, "auxiliary_loss_mlp": 0.01195273, "balance_loss_clip": 1.00838065, "balance_loss_mlp": 1.00043774, "epoch": 0.36902543137137017, "flos": 16362455640480.0, "grad_norm": 6.952239719233283, "language_loss": 0.91601318, "learning_rate": 2.910825522726841e-06, "loss": 0.94061512, "num_input_tokens_seen": 65883195, "step": 3069, "time_per_iteration": 2.821384906768799 }, { "auxiliary_loss_clip": 0.01289884, "auxiliary_loss_mlp": 0.0119524, "balance_loss_clip": 1.00745773, "balance_loss_mlp": 1.00040448, "epoch": 0.3691456742620093, "flos": 12275604735840.0, "grad_norm": 2.60672704162558, "language_loss": 0.76823777, "learning_rate": 2.9101319504574215e-06, "loss": 0.79308897, "num_input_tokens_seen": 65899635, "step": 3070, "time_per_iteration": 5.23540997505188 }, { "auxiliary_loss_clip": 0.01353121, "auxiliary_loss_mlp": 0.01195361, "balance_loss_clip": 1.00898695, "balance_loss_mlp": 1.00052524, "epoch": 0.36926591715264834, "flos": 17786422167840.0, "grad_norm": 1.666515044392911, "language_loss": 0.76385993, "learning_rate": 2.909438240121709e-06, "loss": 0.78934479, "num_input_tokens_seen": 65919910, "step": 3071, "time_per_iteration": 3.767347812652588 }, { "auxiliary_loss_clip": 0.01315312, "auxiliary_loss_mlp": 0.01195587, "balance_loss_clip": 1.00829506, "balance_loss_mlp": 1.00065589, "epoch": 0.36938616004328745, "flos": 28948322089440.0, "grad_norm": 1.718559811565951, "language_loss": 0.70680892, "learning_rate": 2.908744391824939e-06, "loss": 0.73191792, "num_input_tokens_seen": 65940930, "step": 3072, "time_per_iteration": 2.792879343032837 }, { "auxiliary_loss_clip": 0.01279407, "auxiliary_loss_mlp": 0.01195622, "balance_loss_clip": 1.00819349, "balance_loss_mlp": 1.0005002, "epoch": 0.36950640293392656, "flos": 29205407250720.0, "grad_norm": 1.666224443932077, "language_loss": 0.79351246, "learning_rate": 2.908050405672367e-06, "loss": 0.81826276, "num_input_tokens_seen": 65960475, "step": 3073, "time_per_iteration": 2.905958890914917 }, { "auxiliary_loss_clip": 0.01340792, "auxiliary_loss_mlp": 0.01195563, "balance_loss_clip": 1.00890887, "balance_loss_mlp": 1.00072765, "epoch": 0.3696266458245656, "flos": 24827787982080.0, "grad_norm": 1.8362983893493583, "language_loss": 0.7936514, "learning_rate": 2.9073562817692703e-06, "loss": 0.81901503, "num_input_tokens_seen": 65979160, "step": 3074, "time_per_iteration": 2.7542922496795654 }, { "auxiliary_loss_clip": 0.01279389, "auxiliary_loss_mlp": 0.01193942, "balance_loss_clip": 1.00422406, "balance_loss_mlp": 1.00015569, "epoch": 0.3697468887152047, "flos": 59887288239840.0, "grad_norm": 0.7183493542880337, "language_loss": 0.5653649, "learning_rate": 2.9066620202209468e-06, "loss": 0.5900982, "num_input_tokens_seen": 66041650, "step": 3075, "time_per_iteration": 3.3705809116363525 }, { "auxiliary_loss_clip": 0.01310418, "auxiliary_loss_mlp": 0.01195308, "balance_loss_clip": 1.00806272, "balance_loss_mlp": 1.00047231, "epoch": 0.3698671316058438, "flos": 26137592758080.0, "grad_norm": 2.335615091127795, "language_loss": 0.77473813, "learning_rate": 2.905967621132716e-06, "loss": 0.79979539, "num_input_tokens_seen": 66059260, "step": 3076, "time_per_iteration": 2.833070755004883 }, { "auxiliary_loss_clip": 0.01328986, "auxiliary_loss_mlp": 0.01195772, "balance_loss_clip": 1.00865102, "balance_loss_mlp": 1.00074577, "epoch": 0.3699873744964829, "flos": 24607475460000.0, "grad_norm": 3.749353072950683, "language_loss": 0.75247073, "learning_rate": 2.9052730846099172e-06, "loss": 0.77771831, "num_input_tokens_seen": 66080605, "step": 3077, "time_per_iteration": 2.7577407360076904 }, { "auxiliary_loss_clip": 0.01303534, "auxiliary_loss_mlp": 0.01193942, "balance_loss_clip": 1.00377846, "balance_loss_mlp": 1.00015569, "epoch": 0.370107617387122, "flos": 64885372296480.0, "grad_norm": 0.8539072149273982, "language_loss": 0.60992932, "learning_rate": 2.9045784107579123e-06, "loss": 0.63490415, "num_input_tokens_seen": 66140710, "step": 3078, "time_per_iteration": 3.342905282974243 }, { "auxiliary_loss_clip": 0.01364968, "auxiliary_loss_mlp": 0.01195589, "balance_loss_clip": 1.00925577, "balance_loss_mlp": 1.00065851, "epoch": 0.37022786027776106, "flos": 15961692672000.0, "grad_norm": 1.6716755382331032, "language_loss": 0.66872644, "learning_rate": 2.9038835996820807e-06, "loss": 0.694332, "num_input_tokens_seen": 66158320, "step": 3079, "time_per_iteration": 2.6054470539093018 }, { "auxiliary_loss_clip": 0.01327974, "auxiliary_loss_mlp": 0.01195503, "balance_loss_clip": 1.00867879, "balance_loss_mlp": 1.0007627, "epoch": 0.37034810316840017, "flos": 18546936471360.0, "grad_norm": 2.0306431966039313, "language_loss": 0.79735786, "learning_rate": 2.903188651487826e-06, "loss": 0.82259262, "num_input_tokens_seen": 66176875, "step": 3080, "time_per_iteration": 2.8039557933807373 }, { "auxiliary_loss_clip": 0.01344841, "auxiliary_loss_mlp": 0.01195187, "balance_loss_clip": 1.00874758, "balance_loss_mlp": 1.00063789, "epoch": 0.3704683460590393, "flos": 17821937478240.0, "grad_norm": 1.9453346930479636, "language_loss": 0.86425287, "learning_rate": 2.902493566280571e-06, "loss": 0.88965315, "num_input_tokens_seen": 66194980, "step": 3081, "time_per_iteration": 2.637751817703247 }, { "auxiliary_loss_clip": 0.01315176, "auxiliary_loss_mlp": 0.01195348, "balance_loss_clip": 1.00786495, "balance_loss_mlp": 1.00051212, "epoch": 0.37058858894967833, "flos": 14134089281760.0, "grad_norm": 2.029838760263547, "language_loss": 0.81402957, "learning_rate": 2.9017983441657595e-06, "loss": 0.83913481, "num_input_tokens_seen": 66212310, "step": 3082, "time_per_iteration": 2.6701836585998535 }, { "auxiliary_loss_clip": 0.01303076, "auxiliary_loss_mlp": 0.01195673, "balance_loss_clip": 1.00823855, "balance_loss_mlp": 1.00064611, "epoch": 0.37070883184031744, "flos": 13954501003680.0, "grad_norm": 1.9331151075288473, "language_loss": 0.75215256, "learning_rate": 2.9011029852488564e-06, "loss": 0.77714008, "num_input_tokens_seen": 66229545, "step": 3083, "time_per_iteration": 2.8607828617095947 }, { "auxiliary_loss_clip": 0.0133745, "auxiliary_loss_mlp": 0.01193854, "balance_loss_clip": 1.00427127, "balance_loss_mlp": 1.000067, "epoch": 0.37082907473095655, "flos": 52315442752320.0, "grad_norm": 1.0046596435770243, "language_loss": 0.62491441, "learning_rate": 2.9004074896353465e-06, "loss": 0.65022743, "num_input_tokens_seen": 66283545, "step": 3084, "time_per_iteration": 3.140103578567505 }, { "auxiliary_loss_clip": 0.01364699, "auxiliary_loss_mlp": 0.01195416, "balance_loss_clip": 1.00979805, "balance_loss_mlp": 1.00058007, "epoch": 0.3709493176215956, "flos": 15998106074400.0, "grad_norm": 1.9289879186626906, "language_loss": 0.81826556, "learning_rate": 2.8997118574307362e-06, "loss": 0.84386671, "num_input_tokens_seen": 66300500, "step": 3085, "time_per_iteration": 2.7046239376068115 }, { "auxiliary_loss_clip": 0.01309479, "auxiliary_loss_mlp": 0.01195459, "balance_loss_clip": 1.00855923, "balance_loss_mlp": 1.00062335, "epoch": 0.3710695605122347, "flos": 20959848576000.0, "grad_norm": 1.839673334915671, "language_loss": 0.74416518, "learning_rate": 2.899016088740553e-06, "loss": 0.76921463, "num_input_tokens_seen": 66318610, "step": 3086, "time_per_iteration": 2.775514602661133 }, { "auxiliary_loss_clip": 0.0128016, "auxiliary_loss_mlp": 0.01195259, "balance_loss_clip": 1.00737834, "balance_loss_mlp": 1.00051832, "epoch": 0.37118980340287383, "flos": 14355587285280.0, "grad_norm": 1.7823872380984633, "language_loss": 0.79221743, "learning_rate": 2.898320183670344e-06, "loss": 0.81697154, "num_input_tokens_seen": 66336025, "step": 3087, "time_per_iteration": 2.8029816150665283 }, { "auxiliary_loss_clip": 0.01279294, "auxiliary_loss_mlp": 0.01195504, "balance_loss_clip": 1.00787795, "balance_loss_mlp": 1.00057316, "epoch": 0.3713100462935129, "flos": 25885393217280.0, "grad_norm": 1.6540729247591544, "language_loss": 0.88743013, "learning_rate": 2.8976241423256767e-06, "loss": 0.9121781, "num_input_tokens_seen": 66356120, "step": 3088, "time_per_iteration": 2.9210472106933594 }, { "auxiliary_loss_clip": 0.01339462, "auxiliary_loss_mlp": 0.01195107, "balance_loss_clip": 1.00920761, "balance_loss_mlp": 1.0005579, "epoch": 0.371430289184152, "flos": 30518948089440.0, "grad_norm": 1.785029121408699, "language_loss": 0.68067396, "learning_rate": 2.896927964812142e-06, "loss": 0.70601964, "num_input_tokens_seen": 66376685, "step": 3089, "time_per_iteration": 2.941654682159424 }, { "auxiliary_loss_clip": 0.01319829, "auxiliary_loss_mlp": 0.01195305, "balance_loss_clip": 1.00876176, "balance_loss_mlp": 1.00046921, "epoch": 0.37155053207479105, "flos": 15742242318240.0, "grad_norm": 2.693268097890074, "language_loss": 0.75185722, "learning_rate": 2.8962316512353465e-06, "loss": 0.77700853, "num_input_tokens_seen": 66394230, "step": 3090, "time_per_iteration": 2.9329257011413574 }, { "auxiliary_loss_clip": 0.01275434, "auxiliary_loss_mlp": 0.01195324, "balance_loss_clip": 1.00778675, "balance_loss_mlp": 1.00058377, "epoch": 0.37167077496543016, "flos": 23404072920480.0, "grad_norm": 1.5791401549416118, "language_loss": 0.75090218, "learning_rate": 2.8955352017009233e-06, "loss": 0.77560973, "num_input_tokens_seen": 66413475, "step": 3091, "time_per_iteration": 2.9556899070739746 }, { "auxiliary_loss_clip": 0.01316517, "auxiliary_loss_mlp": 0.01195512, "balance_loss_clip": 1.00799716, "balance_loss_mlp": 1.00058091, "epoch": 0.3717910178560693, "flos": 22088664050400.0, "grad_norm": 1.7590724959963404, "language_loss": 0.77206755, "learning_rate": 2.8948386163145212e-06, "loss": 0.79718786, "num_input_tokens_seen": 66432685, "step": 3092, "time_per_iteration": 2.8013298511505127 }, { "auxiliary_loss_clip": 0.0134563, "auxiliary_loss_mlp": 0.01195155, "balance_loss_clip": 1.00961494, "balance_loss_mlp": 1.00050986, "epoch": 0.3719112607467083, "flos": 26939980863360.0, "grad_norm": 1.696264055863176, "language_loss": 0.79345882, "learning_rate": 2.8941418951818135e-06, "loss": 0.81886673, "num_input_tokens_seen": 66452245, "step": 3093, "time_per_iteration": 3.7401320934295654 }, { "auxiliary_loss_clip": 0.0130862, "auxiliary_loss_mlp": 0.01195166, "balance_loss_clip": 1.00784218, "balance_loss_mlp": 1.00052094, "epoch": 0.37203150363734744, "flos": 12166508223360.0, "grad_norm": 2.030849488140449, "language_loss": 0.71039248, "learning_rate": 2.8934450384084903e-06, "loss": 0.73543036, "num_input_tokens_seen": 66469760, "step": 3094, "time_per_iteration": 2.7566301822662354 }, { "auxiliary_loss_clip": 0.01328216, "auxiliary_loss_mlp": 0.01195609, "balance_loss_clip": 1.0085876, "balance_loss_mlp": 1.0006783, "epoch": 0.37215174652798655, "flos": 23697607407840.0, "grad_norm": 1.963295058211422, "language_loss": 0.69735038, "learning_rate": 2.8927480461002653e-06, "loss": 0.7225886, "num_input_tokens_seen": 66489730, "step": 3095, "time_per_iteration": 3.6706669330596924 }, { "auxiliary_loss_clip": 0.01339933, "auxiliary_loss_mlp": 0.01195338, "balance_loss_clip": 1.0087142, "balance_loss_mlp": 1.00059736, "epoch": 0.3722719894186256, "flos": 17887759165440.0, "grad_norm": 2.589612598286023, "language_loss": 0.86018789, "learning_rate": 2.892050918362872e-06, "loss": 0.8855406, "num_input_tokens_seen": 66504785, "step": 3096, "time_per_iteration": 3.8690309524536133 }, { "auxiliary_loss_clip": 0.01230938, "auxiliary_loss_mlp": 0.01193882, "balance_loss_clip": 1.00421274, "balance_loss_mlp": 1.00009561, "epoch": 0.3723922323092647, "flos": 62419917057120.0, "grad_norm": 0.8446805174129153, "language_loss": 0.55925727, "learning_rate": 2.8913536553020626e-06, "loss": 0.58350551, "num_input_tokens_seen": 66558840, "step": 3097, "time_per_iteration": 3.4021530151367188 }, { "auxiliary_loss_clip": 0.01296606, "auxiliary_loss_mlp": 0.01195522, "balance_loss_clip": 1.00786185, "balance_loss_mlp": 1.00078177, "epoch": 0.3725124751999038, "flos": 23039759278080.0, "grad_norm": 1.9536044469850284, "language_loss": 0.84596962, "learning_rate": 2.8906562570236137e-06, "loss": 0.87089086, "num_input_tokens_seen": 66576750, "step": 3098, "time_per_iteration": 2.9042069911956787 }, { "auxiliary_loss_clip": 0.01275924, "auxiliary_loss_mlp": 0.01195464, "balance_loss_clip": 1.0070076, "balance_loss_mlp": 1.00043797, "epoch": 0.3726327180905429, "flos": 20920561279200.0, "grad_norm": 1.4626732470450527, "language_loss": 0.76547265, "learning_rate": 2.889958723633318e-06, "loss": 0.79018658, "num_input_tokens_seen": 66595690, "step": 3099, "time_per_iteration": 2.9410018920898438 }, { "auxiliary_loss_clip": 0.01302956, "auxiliary_loss_mlp": 0.01195347, "balance_loss_clip": 1.00819695, "balance_loss_mlp": 1.00060713, "epoch": 0.372752960981182, "flos": 30592170054720.0, "grad_norm": 1.548959683273136, "language_loss": 0.73510194, "learning_rate": 2.889261055236992e-06, "loss": 0.76008499, "num_input_tokens_seen": 66617905, "step": 3100, "time_per_iteration": 2.878861665725708 }, { "auxiliary_loss_clip": 0.01315001, "auxiliary_loss_mlp": 0.01195546, "balance_loss_clip": 1.0082128, "balance_loss_mlp": 1.00061548, "epoch": 0.3728732038718211, "flos": 25116760162080.0, "grad_norm": 1.7452947735883086, "language_loss": 0.82535422, "learning_rate": 2.8885632519404704e-06, "loss": 0.85045969, "num_input_tokens_seen": 66638175, "step": 3101, "time_per_iteration": 2.807896852493286 }, { "auxiliary_loss_clip": 0.01316908, "auxiliary_loss_mlp": 0.0119539, "balance_loss_clip": 1.00835025, "balance_loss_mlp": 1.00065017, "epoch": 0.37299344676246016, "flos": 25302060305280.0, "grad_norm": 1.9829600303987318, "language_loss": 0.75860476, "learning_rate": 2.8878653138496107e-06, "loss": 0.78372777, "num_input_tokens_seen": 66658670, "step": 3102, "time_per_iteration": 2.8116464614868164 }, { "auxiliary_loss_clip": 0.01279754, "auxiliary_loss_mlp": 0.0119529, "balance_loss_clip": 1.00808358, "balance_loss_mlp": 1.00054979, "epoch": 0.37311368965309927, "flos": 23842542543840.0, "grad_norm": 2.413215625263253, "language_loss": 0.76045591, "learning_rate": 2.8871672410702878e-06, "loss": 0.78520638, "num_input_tokens_seen": 66676030, "step": 3103, "time_per_iteration": 2.8536853790283203 }, { "auxiliary_loss_clip": 0.01315961, "auxiliary_loss_mlp": 0.01195386, "balance_loss_clip": 1.00801635, "balance_loss_mlp": 1.00045502, "epoch": 0.3732339325437384, "flos": 25811955709920.0, "grad_norm": 1.7452777396753092, "language_loss": 0.81960994, "learning_rate": 2.8864690337084008e-06, "loss": 0.84472334, "num_input_tokens_seen": 66695305, "step": 3104, "time_per_iteration": 2.7959046363830566 }, { "auxiliary_loss_clip": 0.01353224, "auxiliary_loss_mlp": 0.0119516, "balance_loss_clip": 1.00936794, "balance_loss_mlp": 1.00051546, "epoch": 0.37335417543437743, "flos": 26208443760480.0, "grad_norm": 2.625767834731497, "language_loss": 0.77945042, "learning_rate": 2.885770691869866e-06, "loss": 0.80493432, "num_input_tokens_seen": 66716185, "step": 3105, "time_per_iteration": 2.7519400119781494 }, { "auxiliary_loss_clip": 0.01339815, "auxiliary_loss_mlp": 0.01195196, "balance_loss_clip": 1.0083828, "balance_loss_mlp": 1.00045562, "epoch": 0.37347441832501654, "flos": 24023891082240.0, "grad_norm": 2.4188444582828166, "language_loss": 0.74663234, "learning_rate": 2.8850722156606207e-06, "loss": 0.77198243, "num_input_tokens_seen": 66734575, "step": 3106, "time_per_iteration": 2.8167803287506104 }, { "auxiliary_loss_clip": 0.01352001, "auxiliary_loss_mlp": 0.0119536, "balance_loss_clip": 1.00895262, "balance_loss_mlp": 1.00062001, "epoch": 0.3735946612156556, "flos": 19714931471520.0, "grad_norm": 1.5191476621861488, "language_loss": 0.67076886, "learning_rate": 2.8843736051866252e-06, "loss": 0.69624245, "num_input_tokens_seen": 66753500, "step": 3107, "time_per_iteration": 2.759331226348877 }, { "auxiliary_loss_clip": 0.01292361, "auxiliary_loss_mlp": 0.00872689, "balance_loss_clip": 1.00794768, "balance_loss_mlp": 1.00057709, "epoch": 0.3737149041062947, "flos": 23039615583360.0, "grad_norm": 1.5518635969598116, "language_loss": 0.69411922, "learning_rate": 2.8836748605538557e-06, "loss": 0.71576971, "num_input_tokens_seen": 66775140, "step": 3108, "time_per_iteration": 2.9599759578704834 }, { "auxiliary_loss_clip": 0.01329273, "auxiliary_loss_mlp": 0.01195842, "balance_loss_clip": 1.00846803, "balance_loss_mlp": 1.00081539, "epoch": 0.3738351469969338, "flos": 34678122867360.0, "grad_norm": 1.9089912823780155, "language_loss": 0.6340245, "learning_rate": 2.882975981868313e-06, "loss": 0.65927565, "num_input_tokens_seen": 66795525, "step": 3109, "time_per_iteration": 2.8438146114349365 }, { "auxiliary_loss_clip": 0.01338603, "auxiliary_loss_mlp": 0.01195679, "balance_loss_clip": 1.00855446, "balance_loss_mlp": 1.00065243, "epoch": 0.3739553898875729, "flos": 43507984393440.0, "grad_norm": 2.1873183412926585, "language_loss": 0.68990278, "learning_rate": 2.882276969236016e-06, "loss": 0.71524554, "num_input_tokens_seen": 66816885, "step": 3110, "time_per_iteration": 2.9436397552490234 }, { "auxiliary_loss_clip": 0.01327707, "auxiliary_loss_mlp": 0.01195377, "balance_loss_clip": 1.00852823, "balance_loss_mlp": 1.00054145, "epoch": 0.374075632778212, "flos": 12856494837600.0, "grad_norm": 2.004900809061302, "language_loss": 0.76243448, "learning_rate": 2.881577822763005e-06, "loss": 0.78766531, "num_input_tokens_seen": 66834835, "step": 3111, "time_per_iteration": 2.7258987426757812 }, { "auxiliary_loss_clip": 0.01342527, "auxiliary_loss_mlp": 0.01195445, "balance_loss_clip": 1.00834858, "balance_loss_mlp": 1.00060976, "epoch": 0.3741958756688511, "flos": 26024041709280.0, "grad_norm": 2.01248480130252, "language_loss": 0.8730101, "learning_rate": 2.880878542555338e-06, "loss": 0.89838982, "num_input_tokens_seen": 66852600, "step": 3112, "time_per_iteration": 2.8159916400909424 }, { "auxiliary_loss_clip": 0.013646, "auxiliary_loss_mlp": 0.01195681, "balance_loss_clip": 1.00911307, "balance_loss_mlp": 1.00065446, "epoch": 0.37431611855949015, "flos": 21433905357120.0, "grad_norm": 1.9180917764283456, "language_loss": 0.80240691, "learning_rate": 2.8801791287190976e-06, "loss": 0.82800972, "num_input_tokens_seen": 66870595, "step": 3113, "time_per_iteration": 2.7563581466674805 }, { "auxiliary_loss_clip": 0.01343434, "auxiliary_loss_mlp": 0.01195627, "balance_loss_clip": 1.00879216, "balance_loss_mlp": 1.0006001, "epoch": 0.37443636145012926, "flos": 24207107652000.0, "grad_norm": 3.334353893483644, "language_loss": 0.85586071, "learning_rate": 2.8794795813603817e-06, "loss": 0.88125128, "num_input_tokens_seen": 66886060, "step": 3114, "time_per_iteration": 2.7693328857421875 }, { "auxiliary_loss_clip": 0.01345332, "auxiliary_loss_mlp": 0.01195555, "balance_loss_clip": 1.00911963, "balance_loss_mlp": 1.00062442, "epoch": 0.3745566043407684, "flos": 15378611225760.0, "grad_norm": 1.6160665946872759, "language_loss": 0.81637454, "learning_rate": 2.878779900585314e-06, "loss": 0.84178334, "num_input_tokens_seen": 66903900, "step": 3115, "time_per_iteration": 2.662062883377075 }, { "auxiliary_loss_clip": 0.01333523, "auxiliary_loss_mlp": 0.01195474, "balance_loss_clip": 1.00846887, "balance_loss_mlp": 1.00063837, "epoch": 0.37467684723140743, "flos": 24608229857280.0, "grad_norm": 1.5533157351125417, "language_loss": 0.75265914, "learning_rate": 2.8780800865000336e-06, "loss": 0.77794909, "num_input_tokens_seen": 66925210, "step": 3116, "time_per_iteration": 2.8235764503479004 }, { "auxiliary_loss_clip": 0.01317973, "auxiliary_loss_mlp": 0.01193844, "balance_loss_clip": 1.0042913, "balance_loss_mlp": 1.00005746, "epoch": 0.37479709012204654, "flos": 64377524541600.0, "grad_norm": 0.9757355447370432, "language_loss": 0.59211695, "learning_rate": 2.877380139210702e-06, "loss": 0.61723518, "num_input_tokens_seen": 66983880, "step": 3117, "time_per_iteration": 3.247380495071411 }, { "auxiliary_loss_clip": 0.01297235, "auxiliary_loss_mlp": 0.01195892, "balance_loss_clip": 1.00856721, "balance_loss_mlp": 1.00077009, "epoch": 0.37491733301268565, "flos": 23803973720640.0, "grad_norm": 1.550543878013965, "language_loss": 0.76558048, "learning_rate": 2.876680058823501e-06, "loss": 0.79051173, "num_input_tokens_seen": 67004280, "step": 3118, "time_per_iteration": 2.831753730773926 }, { "auxiliary_loss_clip": 0.01327795, "auxiliary_loss_mlp": 0.01195986, "balance_loss_clip": 1.00860155, "balance_loss_mlp": 1.00086379, "epoch": 0.3750375759033247, "flos": 32160963947040.0, "grad_norm": 1.7243976798541827, "language_loss": 0.65642774, "learning_rate": 2.8759798454446314e-06, "loss": 0.68166554, "num_input_tokens_seen": 67027445, "step": 3119, "time_per_iteration": 2.880117893218994 }, { "auxiliary_loss_clip": 0.01351681, "auxiliary_loss_mlp": 0.0119569, "balance_loss_clip": 1.00916505, "balance_loss_mlp": 1.00066376, "epoch": 0.3751578187939638, "flos": 23367803212800.0, "grad_norm": 1.7396912832893967, "language_loss": 0.81521714, "learning_rate": 2.8752794991803173e-06, "loss": 0.84069085, "num_input_tokens_seen": 67045130, "step": 3120, "time_per_iteration": 3.744805335998535 }, { "auxiliary_loss_clip": 0.01327979, "auxiliary_loss_mlp": 0.01195357, "balance_loss_clip": 1.00895035, "balance_loss_mlp": 1.00052118, "epoch": 0.37527806168460287, "flos": 14605739176320.0, "grad_norm": 1.8384069890366126, "language_loss": 0.75019419, "learning_rate": 2.8745790201367976e-06, "loss": 0.77542752, "num_input_tokens_seen": 67060885, "step": 3121, "time_per_iteration": 3.8162472248077393 }, { "auxiliary_loss_clip": 0.01365123, "auxiliary_loss_mlp": 0.01195672, "balance_loss_clip": 1.00936913, "balance_loss_mlp": 1.00055027, "epoch": 0.375398304575242, "flos": 26390834085600.0, "grad_norm": 2.124207293137122, "language_loss": 0.84383214, "learning_rate": 2.8738784084203373e-06, "loss": 0.86944008, "num_input_tokens_seen": 67080960, "step": 3122, "time_per_iteration": 3.641331672668457 }, { "auxiliary_loss_clip": 0.01339416, "auxiliary_loss_mlp": 0.01195561, "balance_loss_clip": 1.00922596, "balance_loss_mlp": 1.00082064, "epoch": 0.3755185474658811, "flos": 22236616775520.0, "grad_norm": 1.5538880874503356, "language_loss": 0.78671575, "learning_rate": 2.873177664137216e-06, "loss": 0.81206548, "num_input_tokens_seen": 67101890, "step": 3123, "time_per_iteration": 3.7630743980407715 }, { "auxiliary_loss_clip": 0.01292766, "auxiliary_loss_mlp": 0.01195636, "balance_loss_clip": 1.00773418, "balance_loss_mlp": 1.00060952, "epoch": 0.37563879035652015, "flos": 30812949584640.0, "grad_norm": 1.5549511448078746, "language_loss": 0.69024336, "learning_rate": 2.8724767873937384e-06, "loss": 0.71512735, "num_input_tokens_seen": 67126010, "step": 3124, "time_per_iteration": 2.838827610015869 }, { "auxiliary_loss_clip": 0.01319405, "auxiliary_loss_mlp": 0.01195599, "balance_loss_clip": 1.00853634, "balance_loss_mlp": 1.00066841, "epoch": 0.37575903324715926, "flos": 20773542569760.0, "grad_norm": 1.9904908938779506, "language_loss": 0.87441456, "learning_rate": 2.871775778296225e-06, "loss": 0.89956462, "num_input_tokens_seen": 67143100, "step": 3125, "time_per_iteration": 2.7430899143218994 }, { "auxiliary_loss_clip": 0.01340979, "auxiliary_loss_mlp": 0.01195521, "balance_loss_clip": 1.00975144, "balance_loss_mlp": 1.00059009, "epoch": 0.37587927613779837, "flos": 18697691243520.0, "grad_norm": 2.160550895340856, "language_loss": 0.78507364, "learning_rate": 2.8710746369510196e-06, "loss": 0.81043869, "num_input_tokens_seen": 67161085, "step": 3126, "time_per_iteration": 2.671764612197876 }, { "auxiliary_loss_clip": 0.01334689, "auxiliary_loss_mlp": 0.0119567, "balance_loss_clip": 1.00931859, "balance_loss_mlp": 1.00054884, "epoch": 0.3759995190284374, "flos": 13624804579680.0, "grad_norm": 2.8964227972617462, "language_loss": 0.83054233, "learning_rate": 2.8703733634644846e-06, "loss": 0.85584593, "num_input_tokens_seen": 67175840, "step": 3127, "time_per_iteration": 2.7445590496063232 }, { "auxiliary_loss_clip": 0.01364227, "auxiliary_loss_mlp": 0.01195385, "balance_loss_clip": 1.00935125, "balance_loss_mlp": 1.00064445, "epoch": 0.37611976191907653, "flos": 20484857779200.0, "grad_norm": 1.664826609935947, "language_loss": 0.79070938, "learning_rate": 2.869671957943002e-06, "loss": 0.81630552, "num_input_tokens_seen": 67194995, "step": 3128, "time_per_iteration": 2.684035062789917 }, { "auxiliary_loss_clip": 0.01314762, "auxiliary_loss_mlp": 0.01195547, "balance_loss_clip": 1.0094595, "balance_loss_mlp": 1.0006156, "epoch": 0.37624000480971564, "flos": 21141807816960.0, "grad_norm": 1.746636194472923, "language_loss": 0.74238753, "learning_rate": 2.8689704204929747e-06, "loss": 0.76749063, "num_input_tokens_seen": 67214175, "step": 3129, "time_per_iteration": 2.689688205718994 }, { "auxiliary_loss_clip": 0.01364346, "auxiliary_loss_mlp": 0.01195546, "balance_loss_clip": 1.00904071, "balance_loss_mlp": 1.00071084, "epoch": 0.3763602477003547, "flos": 22564481091840.0, "grad_norm": 1.899264453654636, "language_loss": 0.81408161, "learning_rate": 2.8682687512208253e-06, "loss": 0.83968055, "num_input_tokens_seen": 67233185, "step": 3130, "time_per_iteration": 2.684877634048462 }, { "auxiliary_loss_clip": 0.01351176, "auxiliary_loss_mlp": 0.011956, "balance_loss_clip": 1.00898981, "balance_loss_mlp": 1.00066936, "epoch": 0.3764804905909938, "flos": 27526870219680.0, "grad_norm": 1.9372350123843611, "language_loss": 0.80076903, "learning_rate": 2.8675669502329972e-06, "loss": 0.82623684, "num_input_tokens_seen": 67254715, "step": 3131, "time_per_iteration": 2.7490062713623047 }, { "auxiliary_loss_clip": 0.01339049, "auxiliary_loss_mlp": 0.00872797, "balance_loss_clip": 1.00786519, "balance_loss_mlp": 1.00083721, "epoch": 0.3766007334816329, "flos": 22528103613120.0, "grad_norm": 2.3947036971536897, "language_loss": 0.85497856, "learning_rate": 2.866865017635952e-06, "loss": 0.87709701, "num_input_tokens_seen": 67272535, "step": 3132, "time_per_iteration": 2.7444961071014404 }, { "auxiliary_loss_clip": 0.01294396, "auxiliary_loss_mlp": 0.01196045, "balance_loss_clip": 1.00854123, "balance_loss_mlp": 1.00082815, "epoch": 0.376720976372272, "flos": 25957178235360.0, "grad_norm": 1.4599502931440451, "language_loss": 0.79437232, "learning_rate": 2.866162953536174e-06, "loss": 0.81927669, "num_input_tokens_seen": 67293505, "step": 3133, "time_per_iteration": 2.7923247814178467 }, { "auxiliary_loss_clip": 0.01337853, "auxiliary_loss_mlp": 0.00872745, "balance_loss_clip": 1.00899291, "balance_loss_mlp": 1.00063097, "epoch": 0.3768412192629111, "flos": 18041172289920.0, "grad_norm": 1.5236466770653596, "language_loss": 0.75151181, "learning_rate": 2.8654607580401634e-06, "loss": 0.77361786, "num_input_tokens_seen": 67313240, "step": 3134, "time_per_iteration": 2.764427661895752 }, { "auxiliary_loss_clip": 0.01317653, "auxiliary_loss_mlp": 0.01193862, "balance_loss_clip": 1.00441217, "balance_loss_mlp": 1.00007558, "epoch": 0.3769614621535502, "flos": 62989504408800.0, "grad_norm": 0.8826234507109605, "language_loss": 0.65225828, "learning_rate": 2.8647584312544446e-06, "loss": 0.67737341, "num_input_tokens_seen": 67378445, "step": 3135, "time_per_iteration": 3.369939088821411 }, { "auxiliary_loss_clip": 0.01310427, "auxiliary_loss_mlp": 0.00872772, "balance_loss_clip": 1.00901723, "balance_loss_mlp": 1.00073671, "epoch": 0.37708170504418925, "flos": 23661696936960.0, "grad_norm": 1.357847887473665, "language_loss": 0.850007, "learning_rate": 2.864055973285559e-06, "loss": 0.87183899, "num_input_tokens_seen": 67400445, "step": 3136, "time_per_iteration": 2.8156380653381348 }, { "auxiliary_loss_clip": 0.01327111, "auxiliary_loss_mlp": 0.01195318, "balance_loss_clip": 1.00840342, "balance_loss_mlp": 1.00057745, "epoch": 0.37720194793482836, "flos": 24423181179840.0, "grad_norm": 2.1204291849697796, "language_loss": 0.86503899, "learning_rate": 2.8633533842400698e-06, "loss": 0.89026332, "num_input_tokens_seen": 67420645, "step": 3137, "time_per_iteration": 2.7901313304901123 }, { "auxiliary_loss_clip": 0.01340295, "auxiliary_loss_mlp": 0.00872918, "balance_loss_clip": 1.00909805, "balance_loss_mlp": 1.00072467, "epoch": 0.3773221908254674, "flos": 20996513444160.0, "grad_norm": 1.7090581713169235, "language_loss": 0.77365339, "learning_rate": 2.862650664224558e-06, "loss": 0.79578555, "num_input_tokens_seen": 67439495, "step": 3138, "time_per_iteration": 2.6832163333892822 }, { "auxiliary_loss_clip": 0.01339249, "auxiliary_loss_mlp": 0.01195445, "balance_loss_clip": 1.00961578, "balance_loss_mlp": 1.00060976, "epoch": 0.37744243371610653, "flos": 37631739684960.0, "grad_norm": 1.344980318503856, "language_loss": 0.69809866, "learning_rate": 2.861947813345627e-06, "loss": 0.72344559, "num_input_tokens_seen": 67462195, "step": 3139, "time_per_iteration": 2.8846566677093506 }, { "auxiliary_loss_clip": 0.01364848, "auxiliary_loss_mlp": 0.00872793, "balance_loss_clip": 1.00952077, "balance_loss_mlp": 1.00066662, "epoch": 0.37756267660674564, "flos": 26140538499840.0, "grad_norm": 4.089135020458161, "language_loss": 0.7260766, "learning_rate": 2.8612448317098974e-06, "loss": 0.74845302, "num_input_tokens_seen": 67482530, "step": 3140, "time_per_iteration": 2.7714076042175293 }, { "auxiliary_loss_clip": 0.01306777, "auxiliary_loss_mlp": 0.00872896, "balance_loss_clip": 1.00817037, "balance_loss_mlp": 1.00089264, "epoch": 0.3776829194973847, "flos": 19427899170240.0, "grad_norm": 2.181848131186375, "language_loss": 0.82792556, "learning_rate": 2.8605417194240114e-06, "loss": 0.84972233, "num_input_tokens_seen": 67500890, "step": 3141, "time_per_iteration": 2.8203284740448 }, { "auxiliary_loss_clip": 0.0133848, "auxiliary_loss_mlp": 0.01195449, "balance_loss_clip": 1.00832224, "balance_loss_mlp": 1.00061381, "epoch": 0.3778031623880238, "flos": 17382318297120.0, "grad_norm": 1.847204972199892, "language_loss": 0.78914201, "learning_rate": 2.8598384765946315e-06, "loss": 0.81448138, "num_input_tokens_seen": 67519545, "step": 3142, "time_per_iteration": 2.7061848640441895 }, { "auxiliary_loss_clip": 0.0136377, "auxiliary_loss_mlp": 0.01195198, "balance_loss_clip": 1.00854039, "balance_loss_mlp": 1.00045753, "epoch": 0.3779234052786629, "flos": 27125855785440.0, "grad_norm": 1.667356005763263, "language_loss": 0.71609312, "learning_rate": 2.8591351033284377e-06, "loss": 0.74168277, "num_input_tokens_seen": 67539275, "step": 3143, "time_per_iteration": 2.940547227859497 }, { "auxiliary_loss_clip": 0.01343742, "auxiliary_loss_mlp": 0.01195455, "balance_loss_clip": 1.00922251, "balance_loss_mlp": 1.00071454, "epoch": 0.37804364816930197, "flos": 19682649292320.0, "grad_norm": 1.9995022395052926, "language_loss": 0.83367383, "learning_rate": 2.8584315997321325e-06, "loss": 0.85906583, "num_input_tokens_seen": 67558280, "step": 3144, "time_per_iteration": 2.6740353107452393 }, { "auxiliary_loss_clip": 0.01363877, "auxiliary_loss_mlp": 0.01195257, "balance_loss_clip": 1.00879574, "balance_loss_mlp": 1.00042152, "epoch": 0.3781638910599411, "flos": 22702914041760.0, "grad_norm": 2.2260997860302174, "language_loss": 0.77580613, "learning_rate": 2.8577279659124356e-06, "loss": 0.80139756, "num_input_tokens_seen": 67575955, "step": 3145, "time_per_iteration": 2.616039752960205 }, { "auxiliary_loss_clip": 0.01351423, "auxiliary_loss_mlp": 0.01195306, "balance_loss_clip": 1.00911927, "balance_loss_mlp": 1.00056577, "epoch": 0.3782841339505802, "flos": 14647612978080.0, "grad_norm": 1.7302603621039623, "language_loss": 0.83362609, "learning_rate": 2.857024201976089e-06, "loss": 0.85909343, "num_input_tokens_seen": 67593515, "step": 3146, "time_per_iteration": 3.671067714691162 }, { "auxiliary_loss_clip": 0.0131358, "auxiliary_loss_mlp": 0.01195954, "balance_loss_clip": 1.00815034, "balance_loss_mlp": 1.00054634, "epoch": 0.37840437684121925, "flos": 32818237297920.0, "grad_norm": 1.7860867257330053, "language_loss": 0.72729069, "learning_rate": 2.8563203080298516e-06, "loss": 0.75238597, "num_input_tokens_seen": 67614290, "step": 3147, "time_per_iteration": 2.7757997512817383 }, { "auxiliary_loss_clip": 0.01316249, "auxiliary_loss_mlp": 0.00872897, "balance_loss_clip": 1.00788677, "balance_loss_mlp": 1.00081408, "epoch": 0.37852461973185836, "flos": 18369216224640.0, "grad_norm": 2.6398092157000184, "language_loss": 0.89171493, "learning_rate": 2.855616284180505e-06, "loss": 0.91360635, "num_input_tokens_seen": 67631340, "step": 3148, "time_per_iteration": 3.7706782817840576 }, { "auxiliary_loss_clip": 0.01324412, "auxiliary_loss_mlp": 0.01193839, "balance_loss_clip": 1.00426137, "balance_loss_mlp": 1.00005269, "epoch": 0.37864486262249747, "flos": 59500715124960.0, "grad_norm": 0.8744343237070767, "language_loss": 0.66135597, "learning_rate": 2.8549121305348477e-06, "loss": 0.68653846, "num_input_tokens_seen": 67691125, "step": 3149, "time_per_iteration": 4.240316390991211 }, { "auxiliary_loss_clip": 0.01350794, "auxiliary_loss_mlp": 0.01195459, "balance_loss_clip": 1.00879323, "balance_loss_mlp": 1.00062323, "epoch": 0.3787651055131365, "flos": 23363025363360.0, "grad_norm": 2.5222267739778803, "language_loss": 0.83439618, "learning_rate": 2.8542078471997006e-06, "loss": 0.85985863, "num_input_tokens_seen": 67708740, "step": 3150, "time_per_iteration": 2.814023733139038 }, { "auxiliary_loss_clip": 0.0134478, "auxiliary_loss_mlp": 0.01195359, "balance_loss_clip": 1.00849557, "balance_loss_mlp": 1.00052309, "epoch": 0.37888534840377563, "flos": 24601404358080.0, "grad_norm": 1.7592607026331488, "language_loss": 0.75707424, "learning_rate": 2.8535034342819013e-06, "loss": 0.78247565, "num_input_tokens_seen": 67726150, "step": 3151, "time_per_iteration": 2.7138471603393555 }, { "auxiliary_loss_clip": 0.01362657, "auxiliary_loss_mlp": 0.01195202, "balance_loss_clip": 1.00879431, "balance_loss_mlp": 1.00046182, "epoch": 0.37900559129441475, "flos": 23986902900960.0, "grad_norm": 1.5669364614740693, "language_loss": 0.72913021, "learning_rate": 2.85279889188831e-06, "loss": 0.75470877, "num_input_tokens_seen": 67746525, "step": 3152, "time_per_iteration": 2.74245548248291 }, { "auxiliary_loss_clip": 0.01320899, "auxiliary_loss_mlp": 0.01195624, "balance_loss_clip": 1.00826263, "balance_loss_mlp": 1.00059807, "epoch": 0.3791258341850538, "flos": 24644679183360.0, "grad_norm": 1.7754988476485747, "language_loss": 0.80863726, "learning_rate": 2.852094220125805e-06, "loss": 0.83380246, "num_input_tokens_seen": 67766035, "step": 3153, "time_per_iteration": 2.8449506759643555 }, { "auxiliary_loss_clip": 0.01341511, "auxiliary_loss_mlp": 0.01195841, "balance_loss_clip": 1.00891125, "balance_loss_mlp": 1.00071943, "epoch": 0.3792460770756929, "flos": 17420851196640.0, "grad_norm": 2.052924452277592, "language_loss": 0.70932049, "learning_rate": 2.8513894191012846e-06, "loss": 0.734694, "num_input_tokens_seen": 67785015, "step": 3154, "time_per_iteration": 2.7138257026672363 }, { "auxiliary_loss_clip": 0.01364168, "auxiliary_loss_mlp": 0.01195513, "balance_loss_clip": 1.00904071, "balance_loss_mlp": 1.00048661, "epoch": 0.37936631996633197, "flos": 24206568796800.0, "grad_norm": 1.508761628079247, "language_loss": 0.78978062, "learning_rate": 2.8506844889216664e-06, "loss": 0.81537747, "num_input_tokens_seen": 67804400, "step": 3155, "time_per_iteration": 2.6708481311798096 }, { "auxiliary_loss_clip": 0.0131198, "auxiliary_loss_mlp": 0.01193837, "balance_loss_clip": 1.00491381, "balance_loss_mlp": 1.00005043, "epoch": 0.3794865628569711, "flos": 70297151846400.0, "grad_norm": 0.8601236587323496, "language_loss": 0.62900078, "learning_rate": 2.849979429693887e-06, "loss": 0.65405899, "num_input_tokens_seen": 67865385, "step": 3156, "time_per_iteration": 3.333956718444824 }, { "auxiliary_loss_clip": 0.01363651, "auxiliary_loss_mlp": 0.01195665, "balance_loss_clip": 1.00921583, "balance_loss_mlp": 1.00073361, "epoch": 0.3796068057476102, "flos": 15779374194240.0, "grad_norm": 1.8830244632080189, "language_loss": 0.74526727, "learning_rate": 2.8492742415249042e-06, "loss": 0.77086043, "num_input_tokens_seen": 67883030, "step": 3157, "time_per_iteration": 2.6250100135803223 }, { "auxiliary_loss_clip": 0.01363017, "auxiliary_loss_mlp": 0.01195609, "balance_loss_clip": 1.00890744, "balance_loss_mlp": 1.00067794, "epoch": 0.37972704863824924, "flos": 25191706464000.0, "grad_norm": 1.6664319844600208, "language_loss": 0.76134741, "learning_rate": 2.848568924521694e-06, "loss": 0.78693366, "num_input_tokens_seen": 67903810, "step": 3158, "time_per_iteration": 2.684539794921875 }, { "auxiliary_loss_clip": 0.01351639, "auxiliary_loss_mlp": 0.01195406, "balance_loss_clip": 1.00892651, "balance_loss_mlp": 1.00047457, "epoch": 0.37984729152888835, "flos": 26210383639200.0, "grad_norm": 1.6578237973991368, "language_loss": 0.73541337, "learning_rate": 2.8478634787912526e-06, "loss": 0.76088381, "num_input_tokens_seen": 67921865, "step": 3159, "time_per_iteration": 2.7288615703582764 }, { "auxiliary_loss_clip": 0.01352131, "auxiliary_loss_mlp": 0.01195703, "balance_loss_clip": 1.00947428, "balance_loss_mlp": 1.00058162, "epoch": 0.37996753441952746, "flos": 25629313919040.0, "grad_norm": 2.137819199962024, "language_loss": 0.7667855, "learning_rate": 2.847157904440596e-06, "loss": 0.79226387, "num_input_tokens_seen": 67941595, "step": 3160, "time_per_iteration": 2.6926023960113525 }, { "auxiliary_loss_clip": 0.01340381, "auxiliary_loss_mlp": 0.01195608, "balance_loss_clip": 1.00820744, "balance_loss_mlp": 1.0007726, "epoch": 0.3800877773101665, "flos": 20118424639680.0, "grad_norm": 1.4480746266206566, "language_loss": 0.73991954, "learning_rate": 2.846452201576759e-06, "loss": 0.76527941, "num_input_tokens_seen": 67960970, "step": 3161, "time_per_iteration": 2.7450013160705566 }, { "auxiliary_loss_clip": 0.01307778, "auxiliary_loss_mlp": 0.01193822, "balance_loss_clip": 1.00411308, "balance_loss_mlp": 1.00003552, "epoch": 0.38020802020080563, "flos": 63053637683040.0, "grad_norm": 0.8514216043222232, "language_loss": 0.62853354, "learning_rate": 2.845746370306795e-06, "loss": 0.65354955, "num_input_tokens_seen": 68026160, "step": 3162, "time_per_iteration": 3.377464532852173 }, { "auxiliary_loss_clip": 0.01345039, "auxiliary_loss_mlp": 0.01195484, "balance_loss_clip": 1.00863838, "balance_loss_mlp": 1.00064826, "epoch": 0.38032826309144474, "flos": 21288431365920.0, "grad_norm": 2.0521201387857526, "language_loss": 0.78846109, "learning_rate": 2.84504041073778e-06, "loss": 0.81386632, "num_input_tokens_seen": 68044575, "step": 3163, "time_per_iteration": 2.7543489933013916 }, { "auxiliary_loss_clip": 0.01327375, "auxiliary_loss_mlp": 0.0119566, "balance_loss_clip": 1.0088079, "balance_loss_mlp": 1.00063396, "epoch": 0.3804485059820838, "flos": 18954129778560.0, "grad_norm": 2.0638145206070884, "language_loss": 0.7929793, "learning_rate": 2.844334322976806e-06, "loss": 0.81820965, "num_input_tokens_seen": 68064790, "step": 3164, "time_per_iteration": 2.795513153076172 }, { "auxiliary_loss_clip": 0.01292269, "auxiliary_loss_mlp": 0.01196016, "balance_loss_clip": 1.00814164, "balance_loss_mlp": 1.00060821, "epoch": 0.3805687488727229, "flos": 21833770233600.0, "grad_norm": 1.731996273809902, "language_loss": 0.83228779, "learning_rate": 2.8436281071309866e-06, "loss": 0.85717058, "num_input_tokens_seen": 68083330, "step": 3165, "time_per_iteration": 2.985975503921509 }, { "auxiliary_loss_clip": 0.01281357, "auxiliary_loss_mlp": 0.0119385, "balance_loss_clip": 1.00441742, "balance_loss_mlp": 1.00006318, "epoch": 0.380688991763362, "flos": 58546243071360.0, "grad_norm": 0.7619745555218622, "language_loss": 0.52997512, "learning_rate": 2.842921763307455e-06, "loss": 0.5547272, "num_input_tokens_seen": 68146140, "step": 3166, "time_per_iteration": 3.347806453704834 }, { "auxiliary_loss_clip": 0.01339362, "auxiliary_loss_mlp": 0.01195097, "balance_loss_clip": 1.00934112, "balance_loss_mlp": 1.0004524, "epoch": 0.38080923465400107, "flos": 23799519184320.0, "grad_norm": 1.7362467574546994, "language_loss": 0.82669377, "learning_rate": 2.842215291613361e-06, "loss": 0.85203838, "num_input_tokens_seen": 68164520, "step": 3167, "time_per_iteration": 2.832261323928833 }, { "auxiliary_loss_clip": 0.01219219, "auxiliary_loss_mlp": 0.01193798, "balance_loss_clip": 1.00353742, "balance_loss_mlp": 1.00001168, "epoch": 0.3809294775446402, "flos": 54969898273920.0, "grad_norm": 0.7744644658711806, "language_loss": 0.5924502, "learning_rate": 2.8415086921558774e-06, "loss": 0.61658043, "num_input_tokens_seen": 68227945, "step": 3168, "time_per_iteration": 3.630730152130127 }, { "auxiliary_loss_clip": 0.01338869, "auxiliary_loss_mlp": 0.01195437, "balance_loss_clip": 1.00861442, "balance_loss_mlp": 1.00050581, "epoch": 0.38104972043527924, "flos": 24643709244000.0, "grad_norm": 1.5041585861443727, "language_loss": 0.78720611, "learning_rate": 2.840801965042194e-06, "loss": 0.81254911, "num_input_tokens_seen": 68247405, "step": 3169, "time_per_iteration": 3.4504778385162354 }, { "auxiliary_loss_clip": 0.0133916, "auxiliary_loss_mlp": 0.01195524, "balance_loss_clip": 1.00883234, "balance_loss_mlp": 1.00059342, "epoch": 0.38116996332591835, "flos": 22856794174080.0, "grad_norm": 2.02550117071364, "language_loss": 0.83947027, "learning_rate": 2.840095110379521e-06, "loss": 0.86481714, "num_input_tokens_seen": 68266925, "step": 3170, "time_per_iteration": 2.817445755004883 }, { "auxiliary_loss_clip": 0.01256214, "auxiliary_loss_mlp": 0.01193897, "balance_loss_clip": 1.00389385, "balance_loss_mlp": 1.00011086, "epoch": 0.38129020621655746, "flos": 60836192123040.0, "grad_norm": 0.7384168584637136, "language_loss": 0.53908098, "learning_rate": 2.8393881282750884e-06, "loss": 0.56358206, "num_input_tokens_seen": 68329755, "step": 3171, "time_per_iteration": 3.3639509677886963 }, { "auxiliary_loss_clip": 0.01316848, "auxiliary_loss_mlp": 0.01195657, "balance_loss_clip": 1.00799394, "balance_loss_mlp": 1.00063086, "epoch": 0.3814104491071965, "flos": 21648110853600.0, "grad_norm": 1.7589743519448096, "language_loss": 0.78496826, "learning_rate": 2.838681018836144e-06, "loss": 0.81009334, "num_input_tokens_seen": 68347075, "step": 3172, "time_per_iteration": 3.9363114833831787 }, { "auxiliary_loss_clip": 0.01317694, "auxiliary_loss_mlp": 0.00872794, "balance_loss_clip": 1.00785446, "balance_loss_mlp": 1.00075376, "epoch": 0.3815306919978356, "flos": 19099100838240.0, "grad_norm": 1.9631564126908942, "language_loss": 0.78017426, "learning_rate": 2.837973782169955e-06, "loss": 0.8020792, "num_input_tokens_seen": 68365450, "step": 3173, "time_per_iteration": 4.200791835784912 }, { "auxiliary_loss_clip": 0.013358, "auxiliary_loss_mlp": 0.01193811, "balance_loss_clip": 1.00413299, "balance_loss_mlp": 1.00002432, "epoch": 0.38165093488847474, "flos": 67067949172320.0, "grad_norm": 0.8043746410478944, "language_loss": 0.59187979, "learning_rate": 2.8372664183838096e-06, "loss": 0.61717588, "num_input_tokens_seen": 68428470, "step": 3174, "time_per_iteration": 3.3833327293395996 }, { "auxiliary_loss_clip": 0.01362692, "auxiliary_loss_mlp": 0.01195366, "balance_loss_clip": 1.00863338, "balance_loss_mlp": 1.00053024, "epoch": 0.3817711777791138, "flos": 22341115056960.0, "grad_norm": 2.120932850188367, "language_loss": 0.67987716, "learning_rate": 2.836558927585015e-06, "loss": 0.70545769, "num_input_tokens_seen": 68445440, "step": 3175, "time_per_iteration": 4.638728141784668 }, { "auxiliary_loss_clip": 0.01344717, "auxiliary_loss_mlp": 0.01195299, "balance_loss_clip": 1.00813627, "balance_loss_mlp": 1.00046349, "epoch": 0.3818914206697529, "flos": 22820632237440.0, "grad_norm": 1.680534637628757, "language_loss": 0.82342476, "learning_rate": 2.8358513098808957e-06, "loss": 0.84882486, "num_input_tokens_seen": 68465755, "step": 3176, "time_per_iteration": 2.8613531589508057 }, { "auxiliary_loss_clip": 0.01279756, "auxiliary_loss_mlp": 0.01195422, "balance_loss_clip": 1.00737381, "balance_loss_mlp": 1.0006814, "epoch": 0.382011663560392, "flos": 24386085227520.0, "grad_norm": 1.693972106011492, "language_loss": 0.76975584, "learning_rate": 2.835143565378798e-06, "loss": 0.79450762, "num_input_tokens_seen": 68486220, "step": 3177, "time_per_iteration": 2.9221889972686768 }, { "auxiliary_loss_clip": 0.01260734, "auxiliary_loss_mlp": 0.01195641, "balance_loss_clip": 1.00665939, "balance_loss_mlp": 1.00051904, "epoch": 0.38213190645103107, "flos": 21981579264000.0, "grad_norm": 1.8688289038802826, "language_loss": 0.7832303, "learning_rate": 2.8344356941860847e-06, "loss": 0.80779397, "num_input_tokens_seen": 68505850, "step": 3178, "time_per_iteration": 3.0345873832702637 }, { "auxiliary_loss_clip": 0.01311851, "auxiliary_loss_mlp": 0.0119536, "balance_loss_clip": 1.00827718, "balance_loss_mlp": 1.00061941, "epoch": 0.3822521493416702, "flos": 35516960298720.0, "grad_norm": 2.106380125586973, "language_loss": 0.66145545, "learning_rate": 2.8337276964101403e-06, "loss": 0.68652749, "num_input_tokens_seen": 68526290, "step": 3179, "time_per_iteration": 2.9952609539031982 }, { "auxiliary_loss_clip": 0.01344072, "auxiliary_loss_mlp": 0.01195397, "balance_loss_clip": 1.00815868, "balance_loss_mlp": 1.00056171, "epoch": 0.3823723922323093, "flos": 21069915027840.0, "grad_norm": 1.879728485887235, "language_loss": 0.76336223, "learning_rate": 2.833019572158367e-06, "loss": 0.78875697, "num_input_tokens_seen": 68544725, "step": 3180, "time_per_iteration": 2.659257650375366 }, { "auxiliary_loss_clip": 0.01318178, "auxiliary_loss_mlp": 0.0119517, "balance_loss_clip": 1.00780916, "balance_loss_mlp": 1.00043023, "epoch": 0.38249263512294834, "flos": 19789159299840.0, "grad_norm": 2.0756778078666067, "language_loss": 0.80149055, "learning_rate": 2.8323113215381872e-06, "loss": 0.82662404, "num_input_tokens_seen": 68563070, "step": 3181, "time_per_iteration": 2.8145108222961426 }, { "auxiliary_loss_clip": 0.01305681, "auxiliary_loss_mlp": 0.01195835, "balance_loss_clip": 1.00836897, "balance_loss_mlp": 1.00071359, "epoch": 0.38261287801358745, "flos": 21433941280800.0, "grad_norm": 1.796203307430424, "language_loss": 0.76322246, "learning_rate": 2.831602944657042e-06, "loss": 0.78823763, "num_input_tokens_seen": 68581150, "step": 3182, "time_per_iteration": 2.7439780235290527 }, { "auxiliary_loss_clip": 0.0133043, "auxiliary_loss_mlp": 0.011954, "balance_loss_clip": 1.00862098, "balance_loss_mlp": 1.0005641, "epoch": 0.38273312090422656, "flos": 21981579264000.0, "grad_norm": 3.9161907873762396, "language_loss": 0.74428427, "learning_rate": 2.830894441622391e-06, "loss": 0.76954257, "num_input_tokens_seen": 68597800, "step": 3183, "time_per_iteration": 2.7199738025665283 }, { "auxiliary_loss_clip": 0.0132006, "auxiliary_loss_mlp": 0.00872798, "balance_loss_clip": 1.00819707, "balance_loss_mlp": 1.00069368, "epoch": 0.3828533637948656, "flos": 24790907571840.0, "grad_norm": 1.7784039889640504, "language_loss": 0.79987144, "learning_rate": 2.8301858125417134e-06, "loss": 0.82179999, "num_input_tokens_seen": 68617640, "step": 3184, "time_per_iteration": 2.844961404800415 }, { "auxiliary_loss_clip": 0.01327575, "auxiliary_loss_mlp": 0.01195334, "balance_loss_clip": 1.00941014, "balance_loss_mlp": 1.00059342, "epoch": 0.38297360668550473, "flos": 22455456426720.0, "grad_norm": 1.6714396922796257, "language_loss": 0.73556268, "learning_rate": 2.8294770575225082e-06, "loss": 0.76079178, "num_input_tokens_seen": 68637770, "step": 3185, "time_per_iteration": 2.7945544719696045 }, { "auxiliary_loss_clip": 0.01341541, "auxiliary_loss_mlp": 0.01195677, "balance_loss_clip": 1.00894439, "balance_loss_mlp": 1.00074577, "epoch": 0.3830938495761438, "flos": 24896914647840.0, "grad_norm": 1.6939783675560955, "language_loss": 0.83985519, "learning_rate": 2.828768176672293e-06, "loss": 0.86522734, "num_input_tokens_seen": 68656885, "step": 3186, "time_per_iteration": 2.719728708267212 }, { "auxiliary_loss_clip": 0.0131746, "auxiliary_loss_mlp": 0.01195739, "balance_loss_clip": 1.00802398, "balance_loss_mlp": 1.00080764, "epoch": 0.3832140924667829, "flos": 33036250704480.0, "grad_norm": 1.5369649992588892, "language_loss": 0.71549308, "learning_rate": 2.8280591700986044e-06, "loss": 0.74062502, "num_input_tokens_seen": 68678750, "step": 3187, "time_per_iteration": 2.868778705596924 }, { "auxiliary_loss_clip": 0.01332474, "auxiliary_loss_mlp": 0.01195464, "balance_loss_clip": 1.0081861, "balance_loss_mlp": 1.00053358, "epoch": 0.383334335357422, "flos": 31903735091040.0, "grad_norm": 1.7209639849536844, "language_loss": 0.75072217, "learning_rate": 2.827350037908999e-06, "loss": 0.77600157, "num_input_tokens_seen": 68698190, "step": 3188, "time_per_iteration": 2.7693309783935547 }, { "auxiliary_loss_clip": 0.01312568, "auxiliary_loss_mlp": 0.01195665, "balance_loss_clip": 1.00836062, "balance_loss_mlp": 1.00063896, "epoch": 0.38345457824806106, "flos": 19791925423200.0, "grad_norm": 2.0400985799291247, "language_loss": 0.7921443, "learning_rate": 2.8266407802110496e-06, "loss": 0.81722665, "num_input_tokens_seen": 68716445, "step": 3189, "time_per_iteration": 2.743809938430786 }, { "auxiliary_loss_clip": 0.01260163, "auxiliary_loss_mlp": 0.01195708, "balance_loss_clip": 1.0070709, "balance_loss_mlp": 1.00077701, "epoch": 0.3835748211387002, "flos": 22419402261120.0, "grad_norm": 1.7847744768525837, "language_loss": 0.76153827, "learning_rate": 2.8259313971123515e-06, "loss": 0.78609705, "num_input_tokens_seen": 68737565, "step": 3190, "time_per_iteration": 2.966583013534546 }, { "auxiliary_loss_clip": 0.01338088, "auxiliary_loss_mlp": 0.01195351, "balance_loss_clip": 1.00844765, "balance_loss_mlp": 1.00051522, "epoch": 0.3836950640293393, "flos": 25118448575040.0, "grad_norm": 1.7870909762084273, "language_loss": 0.78404027, "learning_rate": 2.8252218887205166e-06, "loss": 0.80937463, "num_input_tokens_seen": 68758255, "step": 3191, "time_per_iteration": 3.205986499786377 }, { "auxiliary_loss_clip": 0.01267942, "auxiliary_loss_mlp": 0.01195817, "balance_loss_clip": 1.00763452, "balance_loss_mlp": 1.00079083, "epoch": 0.38381530691997834, "flos": 21799224862560.0, "grad_norm": 1.5654110335642508, "language_loss": 0.80686843, "learning_rate": 2.824512255143178e-06, "loss": 0.83150601, "num_input_tokens_seen": 68777490, "step": 3192, "time_per_iteration": 2.9060440063476562 }, { "auxiliary_loss_clip": 0.01312844, "auxiliary_loss_mlp": 0.011956, "balance_loss_clip": 1.0087086, "balance_loss_mlp": 1.00066924, "epoch": 0.38393554981061745, "flos": 21252700513440.0, "grad_norm": 3.5522831290825656, "language_loss": 0.79567456, "learning_rate": 2.8238024964879855e-06, "loss": 0.82075906, "num_input_tokens_seen": 68798385, "step": 3193, "time_per_iteration": 2.7840867042541504 }, { "auxiliary_loss_clip": 0.01364234, "auxiliary_loss_mlp": 0.01195882, "balance_loss_clip": 1.00922394, "balance_loss_mlp": 1.00076008, "epoch": 0.38405579270125656, "flos": 17019369754560.0, "grad_norm": 2.253035221059108, "language_loss": 0.76933783, "learning_rate": 2.8230926128626095e-06, "loss": 0.79493898, "num_input_tokens_seen": 68816880, "step": 3194, "time_per_iteration": 2.7155849933624268 }, { "auxiliary_loss_clip": 0.01328109, "auxiliary_loss_mlp": 0.01195705, "balance_loss_clip": 1.00823903, "balance_loss_mlp": 1.0006783, "epoch": 0.3841760355918956, "flos": 21835386799200.0, "grad_norm": 1.7960414722393467, "language_loss": 0.79264331, "learning_rate": 2.822382604374738e-06, "loss": 0.81788141, "num_input_tokens_seen": 68835805, "step": 3195, "time_per_iteration": 2.8115618228912354 }, { "auxiliary_loss_clip": 0.01319208, "auxiliary_loss_mlp": 0.01195532, "balance_loss_clip": 1.00903952, "balance_loss_mlp": 1.00069666, "epoch": 0.3842962784825347, "flos": 25915124815200.0, "grad_norm": 2.036983108785388, "language_loss": 0.66048598, "learning_rate": 2.8216724711320793e-06, "loss": 0.68563336, "num_input_tokens_seen": 68854930, "step": 3196, "time_per_iteration": 2.7983129024505615 }, { "auxiliary_loss_clip": 0.01362675, "auxiliary_loss_mlp": 0.00872609, "balance_loss_clip": 1.0086813, "balance_loss_mlp": 1.00058699, "epoch": 0.38441652137317384, "flos": 25337503768320.0, "grad_norm": 1.4752640499321397, "language_loss": 0.80073977, "learning_rate": 2.820962213242361e-06, "loss": 0.82309258, "num_input_tokens_seen": 68874260, "step": 3197, "time_per_iteration": 2.7188689708709717 }, { "auxiliary_loss_clip": 0.0133818, "auxiliary_loss_mlp": 0.01195512, "balance_loss_clip": 1.00926757, "balance_loss_mlp": 1.00058079, "epoch": 0.3845367642638129, "flos": 18113496163200.0, "grad_norm": 2.0219096384558624, "language_loss": 0.84120953, "learning_rate": 2.8202518308133264e-06, "loss": 0.86654651, "num_input_tokens_seen": 68891535, "step": 3198, "time_per_iteration": 4.939669847488403 }, { "auxiliary_loss_clip": 0.01362995, "auxiliary_loss_mlp": 0.01195566, "balance_loss_clip": 1.00870359, "balance_loss_mlp": 1.00063467, "epoch": 0.384657007154452, "flos": 25228407255840.0, "grad_norm": 1.7958955935528225, "language_loss": 0.73305976, "learning_rate": 2.8195413239527426e-06, "loss": 0.75864542, "num_input_tokens_seen": 68911275, "step": 3199, "time_per_iteration": 2.7318594455718994 }, { "auxiliary_loss_clip": 0.01351529, "auxiliary_loss_mlp": 0.01195495, "balance_loss_clip": 1.00874877, "balance_loss_mlp": 1.0005641, "epoch": 0.38477725004509106, "flos": 19865865862080.0, "grad_norm": 2.0002791886186237, "language_loss": 0.80812085, "learning_rate": 2.8188306927683906e-06, "loss": 0.8335911, "num_input_tokens_seen": 68930745, "step": 3200, "time_per_iteration": 2.73419451713562 }, { "auxiliary_loss_clip": 0.01321207, "auxiliary_loss_mlp": 0.01195596, "balance_loss_clip": 1.00818372, "balance_loss_mlp": 1.00057006, "epoch": 0.38489749293573017, "flos": 18259401238560.0, "grad_norm": 1.8781024888452198, "language_loss": 0.74292135, "learning_rate": 2.818119937368074e-06, "loss": 0.76808935, "num_input_tokens_seen": 68949380, "step": 3201, "time_per_iteration": 4.723645210266113 }, { "auxiliary_loss_clip": 0.01349614, "auxiliary_loss_mlp": 0.01195942, "balance_loss_clip": 1.0084362, "balance_loss_mlp": 1.00062919, "epoch": 0.3850177358263693, "flos": 24389174664000.0, "grad_norm": 1.7303724413622485, "language_loss": 0.65264857, "learning_rate": 2.817409057859613e-06, "loss": 0.67810416, "num_input_tokens_seen": 68968370, "step": 3202, "time_per_iteration": 2.6954541206359863 }, { "auxiliary_loss_clip": 0.01278926, "auxiliary_loss_mlp": 0.01195968, "balance_loss_clip": 1.00792575, "balance_loss_mlp": 1.00084591, "epoch": 0.38513797871700833, "flos": 17671541942880.0, "grad_norm": 2.1645873272494294, "language_loss": 0.78774107, "learning_rate": 2.8166980543508482e-06, "loss": 0.81248999, "num_input_tokens_seen": 68984260, "step": 3203, "time_per_iteration": 2.822277784347534 }, { "auxiliary_loss_clip": 0.01363887, "auxiliary_loss_mlp": 0.011955, "balance_loss_clip": 1.00957036, "balance_loss_mlp": 1.00047362, "epoch": 0.38525822160764744, "flos": 25739595912960.0, "grad_norm": 1.7608522872738535, "language_loss": 0.79368782, "learning_rate": 2.815986926949638e-06, "loss": 0.81928164, "num_input_tokens_seen": 69002760, "step": 3204, "time_per_iteration": 2.7112600803375244 }, { "auxiliary_loss_clip": 0.01340553, "auxiliary_loss_mlp": 0.01195292, "balance_loss_clip": 1.00851965, "balance_loss_mlp": 1.00055146, "epoch": 0.38537846449828655, "flos": 20193658331040.0, "grad_norm": 1.6411690193633495, "language_loss": 0.80349982, "learning_rate": 2.8152756757638597e-06, "loss": 0.82885826, "num_input_tokens_seen": 69021260, "step": 3205, "time_per_iteration": 2.755621910095215 }, { "auxiliary_loss_clip": 0.01337713, "auxiliary_loss_mlp": 0.01195391, "balance_loss_clip": 1.00884712, "balance_loss_mlp": 1.00046015, "epoch": 0.3854987073889256, "flos": 23039364117600.0, "grad_norm": 1.9561130375163218, "language_loss": 0.84471548, "learning_rate": 2.8145643009014093e-06, "loss": 0.8700465, "num_input_tokens_seen": 69039755, "step": 3206, "time_per_iteration": 2.7875640392303467 }, { "auxiliary_loss_clip": 0.01350022, "auxiliary_loss_mlp": 0.01195107, "balance_loss_clip": 1.00920689, "balance_loss_mlp": 1.00046253, "epoch": 0.3856189502795647, "flos": 20190640741920.0, "grad_norm": 1.7481446651108752, "language_loss": 0.79461229, "learning_rate": 2.813852802470202e-06, "loss": 0.82006371, "num_input_tokens_seen": 69057650, "step": 3207, "time_per_iteration": 2.7926933765411377 }, { "auxiliary_loss_clip": 0.01329697, "auxiliary_loss_mlp": 0.01196188, "balance_loss_clip": 1.00876677, "balance_loss_mlp": 1.00097036, "epoch": 0.38573919317020383, "flos": 25702643655360.0, "grad_norm": 1.7977580236614918, "language_loss": 0.72013676, "learning_rate": 2.8131411805781717e-06, "loss": 0.74539566, "num_input_tokens_seen": 69077775, "step": 3208, "time_per_iteration": 2.801809072494507 }, { "auxiliary_loss_clip": 0.01316841, "auxiliary_loss_mlp": 0.0119568, "balance_loss_clip": 1.00842559, "balance_loss_mlp": 1.00055873, "epoch": 0.3858594360608429, "flos": 29821417502400.0, "grad_norm": 2.002482933343359, "language_loss": 0.63930786, "learning_rate": 2.8124294353332707e-06, "loss": 0.66443312, "num_input_tokens_seen": 69096450, "step": 3209, "time_per_iteration": 2.7616071701049805 }, { "auxiliary_loss_clip": 0.01312986, "auxiliary_loss_mlp": 0.01195626, "balance_loss_clip": 1.00767851, "balance_loss_mlp": 1.00069499, "epoch": 0.385979678951482, "flos": 24790440564000.0, "grad_norm": 1.5652126757548577, "language_loss": 0.77122891, "learning_rate": 2.8117175668434713e-06, "loss": 0.79631501, "num_input_tokens_seen": 69116110, "step": 3210, "time_per_iteration": 2.832547426223755 }, { "auxiliary_loss_clip": 0.01363583, "auxiliary_loss_mlp": 0.01195875, "balance_loss_clip": 1.00938296, "balance_loss_mlp": 1.00065851, "epoch": 0.3860999218421211, "flos": 21287892510720.0, "grad_norm": 2.244468340762051, "language_loss": 0.69656777, "learning_rate": 2.811005575216762e-06, "loss": 0.72216237, "num_input_tokens_seen": 69134825, "step": 3211, "time_per_iteration": 2.6357810497283936 }, { "auxiliary_loss_clip": 0.01308655, "auxiliary_loss_mlp": 0.01195554, "balance_loss_clip": 1.00852013, "balance_loss_mlp": 1.00052738, "epoch": 0.38622016473276016, "flos": 24536732228640.0, "grad_norm": 1.4690561450842023, "language_loss": 0.79129046, "learning_rate": 2.8102934605611513e-06, "loss": 0.81633258, "num_input_tokens_seen": 69156460, "step": 3212, "time_per_iteration": 2.7843730449676514 }, { "auxiliary_loss_clip": 0.01329387, "auxiliary_loss_mlp": 0.0119576, "balance_loss_clip": 1.00926375, "balance_loss_mlp": 1.00073326, "epoch": 0.3863404076233993, "flos": 20558223439200.0, "grad_norm": 2.7399166659714416, "language_loss": 0.67068398, "learning_rate": 2.8095812229846665e-06, "loss": 0.69593543, "num_input_tokens_seen": 69176420, "step": 3213, "time_per_iteration": 2.761115312576294 }, { "auxiliary_loss_clip": 0.01325935, "auxiliary_loss_mlp": 0.01195633, "balance_loss_clip": 1.00827122, "balance_loss_mlp": 1.00070214, "epoch": 0.3864606505140384, "flos": 22346288066880.0, "grad_norm": 2.409596875169763, "language_loss": 0.68650091, "learning_rate": 2.808868862595355e-06, "loss": 0.71171665, "num_input_tokens_seen": 69196665, "step": 3214, "time_per_iteration": 2.758185625076294 }, { "auxiliary_loss_clip": 0.01345134, "auxiliary_loss_mlp": 0.01195822, "balance_loss_clip": 1.0085268, "balance_loss_mlp": 1.00060487, "epoch": 0.38658089340467744, "flos": 25703613594720.0, "grad_norm": 1.8965014156373823, "language_loss": 0.79803646, "learning_rate": 2.8081563795012795e-06, "loss": 0.82344598, "num_input_tokens_seen": 69216290, "step": 3215, "time_per_iteration": 2.751542568206787 }, { "auxiliary_loss_clip": 0.01336866, "auxiliary_loss_mlp": 0.01195496, "balance_loss_clip": 1.00872183, "balance_loss_mlp": 1.00046992, "epoch": 0.38670113629531655, "flos": 33802548720480.0, "grad_norm": 1.7830362478223556, "language_loss": 0.74160182, "learning_rate": 2.807443773810524e-06, "loss": 0.76692551, "num_input_tokens_seen": 69237550, "step": 3216, "time_per_iteration": 2.8395538330078125 }, { "auxiliary_loss_clip": 0.0129815, "auxiliary_loss_mlp": 0.01195693, "balance_loss_clip": 1.00785089, "balance_loss_mlp": 1.00066686, "epoch": 0.3868213791859556, "flos": 23331533505120.0, "grad_norm": 1.9734639758404624, "language_loss": 0.89352834, "learning_rate": 2.80673104563119e-06, "loss": 0.91846669, "num_input_tokens_seen": 69258175, "step": 3217, "time_per_iteration": 2.876054286956787 }, { "auxiliary_loss_clip": 0.01340541, "auxiliary_loss_mlp": 0.01195352, "balance_loss_clip": 1.00851512, "balance_loss_mlp": 1.00061214, "epoch": 0.3869416220765947, "flos": 18441540097920.0, "grad_norm": 1.8550808014006201, "language_loss": 0.78748834, "learning_rate": 2.8060181950713976e-06, "loss": 0.81284732, "num_input_tokens_seen": 69274965, "step": 3218, "time_per_iteration": 2.7336599826812744 }, { "auxiliary_loss_clip": 0.01314337, "auxiliary_loss_mlp": 0.01195809, "balance_loss_clip": 1.00841904, "balance_loss_mlp": 1.0006876, "epoch": 0.3870618649672338, "flos": 15633002111040.0, "grad_norm": 1.832068241984415, "language_loss": 0.81226146, "learning_rate": 2.805305222239286e-06, "loss": 0.83736295, "num_input_tokens_seen": 69292220, "step": 3219, "time_per_iteration": 2.795919179916382 }, { "auxiliary_loss_clip": 0.01316529, "auxiliary_loss_mlp": 0.01195174, "balance_loss_clip": 1.00805688, "balance_loss_mlp": 1.00043416, "epoch": 0.3871821078578729, "flos": 23513816059200.0, "grad_norm": 1.8829204031404585, "language_loss": 0.74150306, "learning_rate": 2.8045921272430118e-06, "loss": 0.76662004, "num_input_tokens_seen": 69311900, "step": 3220, "time_per_iteration": 2.7687299251556396 }, { "auxiliary_loss_clip": 0.01351553, "auxiliary_loss_mlp": 0.01195899, "balance_loss_clip": 1.00886941, "balance_loss_mlp": 1.00077736, "epoch": 0.387302350748512, "flos": 17778267492480.0, "grad_norm": 2.2573319355595767, "language_loss": 0.76639396, "learning_rate": 2.803878910190753e-06, "loss": 0.79186851, "num_input_tokens_seen": 69328820, "step": 3221, "time_per_iteration": 2.762376308441162 }, { "auxiliary_loss_clip": 0.01351337, "auxiliary_loss_mlp": 0.01195516, "balance_loss_clip": 1.00918317, "balance_loss_mlp": 1.00058532, "epoch": 0.3874225936391511, "flos": 11503415236320.0, "grad_norm": 2.3927257564223416, "language_loss": 0.82153702, "learning_rate": 2.8031655711907017e-06, "loss": 0.84700561, "num_input_tokens_seen": 69342525, "step": 3222, "time_per_iteration": 2.6684317588806152 }, { "auxiliary_loss_clip": 0.01341256, "auxiliary_loss_mlp": 0.01195783, "balance_loss_clip": 1.00870705, "balance_loss_mlp": 1.0007571, "epoch": 0.38754283652979016, "flos": 21945165861600.0, "grad_norm": 2.3818490740036697, "language_loss": 0.80699879, "learning_rate": 2.8024521103510723e-06, "loss": 0.83236921, "num_input_tokens_seen": 69359295, "step": 3223, "time_per_iteration": 2.8036386966705322 }, { "auxiliary_loss_clip": 0.01351217, "auxiliary_loss_mlp": 0.01195667, "balance_loss_clip": 1.00878024, "balance_loss_mlp": 1.00083184, "epoch": 0.38766307942042927, "flos": 21175993951200.0, "grad_norm": 1.7294123923023803, "language_loss": 0.75503373, "learning_rate": 2.8017385277800952e-06, "loss": 0.78050262, "num_input_tokens_seen": 69377650, "step": 3224, "time_per_iteration": 3.596092939376831 }, { "auxiliary_loss_clip": 0.01307052, "auxiliary_loss_mlp": 0.01195955, "balance_loss_clip": 1.00822854, "balance_loss_mlp": 1.00064278, "epoch": 0.3877833223110684, "flos": 27417306699360.0, "grad_norm": 1.9945479658333316, "language_loss": 0.75203454, "learning_rate": 2.8010248235860213e-06, "loss": 0.77706456, "num_input_tokens_seen": 69397765, "step": 3225, "time_per_iteration": 3.7168407440185547 }, { "auxiliary_loss_clip": 0.01311752, "auxiliary_loss_mlp": 0.00871732, "balance_loss_clip": 1.00462437, "balance_loss_mlp": 0.99986106, "epoch": 0.38790356520170743, "flos": 64500056510400.0, "grad_norm": 0.8221796812457366, "language_loss": 0.62849927, "learning_rate": 2.8003109978771192e-06, "loss": 0.65033412, "num_input_tokens_seen": 69458930, "step": 3226, "time_per_iteration": 3.348402976989746 }, { "auxiliary_loss_clip": 0.01314179, "auxiliary_loss_mlp": 0.01195222, "balance_loss_clip": 1.00841963, "balance_loss_mlp": 1.00048161, "epoch": 0.38802380809234654, "flos": 22345425898560.0, "grad_norm": 1.9281254420142255, "language_loss": 0.79171598, "learning_rate": 2.799597050761674e-06, "loss": 0.81681001, "num_input_tokens_seen": 69475135, "step": 3227, "time_per_iteration": 3.77295184135437 }, { "auxiliary_loss_clip": 0.01364127, "auxiliary_loss_mlp": 0.01195665, "balance_loss_clip": 1.00913024, "balance_loss_mlp": 1.00054312, "epoch": 0.38814405098298566, "flos": 25261371984960.0, "grad_norm": 1.7746661054925859, "language_loss": 0.78749526, "learning_rate": 2.7988829823479924e-06, "loss": 0.81309325, "num_input_tokens_seen": 69493525, "step": 3228, "time_per_iteration": 3.6833150386810303 }, { "auxiliary_loss_clip": 0.01327848, "auxiliary_loss_mlp": 0.01195717, "balance_loss_clip": 1.00846875, "balance_loss_mlp": 1.00069034, "epoch": 0.3882642938736247, "flos": 18841189432320.0, "grad_norm": 1.8712453602415426, "language_loss": 0.64045513, "learning_rate": 2.7981687927443976e-06, "loss": 0.66569078, "num_input_tokens_seen": 69510325, "step": 3229, "time_per_iteration": 2.730731964111328 }, { "auxiliary_loss_clip": 0.01352139, "auxiliary_loss_mlp": 0.01195468, "balance_loss_clip": 1.0090816, "balance_loss_mlp": 1.00063229, "epoch": 0.3883845367642638, "flos": 21652816855680.0, "grad_norm": 1.6242094438111705, "language_loss": 0.85363275, "learning_rate": 2.797454482059231e-06, "loss": 0.87910879, "num_input_tokens_seen": 69530480, "step": 3230, "time_per_iteration": 2.783424139022827 }, { "auxiliary_loss_clip": 0.01364057, "auxiliary_loss_mlp": 0.01195447, "balance_loss_clip": 1.00927234, "balance_loss_mlp": 1.00051618, "epoch": 0.3885047796549029, "flos": 20557540889280.0, "grad_norm": 1.8480063829088065, "language_loss": 0.84319353, "learning_rate": 2.7967400504008537e-06, "loss": 0.86878848, "num_input_tokens_seen": 69549780, "step": 3231, "time_per_iteration": 2.6669082641601562 }, { "auxiliary_loss_clip": 0.01268174, "auxiliary_loss_mlp": 0.01193798, "balance_loss_clip": 1.00553775, "balance_loss_mlp": 1.0000118, "epoch": 0.388625022545542, "flos": 64325520262080.0, "grad_norm": 0.7856157242897538, "language_loss": 0.57483333, "learning_rate": 2.7960254978776456e-06, "loss": 0.59945303, "num_input_tokens_seen": 69611870, "step": 3232, "time_per_iteration": 3.442861557006836 }, { "auxiliary_loss_clip": 0.01363407, "auxiliary_loss_mlp": 0.01195635, "balance_loss_clip": 1.00929141, "balance_loss_mlp": 1.00060821, "epoch": 0.3887452654361811, "flos": 18113891323680.0, "grad_norm": 2.293878398497133, "language_loss": 0.81883895, "learning_rate": 2.7953108245980006e-06, "loss": 0.84442937, "num_input_tokens_seen": 69630385, "step": 3233, "time_per_iteration": 2.796931266784668 }, { "auxiliary_loss_clip": 0.01313927, "auxiliary_loss_mlp": 0.01195467, "balance_loss_clip": 1.00814986, "balance_loss_mlp": 1.00072694, "epoch": 0.38886550832682015, "flos": 24975273699360.0, "grad_norm": 1.567024007247131, "language_loss": 0.73817545, "learning_rate": 2.7945960306703365e-06, "loss": 0.76326936, "num_input_tokens_seen": 69653370, "step": 3234, "time_per_iteration": 2.8940720558166504 }, { "auxiliary_loss_clip": 0.01351697, "auxiliary_loss_mlp": 0.011956, "balance_loss_clip": 1.00924873, "balance_loss_mlp": 1.00066853, "epoch": 0.38898575121745926, "flos": 27199508834880.0, "grad_norm": 1.572658790602268, "language_loss": 0.65724707, "learning_rate": 2.7938811162030865e-06, "loss": 0.68272001, "num_input_tokens_seen": 69673635, "step": 3235, "time_per_iteration": 2.7780346870422363 }, { "auxiliary_loss_clip": 0.01342135, "auxiliary_loss_mlp": 0.01195409, "balance_loss_clip": 1.00898027, "balance_loss_mlp": 1.0005734, "epoch": 0.3891059941080984, "flos": 28763740419840.0, "grad_norm": 1.6835725609104775, "language_loss": 0.82145011, "learning_rate": 2.793166081304702e-06, "loss": 0.8468256, "num_input_tokens_seen": 69694130, "step": 3236, "time_per_iteration": 2.7801034450531006 }, { "auxiliary_loss_clip": 0.01314068, "auxiliary_loss_mlp": 0.01195785, "balance_loss_clip": 1.00840259, "balance_loss_mlp": 1.00056779, "epoch": 0.38922623699873743, "flos": 22893459042240.0, "grad_norm": 2.928827892685978, "language_loss": 0.82190216, "learning_rate": 2.7924509260836543e-06, "loss": 0.84700066, "num_input_tokens_seen": 69713255, "step": 3237, "time_per_iteration": 2.8194072246551514 }, { "auxiliary_loss_clip": 0.013061, "auxiliary_loss_mlp": 0.01195388, "balance_loss_clip": 1.00777066, "balance_loss_mlp": 1.00055265, "epoch": 0.38934647988937654, "flos": 19792428354720.0, "grad_norm": 1.4627037711051993, "language_loss": 0.68082523, "learning_rate": 2.791735650648431e-06, "loss": 0.70584011, "num_input_tokens_seen": 69732375, "step": 3238, "time_per_iteration": 2.764292001724243 }, { "auxiliary_loss_clip": 0.01315512, "auxiliary_loss_mlp": 0.01195273, "balance_loss_clip": 1.00768197, "balance_loss_mlp": 1.00053251, "epoch": 0.38946672278001565, "flos": 19202090325120.0, "grad_norm": 1.7756153020085585, "language_loss": 0.74501681, "learning_rate": 2.791020255107538e-06, "loss": 0.77012473, "num_input_tokens_seen": 69749745, "step": 3239, "time_per_iteration": 2.69301700592041 }, { "auxiliary_loss_clip": 0.01317415, "auxiliary_loss_mlp": 0.01195651, "balance_loss_clip": 1.00870872, "balance_loss_mlp": 1.00062501, "epoch": 0.3895869656706547, "flos": 24936489334080.0, "grad_norm": 1.6317997949063183, "language_loss": 0.80739409, "learning_rate": 2.7903047395695023e-06, "loss": 0.83252478, "num_input_tokens_seen": 69769645, "step": 3240, "time_per_iteration": 2.8025524616241455 }, { "auxiliary_loss_clip": 0.01339739, "auxiliary_loss_mlp": 0.00872736, "balance_loss_clip": 1.00865674, "balance_loss_mlp": 1.00044179, "epoch": 0.3897072085612938, "flos": 24133634220960.0, "grad_norm": 1.9199264018247135, "language_loss": 0.90233123, "learning_rate": 2.789589104142865e-06, "loss": 0.924456, "num_input_tokens_seen": 69787270, "step": 3241, "time_per_iteration": 2.743889331817627 }, { "auxiliary_loss_clip": 0.01307666, "auxiliary_loss_mlp": 0.01195278, "balance_loss_clip": 1.00829995, "balance_loss_mlp": 1.00053775, "epoch": 0.3898274514519329, "flos": 17166352540320.0, "grad_norm": 2.0309361311797964, "language_loss": 0.76454115, "learning_rate": 2.7888733489361895e-06, "loss": 0.78957057, "num_input_tokens_seen": 69805685, "step": 3242, "time_per_iteration": 2.7333009243011475 }, { "auxiliary_loss_clip": 0.01335385, "auxiliary_loss_mlp": 0.01193794, "balance_loss_clip": 1.00432634, "balance_loss_mlp": 1.00000751, "epoch": 0.389947694342572, "flos": 66074836448160.0, "grad_norm": 0.7236461475033219, "language_loss": 0.58724254, "learning_rate": 2.788157474058054e-06, "loss": 0.61253428, "num_input_tokens_seen": 69867960, "step": 3243, "time_per_iteration": 3.365089178085327 }, { "auxiliary_loss_clip": 0.01361689, "auxiliary_loss_mlp": 0.01195176, "balance_loss_clip": 1.00840724, "balance_loss_mlp": 1.00053144, "epoch": 0.3900679372332111, "flos": 25740925089120.0, "grad_norm": 1.502286312618587, "language_loss": 0.69933283, "learning_rate": 2.7874414796170555e-06, "loss": 0.72490156, "num_input_tokens_seen": 69889450, "step": 3244, "time_per_iteration": 2.6811506748199463 }, { "auxiliary_loss_clip": 0.01351121, "auxiliary_loss_mlp": 0.01195641, "balance_loss_clip": 1.00931263, "balance_loss_mlp": 1.00061464, "epoch": 0.3901881801238502, "flos": 11801619802080.0, "grad_norm": 2.4994890775021306, "language_loss": 0.83804917, "learning_rate": 2.7867253657218113e-06, "loss": 0.86351675, "num_input_tokens_seen": 69903340, "step": 3245, "time_per_iteration": 2.6545674800872803 }, { "auxiliary_loss_clip": 0.01327159, "auxiliary_loss_mlp": 0.0087275, "balance_loss_clip": 1.00840425, "balance_loss_mlp": 1.00036836, "epoch": 0.39030842301448926, "flos": 27308964584160.0, "grad_norm": 1.939942787881524, "language_loss": 0.73107046, "learning_rate": 2.7860091324809544e-06, "loss": 0.75306952, "num_input_tokens_seen": 69924400, "step": 3246, "time_per_iteration": 2.827549934387207 }, { "auxiliary_loss_clip": 0.01338141, "auxiliary_loss_mlp": 0.01195408, "balance_loss_clip": 1.00880504, "balance_loss_mlp": 1.00057244, "epoch": 0.39042866590512837, "flos": 27163346898240.0, "grad_norm": 1.6495140611266819, "language_loss": 0.80886054, "learning_rate": 2.7852927800031377e-06, "loss": 0.83419597, "num_input_tokens_seen": 69944565, "step": 3247, "time_per_iteration": 2.742255449295044 }, { "auxiliary_loss_clip": 0.01328129, "auxiliary_loss_mlp": 0.01195236, "balance_loss_clip": 1.00851059, "balance_loss_mlp": 1.0004009, "epoch": 0.3905489087957674, "flos": 29716128900000.0, "grad_norm": 1.956446545454016, "language_loss": 0.82919306, "learning_rate": 2.7845763083970298e-06, "loss": 0.85442674, "num_input_tokens_seen": 69964965, "step": 3248, "time_per_iteration": 2.8732829093933105 }, { "auxiliary_loss_clip": 0.01350746, "auxiliary_loss_mlp": 0.01195334, "balance_loss_clip": 1.00853527, "balance_loss_mlp": 1.00049794, "epoch": 0.39066915168640653, "flos": 24498630413280.0, "grad_norm": 2.9240878948311, "language_loss": 0.82260323, "learning_rate": 2.7838597177713205e-06, "loss": 0.84806401, "num_input_tokens_seen": 69986055, "step": 3249, "time_per_iteration": 2.755199909210205 }, { "auxiliary_loss_clip": 0.01255864, "auxiliary_loss_mlp": 0.01195995, "balance_loss_clip": 1.00802398, "balance_loss_mlp": 1.00077832, "epoch": 0.39078939457704565, "flos": 20558582676000.0, "grad_norm": 1.6325063294386126, "language_loss": 0.73551881, "learning_rate": 2.7831430082347143e-06, "loss": 0.76003742, "num_input_tokens_seen": 70005260, "step": 3250, "time_per_iteration": 4.765944242477417 }, { "auxiliary_loss_clip": 0.01351192, "auxiliary_loss_mlp": 0.00872667, "balance_loss_clip": 1.0094769, "balance_loss_mlp": 1.00039387, "epoch": 0.3909096374676847, "flos": 22783428514080.0, "grad_norm": 1.809683600421289, "language_loss": 0.82502168, "learning_rate": 2.7824261798959373e-06, "loss": 0.8472603, "num_input_tokens_seen": 70023440, "step": 3251, "time_per_iteration": 2.7275757789611816 }, { "auxiliary_loss_clip": 0.01338496, "auxiliary_loss_mlp": 0.01195145, "balance_loss_clip": 1.00883865, "balance_loss_mlp": 1.00049996, "epoch": 0.3910298803583238, "flos": 23003130333600.0, "grad_norm": 1.8947640518014637, "language_loss": 0.79443258, "learning_rate": 2.78170923286373e-06, "loss": 0.81976897, "num_input_tokens_seen": 70043040, "step": 3252, "time_per_iteration": 2.7620973587036133 }, { "auxiliary_loss_clip": 0.01251931, "auxiliary_loss_mlp": 0.01195736, "balance_loss_clip": 1.0083425, "balance_loss_mlp": 1.00070977, "epoch": 0.3911501232489629, "flos": 24316276011840.0, "grad_norm": 2.1986148398638776, "language_loss": 0.83918506, "learning_rate": 2.780992167246854e-06, "loss": 0.86366177, "num_input_tokens_seen": 70060565, "step": 3253, "time_per_iteration": 3.850433826446533 }, { "auxiliary_loss_clip": 0.01310801, "auxiliary_loss_mlp": 0.01193827, "balance_loss_clip": 1.00408268, "balance_loss_mlp": 1.00004089, "epoch": 0.391270366139602, "flos": 60869085004800.0, "grad_norm": 0.9801487943184318, "language_loss": 0.72217906, "learning_rate": 2.7802749831540883e-06, "loss": 0.7472254, "num_input_tokens_seen": 70119465, "step": 3254, "time_per_iteration": 4.286700963973999 }, { "auxiliary_loss_clip": 0.01286328, "auxiliary_loss_mlp": 0.01195152, "balance_loss_clip": 1.00805569, "balance_loss_mlp": 1.00050747, "epoch": 0.3913906090302411, "flos": 21543504801120.0, "grad_norm": 1.8750161073729286, "language_loss": 0.81927478, "learning_rate": 2.7795576806942268e-06, "loss": 0.84408963, "num_input_tokens_seen": 70138270, "step": 3255, "time_per_iteration": 2.825676679611206 }, { "auxiliary_loss_clip": 0.01297412, "auxiliary_loss_mlp": 0.01193845, "balance_loss_clip": 1.00966263, "balance_loss_mlp": 1.00005805, "epoch": 0.3915108519208802, "flos": 49839978015360.0, "grad_norm": 0.7619953480878002, "language_loss": 0.54874027, "learning_rate": 2.778840259976085e-06, "loss": 0.57365286, "num_input_tokens_seen": 70193500, "step": 3256, "time_per_iteration": 3.24711012840271 }, { "auxiliary_loss_clip": 0.01340554, "auxiliary_loss_mlp": 0.01195416, "balance_loss_clip": 1.0084976, "balance_loss_mlp": 1.0004847, "epoch": 0.39163109481151925, "flos": 16506456760800.0, "grad_norm": 2.05524616527984, "language_loss": 0.7686528, "learning_rate": 2.778122721108495e-06, "loss": 0.79401255, "num_input_tokens_seen": 70211730, "step": 3257, "time_per_iteration": 2.733259439468384 }, { "auxiliary_loss_clip": 0.01350489, "auxiliary_loss_mlp": 0.01195174, "balance_loss_clip": 1.0092175, "balance_loss_mlp": 1.00043392, "epoch": 0.39175133770215836, "flos": 26067496152960.0, "grad_norm": 2.5567526397866485, "language_loss": 0.88207209, "learning_rate": 2.7774050642003076e-06, "loss": 0.90752876, "num_input_tokens_seen": 70232540, "step": 3258, "time_per_iteration": 2.7263143062591553 }, { "auxiliary_loss_clip": 0.0136376, "auxiliary_loss_mlp": 0.01195576, "balance_loss_clip": 1.00909948, "balance_loss_mlp": 1.00064492, "epoch": 0.3918715805927975, "flos": 21872087591040.0, "grad_norm": 1.993580030695085, "language_loss": 0.93585324, "learning_rate": 2.7766872893603896e-06, "loss": 0.96144658, "num_input_tokens_seen": 70252515, "step": 3259, "time_per_iteration": 2.757147789001465 }, { "auxiliary_loss_clip": 0.01350887, "auxiliary_loss_mlp": 0.01195192, "balance_loss_clip": 1.00948656, "balance_loss_mlp": 1.00054657, "epoch": 0.39199182348343653, "flos": 20376192350880.0, "grad_norm": 1.6582220831307566, "language_loss": 0.73291087, "learning_rate": 2.7759693966976275e-06, "loss": 0.75837165, "num_input_tokens_seen": 70271020, "step": 3260, "time_per_iteration": 2.713330030441284 }, { "auxiliary_loss_clip": 0.01301875, "auxiliary_loss_mlp": 0.0119587, "balance_loss_clip": 1.00808454, "balance_loss_mlp": 1.00065243, "epoch": 0.39211206637407564, "flos": 21683554316640.0, "grad_norm": 1.6741549365702781, "language_loss": 0.85289162, "learning_rate": 2.7752513863209242e-06, "loss": 0.87786901, "num_input_tokens_seen": 70289600, "step": 3261, "time_per_iteration": 2.787670612335205 }, { "auxiliary_loss_clip": 0.01315825, "auxiliary_loss_mlp": 0.00872651, "balance_loss_clip": 1.00846601, "balance_loss_mlp": 1.00041735, "epoch": 0.39223230926471475, "flos": 21066286736160.0, "grad_norm": 1.7164571122004961, "language_loss": 0.84490478, "learning_rate": 2.774533258339203e-06, "loss": 0.86678946, "num_input_tokens_seen": 70307060, "step": 3262, "time_per_iteration": 2.827305555343628 }, { "auxiliary_loss_clip": 0.01298457, "auxiliary_loss_mlp": 0.01195786, "balance_loss_clip": 1.00840402, "balance_loss_mlp": 1.00075996, "epoch": 0.3923525521553538, "flos": 17603025979680.0, "grad_norm": 1.9363654145879152, "language_loss": 0.79954088, "learning_rate": 2.7738150128614014e-06, "loss": 0.82448334, "num_input_tokens_seen": 70324465, "step": 3263, "time_per_iteration": 2.799166440963745 }, { "auxiliary_loss_clip": 0.01302368, "auxiliary_loss_mlp": 0.0119518, "balance_loss_clip": 1.00818741, "balance_loss_mlp": 1.00053549, "epoch": 0.3924727950459929, "flos": 20558295286560.0, "grad_norm": 2.334361492031511, "language_loss": 0.89662516, "learning_rate": 2.7730966499964777e-06, "loss": 0.92160064, "num_input_tokens_seen": 70341415, "step": 3264, "time_per_iteration": 2.76302433013916 }, { "auxiliary_loss_clip": 0.01363367, "auxiliary_loss_mlp": 0.01195488, "balance_loss_clip": 1.00874197, "balance_loss_mlp": 1.00055659, "epoch": 0.39259303793663197, "flos": 16216119480960.0, "grad_norm": 2.2883980795264627, "language_loss": 0.80818033, "learning_rate": 2.772378169853408e-06, "loss": 0.83376884, "num_input_tokens_seen": 70358985, "step": 3265, "time_per_iteration": 2.6202006340026855 }, { "auxiliary_loss_clip": 0.01298882, "auxiliary_loss_mlp": 0.01195169, "balance_loss_clip": 1.00748754, "balance_loss_mlp": 1.00042868, "epoch": 0.3927132808272711, "flos": 16797009582720.0, "grad_norm": 1.6403471230231073, "language_loss": 0.74046373, "learning_rate": 2.771659572541183e-06, "loss": 0.76540422, "num_input_tokens_seen": 70376915, "step": 3266, "time_per_iteration": 2.810610771179199 }, { "auxiliary_loss_clip": 0.01345085, "auxiliary_loss_mlp": 0.01195627, "balance_loss_clip": 1.00923777, "balance_loss_mlp": 1.00069559, "epoch": 0.3928335237179102, "flos": 20267239533120.0, "grad_norm": 2.063599295188386, "language_loss": 0.87061, "learning_rate": 2.7709408581688143e-06, "loss": 0.89601713, "num_input_tokens_seen": 70396900, "step": 3267, "time_per_iteration": 2.703446626663208 }, { "auxiliary_loss_clip": 0.01307534, "auxiliary_loss_mlp": 0.01195231, "balance_loss_clip": 1.00862336, "balance_loss_mlp": 1.0004909, "epoch": 0.39295376660854925, "flos": 24973261973280.0, "grad_norm": 1.555269974996165, "language_loss": 0.87946117, "learning_rate": 2.7702220268453307e-06, "loss": 0.9044888, "num_input_tokens_seen": 70417260, "step": 3268, "time_per_iteration": 2.792598247528076 }, { "auxiliary_loss_clip": 0.01321963, "auxiliary_loss_mlp": 0.01195583, "balance_loss_clip": 1.007761, "balance_loss_mlp": 1.00065231, "epoch": 0.39307400949918836, "flos": 18697799014560.0, "grad_norm": 2.158845158754854, "language_loss": 0.84531832, "learning_rate": 2.7695030786797785e-06, "loss": 0.87049377, "num_input_tokens_seen": 70433155, "step": 3269, "time_per_iteration": 2.6475162506103516 }, { "auxiliary_loss_clip": 0.0129086, "auxiliary_loss_mlp": 0.01195112, "balance_loss_clip": 1.00781965, "balance_loss_mlp": 1.0004667, "epoch": 0.39319425238982747, "flos": 22415486580000.0, "grad_norm": 2.3758506399276174, "language_loss": 0.74586618, "learning_rate": 2.7687840137812206e-06, "loss": 0.77072585, "num_input_tokens_seen": 70451240, "step": 3270, "time_per_iteration": 2.855039119720459 }, { "auxiliary_loss_clip": 0.01312871, "auxiliary_loss_mlp": 0.01193804, "balance_loss_clip": 1.00410652, "balance_loss_mlp": 1.00001752, "epoch": 0.3933144952804665, "flos": 66192985728000.0, "grad_norm": 0.7978129570116211, "language_loss": 0.62057739, "learning_rate": 2.7680648322587395e-06, "loss": 0.64564413, "num_input_tokens_seen": 70516115, "step": 3271, "time_per_iteration": 3.3023922443389893 }, { "auxiliary_loss_clip": 0.01361682, "auxiliary_loss_mlp": 0.01195278, "balance_loss_clip": 1.00874257, "balance_loss_mlp": 1.00053763, "epoch": 0.39343473817110564, "flos": 15487168883040.0, "grad_norm": 1.7579290016468685, "language_loss": 0.80449182, "learning_rate": 2.7673455342214334e-06, "loss": 0.83006144, "num_input_tokens_seen": 70533105, "step": 3272, "time_per_iteration": 2.6397624015808105 }, { "auxiliary_loss_clip": 0.01343577, "auxiliary_loss_mlp": 0.01195346, "balance_loss_clip": 1.00839758, "balance_loss_mlp": 1.00070143, "epoch": 0.39355498106174475, "flos": 21324916615680.0, "grad_norm": 2.0070927322345162, "language_loss": 0.7614423, "learning_rate": 2.7666261197784198e-06, "loss": 0.7868315, "num_input_tokens_seen": 70551920, "step": 3273, "time_per_iteration": 2.7005064487457275 }, { "auxiliary_loss_clip": 0.01315561, "auxiliary_loss_mlp": 0.01195283, "balance_loss_clip": 1.00865626, "balance_loss_mlp": 1.00044775, "epoch": 0.3936752239523838, "flos": 13296365484480.0, "grad_norm": 2.162618964757335, "language_loss": 0.7661593, "learning_rate": 2.7659065890388336e-06, "loss": 0.79126775, "num_input_tokens_seen": 70567920, "step": 3274, "time_per_iteration": 2.7699763774871826 }, { "auxiliary_loss_clip": 0.01327122, "auxiliary_loss_mlp": 0.01195254, "balance_loss_clip": 1.0083189, "balance_loss_mlp": 1.00060868, "epoch": 0.3937954668430229, "flos": 16800170866560.0, "grad_norm": 2.368084303872064, "language_loss": 0.85065567, "learning_rate": 2.7651869421118266e-06, "loss": 0.87587941, "num_input_tokens_seen": 70584530, "step": 3275, "time_per_iteration": 3.5996253490448 }, { "auxiliary_loss_clip": 0.01342222, "auxiliary_loss_mlp": 0.01195209, "balance_loss_clip": 1.0102818, "balance_loss_mlp": 1.00046897, "epoch": 0.393915709733662, "flos": 21064239086400.0, "grad_norm": 1.8531638210428711, "language_loss": 0.83134222, "learning_rate": 2.76446717910657e-06, "loss": 0.85671651, "num_input_tokens_seen": 70605235, "step": 3276, "time_per_iteration": 2.7416279315948486 }, { "auxiliary_loss_clip": 0.01338078, "auxiliary_loss_mlp": 0.01195151, "balance_loss_clip": 1.00855863, "balance_loss_mlp": 1.00041103, "epoch": 0.3940359526243011, "flos": 17165274829920.0, "grad_norm": 2.1889798044221207, "language_loss": 0.7681396, "learning_rate": 2.763747300132249e-06, "loss": 0.79347181, "num_input_tokens_seen": 70622675, "step": 3277, "time_per_iteration": 3.672152280807495 }, { "auxiliary_loss_clip": 0.01363638, "auxiliary_loss_mlp": 0.01195178, "balance_loss_clip": 1.00954771, "balance_loss_mlp": 1.00053334, "epoch": 0.3941561955149402, "flos": 20995866817920.0, "grad_norm": 1.624232417109019, "language_loss": 0.8643235, "learning_rate": 2.7630273052980704e-06, "loss": 0.88991165, "num_input_tokens_seen": 70643265, "step": 3278, "time_per_iteration": 2.7218546867370605 }, { "auxiliary_loss_clip": 0.01325976, "auxiliary_loss_mlp": 0.01195204, "balance_loss_clip": 1.00936842, "balance_loss_mlp": 1.00055945, "epoch": 0.39427643840557924, "flos": 18843416700480.0, "grad_norm": 2.0717162775777167, "language_loss": 0.66979671, "learning_rate": 2.7623071947132554e-06, "loss": 0.69500852, "num_input_tokens_seen": 70660295, "step": 3279, "time_per_iteration": 3.6084401607513428 }, { "auxiliary_loss_clip": 0.01337326, "auxiliary_loss_mlp": 0.01195426, "balance_loss_clip": 1.00891328, "balance_loss_mlp": 1.00059032, "epoch": 0.39439668129621835, "flos": 23258670776640.0, "grad_norm": 2.4895301364665285, "language_loss": 0.78684449, "learning_rate": 2.7615869684870458e-06, "loss": 0.81217194, "num_input_tokens_seen": 70679605, "step": 3280, "time_per_iteration": 3.8277957439422607 }, { "auxiliary_loss_clip": 0.01338175, "auxiliary_loss_mlp": 0.01195271, "balance_loss_clip": 1.00867331, "balance_loss_mlp": 1.00053024, "epoch": 0.39451692418685746, "flos": 26652301935840.0, "grad_norm": 1.5739192387713603, "language_loss": 0.84568197, "learning_rate": 2.7608666267286986e-06, "loss": 0.8710165, "num_input_tokens_seen": 70699835, "step": 3281, "time_per_iteration": 2.730315923690796 }, { "auxiliary_loss_clip": 0.01270671, "auxiliary_loss_mlp": 0.01195321, "balance_loss_clip": 1.00686717, "balance_loss_mlp": 1.00048566, "epoch": 0.3946371670774965, "flos": 18258718688640.0, "grad_norm": 2.769002695279929, "language_loss": 0.86538315, "learning_rate": 2.760146169547489e-06, "loss": 0.89004308, "num_input_tokens_seen": 70716600, "step": 3282, "time_per_iteration": 2.7932968139648438 }, { "auxiliary_loss_clip": 0.01321948, "auxiliary_loss_mlp": 0.01195508, "balance_loss_clip": 1.00864792, "balance_loss_mlp": 1.00067186, "epoch": 0.39475740996813563, "flos": 24206137712640.0, "grad_norm": 1.3832207188859478, "language_loss": 0.76328617, "learning_rate": 2.75942559705271e-06, "loss": 0.78846073, "num_input_tokens_seen": 70736335, "step": 3283, "time_per_iteration": 2.749922513961792 }, { "auxiliary_loss_clip": 0.0133758, "auxiliary_loss_mlp": 0.0119557, "balance_loss_clip": 1.00862575, "balance_loss_mlp": 1.0006392, "epoch": 0.39487765285877474, "flos": 19317868642080.0, "grad_norm": 1.7298292516688785, "language_loss": 0.89045465, "learning_rate": 2.7587049093536713e-06, "loss": 0.91578615, "num_input_tokens_seen": 70752665, "step": 3284, "time_per_iteration": 2.649857759475708 }, { "auxiliary_loss_clip": 0.01342676, "auxiliary_loss_mlp": 0.01195365, "balance_loss_clip": 1.00932026, "balance_loss_mlp": 1.00052893, "epoch": 0.3949978957494138, "flos": 17311754684160.0, "grad_norm": 1.8005832763358747, "language_loss": 0.80967903, "learning_rate": 2.757984106559701e-06, "loss": 0.83505946, "num_input_tokens_seen": 70771650, "step": 3285, "time_per_iteration": 2.6856586933135986 }, { "auxiliary_loss_clip": 0.01327923, "auxiliary_loss_mlp": 0.01195303, "balance_loss_clip": 1.00835562, "balance_loss_mlp": 1.00065804, "epoch": 0.3951181386400529, "flos": 36317875533120.0, "grad_norm": 2.945356227938839, "language_loss": 0.71319306, "learning_rate": 2.7572631887801446e-06, "loss": 0.73842531, "num_input_tokens_seen": 70793275, "step": 3286, "time_per_iteration": 2.8443245887756348 }, { "auxiliary_loss_clip": 0.0134023, "auxiliary_loss_mlp": 0.01195371, "balance_loss_clip": 1.00890183, "balance_loss_mlp": 1.00053513, "epoch": 0.395238381530692, "flos": 23110358814720.0, "grad_norm": 1.766286646240038, "language_loss": 0.76532221, "learning_rate": 2.7565421561243654e-06, "loss": 0.79067826, "num_input_tokens_seen": 70811440, "step": 3287, "time_per_iteration": 2.786724328994751 }, { "auxiliary_loss_clip": 0.01314586, "auxiliary_loss_mlp": 0.01195334, "balance_loss_clip": 1.00838339, "balance_loss_mlp": 1.00059342, "epoch": 0.3953586244213311, "flos": 24347624175360.0, "grad_norm": 1.8640849301518645, "language_loss": 0.81885302, "learning_rate": 2.7558210087017413e-06, "loss": 0.84395218, "num_input_tokens_seen": 70831375, "step": 3288, "time_per_iteration": 2.825495958328247 }, { "auxiliary_loss_clip": 0.01288856, "auxiliary_loss_mlp": 0.01195257, "balance_loss_clip": 1.00774229, "balance_loss_mlp": 1.00051665, "epoch": 0.3954788673119702, "flos": 23440091162400.0, "grad_norm": 1.959623884716034, "language_loss": 0.73259705, "learning_rate": 2.7550997466216724e-06, "loss": 0.75743818, "num_input_tokens_seen": 70849170, "step": 3289, "time_per_iteration": 2.7184391021728516 }, { "auxiliary_loss_clip": 0.01317217, "auxiliary_loss_mlp": 0.01195239, "balance_loss_clip": 1.00911283, "balance_loss_mlp": 1.00059414, "epoch": 0.3955991102026093, "flos": 17494073161920.0, "grad_norm": 2.0009776160524666, "language_loss": 0.81529617, "learning_rate": 2.7543783699935714e-06, "loss": 0.84042072, "num_input_tokens_seen": 70867200, "step": 3290, "time_per_iteration": 2.7566943168640137 }, { "auxiliary_loss_clip": 0.01338535, "auxiliary_loss_mlp": 0.01195274, "balance_loss_clip": 1.00890684, "balance_loss_mlp": 1.00043869, "epoch": 0.39571935309324835, "flos": 18221335346880.0, "grad_norm": 3.087758104544903, "language_loss": 0.85746527, "learning_rate": 2.753656878926872e-06, "loss": 0.88280332, "num_input_tokens_seen": 70883080, "step": 3291, "time_per_iteration": 2.728811264038086 }, { "auxiliary_loss_clip": 0.01326698, "auxiliary_loss_mlp": 0.01194983, "balance_loss_clip": 1.00843716, "balance_loss_mlp": 1.00033855, "epoch": 0.39583959598388746, "flos": 17748823284000.0, "grad_norm": 1.6952345988329593, "language_loss": 0.73817497, "learning_rate": 2.752935273531023e-06, "loss": 0.76339179, "num_input_tokens_seen": 70901230, "step": 3292, "time_per_iteration": 2.8128929138183594 }, { "auxiliary_loss_clip": 0.01340728, "auxiliary_loss_mlp": 0.01195518, "balance_loss_clip": 1.00899947, "balance_loss_mlp": 1.00058746, "epoch": 0.39595983887452657, "flos": 19352378089440.0, "grad_norm": 1.8714866394575107, "language_loss": 0.78618324, "learning_rate": 2.752213553915492e-06, "loss": 0.81154573, "num_input_tokens_seen": 70919585, "step": 3293, "time_per_iteration": 2.6631226539611816 }, { "auxiliary_loss_clip": 0.01296336, "auxiliary_loss_mlp": 0.01193815, "balance_loss_clip": 1.00400615, "balance_loss_mlp": 1.00002789, "epoch": 0.3960800817651656, "flos": 60682311990720.0, "grad_norm": 0.8173989142324993, "language_loss": 0.66066635, "learning_rate": 2.751491720189762e-06, "loss": 0.68556786, "num_input_tokens_seen": 70977695, "step": 3294, "time_per_iteration": 3.2589290142059326 }, { "auxiliary_loss_clip": 0.01321516, "auxiliary_loss_mlp": 0.00872573, "balance_loss_clip": 1.00816035, "balance_loss_mlp": 1.00021636, "epoch": 0.39620032465580474, "flos": 16836727963680.0, "grad_norm": 2.179785242673836, "language_loss": 0.91799188, "learning_rate": 2.7507697724633364e-06, "loss": 0.9399327, "num_input_tokens_seen": 70994455, "step": 3295, "time_per_iteration": 2.718406915664673 }, { "auxiliary_loss_clip": 0.01281114, "auxiliary_loss_mlp": 0.01193966, "balance_loss_clip": 1.00927138, "balance_loss_mlp": 1.00017917, "epoch": 0.3963205675464438, "flos": 69071476625280.0, "grad_norm": 0.7804231086776127, "language_loss": 0.54726768, "learning_rate": 2.7500477108457327e-06, "loss": 0.57201844, "num_input_tokens_seen": 71046465, "step": 3296, "time_per_iteration": 3.272949457168579 }, { "auxiliary_loss_clip": 0.01338889, "auxiliary_loss_mlp": 0.01195379, "balance_loss_clip": 1.00838494, "balance_loss_mlp": 1.00054312, "epoch": 0.3964408104370829, "flos": 25667451658080.0, "grad_norm": 1.776135366106163, "language_loss": 0.80556142, "learning_rate": 2.7493255354464877e-06, "loss": 0.83090413, "num_input_tokens_seen": 71064275, "step": 3297, "time_per_iteration": 2.786924123764038 }, { "auxiliary_loss_clip": 0.01218259, "auxiliary_loss_mlp": 0.01195157, "balance_loss_clip": 1.00637436, "balance_loss_mlp": 1.00060725, "epoch": 0.396561053327722, "flos": 24277491646560.0, "grad_norm": 1.7413196871028405, "language_loss": 0.76130354, "learning_rate": 2.748603246375156e-06, "loss": 0.7854377, "num_input_tokens_seen": 71082290, "step": 3298, "time_per_iteration": 3.2719829082489014 }, { "auxiliary_loss_clip": 0.01363548, "auxiliary_loss_mlp": 0.01195391, "balance_loss_clip": 1.0094136, "balance_loss_mlp": 1.00055528, "epoch": 0.39668129621836107, "flos": 20522312968320.0, "grad_norm": 2.0893163636391563, "language_loss": 0.69101405, "learning_rate": 2.7478808437413055e-06, "loss": 0.71660352, "num_input_tokens_seen": 71101700, "step": 3299, "time_per_iteration": 2.833026647567749 }, { "auxiliary_loss_clip": 0.0127383, "auxiliary_loss_mlp": 0.01195781, "balance_loss_clip": 1.00761771, "balance_loss_mlp": 1.00075459, "epoch": 0.3968015391090002, "flos": 27052597896480.0, "grad_norm": 1.8464754879414462, "language_loss": 0.65960658, "learning_rate": 2.7471583276545263e-06, "loss": 0.68430269, "num_input_tokens_seen": 71122360, "step": 3300, "time_per_iteration": 2.8423092365264893 }, { "auxiliary_loss_clip": 0.01320446, "auxiliary_loss_mlp": 0.01195503, "balance_loss_clip": 1.0077275, "balance_loss_mlp": 1.0006671, "epoch": 0.3969217819996393, "flos": 12531827728800.0, "grad_norm": 2.2433799425885232, "language_loss": 0.70558167, "learning_rate": 2.7464356982244224e-06, "loss": 0.7307412, "num_input_tokens_seen": 71140360, "step": 3301, "time_per_iteration": 4.56987190246582 }, { "auxiliary_loss_clip": 0.01318027, "auxiliary_loss_mlp": 0.01194022, "balance_loss_clip": 1.01123881, "balance_loss_mlp": 1.00023556, "epoch": 0.39704202489027834, "flos": 66241433563200.0, "grad_norm": 0.788838712165711, "language_loss": 0.61780739, "learning_rate": 2.745712955560617e-06, "loss": 0.64292789, "num_input_tokens_seen": 71196565, "step": 3302, "time_per_iteration": 3.299917221069336 }, { "auxiliary_loss_clip": 0.01265537, "auxiliary_loss_mlp": 0.01195388, "balance_loss_clip": 1.00694013, "balance_loss_mlp": 1.00074267, "epoch": 0.39716226778091746, "flos": 16982992275840.0, "grad_norm": 2.223900952592095, "language_loss": 0.76616728, "learning_rate": 2.7449900997727496e-06, "loss": 0.79077649, "num_input_tokens_seen": 71214675, "step": 3303, "time_per_iteration": 3.791058301925659 }, { "auxiliary_loss_clip": 0.01318101, "auxiliary_loss_mlp": 0.01195277, "balance_loss_clip": 1.00882435, "balance_loss_mlp": 1.00072742, "epoch": 0.39728251067155657, "flos": 23477151191040.0, "grad_norm": 1.653912704182238, "language_loss": 0.84121162, "learning_rate": 2.744267130970476e-06, "loss": 0.86634541, "num_input_tokens_seen": 71234400, "step": 3304, "time_per_iteration": 2.7904040813446045 }, { "auxiliary_loss_clip": 0.01327642, "auxiliary_loss_mlp": 0.01195353, "balance_loss_clip": 1.00944293, "balance_loss_mlp": 1.00051725, "epoch": 0.3974027535621956, "flos": 20704451827680.0, "grad_norm": 1.8686305199929885, "language_loss": 0.76909971, "learning_rate": 2.7435440492634697e-06, "loss": 0.7943297, "num_input_tokens_seen": 71253725, "step": 3305, "time_per_iteration": 3.7127482891082764 }, { "auxiliary_loss_clip": 0.01313819, "auxiliary_loss_mlp": 0.01195802, "balance_loss_clip": 1.00798011, "balance_loss_mlp": 1.00077534, "epoch": 0.39752299645283473, "flos": 21543289259040.0, "grad_norm": 1.8281763233551132, "language_loss": 0.67216134, "learning_rate": 2.7428208547614228e-06, "loss": 0.69725752, "num_input_tokens_seen": 71273220, "step": 3306, "time_per_iteration": 3.938729763031006 }, { "auxiliary_loss_clip": 0.0134062, "auxiliary_loss_mlp": 0.0119519, "balance_loss_clip": 1.00854719, "balance_loss_mlp": 1.00044942, "epoch": 0.39764323934347384, "flos": 19208305121760.0, "grad_norm": 1.9113902491012518, "language_loss": 0.76906866, "learning_rate": 2.742097547574043e-06, "loss": 0.7944268, "num_input_tokens_seen": 71291445, "step": 3307, "time_per_iteration": 2.7725729942321777 }, { "auxiliary_loss_clip": 0.01328776, "auxiliary_loss_mlp": 0.00872603, "balance_loss_clip": 1.00853252, "balance_loss_mlp": 1.00014472, "epoch": 0.3977634822341129, "flos": 20850213208320.0, "grad_norm": 2.0889247803883304, "language_loss": 0.77478242, "learning_rate": 2.7413741278110544e-06, "loss": 0.7967962, "num_input_tokens_seen": 71310135, "step": 3308, "time_per_iteration": 2.731269121170044 }, { "auxiliary_loss_clip": 0.01320911, "auxiliary_loss_mlp": 0.01195728, "balance_loss_clip": 1.00807428, "balance_loss_mlp": 1.00070119, "epoch": 0.397883725124752, "flos": 39786057833760.0, "grad_norm": 4.5933940500069195, "language_loss": 0.68921721, "learning_rate": 2.7406505955822016e-06, "loss": 0.7143836, "num_input_tokens_seen": 71331160, "step": 3309, "time_per_iteration": 2.9036600589752197 }, { "auxiliary_loss_clip": 0.01338104, "auxiliary_loss_mlp": 0.01195558, "balance_loss_clip": 1.0089016, "balance_loss_mlp": 1.00072277, "epoch": 0.39800396801539106, "flos": 17379516250080.0, "grad_norm": 2.4999477507920247, "language_loss": 0.66109347, "learning_rate": 2.7399269509972415e-06, "loss": 0.6864301, "num_input_tokens_seen": 71345315, "step": 3310, "time_per_iteration": 2.7471847534179688 }, { "auxiliary_loss_clip": 0.01340709, "auxiliary_loss_mlp": 0.01195876, "balance_loss_clip": 1.00917125, "balance_loss_mlp": 1.00065887, "epoch": 0.3981242109060302, "flos": 19202772875040.0, "grad_norm": 2.3292941325055776, "language_loss": 0.8522566, "learning_rate": 2.7392031941659514e-06, "loss": 0.87762249, "num_input_tokens_seen": 71363160, "step": 3311, "time_per_iteration": 2.76342511177063 }, { "auxiliary_loss_clip": 0.01317913, "auxiliary_loss_mlp": 0.01195439, "balance_loss_clip": 1.00880933, "balance_loss_mlp": 1.00060368, "epoch": 0.3982444537966693, "flos": 24565134650400.0, "grad_norm": 1.8813474908803767, "language_loss": 0.86027783, "learning_rate": 2.7384793251981244e-06, "loss": 0.88541138, "num_input_tokens_seen": 71382145, "step": 3312, "time_per_iteration": 2.8349575996398926 }, { "auxiliary_loss_clip": 0.01351662, "auxiliary_loss_mlp": 0.01195013, "balance_loss_clip": 1.00925279, "balance_loss_mlp": 1.00046396, "epoch": 0.39836469668730834, "flos": 26213868236160.0, "grad_norm": 1.9363166217029602, "language_loss": 0.80641222, "learning_rate": 2.737755344203571e-06, "loss": 0.83187902, "num_input_tokens_seen": 71402095, "step": 3313, "time_per_iteration": 2.7679147720336914 }, { "auxiliary_loss_clip": 0.01344507, "auxiliary_loss_mlp": 0.01195176, "balance_loss_clip": 1.00860739, "balance_loss_mlp": 1.0005312, "epoch": 0.39848493957794745, "flos": 27636146350560.0, "grad_norm": 1.9946618446570883, "language_loss": 0.80114412, "learning_rate": 2.7370312512921186e-06, "loss": 0.82654095, "num_input_tokens_seen": 71423875, "step": 3314, "time_per_iteration": 2.7384135723114014 }, { "auxiliary_loss_clip": 0.01330077, "auxiliary_loss_mlp": 0.0119531, "balance_loss_clip": 1.00901949, "balance_loss_mlp": 1.00047481, "epoch": 0.39860518246858656, "flos": 12239335028160.0, "grad_norm": 2.2796504028000664, "language_loss": 0.76903713, "learning_rate": 2.736307046573611e-06, "loss": 0.79429102, "num_input_tokens_seen": 71439745, "step": 3315, "time_per_iteration": 2.7190499305725098 }, { "auxiliary_loss_clip": 0.01362719, "auxiliary_loss_mlp": 0.0119545, "balance_loss_clip": 1.00880647, "balance_loss_mlp": 1.00061417, "epoch": 0.3987254253592256, "flos": 22379145024960.0, "grad_norm": 1.8442362930938332, "language_loss": 0.81951362, "learning_rate": 2.73558273015791e-06, "loss": 0.84509528, "num_input_tokens_seen": 71459575, "step": 3316, "time_per_iteration": 2.6601269245147705 }, { "auxiliary_loss_clip": 0.01364107, "auxiliary_loss_mlp": 0.01195386, "balance_loss_clip": 1.0096879, "balance_loss_mlp": 1.00064564, "epoch": 0.3988456682498647, "flos": 23514031601280.0, "grad_norm": 2.2902593902214563, "language_loss": 0.70393986, "learning_rate": 2.734858302154894e-06, "loss": 0.72953475, "num_input_tokens_seen": 71481075, "step": 3317, "time_per_iteration": 2.722688913345337 }, { "auxiliary_loss_clip": 0.01313649, "auxiliary_loss_mlp": 0.01195302, "balance_loss_clip": 1.00796461, "balance_loss_mlp": 1.00056219, "epoch": 0.39896591114050384, "flos": 19208772129600.0, "grad_norm": 2.1134427417397967, "language_loss": 0.76277161, "learning_rate": 2.734133762674457e-06, "loss": 0.78786111, "num_input_tokens_seen": 71500665, "step": 3318, "time_per_iteration": 2.7776124477386475 }, { "auxiliary_loss_clip": 0.01316577, "auxiliary_loss_mlp": 0.01195582, "balance_loss_clip": 1.00790143, "balance_loss_mlp": 1.00055599, "epoch": 0.3990861540311429, "flos": 28401043343040.0, "grad_norm": 5.3181221862169155, "language_loss": 0.70385373, "learning_rate": 2.7334091118265124e-06, "loss": 0.7289753, "num_input_tokens_seen": 71522560, "step": 3319, "time_per_iteration": 2.845731258392334 }, { "auxiliary_loss_clip": 0.01321255, "auxiliary_loss_mlp": 0.01193837, "balance_loss_clip": 1.00371337, "balance_loss_mlp": 1.00005043, "epoch": 0.399206396921782, "flos": 61758597539520.0, "grad_norm": 0.6759316892805848, "language_loss": 0.57835758, "learning_rate": 2.732684349720989e-06, "loss": 0.60350859, "num_input_tokens_seen": 71590520, "step": 3320, "time_per_iteration": 3.445773124694824 }, { "auxiliary_loss_clip": 0.01307142, "auxiliary_loss_mlp": 0.0119543, "balance_loss_clip": 1.00967097, "balance_loss_mlp": 1.00049889, "epoch": 0.3993266398124211, "flos": 28074580050240.0, "grad_norm": 1.8964802882084224, "language_loss": 0.75509024, "learning_rate": 2.7319594764678318e-06, "loss": 0.78011596, "num_input_tokens_seen": 71612620, "step": 3321, "time_per_iteration": 2.908642530441284 }, { "auxiliary_loss_clip": 0.01272106, "auxiliary_loss_mlp": 0.01195417, "balance_loss_clip": 1.00727701, "balance_loss_mlp": 1.00058162, "epoch": 0.39944688270306017, "flos": 23225095344960.0, "grad_norm": 1.7404335334306682, "language_loss": 0.83148342, "learning_rate": 2.7312344921770044e-06, "loss": 0.85615861, "num_input_tokens_seen": 71634320, "step": 3322, "time_per_iteration": 2.991257667541504 }, { "auxiliary_loss_clip": 0.01338063, "auxiliary_loss_mlp": 0.01195583, "balance_loss_clip": 1.00893331, "balance_loss_mlp": 1.00065148, "epoch": 0.3995671255936993, "flos": 19390443981120.0, "grad_norm": 1.8083296579361576, "language_loss": 0.7858023, "learning_rate": 2.7305093969584857e-06, "loss": 0.81113875, "num_input_tokens_seen": 71653145, "step": 3323, "time_per_iteration": 2.842848539352417 }, { "auxiliary_loss_clip": 0.01350924, "auxiliary_loss_mlp": 0.01195279, "balance_loss_clip": 1.00910485, "balance_loss_mlp": 1.00053835, "epoch": 0.3996873684843384, "flos": 23842650314880.0, "grad_norm": 2.027407608643283, "language_loss": 0.79762828, "learning_rate": 2.729784190922272e-06, "loss": 0.82309031, "num_input_tokens_seen": 71674580, "step": 3324, "time_per_iteration": 2.7583491802215576 }, { "auxiliary_loss_clip": 0.0128819, "auxiliary_loss_mlp": 0.01193834, "balance_loss_clip": 1.00341392, "balance_loss_mlp": 1.00004697, "epoch": 0.39980761137497745, "flos": 66576913699680.0, "grad_norm": 0.9360915908843326, "language_loss": 0.57179433, "learning_rate": 2.729058874178378e-06, "loss": 0.59661454, "num_input_tokens_seen": 71745260, "step": 3325, "time_per_iteration": 3.4727590084075928 }, { "auxiliary_loss_clip": 0.01323745, "auxiliary_loss_mlp": 0.01195234, "balance_loss_clip": 1.00838304, "balance_loss_mlp": 1.00049317, "epoch": 0.39992785426561656, "flos": 28549175686560.0, "grad_norm": 1.9103046747688337, "language_loss": 0.69215786, "learning_rate": 2.7283334468368315e-06, "loss": 0.71734762, "num_input_tokens_seen": 71766540, "step": 3326, "time_per_iteration": 2.825056314468384 }, { "auxiliary_loss_clip": 0.01245178, "auxiliary_loss_mlp": 0.01195347, "balance_loss_clip": 1.0078609, "balance_loss_mlp": 1.00051093, "epoch": 0.4000480971562556, "flos": 15049417733280.0, "grad_norm": 1.7872183250153952, "language_loss": 0.72735661, "learning_rate": 2.72760790900768e-06, "loss": 0.75176191, "num_input_tokens_seen": 71783125, "step": 3327, "time_per_iteration": 3.991762638092041 }, { "auxiliary_loss_clip": 0.01363466, "auxiliary_loss_mlp": 0.01195278, "balance_loss_clip": 1.00918782, "balance_loss_mlp": 1.00053799, "epoch": 0.4001683400468947, "flos": 23915620814400.0, "grad_norm": 1.7119888475346103, "language_loss": 0.78619134, "learning_rate": 2.7268822608009875e-06, "loss": 0.81177878, "num_input_tokens_seen": 71802500, "step": 3328, "time_per_iteration": 2.964738130569458 }, { "auxiliary_loss_clip": 0.01306827, "auxiliary_loss_mlp": 0.01195137, "balance_loss_clip": 1.00827742, "balance_loss_mlp": 1.00049245, "epoch": 0.40028858293753383, "flos": 24352689414240.0, "grad_norm": 2.0067384960970993, "language_loss": 0.78355968, "learning_rate": 2.726156502326834e-06, "loss": 0.80857933, "num_input_tokens_seen": 71823800, "step": 3329, "time_per_iteration": 285.61756658554077 }, { "auxiliary_loss_clip": 0.01257106, "auxiliary_loss_mlp": 0.01194049, "balance_loss_clip": 1.01088548, "balance_loss_mlp": 1.00026214, "epoch": 0.4004088258281729, "flos": 66787059820320.0, "grad_norm": 0.7328408277289724, "language_loss": 0.60204643, "learning_rate": 2.725430633695316e-06, "loss": 0.62655795, "num_input_tokens_seen": 71886880, "step": 3330, "time_per_iteration": 4.5259177684783936 }, { "auxiliary_loss_clip": 0.01334631, "auxiliary_loss_mlp": 0.01193845, "balance_loss_clip": 1.0040524, "balance_loss_mlp": 1.00005805, "epoch": 0.400529068718812, "flos": 58598413760160.0, "grad_norm": 0.8836192303347946, "language_loss": 0.57986379, "learning_rate": 2.7247046550165485e-06, "loss": 0.60514855, "num_input_tokens_seen": 71939005, "step": 3331, "time_per_iteration": 4.356717109680176 }, { "auxiliary_loss_clip": 0.01363494, "auxiliary_loss_mlp": 0.01195398, "balance_loss_clip": 1.00979531, "balance_loss_mlp": 1.00065804, "epoch": 0.4006493116094511, "flos": 25377473615040.0, "grad_norm": 1.6037202487671862, "language_loss": 0.75946164, "learning_rate": 2.7239785664006606e-06, "loss": 0.78505051, "num_input_tokens_seen": 71962545, "step": 3332, "time_per_iteration": 2.8310370445251465 }, { "auxiliary_loss_clip": 0.01322212, "auxiliary_loss_mlp": 0.01193861, "balance_loss_clip": 1.00396132, "balance_loss_mlp": 1.00007439, "epoch": 0.40076955450009016, "flos": 60280758701280.0, "grad_norm": 0.9421763586290222, "language_loss": 0.6186434, "learning_rate": 2.7232523679578002e-06, "loss": 0.64380413, "num_input_tokens_seen": 72025625, "step": 3333, "time_per_iteration": 3.332627773284912 }, { "auxiliary_loss_clip": 0.01338584, "auxiliary_loss_mlp": 0.01195434, "balance_loss_clip": 1.00887311, "balance_loss_mlp": 1.00069332, "epoch": 0.4008897973907293, "flos": 16617277609920.0, "grad_norm": 4.581388982148829, "language_loss": 0.78995669, "learning_rate": 2.7225260597981295e-06, "loss": 0.81529689, "num_input_tokens_seen": 72043330, "step": 3334, "time_per_iteration": 2.8566699028015137 }, { "auxiliary_loss_clip": 0.0129201, "auxiliary_loss_mlp": 0.00872598, "balance_loss_clip": 1.00805402, "balance_loss_mlp": 1.00018501, "epoch": 0.4010100402813684, "flos": 15377353896960.0, "grad_norm": 2.7695852865933035, "language_loss": 0.78915501, "learning_rate": 2.721799642031831e-06, "loss": 0.81080103, "num_input_tokens_seen": 72059500, "step": 3335, "time_per_iteration": 2.7411954402923584 }, { "auxiliary_loss_clip": 0.01338988, "auxiliary_loss_mlp": 0.01195151, "balance_loss_clip": 1.00930703, "balance_loss_mlp": 1.0005064, "epoch": 0.40113028317200744, "flos": 13298844218400.0, "grad_norm": 1.7639500541272533, "language_loss": 0.77703792, "learning_rate": 2.721073114769101e-06, "loss": 0.80237937, "num_input_tokens_seen": 72077175, "step": 3336, "time_per_iteration": 2.779047966003418 }, { "auxiliary_loss_clip": 0.01295548, "auxiliary_loss_mlp": 0.01195305, "balance_loss_clip": 1.00686431, "balance_loss_mlp": 1.0006603, "epoch": 0.40125052606264655, "flos": 20668038425280.0, "grad_norm": 1.771405943842084, "language_loss": 0.75099182, "learning_rate": 2.7203464781201523e-06, "loss": 0.77590036, "num_input_tokens_seen": 72096490, "step": 3337, "time_per_iteration": 2.808377265930176 }, { "auxiliary_loss_clip": 0.01364038, "auxiliary_loss_mlp": 0.01195537, "balance_loss_clip": 1.00970912, "balance_loss_mlp": 1.00070167, "epoch": 0.40137076895328566, "flos": 24607691002080.0, "grad_norm": 2.7663213995759737, "language_loss": 0.78617358, "learning_rate": 2.719619732195215e-06, "loss": 0.81176931, "num_input_tokens_seen": 72118130, "step": 3338, "time_per_iteration": 2.8062291145324707 }, { "auxiliary_loss_clip": 0.01298081, "auxiliary_loss_mlp": 0.01195421, "balance_loss_clip": 1.00719404, "balance_loss_mlp": 1.00068069, "epoch": 0.4014910118439247, "flos": 24206604720480.0, "grad_norm": 1.358423313697366, "language_loss": 0.72847539, "learning_rate": 2.7188928771045377e-06, "loss": 0.7534104, "num_input_tokens_seen": 72139450, "step": 3339, "time_per_iteration": 2.795426607131958 }, { "auxiliary_loss_clip": 0.01308155, "auxiliary_loss_mlp": 0.0119527, "balance_loss_clip": 1.00798154, "balance_loss_mlp": 1.00062478, "epoch": 0.4016112547345638, "flos": 26725092816960.0, "grad_norm": 1.73614432935486, "language_loss": 0.80289376, "learning_rate": 2.7181659129583815e-06, "loss": 0.82792807, "num_input_tokens_seen": 72159040, "step": 3340, "time_per_iteration": 2.823005199432373 }, { "auxiliary_loss_clip": 0.01339241, "auxiliary_loss_mlp": 0.01195254, "balance_loss_clip": 1.00902438, "balance_loss_mlp": 1.00060904, "epoch": 0.4017314976252029, "flos": 21288036205440.0, "grad_norm": 1.6592752839027614, "language_loss": 0.75491101, "learning_rate": 2.7174388398670276e-06, "loss": 0.78025591, "num_input_tokens_seen": 72178220, "step": 3341, "time_per_iteration": 2.7210617065429688 }, { "auxiliary_loss_clip": 0.0136322, "auxiliary_loss_mlp": 0.01195472, "balance_loss_clip": 1.00870919, "balance_loss_mlp": 1.00063586, "epoch": 0.401851740515842, "flos": 25484702096160.0, "grad_norm": 1.830750110748918, "language_loss": 0.92148858, "learning_rate": 2.716711657940773e-06, "loss": 0.94707549, "num_input_tokens_seen": 72199230, "step": 3342, "time_per_iteration": 2.7380220890045166 }, { "auxiliary_loss_clip": 0.0127487, "auxiliary_loss_mlp": 0.01193922, "balance_loss_clip": 1.00381482, "balance_loss_mlp": 1.00013494, "epoch": 0.4019719834064811, "flos": 55395363525120.0, "grad_norm": 0.8086734948224986, "language_loss": 0.564946, "learning_rate": 2.7159843672899284e-06, "loss": 0.58963394, "num_input_tokens_seen": 72263430, "step": 3343, "time_per_iteration": 3.4987876415252686 }, { "auxiliary_loss_clip": 0.01341326, "auxiliary_loss_mlp": 0.01195155, "balance_loss_clip": 1.00925732, "balance_loss_mlp": 1.00050974, "epoch": 0.40209222629712016, "flos": 18180108171360.0, "grad_norm": 2.033824227322857, "language_loss": 0.81478119, "learning_rate": 2.715256968024825e-06, "loss": 0.84014595, "num_input_tokens_seen": 72280505, "step": 3344, "time_per_iteration": 2.7215540409088135 }, { "auxiliary_loss_clip": 0.01323814, "auxiliary_loss_mlp": 0.01195547, "balance_loss_clip": 1.00909042, "balance_loss_mlp": 1.0006156, "epoch": 0.40221246918775927, "flos": 25961022069120.0, "grad_norm": 1.4917147386463776, "language_loss": 0.82259226, "learning_rate": 2.7145294602558083e-06, "loss": 0.84778595, "num_input_tokens_seen": 72301215, "step": 3345, "time_per_iteration": 2.8080875873565674 }, { "auxiliary_loss_clip": 0.01340134, "auxiliary_loss_mlp": 0.01195582, "balance_loss_clip": 1.00896811, "balance_loss_mlp": 1.00065076, "epoch": 0.4023327120783984, "flos": 33838926199200.0, "grad_norm": 1.8611920169629894, "language_loss": 0.70582426, "learning_rate": 2.713801844093241e-06, "loss": 0.73118144, "num_input_tokens_seen": 72322365, "step": 3346, "time_per_iteration": 2.786802291870117 }, { "auxiliary_loss_clip": 0.01343916, "auxiliary_loss_mlp": 0.01195354, "balance_loss_clip": 1.00875568, "balance_loss_mlp": 1.00061333, "epoch": 0.40245295496903744, "flos": 26900262482400.0, "grad_norm": 1.864803448415117, "language_loss": 0.88650358, "learning_rate": 2.7130741196475014e-06, "loss": 0.91189623, "num_input_tokens_seen": 72340495, "step": 3347, "time_per_iteration": 2.7967348098754883 }, { "auxiliary_loss_clip": 0.01317757, "auxiliary_loss_mlp": 0.01195687, "balance_loss_clip": 1.00905728, "balance_loss_mlp": 1.00066066, "epoch": 0.40257319785967655, "flos": 36902753163360.0, "grad_norm": 1.800207092505333, "language_loss": 0.79207242, "learning_rate": 2.7123462870289848e-06, "loss": 0.81720686, "num_input_tokens_seen": 72360545, "step": 3348, "time_per_iteration": 2.837895154953003 }, { "auxiliary_loss_clip": 0.01328624, "auxiliary_loss_mlp": 0.01195441, "balance_loss_clip": 1.00820947, "balance_loss_mlp": 1.00060534, "epoch": 0.40269344075031566, "flos": 24353192345760.0, "grad_norm": 1.4529857293194262, "language_loss": 0.81090409, "learning_rate": 2.711618346348102e-06, "loss": 0.8361448, "num_input_tokens_seen": 72381070, "step": 3349, "time_per_iteration": 2.7389719486236572 }, { "auxiliary_loss_clip": 0.01326204, "auxiliary_loss_mlp": 0.01195244, "balance_loss_clip": 1.00945044, "balance_loss_mlp": 1.00059938, "epoch": 0.4028136836409547, "flos": 14389665648480.0, "grad_norm": 1.470884753310814, "language_loss": 0.63262564, "learning_rate": 2.7108902977152825e-06, "loss": 0.65784007, "num_input_tokens_seen": 72398970, "step": 3350, "time_per_iteration": 2.741915464401245 }, { "auxiliary_loss_clip": 0.01351052, "auxiliary_loss_mlp": 0.01195216, "balance_loss_clip": 1.00916719, "balance_loss_mlp": 1.00066662, "epoch": 0.4029339265315938, "flos": 26136048039840.0, "grad_norm": 2.7244651424866704, "language_loss": 0.74953663, "learning_rate": 2.7101621412409704e-06, "loss": 0.77499938, "num_input_tokens_seen": 72418455, "step": 3351, "time_per_iteration": 2.7112343311309814 }, { "auxiliary_loss_clip": 0.01363042, "auxiliary_loss_mlp": 0.01195221, "balance_loss_clip": 1.00889349, "balance_loss_mlp": 1.00057578, "epoch": 0.40305416942223293, "flos": 23256335737440.0, "grad_norm": 2.0117119487214024, "language_loss": 0.85793084, "learning_rate": 2.7094338770356256e-06, "loss": 0.88351345, "num_input_tokens_seen": 72437540, "step": 3352, "time_per_iteration": 2.6866631507873535 }, { "auxiliary_loss_clip": 0.0131451, "auxiliary_loss_mlp": 0.01195158, "balance_loss_clip": 1.00820053, "balance_loss_mlp": 1.00051296, "epoch": 0.403174412312872, "flos": 27089657925120.0, "grad_norm": 1.8310473125377222, "language_loss": 0.64101183, "learning_rate": 2.708705505209726e-06, "loss": 0.66610849, "num_input_tokens_seen": 72458315, "step": 3353, "time_per_iteration": 3.765300750732422 }, { "auxiliary_loss_clip": 0.01297413, "auxiliary_loss_mlp": 0.01195045, "balance_loss_clip": 1.00790441, "balance_loss_mlp": 1.00049543, "epoch": 0.4032946552035111, "flos": 21756345197760.0, "grad_norm": 2.0297898320619714, "language_loss": 0.92001152, "learning_rate": 2.7079770258737646e-06, "loss": 0.9449361, "num_input_tokens_seen": 72476225, "step": 3354, "time_per_iteration": 3.7563416957855225 }, { "auxiliary_loss_clip": 0.01314205, "auxiliary_loss_mlp": 0.01195307, "balance_loss_clip": 1.00868857, "balance_loss_mlp": 1.0005672, "epoch": 0.4034148980941502, "flos": 17343965016000.0, "grad_norm": 2.13317556700289, "language_loss": 0.75186789, "learning_rate": 2.707248439138251e-06, "loss": 0.776963, "num_input_tokens_seen": 72492460, "step": 3355, "time_per_iteration": 2.790408134460449 }, { "auxiliary_loss_clip": 0.01314659, "auxiliary_loss_mlp": 0.0119527, "balance_loss_clip": 1.00896835, "balance_loss_mlp": 1.00062537, "epoch": 0.40353514098478926, "flos": 22017848971680.0, "grad_norm": 1.6368021989539405, "language_loss": 0.65364951, "learning_rate": 2.7065197451137114e-06, "loss": 0.67874885, "num_input_tokens_seen": 72513840, "step": 3356, "time_per_iteration": 3.708045244216919 }, { "auxiliary_loss_clip": 0.01316287, "auxiliary_loss_mlp": 0.01195031, "balance_loss_clip": 1.00821352, "balance_loss_mlp": 1.00048184, "epoch": 0.4036553838754284, "flos": 14246454849120.0, "grad_norm": 1.972657284128322, "language_loss": 0.6768384, "learning_rate": 2.7057909439106894e-06, "loss": 0.70195162, "num_input_tokens_seen": 72531695, "step": 3357, "time_per_iteration": 3.7287418842315674 }, { "auxiliary_loss_clip": 0.01351958, "auxiliary_loss_mlp": 0.00872452, "balance_loss_clip": 1.00948596, "balance_loss_mlp": 1.00010681, "epoch": 0.40377562676606743, "flos": 24790656106080.0, "grad_norm": 1.8490505241408695, "language_loss": 0.78515911, "learning_rate": 2.7050620356397417e-06, "loss": 0.80740321, "num_input_tokens_seen": 72550645, "step": 3358, "time_per_iteration": 2.7873222827911377 }, { "auxiliary_loss_clip": 0.01362392, "auxiliary_loss_mlp": 0.0119532, "balance_loss_clip": 1.00931859, "balance_loss_mlp": 1.00067544, "epoch": 0.40389586965670654, "flos": 24061238500320.0, "grad_norm": 1.6609216069158377, "language_loss": 0.72613519, "learning_rate": 2.7043330204114437e-06, "loss": 0.75171238, "num_input_tokens_seen": 72569355, "step": 3359, "time_per_iteration": 2.728172540664673 }, { "auxiliary_loss_clip": 0.01361724, "auxiliary_loss_mlp": 0.01195099, "balance_loss_clip": 1.00862789, "balance_loss_mlp": 1.00054955, "epoch": 0.40401611254734565, "flos": 16399623440160.0, "grad_norm": 1.8872322860181223, "language_loss": 0.85608852, "learning_rate": 2.7036038983363862e-06, "loss": 0.88165671, "num_input_tokens_seen": 72585960, "step": 3360, "time_per_iteration": 2.733227491378784 }, { "auxiliary_loss_clip": 0.01337069, "auxiliary_loss_mlp": 0.0119518, "balance_loss_clip": 1.00813806, "balance_loss_mlp": 1.00063014, "epoch": 0.4041363554379847, "flos": 23988232077120.0, "grad_norm": 1.9322424217076375, "language_loss": 0.8439182, "learning_rate": 2.702874669525177e-06, "loss": 0.86924064, "num_input_tokens_seen": 72604440, "step": 3361, "time_per_iteration": 2.7573609352111816 }, { "auxiliary_loss_clip": 0.01304404, "auxiliary_loss_mlp": 0.011952, "balance_loss_clip": 1.00918603, "balance_loss_mlp": 1.00055504, "epoch": 0.4042565983286238, "flos": 28401977358720.0, "grad_norm": 1.8958391278847166, "language_loss": 0.69755626, "learning_rate": 2.7021453340884394e-06, "loss": 0.7225523, "num_input_tokens_seen": 72622165, "step": 3362, "time_per_iteration": 3.050691604614258 }, { "auxiliary_loss_clip": 0.01327735, "auxiliary_loss_mlp": 0.00872485, "balance_loss_clip": 1.00852239, "balance_loss_mlp": 1.00017941, "epoch": 0.40437684121926293, "flos": 17710972934400.0, "grad_norm": 2.059607221789562, "language_loss": 0.7281146, "learning_rate": 2.7014158921368125e-06, "loss": 0.75011683, "num_input_tokens_seen": 72640490, "step": 3363, "time_per_iteration": 2.7705419063568115 }, { "auxiliary_loss_clip": 0.01363089, "auxiliary_loss_mlp": 0.01195227, "balance_loss_clip": 1.00904131, "balance_loss_mlp": 1.00058234, "epoch": 0.404497084109902, "flos": 24018969538080.0, "grad_norm": 1.7146070418834796, "language_loss": 0.85200357, "learning_rate": 2.700686343780953e-06, "loss": 0.87758672, "num_input_tokens_seen": 72660360, "step": 3364, "time_per_iteration": 2.886582851409912 }, { "auxiliary_loss_clip": 0.01326868, "auxiliary_loss_mlp": 0.01195295, "balance_loss_clip": 1.00865674, "balance_loss_mlp": 1.00074577, "epoch": 0.4046173270005411, "flos": 22929872444640.0, "grad_norm": 1.6321133364201785, "language_loss": 0.88151574, "learning_rate": 2.699956689131532e-06, "loss": 0.90673739, "num_input_tokens_seen": 72680345, "step": 3365, "time_per_iteration": 2.737391471862793 }, { "auxiliary_loss_clip": 0.01315875, "auxiliary_loss_mlp": 0.01195083, "balance_loss_clip": 1.00832033, "balance_loss_mlp": 1.00053382, "epoch": 0.4047375698911802, "flos": 20668146196320.0, "grad_norm": 2.0199741508705684, "language_loss": 0.85421664, "learning_rate": 2.699226928299238e-06, "loss": 0.87932622, "num_input_tokens_seen": 72698365, "step": 3366, "time_per_iteration": 2.7026145458221436 }, { "auxiliary_loss_clip": 0.01343006, "auxiliary_loss_mlp": 0.01195307, "balance_loss_clip": 1.00809848, "balance_loss_mlp": 1.00066185, "epoch": 0.40485781278181926, "flos": 28912878626400.0, "grad_norm": 2.099471720481423, "language_loss": 0.78964859, "learning_rate": 2.698497061394774e-06, "loss": 0.81503177, "num_input_tokens_seen": 72716850, "step": 3367, "time_per_iteration": 2.7631995677948 }, { "auxiliary_loss_clip": 0.01304721, "auxiliary_loss_mlp": 0.00872509, "balance_loss_clip": 1.00797749, "balance_loss_mlp": 1.00016284, "epoch": 0.40497805567245837, "flos": 23148388782720.0, "grad_norm": 1.441849014098644, "language_loss": 0.80434662, "learning_rate": 2.6977670885288627e-06, "loss": 0.82611889, "num_input_tokens_seen": 72738250, "step": 3368, "time_per_iteration": 2.7607219219207764 }, { "auxiliary_loss_clip": 0.01326627, "auxiliary_loss_mlp": 0.01195218, "balance_loss_clip": 1.0085876, "balance_loss_mlp": 1.0005734, "epoch": 0.4050982985630975, "flos": 16289413293600.0, "grad_norm": 1.7580751464432205, "language_loss": 0.75267816, "learning_rate": 2.6970370098122378e-06, "loss": 0.77789664, "num_input_tokens_seen": 72755235, "step": 3369, "time_per_iteration": 2.770470380783081 }, { "auxiliary_loss_clip": 0.01363096, "auxiliary_loss_mlp": 0.0119523, "balance_loss_clip": 1.00914693, "balance_loss_mlp": 1.00058472, "epoch": 0.40521854145373654, "flos": 34459498758240.0, "grad_norm": 1.4928418319804206, "language_loss": 0.86466569, "learning_rate": 2.6963068253556535e-06, "loss": 0.89024895, "num_input_tokens_seen": 72776620, "step": 3370, "time_per_iteration": 2.7421507835388184 }, { "auxiliary_loss_clip": 0.01342166, "auxiliary_loss_mlp": 0.01195734, "balance_loss_clip": 1.00903559, "balance_loss_mlp": 1.00080323, "epoch": 0.40533878434437565, "flos": 25331109353280.0, "grad_norm": 1.9794279345731518, "language_loss": 0.85708368, "learning_rate": 2.6955765352698763e-06, "loss": 0.88246262, "num_input_tokens_seen": 72796765, "step": 3371, "time_per_iteration": 2.7579751014709473 }, { "auxiliary_loss_clip": 0.01363428, "auxiliary_loss_mlp": 0.01195528, "balance_loss_clip": 1.00893521, "balance_loss_mlp": 1.00069284, "epoch": 0.40545902723501476, "flos": 15012070315200.0, "grad_norm": 1.8909930303234261, "language_loss": 0.73173064, "learning_rate": 2.6948461396656923e-06, "loss": 0.75732023, "num_input_tokens_seen": 72814175, "step": 3372, "time_per_iteration": 2.692596435546875 }, { "auxiliary_loss_clip": 0.01342183, "auxiliary_loss_mlp": 0.01195317, "balance_loss_clip": 1.00973737, "balance_loss_mlp": 1.00048113, "epoch": 0.4055792701256538, "flos": 25521115498560.0, "grad_norm": 11.358426636386127, "language_loss": 0.74072933, "learning_rate": 2.6941156386539013e-06, "loss": 0.76610428, "num_input_tokens_seen": 72834125, "step": 3373, "time_per_iteration": 2.7891998291015625 }, { "auxiliary_loss_clip": 0.01312577, "auxiliary_loss_mlp": 0.01195108, "balance_loss_clip": 1.00839984, "balance_loss_mlp": 1.00046325, "epoch": 0.4056995130162929, "flos": 19574594566560.0, "grad_norm": 1.9447960592650737, "language_loss": 0.8079986, "learning_rate": 2.6933850323453203e-06, "loss": 0.8330754, "num_input_tokens_seen": 72852570, "step": 3374, "time_per_iteration": 2.813420534133911 }, { "auxiliary_loss_clip": 0.01363549, "auxiliary_loss_mlp": 0.01195105, "balance_loss_clip": 1.00966549, "balance_loss_mlp": 1.00055552, "epoch": 0.405819755906932, "flos": 15413803223040.0, "grad_norm": 1.7419952538099008, "language_loss": 0.74776864, "learning_rate": 2.6926543208507806e-06, "loss": 0.77335519, "num_input_tokens_seen": 72871250, "step": 3375, "time_per_iteration": 2.6242990493774414 }, { "auxiliary_loss_clip": 0.01339217, "auxiliary_loss_mlp": 0.0119522, "balance_loss_clip": 1.00898898, "balance_loss_mlp": 1.00057507, "epoch": 0.4059399987975711, "flos": 21433941280800.0, "grad_norm": 2.0514277544816757, "language_loss": 0.80081332, "learning_rate": 2.6919235042811316e-06, "loss": 0.82615757, "num_input_tokens_seen": 72890035, "step": 3376, "time_per_iteration": 2.7049365043640137 }, { "auxiliary_loss_clip": 0.01305009, "auxiliary_loss_mlp": 0.01195434, "balance_loss_clip": 1.00845838, "balance_loss_mlp": 1.00059831, "epoch": 0.4060602416882102, "flos": 25556954122080.0, "grad_norm": 2.219384765692923, "language_loss": 0.76480931, "learning_rate": 2.691192582747237e-06, "loss": 0.78981376, "num_input_tokens_seen": 72909665, "step": 3377, "time_per_iteration": 2.7734785079956055 }, { "auxiliary_loss_clip": 0.01362993, "auxiliary_loss_mlp": 0.01195305, "balance_loss_clip": 1.00911605, "balance_loss_mlp": 1.00065994, "epoch": 0.40618048457884925, "flos": 23766769997280.0, "grad_norm": 1.6520052127186315, "language_loss": 0.74007094, "learning_rate": 2.6904615563599765e-06, "loss": 0.76565385, "num_input_tokens_seen": 72929465, "step": 3378, "time_per_iteration": 2.730022668838501 }, { "auxiliary_loss_clip": 0.01301946, "auxiliary_loss_mlp": 0.01195246, "balance_loss_clip": 1.00791454, "balance_loss_mlp": 1.00050592, "epoch": 0.40630072746948837, "flos": 17639690847840.0, "grad_norm": 1.5898437169304223, "language_loss": 0.83125073, "learning_rate": 2.6897304252302477e-06, "loss": 0.85622263, "num_input_tokens_seen": 72946785, "step": 3379, "time_per_iteration": 3.7578036785125732 }, { "auxiliary_loss_clip": 0.01278487, "auxiliary_loss_mlp": 0.01193877, "balance_loss_clip": 1.00272584, "balance_loss_mlp": 1.00009, "epoch": 0.4064209703601275, "flos": 60836084352000.0, "grad_norm": 0.782248058848043, "language_loss": 0.54798448, "learning_rate": 2.688999189468962e-06, "loss": 0.57270813, "num_input_tokens_seen": 73003215, "step": 3380, "time_per_iteration": 4.056403636932373 }, { "auxiliary_loss_clip": 0.01338984, "auxiliary_loss_mlp": 0.01195226, "balance_loss_clip": 1.00943041, "balance_loss_mlp": 1.00048554, "epoch": 0.40654121325076653, "flos": 24024358090080.0, "grad_norm": 2.4618838731028077, "language_loss": 0.7664212, "learning_rate": 2.6882678491870464e-06, "loss": 0.79176331, "num_input_tokens_seen": 73023650, "step": 3381, "time_per_iteration": 2.7672033309936523 }, { "auxiliary_loss_clip": 0.0134348, "auxiliary_loss_mlp": 0.0119525, "balance_loss_clip": 1.00912809, "balance_loss_mlp": 1.00060463, "epoch": 0.40666145614140564, "flos": 27344264352480.0, "grad_norm": 1.5947822896819919, "language_loss": 0.71727479, "learning_rate": 2.6875364044954453e-06, "loss": 0.74266207, "num_input_tokens_seen": 73043880, "step": 3382, "time_per_iteration": 2.779890298843384 }, { "auxiliary_loss_clip": 0.01337865, "auxiliary_loss_mlp": 0.01195157, "balance_loss_clip": 1.0089643, "balance_loss_mlp": 1.00060737, "epoch": 0.40678169903204475, "flos": 26176736360160.0, "grad_norm": 1.540489499058221, "language_loss": 0.82701969, "learning_rate": 2.6868048555051185e-06, "loss": 0.85234994, "num_input_tokens_seen": 73065410, "step": 3383, "time_per_iteration": 4.815086126327515 }, { "auxiliary_loss_clip": 0.01339594, "auxiliary_loss_mlp": 0.01195419, "balance_loss_clip": 1.0092423, "balance_loss_mlp": 1.00058365, "epoch": 0.4069019419226838, "flos": 28622433575520.0, "grad_norm": 2.2845319162078694, "language_loss": 0.85684288, "learning_rate": 2.686073202327041e-06, "loss": 0.88219303, "num_input_tokens_seen": 73084410, "step": 3384, "time_per_iteration": 2.835968017578125 }, { "auxiliary_loss_clip": 0.01339119, "auxiliary_loss_mlp": 0.01195139, "balance_loss_clip": 1.00932884, "balance_loss_mlp": 1.00058961, "epoch": 0.4070221848133229, "flos": 25229017958400.0, "grad_norm": 1.6070716779040497, "language_loss": 0.73316908, "learning_rate": 2.6853414450722043e-06, "loss": 0.75851166, "num_input_tokens_seen": 73104075, "step": 3385, "time_per_iteration": 2.8722338676452637 }, { "auxiliary_loss_clip": 0.01338739, "auxiliary_loss_mlp": 0.01195228, "balance_loss_clip": 1.00848222, "balance_loss_mlp": 1.00067854, "epoch": 0.40714242770396203, "flos": 18405234466560.0, "grad_norm": 1.5922705735600684, "language_loss": 0.85294366, "learning_rate": 2.684609583851616e-06, "loss": 0.87828326, "num_input_tokens_seen": 73122250, "step": 3386, "time_per_iteration": 2.6514639854431152 }, { "auxiliary_loss_clip": 0.01279661, "auxiliary_loss_mlp": 0.01195119, "balance_loss_clip": 1.00764668, "balance_loss_mlp": 1.00056911, "epoch": 0.4072626705946011, "flos": 30228934122720.0, "grad_norm": 1.525808076129965, "language_loss": 0.8047598, "learning_rate": 2.683877618776297e-06, "loss": 0.82950759, "num_input_tokens_seen": 73144505, "step": 3387, "time_per_iteration": 2.8490216732025146 }, { "auxiliary_loss_clip": 0.01338817, "auxiliary_loss_mlp": 0.01195527, "balance_loss_clip": 1.00913024, "balance_loss_mlp": 1.00078654, "epoch": 0.4073829134852402, "flos": 21834560554560.0, "grad_norm": 2.5362486068154193, "language_loss": 0.74158859, "learning_rate": 2.6831455499572876e-06, "loss": 0.76693207, "num_input_tokens_seen": 73162440, "step": 3388, "time_per_iteration": 2.718494415283203 }, { "auxiliary_loss_clip": 0.01363276, "auxiliary_loss_mlp": 0.01195452, "balance_loss_clip": 1.00908875, "balance_loss_mlp": 1.00071216, "epoch": 0.40750315637587925, "flos": 25260222427200.0, "grad_norm": 2.1206352836755484, "language_loss": 0.77703029, "learning_rate": 2.682413377505641e-06, "loss": 0.80261755, "num_input_tokens_seen": 73181245, "step": 3389, "time_per_iteration": 2.6993391513824463 }, { "auxiliary_loss_clip": 0.01339795, "auxiliary_loss_mlp": 0.01195388, "balance_loss_clip": 1.0078131, "balance_loss_mlp": 1.00055206, "epoch": 0.40762339926651836, "flos": 19712776050720.0, "grad_norm": 1.7743893476815165, "language_loss": 0.76694375, "learning_rate": 2.6816811015324284e-06, "loss": 0.79229558, "num_input_tokens_seen": 73199295, "step": 3390, "time_per_iteration": 2.692492961883545 }, { "auxiliary_loss_clip": 0.01333815, "auxiliary_loss_mlp": 0.01193994, "balance_loss_clip": 1.00377297, "balance_loss_mlp": 1.00020742, "epoch": 0.40774364215715747, "flos": 71449340427360.0, "grad_norm": 0.9806946517034407, "language_loss": 0.56712788, "learning_rate": 2.6809487221487343e-06, "loss": 0.59240597, "num_input_tokens_seen": 73258780, "step": 3391, "time_per_iteration": 3.215411424636841 }, { "auxiliary_loss_clip": 0.01351333, "auxiliary_loss_mlp": 0.0119547, "balance_loss_clip": 1.00916791, "balance_loss_mlp": 1.00082469, "epoch": 0.4078638850477965, "flos": 15084142722720.0, "grad_norm": 2.251681651893119, "language_loss": 0.82047319, "learning_rate": 2.6802162394656605e-06, "loss": 0.84594119, "num_input_tokens_seen": 73275490, "step": 3392, "time_per_iteration": 2.6973631381988525 }, { "auxiliary_loss_clip": 0.01327203, "auxiliary_loss_mlp": 0.01195286, "balance_loss_clip": 1.00794291, "balance_loss_mlp": 1.00073624, "epoch": 0.40798412793843564, "flos": 23842901780640.0, "grad_norm": 1.9735774827174604, "language_loss": 0.71673465, "learning_rate": 2.679483653594324e-06, "loss": 0.74195951, "num_input_tokens_seen": 73297260, "step": 3393, "time_per_iteration": 2.8061065673828125 }, { "auxiliary_loss_clip": 0.01344691, "auxiliary_loss_mlp": 0.01195479, "balance_loss_clip": 1.00867772, "balance_loss_mlp": 1.00073862, "epoch": 0.40810437082907475, "flos": 21065783804640.0, "grad_norm": 2.4291304732225236, "language_loss": 0.76363772, "learning_rate": 2.678750964645857e-06, "loss": 0.78903943, "num_input_tokens_seen": 73316340, "step": 3394, "time_per_iteration": 2.741020917892456 }, { "auxiliary_loss_clip": 0.01339021, "auxiliary_loss_mlp": 0.01195146, "balance_loss_clip": 1.0095526, "balance_loss_mlp": 1.00050068, "epoch": 0.4082246137197138, "flos": 11321384148000.0, "grad_norm": 2.6057129896007436, "language_loss": 0.83653402, "learning_rate": 2.6780181727314094e-06, "loss": 0.86187565, "num_input_tokens_seen": 73331245, "step": 3395, "time_per_iteration": 2.628645181655884 }, { "auxiliary_loss_clip": 0.01308149, "auxiliary_loss_mlp": 0.0087255, "balance_loss_clip": 1.00832844, "balance_loss_mlp": 1.00006759, "epoch": 0.4083448566103529, "flos": 19062579664800.0, "grad_norm": 1.8447900514281337, "language_loss": 0.77803373, "learning_rate": 2.6772852779621435e-06, "loss": 0.79984069, "num_input_tokens_seen": 73349105, "step": 3396, "time_per_iteration": 2.8403983116149902 }, { "auxiliary_loss_clip": 0.01337265, "auxiliary_loss_mlp": 0.00872458, "balance_loss_clip": 1.00923955, "balance_loss_mlp": 1.00004172, "epoch": 0.408465099500992, "flos": 23550265385280.0, "grad_norm": 4.502137218795696, "language_loss": 0.8651517, "learning_rate": 2.676552280449239e-06, "loss": 0.88724893, "num_input_tokens_seen": 73368990, "step": 3397, "time_per_iteration": 2.7209789752960205 }, { "auxiliary_loss_clip": 0.01350468, "auxiliary_loss_mlp": 0.01195093, "balance_loss_clip": 1.00892353, "balance_loss_mlp": 1.00054359, "epoch": 0.4085853423916311, "flos": 12750020753760.0, "grad_norm": 2.238032890865045, "language_loss": 0.75192612, "learning_rate": 2.6758191803038917e-06, "loss": 0.77738172, "num_input_tokens_seen": 73387485, "step": 3398, "time_per_iteration": 2.7271316051483154 }, { "auxiliary_loss_clip": 0.01260964, "auxiliary_loss_mlp": 0.01195148, "balance_loss_clip": 1.00753129, "balance_loss_mlp": 1.00050354, "epoch": 0.4087055852822702, "flos": 24353084574720.0, "grad_norm": 1.785931388138206, "language_loss": 0.8266986, "learning_rate": 2.6750859776373125e-06, "loss": 0.85125971, "num_input_tokens_seen": 73406940, "step": 3399, "time_per_iteration": 2.8882908821105957 }, { "auxiliary_loss_clip": 0.0125784, "auxiliary_loss_mlp": 0.01193972, "balance_loss_clip": 1.00375152, "balance_loss_mlp": 1.00018585, "epoch": 0.4088258281729093, "flos": 66387985264800.0, "grad_norm": 0.7704898865404944, "language_loss": 0.60437912, "learning_rate": 2.674352672560727e-06, "loss": 0.62889725, "num_input_tokens_seen": 73468385, "step": 3400, "time_per_iteration": 3.4828200340270996 }, { "auxiliary_loss_clip": 0.01304432, "auxiliary_loss_mlp": 0.01195297, "balance_loss_clip": 1.00856853, "balance_loss_mlp": 1.00055647, "epoch": 0.40894607106354836, "flos": 20449270621440.0, "grad_norm": 1.6262982028466566, "language_loss": 0.76666665, "learning_rate": 2.673619265185377e-06, "loss": 0.79166394, "num_input_tokens_seen": 73488225, "step": 3401, "time_per_iteration": 3.020813465118408 }, { "auxiliary_loss_clip": 0.01350297, "auxiliary_loss_mlp": 0.01195239, "balance_loss_clip": 1.00886428, "balance_loss_mlp": 1.00049841, "epoch": 0.40906631395418747, "flos": 27053639683200.0, "grad_norm": 1.8114321552894164, "language_loss": 0.77935767, "learning_rate": 2.672885755622521e-06, "loss": 0.80481303, "num_input_tokens_seen": 73510640, "step": 3402, "time_per_iteration": 2.7618982791900635 }, { "auxiliary_loss_clip": 0.01279397, "auxiliary_loss_mlp": 0.01195107, "balance_loss_clip": 1.00711751, "balance_loss_mlp": 1.00046229, "epoch": 0.4091865568448266, "flos": 25484163240960.0, "grad_norm": 2.3335323830100916, "language_loss": 0.7042585, "learning_rate": 2.67215214398343e-06, "loss": 0.72900355, "num_input_tokens_seen": 73530655, "step": 3403, "time_per_iteration": 2.8634743690490723 }, { "auxiliary_loss_clip": 0.01291201, "auxiliary_loss_mlp": 0.01195057, "balance_loss_clip": 1.00772393, "balance_loss_mlp": 1.00050759, "epoch": 0.40930679973546563, "flos": 28657877038560.0, "grad_norm": 2.5224106029834057, "language_loss": 0.78471893, "learning_rate": 2.671418430379393e-06, "loss": 0.80958152, "num_input_tokens_seen": 73549340, "step": 3404, "time_per_iteration": 2.888258695602417 }, { "auxiliary_loss_clip": 0.01363703, "auxiliary_loss_mlp": 0.01195167, "balance_loss_clip": 1.00948012, "balance_loss_mlp": 1.0005219, "epoch": 0.40942704262610474, "flos": 20886303297600.0, "grad_norm": 1.7359722697269406, "language_loss": 0.83251733, "learning_rate": 2.670684614921715e-06, "loss": 0.85810602, "num_input_tokens_seen": 73568315, "step": 3405, "time_per_iteration": 3.65950608253479 }, { "auxiliary_loss_clip": 0.01338562, "auxiliary_loss_mlp": 0.01195243, "balance_loss_clip": 1.00926542, "balance_loss_mlp": 1.00050282, "epoch": 0.4095472855167438, "flos": 21618091866240.0, "grad_norm": 2.043670005743988, "language_loss": 0.68787211, "learning_rate": 2.6699506977217128e-06, "loss": 0.71321017, "num_input_tokens_seen": 73588490, "step": 3406, "time_per_iteration": 3.6806249618530273 }, { "auxiliary_loss_clip": 0.01337773, "auxiliary_loss_mlp": 0.01195152, "balance_loss_clip": 1.00904346, "balance_loss_mlp": 1.00050688, "epoch": 0.4096675284073829, "flos": 27926124393600.0, "grad_norm": 2.1440145192952036, "language_loss": 0.70128345, "learning_rate": 2.6692166788907233e-06, "loss": 0.72661275, "num_input_tokens_seen": 73608685, "step": 3407, "time_per_iteration": 2.740119218826294 }, { "auxiliary_loss_clip": 0.01326355, "auxiliary_loss_mlp": 0.01195423, "balance_loss_clip": 1.00834751, "balance_loss_mlp": 1.00058699, "epoch": 0.409787771298022, "flos": 19206616708800.0, "grad_norm": 2.2857451581240027, "language_loss": 0.76624763, "learning_rate": 2.6684825585400957e-06, "loss": 0.7914654, "num_input_tokens_seen": 73627630, "step": 3408, "time_per_iteration": 2.875648260116577 }, { "auxiliary_loss_clip": 0.01291822, "auxiliary_loss_mlp": 0.01193922, "balance_loss_clip": 1.00382972, "balance_loss_mlp": 1.00013542, "epoch": 0.4099080141886611, "flos": 59269266262080.0, "grad_norm": 0.8198634179151455, "language_loss": 0.65133512, "learning_rate": 2.6677483367811947e-06, "loss": 0.67619258, "num_input_tokens_seen": 73687670, "step": 3409, "time_per_iteration": 5.351319074630737 }, { "auxiliary_loss_clip": 0.01345081, "auxiliary_loss_mlp": 0.01195221, "balance_loss_clip": 1.00858819, "balance_loss_mlp": 1.00048053, "epoch": 0.4100282570793002, "flos": 21906453343680.0, "grad_norm": 1.8577826082708722, "language_loss": 0.75375748, "learning_rate": 2.6670140137254028e-06, "loss": 0.7791605, "num_input_tokens_seen": 73707145, "step": 3410, "time_per_iteration": 2.9108800888061523 }, { "auxiliary_loss_clip": 0.01282138, "auxiliary_loss_mlp": 0.01195328, "balance_loss_clip": 1.00777197, "balance_loss_mlp": 1.00068283, "epoch": 0.4101484999699393, "flos": 18551606549760.0, "grad_norm": 2.208005861591549, "language_loss": 0.89498198, "learning_rate": 2.666279589484115e-06, "loss": 0.91975665, "num_input_tokens_seen": 73725045, "step": 3411, "time_per_iteration": 2.8062119483947754 }, { "auxiliary_loss_clip": 0.01306306, "auxiliary_loss_mlp": 0.01195078, "balance_loss_clip": 1.00862825, "balance_loss_mlp": 1.00062394, "epoch": 0.41026874286057835, "flos": 19094538530880.0, "grad_norm": 1.786501830236634, "language_loss": 0.81478, "learning_rate": 2.6655450641687435e-06, "loss": 0.8397938, "num_input_tokens_seen": 73742610, "step": 3412, "time_per_iteration": 2.8566832542419434 }, { "auxiliary_loss_clip": 0.01363096, "auxiliary_loss_mlp": 0.01195136, "balance_loss_clip": 1.00956881, "balance_loss_mlp": 1.00058675, "epoch": 0.41038898575121746, "flos": 31209581329920.0, "grad_norm": 1.6944260878410904, "language_loss": 0.6923368, "learning_rate": 2.664810437890715e-06, "loss": 0.71791911, "num_input_tokens_seen": 73764280, "step": 3413, "time_per_iteration": 2.814657211303711 }, { "auxiliary_loss_clip": 0.01243562, "auxiliary_loss_mlp": 0.01195129, "balance_loss_clip": 1.00671148, "balance_loss_mlp": 1.00057936, "epoch": 0.41050922864185657, "flos": 14355874674720.0, "grad_norm": 1.6945483979310272, "language_loss": 0.79333544, "learning_rate": 2.6640757107614714e-06, "loss": 0.81772232, "num_input_tokens_seen": 73782375, "step": 3414, "time_per_iteration": 2.9437649250030518 }, { "auxiliary_loss_clip": 0.01301413, "auxiliary_loss_mlp": 0.01195515, "balance_loss_clip": 1.00845075, "balance_loss_mlp": 1.00067925, "epoch": 0.4106294715324956, "flos": 30956304078720.0, "grad_norm": 1.8831489094460674, "language_loss": 0.69491339, "learning_rate": 2.6633408828924697e-06, "loss": 0.71988267, "num_input_tokens_seen": 73801240, "step": 3415, "time_per_iteration": 3.390589952468872 }, { "auxiliary_loss_clip": 0.01316842, "auxiliary_loss_mlp": 0.01195101, "balance_loss_clip": 1.00884414, "balance_loss_mlp": 1.00036085, "epoch": 0.41074971442313474, "flos": 24457331390400.0, "grad_norm": 1.5652532198882023, "language_loss": 0.70011252, "learning_rate": 2.662605954395185e-06, "loss": 0.72523201, "num_input_tokens_seen": 73821200, "step": 3416, "time_per_iteration": 2.8594019412994385 }, { "auxiliary_loss_clip": 0.01351017, "auxiliary_loss_mlp": 0.01195191, "balance_loss_clip": 1.00920105, "balance_loss_mlp": 1.00064158, "epoch": 0.41086995731377385, "flos": 21543001869600.0, "grad_norm": 1.7023594916657598, "language_loss": 0.83946478, "learning_rate": 2.6618709253811027e-06, "loss": 0.86492693, "num_input_tokens_seen": 73840655, "step": 3417, "time_per_iteration": 2.7717349529266357 }, { "auxiliary_loss_clip": 0.01361465, "auxiliary_loss_mlp": 0.01195006, "balance_loss_clip": 1.0091269, "balance_loss_mlp": 1.00045657, "epoch": 0.4109902002044129, "flos": 20702763414720.0, "grad_norm": 1.6101439404763431, "language_loss": 0.87826586, "learning_rate": 2.6611357959617277e-06, "loss": 0.90383053, "num_input_tokens_seen": 73860275, "step": 3418, "time_per_iteration": 2.6553752422332764 }, { "auxiliary_loss_clip": 0.01304636, "auxiliary_loss_mlp": 0.01195069, "balance_loss_clip": 1.00807226, "balance_loss_mlp": 1.00051928, "epoch": 0.411110443095052, "flos": 18180000400320.0, "grad_norm": 1.8646727219172208, "language_loss": 0.91084051, "learning_rate": 2.660400566248578e-06, "loss": 0.93583751, "num_input_tokens_seen": 73878400, "step": 3419, "time_per_iteration": 2.8472094535827637 }, { "auxiliary_loss_clip": 0.01301956, "auxiliary_loss_mlp": 0.01195475, "balance_loss_clip": 1.00850105, "balance_loss_mlp": 1.00063944, "epoch": 0.41123068598569107, "flos": 14575253181120.0, "grad_norm": 2.4318780793725874, "language_loss": 0.66752815, "learning_rate": 2.6596652363531876e-06, "loss": 0.69250244, "num_input_tokens_seen": 73894275, "step": 3420, "time_per_iteration": 2.730032205581665 }, { "auxiliary_loss_clip": 0.01362568, "auxiliary_loss_mlp": 0.01194953, "balance_loss_clip": 1.0093627, "balance_loss_mlp": 1.00049925, "epoch": 0.4113509288763302, "flos": 21178005677280.0, "grad_norm": 1.460219769461865, "language_loss": 0.77987903, "learning_rate": 2.6589298063871055e-06, "loss": 0.80545425, "num_input_tokens_seen": 73914450, "step": 3421, "time_per_iteration": 2.7574000358581543 }, { "auxiliary_loss_clip": 0.01361741, "auxiliary_loss_mlp": 0.01195261, "balance_loss_clip": 1.00951862, "balance_loss_mlp": 1.00052023, "epoch": 0.4114711717669693, "flos": 18442222647840.0, "grad_norm": 1.8170434837672345, "language_loss": 0.69490731, "learning_rate": 2.658194276461895e-06, "loss": 0.72047734, "num_input_tokens_seen": 73932375, "step": 3422, "time_per_iteration": 2.7825965881347656 }, { "auxiliary_loss_clip": 0.01339429, "auxiliary_loss_mlp": 0.01195282, "balance_loss_clip": 1.00892353, "balance_loss_mlp": 1.00054216, "epoch": 0.41159141465760835, "flos": 27233407579680.0, "grad_norm": 1.8648766884644652, "language_loss": 0.66690469, "learning_rate": 2.6574586466891368e-06, "loss": 0.6922518, "num_input_tokens_seen": 73952850, "step": 3423, "time_per_iteration": 2.8226563930511475 }, { "auxiliary_loss_clip": 0.01324185, "auxiliary_loss_mlp": 0.00872446, "balance_loss_clip": 1.00791049, "balance_loss_mlp": 1.00012851, "epoch": 0.41171165754824746, "flos": 20006885316960.0, "grad_norm": 4.1860729241632795, "language_loss": 0.64711618, "learning_rate": 2.6567229171804247e-06, "loss": 0.6690824, "num_input_tokens_seen": 73970735, "step": 3424, "time_per_iteration": 2.784015417098999 }, { "auxiliary_loss_clip": 0.01313982, "auxiliary_loss_mlp": 0.01195335, "balance_loss_clip": 1.00786209, "balance_loss_mlp": 1.00068963, "epoch": 0.41183190043888657, "flos": 18004327803360.0, "grad_norm": 20.863855695830058, "language_loss": 0.87537503, "learning_rate": 2.655987088047368e-06, "loss": 0.90046823, "num_input_tokens_seen": 73989080, "step": 3425, "time_per_iteration": 2.8252062797546387 }, { "auxiliary_loss_clip": 0.01333334, "auxiliary_loss_mlp": 0.01195196, "balance_loss_clip": 1.0086056, "balance_loss_mlp": 1.00055134, "epoch": 0.4119521433295256, "flos": 27163382821920.0, "grad_norm": 1.8811222147825508, "language_loss": 0.78993171, "learning_rate": 2.6552511594015912e-06, "loss": 0.81521702, "num_input_tokens_seen": 74009470, "step": 3426, "time_per_iteration": 2.8737552165985107 }, { "auxiliary_loss_clip": 0.01338887, "auxiliary_loss_mlp": 0.01195484, "balance_loss_clip": 1.00904119, "balance_loss_mlp": 1.00064802, "epoch": 0.41207238622016473, "flos": 15122028996000.0, "grad_norm": 1.9890806501728864, "language_loss": 0.84628469, "learning_rate": 2.654515131354735e-06, "loss": 0.87162834, "num_input_tokens_seen": 74027735, "step": 3427, "time_per_iteration": 2.77881121635437 }, { "auxiliary_loss_clip": 0.01302189, "auxiliary_loss_mlp": 0.01195245, "balance_loss_clip": 1.00818229, "balance_loss_mlp": 1.00060022, "epoch": 0.41219262911080384, "flos": 27052885285920.0, "grad_norm": 3.052512466049822, "language_loss": 0.85263872, "learning_rate": 2.653779004018453e-06, "loss": 0.87761307, "num_input_tokens_seen": 74048300, "step": 3428, "time_per_iteration": 2.874051809310913 }, { "auxiliary_loss_clip": 0.01329884, "auxiliary_loss_mlp": 0.01195283, "balance_loss_clip": 1.0087285, "balance_loss_mlp": 1.00063777, "epoch": 0.4123128720014429, "flos": 24686373366720.0, "grad_norm": 2.0636332882964927, "language_loss": 0.82629144, "learning_rate": 2.653042777504417e-06, "loss": 0.85154313, "num_input_tokens_seen": 74070890, "step": 3429, "time_per_iteration": 2.8107481002807617 }, { "auxiliary_loss_clip": 0.01318879, "auxiliary_loss_mlp": 0.01195556, "balance_loss_clip": 1.00834894, "balance_loss_mlp": 1.00081575, "epoch": 0.412433114892082, "flos": 26244785315520.0, "grad_norm": 1.7827008735074585, "language_loss": 0.80041301, "learning_rate": 2.6523064519243105e-06, "loss": 0.82555735, "num_input_tokens_seen": 74090460, "step": 3430, "time_per_iteration": 2.8188037872314453 }, { "auxiliary_loss_clip": 0.01338696, "auxiliary_loss_mlp": 0.01195198, "balance_loss_clip": 1.00871921, "balance_loss_mlp": 1.00055301, "epoch": 0.4125533577827211, "flos": 21361042628640.0, "grad_norm": 2.1160323210811045, "language_loss": 0.79056066, "learning_rate": 2.6515700273898333e-06, "loss": 0.81589961, "num_input_tokens_seen": 74108335, "step": 3431, "time_per_iteration": 4.0640716552734375 }, { "auxiliary_loss_clip": 0.0132683, "auxiliary_loss_mlp": 0.01195195, "balance_loss_clip": 1.00976336, "balance_loss_mlp": 1.00064576, "epoch": 0.4126736006733602, "flos": 26067568000320.0, "grad_norm": 2.4067692157917464, "language_loss": 0.68917108, "learning_rate": 2.6508335040127018e-06, "loss": 0.71439135, "num_input_tokens_seen": 74128030, "step": 3432, "time_per_iteration": 3.7483272552490234 }, { "auxiliary_loss_clip": 0.01348737, "auxiliary_loss_mlp": 0.01194996, "balance_loss_clip": 1.00867832, "balance_loss_mlp": 1.00044656, "epoch": 0.4127938435639993, "flos": 25666158405600.0, "grad_norm": 2.211641608307347, "language_loss": 0.77152169, "learning_rate": 2.6500968819046446e-06, "loss": 0.79695904, "num_input_tokens_seen": 74148330, "step": 3433, "time_per_iteration": 2.7759182453155518 }, { "auxiliary_loss_clip": 0.0131311, "auxiliary_loss_mlp": 0.0119491, "balance_loss_clip": 1.00832677, "balance_loss_mlp": 1.00036025, "epoch": 0.4129140864546384, "flos": 17995921662240.0, "grad_norm": 3.930482079444144, "language_loss": 0.58783388, "learning_rate": 2.649360161177408e-06, "loss": 0.61291409, "num_input_tokens_seen": 74163390, "step": 3434, "time_per_iteration": 2.79019832611084 }, { "auxiliary_loss_clip": 0.0134984, "auxiliary_loss_mlp": 0.01195166, "balance_loss_clip": 1.00910687, "balance_loss_mlp": 1.00052142, "epoch": 0.41303432934527745, "flos": 23732907176160.0, "grad_norm": 1.9915452737740058, "language_loss": 0.73347461, "learning_rate": 2.6486233419427504e-06, "loss": 0.75892472, "num_input_tokens_seen": 74183205, "step": 3435, "time_per_iteration": 4.7192840576171875 }, { "auxiliary_loss_clip": 0.01304604, "auxiliary_loss_mlp": 0.01195177, "balance_loss_clip": 1.00932872, "balance_loss_mlp": 1.00053155, "epoch": 0.41315457223591656, "flos": 19755296478720.0, "grad_norm": 2.199020039022311, "language_loss": 0.74901366, "learning_rate": 2.6478864243124484e-06, "loss": 0.77401149, "num_input_tokens_seen": 74202870, "step": 3436, "time_per_iteration": 2.7646782398223877 }, { "auxiliary_loss_clip": 0.01350252, "auxiliary_loss_mlp": 0.01195158, "balance_loss_clip": 1.00923347, "balance_loss_mlp": 1.00051296, "epoch": 0.4132748151265556, "flos": 20923327402560.0, "grad_norm": 1.9739654823756316, "language_loss": 0.85183954, "learning_rate": 2.6471494083982903e-06, "loss": 0.87729371, "num_input_tokens_seen": 74222255, "step": 3437, "time_per_iteration": 2.8436813354492188 }, { "auxiliary_loss_clip": 0.0131585, "auxiliary_loss_mlp": 0.01195347, "balance_loss_clip": 1.00892878, "balance_loss_mlp": 1.00060678, "epoch": 0.4133950580171947, "flos": 32232533423040.0, "grad_norm": 1.9078403897092855, "language_loss": 0.74818784, "learning_rate": 2.6464122943120818e-06, "loss": 0.77329981, "num_input_tokens_seen": 74242480, "step": 3438, "time_per_iteration": 2.850659132003784 }, { "auxiliary_loss_clip": 0.0129173, "auxiliary_loss_mlp": 0.01195221, "balance_loss_clip": 1.00766242, "balance_loss_mlp": 1.00057578, "epoch": 0.41351530090783384, "flos": 23292497674080.0, "grad_norm": 2.980838658145301, "language_loss": 0.82380348, "learning_rate": 2.645675082165642e-06, "loss": 0.84867299, "num_input_tokens_seen": 74258690, "step": 3439, "time_per_iteration": 2.7811331748962402 }, { "auxiliary_loss_clip": 0.01316073, "auxiliary_loss_mlp": 0.01195327, "balance_loss_clip": 1.00819433, "balance_loss_mlp": 1.00058711, "epoch": 0.4136355437984729, "flos": 25593583066560.0, "grad_norm": 2.0804822038283732, "language_loss": 0.75115609, "learning_rate": 2.644937772070806e-06, "loss": 0.77627009, "num_input_tokens_seen": 74277135, "step": 3440, "time_per_iteration": 2.7288782596588135 }, { "auxiliary_loss_clip": 0.01363291, "auxiliary_loss_mlp": 0.01195157, "balance_loss_clip": 1.00978327, "balance_loss_mlp": 1.00051177, "epoch": 0.413755786689112, "flos": 19828626215040.0, "grad_norm": 2.1239628427916215, "language_loss": 0.83563662, "learning_rate": 2.6442003641394225e-06, "loss": 0.86122108, "num_input_tokens_seen": 74294730, "step": 3441, "time_per_iteration": 2.7318978309631348 }, { "auxiliary_loss_clip": 0.0133882, "auxiliary_loss_mlp": 0.01195035, "balance_loss_clip": 1.00899601, "balance_loss_mlp": 1.00048554, "epoch": 0.4138760295797511, "flos": 26870459037120.0, "grad_norm": 1.6695202424713715, "language_loss": 0.84001178, "learning_rate": 2.643462858483356e-06, "loss": 0.86535037, "num_input_tokens_seen": 74315015, "step": 3442, "time_per_iteration": 2.805926561355591 }, { "auxiliary_loss_clip": 0.01291432, "auxiliary_loss_mlp": 0.01195136, "balance_loss_clip": 1.00849104, "balance_loss_mlp": 1.00049162, "epoch": 0.41399627247039017, "flos": 16399264203360.0, "grad_norm": 1.7988022303537234, "language_loss": 0.7281571, "learning_rate": 2.6427252552144856e-06, "loss": 0.75302279, "num_input_tokens_seen": 74333665, "step": 3443, "time_per_iteration": 2.8220887184143066 }, { "auxiliary_loss_clip": 0.01362, "auxiliary_loss_mlp": 0.01195128, "balance_loss_clip": 1.00915158, "balance_loss_mlp": 1.0004828, "epoch": 0.4141165153610293, "flos": 22930231681440.0, "grad_norm": 2.2628686335458985, "language_loss": 0.74960017, "learning_rate": 2.6419875544447044e-06, "loss": 0.7751714, "num_input_tokens_seen": 74355065, "step": 3444, "time_per_iteration": 2.735597848892212 }, { "auxiliary_loss_clip": 0.01363125, "auxiliary_loss_mlp": 0.01195018, "balance_loss_clip": 1.00919485, "balance_loss_mlp": 1.00046802, "epoch": 0.4142367582516684, "flos": 25192568632320.0, "grad_norm": 1.5116137757382584, "language_loss": 0.71150887, "learning_rate": 2.6412497562859218e-06, "loss": 0.73709029, "num_input_tokens_seen": 74376345, "step": 3445, "time_per_iteration": 2.724642276763916 }, { "auxiliary_loss_clip": 0.01351195, "auxiliary_loss_mlp": 0.01195262, "balance_loss_clip": 1.00927126, "balance_loss_mlp": 1.0005219, "epoch": 0.41435700114230745, "flos": 21690487586880.0, "grad_norm": 2.487616112871183, "language_loss": 0.76231295, "learning_rate": 2.6405118608500617e-06, "loss": 0.78777754, "num_input_tokens_seen": 74395170, "step": 3446, "time_per_iteration": 2.8346848487854004 }, { "auxiliary_loss_clip": 0.01289213, "auxiliary_loss_mlp": 0.0119518, "balance_loss_clip": 1.00766897, "balance_loss_mlp": 1.00063062, "epoch": 0.41447724403294656, "flos": 25995172279680.0, "grad_norm": 1.7149089949775407, "language_loss": 0.81487119, "learning_rate": 2.6397738682490613e-06, "loss": 0.83971512, "num_input_tokens_seen": 74416070, "step": 3447, "time_per_iteration": 2.78647780418396 }, { "auxiliary_loss_clip": 0.01363023, "auxiliary_loss_mlp": 0.01195109, "balance_loss_clip": 1.00957942, "balance_loss_mlp": 1.00055909, "epoch": 0.41459748692358567, "flos": 18259688628000.0, "grad_norm": 1.8341313406116124, "language_loss": 0.7490868, "learning_rate": 2.6390357785948734e-06, "loss": 0.7746681, "num_input_tokens_seen": 74433185, "step": 3448, "time_per_iteration": 2.7387118339538574 }, { "auxiliary_loss_clip": 0.01339266, "auxiliary_loss_mlp": 0.01195031, "balance_loss_clip": 1.00843358, "balance_loss_mlp": 1.00048184, "epoch": 0.4147177298142247, "flos": 24168466981440.0, "grad_norm": 1.749672374973649, "language_loss": 0.80419028, "learning_rate": 2.6382975919994667e-06, "loss": 0.82953322, "num_input_tokens_seen": 74453760, "step": 3449, "time_per_iteration": 2.7218663692474365 }, { "auxiliary_loss_clip": 0.01330931, "auxiliary_loss_mlp": 0.01194947, "balance_loss_clip": 1.00857115, "balance_loss_mlp": 1.00049233, "epoch": 0.41483797270486383, "flos": 20084669589600.0, "grad_norm": 1.654270460134881, "language_loss": 0.72741675, "learning_rate": 2.637559308574822e-06, "loss": 0.75267553, "num_input_tokens_seen": 74473505, "step": 3450, "time_per_iteration": 2.767205238342285 }, { "auxiliary_loss_clip": 0.01362378, "auxiliary_loss_mlp": 0.0119514, "balance_loss_clip": 1.00925565, "balance_loss_mlp": 1.00059009, "epoch": 0.4149582155955029, "flos": 30081053244960.0, "grad_norm": 1.7988421221168127, "language_loss": 0.70969582, "learning_rate": 2.6368209284329376e-06, "loss": 0.73527098, "num_input_tokens_seen": 74494135, "step": 3451, "time_per_iteration": 2.739621877670288 }, { "auxiliary_loss_clip": 0.01350159, "auxiliary_loss_mlp": 0.01195075, "balance_loss_clip": 1.00925756, "balance_loss_mlp": 1.00052571, "epoch": 0.415078458486142, "flos": 16764404090400.0, "grad_norm": 2.1271234309124885, "language_loss": 0.75703239, "learning_rate": 2.636082451685825e-06, "loss": 0.78248477, "num_input_tokens_seen": 74512335, "step": 3452, "time_per_iteration": 2.694528579711914 }, { "auxiliary_loss_clip": 0.0132208, "auxiliary_loss_mlp": 0.01194998, "balance_loss_clip": 1.00873125, "balance_loss_mlp": 1.00044835, "epoch": 0.4151987013767811, "flos": 26033705179200.0, "grad_norm": 2.110611314821287, "language_loss": 0.86278427, "learning_rate": 2.6353438784455094e-06, "loss": 0.88795507, "num_input_tokens_seen": 74535620, "step": 3453, "time_per_iteration": 2.7692105770111084 }, { "auxiliary_loss_clip": 0.01325721, "auxiliary_loss_mlp": 0.0119516, "balance_loss_clip": 1.00894213, "balance_loss_mlp": 1.00051522, "epoch": 0.41531894426742016, "flos": 24608014315200.0, "grad_norm": 2.2244681063930685, "language_loss": 0.72111422, "learning_rate": 2.6346052088240326e-06, "loss": 0.74632305, "num_input_tokens_seen": 74555140, "step": 3454, "time_per_iteration": 2.755356788635254 }, { "auxiliary_loss_clip": 0.01326486, "auxiliary_loss_mlp": 0.01194922, "balance_loss_clip": 1.00831914, "balance_loss_mlp": 1.00046825, "epoch": 0.4154391871580593, "flos": 14975800607520.0, "grad_norm": 2.054640705573681, "language_loss": 0.77338779, "learning_rate": 2.63386644293345e-06, "loss": 0.79860193, "num_input_tokens_seen": 74571485, "step": 3455, "time_per_iteration": 2.7282562255859375 }, { "auxiliary_loss_clip": 0.01325827, "auxiliary_loss_mlp": 0.01194912, "balance_loss_clip": 1.00926006, "balance_loss_mlp": 1.00036287, "epoch": 0.4155594300486984, "flos": 14647181893920.0, "grad_norm": 2.215526803580506, "language_loss": 0.82552266, "learning_rate": 2.633127580885833e-06, "loss": 0.85073006, "num_input_tokens_seen": 74585985, "step": 3456, "time_per_iteration": 2.7003324031829834 }, { "auxiliary_loss_clip": 0.01361743, "auxiliary_loss_mlp": 0.01194965, "balance_loss_clip": 1.00909412, "balance_loss_mlp": 1.00051129, "epoch": 0.41567967293933744, "flos": 29497289248800.0, "grad_norm": 1.854225932845846, "language_loss": 0.64832568, "learning_rate": 2.632388622793265e-06, "loss": 0.67389274, "num_input_tokens_seen": 74605140, "step": 3457, "time_per_iteration": 3.768097162246704 }, { "auxiliary_loss_clip": 0.01351891, "auxiliary_loss_mlp": 0.01194975, "balance_loss_clip": 1.01010358, "balance_loss_mlp": 1.00042558, "epoch": 0.41579991582997655, "flos": 19238395956480.0, "grad_norm": 4.440504419543087, "language_loss": 0.68075371, "learning_rate": 2.6316495687678457e-06, "loss": 0.70622241, "num_input_tokens_seen": 74623790, "step": 3458, "time_per_iteration": 2.691483736038208 }, { "auxiliary_loss_clip": 0.01287521, "auxiliary_loss_mlp": 0.01194948, "balance_loss_clip": 1.00767601, "balance_loss_mlp": 1.00039828, "epoch": 0.41592015872061566, "flos": 24462073316160.0, "grad_norm": 2.276928532368432, "language_loss": 0.76482582, "learning_rate": 2.6309104189216887e-06, "loss": 0.7896505, "num_input_tokens_seen": 74641355, "step": 3459, "time_per_iteration": 3.779162883758545 }, { "auxiliary_loss_clip": 0.01302425, "auxiliary_loss_mlp": 0.00872546, "balance_loss_clip": 1.00820994, "balance_loss_mlp": 1.00019848, "epoch": 0.4160404016112547, "flos": 20775662066880.0, "grad_norm": 3.1077285844576368, "language_loss": 0.74506056, "learning_rate": 2.630171173366923e-06, "loss": 0.7668103, "num_input_tokens_seen": 74657155, "step": 3460, "time_per_iteration": 3.7778613567352295 }, { "auxiliary_loss_clip": 0.01296992, "auxiliary_loss_mlp": 0.01195064, "balance_loss_clip": 1.00821948, "balance_loss_mlp": 1.00061011, "epoch": 0.41616064450189383, "flos": 13916471035680.0, "grad_norm": 2.487553190821959, "language_loss": 0.74556148, "learning_rate": 2.629431832215691e-06, "loss": 0.77048206, "num_input_tokens_seen": 74671960, "step": 3461, "time_per_iteration": 3.7822563648223877 }, { "auxiliary_loss_clip": 0.01315316, "auxiliary_loss_mlp": 0.01195284, "balance_loss_clip": 1.00817609, "balance_loss_mlp": 1.00063944, "epoch": 0.41628088739253294, "flos": 20010836921760.0, "grad_norm": 1.5055348116488816, "language_loss": 0.87302488, "learning_rate": 2.628692395580151e-06, "loss": 0.89813089, "num_input_tokens_seen": 74692050, "step": 3462, "time_per_iteration": 2.749260425567627 }, { "auxiliary_loss_clip": 0.01277282, "auxiliary_loss_mlp": 0.01195113, "balance_loss_clip": 1.00883293, "balance_loss_mlp": 1.00056362, "epoch": 0.416401130283172, "flos": 29168814229920.0, "grad_norm": 3.453801686296195, "language_loss": 0.79539859, "learning_rate": 2.6279528635724747e-06, "loss": 0.82012254, "num_input_tokens_seen": 74712205, "step": 3463, "time_per_iteration": 2.9908134937286377 }, { "auxiliary_loss_clip": 0.01350993, "auxiliary_loss_mlp": 0.0119524, "balance_loss_clip": 1.00939846, "balance_loss_mlp": 1.00059497, "epoch": 0.4165213731738111, "flos": 16246821018240.0, "grad_norm": 2.743229686159453, "language_loss": 0.78780484, "learning_rate": 2.627213236304848e-06, "loss": 0.81326711, "num_input_tokens_seen": 74729005, "step": 3464, "time_per_iteration": 2.660447835922241 }, { "auxiliary_loss_clip": 0.01350295, "auxiliary_loss_mlp": 0.01195142, "balance_loss_clip": 1.00938606, "balance_loss_mlp": 1.00059235, "epoch": 0.4166416160644502, "flos": 33765452768160.0, "grad_norm": 2.219613235556894, "language_loss": 0.71049178, "learning_rate": 2.626473513889472e-06, "loss": 0.73594612, "num_input_tokens_seen": 74751385, "step": 3465, "time_per_iteration": 2.8488240242004395 }, { "auxiliary_loss_clip": 0.01350772, "auxiliary_loss_mlp": 0.01195072, "balance_loss_clip": 1.00952911, "balance_loss_mlp": 1.00052202, "epoch": 0.41676185895508927, "flos": 20917507766400.0, "grad_norm": 6.186431210500259, "language_loss": 0.82445896, "learning_rate": 2.625733696438562e-06, "loss": 0.84991735, "num_input_tokens_seen": 74768890, "step": 3466, "time_per_iteration": 2.736654281616211 }, { "auxiliary_loss_clip": 0.01332073, "auxiliary_loss_mlp": 0.01194951, "balance_loss_clip": 1.00897145, "balance_loss_mlp": 1.00049663, "epoch": 0.4168821018457284, "flos": 18406132558560.0, "grad_norm": 1.769575180008897, "language_loss": 0.74998164, "learning_rate": 2.6249937840643476e-06, "loss": 0.77525187, "num_input_tokens_seen": 74787195, "step": 3467, "time_per_iteration": 2.791440963745117 }, { "auxiliary_loss_clip": 0.01362647, "auxiliary_loss_mlp": 0.00872522, "balance_loss_clip": 1.00967622, "balance_loss_mlp": 1.00021446, "epoch": 0.41700234473636744, "flos": 18698409717120.0, "grad_norm": 1.6763476907301467, "language_loss": 0.6700784, "learning_rate": 2.6242537768790733e-06, "loss": 0.69243008, "num_input_tokens_seen": 74806350, "step": 3468, "time_per_iteration": 2.763808488845825 }, { "auxiliary_loss_clip": 0.01338271, "auxiliary_loss_mlp": 0.01195071, "balance_loss_clip": 1.00915158, "balance_loss_mlp": 1.00052094, "epoch": 0.41712258762700655, "flos": 31033298030400.0, "grad_norm": 1.958272762096994, "language_loss": 0.6854986, "learning_rate": 2.6235136749949975e-06, "loss": 0.710832, "num_input_tokens_seen": 74829800, "step": 3469, "time_per_iteration": 2.784834384918213 }, { "auxiliary_loss_clip": 0.01362485, "auxiliary_loss_mlp": 0.01195105, "balance_loss_clip": 1.00932717, "balance_loss_mlp": 1.00045967, "epoch": 0.41724283051764566, "flos": 35914777525440.0, "grad_norm": 2.914859394778692, "language_loss": 0.61587143, "learning_rate": 2.6227734785243924e-06, "loss": 0.64144731, "num_input_tokens_seen": 74849760, "step": 3470, "time_per_iteration": 2.891458511352539 }, { "auxiliary_loss_clip": 0.01272745, "auxiliary_loss_mlp": 0.01194964, "balance_loss_clip": 1.00742066, "balance_loss_mlp": 1.00041485, "epoch": 0.4173630734082847, "flos": 25333659934560.0, "grad_norm": 1.6895277941176485, "language_loss": 0.78855634, "learning_rate": 2.6220331875795466e-06, "loss": 0.81323344, "num_input_tokens_seen": 74869110, "step": 3471, "time_per_iteration": 2.9688384532928467 }, { "auxiliary_loss_clip": 0.01351309, "auxiliary_loss_mlp": 0.01194965, "balance_loss_clip": 1.00987804, "balance_loss_mlp": 1.00041568, "epoch": 0.4174833162989238, "flos": 26685410359680.0, "grad_norm": 1.4839698705688835, "language_loss": 0.75386226, "learning_rate": 2.62129280227276e-06, "loss": 0.77932501, "num_input_tokens_seen": 74889110, "step": 3472, "time_per_iteration": 2.956787109375 }, { "auxiliary_loss_clip": 0.01350689, "auxiliary_loss_mlp": 0.01195224, "balance_loss_clip": 1.00943267, "balance_loss_mlp": 1.00057888, "epoch": 0.41760355918956293, "flos": 74739609653760.0, "grad_norm": 1.986070370384742, "language_loss": 0.68647575, "learning_rate": 2.62055232271635e-06, "loss": 0.7119348, "num_input_tokens_seen": 74916260, "step": 3473, "time_per_iteration": 3.120464324951172 }, { "auxiliary_loss_clip": 0.01318325, "auxiliary_loss_mlp": 0.01195082, "balance_loss_clip": 1.00899148, "balance_loss_mlp": 1.0004375, "epoch": 0.417723802080202, "flos": 14317521393600.0, "grad_norm": 2.041976097959154, "language_loss": 0.87472898, "learning_rate": 2.619811749022646e-06, "loss": 0.89986312, "num_input_tokens_seen": 74931570, "step": 3474, "time_per_iteration": 2.7218985557556152 }, { "auxiliary_loss_clip": 0.01340906, "auxiliary_loss_mlp": 0.01195143, "balance_loss_clip": 1.0091269, "balance_loss_mlp": 1.00049782, "epoch": 0.4178440449708411, "flos": 14643805068000.0, "grad_norm": 2.0355006963660944, "language_loss": 0.7144829, "learning_rate": 2.6190710813039917e-06, "loss": 0.73984337, "num_input_tokens_seen": 74944695, "step": 3475, "time_per_iteration": 2.767991781234741 }, { "auxiliary_loss_clip": 0.01300468, "auxiliary_loss_mlp": 0.00872662, "balance_loss_clip": 1.00873256, "balance_loss_mlp": 1.00009823, "epoch": 0.4179642878614802, "flos": 21507306940800.0, "grad_norm": 3.3344598085118613, "language_loss": 0.83752346, "learning_rate": 2.618330319672747e-06, "loss": 0.85925472, "num_input_tokens_seen": 74964115, "step": 3476, "time_per_iteration": 2.9589898586273193 }, { "auxiliary_loss_clip": 0.01362746, "auxiliary_loss_mlp": 0.01195109, "balance_loss_clip": 1.00926435, "balance_loss_mlp": 1.00055897, "epoch": 0.41808453075211927, "flos": 18442007105760.0, "grad_norm": 2.1229770771078873, "language_loss": 0.91336894, "learning_rate": 2.617589464241284e-06, "loss": 0.93894756, "num_input_tokens_seen": 74978515, "step": 3477, "time_per_iteration": 2.7410995960235596 }, { "auxiliary_loss_clip": 0.01303218, "auxiliary_loss_mlp": 0.01194992, "balance_loss_clip": 1.00901854, "balance_loss_mlp": 1.00053787, "epoch": 0.4182047736427584, "flos": 20301030506880.0, "grad_norm": 2.146884407584461, "language_loss": 0.74533546, "learning_rate": 2.6168485151219914e-06, "loss": 0.77031755, "num_input_tokens_seen": 74998135, "step": 3478, "time_per_iteration": 2.795605182647705 }, { "auxiliary_loss_clip": 0.01338809, "auxiliary_loss_mlp": 0.01195166, "balance_loss_clip": 1.00904012, "balance_loss_mlp": 1.00052154, "epoch": 0.4183250165333975, "flos": 18876632895360.0, "grad_norm": 2.187812641178121, "language_loss": 0.71292013, "learning_rate": 2.616107472427269e-06, "loss": 0.73825985, "num_input_tokens_seen": 75012830, "step": 3479, "time_per_iteration": 2.7419302463531494 }, { "auxiliary_loss_clip": 0.01351035, "auxiliary_loss_mlp": 0.01195201, "balance_loss_clip": 1.00929749, "balance_loss_mlp": 1.00055575, "epoch": 0.41844525942403654, "flos": 17740057906080.0, "grad_norm": 2.3072476827057185, "language_loss": 0.76334512, "learning_rate": 2.615366336269533e-06, "loss": 0.78880751, "num_input_tokens_seen": 75026495, "step": 3480, "time_per_iteration": 2.7078351974487305 }, { "auxiliary_loss_clip": 0.01363472, "auxiliary_loss_mlp": 0.01195295, "balance_loss_clip": 1.00981021, "balance_loss_mlp": 1.00064969, "epoch": 0.41856550231467565, "flos": 18361384862400.0, "grad_norm": 2.1310715066814567, "language_loss": 0.80253392, "learning_rate": 2.6146251067612126e-06, "loss": 0.82812154, "num_input_tokens_seen": 75041970, "step": 3481, "time_per_iteration": 2.6866941452026367 }, { "auxiliary_loss_clip": 0.01339241, "auxiliary_loss_mlp": 0.01195141, "balance_loss_clip": 1.01025438, "balance_loss_mlp": 1.00049603, "epoch": 0.41868574520531476, "flos": 22781811948480.0, "grad_norm": 1.7038027740369173, "language_loss": 0.82571518, "learning_rate": 2.6138837840147525e-06, "loss": 0.85105896, "num_input_tokens_seen": 75061005, "step": 3482, "time_per_iteration": 2.745630979537964 }, { "auxiliary_loss_clip": 0.01300361, "auxiliary_loss_mlp": 0.01194804, "balance_loss_clip": 1.00839186, "balance_loss_mlp": 1.00034976, "epoch": 0.4188059880959538, "flos": 13699176102720.0, "grad_norm": 2.012142715527942, "language_loss": 0.763372, "learning_rate": 2.6131423681426103e-06, "loss": 0.78832364, "num_input_tokens_seen": 75076920, "step": 3483, "time_per_iteration": 2.716365098953247 }, { "auxiliary_loss_clip": 0.01362385, "auxiliary_loss_mlp": 0.01195041, "balance_loss_clip": 1.00950575, "balance_loss_mlp": 1.00049102, "epoch": 0.41892623098659293, "flos": 37818297156960.0, "grad_norm": 1.542743321252865, "language_loss": 0.73191547, "learning_rate": 2.6124008592572587e-06, "loss": 0.7574898, "num_input_tokens_seen": 75100905, "step": 3484, "time_per_iteration": 4.662925720214844 }, { "auxiliary_loss_clip": 0.01364102, "auxiliary_loss_mlp": 0.01195152, "balance_loss_clip": 1.00948501, "balance_loss_mlp": 1.00060272, "epoch": 0.419046473877232, "flos": 23258886318720.0, "grad_norm": 11.760498782934263, "language_loss": 0.81618673, "learning_rate": 2.6116592574711835e-06, "loss": 0.84177923, "num_input_tokens_seen": 75119205, "step": 3485, "time_per_iteration": 2.7164580821990967 }, { "auxiliary_loss_clip": 0.01363602, "auxiliary_loss_mlp": 0.01195314, "balance_loss_clip": 1.00979018, "balance_loss_mlp": 1.00057399, "epoch": 0.4191667167678711, "flos": 20741044848480.0, "grad_norm": 1.9728458960647373, "language_loss": 0.83890688, "learning_rate": 2.6109175628968853e-06, "loss": 0.86449599, "num_input_tokens_seen": 75138970, "step": 3486, "time_per_iteration": 3.6564557552337646 }, { "auxiliary_loss_clip": 0.01350445, "auxiliary_loss_mlp": 0.01195362, "balance_loss_clip": 1.0095849, "balance_loss_mlp": 1.0007174, "epoch": 0.4192869596585102, "flos": 23586427321920.0, "grad_norm": 1.846363357973317, "language_loss": 0.83119285, "learning_rate": 2.610175775646878e-06, "loss": 0.85665095, "num_input_tokens_seen": 75157550, "step": 3487, "time_per_iteration": 2.7709085941314697 }, { "auxiliary_loss_clip": 0.0132641, "auxiliary_loss_mlp": 0.01195055, "balance_loss_clip": 1.00860274, "balance_loss_mlp": 1.00050533, "epoch": 0.41940720254914926, "flos": 25081280775360.0, "grad_norm": 2.2479437394929307, "language_loss": 0.72271544, "learning_rate": 2.6094338958336907e-06, "loss": 0.74793005, "num_input_tokens_seen": 75176220, "step": 3488, "time_per_iteration": 4.089081287384033 }, { "auxiliary_loss_clip": 0.01319514, "auxiliary_loss_mlp": 0.01194953, "balance_loss_clip": 1.00819921, "balance_loss_mlp": 1.00049829, "epoch": 0.41952744543978837, "flos": 15554140128000.0, "grad_norm": 2.016818822601042, "language_loss": 0.82378328, "learning_rate": 2.608691923569867e-06, "loss": 0.84892797, "num_input_tokens_seen": 75193095, "step": 3489, "time_per_iteration": 2.727113723754883 }, { "auxiliary_loss_clip": 0.01342962, "auxiliary_loss_mlp": 0.01195172, "balance_loss_clip": 1.00936937, "balance_loss_mlp": 1.00052691, "epoch": 0.4196476883304275, "flos": 24644786954400.0, "grad_norm": 1.5391884596378596, "language_loss": 0.75696915, "learning_rate": 2.6079498589679616e-06, "loss": 0.78235048, "num_input_tokens_seen": 75214185, "step": 3490, "time_per_iteration": 2.790367364883423 }, { "auxiliary_loss_clip": 0.01282048, "auxiliary_loss_mlp": 0.01195205, "balance_loss_clip": 1.00861728, "balance_loss_mlp": 1.00056028, "epoch": 0.41976793122106654, "flos": 24531343676640.0, "grad_norm": 1.9743097463941424, "language_loss": 0.76144111, "learning_rate": 2.6072077021405465e-06, "loss": 0.78621364, "num_input_tokens_seen": 75233020, "step": 3491, "time_per_iteration": 2.8775179386138916 }, { "auxiliary_loss_clip": 0.01323629, "auxiliary_loss_mlp": 0.01195006, "balance_loss_clip": 1.00887024, "balance_loss_mlp": 1.00045657, "epoch": 0.41988817411170565, "flos": 21175311401280.0, "grad_norm": 1.615749251864519, "language_loss": 0.69110334, "learning_rate": 2.6064654532002054e-06, "loss": 0.71628964, "num_input_tokens_seen": 75252030, "step": 3492, "time_per_iteration": 2.834923267364502 }, { "auxiliary_loss_clip": 0.01362928, "auxiliary_loss_mlp": 0.01194994, "balance_loss_clip": 1.00939536, "balance_loss_mlp": 1.00044429, "epoch": 0.42000841700234476, "flos": 31649416053120.0, "grad_norm": 3.7517499998738817, "language_loss": 0.75638723, "learning_rate": 2.6057231122595375e-06, "loss": 0.78196645, "num_input_tokens_seen": 75273340, "step": 3493, "time_per_iteration": 2.7969343662261963 }, { "auxiliary_loss_clip": 0.0132775, "auxiliary_loss_mlp": 0.01194668, "balance_loss_clip": 1.00850344, "balance_loss_mlp": 1.00040472, "epoch": 0.4201286598929838, "flos": 21281534019360.0, "grad_norm": 1.6838385159957463, "language_loss": 0.7305423, "learning_rate": 2.604980679431154e-06, "loss": 0.75576645, "num_input_tokens_seen": 75291580, "step": 3494, "time_per_iteration": 2.7854816913604736 }, { "auxiliary_loss_clip": 0.01351156, "auxiliary_loss_mlp": 0.01194828, "balance_loss_clip": 1.00951958, "balance_loss_mlp": 1.0003736, "epoch": 0.4202489027836229, "flos": 18546541310880.0, "grad_norm": 2.0861005857537678, "language_loss": 0.74258333, "learning_rate": 2.604238154827684e-06, "loss": 0.76804316, "num_input_tokens_seen": 75308205, "step": 3495, "time_per_iteration": 2.7140884399414062 }, { "auxiliary_loss_clip": 0.01350413, "auxiliary_loss_mlp": 0.0119497, "balance_loss_clip": 1.01000559, "balance_loss_mlp": 1.00042093, "epoch": 0.42036914567426203, "flos": 19317653100000.0, "grad_norm": 2.313803992917402, "language_loss": 0.72273886, "learning_rate": 2.6034955385617656e-06, "loss": 0.74819273, "num_input_tokens_seen": 75326535, "step": 3496, "time_per_iteration": 2.7366483211517334 }, { "auxiliary_loss_clip": 0.01287152, "auxiliary_loss_mlp": 0.01193941, "balance_loss_clip": 1.00408435, "balance_loss_mlp": 1.00015402, "epoch": 0.4204893885649011, "flos": 67842976642560.0, "grad_norm": 0.7202742600633613, "language_loss": 0.61623931, "learning_rate": 2.6027528307460544e-06, "loss": 0.64105022, "num_input_tokens_seen": 75390540, "step": 3497, "time_per_iteration": 3.4278793334960938 }, { "auxiliary_loss_clip": 0.01362717, "auxiliary_loss_mlp": 0.01195259, "balance_loss_clip": 1.00930154, "balance_loss_mlp": 1.00051832, "epoch": 0.4206096314555402, "flos": 21908788382880.0, "grad_norm": 1.9586470585296472, "language_loss": 0.86612403, "learning_rate": 2.602010031493217e-06, "loss": 0.89170372, "num_input_tokens_seen": 75408770, "step": 3498, "time_per_iteration": 2.694591522216797 }, { "auxiliary_loss_clip": 0.01300894, "auxiliary_loss_mlp": 0.01194826, "balance_loss_clip": 1.00900328, "balance_loss_mlp": 1.00037217, "epoch": 0.42072987434617926, "flos": 29278198131840.0, "grad_norm": 1.7432058079752084, "language_loss": 0.87216008, "learning_rate": 2.6012671409159367e-06, "loss": 0.89711726, "num_input_tokens_seen": 75430105, "step": 3499, "time_per_iteration": 2.8460330963134766 }, { "auxiliary_loss_clip": 0.01312188, "auxiliary_loss_mlp": 0.01195033, "balance_loss_clip": 1.00816619, "balance_loss_mlp": 1.00038815, "epoch": 0.42085011723681837, "flos": 27600738811200.0, "grad_norm": 1.871523042238019, "language_loss": 0.81393772, "learning_rate": 2.6005241591269097e-06, "loss": 0.83900994, "num_input_tokens_seen": 75449475, "step": 3500, "time_per_iteration": 2.7656006813049316 }, { "auxiliary_loss_clip": 0.01294722, "auxiliary_loss_mlp": 0.01195079, "balance_loss_clip": 1.00769877, "balance_loss_mlp": 1.00053, "epoch": 0.4209703601274575, "flos": 27818644446720.0, "grad_norm": 1.5326248730195942, "language_loss": 0.79817462, "learning_rate": 2.5997810862388454e-06, "loss": 0.82307267, "num_input_tokens_seen": 75469315, "step": 3501, "time_per_iteration": 2.85469388961792 }, { "auxiliary_loss_clip": 0.01328192, "auxiliary_loss_mlp": 0.01195303, "balance_loss_clip": 1.0086478, "balance_loss_mlp": 1.00065815, "epoch": 0.42109060301809653, "flos": 27525541043520.0, "grad_norm": 2.386489592250766, "language_loss": 0.75861347, "learning_rate": 2.599037922364467e-06, "loss": 0.7838484, "num_input_tokens_seen": 75488215, "step": 3502, "time_per_iteration": 2.794740676879883 }, { "auxiliary_loss_clip": 0.01291194, "auxiliary_loss_mlp": 0.01195056, "balance_loss_clip": 1.00833726, "balance_loss_mlp": 1.00050652, "epoch": 0.42121084590873564, "flos": 29314288221120.0, "grad_norm": 1.98002727706774, "language_loss": 0.7533623, "learning_rate": 2.5982946676165112e-06, "loss": 0.77822477, "num_input_tokens_seen": 75507985, "step": 3503, "time_per_iteration": 2.8399481773376465 }, { "auxiliary_loss_clip": 0.01283549, "auxiliary_loss_mlp": 0.01193847, "balance_loss_clip": 1.0107193, "balance_loss_mlp": 1.00006032, "epoch": 0.42133108879937475, "flos": 67398867001440.0, "grad_norm": 0.7259681922987511, "language_loss": 0.57676554, "learning_rate": 2.5975513221077313e-06, "loss": 0.60153949, "num_input_tokens_seen": 75571955, "step": 3504, "time_per_iteration": 3.5218827724456787 }, { "auxiliary_loss_clip": 0.01325257, "auxiliary_loss_mlp": 0.0119508, "balance_loss_clip": 1.00859761, "balance_loss_mlp": 1.00053072, "epoch": 0.4214513316900138, "flos": 23106047973120.0, "grad_norm": 2.1456700705623035, "language_loss": 0.88026214, "learning_rate": 2.5968078859508897e-06, "loss": 0.90546548, "num_input_tokens_seen": 75589155, "step": 3505, "time_per_iteration": 2.786404609680176 }, { "auxiliary_loss_clip": 0.01340582, "auxiliary_loss_mlp": 0.01194806, "balance_loss_clip": 1.00868249, "balance_loss_mlp": 1.00035191, "epoch": 0.4215715745806529, "flos": 15336198568800.0, "grad_norm": 2.2268162591607163, "language_loss": 0.79789293, "learning_rate": 2.5960643592587673e-06, "loss": 0.82324678, "num_input_tokens_seen": 75606565, "step": 3506, "time_per_iteration": 2.6933071613311768 }, { "auxiliary_loss_clip": 0.01304601, "auxiliary_loss_mlp": 0.01194904, "balance_loss_clip": 1.00810671, "balance_loss_mlp": 1.00035405, "epoch": 0.42169181747129203, "flos": 22127268797280.0, "grad_norm": 1.7686046606864492, "language_loss": 0.81420916, "learning_rate": 2.5953207421441553e-06, "loss": 0.83920419, "num_input_tokens_seen": 75625165, "step": 3507, "time_per_iteration": 2.8108339309692383 }, { "auxiliary_loss_clip": 0.01313318, "auxiliary_loss_mlp": 0.01195032, "balance_loss_clip": 1.00980902, "balance_loss_mlp": 1.00057769, "epoch": 0.4218120603619311, "flos": 22630733863200.0, "grad_norm": 2.1887877596076546, "language_loss": 0.75104415, "learning_rate": 2.5945770347198603e-06, "loss": 0.77612764, "num_input_tokens_seen": 75643320, "step": 3508, "time_per_iteration": 2.7838146686553955 }, { "auxiliary_loss_clip": 0.0132716, "auxiliary_loss_mlp": 0.01195016, "balance_loss_clip": 1.0092876, "balance_loss_mlp": 1.00046623, "epoch": 0.4219323032525702, "flos": 19682828910720.0, "grad_norm": 1.768268119116826, "language_loss": 0.81511837, "learning_rate": 2.593833237098701e-06, "loss": 0.84034014, "num_input_tokens_seen": 75660920, "step": 3509, "time_per_iteration": 2.883413076400757 }, { "auxiliary_loss_clip": 0.01350984, "auxiliary_loss_mlp": 0.01195055, "balance_loss_clip": 1.00921404, "balance_loss_mlp": 1.00050521, "epoch": 0.4220525461432093, "flos": 30190760460000.0, "grad_norm": 1.6287335398634237, "language_loss": 0.62339807, "learning_rate": 2.593089349393512e-06, "loss": 0.64885843, "num_input_tokens_seen": 75681410, "step": 3510, "time_per_iteration": 3.75185489654541 }, { "auxiliary_loss_clip": 0.0133872, "auxiliary_loss_mlp": 0.01195069, "balance_loss_clip": 1.00974035, "balance_loss_mlp": 1.00051916, "epoch": 0.42217278903384836, "flos": 24315952698720.0, "grad_norm": 1.9435601779019425, "language_loss": 0.83116651, "learning_rate": 2.592345371717141e-06, "loss": 0.85650444, "num_input_tokens_seen": 75700940, "step": 3511, "time_per_iteration": 3.7404277324676514 }, { "auxiliary_loss_clip": 0.01338552, "auxiliary_loss_mlp": 0.01195046, "balance_loss_clip": 1.00992, "balance_loss_mlp": 1.00059152, "epoch": 0.42229303192448747, "flos": 17092483948800.0, "grad_norm": 2.759027636973291, "language_loss": 0.71904284, "learning_rate": 2.591601304182448e-06, "loss": 0.74437881, "num_input_tokens_seen": 75718910, "step": 3512, "time_per_iteration": 2.8081271648406982 }, { "auxiliary_loss_clip": 0.01322154, "auxiliary_loss_mlp": 0.01194914, "balance_loss_clip": 1.00915337, "balance_loss_mlp": 1.00046015, "epoch": 0.4224132748151266, "flos": 22784542148160.0, "grad_norm": 1.6275741341201517, "language_loss": 0.79189104, "learning_rate": 2.5908571469023067e-06, "loss": 0.81706172, "num_input_tokens_seen": 75738395, "step": 3513, "time_per_iteration": 3.844757556915283 }, { "auxiliary_loss_clip": 0.01362045, "auxiliary_loss_mlp": 0.01195057, "balance_loss_clip": 1.00923538, "balance_loss_mlp": 1.00050724, "epoch": 0.42253351770576564, "flos": 17819099507520.0, "grad_norm": 2.3832840533319724, "language_loss": 0.7471112, "learning_rate": 2.5901128999896067e-06, "loss": 0.77268225, "num_input_tokens_seen": 75753825, "step": 3514, "time_per_iteration": 3.570519208908081 }, { "auxiliary_loss_clip": 0.01338232, "auxiliary_loss_mlp": 0.01194863, "balance_loss_clip": 1.00899529, "balance_loss_mlp": 1.00040925, "epoch": 0.42265376059640475, "flos": 28512403047360.0, "grad_norm": 1.6536205339800556, "language_loss": 0.68137276, "learning_rate": 2.5893685635572487e-06, "loss": 0.70670366, "num_input_tokens_seen": 75774675, "step": 3515, "time_per_iteration": 2.805633068084717 }, { "auxiliary_loss_clip": 0.01313284, "auxiliary_loss_mlp": 0.01195036, "balance_loss_clip": 1.00828183, "balance_loss_mlp": 1.00048614, "epoch": 0.4227740034870438, "flos": 16253359128000.0, "grad_norm": 1.9937890734418302, "language_loss": 0.69093561, "learning_rate": 2.5886241377181483e-06, "loss": 0.7160188, "num_input_tokens_seen": 75793545, "step": 3516, "time_per_iteration": 2.7178544998168945 }, { "auxiliary_loss_clip": 0.01340993, "auxiliary_loss_mlp": 0.01195477, "balance_loss_clip": 1.00919008, "balance_loss_mlp": 1.00064182, "epoch": 0.4228942463776829, "flos": 25295701813920.0, "grad_norm": 1.7045017559404811, "language_loss": 0.81062412, "learning_rate": 2.587879622585234e-06, "loss": 0.83598888, "num_input_tokens_seen": 75812145, "step": 3517, "time_per_iteration": 2.787196397781372 }, { "auxiliary_loss_clip": 0.01340143, "auxiliary_loss_mlp": 0.01194948, "balance_loss_clip": 1.00918424, "balance_loss_mlp": 1.00049424, "epoch": 0.423014489268322, "flos": 26395791553440.0, "grad_norm": 2.0990618146306943, "language_loss": 0.75701976, "learning_rate": 2.5871350182714486e-06, "loss": 0.78237069, "num_input_tokens_seen": 75833025, "step": 3518, "time_per_iteration": 2.7385432720184326 }, { "auxiliary_loss_clip": 0.0136218, "auxiliary_loss_mlp": 0.0119495, "balance_loss_clip": 1.00950432, "balance_loss_mlp": 1.00040066, "epoch": 0.4231347321589611, "flos": 17274012105600.0, "grad_norm": 2.1033572359594372, "language_loss": 0.80034387, "learning_rate": 2.586390324889748e-06, "loss": 0.82591516, "num_input_tokens_seen": 75848925, "step": 3519, "time_per_iteration": 2.6849002838134766 }, { "auxiliary_loss_clip": 0.01337164, "auxiliary_loss_mlp": 0.01195045, "balance_loss_clip": 1.00919604, "balance_loss_mlp": 1.00049543, "epoch": 0.4232549750496002, "flos": 22999645736640.0, "grad_norm": 2.0390067845799056, "language_loss": 0.67298853, "learning_rate": 2.5856455425531003e-06, "loss": 0.69831061, "num_input_tokens_seen": 75870400, "step": 3520, "time_per_iteration": 2.850445032119751 }, { "auxiliary_loss_clip": 0.01340052, "auxiliary_loss_mlp": 0.011948, "balance_loss_clip": 1.00953126, "balance_loss_mlp": 1.00044131, "epoch": 0.4233752179402393, "flos": 21248353748160.0, "grad_norm": 2.67206780953288, "language_loss": 0.80428118, "learning_rate": 2.5849006713744902e-06, "loss": 0.82962966, "num_input_tokens_seen": 75889195, "step": 3521, "time_per_iteration": 2.7079713344573975 }, { "auxiliary_loss_clip": 0.01327305, "auxiliary_loss_mlp": 0.011952, "balance_loss_clip": 1.00947881, "balance_loss_mlp": 1.00055552, "epoch": 0.42349546083087836, "flos": 20704308132960.0, "grad_norm": 2.630922722675959, "language_loss": 0.73059452, "learning_rate": 2.5841557114669135e-06, "loss": 0.75581956, "num_input_tokens_seen": 75906055, "step": 3522, "time_per_iteration": 2.8782334327697754 }, { "auxiliary_loss_clip": 0.01363537, "auxiliary_loss_mlp": 0.0119504, "balance_loss_clip": 1.00955749, "balance_loss_mlp": 1.00039506, "epoch": 0.42361570372151747, "flos": 18585074210400.0, "grad_norm": 2.4084863261172047, "language_loss": 0.66973966, "learning_rate": 2.58341066294338e-06, "loss": 0.69532543, "num_input_tokens_seen": 75922720, "step": 3523, "time_per_iteration": 2.669126033782959 }, { "auxiliary_loss_clip": 0.012912, "auxiliary_loss_mlp": 0.00872515, "balance_loss_clip": 1.00856614, "balance_loss_mlp": 1.00024748, "epoch": 0.4237359466121566, "flos": 20959489339200.0, "grad_norm": 2.0446189094242113, "language_loss": 0.85332859, "learning_rate": 2.5826655259169124e-06, "loss": 0.87496567, "num_input_tokens_seen": 75941375, "step": 3524, "time_per_iteration": 2.898674964904785 }, { "auxiliary_loss_clip": 0.01363186, "auxiliary_loss_mlp": 0.01195124, "balance_loss_clip": 1.0101397, "balance_loss_mlp": 1.00047898, "epoch": 0.42385618950279563, "flos": 18038190624480.0, "grad_norm": 1.751467269793113, "language_loss": 0.90206379, "learning_rate": 2.5819203005005475e-06, "loss": 0.92764688, "num_input_tokens_seen": 75958710, "step": 3525, "time_per_iteration": 2.71341872215271 }, { "auxiliary_loss_clip": 0.0131176, "auxiliary_loss_mlp": 0.01195032, "balance_loss_clip": 1.00820446, "balance_loss_mlp": 1.00048268, "epoch": 0.42397643239343474, "flos": 23769140960160.0, "grad_norm": 1.4958431222564232, "language_loss": 0.78824031, "learning_rate": 2.581174986807336e-06, "loss": 0.81330824, "num_input_tokens_seen": 75978945, "step": 3526, "time_per_iteration": 2.802236318588257 }, { "auxiliary_loss_clip": 0.0135084, "auxiliary_loss_mlp": 0.0087248, "balance_loss_clip": 1.00993228, "balance_loss_mlp": 1.00013459, "epoch": 0.42409667528407385, "flos": 16545097431360.0, "grad_norm": 3.083479985488027, "language_loss": 0.91322333, "learning_rate": 2.580429584950341e-06, "loss": 0.93545657, "num_input_tokens_seen": 75994695, "step": 3527, "time_per_iteration": 2.654768705368042 }, { "auxiliary_loss_clip": 0.01303553, "auxiliary_loss_mlp": 0.01195177, "balance_loss_clip": 1.00985873, "balance_loss_mlp": 1.00053239, "epoch": 0.4242169181747129, "flos": 16034196163680.0, "grad_norm": 2.687700579850043, "language_loss": 0.6623131, "learning_rate": 2.5796840950426397e-06, "loss": 0.68730038, "num_input_tokens_seen": 76011780, "step": 3528, "time_per_iteration": 2.86836838722229 }, { "auxiliary_loss_clip": 0.01351452, "auxiliary_loss_mlp": 0.01194935, "balance_loss_clip": 1.01014137, "balance_loss_mlp": 1.00038517, "epoch": 0.424337161065352, "flos": 20084022963360.0, "grad_norm": 3.245322618711844, "language_loss": 0.65829098, "learning_rate": 2.578938517197322e-06, "loss": 0.68375492, "num_input_tokens_seen": 76029875, "step": 3529, "time_per_iteration": 2.706887722015381 }, { "auxiliary_loss_clip": 0.01329075, "auxiliary_loss_mlp": 0.01194965, "balance_loss_clip": 1.00921798, "balance_loss_mlp": 1.0004158, "epoch": 0.4244574039559911, "flos": 23878381167360.0, "grad_norm": 2.2029957919947822, "language_loss": 0.62459409, "learning_rate": 2.5781928515274916e-06, "loss": 0.64983451, "num_input_tokens_seen": 76048595, "step": 3530, "time_per_iteration": 2.7619831562042236 }, { "auxiliary_loss_clip": 0.0134416, "auxiliary_loss_mlp": 0.0119508, "balance_loss_clip": 1.00966859, "balance_loss_mlp": 1.00043488, "epoch": 0.4245776468466302, "flos": 17565930027360.0, "grad_norm": 2.0681098502139608, "language_loss": 0.67682827, "learning_rate": 2.577447098146265e-06, "loss": 0.70222068, "num_input_tokens_seen": 76065770, "step": 3531, "time_per_iteration": 2.787813663482666 }, { "auxiliary_loss_clip": 0.01307829, "auxiliary_loss_mlp": 0.0119512, "balance_loss_clip": 1.00866568, "balance_loss_mlp": 1.00047469, "epoch": 0.4246978897372693, "flos": 27776267713440.0, "grad_norm": 1.7363948925225197, "language_loss": 0.78939611, "learning_rate": 2.5767012571667724e-06, "loss": 0.81442559, "num_input_tokens_seen": 76085250, "step": 3532, "time_per_iteration": 2.8462350368499756 }, { "auxiliary_loss_clip": 0.01351281, "auxiliary_loss_mlp": 0.0119532, "balance_loss_clip": 1.00989926, "balance_loss_mlp": 1.00057971, "epoch": 0.42481813262790835, "flos": 15596624632320.0, "grad_norm": 1.764850466085628, "language_loss": 0.67868704, "learning_rate": 2.5759553287021587e-06, "loss": 0.70415306, "num_input_tokens_seen": 76103580, "step": 3533, "time_per_iteration": 2.8367416858673096 }, { "auxiliary_loss_clip": 0.01326153, "auxiliary_loss_mlp": 0.01195032, "balance_loss_clip": 1.00996947, "balance_loss_mlp": 1.0004828, "epoch": 0.42493837551854746, "flos": 23951100201120.0, "grad_norm": 1.9130124375024091, "language_loss": 0.77384162, "learning_rate": 2.5752093128655786e-06, "loss": 0.79905343, "num_input_tokens_seen": 76121825, "step": 3534, "time_per_iteration": 2.8386566638946533 }, { "auxiliary_loss_clip": 0.01328026, "auxiliary_loss_mlp": 0.01194954, "balance_loss_clip": 1.00914073, "balance_loss_mlp": 1.00040483, "epoch": 0.4250586184091866, "flos": 20813476492800.0, "grad_norm": 1.8839645933009523, "language_loss": 0.73897725, "learning_rate": 2.574463209770204e-06, "loss": 0.76420712, "num_input_tokens_seen": 76141140, "step": 3535, "time_per_iteration": 2.7440149784088135 }, { "auxiliary_loss_clip": 0.01314403, "auxiliary_loss_mlp": 0.01195175, "balance_loss_clip": 1.00980115, "balance_loss_mlp": 1.00052977, "epoch": 0.42517886129982563, "flos": 30371031288000.0, "grad_norm": 1.6755649163830209, "language_loss": 0.79634738, "learning_rate": 2.5737170195292165e-06, "loss": 0.82144314, "num_input_tokens_seen": 76164475, "step": 3536, "time_per_iteration": 2.830565929412842 }, { "auxiliary_loss_clip": 0.01310081, "auxiliary_loss_mlp": 0.01195125, "balance_loss_clip": 1.00829315, "balance_loss_mlp": 1.0004797, "epoch": 0.42529910419046474, "flos": 20080646137440.0, "grad_norm": 1.942200942594304, "language_loss": 0.77705872, "learning_rate": 2.572970742255814e-06, "loss": 0.80211079, "num_input_tokens_seen": 76182965, "step": 3537, "time_per_iteration": 4.69258189201355 }, { "auxiliary_loss_clip": 0.01338658, "auxiliary_loss_mlp": 0.01194853, "balance_loss_clip": 1.00923276, "balance_loss_mlp": 1.00039887, "epoch": 0.42541934708110385, "flos": 22632458199840.0, "grad_norm": 1.7466662449692303, "language_loss": 0.81356066, "learning_rate": 2.5722243780632046e-06, "loss": 0.8388958, "num_input_tokens_seen": 76201230, "step": 3538, "time_per_iteration": 3.7212953567504883 }, { "auxiliary_loss_clip": 0.01270953, "auxiliary_loss_mlp": 0.01193924, "balance_loss_clip": 1.00370944, "balance_loss_mlp": 1.00013721, "epoch": 0.4255395899717429, "flos": 66200709319200.0, "grad_norm": 0.7500308980911036, "language_loss": 0.60493088, "learning_rate": 2.5714779270646125e-06, "loss": 0.62957966, "num_input_tokens_seen": 76262000, "step": 3539, "time_per_iteration": 3.3768692016601562 }, { "auxiliary_loss_clip": 0.01316379, "auxiliary_loss_mlp": 0.00872549, "balance_loss_clip": 1.00855589, "balance_loss_mlp": 1.00016356, "epoch": 0.425659832862382, "flos": 17931824311680.0, "grad_norm": 2.1287934208307764, "language_loss": 0.77762842, "learning_rate": 2.5707313893732735e-06, "loss": 0.79951763, "num_input_tokens_seen": 76280540, "step": 3540, "time_per_iteration": 2.757042646408081 }, { "auxiliary_loss_clip": 0.01260661, "auxiliary_loss_mlp": 0.01194909, "balance_loss_clip": 1.00766242, "balance_loss_mlp": 1.00045502, "epoch": 0.4257800757530211, "flos": 24022561906080.0, "grad_norm": 1.7621004138783682, "language_loss": 0.76787889, "learning_rate": 2.5699847651024364e-06, "loss": 0.79243457, "num_input_tokens_seen": 76301180, "step": 3541, "time_per_iteration": 4.061060667037964 }, { "auxiliary_loss_clip": 0.01337941, "auxiliary_loss_mlp": 0.01195033, "balance_loss_clip": 1.00954461, "balance_loss_mlp": 1.00048327, "epoch": 0.4259003186436602, "flos": 23696026765920.0, "grad_norm": 2.126315361159735, "language_loss": 0.76631266, "learning_rate": 2.5692380543653627e-06, "loss": 0.79164243, "num_input_tokens_seen": 76319335, "step": 3542, "time_per_iteration": 3.07114577293396 }, { "auxiliary_loss_clip": 0.01350732, "auxiliary_loss_mlp": 0.00872517, "balance_loss_clip": 1.01025462, "balance_loss_mlp": 1.00020659, "epoch": 0.4260205615342993, "flos": 15259779396000.0, "grad_norm": 1.9778415461524443, "language_loss": 0.69766569, "learning_rate": 2.5684912572753293e-06, "loss": 0.71989822, "num_input_tokens_seen": 76335010, "step": 3543, "time_per_iteration": 2.771930456161499 }, { "auxiliary_loss_clip": 0.01361376, "auxiliary_loss_mlp": 0.01194958, "balance_loss_clip": 1.00944161, "balance_loss_mlp": 1.0005039, "epoch": 0.4261408044249384, "flos": 30665320172640.0, "grad_norm": 1.6096790947807522, "language_loss": 0.84096527, "learning_rate": 2.5677443739456245e-06, "loss": 0.86652863, "num_input_tokens_seen": 76356670, "step": 3544, "time_per_iteration": 2.8549892902374268 }, { "auxiliary_loss_clip": 0.01323579, "auxiliary_loss_mlp": 0.01195017, "balance_loss_clip": 1.00962329, "balance_loss_mlp": 1.0004673, "epoch": 0.42626104731557746, "flos": 23257916379360.0, "grad_norm": 2.3059939555601265, "language_loss": 0.79728079, "learning_rate": 2.5669974044895495e-06, "loss": 0.82246673, "num_input_tokens_seen": 76373065, "step": 3545, "time_per_iteration": 2.9029741287231445 }, { "auxiliary_loss_clip": 0.01319035, "auxiliary_loss_mlp": 0.01194912, "balance_loss_clip": 1.00926948, "balance_loss_mlp": 1.00045776, "epoch": 0.42638129020621657, "flos": 25884782514720.0, "grad_norm": 1.6583886544126227, "language_loss": 0.79465461, "learning_rate": 2.5662503490204187e-06, "loss": 0.81979406, "num_input_tokens_seen": 76393230, "step": 3546, "time_per_iteration": 2.8990139961242676 }, { "auxiliary_loss_clip": 0.01325563, "auxiliary_loss_mlp": 0.01194939, "balance_loss_clip": 1.00891507, "balance_loss_mlp": 1.00038922, "epoch": 0.4265015330968556, "flos": 26502373408320.0, "grad_norm": 2.0016908170994, "language_loss": 0.76044285, "learning_rate": 2.5655032076515603e-06, "loss": 0.78564781, "num_input_tokens_seen": 76412555, "step": 3547, "time_per_iteration": 2.8208374977111816 }, { "auxiliary_loss_clip": 0.01314283, "auxiliary_loss_mlp": 0.0119512, "balance_loss_clip": 1.00914085, "balance_loss_mlp": 1.00057006, "epoch": 0.42662177598749473, "flos": 24389533900800.0, "grad_norm": 2.4775177094851886, "language_loss": 0.81897056, "learning_rate": 2.5647559804963155e-06, "loss": 0.84406459, "num_input_tokens_seen": 76432485, "step": 3548, "time_per_iteration": 2.801062822341919 }, { "auxiliary_loss_clip": 0.01291633, "auxiliary_loss_mlp": 0.01195124, "balance_loss_clip": 1.00826454, "balance_loss_mlp": 1.00047934, "epoch": 0.42674201887813384, "flos": 23148640248480.0, "grad_norm": 1.8823471884612577, "language_loss": 0.78477126, "learning_rate": 2.5640086676680364e-06, "loss": 0.80963886, "num_input_tokens_seen": 76453980, "step": 3549, "time_per_iteration": 2.88277268409729 }, { "auxiliary_loss_clip": 0.013381, "auxiliary_loss_mlp": 0.01195155, "balance_loss_clip": 1.00917256, "balance_loss_mlp": 1.00051022, "epoch": 0.4268622617687729, "flos": 21689625418560.0, "grad_norm": 2.153637547774817, "language_loss": 0.80855107, "learning_rate": 2.5632612692800923e-06, "loss": 0.83388364, "num_input_tokens_seen": 76473045, "step": 3550, "time_per_iteration": 2.7089333534240723 }, { "auxiliary_loss_clip": 0.01305758, "auxiliary_loss_mlp": 0.01195185, "balance_loss_clip": 1.00889075, "balance_loss_mlp": 1.00054026, "epoch": 0.426982504659412, "flos": 23440163009760.0, "grad_norm": 2.025319641023868, "language_loss": 0.75042081, "learning_rate": 2.5625137854458603e-06, "loss": 0.7754302, "num_input_tokens_seen": 76492060, "step": 3551, "time_per_iteration": 2.857922077178955 }, { "auxiliary_loss_clip": 0.01328973, "auxiliary_loss_mlp": 0.011949, "balance_loss_clip": 1.00889397, "balance_loss_mlp": 1.00035083, "epoch": 0.4271027475500511, "flos": 18916566818400.0, "grad_norm": 1.9250482062911842, "language_loss": 0.80356854, "learning_rate": 2.561766216278735e-06, "loss": 0.82880729, "num_input_tokens_seen": 76509655, "step": 3552, "time_per_iteration": 2.7230231761932373 }, { "auxiliary_loss_clip": 0.01281366, "auxiliary_loss_mlp": 0.01194938, "balance_loss_clip": 1.00821364, "balance_loss_mlp": 1.00038862, "epoch": 0.4272229904406902, "flos": 26870566808160.0, "grad_norm": 1.7302182576233593, "language_loss": 0.81241453, "learning_rate": 2.561018561892121e-06, "loss": 0.83717763, "num_input_tokens_seen": 76528795, "step": 3553, "time_per_iteration": 2.8956730365753174 }, { "auxiliary_loss_clip": 0.01325633, "auxiliary_loss_mlp": 0.01195066, "balance_loss_clip": 1.00878286, "balance_loss_mlp": 1.00051618, "epoch": 0.4273432333313293, "flos": 23951387590560.0, "grad_norm": 1.702678671380814, "language_loss": 0.76767552, "learning_rate": 2.5602708223994363e-06, "loss": 0.7928825, "num_input_tokens_seen": 76550660, "step": 3554, "time_per_iteration": 2.808037042617798 }, { "auxiliary_loss_clip": 0.01323563, "auxiliary_loss_mlp": 0.01194896, "balance_loss_clip": 1.00897574, "balance_loss_mlp": 1.00034642, "epoch": 0.4274634762219684, "flos": 29570367519360.0, "grad_norm": 2.523155208841905, "language_loss": 0.67936552, "learning_rate": 2.559522997914115e-06, "loss": 0.70455009, "num_input_tokens_seen": 76570240, "step": 3555, "time_per_iteration": 2.8924057483673096 }, { "auxiliary_loss_clip": 0.01361402, "auxiliary_loss_mlp": 0.01194961, "balance_loss_clip": 1.00974369, "balance_loss_mlp": 1.0005064, "epoch": 0.42758371911260745, "flos": 21434156822880.0, "grad_norm": 1.8841468092630091, "language_loss": 0.8465668, "learning_rate": 2.558775088549599e-06, "loss": 0.87213045, "num_input_tokens_seen": 76589820, "step": 3556, "time_per_iteration": 2.7092466354370117 }, { "auxiliary_loss_clip": 0.01339781, "auxiliary_loss_mlp": 0.01195072, "balance_loss_clip": 1.00950539, "balance_loss_mlp": 1.00052285, "epoch": 0.42770396200324656, "flos": 14752829733120.0, "grad_norm": 2.241565583719512, "language_loss": 0.66292113, "learning_rate": 2.5580270944193467e-06, "loss": 0.68826967, "num_input_tokens_seen": 76606640, "step": 3557, "time_per_iteration": 2.675833225250244 }, { "auxiliary_loss_clip": 0.01332987, "auxiliary_loss_mlp": 0.01193825, "balance_loss_clip": 1.00430119, "balance_loss_mlp": 1.00003815, "epoch": 0.4278242048938857, "flos": 70654747760640.0, "grad_norm": 0.7463098957871253, "language_loss": 0.55504048, "learning_rate": 2.557279015636827e-06, "loss": 0.58030862, "num_input_tokens_seen": 76667050, "step": 3558, "time_per_iteration": 3.289268970489502 }, { "auxiliary_loss_clip": 0.01311428, "auxiliary_loss_mlp": 0.01193853, "balance_loss_clip": 1.00482941, "balance_loss_mlp": 1.00006628, "epoch": 0.42794444778452473, "flos": 69366196594080.0, "grad_norm": 0.7632203852826627, "language_loss": 0.61257434, "learning_rate": 2.5565308523155245e-06, "loss": 0.63762712, "num_input_tokens_seen": 76726650, "step": 3559, "time_per_iteration": 3.2600667476654053 }, { "auxiliary_loss_clip": 0.01280153, "auxiliary_loss_mlp": 0.01195171, "balance_loss_clip": 1.00794709, "balance_loss_mlp": 1.00052595, "epoch": 0.42806469067516384, "flos": 18215336092320.0, "grad_norm": 2.861392557368131, "language_loss": 0.8186779, "learning_rate": 2.5557826045689336e-06, "loss": 0.84343112, "num_input_tokens_seen": 76742890, "step": 3560, "time_per_iteration": 2.820505380630493 }, { "auxiliary_loss_clip": 0.01283168, "auxiliary_loss_mlp": 0.01193863, "balance_loss_clip": 1.0099256, "balance_loss_mlp": 1.00007653, "epoch": 0.4281849335658029, "flos": 54535847263200.0, "grad_norm": 0.8224716984202235, "language_loss": 0.5883255, "learning_rate": 2.5550342725105643e-06, "loss": 0.61309576, "num_input_tokens_seen": 76801055, "step": 3561, "time_per_iteration": 3.311384916305542 }, { "auxiliary_loss_clip": 0.01337761, "auxiliary_loss_mlp": 0.01195239, "balance_loss_clip": 1.00948524, "balance_loss_mlp": 1.00049913, "epoch": 0.428305176456442, "flos": 17274838350240.0, "grad_norm": 1.6113384903731782, "language_loss": 0.80815315, "learning_rate": 2.554285856253937e-06, "loss": 0.83348322, "num_input_tokens_seen": 76819890, "step": 3562, "time_per_iteration": 4.038269281387329 }, { "auxiliary_loss_clip": 0.0131419, "auxiliary_loss_mlp": 0.01195022, "balance_loss_clip": 1.0080651, "balance_loss_mlp": 1.00056767, "epoch": 0.4284254193470811, "flos": 26359521845760.0, "grad_norm": 3.091254327425222, "language_loss": 0.77513433, "learning_rate": 2.5535373559125855e-06, "loss": 0.80022639, "num_input_tokens_seen": 76840255, "step": 3563, "time_per_iteration": 2.793555974960327 }, { "auxiliary_loss_clip": 0.01257618, "auxiliary_loss_mlp": 0.01194862, "balance_loss_clip": 1.00798869, "balance_loss_mlp": 1.0004077, "epoch": 0.42854566223772017, "flos": 29714260868640.0, "grad_norm": 1.5526739956093407, "language_loss": 0.8209787, "learning_rate": 2.552788771600057e-06, "loss": 0.84550351, "num_input_tokens_seen": 76860565, "step": 3564, "time_per_iteration": 4.021168231964111 }, { "auxiliary_loss_clip": 0.01304016, "auxiliary_loss_mlp": 0.01195039, "balance_loss_clip": 1.00873339, "balance_loss_mlp": 1.00048935, "epoch": 0.4286659051283593, "flos": 22018172284800.0, "grad_norm": 1.9410424473213839, "language_loss": 0.82107979, "learning_rate": 2.5520401034299118e-06, "loss": 0.84607035, "num_input_tokens_seen": 76878325, "step": 3565, "time_per_iteration": 3.321851968765259 }, { "auxiliary_loss_clip": 0.01340221, "auxiliary_loss_mlp": 0.0119532, "balance_loss_clip": 1.00889838, "balance_loss_mlp": 1.00067449, "epoch": 0.4287861480189984, "flos": 13334431376160.0, "grad_norm": 2.5702895090877784, "language_loss": 0.88057888, "learning_rate": 2.551291351515722e-06, "loss": 0.90593433, "num_input_tokens_seen": 76895340, "step": 3566, "time_per_iteration": 3.796982765197754 }, { "auxiliary_loss_clip": 0.01315322, "auxiliary_loss_mlp": 0.00872563, "balance_loss_clip": 1.0087899, "balance_loss_mlp": 1.00025225, "epoch": 0.42890639090963745, "flos": 26651547538560.0, "grad_norm": 1.5905215722278228, "language_loss": 0.8579362, "learning_rate": 2.5505425159710726e-06, "loss": 0.87981498, "num_input_tokens_seen": 76915150, "step": 3567, "time_per_iteration": 3.0163867473602295 }, { "auxiliary_loss_clip": 0.01337839, "auxiliary_loss_mlp": 0.00872523, "balance_loss_clip": 1.009619, "balance_loss_mlp": 1.00032997, "epoch": 0.42902663380027656, "flos": 24055778100960.0, "grad_norm": 1.7149366231744376, "language_loss": 0.8271504, "learning_rate": 2.549793596909561e-06, "loss": 0.84925401, "num_input_tokens_seen": 76933770, "step": 3568, "time_per_iteration": 2.7805778980255127 }, { "auxiliary_loss_clip": 0.01315607, "auxiliary_loss_mlp": 0.0119496, "balance_loss_clip": 1.00917387, "balance_loss_mlp": 1.0004108, "epoch": 0.42914687669091567, "flos": 15632571026880.0, "grad_norm": 2.0118592893783824, "language_loss": 0.66191518, "learning_rate": 2.5490445944447976e-06, "loss": 0.68702084, "num_input_tokens_seen": 76952265, "step": 3569, "time_per_iteration": 2.7536802291870117 }, { "auxiliary_loss_clip": 0.01338807, "auxiliary_loss_mlp": 0.01194998, "balance_loss_clip": 1.00862908, "balance_loss_mlp": 1.0004487, "epoch": 0.4292671195815547, "flos": 31467816048960.0, "grad_norm": 2.1987703209262954, "language_loss": 0.65295655, "learning_rate": 2.548295508690406e-06, "loss": 0.6782946, "num_input_tokens_seen": 76973560, "step": 3570, "time_per_iteration": 2.9181711673736572 }, { "auxiliary_loss_clip": 0.01349715, "auxiliary_loss_mlp": 0.01194942, "balance_loss_clip": 1.00942993, "balance_loss_mlp": 1.00039232, "epoch": 0.42938736247219383, "flos": 30257767628640.0, "grad_norm": 1.6394345535500572, "language_loss": 0.76516205, "learning_rate": 2.5475463397600217e-06, "loss": 0.79060858, "num_input_tokens_seen": 76993640, "step": 3571, "time_per_iteration": 2.758394718170166 }, { "auxiliary_loss_clip": 0.01361816, "auxiliary_loss_mlp": 0.01195042, "balance_loss_clip": 1.00953221, "balance_loss_mlp": 1.0004921, "epoch": 0.42950760536283294, "flos": 29349695760480.0, "grad_norm": 1.9473180323761683, "language_loss": 0.77403855, "learning_rate": 2.546797087767293e-06, "loss": 0.79960716, "num_input_tokens_seen": 77013765, "step": 3572, "time_per_iteration": 2.809481143951416 }, { "auxiliary_loss_clip": 0.0127138, "auxiliary_loss_mlp": 0.01194988, "balance_loss_clip": 1.00700068, "balance_loss_mlp": 1.00043821, "epoch": 0.429627848253472, "flos": 26869956105600.0, "grad_norm": 1.6654486273048574, "language_loss": 0.87093353, "learning_rate": 2.546047752825881e-06, "loss": 0.89559722, "num_input_tokens_seen": 77034370, "step": 3573, "time_per_iteration": 2.8383920192718506 }, { "auxiliary_loss_clip": 0.01287381, "auxiliary_loss_mlp": 0.01195052, "balance_loss_clip": 1.00737476, "balance_loss_mlp": 1.00040698, "epoch": 0.4297480911441111, "flos": 13881279038400.0, "grad_norm": 1.9252292427762496, "language_loss": 0.92952663, "learning_rate": 2.5452983350494595e-06, "loss": 0.95435095, "num_input_tokens_seen": 77049925, "step": 3574, "time_per_iteration": 2.81853985786438 }, { "auxiliary_loss_clip": 0.0134148, "auxiliary_loss_mlp": 0.00872428, "balance_loss_clip": 1.00895751, "balance_loss_mlp": 1.00034356, "epoch": 0.4298683340347502, "flos": 20741152619520.0, "grad_norm": 2.231426265850882, "language_loss": 0.64794606, "learning_rate": 2.544548834551713e-06, "loss": 0.67008513, "num_input_tokens_seen": 77068930, "step": 3575, "time_per_iteration": 2.7521164417266846 }, { "auxiliary_loss_clip": 0.01299922, "auxiliary_loss_mlp": 0.00872536, "balance_loss_clip": 1.00886738, "balance_loss_mlp": 1.00017488, "epoch": 0.4299885769253893, "flos": 20882136150720.0, "grad_norm": 2.1413053853915853, "language_loss": 0.94738227, "learning_rate": 2.5437992514463424e-06, "loss": 0.96910685, "num_input_tokens_seen": 77082255, "step": 3576, "time_per_iteration": 2.774369478225708 }, { "auxiliary_loss_clip": 0.01337062, "auxiliary_loss_mlp": 0.01195051, "balance_loss_clip": 1.00891948, "balance_loss_mlp": 1.00059676, "epoch": 0.4301088198160284, "flos": 25484630248800.0, "grad_norm": 1.5520970372253555, "language_loss": 0.88030028, "learning_rate": 2.5430495858470565e-06, "loss": 0.90562141, "num_input_tokens_seen": 77101725, "step": 3577, "time_per_iteration": 2.774664878845215 }, { "auxiliary_loss_clip": 0.01336214, "auxiliary_loss_mlp": 0.01195099, "balance_loss_clip": 1.00928879, "balance_loss_mlp": 1.00064468, "epoch": 0.43022906270666744, "flos": 18259437162240.0, "grad_norm": 2.2129784408315536, "language_loss": 0.77189779, "learning_rate": 2.54229983786758e-06, "loss": 0.79721087, "num_input_tokens_seen": 77119670, "step": 3578, "time_per_iteration": 2.7274649143218994 }, { "auxiliary_loss_clip": 0.01335927, "auxiliary_loss_mlp": 0.01195051, "balance_loss_clip": 1.00955796, "balance_loss_mlp": 1.00050151, "epoch": 0.43034930559730655, "flos": 23399546536800.0, "grad_norm": 2.817256443658385, "language_loss": 0.85133278, "learning_rate": 2.541550007621651e-06, "loss": 0.87664258, "num_input_tokens_seen": 77138160, "step": 3579, "time_per_iteration": 2.7857213020324707 }, { "auxiliary_loss_clip": 0.01339323, "auxiliary_loss_mlp": 0.01194953, "balance_loss_clip": 1.00923014, "balance_loss_mlp": 1.00049841, "epoch": 0.43046954848794566, "flos": 28184395036320.0, "grad_norm": 1.8061366562980397, "language_loss": 0.79941708, "learning_rate": 2.5408000952230156e-06, "loss": 0.82475984, "num_input_tokens_seen": 77156950, "step": 3580, "time_per_iteration": 2.808043956756592 }, { "auxiliary_loss_clip": 0.01308507, "auxiliary_loss_mlp": 0.01195076, "balance_loss_clip": 1.00897551, "balance_loss_mlp": 1.00052607, "epoch": 0.4305897913785847, "flos": 28580487926400.0, "grad_norm": 1.9778448923695506, "language_loss": 0.90553313, "learning_rate": 2.5400501007854357e-06, "loss": 0.93056893, "num_input_tokens_seen": 77176395, "step": 3581, "time_per_iteration": 2.882139205932617 }, { "auxiliary_loss_clip": 0.01295562, "auxiliary_loss_mlp": 0.01195091, "balance_loss_clip": 1.00805473, "balance_loss_mlp": 1.00054193, "epoch": 0.43071003426922383, "flos": 20448731766240.0, "grad_norm": 2.1464186055508017, "language_loss": 0.75644493, "learning_rate": 2.539300024422685e-06, "loss": 0.78135145, "num_input_tokens_seen": 77194340, "step": 3582, "time_per_iteration": 2.900357484817505 }, { "auxiliary_loss_clip": 0.01269991, "auxiliary_loss_mlp": 0.01193953, "balance_loss_clip": 1.00505543, "balance_loss_mlp": 1.00016677, "epoch": 0.43083027715986294, "flos": 51997996303200.0, "grad_norm": 0.7870973842977964, "language_loss": 0.6089319, "learning_rate": 2.538549866248549e-06, "loss": 0.63357133, "num_input_tokens_seen": 77249320, "step": 3583, "time_per_iteration": 3.2394556999206543 }, { "auxiliary_loss_clip": 0.01339551, "auxiliary_loss_mlp": 0.0119486, "balance_loss_clip": 1.00862443, "balance_loss_mlp": 1.00040567, "epoch": 0.430950520050502, "flos": 16690894735680.0, "grad_norm": 2.1154111524458648, "language_loss": 0.81197345, "learning_rate": 2.5377996263768274e-06, "loss": 0.83731759, "num_input_tokens_seen": 77267400, "step": 3584, "time_per_iteration": 2.766759157180786 }, { "auxiliary_loss_clip": 0.01349313, "auxiliary_loss_mlp": 0.0119504, "balance_loss_clip": 1.00963151, "balance_loss_mlp": 1.00049031, "epoch": 0.4310707629411411, "flos": 24608445399360.0, "grad_norm": 1.8275031223982356, "language_loss": 0.68675756, "learning_rate": 2.5370493049213293e-06, "loss": 0.71220112, "num_input_tokens_seen": 77287045, "step": 3585, "time_per_iteration": 2.8085107803344727 }, { "auxiliary_loss_clip": 0.0120836, "auxiliary_loss_mlp": 0.01194979, "balance_loss_clip": 1.00654984, "balance_loss_mlp": 1.00042915, "epoch": 0.4311910058317802, "flos": 26432995276800.0, "grad_norm": 1.885756969209743, "language_loss": 0.79742193, "learning_rate": 2.536298901995878e-06, "loss": 0.82145536, "num_input_tokens_seen": 77306255, "step": 3586, "time_per_iteration": 3.1566107273101807 }, { "auxiliary_loss_clip": 0.01322714, "auxiliary_loss_mlp": 0.01195088, "balance_loss_clip": 1.00908256, "balance_loss_mlp": 1.00044274, "epoch": 0.43131124872241927, "flos": 25155903764160.0, "grad_norm": 1.6159213933168775, "language_loss": 0.80024612, "learning_rate": 2.535548417714311e-06, "loss": 0.82542413, "num_input_tokens_seen": 77325555, "step": 3587, "time_per_iteration": 3.088350296020508 }, { "auxiliary_loss_clip": 0.01339396, "auxiliary_loss_mlp": 0.0119509, "balance_loss_clip": 1.00931203, "balance_loss_mlp": 1.00054073, "epoch": 0.4314314916130584, "flos": 21614822811360.0, "grad_norm": 1.535502603406067, "language_loss": 0.87039989, "learning_rate": 2.534797852190474e-06, "loss": 0.8957448, "num_input_tokens_seen": 77345735, "step": 3588, "time_per_iteration": 2.8566484451293945 }, { "auxiliary_loss_clip": 0.01349743, "auxiliary_loss_mlp": 0.01194992, "balance_loss_clip": 1.00944448, "balance_loss_mlp": 1.00044262, "epoch": 0.4315517345036975, "flos": 19275024900960.0, "grad_norm": 2.0001233599325037, "language_loss": 0.81845951, "learning_rate": 2.5340472055382283e-06, "loss": 0.84390688, "num_input_tokens_seen": 77361765, "step": 3589, "time_per_iteration": 3.908123016357422 }, { "auxiliary_loss_clip": 0.01322554, "auxiliary_loss_mlp": 0.01195082, "balance_loss_clip": 1.00888586, "balance_loss_mlp": 1.0006274, "epoch": 0.43167197739433655, "flos": 24273863354880.0, "grad_norm": 2.068825766138334, "language_loss": 0.80891061, "learning_rate": 2.5332964778714468e-06, "loss": 0.83408695, "num_input_tokens_seen": 77378950, "step": 3590, "time_per_iteration": 2.8145577907562256 }, { "auxiliary_loss_clip": 0.01293306, "auxiliary_loss_mlp": 0.01195103, "balance_loss_clip": 1.00779605, "balance_loss_mlp": 1.00045836, "epoch": 0.43179222028497566, "flos": 16867824661440.0, "grad_norm": 1.5133303699542386, "language_loss": 0.66150814, "learning_rate": 2.5325456693040123e-06, "loss": 0.68639219, "num_input_tokens_seen": 77396145, "step": 3591, "time_per_iteration": 4.072962760925293 }, { "auxiliary_loss_clip": 0.01339808, "auxiliary_loss_mlp": 0.01195169, "balance_loss_clip": 1.0092299, "balance_loss_mlp": 1.00052381, "epoch": 0.43191246317561477, "flos": 17639223840000.0, "grad_norm": 2.1834443230859275, "language_loss": 0.74489963, "learning_rate": 2.531794779949824e-06, "loss": 0.77024943, "num_input_tokens_seen": 77414045, "step": 3592, "time_per_iteration": 3.7556731700897217 }, { "auxiliary_loss_clip": 0.0130274, "auxiliary_loss_mlp": 0.01194843, "balance_loss_clip": 1.00839186, "balance_loss_mlp": 1.00038862, "epoch": 0.4320327060662538, "flos": 23878812251520.0, "grad_norm": 1.860299401057685, "language_loss": 0.88148701, "learning_rate": 2.5310438099227903e-06, "loss": 0.90646285, "num_input_tokens_seen": 77431310, "step": 3593, "time_per_iteration": 2.81553316116333 }, { "auxiliary_loss_clip": 0.01315538, "auxiliary_loss_mlp": 0.01193855, "balance_loss_clip": 1.00463581, "balance_loss_mlp": 1.00006819, "epoch": 0.43215294895689293, "flos": 66394954458720.0, "grad_norm": 0.8001704631406107, "language_loss": 0.53385603, "learning_rate": 2.530292759336833e-06, "loss": 0.55895001, "num_input_tokens_seen": 77492045, "step": 3594, "time_per_iteration": 3.393871307373047 }, { "auxiliary_loss_clip": 0.01311698, "auxiliary_loss_mlp": 0.01195078, "balance_loss_clip": 1.00886202, "balance_loss_mlp": 1.00043297, "epoch": 0.432273191847532, "flos": 20594277604800.0, "grad_norm": 2.2635759081487894, "language_loss": 0.70022619, "learning_rate": 2.5295416283058855e-06, "loss": 0.72529399, "num_input_tokens_seen": 77510910, "step": 3595, "time_per_iteration": 2.828925609588623 }, { "auxiliary_loss_clip": 0.01330944, "auxiliary_loss_mlp": 0.00872487, "balance_loss_clip": 1.00903225, "balance_loss_mlp": 1.0001868, "epoch": 0.4323934347381711, "flos": 19282137789600.0, "grad_norm": 1.5029971007814853, "language_loss": 0.66020024, "learning_rate": 2.5287904169438943e-06, "loss": 0.68223459, "num_input_tokens_seen": 77530115, "step": 3596, "time_per_iteration": 2.7487599849700928 }, { "auxiliary_loss_clip": 0.01254834, "auxiliary_loss_mlp": 0.01195089, "balance_loss_clip": 1.00814986, "balance_loss_mlp": 1.00053906, "epoch": 0.4325136776288102, "flos": 21726326210400.0, "grad_norm": 3.963288080954744, "language_loss": 0.64133883, "learning_rate": 2.528039125364817e-06, "loss": 0.665838, "num_input_tokens_seen": 77548920, "step": 3597, "time_per_iteration": 3.159897565841675 }, { "auxiliary_loss_clip": 0.01296386, "auxiliary_loss_mlp": 0.01195246, "balance_loss_clip": 1.00728583, "balance_loss_mlp": 1.00060117, "epoch": 0.43263392051944927, "flos": 22340755820160.0, "grad_norm": 2.057954888461952, "language_loss": 0.75709671, "learning_rate": 2.5272877536826246e-06, "loss": 0.782013, "num_input_tokens_seen": 77567715, "step": 3598, "time_per_iteration": 3.1442580223083496 }, { "auxiliary_loss_clip": 0.01289347, "auxiliary_loss_mlp": 0.0119481, "balance_loss_clip": 1.00775623, "balance_loss_mlp": 1.00035572, "epoch": 0.4327541634100884, "flos": 29168419069440.0, "grad_norm": 2.3903241517769405, "language_loss": 0.71009552, "learning_rate": 2.5265363020112986e-06, "loss": 0.73493707, "num_input_tokens_seen": 77588035, "step": 3599, "time_per_iteration": 3.0192086696624756 }, { "auxiliary_loss_clip": 0.01336895, "auxiliary_loss_mlp": 0.01195119, "balance_loss_clip": 1.00925064, "balance_loss_mlp": 1.0005697, "epoch": 0.4328744063007275, "flos": 26067460229280.0, "grad_norm": 1.7934871480722903, "language_loss": 0.84017301, "learning_rate": 2.5257847704648344e-06, "loss": 0.86549312, "num_input_tokens_seen": 77609265, "step": 3600, "time_per_iteration": 2.790895462036133 }, { "auxiliary_loss_clip": 0.01361289, "auxiliary_loss_mlp": 0.01194871, "balance_loss_clip": 1.00931609, "balance_loss_mlp": 1.00041735, "epoch": 0.43299464919136654, "flos": 16581367139040.0, "grad_norm": 2.08601699233503, "language_loss": 0.75397015, "learning_rate": 2.525033159157239e-06, "loss": 0.77953172, "num_input_tokens_seen": 77625580, "step": 3601, "time_per_iteration": 2.713838815689087 }, { "auxiliary_loss_clip": 0.01349722, "auxiliary_loss_mlp": 0.01195129, "balance_loss_clip": 1.00928044, "balance_loss_mlp": 1.00057948, "epoch": 0.43311489208200565, "flos": 16107274434240.0, "grad_norm": 1.659721497396212, "language_loss": 0.77103394, "learning_rate": 2.52428146820253e-06, "loss": 0.79648244, "num_input_tokens_seen": 77643835, "step": 3602, "time_per_iteration": 2.7648046016693115 }, { "auxiliary_loss_clip": 0.01294217, "auxiliary_loss_mlp": 0.01195141, "balance_loss_clip": 1.00739503, "balance_loss_mlp": 1.00059092, "epoch": 0.43323513497264476, "flos": 22930231681440.0, "grad_norm": 1.7344342269151234, "language_loss": 0.81722707, "learning_rate": 2.52352969771474e-06, "loss": 0.84212065, "num_input_tokens_seen": 77663060, "step": 3603, "time_per_iteration": 2.7962899208068848 }, { "auxiliary_loss_clip": 0.01324899, "auxiliary_loss_mlp": 0.01194858, "balance_loss_clip": 1.00859785, "balance_loss_mlp": 1.00049889, "epoch": 0.4333553778632838, "flos": 25299042716160.0, "grad_norm": 2.1132513726467286, "language_loss": 0.88184702, "learning_rate": 2.5227778478079106e-06, "loss": 0.90704465, "num_input_tokens_seen": 77682470, "step": 3604, "time_per_iteration": 2.8392436504364014 }, { "auxiliary_loss_clip": 0.01348698, "auxiliary_loss_mlp": 0.01194943, "balance_loss_clip": 1.00935102, "balance_loss_mlp": 1.00039315, "epoch": 0.43347562075392293, "flos": 19387174926240.0, "grad_norm": 1.4920748748738233, "language_loss": 0.76135349, "learning_rate": 2.522025918596098e-06, "loss": 0.78678989, "num_input_tokens_seen": 77700770, "step": 3605, "time_per_iteration": 2.7017223834991455 }, { "auxiliary_loss_clip": 0.01336496, "auxiliary_loss_mlp": 0.01194701, "balance_loss_clip": 1.00893593, "balance_loss_mlp": 1.00034213, "epoch": 0.43359586364456204, "flos": 26325982337760.0, "grad_norm": 1.3867594043503215, "language_loss": 0.65430331, "learning_rate": 2.521273910193368e-06, "loss": 0.6796152, "num_input_tokens_seen": 77723950, "step": 3606, "time_per_iteration": 2.897268533706665 }, { "auxiliary_loss_clip": 0.0134778, "auxiliary_loss_mlp": 0.01195196, "balance_loss_clip": 1.00903535, "balance_loss_mlp": 1.00055063, "epoch": 0.4337161065352011, "flos": 15989268849120.0, "grad_norm": 1.9701555035760714, "language_loss": 0.87011325, "learning_rate": 2.5205218227138006e-06, "loss": 0.89554304, "num_input_tokens_seen": 77736905, "step": 3607, "time_per_iteration": 2.7684738636016846 }, { "auxiliary_loss_clip": 0.01362389, "auxiliary_loss_mlp": 0.01195137, "balance_loss_clip": 1.00969172, "balance_loss_mlp": 1.0004921, "epoch": 0.4338363494258402, "flos": 20224719105120.0, "grad_norm": 2.146108159802377, "language_loss": 0.79045475, "learning_rate": 2.519769656271486e-06, "loss": 0.81603003, "num_input_tokens_seen": 77754325, "step": 3608, "time_per_iteration": 2.6888082027435303 }, { "auxiliary_loss_clip": 0.0127508, "auxiliary_loss_mlp": 0.0119512, "balance_loss_clip": 1.00797725, "balance_loss_mlp": 1.00047541, "epoch": 0.43395659231647926, "flos": 20083915192320.0, "grad_norm": 2.196026435926165, "language_loss": 0.67360419, "learning_rate": 2.5190174109805285e-06, "loss": 0.6983062, "num_input_tokens_seen": 77774150, "step": 3609, "time_per_iteration": 2.898932456970215 }, { "auxiliary_loss_clip": 0.01324557, "auxiliary_loss_mlp": 0.01194974, "balance_loss_clip": 1.00939369, "balance_loss_mlp": 1.00042486, "epoch": 0.43407683520711837, "flos": 19901812256640.0, "grad_norm": 2.8115860819124747, "language_loss": 0.64064568, "learning_rate": 2.518265086955042e-06, "loss": 0.66584098, "num_input_tokens_seen": 77791870, "step": 3610, "time_per_iteration": 2.7456092834472656 }, { "auxiliary_loss_clip": 0.01361112, "auxiliary_loss_mlp": 0.01195016, "balance_loss_clip": 1.00923443, "balance_loss_mlp": 1.0004667, "epoch": 0.4341970780977575, "flos": 23108742249120.0, "grad_norm": 2.1755694673666377, "language_loss": 0.83928871, "learning_rate": 2.5175126843091534e-06, "loss": 0.86485004, "num_input_tokens_seen": 77811240, "step": 3611, "time_per_iteration": 2.7103779315948486 }, { "auxiliary_loss_clip": 0.01314947, "auxiliary_loss_mlp": 0.01195014, "balance_loss_clip": 1.00863791, "balance_loss_mlp": 1.00046408, "epoch": 0.43431732098839654, "flos": 37408301802720.0, "grad_norm": 1.887883676821239, "language_loss": 0.75401938, "learning_rate": 2.5167602031570034e-06, "loss": 0.77911896, "num_input_tokens_seen": 77831425, "step": 3612, "time_per_iteration": 2.8694684505462646 }, { "auxiliary_loss_clip": 0.01361661, "auxiliary_loss_mlp": 0.0119506, "balance_loss_clip": 1.00950074, "balance_loss_mlp": 1.0006063, "epoch": 0.43443756387903565, "flos": 31868219780640.0, "grad_norm": 1.5378520725171398, "language_loss": 0.73163521, "learning_rate": 2.51600764361274e-06, "loss": 0.75720239, "num_input_tokens_seen": 77852950, "step": 3613, "time_per_iteration": 2.8067328929901123 }, { "auxiliary_loss_clip": 0.01360799, "auxiliary_loss_mlp": 0.01194965, "balance_loss_clip": 1.00943375, "balance_loss_mlp": 1.00051045, "epoch": 0.43455780676967476, "flos": 23477151191040.0, "grad_norm": 2.5885880431837758, "language_loss": 0.78930175, "learning_rate": 2.5152550057905283e-06, "loss": 0.81485939, "num_input_tokens_seen": 77872840, "step": 3614, "time_per_iteration": 2.727553129196167 }, { "auxiliary_loss_clip": 0.01337498, "auxiliary_loss_mlp": 0.00872559, "balance_loss_clip": 1.00912833, "balance_loss_mlp": 1.00023961, "epoch": 0.4346780496603138, "flos": 24207071728320.0, "grad_norm": 2.51847040016792, "language_loss": 0.76909524, "learning_rate": 2.5145022898045415e-06, "loss": 0.79119587, "num_input_tokens_seen": 77892025, "step": 3615, "time_per_iteration": 4.035315752029419 }, { "auxiliary_loss_clip": 0.0132622, "auxiliary_loss_mlp": 0.01194966, "balance_loss_clip": 1.00930858, "balance_loss_mlp": 1.00041628, "epoch": 0.4347982925509529, "flos": 17092376177760.0, "grad_norm": 1.9868420718493407, "language_loss": 0.89815128, "learning_rate": 2.5137494957689664e-06, "loss": 0.92336321, "num_input_tokens_seen": 77907635, "step": 3616, "time_per_iteration": 2.772941827774048 }, { "auxiliary_loss_clip": 0.01296647, "auxiliary_loss_mlp": 0.01193915, "balance_loss_clip": 1.00440931, "balance_loss_mlp": 1.00012827, "epoch": 0.43491853544159204, "flos": 60945576024960.0, "grad_norm": 0.7752016150174771, "language_loss": 0.57388681, "learning_rate": 2.5129966237980016e-06, "loss": 0.59879243, "num_input_tokens_seen": 77970630, "step": 3617, "time_per_iteration": 4.388833522796631 }, { "auxiliary_loss_clip": 0.01308348, "auxiliary_loss_mlp": 0.0119484, "balance_loss_clip": 1.00838304, "balance_loss_mlp": 1.00048089, "epoch": 0.4350387783322311, "flos": 21944662930080.0, "grad_norm": 1.814258034046189, "language_loss": 0.77923489, "learning_rate": 2.512243674005857e-06, "loss": 0.80426681, "num_input_tokens_seen": 77989995, "step": 3618, "time_per_iteration": 2.825464963912964 }, { "auxiliary_loss_clip": 0.0128334, "auxiliary_loss_mlp": 0.01194895, "balance_loss_clip": 1.00862992, "balance_loss_mlp": 1.0005362, "epoch": 0.4351590212228702, "flos": 25082717722560.0, "grad_norm": 1.9956345545631267, "language_loss": 0.86097407, "learning_rate": 2.5114906465067537e-06, "loss": 0.88575637, "num_input_tokens_seen": 78010980, "step": 3619, "time_per_iteration": 3.9352102279663086 }, { "auxiliary_loss_clip": 0.01349092, "auxiliary_loss_mlp": 0.01195051, "balance_loss_clip": 1.00910687, "balance_loss_mlp": 1.00050104, "epoch": 0.4352792641135093, "flos": 21506552543520.0, "grad_norm": 1.8483708060706687, "language_loss": 0.7468468, "learning_rate": 2.5107375414149264e-06, "loss": 0.7722882, "num_input_tokens_seen": 78030225, "step": 3620, "time_per_iteration": 3.0696589946746826 }, { "auxiliary_loss_clip": 0.01292206, "auxiliary_loss_mlp": 0.01194907, "balance_loss_clip": 1.00752187, "balance_loss_mlp": 1.00035739, "epoch": 0.43539950700414837, "flos": 16253466899040.0, "grad_norm": 2.1751034797474262, "language_loss": 0.71310437, "learning_rate": 2.5099843588446197e-06, "loss": 0.73797548, "num_input_tokens_seen": 78048545, "step": 3621, "time_per_iteration": 2.801845073699951 }, { "auxiliary_loss_clip": 0.01277596, "auxiliary_loss_mlp": 0.01194895, "balance_loss_clip": 1.00820124, "balance_loss_mlp": 1.00044131, "epoch": 0.4355197498947875, "flos": 16691828751360.0, "grad_norm": 1.5692036181007298, "language_loss": 0.61474884, "learning_rate": 2.509231098910091e-06, "loss": 0.63947374, "num_input_tokens_seen": 78068415, "step": 3622, "time_per_iteration": 2.8964176177978516 }, { "auxiliary_loss_clip": 0.01311958, "auxiliary_loss_mlp": 0.01195107, "balance_loss_clip": 1.00898767, "balance_loss_mlp": 1.0005579, "epoch": 0.4356399927854266, "flos": 16362743029920.0, "grad_norm": 2.52288232446275, "language_loss": 0.7500574, "learning_rate": 2.508477761725611e-06, "loss": 0.77512807, "num_input_tokens_seen": 78086690, "step": 3623, "time_per_iteration": 2.843937635421753 }, { "auxiliary_loss_clip": 0.01346956, "auxiliary_loss_mlp": 0.01194935, "balance_loss_clip": 1.0089438, "balance_loss_mlp": 1.00048089, "epoch": 0.43576023567606564, "flos": 17202047469120.0, "grad_norm": 2.2399076065639627, "language_loss": 0.80725229, "learning_rate": 2.507724347405458e-06, "loss": 0.83267117, "num_input_tokens_seen": 78104640, "step": 3624, "time_per_iteration": 2.7428159713745117 }, { "auxiliary_loss_clip": 0.01293164, "auxiliary_loss_mlp": 0.0119494, "balance_loss_clip": 1.00777042, "balance_loss_mlp": 1.00048625, "epoch": 0.43588047856670475, "flos": 15917663449440.0, "grad_norm": 1.8244359614373322, "language_loss": 0.82233697, "learning_rate": 2.5069708560639243e-06, "loss": 0.84721804, "num_input_tokens_seen": 78122550, "step": 3625, "time_per_iteration": 2.772859573364258 }, { "auxiliary_loss_clip": 0.01305942, "auxiliary_loss_mlp": 0.01194994, "balance_loss_clip": 1.00883746, "balance_loss_mlp": 1.00044453, "epoch": 0.4360007214573438, "flos": 23659577439840.0, "grad_norm": 2.1736701910741796, "language_loss": 0.61837417, "learning_rate": 2.5062172878153158e-06, "loss": 0.6433835, "num_input_tokens_seen": 78141825, "step": 3626, "time_per_iteration": 2.8746731281280518 }, { "auxiliary_loss_clip": 0.01260495, "auxiliary_loss_mlp": 0.01194855, "balance_loss_clip": 1.00750184, "balance_loss_mlp": 1.00040054, "epoch": 0.4361209643479829, "flos": 21978777216960.0, "grad_norm": 2.152337023085648, "language_loss": 0.8732515, "learning_rate": 2.505463642773947e-06, "loss": 0.89780504, "num_input_tokens_seen": 78161790, "step": 3627, "time_per_iteration": 2.9468932151794434 }, { "auxiliary_loss_clip": 0.01299615, "auxiliary_loss_mlp": 0.00872603, "balance_loss_clip": 1.0080421, "balance_loss_mlp": 1.00031352, "epoch": 0.43624120723862203, "flos": 17420168646720.0, "grad_norm": 2.2241328348967517, "language_loss": 0.75210643, "learning_rate": 2.504709921054146e-06, "loss": 0.77382863, "num_input_tokens_seen": 78178605, "step": 3628, "time_per_iteration": 2.873303174972534 }, { "auxiliary_loss_clip": 0.01325136, "auxiliary_loss_mlp": 0.01195028, "balance_loss_clip": 1.00916982, "balance_loss_mlp": 1.00047874, "epoch": 0.4363614501292611, "flos": 17895303138240.0, "grad_norm": 2.2371258810631907, "language_loss": 0.83438897, "learning_rate": 2.50395612277025e-06, "loss": 0.85959065, "num_input_tokens_seen": 78194460, "step": 3629, "time_per_iteration": 2.860564947128296 }, { "auxiliary_loss_clip": 0.0132977, "auxiliary_loss_mlp": 0.01195093, "balance_loss_clip": 1.00828934, "balance_loss_mlp": 1.00054371, "epoch": 0.4364816930199002, "flos": 20302898538240.0, "grad_norm": 1.9706928879209833, "language_loss": 0.73106074, "learning_rate": 2.503202248036612e-06, "loss": 0.75630939, "num_input_tokens_seen": 78213315, "step": 3630, "time_per_iteration": 2.770339250564575 }, { "auxiliary_loss_clip": 0.01360396, "auxiliary_loss_mlp": 0.01195032, "balance_loss_clip": 1.00884557, "balance_loss_mlp": 1.00048268, "epoch": 0.4366019359105393, "flos": 24061346271360.0, "grad_norm": 1.8658465468569159, "language_loss": 0.73139632, "learning_rate": 2.5024482969675927e-06, "loss": 0.75695062, "num_input_tokens_seen": 78233270, "step": 3631, "time_per_iteration": 2.7797176837921143 }, { "auxiliary_loss_clip": 0.0128109, "auxiliary_loss_mlp": 0.01194791, "balance_loss_clip": 1.0079571, "balance_loss_mlp": 1.00033689, "epoch": 0.43672217880117836, "flos": 21754117929600.0, "grad_norm": 2.057318617987578, "language_loss": 0.84565896, "learning_rate": 2.501694269677566e-06, "loss": 0.87041777, "num_input_tokens_seen": 78251040, "step": 3632, "time_per_iteration": 2.7908308506011963 }, { "auxiliary_loss_clip": 0.01346924, "auxiliary_loss_mlp": 0.01195042, "balance_loss_clip": 1.00909555, "balance_loss_mlp": 1.00049233, "epoch": 0.4368424216918175, "flos": 18035208959040.0, "grad_norm": 1.9361953106261685, "language_loss": 0.80427647, "learning_rate": 2.500940166280918e-06, "loss": 0.82969612, "num_input_tokens_seen": 78269470, "step": 3633, "time_per_iteration": 2.755671262741089 }, { "auxiliary_loss_clip": 0.01348496, "auxiliary_loss_mlp": 0.0119506, "balance_loss_clip": 1.00901484, "balance_loss_mlp": 1.00051069, "epoch": 0.4369626645824566, "flos": 25447139136000.0, "grad_norm": 1.8663082050512991, "language_loss": 0.79089981, "learning_rate": 2.500185986892045e-06, "loss": 0.81633544, "num_input_tokens_seen": 78288955, "step": 3634, "time_per_iteration": 2.7899467945098877 }, { "auxiliary_loss_clip": 0.01349238, "auxiliary_loss_mlp": 0.01195002, "balance_loss_clip": 1.00922084, "balance_loss_mlp": 1.0004524, "epoch": 0.43708290747309564, "flos": 25302706931520.0, "grad_norm": 2.4363019914979684, "language_loss": 0.77716023, "learning_rate": 2.499431731625355e-06, "loss": 0.80260265, "num_input_tokens_seen": 78307980, "step": 3635, "time_per_iteration": 2.7450575828552246 }, { "auxiliary_loss_clip": 0.01361433, "auxiliary_loss_mlp": 0.01195106, "balance_loss_clip": 1.0091567, "balance_loss_mlp": 1.00055611, "epoch": 0.43720315036373475, "flos": 31575116377440.0, "grad_norm": 1.8724289182909597, "language_loss": 0.79457283, "learning_rate": 2.4986774005952686e-06, "loss": 0.82013822, "num_input_tokens_seen": 78330355, "step": 3636, "time_per_iteration": 2.8991739749908447 }, { "auxiliary_loss_clip": 0.01336378, "auxiliary_loss_mlp": 0.01194849, "balance_loss_clip": 1.00965858, "balance_loss_mlp": 1.00039458, "epoch": 0.43732339325437386, "flos": 23112011304000.0, "grad_norm": 1.9974547692743687, "language_loss": 0.84524548, "learning_rate": 2.4979229939162166e-06, "loss": 0.87055779, "num_input_tokens_seen": 78349135, "step": 3637, "time_per_iteration": 3.1121506690979004 }, { "auxiliary_loss_clip": 0.01335664, "auxiliary_loss_mlp": 0.01194935, "balance_loss_clip": 1.00866854, "balance_loss_mlp": 1.00038517, "epoch": 0.4374436361450129, "flos": 27746284649760.0, "grad_norm": 1.5490356613347607, "language_loss": 0.80430794, "learning_rate": 2.4971685117026433e-06, "loss": 0.82961392, "num_input_tokens_seen": 78368900, "step": 3638, "time_per_iteration": 2.902472972869873 }, { "auxiliary_loss_clip": 0.01346957, "auxiliary_loss_mlp": 0.01194771, "balance_loss_clip": 1.00902104, "balance_loss_mlp": 1.0004127, "epoch": 0.437563879035652, "flos": 24172382662560.0, "grad_norm": 1.402954500884512, "language_loss": 0.76721501, "learning_rate": 2.4964139540690018e-06, "loss": 0.79263234, "num_input_tokens_seen": 78392235, "step": 3639, "time_per_iteration": 2.8819305896759033 }, { "auxiliary_loss_clip": 0.01300672, "auxiliary_loss_mlp": 0.01195081, "balance_loss_clip": 1.00843596, "balance_loss_mlp": 1.00043654, "epoch": 0.4376841219262911, "flos": 23477223038400.0, "grad_norm": 1.9378114193905955, "language_loss": 0.72971559, "learning_rate": 2.495659321129758e-06, "loss": 0.75467312, "num_input_tokens_seen": 78409980, "step": 3640, "time_per_iteration": 2.8149163722991943 }, { "auxiliary_loss_clip": 0.01349154, "auxiliary_loss_mlp": 0.01194947, "balance_loss_clip": 1.0094496, "balance_loss_mlp": 1.00049329, "epoch": 0.4378043648169302, "flos": 25447821685920.0, "grad_norm": 1.724809349024842, "language_loss": 0.75668907, "learning_rate": 2.494904612999389e-06, "loss": 0.78213006, "num_input_tokens_seen": 78428690, "step": 3641, "time_per_iteration": 3.7795262336730957 }, { "auxiliary_loss_clip": 0.01309263, "auxiliary_loss_mlp": 0.01193859, "balance_loss_clip": 1.00444889, "balance_loss_mlp": 1.00007272, "epoch": 0.4379246077075693, "flos": 53914089222720.0, "grad_norm": 0.7456598106688483, "language_loss": 0.56515026, "learning_rate": 2.4941498297923843e-06, "loss": 0.59018147, "num_input_tokens_seen": 78489260, "step": 3642, "time_per_iteration": 3.2402873039245605 }, { "auxiliary_loss_clip": 0.01336065, "auxiliary_loss_mlp": 0.01194866, "balance_loss_clip": 1.00855279, "balance_loss_mlp": 1.00041211, "epoch": 0.43804485059820836, "flos": 20588314273920.0, "grad_norm": 1.6526064296417595, "language_loss": 0.69624156, "learning_rate": 2.4933949716232424e-06, "loss": 0.72155094, "num_input_tokens_seen": 78506785, "step": 3643, "time_per_iteration": 3.659420967102051 }, { "auxiliary_loss_clip": 0.01289706, "auxiliary_loss_mlp": 0.01194841, "balance_loss_clip": 1.00900233, "balance_loss_mlp": 1.00038671, "epoch": 0.43816509348884747, "flos": 23876225746560.0, "grad_norm": 2.034833033898006, "language_loss": 0.73908985, "learning_rate": 2.492640038606476e-06, "loss": 0.76393533, "num_input_tokens_seen": 78525150, "step": 3644, "time_per_iteration": 2.808199882507324 }, { "auxiliary_loss_clip": 0.01339017, "auxiliary_loss_mlp": 0.01194965, "balance_loss_clip": 1.00850284, "balance_loss_mlp": 1.00041533, "epoch": 0.4382853363794866, "flos": 14684457464640.0, "grad_norm": 1.6988364801069604, "language_loss": 0.78269011, "learning_rate": 2.491885030856608e-06, "loss": 0.80802995, "num_input_tokens_seen": 78543245, "step": 3645, "time_per_iteration": 3.6577649116516113 }, { "auxiliary_loss_clip": 0.01322989, "auxiliary_loss_mlp": 0.01194854, "balance_loss_clip": 1.00875962, "balance_loss_mlp": 1.0004003, "epoch": 0.43840557927012563, "flos": 17165310753600.0, "grad_norm": 2.115382792072836, "language_loss": 0.8259238, "learning_rate": 2.4911299484881713e-06, "loss": 0.85110223, "num_input_tokens_seen": 78560775, "step": 3646, "time_per_iteration": 2.762331485748291 }, { "auxiliary_loss_clip": 0.01328473, "auxiliary_loss_mlp": 0.01195035, "balance_loss_clip": 1.00861788, "balance_loss_mlp": 1.00048578, "epoch": 0.43852582216076474, "flos": 19390695446880.0, "grad_norm": 1.8084629376052164, "language_loss": 0.810911, "learning_rate": 2.490374791615712e-06, "loss": 0.83614612, "num_input_tokens_seen": 78580800, "step": 3647, "time_per_iteration": 2.737440347671509 }, { "auxiliary_loss_clip": 0.01361875, "auxiliary_loss_mlp": 0.00872581, "balance_loss_clip": 1.009547, "balance_loss_mlp": 1.00028098, "epoch": 0.43864606505140386, "flos": 18075142882080.0, "grad_norm": 2.4928733119344924, "language_loss": 0.77070653, "learning_rate": 2.4896195603537867e-06, "loss": 0.79305112, "num_input_tokens_seen": 78595410, "step": 3648, "time_per_iteration": 2.6339426040649414 }, { "auxiliary_loss_clip": 0.0126585, "auxiliary_loss_mlp": 0.01194893, "balance_loss_clip": 1.00737417, "balance_loss_mlp": 1.00043845, "epoch": 0.4387663079420429, "flos": 19644906713760.0, "grad_norm": 4.32249002401988, "language_loss": 0.7356416, "learning_rate": 2.488864254816964e-06, "loss": 0.76024902, "num_input_tokens_seen": 78614100, "step": 3649, "time_per_iteration": 2.8691210746765137 }, { "auxiliary_loss_clip": 0.013367, "auxiliary_loss_mlp": 0.01195091, "balance_loss_clip": 1.00884867, "balance_loss_mlp": 1.00044572, "epoch": 0.438886550832682, "flos": 19719350084160.0, "grad_norm": 2.574598129116122, "language_loss": 0.68092966, "learning_rate": 2.4881088751198218e-06, "loss": 0.70624757, "num_input_tokens_seen": 78632260, "step": 3650, "time_per_iteration": 2.7937376499176025 }, { "auxiliary_loss_clip": 0.01325282, "auxiliary_loss_mlp": 0.0119494, "balance_loss_clip": 1.00897169, "balance_loss_mlp": 1.00039077, "epoch": 0.43900679372332113, "flos": 14536684357920.0, "grad_norm": 2.7927263761908345, "language_loss": 0.64402795, "learning_rate": 2.4873534213769517e-06, "loss": 0.6692301, "num_input_tokens_seen": 78647490, "step": 3651, "time_per_iteration": 2.8556323051452637 }, { "auxiliary_loss_clip": 0.01287522, "auxiliary_loss_mlp": 0.01194713, "balance_loss_clip": 1.00762153, "balance_loss_mlp": 1.00035429, "epoch": 0.4391270366139602, "flos": 24056245108800.0, "grad_norm": 1.7421764980643262, "language_loss": 0.72039676, "learning_rate": 2.4865978937029547e-06, "loss": 0.74521911, "num_input_tokens_seen": 78666470, "step": 3652, "time_per_iteration": 2.814178228378296 }, { "auxiliary_loss_clip": 0.01281325, "auxiliary_loss_mlp": 0.01194836, "balance_loss_clip": 1.00773752, "balance_loss_mlp": 1.00038147, "epoch": 0.4392472795045993, "flos": 31538523356640.0, "grad_norm": 1.6403293004961965, "language_loss": 0.66361594, "learning_rate": 2.485842292212445e-06, "loss": 0.68837756, "num_input_tokens_seen": 78687685, "step": 3653, "time_per_iteration": 2.9743590354919434 }, { "auxiliary_loss_clip": 0.01361229, "auxiliary_loss_mlp": 0.01195183, "balance_loss_clip": 1.00907755, "balance_loss_mlp": 1.00053811, "epoch": 0.4393675223952384, "flos": 14866308934560.0, "grad_norm": 1.9383553682040746, "language_loss": 0.80756986, "learning_rate": 2.485086617020045e-06, "loss": 0.833134, "num_input_tokens_seen": 78706180, "step": 3654, "time_per_iteration": 2.8022279739379883 }, { "auxiliary_loss_clip": 0.01326494, "auxiliary_loss_mlp": 0.01194855, "balance_loss_clip": 1.00831997, "balance_loss_mlp": 1.0004009, "epoch": 0.43948776528587746, "flos": 14825908003680.0, "grad_norm": 2.112408218326874, "language_loss": 0.81681961, "learning_rate": 2.4843308682403903e-06, "loss": 0.84203309, "num_input_tokens_seen": 78723095, "step": 3655, "time_per_iteration": 2.7835395336151123 }, { "auxiliary_loss_clip": 0.01360721, "auxiliary_loss_mlp": 0.011947, "balance_loss_clip": 1.00882733, "balance_loss_mlp": 1.00043678, "epoch": 0.4396080081765166, "flos": 13914531156960.0, "grad_norm": 1.5192382330093068, "language_loss": 0.82245344, "learning_rate": 2.4835750459881294e-06, "loss": 0.84800768, "num_input_tokens_seen": 78739720, "step": 3656, "time_per_iteration": 2.772543430328369 }, { "auxiliary_loss_clip": 0.01337532, "auxiliary_loss_mlp": 0.01194827, "balance_loss_clip": 1.00980616, "balance_loss_mlp": 1.00037301, "epoch": 0.43972825106715563, "flos": 18222987836160.0, "grad_norm": 1.8764819484667419, "language_loss": 0.82149976, "learning_rate": 2.4828191503779177e-06, "loss": 0.84682333, "num_input_tokens_seen": 78757820, "step": 3657, "time_per_iteration": 2.840665340423584 }, { "auxiliary_loss_clip": 0.01300339, "auxiliary_loss_mlp": 0.01194904, "balance_loss_clip": 1.00811994, "balance_loss_mlp": 1.0004493, "epoch": 0.43984849395779474, "flos": 16873249137120.0, "grad_norm": 1.971693272832778, "language_loss": 0.89558393, "learning_rate": 2.482063181524425e-06, "loss": 0.9205364, "num_input_tokens_seen": 78773720, "step": 3658, "time_per_iteration": 2.8572254180908203 }, { "auxiliary_loss_clip": 0.01360864, "auxiliary_loss_mlp": 0.01195001, "balance_loss_clip": 1.00923371, "balance_loss_mlp": 1.00045133, "epoch": 0.43996873684843385, "flos": 18691512370560.0, "grad_norm": 2.5138610137604878, "language_loss": 0.81168729, "learning_rate": 2.4813071395423307e-06, "loss": 0.83724594, "num_input_tokens_seen": 78791285, "step": 3659, "time_per_iteration": 2.7610082626342773 }, { "auxiliary_loss_clip": 0.01341029, "auxiliary_loss_mlp": 0.01195087, "balance_loss_clip": 1.00888634, "balance_loss_mlp": 1.00053775, "epoch": 0.4400889797390729, "flos": 23653470414240.0, "grad_norm": 1.7954491546504168, "language_loss": 0.64506698, "learning_rate": 2.4805510245463263e-06, "loss": 0.67042816, "num_input_tokens_seen": 78811440, "step": 3660, "time_per_iteration": 2.9007503986358643 }, { "auxiliary_loss_clip": 0.01348769, "auxiliary_loss_mlp": 0.01195056, "balance_loss_clip": 1.00915027, "balance_loss_mlp": 1.00050628, "epoch": 0.440209222629712, "flos": 23149215027360.0, "grad_norm": 1.9503180433243306, "language_loss": 0.60016209, "learning_rate": 2.4797948366511137e-06, "loss": 0.62560034, "num_input_tokens_seen": 78831150, "step": 3661, "time_per_iteration": 2.79756498336792 }, { "auxiliary_loss_clip": 0.01312768, "auxiliary_loss_mlp": 0.01194987, "balance_loss_clip": 1.00847268, "balance_loss_mlp": 1.00043774, "epoch": 0.4403294655203511, "flos": 24823405293120.0, "grad_norm": 1.9775873696920236, "language_loss": 0.75813675, "learning_rate": 2.4790385759714055e-06, "loss": 0.78321427, "num_input_tokens_seen": 78850215, "step": 3662, "time_per_iteration": 2.994248151779175 }, { "auxiliary_loss_clip": 0.01337098, "auxiliary_loss_mlp": 0.01194951, "balance_loss_clip": 1.00940108, "balance_loss_mlp": 1.00040126, "epoch": 0.4404497084109902, "flos": 22565091794400.0, "grad_norm": 1.7870262316990786, "language_loss": 0.71169007, "learning_rate": 2.478282242621926e-06, "loss": 0.7370106, "num_input_tokens_seen": 78870675, "step": 3663, "time_per_iteration": 2.794278860092163 }, { "auxiliary_loss_clip": 0.0128549, "auxiliary_loss_mlp": 0.01193956, "balance_loss_clip": 1.00525069, "balance_loss_mlp": 1.00016963, "epoch": 0.4405699513016293, "flos": 64967108173920.0, "grad_norm": 0.8406807561795425, "language_loss": 0.59597117, "learning_rate": 2.477525836717411e-06, "loss": 0.62076563, "num_input_tokens_seen": 78938440, "step": 3664, "time_per_iteration": 3.467733860015869 }, { "auxiliary_loss_clip": 0.01338495, "auxiliary_loss_mlp": 0.01194947, "balance_loss_clip": 1.0082829, "balance_loss_mlp": 1.00039744, "epoch": 0.4406901941922684, "flos": 35661967282080.0, "grad_norm": 2.604785347460956, "language_loss": 0.79861432, "learning_rate": 2.476769358372606e-06, "loss": 0.82394874, "num_input_tokens_seen": 78960090, "step": 3665, "time_per_iteration": 2.9393556118011475 }, { "auxiliary_loss_clip": 0.01298247, "auxiliary_loss_mlp": 0.01194895, "balance_loss_clip": 1.00879788, "balance_loss_mlp": 1.00044084, "epoch": 0.44081043708290746, "flos": 18040777129440.0, "grad_norm": 2.8655836135435258, "language_loss": 0.75091481, "learning_rate": 2.4760128077022683e-06, "loss": 0.7758463, "num_input_tokens_seen": 78978225, "step": 3666, "time_per_iteration": 2.801612615585327 }, { "auxiliary_loss_clip": 0.01275909, "auxiliary_loss_mlp": 0.01194807, "balance_loss_clip": 1.00861764, "balance_loss_mlp": 1.00044799, "epoch": 0.44093067997354657, "flos": 30153520812960.0, "grad_norm": 2.173517135322504, "language_loss": 0.68294281, "learning_rate": 2.4752561848211672e-06, "loss": 0.70764995, "num_input_tokens_seen": 79000625, "step": 3667, "time_per_iteration": 4.930190324783325 }, { "auxiliary_loss_clip": 0.01335911, "auxiliary_loss_mlp": 0.01194666, "balance_loss_clip": 1.00878501, "balance_loss_mlp": 1.00030708, "epoch": 0.4410509228641857, "flos": 23255078408640.0, "grad_norm": 2.0983842850146726, "language_loss": 0.71489704, "learning_rate": 2.4744994898440797e-06, "loss": 0.74020284, "num_input_tokens_seen": 79019415, "step": 3668, "time_per_iteration": 2.7366604804992676 }, { "auxiliary_loss_clip": 0.01305248, "auxiliary_loss_mlp": 0.01195017, "balance_loss_clip": 1.0078547, "balance_loss_mlp": 1.0004673, "epoch": 0.44117116575482473, "flos": 19500582280320.0, "grad_norm": 2.2286144716829885, "language_loss": 0.83512175, "learning_rate": 2.473742722885797e-06, "loss": 0.86012441, "num_input_tokens_seen": 79038435, "step": 3669, "time_per_iteration": 3.646754264831543 }, { "auxiliary_loss_clip": 0.01337304, "auxiliary_loss_mlp": 0.00872587, "balance_loss_clip": 1.00977254, "balance_loss_mlp": 1.00026667, "epoch": 0.44129140864546385, "flos": 27053136751680.0, "grad_norm": 2.3054232541803596, "language_loss": 0.65138376, "learning_rate": 2.4729858840611197e-06, "loss": 0.67348272, "num_input_tokens_seen": 79057345, "step": 3670, "time_per_iteration": 2.7991883754730225 }, { "auxiliary_loss_clip": 0.01360769, "auxiliary_loss_mlp": 0.01194564, "balance_loss_clip": 1.00932944, "balance_loss_mlp": 1.00039577, "epoch": 0.4414116515361029, "flos": 26102113371360.0, "grad_norm": 1.9030675054915815, "language_loss": 0.7268157, "learning_rate": 2.4722289734848605e-06, "loss": 0.75236905, "num_input_tokens_seen": 79077810, "step": 3671, "time_per_iteration": 3.9625606536865234 }, { "auxiliary_loss_clip": 0.0129698, "auxiliary_loss_mlp": 0.01194756, "balance_loss_clip": 1.008955, "balance_loss_mlp": 1.00039697, "epoch": 0.441531894426742, "flos": 21906094106880.0, "grad_norm": 1.9380176972798786, "language_loss": 0.77440065, "learning_rate": 2.471471991271841e-06, "loss": 0.79931796, "num_input_tokens_seen": 79094935, "step": 3672, "time_per_iteration": 2.851027488708496 }, { "auxiliary_loss_clip": 0.01348568, "auxiliary_loss_mlp": 0.01195042, "balance_loss_clip": 1.00895596, "balance_loss_mlp": 1.00049269, "epoch": 0.4416521373173811, "flos": 23437109496960.0, "grad_norm": 1.8375150119837795, "language_loss": 0.79615259, "learning_rate": 2.470714937536896e-06, "loss": 0.8215887, "num_input_tokens_seen": 79113660, "step": 3673, "time_per_iteration": 2.7320070266723633 }, { "auxiliary_loss_clip": 0.01281996, "auxiliary_loss_mlp": 0.01195025, "balance_loss_clip": 1.00864768, "balance_loss_mlp": 1.00047517, "epoch": 0.4417723802080202, "flos": 20334354472800.0, "grad_norm": 1.80244261214138, "language_loss": 0.70301366, "learning_rate": 2.469957812394868e-06, "loss": 0.72778392, "num_input_tokens_seen": 79132470, "step": 3674, "time_per_iteration": 2.8516297340393066 }, { "auxiliary_loss_clip": 0.01361235, "auxiliary_loss_mlp": 0.01194841, "balance_loss_clip": 1.00999665, "balance_loss_mlp": 1.00048256, "epoch": 0.4418926230986593, "flos": 18880692271200.0, "grad_norm": 1.9128873045805899, "language_loss": 0.76244652, "learning_rate": 2.4692006159606148e-06, "loss": 0.78800738, "num_input_tokens_seen": 79150000, "step": 3675, "time_per_iteration": 2.662813901901245 }, { "auxiliary_loss_clip": 0.0136101, "auxiliary_loss_mlp": 0.01194865, "balance_loss_clip": 1.00894904, "balance_loss_mlp": 1.00041056, "epoch": 0.4420128659892984, "flos": 19464420343680.0, "grad_norm": 2.331259902440293, "language_loss": 0.78456569, "learning_rate": 2.468443348349e-06, "loss": 0.8101244, "num_input_tokens_seen": 79167875, "step": 3676, "time_per_iteration": 2.7440290451049805 }, { "auxiliary_loss_clip": 0.01292589, "auxiliary_loss_mlp": 0.01195057, "balance_loss_clip": 1.00853467, "balance_loss_mlp": 1.00041246, "epoch": 0.44213310887993745, "flos": 17894369122560.0, "grad_norm": 2.5469755400954495, "language_loss": 0.82393789, "learning_rate": 2.467686009674902e-06, "loss": 0.84881431, "num_input_tokens_seen": 79182325, "step": 3677, "time_per_iteration": 2.853851795196533 }, { "auxiliary_loss_clip": 0.01348883, "auxiliary_loss_mlp": 0.01194878, "balance_loss_clip": 1.00912941, "balance_loss_mlp": 1.00042439, "epoch": 0.44225335177057656, "flos": 19204676830080.0, "grad_norm": 1.9281407364497412, "language_loss": 0.84945393, "learning_rate": 2.466928600053209e-06, "loss": 0.87489158, "num_input_tokens_seen": 79197630, "step": 3678, "time_per_iteration": 2.840991497039795 }, { "auxiliary_loss_clip": 0.01324152, "auxiliary_loss_mlp": 0.01194596, "balance_loss_clip": 1.00906289, "balance_loss_mlp": 1.00033259, "epoch": 0.4423735946612157, "flos": 23471331554880.0, "grad_norm": 1.8685480601258648, "language_loss": 0.71218878, "learning_rate": 2.466171119598818e-06, "loss": 0.73737621, "num_input_tokens_seen": 79217600, "step": 3679, "time_per_iteration": 2.779916763305664 }, { "auxiliary_loss_clip": 0.01349673, "auxiliary_loss_mlp": 0.01195093, "balance_loss_clip": 1.00921106, "balance_loss_mlp": 1.00054336, "epoch": 0.44249383755185473, "flos": 26685410359680.0, "grad_norm": 1.8849284250282783, "language_loss": 0.76956469, "learning_rate": 2.465413568426639e-06, "loss": 0.79501235, "num_input_tokens_seen": 79238550, "step": 3680, "time_per_iteration": 2.951690912246704 }, { "auxiliary_loss_clip": 0.0133515, "auxiliary_loss_mlp": 0.01194576, "balance_loss_clip": 1.00882363, "balance_loss_mlp": 1.00040817, "epoch": 0.44261408044249384, "flos": 23147670309120.0, "grad_norm": 1.79470401271909, "language_loss": 0.81287676, "learning_rate": 2.464655946651591e-06, "loss": 0.83817399, "num_input_tokens_seen": 79257555, "step": 3681, "time_per_iteration": 2.799032688140869 }, { "auxiliary_loss_clip": 0.01339079, "auxiliary_loss_mlp": 0.0119488, "balance_loss_clip": 1.00858331, "balance_loss_mlp": 1.00042593, "epoch": 0.44273432333313295, "flos": 24462576247680.0, "grad_norm": 1.762845919497592, "language_loss": 0.80569297, "learning_rate": 2.4638982543886065e-06, "loss": 0.83103251, "num_input_tokens_seen": 79277595, "step": 3682, "time_per_iteration": 2.7337090969085693 }, { "auxiliary_loss_clip": 0.01342114, "auxiliary_loss_mlp": 0.01195263, "balance_loss_clip": 1.00916266, "balance_loss_mlp": 1.00052226, "epoch": 0.442854566223772, "flos": 17528941846080.0, "grad_norm": 2.2035685301201795, "language_loss": 0.87154639, "learning_rate": 2.4631404917526254e-06, "loss": 0.89692008, "num_input_tokens_seen": 79294550, "step": 3683, "time_per_iteration": 2.7660329341888428 }, { "auxiliary_loss_clip": 0.01348591, "auxiliary_loss_mlp": 0.0119477, "balance_loss_clip": 1.0094161, "balance_loss_mlp": 1.00041127, "epoch": 0.4429748091144111, "flos": 24896303945280.0, "grad_norm": 1.507090255715507, "language_loss": 0.793594, "learning_rate": 2.4623826588586e-06, "loss": 0.81902766, "num_input_tokens_seen": 79314820, "step": 3684, "time_per_iteration": 2.78871750831604 }, { "auxiliary_loss_clip": 0.01335423, "auxiliary_loss_mlp": 0.01194733, "balance_loss_clip": 1.00886488, "balance_loss_mlp": 1.00046968, "epoch": 0.4430950520050502, "flos": 21614715040320.0, "grad_norm": 1.5818433606676305, "language_loss": 0.82880175, "learning_rate": 2.461624755821492e-06, "loss": 0.85410333, "num_input_tokens_seen": 79334300, "step": 3685, "time_per_iteration": 2.81089448928833 }, { "auxiliary_loss_clip": 0.01289705, "auxiliary_loss_mlp": 0.01194819, "balance_loss_clip": 1.00813842, "balance_loss_mlp": 1.00055575, "epoch": 0.4432152948956893, "flos": 24572283462720.0, "grad_norm": 1.7305284123407618, "language_loss": 0.76612556, "learning_rate": 2.4608667827562763e-06, "loss": 0.7909708, "num_input_tokens_seen": 79353630, "step": 3686, "time_per_iteration": 2.8890373706817627 }, { "auxiliary_loss_clip": 0.013425, "auxiliary_loss_mlp": 0.0119522, "balance_loss_clip": 1.00927997, "balance_loss_mlp": 1.0005753, "epoch": 0.4433355377863284, "flos": 21762272604960.0, "grad_norm": 1.9999925193540777, "language_loss": 0.89705008, "learning_rate": 2.460108739777936e-06, "loss": 0.9224273, "num_input_tokens_seen": 79372765, "step": 3687, "time_per_iteration": 2.7889418601989746 }, { "auxiliary_loss_clip": 0.0131619, "auxiliary_loss_mlp": 0.01194702, "balance_loss_clip": 1.00898266, "balance_loss_mlp": 1.00043869, "epoch": 0.44345578067696745, "flos": 20084489971200.0, "grad_norm": 1.4992915504895785, "language_loss": 0.76543236, "learning_rate": 2.4593506270014656e-06, "loss": 0.79054129, "num_input_tokens_seen": 79391735, "step": 3688, "time_per_iteration": 2.7860982418060303 }, { "auxiliary_loss_clip": 0.01326928, "auxiliary_loss_mlp": 0.0119468, "balance_loss_clip": 1.00862789, "balance_loss_mlp": 1.00032091, "epoch": 0.44357602356760656, "flos": 24169508768160.0, "grad_norm": 1.6248415544094055, "language_loss": 0.82074213, "learning_rate": 2.45859244454187e-06, "loss": 0.84595823, "num_input_tokens_seen": 79411525, "step": 3689, "time_per_iteration": 2.7557284832000732 }, { "auxiliary_loss_clip": 0.01337183, "auxiliary_loss_mlp": 0.01194891, "balance_loss_clip": 1.00827348, "balance_loss_mlp": 1.00043654, "epoch": 0.44369626645824567, "flos": 22707727814880.0, "grad_norm": 1.6018781611610995, "language_loss": 0.66255188, "learning_rate": 2.4578341925141655e-06, "loss": 0.68787265, "num_input_tokens_seen": 79430740, "step": 3690, "time_per_iteration": 2.755763530731201 }, { "auxiliary_loss_clip": 0.01349971, "auxiliary_loss_mlp": 0.01195208, "balance_loss_clip": 1.00956798, "balance_loss_mlp": 1.00056291, "epoch": 0.4438165093488847, "flos": 38030239461600.0, "grad_norm": 1.928950668965718, "language_loss": 0.72414207, "learning_rate": 2.457075871033378e-06, "loss": 0.74959385, "num_input_tokens_seen": 79452615, "step": 3691, "time_per_iteration": 2.8561103343963623 }, { "auxiliary_loss_clip": 0.01299819, "auxiliary_loss_mlp": 0.01194748, "balance_loss_clip": 1.00794816, "balance_loss_mlp": 1.0003891, "epoch": 0.44393675223952384, "flos": 15523223048640.0, "grad_norm": 2.2498663267846197, "language_loss": 0.88583624, "learning_rate": 2.4563174802145445e-06, "loss": 0.91078186, "num_input_tokens_seen": 79469865, "step": 3692, "time_per_iteration": 2.794083595275879 }, { "auxiliary_loss_clip": 0.01297438, "auxiliary_loss_mlp": 0.01193876, "balance_loss_clip": 1.00366187, "balance_loss_mlp": 1.00008965, "epoch": 0.44405699513016295, "flos": 64574930964960.0, "grad_norm": 0.6321122173881968, "language_loss": 0.48584855, "learning_rate": 2.455559020172712e-06, "loss": 0.51076168, "num_input_tokens_seen": 79537220, "step": 3693, "time_per_iteration": 4.2703986167907715 }, { "auxiliary_loss_clip": 0.01274993, "auxiliary_loss_mlp": 0.01194967, "balance_loss_clip": 1.00795794, "balance_loss_mlp": 1.00041699, "epoch": 0.444177238020802, "flos": 23987405832480.0, "grad_norm": 1.7829032371478755, "language_loss": 0.8976264, "learning_rate": 2.4548004910229385e-06, "loss": 0.92232597, "num_input_tokens_seen": 79554795, "step": 3694, "time_per_iteration": 3.744626522064209 }, { "auxiliary_loss_clip": 0.01342275, "auxiliary_loss_mlp": 0.00872523, "balance_loss_clip": 1.00852132, "balance_loss_mlp": 1.0002346, "epoch": 0.4442974809114411, "flos": 22563080068320.0, "grad_norm": 3.5182748224224203, "language_loss": 0.86832476, "learning_rate": 2.4540418928802913e-06, "loss": 0.89047277, "num_input_tokens_seen": 79573530, "step": 3695, "time_per_iteration": 3.704113245010376 }, { "auxiliary_loss_clip": 0.01330636, "auxiliary_loss_mlp": 0.01194969, "balance_loss_clip": 1.00888598, "balance_loss_mlp": 1.00061035, "epoch": 0.4444177238020802, "flos": 17675709089760.0, "grad_norm": 2.0326620841649428, "language_loss": 0.65910614, "learning_rate": 2.4532832258598506e-06, "loss": 0.68436217, "num_input_tokens_seen": 79591360, "step": 3696, "time_per_iteration": 2.7416365146636963 }, { "auxiliary_loss_clip": 0.01360672, "auxiliary_loss_mlp": 0.0119453, "balance_loss_clip": 1.0092659, "balance_loss_mlp": 1.00036192, "epoch": 0.4445379666927193, "flos": 28621607330880.0, "grad_norm": 1.6491066086540258, "language_loss": 0.80848777, "learning_rate": 2.4525244900767047e-06, "loss": 0.83403981, "num_input_tokens_seen": 79612175, "step": 3697, "time_per_iteration": 3.686934471130371 }, { "auxiliary_loss_clip": 0.01308465, "auxiliary_loss_mlp": 0.01193844, "balance_loss_clip": 1.00676262, "balance_loss_mlp": 1.0000571, "epoch": 0.4446582095833584, "flos": 70487409457440.0, "grad_norm": 0.7742000410494432, "language_loss": 0.6055035, "learning_rate": 2.4517656856459536e-06, "loss": 0.6305266, "num_input_tokens_seen": 79678020, "step": 3698, "time_per_iteration": 3.386003255844116 }, { "auxiliary_loss_clip": 0.01341818, "auxiliary_loss_mlp": 0.01194735, "balance_loss_clip": 1.00865686, "balance_loss_mlp": 1.0003767, "epoch": 0.4447784524739975, "flos": 26505211379040.0, "grad_norm": 1.5917302102905155, "language_loss": 0.68134075, "learning_rate": 2.4510068126827073e-06, "loss": 0.70670629, "num_input_tokens_seen": 79699020, "step": 3699, "time_per_iteration": 2.794585943222046 }, { "auxiliary_loss_clip": 0.01325804, "auxiliary_loss_mlp": 0.01194849, "balance_loss_clip": 1.00910866, "balance_loss_mlp": 1.00039458, "epoch": 0.44489869536463655, "flos": 11656217658240.0, "grad_norm": 2.0217197462547567, "language_loss": 0.81785041, "learning_rate": 2.450247871302086e-06, "loss": 0.84305698, "num_input_tokens_seen": 79716795, "step": 3700, "time_per_iteration": 2.7229959964752197 }, { "auxiliary_loss_clip": 0.01337203, "auxiliary_loss_mlp": 0.01194556, "balance_loss_clip": 1.00905514, "balance_loss_mlp": 1.00038815, "epoch": 0.44501893825527566, "flos": 20448480300480.0, "grad_norm": 3.875157877111155, "language_loss": 0.83074808, "learning_rate": 2.44948886161922e-06, "loss": 0.85606569, "num_input_tokens_seen": 79735810, "step": 3701, "time_per_iteration": 2.7574331760406494 }, { "auxiliary_loss_clip": 0.01348374, "auxiliary_loss_mlp": 0.01194881, "balance_loss_clip": 1.00903487, "balance_loss_mlp": 1.00042629, "epoch": 0.4451391811459148, "flos": 18261089651520.0, "grad_norm": 1.51347052699936, "language_loss": 0.84819496, "learning_rate": 2.4487297837492524e-06, "loss": 0.87362748, "num_input_tokens_seen": 79754975, "step": 3702, "time_per_iteration": 2.7252633571624756 }, { "auxiliary_loss_clip": 0.01294799, "auxiliary_loss_mlp": 0.01194757, "balance_loss_clip": 1.00769806, "balance_loss_mlp": 1.00049353, "epoch": 0.44525942403655383, "flos": 16910165471040.0, "grad_norm": 2.014351415445556, "language_loss": 0.62064314, "learning_rate": 2.4479706378073323e-06, "loss": 0.64553869, "num_input_tokens_seen": 79773515, "step": 3703, "time_per_iteration": 2.761577606201172 }, { "auxiliary_loss_clip": 0.0132169, "auxiliary_loss_mlp": 0.01194455, "balance_loss_clip": 1.00873351, "balance_loss_mlp": 1.00038195, "epoch": 0.44537966692719294, "flos": 23258850395040.0, "grad_norm": 1.4531498095869055, "language_loss": 0.83879143, "learning_rate": 2.447211423908623e-06, "loss": 0.86395288, "num_input_tokens_seen": 79793560, "step": 3704, "time_per_iteration": 2.7401986122131348 }, { "auxiliary_loss_clip": 0.01347723, "auxiliary_loss_mlp": 0.01194669, "balance_loss_clip": 1.00879765, "balance_loss_mlp": 1.00040615, "epoch": 0.445499909817832, "flos": 21724170789600.0, "grad_norm": 1.8770189683183265, "language_loss": 0.74892426, "learning_rate": 2.4464521421682966e-06, "loss": 0.77434814, "num_input_tokens_seen": 79811150, "step": 3705, "time_per_iteration": 2.799880027770996 }, { "auxiliary_loss_clip": 0.01334737, "auxiliary_loss_mlp": 0.01194722, "balance_loss_clip": 1.00829601, "balance_loss_mlp": 1.00045848, "epoch": 0.4456201527084711, "flos": 23987980611360.0, "grad_norm": 1.2919962016331767, "language_loss": 0.87484276, "learning_rate": 2.4456927927015345e-06, "loss": 0.90013731, "num_input_tokens_seen": 79832190, "step": 3706, "time_per_iteration": 2.7410531044006348 }, { "auxiliary_loss_clip": 0.01323882, "auxiliary_loss_mlp": 0.01195037, "balance_loss_clip": 1.00908852, "balance_loss_mlp": 1.00048757, "epoch": 0.4457403955991102, "flos": 18807075145440.0, "grad_norm": 1.9615085455130872, "language_loss": 0.76665825, "learning_rate": 2.4449333756235307e-06, "loss": 0.79184747, "num_input_tokens_seen": 79848905, "step": 3707, "time_per_iteration": 2.7809669971466064 }, { "auxiliary_loss_clip": 0.01337244, "auxiliary_loss_mlp": 0.01194681, "balance_loss_clip": 1.00934482, "balance_loss_mlp": 1.00041747, "epoch": 0.4458606384897493, "flos": 19207766266560.0, "grad_norm": 2.0938164410274136, "language_loss": 0.7891556, "learning_rate": 2.4441738910494876e-06, "loss": 0.81447494, "num_input_tokens_seen": 79863640, "step": 3708, "time_per_iteration": 2.724940061569214 }, { "auxiliary_loss_clip": 0.01336324, "auxiliary_loss_mlp": 0.0119497, "balance_loss_clip": 1.00938368, "balance_loss_mlp": 1.00051618, "epoch": 0.4459808813803884, "flos": 21361294094400.0, "grad_norm": 1.8109587202383874, "language_loss": 0.82077169, "learning_rate": 2.4434143390946176e-06, "loss": 0.84608465, "num_input_tokens_seen": 79882450, "step": 3709, "time_per_iteration": 2.763349771499634 }, { "auxiliary_loss_clip": 0.01300668, "auxiliary_loss_mlp": 0.01194853, "balance_loss_clip": 1.00779366, "balance_loss_mlp": 1.000494, "epoch": 0.4461011242710275, "flos": 23288977153440.0, "grad_norm": 1.7460515658573306, "language_loss": 0.85255849, "learning_rate": 2.4426547198741457e-06, "loss": 0.87751377, "num_input_tokens_seen": 79900655, "step": 3710, "time_per_iteration": 2.819232225418091 }, { "auxiliary_loss_clip": 0.01269728, "auxiliary_loss_mlp": 0.01194962, "balance_loss_clip": 1.00710499, "balance_loss_mlp": 1.00050831, "epoch": 0.44622136716166655, "flos": 20193011704800.0, "grad_norm": 2.045905866533498, "language_loss": 0.74502504, "learning_rate": 2.441895033503305e-06, "loss": 0.76967192, "num_input_tokens_seen": 79918575, "step": 3711, "time_per_iteration": 2.744802474975586 }, { "auxiliary_loss_clip": 0.01335858, "auxiliary_loss_mlp": 0.01195177, "balance_loss_clip": 1.00841999, "balance_loss_mlp": 1.00072241, "epoch": 0.44634161005230566, "flos": 21283042813920.0, "grad_norm": 1.621719322600358, "language_loss": 0.81915236, "learning_rate": 2.4411352800973375e-06, "loss": 0.84446275, "num_input_tokens_seen": 79937010, "step": 3712, "time_per_iteration": 2.7677013874053955 }, { "auxiliary_loss_clip": 0.01309731, "auxiliary_loss_mlp": 0.01194884, "balance_loss_clip": 1.00799906, "balance_loss_mlp": 1.00043035, "epoch": 0.44646185294294477, "flos": 22929369513120.0, "grad_norm": 2.29591863566392, "language_loss": 0.7515592, "learning_rate": 2.4403754597715005e-06, "loss": 0.77660531, "num_input_tokens_seen": 79956455, "step": 3713, "time_per_iteration": 2.752838373184204 }, { "auxiliary_loss_clip": 0.01336563, "auxiliary_loss_mlp": 0.01194882, "balance_loss_clip": 1.00893235, "balance_loss_mlp": 1.0004282, "epoch": 0.4465820958335838, "flos": 22637703057120.0, "grad_norm": 2.4550172958199816, "language_loss": 0.92601514, "learning_rate": 2.4396155726410553e-06, "loss": 0.95132959, "num_input_tokens_seen": 79975065, "step": 3714, "time_per_iteration": 2.7556557655334473 }, { "auxiliary_loss_clip": 0.01338077, "auxiliary_loss_mlp": 0.01194821, "balance_loss_clip": 1.00957179, "balance_loss_mlp": 1.00046253, "epoch": 0.44670233872422294, "flos": 22672535817600.0, "grad_norm": 2.2176585590989983, "language_loss": 0.90841687, "learning_rate": 2.438855618821278e-06, "loss": 0.93374586, "num_input_tokens_seen": 79990865, "step": 3715, "time_per_iteration": 2.682790994644165 }, { "auxiliary_loss_clip": 0.0134842, "auxiliary_loss_mlp": 0.01194725, "balance_loss_clip": 1.00888669, "balance_loss_mlp": 1.00036621, "epoch": 0.44682258161486205, "flos": 23582188327680.0, "grad_norm": 1.7272142547120297, "language_loss": 0.67420822, "learning_rate": 2.4380955984274517e-06, "loss": 0.69963968, "num_input_tokens_seen": 80009520, "step": 3716, "time_per_iteration": 2.7630786895751953 }, { "auxiliary_loss_clip": 0.01349598, "auxiliary_loss_mlp": 0.01195097, "balance_loss_clip": 1.00941372, "balance_loss_mlp": 1.00064278, "epoch": 0.4469428245055011, "flos": 26501367545280.0, "grad_norm": 1.6245622670288506, "language_loss": 0.76962733, "learning_rate": 2.4373355115748716e-06, "loss": 0.79507422, "num_input_tokens_seen": 80030350, "step": 3717, "time_per_iteration": 2.8036773204803467 }, { "auxiliary_loss_clip": 0.01323736, "auxiliary_loss_mlp": 0.01194647, "balance_loss_clip": 1.00866127, "balance_loss_mlp": 1.00047874, "epoch": 0.4470630673961402, "flos": 21504935977920.0, "grad_norm": 1.6326299481292659, "language_loss": 0.72146273, "learning_rate": 2.436575358378842e-06, "loss": 0.74664652, "num_input_tokens_seen": 80049840, "step": 3718, "time_per_iteration": 2.7266173362731934 }, { "auxiliary_loss_clip": 0.01326771, "auxiliary_loss_mlp": 0.01194944, "balance_loss_clip": 1.00885296, "balance_loss_mlp": 1.00048995, "epoch": 0.44718331028677927, "flos": 16173994213440.0, "grad_norm": 2.5332936385517453, "language_loss": 0.82717335, "learning_rate": 2.4358151389546782e-06, "loss": 0.85239047, "num_input_tokens_seen": 80066525, "step": 3719, "time_per_iteration": 3.784126043319702 }, { "auxiliary_loss_clip": 0.01361748, "auxiliary_loss_mlp": 0.01194527, "balance_loss_clip": 1.00941336, "balance_loss_mlp": 1.0003593, "epoch": 0.4473035531774184, "flos": 19681248268800.0, "grad_norm": 2.2783926127346743, "language_loss": 0.75954652, "learning_rate": 2.4350548534177035e-06, "loss": 0.78510928, "num_input_tokens_seen": 80083355, "step": 3720, "time_per_iteration": 2.740415573120117 }, { "auxiliary_loss_clip": 0.01304555, "auxiliary_loss_mlp": 0.01194629, "balance_loss_clip": 1.00795269, "balance_loss_mlp": 1.00036597, "epoch": 0.4474237960680575, "flos": 41427606683520.0, "grad_norm": 1.547542233775993, "language_loss": 0.66546011, "learning_rate": 2.434294501883254e-06, "loss": 0.69045192, "num_input_tokens_seen": 80106450, "step": 3721, "time_per_iteration": 4.544129371643066 }, { "auxiliary_loss_clip": 0.01336724, "auxiliary_loss_mlp": 0.01194663, "balance_loss_clip": 1.00930762, "balance_loss_mlp": 1.00058985, "epoch": 0.44754403895869654, "flos": 22891339545120.0, "grad_norm": 1.7494318842657974, "language_loss": 0.65504491, "learning_rate": 2.433534084466674e-06, "loss": 0.68035877, "num_input_tokens_seen": 80125670, "step": 3722, "time_per_iteration": 2.875037908554077 }, { "auxiliary_loss_clip": 0.01360229, "auxiliary_loss_mlp": 0.01194518, "balance_loss_clip": 1.00916743, "balance_loss_mlp": 1.00044489, "epoch": 0.44766428184933565, "flos": 25630283858400.0, "grad_norm": 1.5651796306455297, "language_loss": 0.70780623, "learning_rate": 2.4327736012833178e-06, "loss": 0.73335373, "num_input_tokens_seen": 80147390, "step": 3723, "time_per_iteration": 3.7496559619903564 }, { "auxiliary_loss_clip": 0.01336445, "auxiliary_loss_mlp": 0.01194756, "balance_loss_clip": 1.00834739, "balance_loss_mlp": 1.00039697, "epoch": 0.44778452473997477, "flos": 20448983232000.0, "grad_norm": 3.6894143795860392, "language_loss": 0.76368517, "learning_rate": 2.4320130524485506e-06, "loss": 0.78899717, "num_input_tokens_seen": 80166185, "step": 3724, "time_per_iteration": 2.730319023132324 }, { "auxiliary_loss_clip": 0.01311104, "auxiliary_loss_mlp": 0.01194705, "balance_loss_clip": 1.00911164, "balance_loss_mlp": 1.00053692, "epoch": 0.4479047676306138, "flos": 21975472238400.0, "grad_norm": 1.497889887941678, "language_loss": 0.79527819, "learning_rate": 2.431252438077746e-06, "loss": 0.82033634, "num_input_tokens_seen": 80185685, "step": 3725, "time_per_iteration": 2.8498637676239014 }, { "auxiliary_loss_clip": 0.01348742, "auxiliary_loss_mlp": 0.00872527, "balance_loss_clip": 1.00943947, "balance_loss_mlp": 1.0002625, "epoch": 0.44802501052125293, "flos": 21467229323040.0, "grad_norm": 2.023766687551241, "language_loss": 0.76410151, "learning_rate": 2.4304917582862906e-06, "loss": 0.78631425, "num_input_tokens_seen": 80204865, "step": 3726, "time_per_iteration": 2.776219129562378 }, { "auxiliary_loss_clip": 0.01360838, "auxiliary_loss_mlp": 0.01194825, "balance_loss_clip": 1.00952733, "balance_loss_mlp": 1.00056124, "epoch": 0.44814525341189204, "flos": 22126981407840.0, "grad_norm": 1.875097845620481, "language_loss": 0.87763876, "learning_rate": 2.4297310131895774e-06, "loss": 0.90319538, "num_input_tokens_seen": 80223410, "step": 3727, "time_per_iteration": 2.797183036804199 }, { "auxiliary_loss_clip": 0.01337317, "auxiliary_loss_mlp": 0.01194789, "balance_loss_clip": 1.00846481, "balance_loss_mlp": 1.00052559, "epoch": 0.4482654963025311, "flos": 16653942478080.0, "grad_norm": 1.965299456359626, "language_loss": 0.74672496, "learning_rate": 2.4289702029030113e-06, "loss": 0.77204597, "num_input_tokens_seen": 80240880, "step": 3728, "time_per_iteration": 2.713315963745117 }, { "auxiliary_loss_clip": 0.01336854, "auxiliary_loss_mlp": 0.01194829, "balance_loss_clip": 1.00902021, "balance_loss_mlp": 1.00056505, "epoch": 0.4483857391931702, "flos": 18841261279680.0, "grad_norm": 1.6652995626361435, "language_loss": 0.82717645, "learning_rate": 2.4282093275420057e-06, "loss": 0.85249323, "num_input_tokens_seen": 80259910, "step": 3729, "time_per_iteration": 2.729530096054077 }, { "auxiliary_loss_clip": 0.01336353, "auxiliary_loss_mlp": 0.01194739, "balance_loss_clip": 1.00938916, "balance_loss_mlp": 1.00047576, "epoch": 0.4485059820838093, "flos": 20372599982880.0, "grad_norm": 2.067831240523265, "language_loss": 0.71269143, "learning_rate": 2.4274483872219863e-06, "loss": 0.7380023, "num_input_tokens_seen": 80277270, "step": 3730, "time_per_iteration": 2.662898540496826 }, { "auxiliary_loss_clip": 0.01347147, "auxiliary_loss_mlp": 0.01194423, "balance_loss_clip": 1.00906539, "balance_loss_mlp": 1.00035036, "epoch": 0.4486262249744484, "flos": 20047753255680.0, "grad_norm": 1.7002238372377594, "language_loss": 0.93680513, "learning_rate": 2.426687382058386e-06, "loss": 0.96222079, "num_input_tokens_seen": 80295550, "step": 3731, "time_per_iteration": 2.7083308696746826 }, { "auxiliary_loss_clip": 0.01307788, "auxiliary_loss_mlp": 0.01193895, "balance_loss_clip": 1.00640488, "balance_loss_mlp": 1.00010872, "epoch": 0.4487464678650875, "flos": 64595717566560.0, "grad_norm": 0.861666769987343, "language_loss": 0.59825146, "learning_rate": 2.425926312166649e-06, "loss": 0.62326825, "num_input_tokens_seen": 80348425, "step": 3732, "time_per_iteration": 3.131091833114624 }, { "auxiliary_loss_clip": 0.01318032, "auxiliary_loss_mlp": 0.01194993, "balance_loss_clip": 1.00775528, "balance_loss_mlp": 1.00053871, "epoch": 0.4488667107557266, "flos": 20769806507040.0, "grad_norm": 1.9169057099343816, "language_loss": 0.72774863, "learning_rate": 2.42516517766223e-06, "loss": 0.7528789, "num_input_tokens_seen": 80366505, "step": 3733, "time_per_iteration": 2.709904670715332 }, { "auxiliary_loss_clip": 0.01360401, "auxiliary_loss_mlp": 0.01194613, "balance_loss_clip": 1.00949454, "balance_loss_mlp": 1.00034976, "epoch": 0.44898695364636565, "flos": 23951746827360.0, "grad_norm": 1.7180053755676143, "language_loss": 0.6827842, "learning_rate": 2.4244039786605907e-06, "loss": 0.70833433, "num_input_tokens_seen": 80387510, "step": 3734, "time_per_iteration": 2.848158359527588 }, { "auxiliary_loss_clip": 0.01297217, "auxiliary_loss_mlp": 0.01194626, "balance_loss_clip": 1.00768447, "balance_loss_mlp": 1.000458, "epoch": 0.44910719653700476, "flos": 18624361507200.0, "grad_norm": 2.6308300535145412, "language_loss": 0.8227548, "learning_rate": 2.4236427152772055e-06, "loss": 0.84767324, "num_input_tokens_seen": 80405915, "step": 3735, "time_per_iteration": 2.845255136489868 }, { "auxiliary_loss_clip": 0.01263909, "auxiliary_loss_mlp": 0.01193824, "balance_loss_clip": 1.00629568, "balance_loss_mlp": 1.00003743, "epoch": 0.4492274394276438, "flos": 57033463701600.0, "grad_norm": 0.82797554530827, "language_loss": 0.5733822, "learning_rate": 2.422881387627557e-06, "loss": 0.59795946, "num_input_tokens_seen": 80458365, "step": 3736, "time_per_iteration": 3.0892157554626465 }, { "auxiliary_loss_clip": 0.01311838, "auxiliary_loss_mlp": 0.01194708, "balance_loss_clip": 1.00868118, "balance_loss_mlp": 1.00044477, "epoch": 0.4493476823182829, "flos": 23254898790240.0, "grad_norm": 1.5174756517528416, "language_loss": 0.77113533, "learning_rate": 2.422119995827139e-06, "loss": 0.79620081, "num_input_tokens_seen": 80478490, "step": 3737, "time_per_iteration": 2.8042266368865967 }, { "auxiliary_loss_clip": 0.01335782, "auxiliary_loss_mlp": 0.01194952, "balance_loss_clip": 1.00937438, "balance_loss_mlp": 1.00059366, "epoch": 0.44946792520892204, "flos": 15815140970400.0, "grad_norm": 3.0515076474802574, "language_loss": 0.74068105, "learning_rate": 2.4213585399914528e-06, "loss": 0.76598847, "num_input_tokens_seen": 80495695, "step": 3738, "time_per_iteration": 2.6680691242218018 }, { "auxiliary_loss_clip": 0.0133793, "auxiliary_loss_mlp": 0.01194378, "balance_loss_clip": 1.00884032, "balance_loss_mlp": 1.00040114, "epoch": 0.4495881680995611, "flos": 19610073953280.0, "grad_norm": 1.871850005354603, "language_loss": 0.85104454, "learning_rate": 2.4205970202360113e-06, "loss": 0.87636757, "num_input_tokens_seen": 80515260, "step": 3739, "time_per_iteration": 2.7097318172454834 }, { "auxiliary_loss_clip": 0.01277737, "auxiliary_loss_mlp": 0.01194838, "balance_loss_clip": 1.00790954, "balance_loss_mlp": 1.00038338, "epoch": 0.4497084109902002, "flos": 26031477911040.0, "grad_norm": 1.915432258353104, "language_loss": 0.78471851, "learning_rate": 2.4198354366763354e-06, "loss": 0.80944425, "num_input_tokens_seen": 80533900, "step": 3740, "time_per_iteration": 2.870828866958618 }, { "auxiliary_loss_clip": 0.0132592, "auxiliary_loss_mlp": 0.01194714, "balance_loss_clip": 1.00882244, "balance_loss_mlp": 1.00045073, "epoch": 0.4498286538808393, "flos": 14793697671840.0, "grad_norm": 2.6066177026064863, "language_loss": 0.78513861, "learning_rate": 2.4190737894279587e-06, "loss": 0.81034493, "num_input_tokens_seen": 80551270, "step": 3741, "time_per_iteration": 2.737490653991699 }, { "auxiliary_loss_clip": 0.01323236, "auxiliary_loss_mlp": 0.01194813, "balance_loss_clip": 1.009027, "balance_loss_mlp": 1.00045466, "epoch": 0.44994889677147837, "flos": 15450180701760.0, "grad_norm": 2.1273283990166534, "language_loss": 0.8022083, "learning_rate": 2.4183120786064203e-06, "loss": 0.82738876, "num_input_tokens_seen": 80568145, "step": 3742, "time_per_iteration": 2.773052930831909 }, { "auxiliary_loss_clip": 0.01338955, "auxiliary_loss_mlp": 0.00872442, "balance_loss_clip": 1.00906599, "balance_loss_mlp": 1.00027251, "epoch": 0.4500691396621175, "flos": 21798326770560.0, "grad_norm": 2.466467411391749, "language_loss": 0.85123158, "learning_rate": 2.417550304327273e-06, "loss": 0.87334549, "num_input_tokens_seen": 80586185, "step": 3743, "time_per_iteration": 2.772463798522949 }, { "auxiliary_loss_clip": 0.01361313, "auxiliary_loss_mlp": 0.01194791, "balance_loss_clip": 1.00969827, "balance_loss_mlp": 1.00043249, "epoch": 0.4501893825527566, "flos": 32382030866400.0, "grad_norm": 1.464944427497113, "language_loss": 0.75525689, "learning_rate": 2.4167884667060763e-06, "loss": 0.78081793, "num_input_tokens_seen": 80608895, "step": 3744, "time_per_iteration": 2.7472474575042725 }, { "auxiliary_loss_clip": 0.01335509, "auxiliary_loss_mlp": 0.01194605, "balance_loss_clip": 1.00912833, "balance_loss_mlp": 1.00043678, "epoch": 0.45030962544339564, "flos": 16544953736640.0, "grad_norm": 2.6435349544048967, "language_loss": 0.87312227, "learning_rate": 2.4160265658584e-06, "loss": 0.89842343, "num_input_tokens_seen": 80623785, "step": 3745, "time_per_iteration": 4.624392986297607 }, { "auxiliary_loss_clip": 0.01347831, "auxiliary_loss_mlp": 0.01194827, "balance_loss_clip": 1.00969219, "balance_loss_mlp": 1.0004679, "epoch": 0.45042986833403476, "flos": 19573049848320.0, "grad_norm": 1.8515807930430899, "language_loss": 0.6816057, "learning_rate": 2.4152646018998253e-06, "loss": 0.7070322, "num_input_tokens_seen": 80642735, "step": 3746, "time_per_iteration": 2.7251036167144775 }, { "auxiliary_loss_clip": 0.01337097, "auxiliary_loss_mlp": 0.01194674, "balance_loss_clip": 1.00835609, "balance_loss_mlp": 1.00041103, "epoch": 0.45055011122467387, "flos": 23112478311840.0, "grad_norm": 1.728489682888472, "language_loss": 0.71644783, "learning_rate": 2.4145025749459403e-06, "loss": 0.7417655, "num_input_tokens_seen": 80663760, "step": 3747, "time_per_iteration": 2.9360618591308594 }, { "auxiliary_loss_clip": 0.01231605, "auxiliary_loss_mlp": 0.01195014, "balance_loss_clip": 1.00641155, "balance_loss_mlp": 1.00055957, "epoch": 0.4506703541153129, "flos": 19934633291040.0, "grad_norm": 1.826505960319665, "language_loss": 0.70157397, "learning_rate": 2.413740485112344e-06, "loss": 0.72584021, "num_input_tokens_seen": 80682100, "step": 3748, "time_per_iteration": 4.150910139083862 }, { "auxiliary_loss_clip": 0.01314228, "auxiliary_loss_mlp": 0.01194682, "balance_loss_clip": 1.00920939, "balance_loss_mlp": 1.00051403, "epoch": 0.45079059700595203, "flos": 19499540493600.0, "grad_norm": 1.5458827154407286, "language_loss": 0.82310736, "learning_rate": 2.412978332514646e-06, "loss": 0.84819651, "num_input_tokens_seen": 80700880, "step": 3749, "time_per_iteration": 4.027369976043701 }, { "auxiliary_loss_clip": 0.01324097, "auxiliary_loss_mlp": 0.01194699, "balance_loss_clip": 1.00902283, "balance_loss_mlp": 1.00043607, "epoch": 0.4509108398965911, "flos": 27636325968960.0, "grad_norm": 2.1904607955601243, "language_loss": 0.72447526, "learning_rate": 2.4122161172684623e-06, "loss": 0.74966323, "num_input_tokens_seen": 80721675, "step": 3750, "time_per_iteration": 2.848886013031006 }, { "auxiliary_loss_clip": 0.01311436, "auxiliary_loss_mlp": 0.01194499, "balance_loss_clip": 1.00890505, "balance_loss_mlp": 1.0003314, "epoch": 0.4510310827872302, "flos": 20995723123200.0, "grad_norm": 2.3273817598963364, "language_loss": 0.84391606, "learning_rate": 2.4114538394894216e-06, "loss": 0.8689754, "num_input_tokens_seen": 80739315, "step": 3751, "time_per_iteration": 2.765451669692993 }, { "auxiliary_loss_clip": 0.01335913, "auxiliary_loss_mlp": 0.0119446, "balance_loss_clip": 1.0092181, "balance_loss_mlp": 1.00038719, "epoch": 0.4511513256778693, "flos": 16216694259840.0, "grad_norm": 1.7568333868570796, "language_loss": 0.82870293, "learning_rate": 2.410691499293161e-06, "loss": 0.85400665, "num_input_tokens_seen": 80757470, "step": 3752, "time_per_iteration": 2.709465742111206 }, { "auxiliary_loss_clip": 0.01339437, "auxiliary_loss_mlp": 0.01194692, "balance_loss_clip": 1.00855339, "balance_loss_mlp": 1.00033343, "epoch": 0.45127156856850836, "flos": 25186713072480.0, "grad_norm": 3.9862166460123247, "language_loss": 0.74409056, "learning_rate": 2.409929096795326e-06, "loss": 0.76943189, "num_input_tokens_seen": 80777840, "step": 3753, "time_per_iteration": 2.682821035385132 }, { "auxiliary_loss_clip": 0.01348783, "auxiliary_loss_mlp": 0.01194821, "balance_loss_clip": 1.0090698, "balance_loss_mlp": 1.0005573, "epoch": 0.4513918114591475, "flos": 20412533905920.0, "grad_norm": 1.915228252145242, "language_loss": 0.78745377, "learning_rate": 2.409166632111573e-06, "loss": 0.81288981, "num_input_tokens_seen": 80795975, "step": 3754, "time_per_iteration": 2.7574808597564697 }, { "auxiliary_loss_clip": 0.01349981, "auxiliary_loss_mlp": 0.01194887, "balance_loss_clip": 1.0096159, "balance_loss_mlp": 1.00052857, "epoch": 0.4515120543497866, "flos": 26648494025760.0, "grad_norm": 2.641430718307424, "language_loss": 0.80785108, "learning_rate": 2.4084041053575674e-06, "loss": 0.83329976, "num_input_tokens_seen": 80815395, "step": 3755, "time_per_iteration": 2.719243049621582 }, { "auxiliary_loss_clip": 0.01312967, "auxiliary_loss_mlp": 0.01194696, "balance_loss_clip": 1.00931227, "balance_loss_mlp": 1.00043237, "epoch": 0.45163229724042564, "flos": 20595103849440.0, "grad_norm": 1.8274523265703517, "language_loss": 0.72898471, "learning_rate": 2.4076415166489834e-06, "loss": 0.7540614, "num_input_tokens_seen": 80834805, "step": 3756, "time_per_iteration": 2.7569971084594727 }, { "auxiliary_loss_clip": 0.01275687, "auxiliary_loss_mlp": 0.01194671, "balance_loss_clip": 1.00779021, "balance_loss_mlp": 1.00050306, "epoch": 0.45175254013106475, "flos": 21689014716000.0, "grad_norm": 1.525048073028527, "language_loss": 0.78850126, "learning_rate": 2.406878866101506e-06, "loss": 0.81320482, "num_input_tokens_seen": 80853770, "step": 3757, "time_per_iteration": 2.8040168285369873 }, { "auxiliary_loss_clip": 0.01360474, "auxiliary_loss_mlp": 0.01194372, "balance_loss_clip": 1.00972152, "balance_loss_mlp": 1.00029957, "epoch": 0.45187278302170386, "flos": 18878860163520.0, "grad_norm": 1.9790494724655179, "language_loss": 0.78059065, "learning_rate": 2.4061161538308273e-06, "loss": 0.80613917, "num_input_tokens_seen": 80870615, "step": 3758, "time_per_iteration": 2.607328414916992 }, { "auxiliary_loss_clip": 0.01338661, "auxiliary_loss_mlp": 0.01194482, "balance_loss_clip": 1.00880671, "balance_loss_mlp": 1.00040901, "epoch": 0.4519930259123429, "flos": 18582487705440.0, "grad_norm": 1.774715351989805, "language_loss": 0.88644058, "learning_rate": 2.4053533799526523e-06, "loss": 0.91177201, "num_input_tokens_seen": 80886335, "step": 3759, "time_per_iteration": 2.718250036239624 }, { "auxiliary_loss_clip": 0.01312283, "auxiliary_loss_mlp": 0.01194587, "balance_loss_clip": 1.00762868, "balance_loss_mlp": 1.00041878, "epoch": 0.452113268802982, "flos": 25192389013920.0, "grad_norm": 1.6494101690322907, "language_loss": 0.86398464, "learning_rate": 2.404590544582691e-06, "loss": 0.88905329, "num_input_tokens_seen": 80904570, "step": 3760, "time_per_iteration": 2.758896827697754 }, { "auxiliary_loss_clip": 0.01311861, "auxiliary_loss_mlp": 0.01194767, "balance_loss_clip": 1.00998163, "balance_loss_mlp": 1.0005033, "epoch": 0.45223351169362114, "flos": 39378936373920.0, "grad_norm": 1.5993127407820737, "language_loss": 0.81396496, "learning_rate": 2.403827647836666e-06, "loss": 0.83903128, "num_input_tokens_seen": 80925125, "step": 3761, "time_per_iteration": 3.247960329055786 }, { "auxiliary_loss_clip": 0.01360703, "auxiliary_loss_mlp": 0.01194638, "balance_loss_clip": 1.00924051, "balance_loss_mlp": 1.00037432, "epoch": 0.4523537545842602, "flos": 21582181395360.0, "grad_norm": 2.2875638289490747, "language_loss": 0.69043207, "learning_rate": 2.4030646898303075e-06, "loss": 0.71598554, "num_input_tokens_seen": 80946615, "step": 3762, "time_per_iteration": 2.724435806274414 }, { "auxiliary_loss_clip": 0.01328533, "auxiliary_loss_mlp": 0.01194872, "balance_loss_clip": 1.00819111, "balance_loss_mlp": 1.00041747, "epoch": 0.4524739974748993, "flos": 28439540318880.0, "grad_norm": 1.8762871028300787, "language_loss": 0.82175589, "learning_rate": 2.4023016706793566e-06, "loss": 0.84698987, "num_input_tokens_seen": 80966410, "step": 3763, "time_per_iteration": 2.8568670749664307 }, { "auxiliary_loss_clip": 0.01294481, "auxiliary_loss_mlp": 0.01193966, "balance_loss_clip": 1.00506806, "balance_loss_mlp": 1.00017917, "epoch": 0.4525942403655384, "flos": 61556521037760.0, "grad_norm": 0.7573780747214356, "language_loss": 0.56950885, "learning_rate": 2.401538590499561e-06, "loss": 0.59439331, "num_input_tokens_seen": 81026865, "step": 3764, "time_per_iteration": 3.412555456161499 }, { "auxiliary_loss_clip": 0.01341989, "auxiliary_loss_mlp": 0.00872531, "balance_loss_clip": 1.00893795, "balance_loss_mlp": 1.00032091, "epoch": 0.45271448325617747, "flos": 27529851885120.0, "grad_norm": 1.7641629345055487, "language_loss": 0.71720648, "learning_rate": 2.400775449406682e-06, "loss": 0.73935163, "num_input_tokens_seen": 81050060, "step": 3765, "time_per_iteration": 2.7934210300445557 }, { "auxiliary_loss_clip": 0.01347354, "auxiliary_loss_mlp": 0.01194545, "balance_loss_clip": 1.0090003, "balance_loss_mlp": 1.00047231, "epoch": 0.4528347261468166, "flos": 22452618456000.0, "grad_norm": 1.8283200449153278, "language_loss": 0.73060638, "learning_rate": 2.400012247516485e-06, "loss": 0.75602531, "num_input_tokens_seen": 81070625, "step": 3766, "time_per_iteration": 2.7629384994506836 }, { "auxiliary_loss_clip": 0.01310217, "auxiliary_loss_mlp": 0.0119477, "balance_loss_clip": 1.00875413, "balance_loss_mlp": 1.00041151, "epoch": 0.45295496903745563, "flos": 21103885620000.0, "grad_norm": 2.0980121768865123, "language_loss": 0.90094078, "learning_rate": 2.3992489849447484e-06, "loss": 0.9259907, "num_input_tokens_seen": 81089080, "step": 3767, "time_per_iteration": 2.784742832183838 }, { "auxiliary_loss_clip": 0.01295494, "auxiliary_loss_mlp": 0.01194484, "balance_loss_clip": 1.00836039, "balance_loss_mlp": 1.00031555, "epoch": 0.45307521192809475, "flos": 23221179663840.0, "grad_norm": 1.631275047010968, "language_loss": 0.78990012, "learning_rate": 2.3984856618072584e-06, "loss": 0.8147999, "num_input_tokens_seen": 81109115, "step": 3768, "time_per_iteration": 2.7857282161712646 }, { "auxiliary_loss_clip": 0.01311675, "auxiliary_loss_mlp": 0.01194748, "balance_loss_clip": 1.00859261, "balance_loss_mlp": 1.0004847, "epoch": 0.45319545481873386, "flos": 15560103458880.0, "grad_norm": 1.8137769615967505, "language_loss": 0.73893499, "learning_rate": 2.3977222782198098e-06, "loss": 0.76399928, "num_input_tokens_seen": 81127750, "step": 3769, "time_per_iteration": 2.7516286373138428 }, { "auxiliary_loss_clip": 0.01311819, "auxiliary_loss_mlp": 0.01194751, "balance_loss_clip": 1.00951207, "balance_loss_mlp": 1.00048792, "epoch": 0.4533156977093729, "flos": 21944770701120.0, "grad_norm": 1.599466865215038, "language_loss": 0.75312173, "learning_rate": 2.3969588342982077e-06, "loss": 0.77818739, "num_input_tokens_seen": 81147125, "step": 3770, "time_per_iteration": 2.821211814880371 }, { "auxiliary_loss_clip": 0.01335949, "auxiliary_loss_mlp": 0.01194645, "balance_loss_clip": 1.00914407, "balance_loss_mlp": 1.00038147, "epoch": 0.453435940600012, "flos": 24242191878240.0, "grad_norm": 1.5011195662890648, "language_loss": 0.72953546, "learning_rate": 2.396195330158267e-06, "loss": 0.75484133, "num_input_tokens_seen": 81167015, "step": 3771, "time_per_iteration": 4.538347244262695 }, { "auxiliary_loss_clip": 0.01360156, "auxiliary_loss_mlp": 0.01194694, "balance_loss_clip": 1.00918651, "balance_loss_mlp": 1.00033474, "epoch": 0.45355618349065113, "flos": 23440378551840.0, "grad_norm": 1.7019680636945644, "language_loss": 0.79127604, "learning_rate": 2.3954317659158094e-06, "loss": 0.81682456, "num_input_tokens_seen": 81187350, "step": 3772, "time_per_iteration": 2.7153217792510986 }, { "auxiliary_loss_clip": 0.01331028, "auxiliary_loss_mlp": 0.01193866, "balance_loss_clip": 1.0052948, "balance_loss_mlp": 1.00007975, "epoch": 0.4536764263812902, "flos": 66903197374080.0, "grad_norm": 0.896817486471769, "language_loss": 0.57005948, "learning_rate": 2.394668141686667e-06, "loss": 0.59530842, "num_input_tokens_seen": 81249315, "step": 3773, "time_per_iteration": 4.269890546798706 }, { "auxiliary_loss_clip": 0.01347818, "auxiliary_loss_mlp": 0.01194547, "balance_loss_clip": 1.00920367, "balance_loss_mlp": 1.00037909, "epoch": 0.4537966692719293, "flos": 42739782422400.0, "grad_norm": 2.737382547715815, "language_loss": 0.69589823, "learning_rate": 2.3939044575866813e-06, "loss": 0.72132188, "num_input_tokens_seen": 81272065, "step": 3774, "time_per_iteration": 2.9960949420928955 }, { "auxiliary_loss_clip": 0.01329495, "auxiliary_loss_mlp": 0.008726, "balance_loss_clip": 1.00879574, "balance_loss_mlp": 1.00050235, "epoch": 0.4539169121625684, "flos": 35549493943680.0, "grad_norm": 2.5043354070426345, "language_loss": 0.75676554, "learning_rate": 2.3931407137317024e-06, "loss": 0.77878648, "num_input_tokens_seen": 81292220, "step": 3775, "time_per_iteration": 3.956470251083374 }, { "auxiliary_loss_clip": 0.01312104, "auxiliary_loss_mlp": 0.011947, "balance_loss_clip": 1.00966692, "balance_loss_mlp": 1.00043619, "epoch": 0.45403715505320746, "flos": 18514726139520.0, "grad_norm": 1.567019688318076, "language_loss": 0.84739292, "learning_rate": 2.3923769102375907e-06, "loss": 0.87246108, "num_input_tokens_seen": 81311085, "step": 3776, "time_per_iteration": 2.9036459922790527 }, { "auxiliary_loss_clip": 0.013119, "auxiliary_loss_mlp": 0.0119465, "balance_loss_clip": 1.00914454, "balance_loss_mlp": 1.00048196, "epoch": 0.4541573979438466, "flos": 25045837312320.0, "grad_norm": 1.9189541636494367, "language_loss": 0.78443217, "learning_rate": 2.391613047220213e-06, "loss": 0.80949771, "num_input_tokens_seen": 81330985, "step": 3777, "time_per_iteration": 2.920945405960083 }, { "auxiliary_loss_clip": 0.01281502, "auxiliary_loss_mlp": 0.01194508, "balance_loss_clip": 1.00830686, "balance_loss_mlp": 1.00033975, "epoch": 0.4542776408344857, "flos": 18332407661760.0, "grad_norm": 1.7896375557319897, "language_loss": 0.79186988, "learning_rate": 2.390849124795447e-06, "loss": 0.81662995, "num_input_tokens_seen": 81346985, "step": 3778, "time_per_iteration": 2.824398994445801 }, { "auxiliary_loss_clip": 0.01360402, "auxiliary_loss_mlp": 0.01194786, "balance_loss_clip": 1.00955582, "balance_loss_mlp": 1.00042713, "epoch": 0.45439788372512474, "flos": 20701182772800.0, "grad_norm": 1.854630751051457, "language_loss": 0.84113383, "learning_rate": 2.3900851430791804e-06, "loss": 0.86668575, "num_input_tokens_seen": 81365005, "step": 3779, "time_per_iteration": 2.695377826690674 }, { "auxiliary_loss_clip": 0.0136086, "auxiliary_loss_mlp": 0.01195049, "balance_loss_clip": 1.00933588, "balance_loss_mlp": 1.00049973, "epoch": 0.45451812661576385, "flos": 22309443580320.0, "grad_norm": 2.269997174265451, "language_loss": 0.8461985, "learning_rate": 2.389321102187307e-06, "loss": 0.87175763, "num_input_tokens_seen": 81383785, "step": 3780, "time_per_iteration": 2.708390474319458 }, { "auxiliary_loss_clip": 0.01314405, "auxiliary_loss_mlp": 0.00872601, "balance_loss_clip": 1.00824022, "balance_loss_mlp": 1.0003109, "epoch": 0.4546383695064029, "flos": 21763314391680.0, "grad_norm": 1.8245259669131677, "language_loss": 0.81513762, "learning_rate": 2.3885570022357326e-06, "loss": 0.83700764, "num_input_tokens_seen": 81402915, "step": 3781, "time_per_iteration": 2.8205225467681885 }, { "auxiliary_loss_clip": 0.01280454, "auxiliary_loss_mlp": 0.01193859, "balance_loss_clip": 1.00536418, "balance_loss_mlp": 1.00007248, "epoch": 0.454758612397042, "flos": 64242791730720.0, "grad_norm": 0.8131836756470935, "language_loss": 0.60933316, "learning_rate": 2.38779284334037e-06, "loss": 0.63407624, "num_input_tokens_seen": 81467890, "step": 3782, "time_per_iteration": 3.345853805541992 }, { "auxiliary_loss_clip": 0.01288816, "auxiliary_loss_mlp": 0.01194829, "balance_loss_clip": 1.00831413, "balance_loss_mlp": 1.00047028, "epoch": 0.4548788552876811, "flos": 27304186734720.0, "grad_norm": 2.327508877221151, "language_loss": 0.78916466, "learning_rate": 2.387028625617141e-06, "loss": 0.81400108, "num_input_tokens_seen": 81487105, "step": 3783, "time_per_iteration": 2.833479166030884 }, { "auxiliary_loss_clip": 0.01322699, "auxiliary_loss_mlp": 0.01194327, "balance_loss_clip": 1.00825393, "balance_loss_mlp": 1.00044537, "epoch": 0.4549990981783202, "flos": 22857153410880.0, "grad_norm": 1.7257479033033947, "language_loss": 0.84404755, "learning_rate": 2.3862643491819766e-06, "loss": 0.86921781, "num_input_tokens_seen": 81505670, "step": 3784, "time_per_iteration": 2.7902913093566895 }, { "auxiliary_loss_clip": 0.01347366, "auxiliary_loss_mlp": 0.0119467, "balance_loss_clip": 1.00938892, "balance_loss_mlp": 1.00040686, "epoch": 0.4551193410689593, "flos": 23258598929280.0, "grad_norm": 1.6678231921361055, "language_loss": 0.84288931, "learning_rate": 2.3855000141508186e-06, "loss": 0.86830968, "num_input_tokens_seen": 81525825, "step": 3785, "time_per_iteration": 2.7582919597625732 }, { "auxiliary_loss_clip": 0.01322232, "auxiliary_loss_mlp": 0.01194832, "balance_loss_clip": 1.00957251, "balance_loss_mlp": 1.00037789, "epoch": 0.4552395839595984, "flos": 20777530098240.0, "grad_norm": 2.655646563954426, "language_loss": 0.84457695, "learning_rate": 2.3847356206396143e-06, "loss": 0.86974764, "num_input_tokens_seen": 81543135, "step": 3786, "time_per_iteration": 2.832275867462158 }, { "auxiliary_loss_clip": 0.01360036, "auxiliary_loss_mlp": 0.01194623, "balance_loss_clip": 1.0095346, "balance_loss_mlp": 1.0003593, "epoch": 0.45535982685023746, "flos": 23257521218880.0, "grad_norm": 1.4470600891133178, "language_loss": 0.78589582, "learning_rate": 2.3839711687643227e-06, "loss": 0.81144238, "num_input_tokens_seen": 81564360, "step": 3787, "time_per_iteration": 2.6706533432006836 }, { "auxiliary_loss_clip": 0.01334984, "auxiliary_loss_mlp": 0.01194884, "balance_loss_clip": 1.00912476, "balance_loss_mlp": 1.00043035, "epoch": 0.45548006974087657, "flos": 19646128118880.0, "grad_norm": 1.9470657013958228, "language_loss": 0.74248809, "learning_rate": 2.38320665864091e-06, "loss": 0.76778674, "num_input_tokens_seen": 81583710, "step": 3788, "time_per_iteration": 2.8084471225738525 }, { "auxiliary_loss_clip": 0.01267529, "auxiliary_loss_mlp": 0.01194699, "balance_loss_clip": 1.00703061, "balance_loss_mlp": 1.00033963, "epoch": 0.4556003126315157, "flos": 20047789179360.0, "grad_norm": 1.826073785063448, "language_loss": 0.82002664, "learning_rate": 2.3824420903853516e-06, "loss": 0.8446489, "num_input_tokens_seen": 81602175, "step": 3789, "time_per_iteration": 2.9425268173217773 }, { "auxiliary_loss_clip": 0.01336219, "auxiliary_loss_mlp": 0.01194904, "balance_loss_clip": 1.00897336, "balance_loss_mlp": 1.0004499, "epoch": 0.45572055552215474, "flos": 22959747737280.0, "grad_norm": 3.7360734304338754, "language_loss": 0.81961584, "learning_rate": 2.3816774641136324e-06, "loss": 0.84492713, "num_input_tokens_seen": 81619430, "step": 3790, "time_per_iteration": 3.1811282634735107 }, { "auxiliary_loss_clip": 0.01336044, "auxiliary_loss_mlp": 0.00872497, "balance_loss_clip": 1.0090301, "balance_loss_mlp": 1.00028348, "epoch": 0.45584079841279385, "flos": 33109939677600.0, "grad_norm": 1.7230349368188258, "language_loss": 0.71328056, "learning_rate": 2.380912779941745e-06, "loss": 0.73536599, "num_input_tokens_seen": 81642550, "step": 3791, "time_per_iteration": 2.863664388656616 }, { "auxiliary_loss_clip": 0.01349069, "auxiliary_loss_mlp": 0.01194836, "balance_loss_clip": 1.00930583, "balance_loss_mlp": 1.00038218, "epoch": 0.45596104130343296, "flos": 27272191944960.0, "grad_norm": 1.9341588885201075, "language_loss": 0.83580112, "learning_rate": 2.3801480379856918e-06, "loss": 0.86124027, "num_input_tokens_seen": 81664260, "step": 3792, "time_per_iteration": 2.7576420307159424 }, { "auxiliary_loss_clip": 0.01316307, "auxiliary_loss_mlp": 0.01194567, "balance_loss_clip": 1.00828612, "balance_loss_mlp": 1.00039911, "epoch": 0.456081284194072, "flos": 21579810432480.0, "grad_norm": 1.7144904015421611, "language_loss": 0.83665991, "learning_rate": 2.379383238361484e-06, "loss": 0.86176872, "num_input_tokens_seen": 81683620, "step": 3793, "time_per_iteration": 2.7765259742736816 }, { "auxiliary_loss_clip": 0.01337083, "auxiliary_loss_mlp": 0.0119459, "balance_loss_clip": 1.00842845, "balance_loss_mlp": 1.00042176, "epoch": 0.4562015270847111, "flos": 35918836901280.0, "grad_norm": 1.815479326209773, "language_loss": 0.79710126, "learning_rate": 2.3786183811851407e-06, "loss": 0.82241791, "num_input_tokens_seen": 81704325, "step": 3794, "time_per_iteration": 2.865346908569336 }, { "auxiliary_loss_clip": 0.01359876, "auxiliary_loss_mlp": 0.01194646, "balance_loss_clip": 1.00926471, "balance_loss_mlp": 1.0003829, "epoch": 0.45632176997535023, "flos": 13589792200800.0, "grad_norm": 1.6262715387955293, "language_loss": 0.79917336, "learning_rate": 2.3778534665726892e-06, "loss": 0.82471859, "num_input_tokens_seen": 81721155, "step": 3795, "time_per_iteration": 2.671919584274292 }, { "auxiliary_loss_clip": 0.01347095, "auxiliary_loss_mlp": 0.01194591, "balance_loss_clip": 1.00907493, "balance_loss_mlp": 1.00051856, "epoch": 0.4564420128659893, "flos": 32635415888640.0, "grad_norm": 3.876592161707607, "language_loss": 0.73171043, "learning_rate": 2.377088494640168e-06, "loss": 0.75712729, "num_input_tokens_seen": 81742905, "step": 3796, "time_per_iteration": 3.7379369735717773 }, { "auxiliary_loss_clip": 0.01333922, "auxiliary_loss_mlp": 0.01194615, "balance_loss_clip": 1.00910449, "balance_loss_mlp": 1.00035191, "epoch": 0.4565622557566284, "flos": 20377701145440.0, "grad_norm": 1.5652282326451044, "language_loss": 0.77900481, "learning_rate": 2.3763234655036216e-06, "loss": 0.80429018, "num_input_tokens_seen": 81762105, "step": 3797, "time_per_iteration": 2.738191843032837 }, { "auxiliary_loss_clip": 0.01323592, "auxiliary_loss_mlp": 0.01194483, "balance_loss_clip": 1.00873899, "balance_loss_mlp": 1.00041032, "epoch": 0.45668249864726745, "flos": 25374384178560.0, "grad_norm": 1.8961146916223643, "language_loss": 0.86750811, "learning_rate": 2.3755583792791046e-06, "loss": 0.89268887, "num_input_tokens_seen": 81781975, "step": 3798, "time_per_iteration": 3.7663321495056152 }, { "auxiliary_loss_clip": 0.01339691, "auxiliary_loss_mlp": 0.01194869, "balance_loss_clip": 1.00894105, "balance_loss_mlp": 1.00051045, "epoch": 0.45680274153790656, "flos": 15559816069440.0, "grad_norm": 1.9576206598900086, "language_loss": 0.74589747, "learning_rate": 2.3747932360826803e-06, "loss": 0.7712431, "num_input_tokens_seen": 81798905, "step": 3799, "time_per_iteration": 2.727515459060669 }, { "auxiliary_loss_clip": 0.01335667, "auxiliary_loss_mlp": 0.01194772, "balance_loss_clip": 1.00935555, "balance_loss_mlp": 1.00050867, "epoch": 0.4569229844285457, "flos": 19792895362560.0, "grad_norm": 1.9542790721820025, "language_loss": 0.82389534, "learning_rate": 2.3740280360304205e-06, "loss": 0.84919965, "num_input_tokens_seen": 81816630, "step": 3800, "time_per_iteration": 3.7305872440338135 }, { "auxiliary_loss_clip": 0.01294847, "auxiliary_loss_mlp": 0.01194801, "balance_loss_clip": 1.00753558, "balance_loss_mlp": 1.00044191, "epoch": 0.45704322731918473, "flos": 24093951763680.0, "grad_norm": 1.5797010133906302, "language_loss": 0.68056381, "learning_rate": 2.3732627792384038e-06, "loss": 0.70546031, "num_input_tokens_seen": 81837700, "step": 3801, "time_per_iteration": 3.789116144180298 }, { "auxiliary_loss_clip": 0.01360351, "auxiliary_loss_mlp": 0.01194936, "balance_loss_clip": 1.00920856, "balance_loss_mlp": 1.00048161, "epoch": 0.45716347020982384, "flos": 31317815674080.0, "grad_norm": 1.9492484356067896, "language_loss": 0.75016093, "learning_rate": 2.3724974658227207e-06, "loss": 0.7757138, "num_input_tokens_seen": 81858490, "step": 3802, "time_per_iteration": 2.807810068130493 }, { "auxiliary_loss_clip": 0.01314756, "auxiliary_loss_mlp": 0.00872594, "balance_loss_clip": 1.00822234, "balance_loss_mlp": 1.00033653, "epoch": 0.45728371310046295, "flos": 26501403468960.0, "grad_norm": 1.802059188713758, "language_loss": 0.71060312, "learning_rate": 2.3717320958994687e-06, "loss": 0.73247659, "num_input_tokens_seen": 81876050, "step": 3803, "time_per_iteration": 2.8257017135620117 }, { "auxiliary_loss_clip": 0.0132328, "auxiliary_loss_mlp": 0.01194618, "balance_loss_clip": 1.00893855, "balance_loss_mlp": 1.00035405, "epoch": 0.457403955991102, "flos": 17929417425120.0, "grad_norm": 1.787094210954735, "language_loss": 0.70671487, "learning_rate": 2.3709666695847534e-06, "loss": 0.73189384, "num_input_tokens_seen": 81894230, "step": 3804, "time_per_iteration": 2.7809383869171143 }, { "auxiliary_loss_clip": 0.01293273, "auxiliary_loss_mlp": 0.01194496, "balance_loss_clip": 1.00814116, "balance_loss_mlp": 1.00042343, "epoch": 0.4575241988817411, "flos": 42230677338720.0, "grad_norm": 1.9939948130142011, "language_loss": 0.69999248, "learning_rate": 2.370201186994689e-06, "loss": 0.7248702, "num_input_tokens_seen": 81917915, "step": 3805, "time_per_iteration": 3.1101953983306885 }, { "auxiliary_loss_clip": 0.01326164, "auxiliary_loss_mlp": 0.01194664, "balance_loss_clip": 1.00928903, "balance_loss_mlp": 1.0004003, "epoch": 0.45764444177238023, "flos": 30117322952640.0, "grad_norm": 1.8021750075551548, "language_loss": 0.69929755, "learning_rate": 2.369435648245399e-06, "loss": 0.7245059, "num_input_tokens_seen": 81938130, "step": 3806, "time_per_iteration": 2.889798879623413 }, { "auxiliary_loss_clip": 0.01324612, "auxiliary_loss_mlp": 0.01194473, "balance_loss_clip": 1.00913119, "balance_loss_mlp": 1.00040019, "epoch": 0.4577646846630193, "flos": 24060304484640.0, "grad_norm": 1.6263608488238523, "language_loss": 0.85012865, "learning_rate": 2.368670053453015e-06, "loss": 0.87531948, "num_input_tokens_seen": 81959820, "step": 3807, "time_per_iteration": 2.9427428245544434 }, { "auxiliary_loss_clip": 0.0134696, "auxiliary_loss_mlp": 0.01194786, "balance_loss_clip": 1.00939488, "balance_loss_mlp": 1.00042701, "epoch": 0.4578849275536584, "flos": 17418588004800.0, "grad_norm": 2.4324723435837106, "language_loss": 0.7451148, "learning_rate": 2.3679044027336757e-06, "loss": 0.77053231, "num_input_tokens_seen": 81975710, "step": 3808, "time_per_iteration": 2.7094850540161133 }, { "auxiliary_loss_clip": 0.01360318, "auxiliary_loss_mlp": 0.01194787, "balance_loss_clip": 1.00923657, "balance_loss_mlp": 1.0004282, "epoch": 0.4580051704442975, "flos": 13510175820480.0, "grad_norm": 2.411985277366215, "language_loss": 0.6952697, "learning_rate": 2.3671386962035326e-06, "loss": 0.72082078, "num_input_tokens_seen": 81993180, "step": 3809, "time_per_iteration": 2.680518865585327 }, { "auxiliary_loss_clip": 0.0133831, "auxiliary_loss_mlp": 0.01194983, "balance_loss_clip": 1.00892091, "balance_loss_mlp": 1.00043297, "epoch": 0.45812541333493656, "flos": 18037615845600.0, "grad_norm": 1.954476611363686, "language_loss": 0.68315291, "learning_rate": 2.3663729339787405e-06, "loss": 0.70848584, "num_input_tokens_seen": 82010115, "step": 3810, "time_per_iteration": 2.702894687652588 }, { "auxiliary_loss_clip": 0.0136022, "auxiliary_loss_mlp": 0.01194684, "balance_loss_clip": 1.00965679, "balance_loss_mlp": 1.00032568, "epoch": 0.45824565622557567, "flos": 20222204447520.0, "grad_norm": 2.5256077350750576, "language_loss": 0.73325264, "learning_rate": 2.365607116175466e-06, "loss": 0.7588017, "num_input_tokens_seen": 82025540, "step": 3811, "time_per_iteration": 2.7101075649261475 }, { "auxiliary_loss_clip": 0.01359555, "auxiliary_loss_mlp": 0.01194514, "balance_loss_clip": 1.00922608, "balance_loss_mlp": 1.00034595, "epoch": 0.4583658991162148, "flos": 19864895922720.0, "grad_norm": 2.5494927794080415, "language_loss": 0.66840303, "learning_rate": 2.3648412429098825e-06, "loss": 0.69394374, "num_input_tokens_seen": 82043890, "step": 3812, "time_per_iteration": 2.6768925189971924 }, { "auxiliary_loss_clip": 0.01301599, "auxiliary_loss_mlp": 0.01195006, "balance_loss_clip": 1.00873923, "balance_loss_mlp": 1.0005517, "epoch": 0.45848614200685384, "flos": 21029945181120.0, "grad_norm": 1.6853557404831168, "language_loss": 0.81914985, "learning_rate": 2.364075314298172e-06, "loss": 0.84411591, "num_input_tokens_seen": 82061345, "step": 3813, "time_per_iteration": 2.8026559352874756 }, { "auxiliary_loss_clip": 0.01347486, "auxiliary_loss_mlp": 0.00872589, "balance_loss_clip": 1.00937724, "balance_loss_mlp": 1.00024414, "epoch": 0.45860638489749295, "flos": 21069304325280.0, "grad_norm": 2.845387800767636, "language_loss": 0.7046082, "learning_rate": 2.3633093304565267e-06, "loss": 0.72680902, "num_input_tokens_seen": 82080400, "step": 3814, "time_per_iteration": 2.6826581954956055 }, { "auxiliary_loss_clip": 0.01361663, "auxiliary_loss_mlp": 0.01194884, "balance_loss_clip": 1.00990367, "balance_loss_mlp": 1.00042975, "epoch": 0.458726627788132, "flos": 26833901940000.0, "grad_norm": 1.7068403148611395, "language_loss": 0.62809503, "learning_rate": 2.3625432915011443e-06, "loss": 0.65366054, "num_input_tokens_seen": 82102310, "step": 3815, "time_per_iteration": 2.7731287479400635 }, { "auxiliary_loss_clip": 0.01325344, "auxiliary_loss_mlp": 0.01194859, "balance_loss_clip": 1.00857449, "balance_loss_mlp": 1.00050056, "epoch": 0.4588468706787711, "flos": 24097939292160.0, "grad_norm": 1.6646114006343644, "language_loss": 0.6553117, "learning_rate": 2.3617771975482334e-06, "loss": 0.68051374, "num_input_tokens_seen": 82121140, "step": 3816, "time_per_iteration": 2.7467918395996094 }, { "auxiliary_loss_clip": 0.01284854, "auxiliary_loss_mlp": 0.01194535, "balance_loss_clip": 1.00746918, "balance_loss_mlp": 1.00046206, "epoch": 0.4589671135694102, "flos": 17889339807360.0, "grad_norm": 1.4712442033459299, "language_loss": 0.74427915, "learning_rate": 2.3610110487140083e-06, "loss": 0.76907313, "num_input_tokens_seen": 82139575, "step": 3817, "time_per_iteration": 2.8284108638763428 }, { "auxiliary_loss_clip": 0.01323309, "auxiliary_loss_mlp": 0.01194499, "balance_loss_clip": 1.009444, "balance_loss_mlp": 1.00042665, "epoch": 0.4590873564600493, "flos": 25626978879840.0, "grad_norm": 1.682941819422997, "language_loss": 0.80728626, "learning_rate": 2.360244845114695e-06, "loss": 0.8324644, "num_input_tokens_seen": 82159195, "step": 3818, "time_per_iteration": 2.872567892074585 }, { "auxiliary_loss_clip": 0.01311105, "auxiliary_loss_mlp": 0.01194618, "balance_loss_clip": 1.00861216, "balance_loss_mlp": 1.00035512, "epoch": 0.4592075993506884, "flos": 18514797986880.0, "grad_norm": 1.970946377171723, "language_loss": 0.68604493, "learning_rate": 2.3594785868665245e-06, "loss": 0.71110213, "num_input_tokens_seen": 82175500, "step": 3819, "time_per_iteration": 2.733069896697998 }, { "auxiliary_loss_clip": 0.01302947, "auxiliary_loss_mlp": 0.00872542, "balance_loss_clip": 1.00866437, "balance_loss_mlp": 1.00028098, "epoch": 0.4593278422413275, "flos": 20631122091360.0, "grad_norm": 1.934266201273433, "language_loss": 0.80455136, "learning_rate": 2.3587122740857386e-06, "loss": 0.82630622, "num_input_tokens_seen": 82192600, "step": 3820, "time_per_iteration": 2.8449320793151855 }, { "auxiliary_loss_clip": 0.01340557, "auxiliary_loss_mlp": 0.01194619, "balance_loss_clip": 1.00827479, "balance_loss_mlp": 1.00045061, "epoch": 0.45944808513196655, "flos": 21358527971040.0, "grad_norm": 1.5235760220914238, "language_loss": 0.77986819, "learning_rate": 2.357945906888586e-06, "loss": 0.80521989, "num_input_tokens_seen": 82212040, "step": 3821, "time_per_iteration": 2.7273459434509277 }, { "auxiliary_loss_clip": 0.01336427, "auxiliary_loss_mlp": 0.01194923, "balance_loss_clip": 1.0089308, "balance_loss_mlp": 1.00056386, "epoch": 0.45956832802260567, "flos": 21427798331520.0, "grad_norm": 2.1249287693126093, "language_loss": 0.79900473, "learning_rate": 2.357179485391324e-06, "loss": 0.82431823, "num_input_tokens_seen": 82229895, "step": 3822, "time_per_iteration": 3.7950737476348877 }, { "auxiliary_loss_clip": 0.01359592, "auxiliary_loss_mlp": 0.0119451, "balance_loss_clip": 1.00931585, "balance_loss_mlp": 1.00043702, "epoch": 0.4596885709132448, "flos": 22382665545600.0, "grad_norm": 1.770970463636013, "language_loss": 0.85910237, "learning_rate": 2.3564130097102173e-06, "loss": 0.88464344, "num_input_tokens_seen": 82249550, "step": 3823, "time_per_iteration": 2.9881904125213623 }, { "auxiliary_loss_clip": 0.01323517, "auxiliary_loss_mlp": 0.01194609, "balance_loss_clip": 1.00942278, "balance_loss_mlp": 1.00053573, "epoch": 0.45980881380388383, "flos": 28981969368480.0, "grad_norm": 1.6612631883502542, "language_loss": 0.75059795, "learning_rate": 2.355646479961541e-06, "loss": 0.77577919, "num_input_tokens_seen": 82268860, "step": 3824, "time_per_iteration": 3.7390081882476807 }, { "auxiliary_loss_clip": 0.013599, "auxiliary_loss_mlp": 0.011945, "balance_loss_clip": 1.00911343, "balance_loss_mlp": 1.00042713, "epoch": 0.45992905669452294, "flos": 33396612742080.0, "grad_norm": 1.8368137260330435, "language_loss": 0.71338296, "learning_rate": 2.354879896261576e-06, "loss": 0.73892695, "num_input_tokens_seen": 82289070, "step": 3825, "time_per_iteration": 2.777219533920288 }, { "auxiliary_loss_clip": 0.01286876, "auxiliary_loss_mlp": 0.01194849, "balance_loss_clip": 1.0075326, "balance_loss_mlp": 1.00058556, "epoch": 0.46004929958516205, "flos": 36318198846240.0, "grad_norm": 1.7189123210794368, "language_loss": 0.56770986, "learning_rate": 2.3541132587266133e-06, "loss": 0.59252715, "num_input_tokens_seen": 82311790, "step": 3826, "time_per_iteration": 3.8642234802246094 }, { "auxiliary_loss_clip": 0.01307266, "auxiliary_loss_mlp": 0.01194808, "balance_loss_clip": 1.00803232, "balance_loss_mlp": 1.00054407, "epoch": 0.4601695424758011, "flos": 17238460871520.0, "grad_norm": 1.7516128711453143, "language_loss": 0.69226378, "learning_rate": 2.3533465674729515e-06, "loss": 0.71728456, "num_input_tokens_seen": 82329020, "step": 3827, "time_per_iteration": 3.710904359817505 }, { "auxiliary_loss_clip": 0.01359868, "auxiliary_loss_mlp": 0.01194751, "balance_loss_clip": 1.00913262, "balance_loss_mlp": 1.00048745, "epoch": 0.4602897853664402, "flos": 15888434783040.0, "grad_norm": 1.820285995654846, "language_loss": 0.72894591, "learning_rate": 2.352579822616895e-06, "loss": 0.75449216, "num_input_tokens_seen": 82346455, "step": 3828, "time_per_iteration": 2.7085981369018555 }, { "auxiliary_loss_clip": 0.01312917, "auxiliary_loss_mlp": 0.01194548, "balance_loss_clip": 1.0088402, "balance_loss_mlp": 1.00047529, "epoch": 0.4604100282570793, "flos": 25412629688640.0, "grad_norm": 1.6879346881048956, "language_loss": 0.78011537, "learning_rate": 2.351813024274761e-06, "loss": 0.80518997, "num_input_tokens_seen": 82367810, "step": 3829, "time_per_iteration": 2.9079980850219727 }, { "auxiliary_loss_clip": 0.01306681, "auxiliary_loss_mlp": 0.01194832, "balance_loss_clip": 1.00808299, "balance_loss_mlp": 1.00066435, "epoch": 0.4605302711477184, "flos": 27630721874880.0, "grad_norm": 1.8353695158241272, "language_loss": 0.7392, "learning_rate": 2.3510461725628693e-06, "loss": 0.76421517, "num_input_tokens_seen": 82388275, "step": 3830, "time_per_iteration": 2.8899805545806885 }, { "auxiliary_loss_clip": 0.01305416, "auxiliary_loss_mlp": 0.01194391, "balance_loss_clip": 1.00850594, "balance_loss_mlp": 1.00031793, "epoch": 0.4606505140383575, "flos": 23839668649440.0, "grad_norm": 1.7185152291807355, "language_loss": 0.71078074, "learning_rate": 2.350279267597554e-06, "loss": 0.73577881, "num_input_tokens_seen": 82408915, "step": 3831, "time_per_iteration": 2.913109540939331 }, { "auxiliary_loss_clip": 0.01335781, "auxiliary_loss_mlp": 0.0119467, "balance_loss_clip": 1.00863326, "balance_loss_mlp": 1.00040627, "epoch": 0.46077075692899655, "flos": 16107022968480.0, "grad_norm": 1.9287670098120964, "language_loss": 0.83092374, "learning_rate": 2.3495123094951515e-06, "loss": 0.85622823, "num_input_tokens_seen": 82427260, "step": 3832, "time_per_iteration": 2.7428250312805176 }, { "auxiliary_loss_clip": 0.01323145, "auxiliary_loss_mlp": 0.01194897, "balance_loss_clip": 1.00852096, "balance_loss_mlp": 1.00053835, "epoch": 0.46089099981963566, "flos": 48798165990240.0, "grad_norm": 2.9122269748828593, "language_loss": 0.75955081, "learning_rate": 2.34874529837201e-06, "loss": 0.78473127, "num_input_tokens_seen": 82450805, "step": 3833, "time_per_iteration": 3.009026050567627 }, { "auxiliary_loss_clip": 0.01269621, "auxiliary_loss_mlp": 0.01194555, "balance_loss_clip": 1.00740564, "balance_loss_mlp": 1.00038731, "epoch": 0.46101124271027477, "flos": 19099244532960.0, "grad_norm": 1.8433908271627424, "language_loss": 0.79186594, "learning_rate": 2.347978234344483e-06, "loss": 0.8165077, "num_input_tokens_seen": 82467010, "step": 3834, "time_per_iteration": 2.8437561988830566 }, { "auxiliary_loss_clip": 0.01342258, "auxiliary_loss_mlp": 0.01194791, "balance_loss_clip": 1.00907969, "balance_loss_mlp": 1.00043237, "epoch": 0.4611314856009138, "flos": 39347947447200.0, "grad_norm": 1.6306116922910574, "language_loss": 0.69232857, "learning_rate": 2.347211117528935e-06, "loss": 0.71769911, "num_input_tokens_seen": 82489310, "step": 3835, "time_per_iteration": 2.9083938598632812 }, { "auxiliary_loss_clip": 0.01309675, "auxiliary_loss_mlp": 0.01194757, "balance_loss_clip": 1.00860107, "balance_loss_mlp": 1.000494, "epoch": 0.46125172849155294, "flos": 20810782216800.0, "grad_norm": 1.6531939373187985, "language_loss": 0.71556252, "learning_rate": 2.3464439480417374e-06, "loss": 0.74060678, "num_input_tokens_seen": 82508830, "step": 3836, "time_per_iteration": 2.9226772785186768 }, { "auxiliary_loss_clip": 0.01340297, "auxiliary_loss_mlp": 0.01194803, "balance_loss_clip": 1.00895309, "balance_loss_mlp": 1.0005393, "epoch": 0.46137197138219205, "flos": 17930818448640.0, "grad_norm": 2.2881645680420886, "language_loss": 0.77002567, "learning_rate": 2.3456767259992676e-06, "loss": 0.79537666, "num_input_tokens_seen": 82526475, "step": 3837, "time_per_iteration": 2.814500331878662 }, { "auxiliary_loss_clip": 0.01360541, "auxiliary_loss_mlp": 0.00872506, "balance_loss_clip": 1.00904131, "balance_loss_mlp": 1.00018775, "epoch": 0.4614922142728311, "flos": 16836620192640.0, "grad_norm": 2.2392885262706423, "language_loss": 0.88904327, "learning_rate": 2.3449094515179135e-06, "loss": 0.91137373, "num_input_tokens_seen": 82543935, "step": 3838, "time_per_iteration": 2.739246368408203 }, { "auxiliary_loss_clip": 0.01333735, "auxiliary_loss_mlp": 0.01194417, "balance_loss_clip": 1.00888574, "balance_loss_mlp": 1.00043964, "epoch": 0.4616124571634702, "flos": 26614918594080.0, "grad_norm": 1.619758344435842, "language_loss": 0.81644475, "learning_rate": 2.34414212471407e-06, "loss": 0.8417263, "num_input_tokens_seen": 82563730, "step": 3839, "time_per_iteration": 2.795240640640259 }, { "auxiliary_loss_clip": 0.01347107, "auxiliary_loss_mlp": 0.01194884, "balance_loss_clip": 1.00909829, "balance_loss_mlp": 1.00052524, "epoch": 0.4617327000541093, "flos": 20340138185280.0, "grad_norm": 2.630479530776714, "language_loss": 0.72840559, "learning_rate": 2.3433747457041394e-06, "loss": 0.75382549, "num_input_tokens_seen": 82582435, "step": 3840, "time_per_iteration": 2.7255234718322754 }, { "auxiliary_loss_clip": 0.01300161, "auxiliary_loss_mlp": 0.01194587, "balance_loss_clip": 1.00886405, "balance_loss_mlp": 1.00041914, "epoch": 0.4618529429447484, "flos": 29570762679840.0, "grad_norm": 2.320071490670125, "language_loss": 0.85266125, "learning_rate": 2.342607314604533e-06, "loss": 0.87760872, "num_input_tokens_seen": 82602185, "step": 3841, "time_per_iteration": 2.7975385189056396 }, { "auxiliary_loss_clip": 0.01336404, "auxiliary_loss_mlp": 0.01194655, "balance_loss_clip": 1.00974846, "balance_loss_mlp": 1.00048649, "epoch": 0.4619731858353875, "flos": 19787039802720.0, "grad_norm": 1.6340982594989941, "language_loss": 0.83861262, "learning_rate": 2.3418398315316694e-06, "loss": 0.86392319, "num_input_tokens_seen": 82620005, "step": 3842, "time_per_iteration": 2.727289915084839 }, { "auxiliary_loss_clip": 0.01359767, "auxiliary_loss_mlp": 0.01194784, "balance_loss_clip": 1.00934172, "balance_loss_mlp": 1.00052047, "epoch": 0.4620934287260266, "flos": 18951148113120.0, "grad_norm": 2.2031367014182694, "language_loss": 0.78198677, "learning_rate": 2.3410722966019755e-06, "loss": 0.80753231, "num_input_tokens_seen": 82635120, "step": 3843, "time_per_iteration": 2.6665737628936768 }, { "auxiliary_loss_clip": 0.01338396, "auxiliary_loss_mlp": 0.01194685, "balance_loss_clip": 1.00893545, "balance_loss_mlp": 1.00042152, "epoch": 0.46221367161666566, "flos": 37341689794560.0, "grad_norm": 1.7383142645892733, "language_loss": 0.6557616, "learning_rate": 2.3403047099318848e-06, "loss": 0.68109238, "num_input_tokens_seen": 82659190, "step": 3844, "time_per_iteration": 2.8658621311187744 }, { "auxiliary_loss_clip": 0.01291205, "auxiliary_loss_mlp": 0.0119438, "balance_loss_clip": 1.00845909, "balance_loss_mlp": 1.00040305, "epoch": 0.46233391450730477, "flos": 14428557784800.0, "grad_norm": 2.080168373370011, "language_loss": 0.74992371, "learning_rate": 2.3395370716378405e-06, "loss": 0.77477956, "num_input_tokens_seen": 82676635, "step": 3845, "time_per_iteration": 2.8453383445739746 }, { "auxiliary_loss_clip": 0.01347325, "auxiliary_loss_mlp": 0.01194629, "balance_loss_clip": 1.00892973, "balance_loss_mlp": 1.00055587, "epoch": 0.4624541573979438, "flos": 22493055310560.0, "grad_norm": 2.1777609310601247, "language_loss": 0.72511983, "learning_rate": 2.338769381836292e-06, "loss": 0.75053936, "num_input_tokens_seen": 82696245, "step": 3846, "time_per_iteration": 2.675259828567505 }, { "auxiliary_loss_clip": 0.01292484, "auxiliary_loss_mlp": 0.01194791, "balance_loss_clip": 1.00848794, "balance_loss_mlp": 1.0004319, "epoch": 0.46257440028858293, "flos": 14465078958240.0, "grad_norm": 1.8959937512310028, "language_loss": 0.72974467, "learning_rate": 2.3380016406436984e-06, "loss": 0.75461739, "num_input_tokens_seen": 82713725, "step": 3847, "time_per_iteration": 2.7803378105163574 }, { "auxiliary_loss_clip": 0.01275059, "auxiliary_loss_mlp": 0.01194554, "balance_loss_clip": 1.00853848, "balance_loss_mlp": 1.00038612, "epoch": 0.46269464317922204, "flos": 23332216055040.0, "grad_norm": 1.9597677631611639, "language_loss": 0.81224954, "learning_rate": 2.337233848176524e-06, "loss": 0.83694565, "num_input_tokens_seen": 82731495, "step": 3848, "time_per_iteration": 2.827376127243042 }, { "auxiliary_loss_clip": 0.01287604, "auxiliary_loss_mlp": 0.01194842, "balance_loss_clip": 1.00832009, "balance_loss_mlp": 1.0004828, "epoch": 0.4628148860698611, "flos": 18552037633920.0, "grad_norm": 1.8464037395600637, "language_loss": 0.83134413, "learning_rate": 2.3364660045512435e-06, "loss": 0.85616863, "num_input_tokens_seen": 82750255, "step": 3849, "time_per_iteration": 3.7867860794067383 }, { "auxiliary_loss_clip": 0.01288149, "auxiliary_loss_mlp": 0.0119386, "balance_loss_clip": 1.0045439, "balance_loss_mlp": 1.00007331, "epoch": 0.4629351289605002, "flos": 70667608438080.0, "grad_norm": 0.7421372557507756, "language_loss": 0.58208615, "learning_rate": 2.335698109884337e-06, "loss": 0.60690624, "num_input_tokens_seen": 82815460, "step": 3850, "time_per_iteration": 4.584868907928467 }, { "auxiliary_loss_clip": 0.01267123, "auxiliary_loss_mlp": 0.01193937, "balance_loss_clip": 1.00851345, "balance_loss_mlp": 1.0001502, "epoch": 0.4630553718511393, "flos": 59687236673280.0, "grad_norm": 0.7889634285049374, "language_loss": 0.59860158, "learning_rate": 2.334930164292294e-06, "loss": 0.62321222, "num_input_tokens_seen": 82878010, "step": 3851, "time_per_iteration": 3.6011462211608887 }, { "auxiliary_loss_clip": 0.01299461, "auxiliary_loss_mlp": 0.01194848, "balance_loss_clip": 1.0090251, "balance_loss_mlp": 1.00058508, "epoch": 0.4631756147417784, "flos": 15960614961600.0, "grad_norm": 1.9198958099034729, "language_loss": 0.79910517, "learning_rate": 2.334162167891612e-06, "loss": 0.82404828, "num_input_tokens_seen": 82895275, "step": 3852, "time_per_iteration": 4.965965270996094 }, { "auxiliary_loss_clip": 0.01336434, "auxiliary_loss_mlp": 0.01194774, "balance_loss_clip": 1.00948989, "balance_loss_mlp": 1.00060606, "epoch": 0.4632958576324175, "flos": 16472917252800.0, "grad_norm": 2.02515078741828, "language_loss": 0.74724334, "learning_rate": 2.333394120798795e-06, "loss": 0.77255541, "num_input_tokens_seen": 82914010, "step": 3853, "time_per_iteration": 2.729126214981079 }, { "auxiliary_loss_clip": 0.01323755, "auxiliary_loss_mlp": 0.01194649, "balance_loss_clip": 1.00830662, "balance_loss_mlp": 1.00038552, "epoch": 0.4634161005230566, "flos": 22346503608960.0, "grad_norm": 2.043524384901351, "language_loss": 0.72214222, "learning_rate": 2.3326260231303545e-06, "loss": 0.74732625, "num_input_tokens_seen": 82932610, "step": 3854, "time_per_iteration": 2.8501856327056885 }, { "auxiliary_loss_clip": 0.01359683, "auxiliary_loss_mlp": 0.01194439, "balance_loss_clip": 1.00930679, "balance_loss_mlp": 1.00046182, "epoch": 0.46353634341369565, "flos": 15742242318240.0, "grad_norm": 1.5572569066287676, "language_loss": 0.86557686, "learning_rate": 2.331857875002811e-06, "loss": 0.89111805, "num_input_tokens_seen": 82951210, "step": 3855, "time_per_iteration": 2.7507824897766113 }, { "auxiliary_loss_clip": 0.01322579, "auxiliary_loss_mlp": 0.01194732, "balance_loss_clip": 1.00909412, "balance_loss_mlp": 1.00046885, "epoch": 0.46365658630433476, "flos": 28329833103840.0, "grad_norm": 1.643243637574927, "language_loss": 0.76057827, "learning_rate": 2.3310896765326916e-06, "loss": 0.78575134, "num_input_tokens_seen": 82972210, "step": 3856, "time_per_iteration": 2.8390204906463623 }, { "auxiliary_loss_clip": 0.01312436, "auxiliary_loss_mlp": 0.01194556, "balance_loss_clip": 1.00989842, "balance_loss_mlp": 1.00048363, "epoch": 0.46377682919497387, "flos": 24608086162560.0, "grad_norm": 1.5540528076240836, "language_loss": 0.84185582, "learning_rate": 2.330321427836531e-06, "loss": 0.86692584, "num_input_tokens_seen": 82994080, "step": 3857, "time_per_iteration": 2.812654733657837 }, { "auxiliary_loss_clip": 0.01347853, "auxiliary_loss_mlp": 0.01194537, "balance_loss_clip": 1.00968814, "balance_loss_mlp": 1.00036907, "epoch": 0.4638970720856129, "flos": 19060963099200.0, "grad_norm": 1.8462099216247734, "language_loss": 0.82446921, "learning_rate": 2.3295531290308733e-06, "loss": 0.84989309, "num_input_tokens_seen": 83012230, "step": 3858, "time_per_iteration": 2.767544746398926 }, { "auxiliary_loss_clip": 0.01360691, "auxiliary_loss_mlp": 0.00872538, "balance_loss_clip": 1.01001239, "balance_loss_mlp": 1.000247, "epoch": 0.46401731497625204, "flos": 18471020230080.0, "grad_norm": 2.7531347415917016, "language_loss": 0.75401103, "learning_rate": 2.3287847802322678e-06, "loss": 0.77634335, "num_input_tokens_seen": 83027800, "step": 3859, "time_per_iteration": 2.7092392444610596 }, { "auxiliary_loss_clip": 0.01312647, "auxiliary_loss_mlp": 0.01195154, "balance_loss_clip": 1.00987768, "balance_loss_mlp": 1.00060451, "epoch": 0.4641375578668911, "flos": 26067065068800.0, "grad_norm": 1.866236629936186, "language_loss": 0.83738017, "learning_rate": 2.3280163815572723e-06, "loss": 0.86245817, "num_input_tokens_seen": 83048395, "step": 3860, "time_per_iteration": 2.848602056503296 }, { "auxiliary_loss_clip": 0.01325778, "auxiliary_loss_mlp": 0.0119463, "balance_loss_clip": 1.00873578, "balance_loss_mlp": 1.00036657, "epoch": 0.4642578007575302, "flos": 19570391496000.0, "grad_norm": 3.0222523529095278, "language_loss": 0.77235901, "learning_rate": 2.3272479331224522e-06, "loss": 0.79756308, "num_input_tokens_seen": 83065825, "step": 3861, "time_per_iteration": 2.841370105743408 }, { "auxiliary_loss_clip": 0.01360078, "auxiliary_loss_mlp": 0.01194904, "balance_loss_clip": 1.00935495, "balance_loss_mlp": 1.00044966, "epoch": 0.4643780436481693, "flos": 28186263067680.0, "grad_norm": 2.143802405903658, "language_loss": 0.78121126, "learning_rate": 2.3264794350443817e-06, "loss": 0.80676103, "num_input_tokens_seen": 83087920, "step": 3862, "time_per_iteration": 2.7906081676483154 }, { "auxiliary_loss_clip": 0.0134782, "auxiliary_loss_mlp": 0.01194825, "balance_loss_clip": 1.00928259, "balance_loss_mlp": 1.00056148, "epoch": 0.46449828653880837, "flos": 25375282270560.0, "grad_norm": 1.927830786366498, "language_loss": 0.78674436, "learning_rate": 2.3257108874396396e-06, "loss": 0.81217074, "num_input_tokens_seen": 83109015, "step": 3863, "time_per_iteration": 2.7926642894744873 }, { "auxiliary_loss_clip": 0.01329331, "auxiliary_loss_mlp": 0.01194612, "balance_loss_clip": 1.00881839, "balance_loss_mlp": 1.00044394, "epoch": 0.4646185294294475, "flos": 16034339858400.0, "grad_norm": 1.8871535196459266, "language_loss": 0.73523903, "learning_rate": 2.3249422904248152e-06, "loss": 0.76047844, "num_input_tokens_seen": 83127450, "step": 3864, "time_per_iteration": 2.7581255435943604 }, { "auxiliary_loss_clip": 0.01347654, "auxiliary_loss_mlp": 0.01194568, "balance_loss_clip": 1.0090766, "balance_loss_mlp": 1.0003041, "epoch": 0.4647387723200866, "flos": 26363114213760.0, "grad_norm": 1.4610826444375433, "language_loss": 0.87165773, "learning_rate": 2.324173644116504e-06, "loss": 0.89707994, "num_input_tokens_seen": 83150300, "step": 3865, "time_per_iteration": 2.8456051349639893 }, { "auxiliary_loss_clip": 0.01334266, "auxiliary_loss_mlp": 0.01194609, "balance_loss_clip": 1.00902271, "balance_loss_mlp": 1.00044107, "epoch": 0.46485901521072565, "flos": 27160221538080.0, "grad_norm": 1.6437272524837456, "language_loss": 0.8175298, "learning_rate": 2.3234049486313087e-06, "loss": 0.84281856, "num_input_tokens_seen": 83171750, "step": 3866, "time_per_iteration": 2.762181043624878 }, { "auxiliary_loss_clip": 0.01340114, "auxiliary_loss_mlp": 0.01194334, "balance_loss_clip": 1.00861084, "balance_loss_mlp": 1.00035703, "epoch": 0.46497925810136476, "flos": 24279862609440.0, "grad_norm": 1.7134392525052986, "language_loss": 0.75709832, "learning_rate": 2.322636204085839e-06, "loss": 0.78244281, "num_input_tokens_seen": 83191820, "step": 3867, "time_per_iteration": 2.750857353210449 }, { "auxiliary_loss_clip": 0.01335553, "auxiliary_loss_mlp": 0.01194386, "balance_loss_clip": 1.00936675, "balance_loss_mlp": 1.00031328, "epoch": 0.46509950099200387, "flos": 16253143585920.0, "grad_norm": 2.6747253874302404, "language_loss": 0.78884459, "learning_rate": 2.3218674105967143e-06, "loss": 0.81414402, "num_input_tokens_seen": 83210085, "step": 3868, "time_per_iteration": 2.745415449142456 }, { "auxiliary_loss_clip": 0.01324382, "auxiliary_loss_mlp": 0.011947, "balance_loss_clip": 1.00890446, "balance_loss_mlp": 1.00034118, "epoch": 0.4652197438826429, "flos": 23442282506880.0, "grad_norm": 1.61683173338044, "language_loss": 0.83498073, "learning_rate": 2.3210985682805593e-06, "loss": 0.86017156, "num_input_tokens_seen": 83231865, "step": 3869, "time_per_iteration": 2.8291919231414795 }, { "auxiliary_loss_clip": 0.01359557, "auxiliary_loss_mlp": 0.01194709, "balance_loss_clip": 1.00968909, "balance_loss_mlp": 1.00044584, "epoch": 0.46533998677328203, "flos": 16216406870400.0, "grad_norm": 2.4482334465043496, "language_loss": 0.68137157, "learning_rate": 2.320329677254007e-06, "loss": 0.70691419, "num_input_tokens_seen": 83249195, "step": 3870, "time_per_iteration": 2.7017221450805664 }, { "auxiliary_loss_clip": 0.01359464, "auxiliary_loss_mlp": 0.01194784, "balance_loss_clip": 1.00937891, "balance_loss_mlp": 1.00042534, "epoch": 0.46546022966392114, "flos": 21141879664320.0, "grad_norm": 2.0791191233337813, "language_loss": 0.72312582, "learning_rate": 2.319560737633697e-06, "loss": 0.74866831, "num_input_tokens_seen": 83267915, "step": 3871, "time_per_iteration": 2.7069270610809326 }, { "auxiliary_loss_clip": 0.01315496, "auxiliary_loss_mlp": 0.01195047, "balance_loss_clip": 1.00929737, "balance_loss_mlp": 1.0004977, "epoch": 0.4655804725545602, "flos": 41171958469440.0, "grad_norm": 1.5492257827713494, "language_loss": 0.68228519, "learning_rate": 2.3187917495362775e-06, "loss": 0.70739061, "num_input_tokens_seen": 83292325, "step": 3872, "time_per_iteration": 3.1330575942993164 }, { "auxiliary_loss_clip": 0.01291297, "auxiliary_loss_mlp": 0.01194603, "balance_loss_clip": 1.00848973, "balance_loss_mlp": 1.00043452, "epoch": 0.4657007154451993, "flos": 19570966274880.0, "grad_norm": 5.468991358205068, "language_loss": 0.76894534, "learning_rate": 2.318022713078403e-06, "loss": 0.79380441, "num_input_tokens_seen": 83306905, "step": 3873, "time_per_iteration": 2.79156756401062 }, { "auxiliary_loss_clip": 0.01319347, "auxiliary_loss_mlp": 0.01194495, "balance_loss_clip": 1.0085175, "balance_loss_mlp": 1.00032735, "epoch": 0.4658209583358384, "flos": 15517834496640.0, "grad_norm": 3.6376860604793366, "language_loss": 0.85251886, "learning_rate": 2.3172536283767354e-06, "loss": 0.87765729, "num_input_tokens_seen": 83320665, "step": 3874, "time_per_iteration": 2.7949397563934326 }, { "auxiliary_loss_clip": 0.01305749, "auxiliary_loss_mlp": 0.01194828, "balance_loss_clip": 1.00895357, "balance_loss_mlp": 1.00056505, "epoch": 0.4659412012264775, "flos": 14903189344800.0, "grad_norm": 1.9600167430219628, "language_loss": 0.81190813, "learning_rate": 2.3164844955479447e-06, "loss": 0.83691388, "num_input_tokens_seen": 83336475, "step": 3875, "time_per_iteration": 3.7107481956481934 }, { "auxiliary_loss_clip": 0.01281465, "auxiliary_loss_mlp": 0.01194915, "balance_loss_clip": 1.00834, "balance_loss_mlp": 1.00055635, "epoch": 0.4660614441171166, "flos": 24425623990080.0, "grad_norm": 1.9376023560386395, "language_loss": 0.70833075, "learning_rate": 2.3157153147087082e-06, "loss": 0.73309457, "num_input_tokens_seen": 83358365, "step": 3876, "time_per_iteration": 3.9449636936187744 }, { "auxiliary_loss_clip": 0.01272457, "auxiliary_loss_mlp": 0.01194618, "balance_loss_clip": 1.00773156, "balance_loss_mlp": 1.0004499, "epoch": 0.46618168700775564, "flos": 22091106860640.0, "grad_norm": 1.943984480896672, "language_loss": 0.83203173, "learning_rate": 2.314946085975709e-06, "loss": 0.85670245, "num_input_tokens_seen": 83377345, "step": 3877, "time_per_iteration": 2.881598472595215 }, { "auxiliary_loss_clip": 0.01297077, "auxiliary_loss_mlp": 0.01194628, "balance_loss_clip": 1.00855303, "balance_loss_mlp": 1.00055504, "epoch": 0.46630192989839475, "flos": 26176987825920.0, "grad_norm": 1.6575702625126767, "language_loss": 0.82444429, "learning_rate": 2.3141768094656393e-06, "loss": 0.84936142, "num_input_tokens_seen": 83395920, "step": 3878, "time_per_iteration": 3.925563335418701 }, { "auxiliary_loss_clip": 0.01256757, "auxiliary_loss_mlp": 0.01194679, "balance_loss_clip": 1.00684464, "balance_loss_mlp": 1.00041568, "epoch": 0.46642217278903386, "flos": 11509630032960.0, "grad_norm": 2.302204240140027, "language_loss": 0.82966739, "learning_rate": 2.3134074852951966e-06, "loss": 0.85418177, "num_input_tokens_seen": 83412510, "step": 3879, "time_per_iteration": 3.0748419761657715 }, { "auxiliary_loss_clip": 0.01297849, "auxiliary_loss_mlp": 0.01194584, "balance_loss_clip": 1.00934875, "balance_loss_mlp": 1.00041628, "epoch": 0.4665424156796729, "flos": 32306833098720.0, "grad_norm": 1.6859120293811671, "language_loss": 0.77734047, "learning_rate": 2.312638113581088e-06, "loss": 0.80226481, "num_input_tokens_seen": 83432995, "step": 3880, "time_per_iteration": 3.2535665035247803 }, { "auxiliary_loss_clip": 0.01347398, "auxiliary_loss_mlp": 0.01194543, "balance_loss_clip": 1.00889528, "balance_loss_mlp": 1.00037479, "epoch": 0.46666265857031203, "flos": 18436187469600.0, "grad_norm": 2.384271366835294, "language_loss": 0.78191221, "learning_rate": 2.311868694440027e-06, "loss": 0.80733168, "num_input_tokens_seen": 83447415, "step": 3881, "time_per_iteration": 2.7720417976379395 }, { "auxiliary_loss_clip": 0.01329431, "auxiliary_loss_mlp": 0.01193901, "balance_loss_clip": 1.0038681, "balance_loss_mlp": 1.00011432, "epoch": 0.46678290146095114, "flos": 68439026537280.0, "grad_norm": 0.8039341249777483, "language_loss": 0.62517011, "learning_rate": 2.3110992279887323e-06, "loss": 0.65040344, "num_input_tokens_seen": 83519340, "step": 3882, "time_per_iteration": 3.4340622425079346 }, { "auxiliary_loss_clip": 0.0130137, "auxiliary_loss_mlp": 0.01195008, "balance_loss_clip": 1.008111, "balance_loss_mlp": 1.00045812, "epoch": 0.4669031443515902, "flos": 17712517652640.0, "grad_norm": 2.2109473926269794, "language_loss": 0.85086954, "learning_rate": 2.310329714343932e-06, "loss": 0.87583333, "num_input_tokens_seen": 83535490, "step": 3883, "time_per_iteration": 2.795729875564575 }, { "auxiliary_loss_clip": 0.01322677, "auxiliary_loss_mlp": 0.01194547, "balance_loss_clip": 1.00942659, "balance_loss_mlp": 1.00037909, "epoch": 0.4670233872422293, "flos": 23947759298880.0, "grad_norm": 1.9187219210850845, "language_loss": 0.81977445, "learning_rate": 2.309560153622361e-06, "loss": 0.84494668, "num_input_tokens_seen": 83552400, "step": 3884, "time_per_iteration": 2.7644283771514893 }, { "auxiliary_loss_clip": 0.01291913, "auxiliary_loss_mlp": 0.01194689, "balance_loss_clip": 1.00747228, "balance_loss_mlp": 1.00042546, "epoch": 0.4671436301328684, "flos": 28111280842080.0, "grad_norm": 1.985219915195415, "language_loss": 0.74692005, "learning_rate": 2.3087905459407602e-06, "loss": 0.77178609, "num_input_tokens_seen": 83571340, "step": 3885, "time_per_iteration": 2.9105124473571777 }, { "auxiliary_loss_clip": 0.01302633, "auxiliary_loss_mlp": 0.0119385, "balance_loss_clip": 1.00395584, "balance_loss_mlp": 1.00006378, "epoch": 0.46726387302350747, "flos": 69369717114720.0, "grad_norm": 0.7904217053887154, "language_loss": 0.62911975, "learning_rate": 2.3080208914158795e-06, "loss": 0.65408456, "num_input_tokens_seen": 83634340, "step": 3886, "time_per_iteration": 3.3194258213043213 }, { "auxiliary_loss_clip": 0.01311869, "auxiliary_loss_mlp": 0.01194917, "balance_loss_clip": 1.00871992, "balance_loss_mlp": 1.00046325, "epoch": 0.4673841159141466, "flos": 25519678551360.0, "grad_norm": 2.0021961354376043, "language_loss": 0.72363174, "learning_rate": 2.3072511901644753e-06, "loss": 0.74869955, "num_input_tokens_seen": 83653410, "step": 3887, "time_per_iteration": 2.797513961791992 }, { "auxiliary_loss_clip": 0.01359577, "auxiliary_loss_mlp": 0.01194545, "balance_loss_clip": 1.00940394, "balance_loss_mlp": 1.00047207, "epoch": 0.4675043588047857, "flos": 24499277039520.0, "grad_norm": 1.7875642499672646, "language_loss": 0.81089735, "learning_rate": 2.306481442303309e-06, "loss": 0.8364386, "num_input_tokens_seen": 83672985, "step": 3888, "time_per_iteration": 2.828317880630493 }, { "auxiliary_loss_clip": 0.01338108, "auxiliary_loss_mlp": 0.01194787, "balance_loss_clip": 1.00857139, "balance_loss_mlp": 1.00042868, "epoch": 0.46762460169542475, "flos": 20960782591680.0, "grad_norm": 1.778897057317415, "language_loss": 0.73216343, "learning_rate": 2.3057116479491515e-06, "loss": 0.75749236, "num_input_tokens_seen": 83692395, "step": 3889, "time_per_iteration": 2.741604804992676 }, { "auxiliary_loss_clip": 0.01348168, "auxiliary_loss_mlp": 0.0119454, "balance_loss_clip": 1.00937641, "balance_loss_mlp": 1.00027657, "epoch": 0.46774484458606386, "flos": 19171676177280.0, "grad_norm": 1.8315787437775046, "language_loss": 0.75969255, "learning_rate": 2.30494180721878e-06, "loss": 0.78511965, "num_input_tokens_seen": 83709735, "step": 3890, "time_per_iteration": 2.7253777980804443 }, { "auxiliary_loss_clip": 0.01347279, "auxiliary_loss_mlp": 0.0119434, "balance_loss_clip": 1.00918734, "balance_loss_mlp": 1.0003624, "epoch": 0.4678650874767029, "flos": 17967698858880.0, "grad_norm": 1.7870858769257265, "language_loss": 0.89856398, "learning_rate": 2.3041719202289794e-06, "loss": 0.92398012, "num_input_tokens_seen": 83725910, "step": 3891, "time_per_iteration": 2.694922685623169 }, { "auxiliary_loss_clip": 0.0134155, "auxiliary_loss_mlp": 0.01194683, "balance_loss_clip": 1.00893831, "balance_loss_mlp": 1.00041986, "epoch": 0.467985330367342, "flos": 21360827086560.0, "grad_norm": 1.6241787125165754, "language_loss": 0.80407012, "learning_rate": 2.30340198709654e-06, "loss": 0.82943249, "num_input_tokens_seen": 83745745, "step": 3892, "time_per_iteration": 2.8263821601867676 }, { "auxiliary_loss_clip": 0.01335191, "auxiliary_loss_mlp": 0.01194783, "balance_loss_clip": 1.00921106, "balance_loss_mlp": 1.00042391, "epoch": 0.46810557325798113, "flos": 20521845960480.0, "grad_norm": 1.8861594148800334, "language_loss": 0.74041152, "learning_rate": 2.3026320079382605e-06, "loss": 0.76571131, "num_input_tokens_seen": 83762680, "step": 3893, "time_per_iteration": 2.810096502304077 }, { "auxiliary_loss_clip": 0.01358972, "auxiliary_loss_mlp": 0.01194441, "balance_loss_clip": 1.0092845, "balance_loss_mlp": 1.000368, "epoch": 0.4682258161486202, "flos": 30117861807840.0, "grad_norm": 1.892509924376635, "language_loss": 0.76646769, "learning_rate": 2.3018619828709454e-06, "loss": 0.79200184, "num_input_tokens_seen": 83784220, "step": 3894, "time_per_iteration": 2.6693992614746094 }, { "auxiliary_loss_clip": 0.01333653, "auxiliary_loss_mlp": 0.00872411, "balance_loss_clip": 1.0090456, "balance_loss_mlp": 1.00010061, "epoch": 0.4683460590392593, "flos": 25293366774720.0, "grad_norm": 1.8984202195602535, "language_loss": 0.81646127, "learning_rate": 2.3010919120114084e-06, "loss": 0.83852184, "num_input_tokens_seen": 83800750, "step": 3895, "time_per_iteration": 2.7472357749938965 }, { "auxiliary_loss_clip": 0.01347548, "auxiliary_loss_mlp": 0.01194829, "balance_loss_clip": 1.00904715, "balance_loss_mlp": 1.00047016, "epoch": 0.4684663019298984, "flos": 15368337053280.0, "grad_norm": 2.16505589521039, "language_loss": 0.66001695, "learning_rate": 2.3003217954764672e-06, "loss": 0.68544066, "num_input_tokens_seen": 83815455, "step": 3896, "time_per_iteration": 2.671520471572876 }, { "auxiliary_loss_clip": 0.01348229, "auxiliary_loss_mlp": 0.01194855, "balance_loss_clip": 1.00919998, "balance_loss_mlp": 1.00040054, "epoch": 0.46858654482053747, "flos": 27778854218400.0, "grad_norm": 1.626115510208231, "language_loss": 0.79047453, "learning_rate": 2.299551633382949e-06, "loss": 0.81590533, "num_input_tokens_seen": 83835765, "step": 3897, "time_per_iteration": 2.7771267890930176 }, { "auxiliary_loss_clip": 0.0132384, "auxiliary_loss_mlp": 0.01194536, "balance_loss_clip": 1.00865257, "balance_loss_mlp": 1.00046301, "epoch": 0.4687067877111766, "flos": 18040633434720.0, "grad_norm": 1.733418185085938, "language_loss": 0.85378224, "learning_rate": 2.2987814258476854e-06, "loss": 0.87896597, "num_input_tokens_seen": 83853565, "step": 3898, "time_per_iteration": 2.6845035552978516 }, { "auxiliary_loss_clip": 0.01296898, "auxiliary_loss_mlp": 0.01194811, "balance_loss_clip": 1.00845456, "balance_loss_mlp": 1.00035703, "epoch": 0.4688270306018157, "flos": 16977388181760.0, "grad_norm": 2.0812215811221577, "language_loss": 0.67683822, "learning_rate": 2.2980111729875177e-06, "loss": 0.70175529, "num_input_tokens_seen": 83869815, "step": 3899, "time_per_iteration": 2.7971463203430176 }, { "auxiliary_loss_clip": 0.01310065, "auxiliary_loss_mlp": 0.01194853, "balance_loss_clip": 1.00857878, "balance_loss_mlp": 1.0003984, "epoch": 0.46894727349245474, "flos": 17821650088800.0, "grad_norm": 1.7342843969628694, "language_loss": 0.82357103, "learning_rate": 2.2972408749192917e-06, "loss": 0.84862018, "num_input_tokens_seen": 83887545, "step": 3900, "time_per_iteration": 3.94329833984375 }, { "auxiliary_loss_clip": 0.01335414, "auxiliary_loss_mlp": 0.00872337, "balance_loss_clip": 1.00869548, "balance_loss_mlp": 1.00013089, "epoch": 0.46906751638309385, "flos": 21471360546240.0, "grad_norm": 1.7738113866123966, "language_loss": 0.67032111, "learning_rate": 2.296470531759861e-06, "loss": 0.69239867, "num_input_tokens_seen": 83905645, "step": 3901, "time_per_iteration": 2.7835636138916016 }, { "auxiliary_loss_clip": 0.01303994, "auxiliary_loss_mlp": 0.0119458, "balance_loss_clip": 1.00827277, "balance_loss_mlp": 1.0003171, "epoch": 0.46918775927373296, "flos": 20337839069760.0, "grad_norm": 1.919402561206941, "language_loss": 0.7941947, "learning_rate": 2.2957001436260866e-06, "loss": 0.81918049, "num_input_tokens_seen": 83922705, "step": 3902, "time_per_iteration": 3.604515314102173 }, { "auxiliary_loss_clip": 0.01327447, "auxiliary_loss_mlp": 0.01194676, "balance_loss_clip": 1.00891471, "balance_loss_mlp": 1.0004127, "epoch": 0.469308002164372, "flos": 18403258664160.0, "grad_norm": 1.5259233810305008, "language_loss": 0.72964138, "learning_rate": 2.294929710634836e-06, "loss": 0.75486255, "num_input_tokens_seen": 83940795, "step": 3903, "time_per_iteration": 2.8191514015197754 }, { "auxiliary_loss_clip": 0.01347757, "auxiliary_loss_mlp": 0.01194719, "balance_loss_clip": 1.00946021, "balance_loss_mlp": 1.00036061, "epoch": 0.46942824505501113, "flos": 37962082735200.0, "grad_norm": 2.4954319580295654, "language_loss": 0.61471677, "learning_rate": 2.2941592329029823e-06, "loss": 0.64014149, "num_input_tokens_seen": 83961900, "step": 3904, "time_per_iteration": 2.8692829608917236 }, { "auxiliary_loss_clip": 0.01348147, "auxiliary_loss_mlp": 0.01194518, "balance_loss_clip": 1.00953329, "balance_loss_mlp": 1.00034952, "epoch": 0.46954848794565024, "flos": 21872518675200.0, "grad_norm": 1.916693474489061, "language_loss": 0.7909435, "learning_rate": 2.2933887105474067e-06, "loss": 0.81637013, "num_input_tokens_seen": 83980075, "step": 3905, "time_per_iteration": 3.6724531650543213 }, { "auxiliary_loss_clip": 0.01335132, "auxiliary_loss_mlp": 0.01194398, "balance_loss_clip": 1.0084796, "balance_loss_mlp": 1.0003252, "epoch": 0.4696687308362893, "flos": 22016555719200.0, "grad_norm": 1.496037322791068, "language_loss": 0.8156029, "learning_rate": 2.2926181436849974e-06, "loss": 0.84089828, "num_input_tokens_seen": 83999430, "step": 3906, "time_per_iteration": 2.712630033493042 }, { "auxiliary_loss_clip": 0.01334649, "auxiliary_loss_mlp": 0.01194663, "balance_loss_clip": 1.00854921, "balance_loss_mlp": 1.0004952, "epoch": 0.4697889737269284, "flos": 21613673253600.0, "grad_norm": 1.6160412147570276, "language_loss": 0.72713196, "learning_rate": 2.2918475324326478e-06, "loss": 0.75242507, "num_input_tokens_seen": 84019150, "step": 3907, "time_per_iteration": 2.760091543197632 }, { "auxiliary_loss_clip": 0.01338662, "auxiliary_loss_mlp": 0.00872481, "balance_loss_clip": 1.00910234, "balance_loss_mlp": 1.0002377, "epoch": 0.46990921661756746, "flos": 25228335408480.0, "grad_norm": 8.091317999604602, "language_loss": 0.91543204, "learning_rate": 2.2910768769072603e-06, "loss": 0.93754345, "num_input_tokens_seen": 84037930, "step": 3908, "time_per_iteration": 2.760113477706909 }, { "auxiliary_loss_clip": 0.01347656, "auxiliary_loss_mlp": 0.01194395, "balance_loss_clip": 1.00939071, "balance_loss_mlp": 1.00032258, "epoch": 0.47002945950820657, "flos": 13844039391360.0, "grad_norm": 2.0531917717359667, "language_loss": 0.75960314, "learning_rate": 2.2903061772257417e-06, "loss": 0.78502369, "num_input_tokens_seen": 84055915, "step": 3909, "time_per_iteration": 2.6889095306396484 }, { "auxiliary_loss_clip": 0.01336893, "auxiliary_loss_mlp": 0.01194372, "balance_loss_clip": 1.00859022, "balance_loss_mlp": 1.00039506, "epoch": 0.4701497023988457, "flos": 26247012583680.0, "grad_norm": 1.51417952454455, "language_loss": 0.78624046, "learning_rate": 2.289535433505007e-06, "loss": 0.81155312, "num_input_tokens_seen": 84077270, "step": 3910, "time_per_iteration": 2.7517683506011963 }, { "auxiliary_loss_clip": 0.01322186, "auxiliary_loss_mlp": 0.0119442, "balance_loss_clip": 1.00937808, "balance_loss_mlp": 1.00044239, "epoch": 0.47026994528948474, "flos": 25629529461120.0, "grad_norm": 2.361153315165952, "language_loss": 0.63809764, "learning_rate": 2.2887646458619767e-06, "loss": 0.66326368, "num_input_tokens_seen": 84098635, "step": 3911, "time_per_iteration": 2.8016488552093506 }, { "auxiliary_loss_clip": 0.01305147, "auxiliary_loss_mlp": 0.0119487, "balance_loss_clip": 1.00845468, "balance_loss_mlp": 1.00051117, "epoch": 0.47039018818012385, "flos": 20554415529120.0, "grad_norm": 1.6253786184099164, "language_loss": 0.76290792, "learning_rate": 2.2879938144135797e-06, "loss": 0.78790814, "num_input_tokens_seen": 84114740, "step": 3912, "time_per_iteration": 2.8023264408111572 }, { "auxiliary_loss_clip": 0.01308727, "auxiliary_loss_mlp": 0.00872388, "balance_loss_clip": 1.008708, "balance_loss_mlp": 1.00023425, "epoch": 0.47051043107076296, "flos": 21577259851200.0, "grad_norm": 1.528849290724675, "language_loss": 0.75151443, "learning_rate": 2.2872229392767496e-06, "loss": 0.77332556, "num_input_tokens_seen": 84134845, "step": 3913, "time_per_iteration": 2.8290324211120605 }, { "auxiliary_loss_clip": 0.01345978, "auxiliary_loss_mlp": 0.01194596, "balance_loss_clip": 1.00905955, "balance_loss_mlp": 1.0004282, "epoch": 0.470630673961402, "flos": 18953195762880.0, "grad_norm": 1.4716953676861695, "language_loss": 0.74583077, "learning_rate": 2.286452020568428e-06, "loss": 0.77123654, "num_input_tokens_seen": 84152920, "step": 3914, "time_per_iteration": 2.8299753665924072 }, { "auxiliary_loss_clip": 0.01361132, "auxiliary_loss_mlp": 0.01194698, "balance_loss_clip": 1.0095017, "balance_loss_mlp": 1.00043464, "epoch": 0.4707509168520411, "flos": 19938980056320.0, "grad_norm": 1.6091216040967267, "language_loss": 0.73020381, "learning_rate": 2.2856810584055637e-06, "loss": 0.7557621, "num_input_tokens_seen": 84170455, "step": 3915, "time_per_iteration": 2.6772241592407227 }, { "auxiliary_loss_clip": 0.01339195, "auxiliary_loss_mlp": 0.01194767, "balance_loss_clip": 1.00858831, "balance_loss_mlp": 1.00050402, "epoch": 0.47087115974268023, "flos": 40118771846880.0, "grad_norm": 1.4662816793408604, "language_loss": 0.67833531, "learning_rate": 2.2849100529051085e-06, "loss": 0.70367497, "num_input_tokens_seen": 84197390, "step": 3916, "time_per_iteration": 2.908970832824707 }, { "auxiliary_loss_clip": 0.01359589, "auxiliary_loss_mlp": 0.01194477, "balance_loss_clip": 1.00952303, "balance_loss_mlp": 1.0004046, "epoch": 0.4709914026333193, "flos": 13552732172160.0, "grad_norm": 2.1730969704088956, "language_loss": 0.80299407, "learning_rate": 2.284139004184026e-06, "loss": 0.82853472, "num_input_tokens_seen": 84214620, "step": 3917, "time_per_iteration": 2.71285080909729 }, { "auxiliary_loss_clip": 0.013594, "auxiliary_loss_mlp": 0.01194793, "balance_loss_clip": 1.00927258, "balance_loss_mlp": 1.00052953, "epoch": 0.4711116455239584, "flos": 19974639061440.0, "grad_norm": 1.855281680995342, "language_loss": 0.74159974, "learning_rate": 2.2833679123592814e-06, "loss": 0.7671417, "num_input_tokens_seen": 84231880, "step": 3918, "time_per_iteration": 2.670668125152588 }, { "auxiliary_loss_clip": 0.01312429, "auxiliary_loss_mlp": 0.01194461, "balance_loss_clip": 1.00782204, "balance_loss_mlp": 1.00038862, "epoch": 0.4712318884145975, "flos": 32124837934080.0, "grad_norm": 1.5914730060646727, "language_loss": 0.63582683, "learning_rate": 2.2825967775478508e-06, "loss": 0.66089571, "num_input_tokens_seen": 84252980, "step": 3919, "time_per_iteration": 2.9119479656219482 }, { "auxiliary_loss_clip": 0.0135965, "auxiliary_loss_mlp": 0.01194334, "balance_loss_clip": 1.00913262, "balance_loss_mlp": 1.00035667, "epoch": 0.47135213130523657, "flos": 20047861026720.0, "grad_norm": 1.9694049682635675, "language_loss": 0.83453155, "learning_rate": 2.2818255998667135e-06, "loss": 0.86007142, "num_input_tokens_seen": 84271490, "step": 3920, "time_per_iteration": 2.702794313430786 }, { "auxiliary_loss_clip": 0.01335893, "auxiliary_loss_mlp": 0.01194336, "balance_loss_clip": 1.00897694, "balance_loss_mlp": 1.00035906, "epoch": 0.4714723741958757, "flos": 19427001078240.0, "grad_norm": 1.5176544207641247, "language_loss": 0.78808284, "learning_rate": 2.2810543794328566e-06, "loss": 0.81338513, "num_input_tokens_seen": 84290525, "step": 3921, "time_per_iteration": 2.829726457595825 }, { "auxiliary_loss_clip": 0.01347892, "auxiliary_loss_mlp": 0.01194496, "balance_loss_clip": 1.00963569, "balance_loss_mlp": 1.00032759, "epoch": 0.4715926170865148, "flos": 20373893235360.0, "grad_norm": 1.6647355639182206, "language_loss": 0.82449734, "learning_rate": 2.2802831163632735e-06, "loss": 0.84992123, "num_input_tokens_seen": 84309245, "step": 3922, "time_per_iteration": 2.8021554946899414 }, { "auxiliary_loss_clip": 0.01255964, "auxiliary_loss_mlp": 0.01194898, "balance_loss_clip": 1.00668716, "balance_loss_mlp": 1.0004437, "epoch": 0.47171285997715384, "flos": 22672895054400.0, "grad_norm": 1.6756911781956083, "language_loss": 0.74510312, "learning_rate": 2.279511810774965e-06, "loss": 0.76961172, "num_input_tokens_seen": 84330775, "step": 3923, "time_per_iteration": 2.9857943058013916 }, { "auxiliary_loss_clip": 0.01359426, "auxiliary_loss_mlp": 0.01194518, "balance_loss_clip": 1.00899172, "balance_loss_mlp": 1.0003494, "epoch": 0.47183310286779295, "flos": 21105430338240.0, "grad_norm": 1.8662848741345972, "language_loss": 0.71170998, "learning_rate": 2.2787404627849364e-06, "loss": 0.73724937, "num_input_tokens_seen": 84349985, "step": 3924, "time_per_iteration": 2.726473569869995 }, { "auxiliary_loss_clip": 0.01335627, "auxiliary_loss_mlp": 0.01194316, "balance_loss_clip": 1.0096705, "balance_loss_mlp": 1.0004338, "epoch": 0.471953345758432, "flos": 21726577676160.0, "grad_norm": 1.6736598242359881, "language_loss": 0.78863251, "learning_rate": 2.277969072510202e-06, "loss": 0.81393188, "num_input_tokens_seen": 84368965, "step": 3925, "time_per_iteration": 2.794475793838501 }, { "auxiliary_loss_clip": 0.01315822, "auxiliary_loss_mlp": 0.0119449, "balance_loss_clip": 1.00741291, "balance_loss_mlp": 1.00041747, "epoch": 0.4720735886490711, "flos": 19861088012640.0, "grad_norm": 1.515368900330448, "language_loss": 0.81093442, "learning_rate": 2.2771976400677803e-06, "loss": 0.83603752, "num_input_tokens_seen": 84387795, "step": 3926, "time_per_iteration": 2.818152904510498 }, { "auxiliary_loss_clip": 0.01285145, "auxiliary_loss_mlp": 0.01194307, "balance_loss_clip": 1.00841331, "balance_loss_mlp": 1.00032938, "epoch": 0.47219383153971023, "flos": 19171819872000.0, "grad_norm": 1.6481089263640594, "language_loss": 0.78808671, "learning_rate": 2.2764261655746965e-06, "loss": 0.81288123, "num_input_tokens_seen": 84405290, "step": 3927, "time_per_iteration": 3.787705183029175 }, { "auxiliary_loss_clip": 0.01307053, "auxiliary_loss_mlp": 0.01194557, "balance_loss_clip": 1.0081439, "balance_loss_mlp": 1.00048447, "epoch": 0.4723140744303493, "flos": 23224017634560.0, "grad_norm": 1.5415161096304384, "language_loss": 0.7560463, "learning_rate": 2.2756546491479832e-06, "loss": 0.78106236, "num_input_tokens_seen": 84426205, "step": 3928, "time_per_iteration": 3.8778679370880127 }, { "auxiliary_loss_clip": 0.01360356, "auxiliary_loss_mlp": 0.00872499, "balance_loss_clip": 1.00930893, "balance_loss_mlp": 1.00034595, "epoch": 0.4724343173209884, "flos": 18223275225600.0, "grad_norm": 2.1220134985733696, "language_loss": 0.80592376, "learning_rate": 2.274883090904679e-06, "loss": 0.82825232, "num_input_tokens_seen": 84443970, "step": 3929, "time_per_iteration": 2.762626886367798 }, { "auxiliary_loss_clip": 0.01360018, "auxiliary_loss_mlp": 0.01194767, "balance_loss_clip": 1.0095582, "balance_loss_mlp": 1.00050378, "epoch": 0.4725545602116275, "flos": 21251048024160.0, "grad_norm": 2.5799462699001046, "language_loss": 0.68097144, "learning_rate": 2.2741114909618283e-06, "loss": 0.70651931, "num_input_tokens_seen": 84459865, "step": 3930, "time_per_iteration": 3.6127116680145264 }, { "auxiliary_loss_clip": 0.01300219, "auxiliary_loss_mlp": 0.01194465, "balance_loss_clip": 1.00789905, "balance_loss_mlp": 1.00039268, "epoch": 0.47267480310226656, "flos": 21434013128160.0, "grad_norm": 1.6882748592119559, "language_loss": 0.7205286, "learning_rate": 2.2733398494364828e-06, "loss": 0.74547541, "num_input_tokens_seen": 84479110, "step": 3931, "time_per_iteration": 2.8621230125427246 }, { "auxiliary_loss_clip": 0.01310265, "auxiliary_loss_mlp": 0.01194381, "balance_loss_clip": 1.00907183, "balance_loss_mlp": 1.00030816, "epoch": 0.47279504599290567, "flos": 18770518048320.0, "grad_norm": 2.0160576064165827, "language_loss": 0.84481525, "learning_rate": 2.272568166445699e-06, "loss": 0.86986172, "num_input_tokens_seen": 84497675, "step": 3932, "time_per_iteration": 3.7115743160247803 }, { "auxiliary_loss_clip": 0.01340207, "auxiliary_loss_mlp": 0.01194355, "balance_loss_clip": 1.00837314, "balance_loss_mlp": 1.00037777, "epoch": 0.4729152888835448, "flos": 21105753651360.0, "grad_norm": 2.185417769995318, "language_loss": 0.64489782, "learning_rate": 2.271796442106541e-06, "loss": 0.67024338, "num_input_tokens_seen": 84517030, "step": 3933, "time_per_iteration": 2.7280054092407227 }, { "auxiliary_loss_clip": 0.01275945, "auxiliary_loss_mlp": 0.01193949, "balance_loss_clip": 1.00340891, "balance_loss_mlp": 1.00016201, "epoch": 0.47303553177418384, "flos": 70201908665280.0, "grad_norm": 0.797713384472293, "language_loss": 0.56576562, "learning_rate": 2.271024676536079e-06, "loss": 0.59046447, "num_input_tokens_seen": 84577290, "step": 3934, "time_per_iteration": 3.355276107788086 }, { "auxiliary_loss_clip": 0.01316939, "auxiliary_loss_mlp": 0.01194667, "balance_loss_clip": 1.00834084, "balance_loss_mlp": 1.00040364, "epoch": 0.47315577466482295, "flos": 22455133113600.0, "grad_norm": 1.9468099623498105, "language_loss": 0.7347976, "learning_rate": 2.2702528698513894e-06, "loss": 0.75991362, "num_input_tokens_seen": 84598415, "step": 3935, "time_per_iteration": 2.8002517223358154 }, { "auxiliary_loss_clip": 0.01336767, "auxiliary_loss_mlp": 0.01194832, "balance_loss_clip": 1.00974774, "balance_loss_mlp": 1.00047362, "epoch": 0.47327601755546206, "flos": 24352869032640.0, "grad_norm": 1.8011158520011192, "language_loss": 0.78554827, "learning_rate": 2.269481022169554e-06, "loss": 0.81086427, "num_input_tokens_seen": 84617010, "step": 3936, "time_per_iteration": 2.789264440536499 }, { "auxiliary_loss_clip": 0.01328876, "auxiliary_loss_mlp": 0.01194945, "balance_loss_clip": 1.00883043, "balance_loss_mlp": 1.00058615, "epoch": 0.4733962604461011, "flos": 22926783008160.0, "grad_norm": 1.7061090604921862, "language_loss": 0.80466092, "learning_rate": 2.2687091336076614e-06, "loss": 0.82989907, "num_input_tokens_seen": 84636350, "step": 3937, "time_per_iteration": 2.810673236846924 }, { "auxiliary_loss_clip": 0.01334652, "auxiliary_loss_mlp": 0.01194438, "balance_loss_clip": 1.00826812, "balance_loss_mlp": 1.00036526, "epoch": 0.4735165033367402, "flos": 18327378346560.0, "grad_norm": 2.696380675908824, "language_loss": 0.80262053, "learning_rate": 2.267937204282807e-06, "loss": 0.8279115, "num_input_tokens_seen": 84653490, "step": 3938, "time_per_iteration": 2.6888046264648438 }, { "auxiliary_loss_clip": 0.01341596, "auxiliary_loss_mlp": 0.01194629, "balance_loss_clip": 1.00902128, "balance_loss_mlp": 1.00036538, "epoch": 0.4736367462273793, "flos": 23037029078400.0, "grad_norm": 1.9418918593308727, "language_loss": 0.78917778, "learning_rate": 2.2671652343120926e-06, "loss": 0.81454003, "num_input_tokens_seen": 84673965, "step": 3939, "time_per_iteration": 2.818824291229248 }, { "auxiliary_loss_clip": 0.01358935, "auxiliary_loss_mlp": 0.01194518, "balance_loss_clip": 1.00922048, "balance_loss_mlp": 1.00044489, "epoch": 0.4737569891180184, "flos": 25374348254880.0, "grad_norm": 1.6605678760965807, "language_loss": 0.80444139, "learning_rate": 2.2663932238126236e-06, "loss": 0.82997584, "num_input_tokens_seen": 84692525, "step": 3940, "time_per_iteration": 2.7004811763763428 }, { "auxiliary_loss_clip": 0.0134713, "auxiliary_loss_mlp": 0.01194478, "balance_loss_clip": 1.00910115, "balance_loss_mlp": 1.00040543, "epoch": 0.4738772320086575, "flos": 25849339051680.0, "grad_norm": 1.340775440748873, "language_loss": 0.80321521, "learning_rate": 2.265621172901515e-06, "loss": 0.82863128, "num_input_tokens_seen": 84715640, "step": 3941, "time_per_iteration": 2.862948179244995 }, { "auxiliary_loss_clip": 0.01360071, "auxiliary_loss_mlp": 0.0119444, "balance_loss_clip": 1.00963926, "balance_loss_mlp": 1.00036693, "epoch": 0.47399747489929656, "flos": 27564433179840.0, "grad_norm": 2.066561340309734, "language_loss": 0.71472013, "learning_rate": 2.2648490816958854e-06, "loss": 0.74026525, "num_input_tokens_seen": 84736635, "step": 3942, "time_per_iteration": 2.801447629928589 }, { "auxiliary_loss_clip": 0.01348441, "auxiliary_loss_mlp": 0.01194628, "balance_loss_clip": 1.0097394, "balance_loss_mlp": 1.0003643, "epoch": 0.47411771778993567, "flos": 24863662529280.0, "grad_norm": 1.955840820544961, "language_loss": 0.7296738, "learning_rate": 2.264076950312861e-06, "loss": 0.75510448, "num_input_tokens_seen": 84755445, "step": 3943, "time_per_iteration": 2.838704824447632 }, { "auxiliary_loss_clip": 0.01323299, "auxiliary_loss_mlp": 0.01194659, "balance_loss_clip": 1.00876963, "balance_loss_mlp": 1.00039518, "epoch": 0.4742379606805748, "flos": 22748020974720.0, "grad_norm": 1.9278165401370297, "language_loss": 0.82983214, "learning_rate": 2.2633047788695727e-06, "loss": 0.8550117, "num_input_tokens_seen": 84775750, "step": 3944, "time_per_iteration": 2.729691743850708 }, { "auxiliary_loss_clip": 0.01317154, "auxiliary_loss_mlp": 0.01194336, "balance_loss_clip": 1.00821733, "balance_loss_mlp": 1.00035834, "epoch": 0.47435820357121383, "flos": 19681140497760.0, "grad_norm": 1.6687315330321422, "language_loss": 0.63822365, "learning_rate": 2.262532567483159e-06, "loss": 0.66333854, "num_input_tokens_seen": 84794310, "step": 3945, "time_per_iteration": 2.8193023204803467 }, { "auxiliary_loss_clip": 0.01360857, "auxiliary_loss_mlp": 0.00872535, "balance_loss_clip": 1.0099597, "balance_loss_mlp": 1.00042665, "epoch": 0.47447844646185294, "flos": 25228730568960.0, "grad_norm": 1.8284763356785172, "language_loss": 0.80368757, "learning_rate": 2.2617603162707635e-06, "loss": 0.82602149, "num_input_tokens_seen": 84814720, "step": 3946, "time_per_iteration": 2.7955939769744873 }, { "auxiliary_loss_clip": 0.01359162, "auxiliary_loss_mlp": 0.01194714, "balance_loss_clip": 1.00926793, "balance_loss_mlp": 1.00045073, "epoch": 0.47459868935249205, "flos": 24570630973440.0, "grad_norm": 1.9962216409580433, "language_loss": 0.8242473, "learning_rate": 2.2609880253495363e-06, "loss": 0.84978604, "num_input_tokens_seen": 84834355, "step": 3947, "time_per_iteration": 2.780334234237671 }, { "auxiliary_loss_clip": 0.0131063, "auxiliary_loss_mlp": 0.01194562, "balance_loss_clip": 1.00950837, "balance_loss_mlp": 1.00039446, "epoch": 0.4747189322431311, "flos": 20558510828640.0, "grad_norm": 1.8261480544614033, "language_loss": 0.862867, "learning_rate": 2.260215694836633e-06, "loss": 0.88791889, "num_input_tokens_seen": 84853530, "step": 3948, "time_per_iteration": 2.8415322303771973 }, { "auxiliary_loss_clip": 0.01297564, "auxiliary_loss_mlp": 0.00872575, "balance_loss_clip": 1.00831354, "balance_loss_mlp": 1.00045347, "epoch": 0.4748391751337702, "flos": 25995244127040.0, "grad_norm": 2.2416205044982958, "language_loss": 0.64896798, "learning_rate": 2.2594433248492157e-06, "loss": 0.67066938, "num_input_tokens_seen": 84872505, "step": 3949, "time_per_iteration": 2.8973093032836914 }, { "auxiliary_loss_clip": 0.01348568, "auxiliary_loss_mlp": 0.0119453, "balance_loss_clip": 1.00972319, "balance_loss_mlp": 1.00036168, "epoch": 0.47495941802440933, "flos": 22821063321600.0, "grad_norm": 2.2469520796658693, "language_loss": 0.80141288, "learning_rate": 2.2586709155044527e-06, "loss": 0.82684386, "num_input_tokens_seen": 84893105, "step": 3950, "time_per_iteration": 2.7683348655700684 }, { "auxiliary_loss_clip": 0.01360569, "auxiliary_loss_mlp": 0.0119431, "balance_loss_clip": 1.01007044, "balance_loss_mlp": 1.00033236, "epoch": 0.4750796609150484, "flos": 27891794564640.0, "grad_norm": 1.464119961734393, "language_loss": 0.76011705, "learning_rate": 2.2578984669195167e-06, "loss": 0.78566581, "num_input_tokens_seen": 84914070, "step": 3951, "time_per_iteration": 2.722506523132324 }, { "auxiliary_loss_clip": 0.01347072, "auxiliary_loss_mlp": 0.01194182, "balance_loss_clip": 1.00895596, "balance_loss_mlp": 1.00029993, "epoch": 0.4751999038056875, "flos": 35660099250720.0, "grad_norm": 1.9507680587113316, "language_loss": 0.6791743, "learning_rate": 2.2571259792115887e-06, "loss": 0.70458686, "num_input_tokens_seen": 84935290, "step": 3952, "time_per_iteration": 2.8224077224731445 }, { "auxiliary_loss_clip": 0.0134654, "auxiliary_loss_mlp": 0.01194453, "balance_loss_clip": 1.00935912, "balance_loss_mlp": 1.00038004, "epoch": 0.4753201466963266, "flos": 22090891318560.0, "grad_norm": 1.7699611704375693, "language_loss": 0.79395604, "learning_rate": 2.2563534524978544e-06, "loss": 0.81936598, "num_input_tokens_seen": 84952760, "step": 3953, "time_per_iteration": 3.7145705223083496 }, { "auxiliary_loss_clip": 0.01292621, "auxiliary_loss_mlp": 0.01194465, "balance_loss_clip": 1.0084672, "balance_loss_mlp": 1.00039256, "epoch": 0.47544038958696566, "flos": 30190868231040.0, "grad_norm": 1.583267089019732, "language_loss": 0.7048673, "learning_rate": 2.2555808868955052e-06, "loss": 0.72973818, "num_input_tokens_seen": 84974890, "step": 3954, "time_per_iteration": 3.7700343132019043 }, { "auxiliary_loss_clip": 0.01276301, "auxiliary_loss_mlp": 0.01194759, "balance_loss_clip": 1.00803971, "balance_loss_mlp": 1.00040054, "epoch": 0.47556063247760477, "flos": 23472229646880.0, "grad_norm": 1.9021902177630898, "language_loss": 0.73301494, "learning_rate": 2.254808282521738e-06, "loss": 0.7577256, "num_input_tokens_seen": 84993640, "step": 3955, "time_per_iteration": 2.9298901557922363 }, { "auxiliary_loss_clip": 0.01309423, "auxiliary_loss_mlp": 0.00872509, "balance_loss_clip": 1.00794578, "balance_loss_mlp": 1.00028515, "epoch": 0.4756808753682438, "flos": 25155221214240.0, "grad_norm": 1.810874550809394, "language_loss": 0.8108924, "learning_rate": 2.2540356394937573e-06, "loss": 0.8327117, "num_input_tokens_seen": 85012340, "step": 3956, "time_per_iteration": 3.770538806915283 }, { "auxiliary_loss_clip": 0.01311166, "auxiliary_loss_mlp": 0.01194835, "balance_loss_clip": 1.00912309, "balance_loss_mlp": 1.00047588, "epoch": 0.47580111825888294, "flos": 15669738826560.0, "grad_norm": 2.059819308654651, "language_loss": 0.83778727, "learning_rate": 2.253262957928772e-06, "loss": 0.86284727, "num_input_tokens_seen": 85029225, "step": 3957, "time_per_iteration": 3.7506206035614014 }, { "auxiliary_loss_clip": 0.01335022, "auxiliary_loss_mlp": 0.01194682, "balance_loss_clip": 1.00884795, "balance_loss_mlp": 1.00051367, "epoch": 0.47592136114952205, "flos": 17636565487680.0, "grad_norm": 1.478709113465423, "language_loss": 0.71959239, "learning_rate": 2.2524902379439976e-06, "loss": 0.74488944, "num_input_tokens_seen": 85047895, "step": 3958, "time_per_iteration": 2.739081859588623 }, { "auxiliary_loss_clip": 0.01240396, "auxiliary_loss_mlp": 0.01193956, "balance_loss_clip": 1.007671, "balance_loss_mlp": 1.0001694, "epoch": 0.4760416040401611, "flos": 61417189995840.0, "grad_norm": 0.7407844236084765, "language_loss": 0.63717067, "learning_rate": 2.251717479656655e-06, "loss": 0.66151416, "num_input_tokens_seen": 85112690, "step": 3959, "time_per_iteration": 3.617831230163574 }, { "auxiliary_loss_clip": 0.01360679, "auxiliary_loss_mlp": 0.01194462, "balance_loss_clip": 1.00963867, "balance_loss_mlp": 1.00038958, "epoch": 0.4761618469308002, "flos": 18405881092800.0, "grad_norm": 2.466017334283841, "language_loss": 0.76359332, "learning_rate": 2.2509446831839704e-06, "loss": 0.78914469, "num_input_tokens_seen": 85132130, "step": 3960, "time_per_iteration": 2.8721096515655518 }, { "auxiliary_loss_clip": 0.01325956, "auxiliary_loss_mlp": 0.01194622, "balance_loss_clip": 1.0091548, "balance_loss_mlp": 1.00035834, "epoch": 0.4762820898214393, "flos": 18040920824160.0, "grad_norm": 2.4956831219042024, "language_loss": 0.82577324, "learning_rate": 2.250171848643177e-06, "loss": 0.85097903, "num_input_tokens_seen": 85149420, "step": 3961, "time_per_iteration": 2.8052380084991455 }, { "auxiliary_loss_clip": 0.01311564, "auxiliary_loss_mlp": 0.0119426, "balance_loss_clip": 1.00780916, "balance_loss_mlp": 1.00028241, "epoch": 0.4764023327120784, "flos": 19318263802560.0, "grad_norm": 2.095995995477349, "language_loss": 0.86136687, "learning_rate": 2.249398976151513e-06, "loss": 0.88642508, "num_input_tokens_seen": 85166970, "step": 3962, "time_per_iteration": 2.760596513748169 }, { "auxiliary_loss_clip": 0.01358866, "auxiliary_loss_mlp": 0.01194553, "balance_loss_clip": 1.00917673, "balance_loss_mlp": 1.00047994, "epoch": 0.4765225756027175, "flos": 22747266577440.0, "grad_norm": 2.275504190669548, "language_loss": 0.78683686, "learning_rate": 2.248626065826223e-06, "loss": 0.81237108, "num_input_tokens_seen": 85185175, "step": 3963, "time_per_iteration": 2.7831740379333496 }, { "auxiliary_loss_clip": 0.01328548, "auxiliary_loss_mlp": 0.01193901, "balance_loss_clip": 1.0034368, "balance_loss_mlp": 1.0001148, "epoch": 0.4766428184933566, "flos": 65933421832800.0, "grad_norm": 0.7602163778513019, "language_loss": 0.62550926, "learning_rate": 2.2478531177845564e-06, "loss": 0.65073371, "num_input_tokens_seen": 85246170, "step": 3964, "time_per_iteration": 3.2666099071502686 }, { "auxiliary_loss_clip": 0.01319375, "auxiliary_loss_mlp": 0.01194539, "balance_loss_clip": 1.00865448, "balance_loss_mlp": 1.00037062, "epoch": 0.47676306138399566, "flos": 24136508115360.0, "grad_norm": 1.7071569232702484, "language_loss": 0.84777784, "learning_rate": 2.247080132143769e-06, "loss": 0.872917, "num_input_tokens_seen": 85268525, "step": 3965, "time_per_iteration": 2.8341753482818604 }, { "auxiliary_loss_clip": 0.01310597, "auxiliary_loss_mlp": 0.01194834, "balance_loss_clip": 1.00852764, "balance_loss_mlp": 1.00047469, "epoch": 0.47688330427463477, "flos": 12604331220480.0, "grad_norm": 2.2003606310159514, "language_loss": 0.69505632, "learning_rate": 2.246307109021121e-06, "loss": 0.72011054, "num_input_tokens_seen": 85285930, "step": 3966, "time_per_iteration": 2.763329267501831 }, { "auxiliary_loss_clip": 0.01336374, "auxiliary_loss_mlp": 0.0119421, "balance_loss_clip": 1.0098505, "balance_loss_mlp": 1.00032794, "epoch": 0.4770035471652739, "flos": 21390594608160.0, "grad_norm": 1.6284540164912278, "language_loss": 0.82474953, "learning_rate": 2.2455340485338817e-06, "loss": 0.85005534, "num_input_tokens_seen": 85303565, "step": 3967, "time_per_iteration": 2.7564220428466797 }, { "auxiliary_loss_clip": 0.01346666, "auxiliary_loss_mlp": 0.01194269, "balance_loss_clip": 1.0090704, "balance_loss_mlp": 1.00038719, "epoch": 0.47712379005591293, "flos": 25156263000960.0, "grad_norm": 1.8762385152204217, "language_loss": 0.67836392, "learning_rate": 2.244760950799322e-06, "loss": 0.70377332, "num_input_tokens_seen": 85321835, "step": 3968, "time_per_iteration": 2.6883444786071777 }, { "auxiliary_loss_clip": 0.01299954, "auxiliary_loss_mlp": 0.01194501, "balance_loss_clip": 1.00850654, "balance_loss_mlp": 1.00042844, "epoch": 0.47724403294655204, "flos": 22054334221440.0, "grad_norm": 1.8050141382314089, "language_loss": 0.72523808, "learning_rate": 2.2439878159347203e-06, "loss": 0.75018263, "num_input_tokens_seen": 85341260, "step": 3969, "time_per_iteration": 2.682131290435791 }, { "auxiliary_loss_clip": 0.01328602, "auxiliary_loss_mlp": 0.01193915, "balance_loss_clip": 1.00356197, "balance_loss_mlp": 1.00012815, "epoch": 0.4773642758371911, "flos": 70229426204160.0, "grad_norm": 0.7330139101479207, "language_loss": 0.55316365, "learning_rate": 2.2432146440573616e-06, "loss": 0.57838881, "num_input_tokens_seen": 85407220, "step": 3970, "time_per_iteration": 3.253282070159912 }, { "auxiliary_loss_clip": 0.01308542, "auxiliary_loss_mlp": 0.01194439, "balance_loss_clip": 1.00920177, "balance_loss_mlp": 1.00036597, "epoch": 0.4774845187278302, "flos": 23548612896000.0, "grad_norm": 1.7066628708416183, "language_loss": 0.66516447, "learning_rate": 2.242441435284534e-06, "loss": 0.69019425, "num_input_tokens_seen": 85426095, "step": 3971, "time_per_iteration": 2.7733848094940186 }, { "auxiliary_loss_clip": 0.01337515, "auxiliary_loss_mlp": 0.0119478, "balance_loss_clip": 1.00878572, "balance_loss_mlp": 1.00051606, "epoch": 0.4776047616184693, "flos": 23075382359520.0, "grad_norm": 2.2180013367674962, "language_loss": 0.85383081, "learning_rate": 2.2416681897335337e-06, "loss": 0.87915373, "num_input_tokens_seen": 85444245, "step": 3972, "time_per_iteration": 2.6776719093322754 }, { "auxiliary_loss_clip": 0.01269486, "auxiliary_loss_mlp": 0.01194439, "balance_loss_clip": 1.00745821, "balance_loss_mlp": 1.00036633, "epoch": 0.4777250045091084, "flos": 31898130996960.0, "grad_norm": 1.77296750095711, "language_loss": 0.67164242, "learning_rate": 2.240894907521661e-06, "loss": 0.69628167, "num_input_tokens_seen": 85463325, "step": 3973, "time_per_iteration": 2.887017011642456 }, { "auxiliary_loss_clip": 0.01327436, "auxiliary_loss_mlp": 0.01194599, "balance_loss_clip": 1.00911641, "balance_loss_mlp": 1.00043058, "epoch": 0.4778452473997475, "flos": 24278174196480.0, "grad_norm": 1.7116593305178287, "language_loss": 0.63933384, "learning_rate": 2.240121588766223e-06, "loss": 0.66455418, "num_input_tokens_seen": 85483375, "step": 3974, "time_per_iteration": 2.778887987136841 }, { "auxiliary_loss_clip": 0.01324482, "auxiliary_loss_mlp": 0.01194206, "balance_loss_clip": 1.00887406, "balance_loss_mlp": 1.00032353, "epoch": 0.4779654902903866, "flos": 31575044530080.0, "grad_norm": 1.7339824003643673, "language_loss": 0.71241987, "learning_rate": 2.239348233584531e-06, "loss": 0.73760676, "num_input_tokens_seen": 85504230, "step": 3975, "time_per_iteration": 2.891057014465332 }, { "auxiliary_loss_clip": 0.0134129, "auxiliary_loss_mlp": 0.01194435, "balance_loss_clip": 1.00872993, "balance_loss_mlp": 1.00045729, "epoch": 0.47808573318102565, "flos": 19500438585600.0, "grad_norm": 2.0006940611451456, "language_loss": 0.81019902, "learning_rate": 2.2385748420939013e-06, "loss": 0.83555627, "num_input_tokens_seen": 85523425, "step": 3976, "time_per_iteration": 2.652900218963623 }, { "auxiliary_loss_clip": 0.01358706, "auxiliary_loss_mlp": 0.01194414, "balance_loss_clip": 1.00981641, "balance_loss_mlp": 1.00043678, "epoch": 0.47820597607166476, "flos": 22601145960000.0, "grad_norm": 1.9192723690995515, "language_loss": 0.72395402, "learning_rate": 2.2378014144116583e-06, "loss": 0.74948525, "num_input_tokens_seen": 85542235, "step": 3977, "time_per_iteration": 2.625333309173584 }, { "auxiliary_loss_clip": 0.01360536, "auxiliary_loss_mlp": 0.01194272, "balance_loss_clip": 1.00964189, "balance_loss_mlp": 1.0002948, "epoch": 0.4783262189623039, "flos": 23003022562560.0, "grad_norm": 1.7760969496738706, "language_loss": 0.79533798, "learning_rate": 2.23702795065513e-06, "loss": 0.82088602, "num_input_tokens_seen": 85561815, "step": 3978, "time_per_iteration": 2.6149818897247314 }, { "auxiliary_loss_clip": 0.01315906, "auxiliary_loss_mlp": 0.01193931, "balance_loss_clip": 1.00330496, "balance_loss_mlp": 1.000144, "epoch": 0.47844646185294293, "flos": 49772827152000.0, "grad_norm": 0.9921117313423437, "language_loss": 0.67557919, "learning_rate": 2.2362544509416493e-06, "loss": 0.70067751, "num_input_tokens_seen": 85613930, "step": 3979, "time_per_iteration": 4.075470924377441 }, { "auxiliary_loss_clip": 0.0132177, "auxiliary_loss_mlp": 0.0119422, "balance_loss_clip": 1.00872421, "balance_loss_mlp": 1.00033784, "epoch": 0.47856670474358204, "flos": 20229568801920.0, "grad_norm": 1.9274083757804459, "language_loss": 0.82432115, "learning_rate": 2.2354809153885572e-06, "loss": 0.84948099, "num_input_tokens_seen": 85631000, "step": 3980, "time_per_iteration": 3.572193145751953 }, { "auxiliary_loss_clip": 0.01335324, "auxiliary_loss_mlp": 0.01194404, "balance_loss_clip": 1.00824523, "balance_loss_mlp": 1.00033164, "epoch": 0.47868694763422115, "flos": 20990945273760.0, "grad_norm": 1.706425364998982, "language_loss": 0.83089018, "learning_rate": 2.234707344113197e-06, "loss": 0.85618746, "num_input_tokens_seen": 85649095, "step": 3981, "time_per_iteration": 2.722890615463257 }, { "auxiliary_loss_clip": 0.01358368, "auxiliary_loss_mlp": 0.01194254, "balance_loss_clip": 1.00925827, "balance_loss_mlp": 1.00037169, "epoch": 0.4788071905248602, "flos": 19026561422880.0, "grad_norm": 1.7009411958586416, "language_loss": 0.77477896, "learning_rate": 2.233933737232919e-06, "loss": 0.80030513, "num_input_tokens_seen": 85666875, "step": 3982, "time_per_iteration": 2.716888427734375 }, { "auxiliary_loss_clip": 0.01285427, "auxiliary_loss_mlp": 0.0087241, "balance_loss_clip": 1.00803041, "balance_loss_mlp": 1.00023127, "epoch": 0.4789274334154993, "flos": 23002232241600.0, "grad_norm": 2.6007703924117727, "language_loss": 0.78059971, "learning_rate": 2.2331600948650793e-06, "loss": 0.80217803, "num_input_tokens_seen": 85687020, "step": 3983, "time_per_iteration": 4.891873598098755 }, { "auxiliary_loss_clip": 0.01298738, "auxiliary_loss_mlp": 0.00872519, "balance_loss_clip": 1.00866687, "balance_loss_mlp": 1.00016916, "epoch": 0.4790476763061384, "flos": 23075597901600.0, "grad_norm": 1.4084502397571703, "language_loss": 0.80228037, "learning_rate": 2.2323864171270386e-06, "loss": 0.82399297, "num_input_tokens_seen": 85708290, "step": 3984, "time_per_iteration": 2.8260653018951416 }, { "auxiliary_loss_clip": 0.01307824, "auxiliary_loss_mlp": 0.01194546, "balance_loss_clip": 1.00835872, "balance_loss_mlp": 1.00037766, "epoch": 0.4791679191967775, "flos": 21179298929760.0, "grad_norm": 1.745593959312615, "language_loss": 0.72578895, "learning_rate": 2.231612704136164e-06, "loss": 0.75081265, "num_input_tokens_seen": 85728660, "step": 3985, "time_per_iteration": 2.7679765224456787 }, { "auxiliary_loss_clip": 0.01347792, "auxiliary_loss_mlp": 0.01194759, "balance_loss_clip": 1.00991464, "balance_loss_mlp": 1.00049555, "epoch": 0.4792881620874166, "flos": 22301504447040.0, "grad_norm": 2.4443079537831913, "language_loss": 0.74494827, "learning_rate": 2.2308389560098253e-06, "loss": 0.7703737, "num_input_tokens_seen": 85745035, "step": 3986, "time_per_iteration": 2.7261548042297363 }, { "auxiliary_loss_clip": 0.01290813, "auxiliary_loss_mlp": 0.01194422, "balance_loss_clip": 1.00953317, "balance_loss_mlp": 1.00034904, "epoch": 0.47940840497805565, "flos": 17420887120320.0, "grad_norm": 1.9135046526581756, "language_loss": 0.76882905, "learning_rate": 2.2300651728654008e-06, "loss": 0.79368138, "num_input_tokens_seen": 85760295, "step": 3987, "time_per_iteration": 2.726531982421875 }, { "auxiliary_loss_clip": 0.01306165, "auxiliary_loss_mlp": 0.00871731, "balance_loss_clip": 1.00349641, "balance_loss_mlp": 1.00008285, "epoch": 0.47952864786869476, "flos": 65358207672480.0, "grad_norm": 0.8556815430608382, "language_loss": 0.60187107, "learning_rate": 2.229291354820272e-06, "loss": 0.62365001, "num_input_tokens_seen": 85821305, "step": 3988, "time_per_iteration": 3.2723803520202637 }, { "auxiliary_loss_clip": 0.01348197, "auxiliary_loss_mlp": 0.01194881, "balance_loss_clip": 1.00973845, "balance_loss_mlp": 1.00061703, "epoch": 0.47964889075933387, "flos": 16799811629760.0, "grad_norm": 1.8626917032334218, "language_loss": 0.75750017, "learning_rate": 2.228517501991828e-06, "loss": 0.78293103, "num_input_tokens_seen": 85840105, "step": 3989, "time_per_iteration": 2.6712896823883057 }, { "auxiliary_loss_clip": 0.01290241, "auxiliary_loss_mlp": 0.01193972, "balance_loss_clip": 1.00328934, "balance_loss_mlp": 1.00018501, "epoch": 0.4797691336499729, "flos": 70079282134560.0, "grad_norm": 0.8117581500092124, "language_loss": 0.61081755, "learning_rate": 2.22774361449746e-06, "loss": 0.63565969, "num_input_tokens_seen": 85896585, "step": 3990, "time_per_iteration": 3.3335516452789307 }, { "auxiliary_loss_clip": 0.01254314, "auxiliary_loss_mlp": 0.01194484, "balance_loss_clip": 1.00734353, "balance_loss_mlp": 1.00041127, "epoch": 0.47988937654061203, "flos": 18953339457600.0, "grad_norm": 2.68894987784501, "language_loss": 0.7006287, "learning_rate": 2.2269696924545668e-06, "loss": 0.72511667, "num_input_tokens_seen": 85914415, "step": 3991, "time_per_iteration": 2.911768674850464 }, { "auxiliary_loss_clip": 0.01299458, "auxiliary_loss_mlp": 0.01194385, "balance_loss_clip": 1.00830626, "balance_loss_mlp": 1.00050282, "epoch": 0.48000961943125114, "flos": 14461989521760.0, "grad_norm": 2.2598937177291774, "language_loss": 0.78259623, "learning_rate": 2.2261957359805523e-06, "loss": 0.80753469, "num_input_tokens_seen": 85931650, "step": 3992, "time_per_iteration": 2.780597686767578 }, { "auxiliary_loss_clip": 0.01359339, "auxiliary_loss_mlp": 0.01194343, "balance_loss_clip": 1.00942039, "balance_loss_mlp": 1.0003655, "epoch": 0.4801298623218902, "flos": 27051160949280.0, "grad_norm": 1.7704176377238792, "language_loss": 0.73928273, "learning_rate": 2.225421745192823e-06, "loss": 0.7648195, "num_input_tokens_seen": 85951805, "step": 3993, "time_per_iteration": 2.730710744857788 }, { "auxiliary_loss_clip": 0.01334516, "auxiliary_loss_mlp": 0.01194658, "balance_loss_clip": 1.00851929, "balance_loss_mlp": 1.00048971, "epoch": 0.4802501052125293, "flos": 26355246927840.0, "grad_norm": 2.2473279441506024, "language_loss": 0.78594899, "learning_rate": 2.2246477202087955e-06, "loss": 0.81124067, "num_input_tokens_seen": 85972485, "step": 3994, "time_per_iteration": 2.7049660682678223 }, { "auxiliary_loss_clip": 0.01332682, "auxiliary_loss_mlp": 0.01194252, "balance_loss_clip": 1.00889945, "balance_loss_mlp": 1.00037026, "epoch": 0.4803703481031684, "flos": 20993926939200.0, "grad_norm": 1.5235046902091207, "language_loss": 0.8304556, "learning_rate": 2.223873661145887e-06, "loss": 0.85572493, "num_input_tokens_seen": 85992540, "step": 3995, "time_per_iteration": 2.8479232788085938 }, { "auxiliary_loss_clip": 0.01314785, "auxiliary_loss_mlp": 0.00872351, "balance_loss_clip": 1.00886822, "balance_loss_mlp": 1.00016952, "epoch": 0.4804905909938075, "flos": 20703733354080.0, "grad_norm": 1.5478554345667055, "language_loss": 0.71241236, "learning_rate": 2.2230995681215226e-06, "loss": 0.73428375, "num_input_tokens_seen": 86012065, "step": 3996, "time_per_iteration": 2.7819056510925293 }, { "auxiliary_loss_clip": 0.0129827, "auxiliary_loss_mlp": 0.01194363, "balance_loss_clip": 1.00785506, "balance_loss_mlp": 1.00038505, "epoch": 0.4806108338844466, "flos": 16654840570080.0, "grad_norm": 1.7925660789152826, "language_loss": 0.77989101, "learning_rate": 2.2223254412531305e-06, "loss": 0.80481732, "num_input_tokens_seen": 86029435, "step": 3997, "time_per_iteration": 2.78426194190979 }, { "auxiliary_loss_clip": 0.0132196, "auxiliary_loss_mlp": 0.01194065, "balance_loss_clip": 1.00790703, "balance_loss_mlp": 1.00027835, "epoch": 0.4807310767750857, "flos": 20011339853280.0, "grad_norm": 2.048611433054459, "language_loss": 0.82604134, "learning_rate": 2.221551280658146e-06, "loss": 0.85120153, "num_input_tokens_seen": 86048495, "step": 3998, "time_per_iteration": 2.8160791397094727 }, { "auxiliary_loss_clip": 0.01285815, "auxiliary_loss_mlp": 0.01194321, "balance_loss_clip": 1.0077343, "balance_loss_mlp": 1.00034416, "epoch": 0.48085131966572475, "flos": 23185269192960.0, "grad_norm": 1.643127548070138, "language_loss": 0.74402308, "learning_rate": 2.2207770864540085e-06, "loss": 0.76882446, "num_input_tokens_seen": 86067470, "step": 3999, "time_per_iteration": 2.873624801635742 }, { "auxiliary_loss_clip": 0.01327336, "auxiliary_loss_mlp": 0.0119451, "balance_loss_clip": 1.0089972, "balance_loss_mlp": 1.0004375, "epoch": 0.48097156255636386, "flos": 20558654523360.0, "grad_norm": 1.7539990624283868, "language_loss": 0.72928894, "learning_rate": 2.220002858758162e-06, "loss": 0.75450736, "num_input_tokens_seen": 86085460, "step": 4000, "time_per_iteration": 2.964914321899414 }, { "auxiliary_loss_clip": 0.01315258, "auxiliary_loss_mlp": 0.01193947, "balance_loss_clip": 1.00326693, "balance_loss_mlp": 1.0001601, "epoch": 0.481091805447003, "flos": 70511644732320.0, "grad_norm": 0.8770904889353065, "language_loss": 0.60936284, "learning_rate": 2.2192285976880573e-06, "loss": 0.63445485, "num_input_tokens_seen": 86149715, "step": 4001, "time_per_iteration": 3.259615898132324 }, { "auxiliary_loss_clip": 0.01310704, "auxiliary_loss_mlp": 0.00872155, "balance_loss_clip": 1.00879526, "balance_loss_mlp": 1.00019085, "epoch": 0.48121204833764203, "flos": 36428229374400.0, "grad_norm": 1.6375253061843742, "language_loss": 0.80873299, "learning_rate": 2.2184543033611485e-06, "loss": 0.83056152, "num_input_tokens_seen": 86170795, "step": 4002, "time_per_iteration": 2.92641282081604 }, { "auxiliary_loss_clip": 0.01344769, "auxiliary_loss_mlp": 0.01194307, "balance_loss_clip": 1.00854111, "balance_loss_mlp": 1.00033021, "epoch": 0.48133229122828114, "flos": 27490277198880.0, "grad_norm": 2.1490972548551475, "language_loss": 0.81703365, "learning_rate": 2.2176799758948957e-06, "loss": 0.84242439, "num_input_tokens_seen": 86190955, "step": 4003, "time_per_iteration": 2.8031411170959473 }, { "auxiliary_loss_clip": 0.01325767, "auxiliary_loss_mlp": 0.01194388, "balance_loss_clip": 1.00880694, "balance_loss_mlp": 1.00041091, "epoch": 0.4814525341189202, "flos": 43072819748640.0, "grad_norm": 2.0393535816296198, "language_loss": 0.73313248, "learning_rate": 2.2169056154067635e-06, "loss": 0.7583341, "num_input_tokens_seen": 86214875, "step": 4004, "time_per_iteration": 2.965639352798462 }, { "auxiliary_loss_clip": 0.01334791, "auxiliary_loss_mlp": 0.00872398, "balance_loss_clip": 1.00865591, "balance_loss_mlp": 1.00017619, "epoch": 0.4815727770095593, "flos": 24236911097280.0, "grad_norm": 1.7014537648819932, "language_loss": 0.82462239, "learning_rate": 2.216131222014222e-06, "loss": 0.84669423, "num_input_tokens_seen": 86232950, "step": 4005, "time_per_iteration": 3.6235554218292236 }, { "auxiliary_loss_clip": 0.01303678, "auxiliary_loss_mlp": 0.01194184, "balance_loss_clip": 1.00899458, "balance_loss_mlp": 1.00039697, "epoch": 0.4816930199001984, "flos": 18113639857920.0, "grad_norm": 1.932748677729123, "language_loss": 0.80039901, "learning_rate": 2.2153567958347455e-06, "loss": 0.82537764, "num_input_tokens_seen": 86249160, "step": 4006, "time_per_iteration": 3.622785806655884 }, { "auxiliary_loss_clip": 0.01315485, "auxiliary_loss_mlp": 0.01194167, "balance_loss_clip": 1.00845456, "balance_loss_mlp": 1.00028527, "epoch": 0.48181326279083747, "flos": 17274730579200.0, "grad_norm": 1.9690433143782844, "language_loss": 0.79800344, "learning_rate": 2.214582336985815e-06, "loss": 0.82310003, "num_input_tokens_seen": 86267060, "step": 4007, "time_per_iteration": 2.692622661590576 }, { "auxiliary_loss_clip": 0.01321399, "auxiliary_loss_mlp": 0.01194368, "balance_loss_clip": 1.00885165, "balance_loss_mlp": 1.00039053, "epoch": 0.4819335056814766, "flos": 14903261192160.0, "grad_norm": 2.098360209059538, "language_loss": 0.66091573, "learning_rate": 2.2138078455849142e-06, "loss": 0.68607342, "num_input_tokens_seen": 86285055, "step": 4008, "time_per_iteration": 3.691220760345459 }, { "auxiliary_loss_clip": 0.01334313, "auxiliary_loss_mlp": 0.01194224, "balance_loss_clip": 1.00950813, "balance_loss_mlp": 1.00034177, "epoch": 0.4820537485721157, "flos": 19244898142560.0, "grad_norm": 1.8342039581287393, "language_loss": 0.78524828, "learning_rate": 2.2130333217495334e-06, "loss": 0.81053364, "num_input_tokens_seen": 86304225, "step": 4009, "time_per_iteration": 3.775134563446045 }, { "auxiliary_loss_clip": 0.01309588, "auxiliary_loss_mlp": 0.01194456, "balance_loss_clip": 1.00809836, "balance_loss_mlp": 1.00028777, "epoch": 0.48217399146275475, "flos": 16033801003200.0, "grad_norm": 3.8144975455216943, "language_loss": 0.67483985, "learning_rate": 2.2122587655971665e-06, "loss": 0.69988036, "num_input_tokens_seen": 86319170, "step": 4010, "time_per_iteration": 2.7456064224243164 }, { "auxiliary_loss_clip": 0.01320966, "auxiliary_loss_mlp": 0.0119446, "balance_loss_clip": 1.00786483, "balance_loss_mlp": 1.00029194, "epoch": 0.48229423435339386, "flos": 24134208999840.0, "grad_norm": 1.5289006751839802, "language_loss": 0.64149034, "learning_rate": 2.211484177245314e-06, "loss": 0.66664451, "num_input_tokens_seen": 86338760, "step": 4011, "time_per_iteration": 2.789116382598877 }, { "auxiliary_loss_clip": 0.01360105, "auxiliary_loss_mlp": 0.01194309, "balance_loss_clip": 1.0094676, "balance_loss_mlp": 1.000332, "epoch": 0.48241447724403297, "flos": 23805446591520.0, "grad_norm": 1.829317118654545, "language_loss": 0.72609407, "learning_rate": 2.21070955681148e-06, "loss": 0.75163817, "num_input_tokens_seen": 86357865, "step": 4012, "time_per_iteration": 2.644542932510376 }, { "auxiliary_loss_clip": 0.01297194, "auxiliary_loss_mlp": 0.0119426, "balance_loss_clip": 1.00822425, "balance_loss_mlp": 1.00037789, "epoch": 0.482534720134672, "flos": 23110322891040.0, "grad_norm": 1.4598678830290708, "language_loss": 0.78229773, "learning_rate": 2.209934904413174e-06, "loss": 0.80721223, "num_input_tokens_seen": 86379470, "step": 4013, "time_per_iteration": 2.7748329639434814 }, { "auxiliary_loss_clip": 0.0128504, "auxiliary_loss_mlp": 0.01194428, "balance_loss_clip": 1.00867021, "balance_loss_mlp": 1.00035501, "epoch": 0.48265496302531113, "flos": 20923830334080.0, "grad_norm": 2.3235018737701982, "language_loss": 0.71763235, "learning_rate": 2.2091602201679095e-06, "loss": 0.74242705, "num_input_tokens_seen": 86399080, "step": 4014, "time_per_iteration": 2.8505454063415527 }, { "auxiliary_loss_clip": 0.01309153, "auxiliary_loss_mlp": 0.01194354, "balance_loss_clip": 1.00821447, "balance_loss_mlp": 1.00037694, "epoch": 0.48277520591595025, "flos": 15231161432160.0, "grad_norm": 2.1080706601799433, "language_loss": 0.83188725, "learning_rate": 2.208385504193206e-06, "loss": 0.85692233, "num_input_tokens_seen": 86416580, "step": 4015, "time_per_iteration": 2.7963144779205322 }, { "auxiliary_loss_clip": 0.0135884, "auxiliary_loss_mlp": 0.01194513, "balance_loss_clip": 1.00904846, "balance_loss_mlp": 1.00034499, "epoch": 0.4828954488065893, "flos": 17858674193760.0, "grad_norm": 1.8802226635835306, "language_loss": 0.80656374, "learning_rate": 2.2076107566065873e-06, "loss": 0.83209723, "num_input_tokens_seen": 86434365, "step": 4016, "time_per_iteration": 2.6375882625579834 }, { "auxiliary_loss_clip": 0.01333745, "auxiliary_loss_mlp": 0.01194281, "balance_loss_clip": 1.00970411, "balance_loss_mlp": 1.00039876, "epoch": 0.4830156916972284, "flos": 32087418668640.0, "grad_norm": 2.0203480584727194, "language_loss": 0.75486171, "learning_rate": 2.2068359775255816e-06, "loss": 0.78014195, "num_input_tokens_seen": 86452675, "step": 4017, "time_per_iteration": 2.8008463382720947 }, { "auxiliary_loss_clip": 0.01277062, "auxiliary_loss_mlp": 0.01194301, "balance_loss_clip": 1.00763464, "balance_loss_mlp": 1.00032413, "epoch": 0.48313593458786747, "flos": 21871728354240.0, "grad_norm": 2.209231227367712, "language_loss": 0.78148162, "learning_rate": 2.206061167067723e-06, "loss": 0.80619526, "num_input_tokens_seen": 86470785, "step": 4018, "time_per_iteration": 2.8623745441436768 }, { "auxiliary_loss_clip": 0.01315617, "auxiliary_loss_mlp": 0.01194238, "balance_loss_clip": 1.00871205, "balance_loss_mlp": 1.00035608, "epoch": 0.4832561774785066, "flos": 22601217807360.0, "grad_norm": 2.097718457489654, "language_loss": 0.79638469, "learning_rate": 2.205286325350549e-06, "loss": 0.82148325, "num_input_tokens_seen": 86489850, "step": 4019, "time_per_iteration": 2.7742812633514404 }, { "auxiliary_loss_clip": 0.01282909, "auxiliary_loss_mlp": 0.01194333, "balance_loss_clip": 1.00798607, "balance_loss_mlp": 1.00035572, "epoch": 0.4833764203691457, "flos": 13437349015680.0, "grad_norm": 2.0205658043501837, "language_loss": 0.72078025, "learning_rate": 2.204511452491603e-06, "loss": 0.74555266, "num_input_tokens_seen": 86506475, "step": 4020, "time_per_iteration": 3.050835132598877 }, { "auxiliary_loss_clip": 0.01357922, "auxiliary_loss_mlp": 0.01194071, "balance_loss_clip": 1.00942087, "balance_loss_mlp": 1.00028443, "epoch": 0.48349666325978474, "flos": 44128054020960.0, "grad_norm": 1.6397437265624482, "language_loss": 0.75054926, "learning_rate": 2.2037365486084316e-06, "loss": 0.77606916, "num_input_tokens_seen": 86529715, "step": 4021, "time_per_iteration": 2.8500092029571533 }, { "auxiliary_loss_clip": 0.01309996, "auxiliary_loss_mlp": 0.01194495, "balance_loss_clip": 1.00926018, "balance_loss_mlp": 1.00042248, "epoch": 0.48361690615042385, "flos": 26028388474560.0, "grad_norm": 1.852067226592215, "language_loss": 0.77568811, "learning_rate": 2.2029616138185886e-06, "loss": 0.80073303, "num_input_tokens_seen": 86548715, "step": 4022, "time_per_iteration": 2.825603485107422 }, { "auxiliary_loss_clip": 0.01288937, "auxiliary_loss_mlp": 0.01194421, "balance_loss_clip": 1.00806677, "balance_loss_mlp": 1.00034809, "epoch": 0.48373714904106296, "flos": 22273317567360.0, "grad_norm": 1.5514246761970982, "language_loss": 0.83070797, "learning_rate": 2.202186648239629e-06, "loss": 0.85554153, "num_input_tokens_seen": 86568650, "step": 4023, "time_per_iteration": 2.7748591899871826 }, { "auxiliary_loss_clip": 0.01334826, "auxiliary_loss_mlp": 0.01194334, "balance_loss_clip": 1.00865364, "balance_loss_mlp": 1.00035703, "epoch": 0.483857391931702, "flos": 28292306067360.0, "grad_norm": 1.6840632076784718, "language_loss": 0.71959388, "learning_rate": 2.201411651989117e-06, "loss": 0.74488544, "num_input_tokens_seen": 86590630, "step": 4024, "time_per_iteration": 2.7584478855133057 }, { "auxiliary_loss_clip": 0.01312108, "auxiliary_loss_mlp": 0.00872358, "balance_loss_clip": 1.00814128, "balance_loss_mlp": 1.00015497, "epoch": 0.48397763482234113, "flos": 27418061096640.0, "grad_norm": 1.8083223877304346, "language_loss": 0.78374422, "learning_rate": 2.2006366251846167e-06, "loss": 0.8055889, "num_input_tokens_seen": 86611270, "step": 4025, "time_per_iteration": 2.814182758331299 }, { "auxiliary_loss_clip": 0.01321047, "auxiliary_loss_mlp": 0.01194149, "balance_loss_clip": 1.00904953, "balance_loss_mlp": 1.0003624, "epoch": 0.48409787771298024, "flos": 16797261048480.0, "grad_norm": 1.8068443366501026, "language_loss": 0.75329411, "learning_rate": 2.1998615679436997e-06, "loss": 0.77844602, "num_input_tokens_seen": 86628810, "step": 4026, "time_per_iteration": 2.705143690109253 }, { "auxiliary_loss_clip": 0.0132885, "auxiliary_loss_mlp": 0.01194361, "balance_loss_clip": 1.0086782, "balance_loss_mlp": 1.0003835, "epoch": 0.4842181206036193, "flos": 25083508043520.0, "grad_norm": 2.6180297458843316, "language_loss": 0.77120078, "learning_rate": 2.199086480383942e-06, "loss": 0.79643285, "num_input_tokens_seen": 86648185, "step": 4027, "time_per_iteration": 2.79536509513855 }, { "auxiliary_loss_clip": 0.01335783, "auxiliary_loss_mlp": 0.01195038, "balance_loss_clip": 1.00999761, "balance_loss_mlp": 1.00058365, "epoch": 0.4843383634942584, "flos": 30372324540480.0, "grad_norm": 2.6674755683206026, "language_loss": 0.67538077, "learning_rate": 2.1983113626229234e-06, "loss": 0.70068896, "num_input_tokens_seen": 86667435, "step": 4028, "time_per_iteration": 2.8024752140045166 }, { "auxiliary_loss_clip": 0.01316596, "auxiliary_loss_mlp": 0.00872351, "balance_loss_clip": 1.00893545, "balance_loss_mlp": 1.00018144, "epoch": 0.4844586063848975, "flos": 20413575692640.0, "grad_norm": 1.554406307286459, "language_loss": 0.78578222, "learning_rate": 2.1975362147782293e-06, "loss": 0.80767161, "num_input_tokens_seen": 86686630, "step": 4029, "time_per_iteration": 2.7963707447052 }, { "auxiliary_loss_clip": 0.01297608, "auxiliary_loss_mlp": 0.01193308, "balance_loss_clip": 1.00853384, "balance_loss_mlp": 1.00028384, "epoch": 0.48457884927553657, "flos": 70303761803520.0, "grad_norm": 0.6895708902765594, "language_loss": 0.54104781, "learning_rate": 2.196761036967448e-06, "loss": 0.56595695, "num_input_tokens_seen": 86754595, "step": 4030, "time_per_iteration": 3.469398021697998 }, { "auxiliary_loss_clip": 0.01333894, "auxiliary_loss_mlp": 0.01194133, "balance_loss_clip": 1.00804043, "balance_loss_mlp": 1.00034618, "epoch": 0.4846990921661757, "flos": 19934525520000.0, "grad_norm": 1.6792138526406386, "language_loss": 0.77712464, "learning_rate": 2.1959858293081743e-06, "loss": 0.80240494, "num_input_tokens_seen": 86773730, "step": 4031, "time_per_iteration": 3.655789375305176 }, { "auxiliary_loss_clip": 0.01297032, "auxiliary_loss_mlp": 0.01194186, "balance_loss_clip": 1.00856531, "balance_loss_mlp": 1.00030363, "epoch": 0.4848193350568148, "flos": 23075957138400.0, "grad_norm": 1.5295354763640987, "language_loss": 0.75989777, "learning_rate": 2.1952105919180056e-06, "loss": 0.78480995, "num_input_tokens_seen": 86792985, "step": 4032, "time_per_iteration": 3.768965721130371 }, { "auxiliary_loss_clip": 0.01311169, "auxiliary_loss_mlp": 0.0119432, "balance_loss_clip": 1.0083195, "balance_loss_mlp": 1.00034308, "epoch": 0.48493957794745385, "flos": 22455492350400.0, "grad_norm": 2.271060745133214, "language_loss": 0.68040085, "learning_rate": 2.1944353249145456e-06, "loss": 0.70545572, "num_input_tokens_seen": 86812095, "step": 4033, "time_per_iteration": 2.745793581008911 }, { "auxiliary_loss_clip": 0.01359727, "auxiliary_loss_mlp": 0.01194119, "balance_loss_clip": 1.00980425, "balance_loss_mlp": 1.000332, "epoch": 0.48505982083809296, "flos": 25046124701760.0, "grad_norm": 1.463354038148869, "language_loss": 0.74604684, "learning_rate": 2.193660028415401e-06, "loss": 0.77158529, "num_input_tokens_seen": 86832875, "step": 4034, "time_per_iteration": 2.7209620475769043 }, { "auxiliary_loss_clip": 0.01323268, "auxiliary_loss_mlp": 0.01194335, "balance_loss_clip": 1.00822115, "balance_loss_mlp": 1.00035739, "epoch": 0.485180063728732, "flos": 26761398448320.0, "grad_norm": 1.8201250205402313, "language_loss": 0.81948584, "learning_rate": 2.1928847025381852e-06, "loss": 0.84466189, "num_input_tokens_seen": 86853480, "step": 4035, "time_per_iteration": 4.758514642715454 }, { "auxiliary_loss_clip": 0.01347691, "auxiliary_loss_mlp": 0.01194437, "balance_loss_clip": 1.00921845, "balance_loss_mlp": 1.0003643, "epoch": 0.4853003066193711, "flos": 24059154926880.0, "grad_norm": 1.6658698018866167, "language_loss": 0.83764154, "learning_rate": 2.192109347400512e-06, "loss": 0.86306286, "num_input_tokens_seen": 86873695, "step": 4036, "time_per_iteration": 2.749544858932495 }, { "auxiliary_loss_clip": 0.01325639, "auxiliary_loss_mlp": 0.01194538, "balance_loss_clip": 1.00913405, "balance_loss_mlp": 1.00046539, "epoch": 0.48542054951001024, "flos": 23076388222560.0, "grad_norm": 1.6444830944043705, "language_loss": 0.78968191, "learning_rate": 2.191333963120004e-06, "loss": 0.81488371, "num_input_tokens_seen": 86892675, "step": 4037, "time_per_iteration": 2.720588207244873 }, { "auxiliary_loss_clip": 0.01313888, "auxiliary_loss_mlp": 0.01194239, "balance_loss_clip": 1.0082972, "balance_loss_mlp": 1.00035667, "epoch": 0.4855407924006493, "flos": 25664901076800.0, "grad_norm": 2.77308937874109, "language_loss": 0.70453179, "learning_rate": 2.190558549814286e-06, "loss": 0.72961307, "num_input_tokens_seen": 86912835, "step": 4038, "time_per_iteration": 2.812692165374756 }, { "auxiliary_loss_clip": 0.01334771, "auxiliary_loss_mlp": 0.01194269, "balance_loss_clip": 1.00929487, "balance_loss_mlp": 1.00038743, "epoch": 0.4856610352912884, "flos": 23987944687680.0, "grad_norm": 1.6515309805320861, "language_loss": 0.79177123, "learning_rate": 2.1897831076009872e-06, "loss": 0.81706166, "num_input_tokens_seen": 86932475, "step": 4039, "time_per_iteration": 2.768319606781006 }, { "auxiliary_loss_clip": 0.01336147, "auxiliary_loss_mlp": 0.01194401, "balance_loss_clip": 1.00822532, "balance_loss_mlp": 1.00032842, "epoch": 0.4857812781819275, "flos": 24096825658080.0, "grad_norm": 1.654909237129334, "language_loss": 0.7962361, "learning_rate": 2.1890076365977426e-06, "loss": 0.82154155, "num_input_tokens_seen": 86952300, "step": 4040, "time_per_iteration": 2.737379312515259 }, { "auxiliary_loss_clip": 0.01286798, "auxiliary_loss_mlp": 0.01194017, "balance_loss_clip": 1.00293911, "balance_loss_mlp": 1.00023031, "epoch": 0.48590152107256657, "flos": 56266662754080.0, "grad_norm": 0.8559908130178281, "language_loss": 0.52808857, "learning_rate": 2.188232136922189e-06, "loss": 0.55289674, "num_input_tokens_seen": 87010420, "step": 4041, "time_per_iteration": 3.295186758041382 }, { "auxiliary_loss_clip": 0.01250991, "auxiliary_loss_mlp": 0.01194508, "balance_loss_clip": 1.00790262, "balance_loss_mlp": 1.00033998, "epoch": 0.4860217639632057, "flos": 20046998858400.0, "grad_norm": 1.752774430793591, "language_loss": 0.7574743, "learning_rate": 2.187456608691971e-06, "loss": 0.78192925, "num_input_tokens_seen": 87029295, "step": 4042, "time_per_iteration": 2.866068124771118 }, { "auxiliary_loss_clip": 0.0130292, "auxiliary_loss_mlp": 0.01194658, "balance_loss_clip": 1.00874722, "balance_loss_mlp": 1.00048947, "epoch": 0.4861420068538448, "flos": 17822153020320.0, "grad_norm": 1.7030050852994405, "language_loss": 0.87800425, "learning_rate": 2.1866810520247334e-06, "loss": 0.90298003, "num_input_tokens_seen": 87048165, "step": 4043, "time_per_iteration": 2.7579636573791504 }, { "auxiliary_loss_clip": 0.01346831, "auxiliary_loss_mlp": 0.01194412, "balance_loss_clip": 1.00915742, "balance_loss_mlp": 1.00033903, "epoch": 0.48626224974448384, "flos": 26250137943840.0, "grad_norm": 1.8060722100465818, "language_loss": 0.64584577, "learning_rate": 2.185905467038129e-06, "loss": 0.67125821, "num_input_tokens_seen": 87067070, "step": 4044, "time_per_iteration": 2.7708303928375244 }, { "auxiliary_loss_clip": 0.01357644, "auxiliary_loss_mlp": 0.01194196, "balance_loss_clip": 1.00937462, "balance_loss_mlp": 1.00040913, "epoch": 0.48638249263512295, "flos": 22054513839840.0, "grad_norm": 1.6231500711046467, "language_loss": 0.77695519, "learning_rate": 2.1851298538498127e-06, "loss": 0.80247355, "num_input_tokens_seen": 87086785, "step": 4045, "time_per_iteration": 2.7196431159973145 }, { "auxiliary_loss_clip": 0.01340032, "auxiliary_loss_mlp": 0.00872487, "balance_loss_clip": 1.00952291, "balance_loss_mlp": 1.00018549, "epoch": 0.48650273552576206, "flos": 25119957369600.0, "grad_norm": 1.8348045782794729, "language_loss": 0.79444361, "learning_rate": 2.184354212577446e-06, "loss": 0.81656885, "num_input_tokens_seen": 87107090, "step": 4046, "time_per_iteration": 2.7443652153015137 }, { "auxiliary_loss_clip": 0.01360059, "auxiliary_loss_mlp": 0.01194397, "balance_loss_clip": 1.00937855, "balance_loss_mlp": 1.00041962, "epoch": 0.4866229784164011, "flos": 17456941285920.0, "grad_norm": 2.7294256081991026, "language_loss": 0.62744284, "learning_rate": 2.1835785433386907e-06, "loss": 0.65298742, "num_input_tokens_seen": 87125905, "step": 4047, "time_per_iteration": 2.7057695388793945 }, { "auxiliary_loss_clip": 0.01297554, "auxiliary_loss_mlp": 0.01194266, "balance_loss_clip": 1.00897717, "balance_loss_mlp": 1.00047934, "epoch": 0.48674322130704023, "flos": 23331138344640.0, "grad_norm": 1.8159206215640553, "language_loss": 0.65667897, "learning_rate": 2.182802846251216e-06, "loss": 0.68159723, "num_input_tokens_seen": 87146175, "step": 4048, "time_per_iteration": 2.7760047912597656 }, { "auxiliary_loss_clip": 0.01311856, "auxiliary_loss_mlp": 0.01194375, "balance_loss_clip": 1.00870347, "balance_loss_mlp": 1.00049305, "epoch": 0.4868634641976793, "flos": 28804356892800.0, "grad_norm": 2.078231127077503, "language_loss": 0.7198562, "learning_rate": 2.182027121432696e-06, "loss": 0.74491847, "num_input_tokens_seen": 87166800, "step": 4049, "time_per_iteration": 2.8671464920043945 }, { "auxiliary_loss_clip": 0.01359324, "auxiliary_loss_mlp": 0.01194491, "balance_loss_clip": 1.00925446, "balance_loss_mlp": 1.00041854, "epoch": 0.4869837070883184, "flos": 19025986644000.0, "grad_norm": 1.613519638866491, "language_loss": 0.8212173, "learning_rate": 2.1812513690008054e-06, "loss": 0.8467555, "num_input_tokens_seen": 87185920, "step": 4050, "time_per_iteration": 2.7075605392456055 }, { "auxiliary_loss_clip": 0.01346127, "auxiliary_loss_mlp": 0.0119432, "balance_loss_clip": 1.0091207, "balance_loss_mlp": 1.00043845, "epoch": 0.4871039499789575, "flos": 15121418293440.0, "grad_norm": 4.683835593059365, "language_loss": 0.79267472, "learning_rate": 2.180475589073227e-06, "loss": 0.81807917, "num_input_tokens_seen": 87203620, "step": 4051, "time_per_iteration": 2.864022731781006 }, { "auxiliary_loss_clip": 0.01346659, "auxiliary_loss_mlp": 0.01194262, "balance_loss_clip": 1.00886452, "balance_loss_mlp": 1.0003798, "epoch": 0.48722419286959656, "flos": 26174078007840.0, "grad_norm": 1.7546420461454932, "language_loss": 0.73512, "learning_rate": 2.1796997817676456e-06, "loss": 0.76052916, "num_input_tokens_seen": 87224630, "step": 4052, "time_per_iteration": 2.904170036315918 }, { "auxiliary_loss_clip": 0.01345785, "auxiliary_loss_mlp": 0.00872308, "balance_loss_clip": 1.00953364, "balance_loss_mlp": 1.00011325, "epoch": 0.4873444357602357, "flos": 24026154274080.0, "grad_norm": 1.4732736928993493, "language_loss": 0.67511213, "learning_rate": 2.1789239472017494e-06, "loss": 0.6972931, "num_input_tokens_seen": 87246280, "step": 4053, "time_per_iteration": 2.828461170196533 }, { "auxiliary_loss_clip": 0.01309544, "auxiliary_loss_mlp": 0.01194454, "balance_loss_clip": 1.00964189, "balance_loss_mlp": 1.00047636, "epoch": 0.4874646786508748, "flos": 22820452619040.0, "grad_norm": 2.141737851629211, "language_loss": 0.7298798, "learning_rate": 2.1781480854932326e-06, "loss": 0.75491977, "num_input_tokens_seen": 87266045, "step": 4054, "time_per_iteration": 2.904658079147339 }, { "auxiliary_loss_clip": 0.01273464, "auxiliary_loss_mlp": 0.01194261, "balance_loss_clip": 1.00753474, "balance_loss_mlp": 1.00037932, "epoch": 0.48758492154151384, "flos": 21287605121280.0, "grad_norm": 1.8805350018389273, "language_loss": 0.79467738, "learning_rate": 2.1773721967597933e-06, "loss": 0.81935453, "num_input_tokens_seen": 87284495, "step": 4055, "time_per_iteration": 2.887817859649658 }, { "auxiliary_loss_clip": 0.01282832, "auxiliary_loss_mlp": 0.01193982, "balance_loss_clip": 1.0027976, "balance_loss_mlp": 1.00019574, "epoch": 0.48770516443215295, "flos": 62244136689120.0, "grad_norm": 0.8489122140086564, "language_loss": 0.57361227, "learning_rate": 2.1765962811191322e-06, "loss": 0.59838045, "num_input_tokens_seen": 87338960, "step": 4056, "time_per_iteration": 3.242051362991333 }, { "auxiliary_loss_clip": 0.01248848, "auxiliary_loss_mlp": 0.01193891, "balance_loss_clip": 1.00400567, "balance_loss_mlp": 1.0001049, "epoch": 0.48782540732279206, "flos": 66133486608480.0, "grad_norm": 0.8251598210152938, "language_loss": 0.62054133, "learning_rate": 2.1758203386889566e-06, "loss": 0.64496875, "num_input_tokens_seen": 87401730, "step": 4057, "time_per_iteration": 3.374189853668213 }, { "auxiliary_loss_clip": 0.01311979, "auxiliary_loss_mlp": 0.0087239, "balance_loss_clip": 1.00925922, "balance_loss_mlp": 1.0000571, "epoch": 0.4879456502134311, "flos": 14607930520800.0, "grad_norm": 1.9599159230419751, "language_loss": 0.84432966, "learning_rate": 2.1750443695869746e-06, "loss": 0.86617333, "num_input_tokens_seen": 87417300, "step": 4058, "time_per_iteration": 3.695282220840454 }, { "auxiliary_loss_clip": 0.01339691, "auxiliary_loss_mlp": 0.01194194, "balance_loss_clip": 1.0086391, "balance_loss_mlp": 1.00040781, "epoch": 0.4880658931040702, "flos": 19500474509280.0, "grad_norm": 1.6504315694542266, "language_loss": 0.85731328, "learning_rate": 2.174268373930901e-06, "loss": 0.88265216, "num_input_tokens_seen": 87434815, "step": 4059, "time_per_iteration": 3.9727039337158203 }, { "auxiliary_loss_clip": 0.0128363, "auxiliary_loss_mlp": 0.00872304, "balance_loss_clip": 1.00727618, "balance_loss_mlp": 1.00012016, "epoch": 0.48818613599470934, "flos": 16723069143840.0, "grad_norm": 2.22337177574726, "language_loss": 0.79567736, "learning_rate": 2.1734923518384537e-06, "loss": 0.81723666, "num_input_tokens_seen": 87451420, "step": 4060, "time_per_iteration": 2.8332409858703613 }, { "auxiliary_loss_clip": 0.01295342, "auxiliary_loss_mlp": 0.01194241, "balance_loss_clip": 1.00828552, "balance_loss_mlp": 1.00045419, "epoch": 0.4883063788853484, "flos": 26756943912000.0, "grad_norm": 1.659848254219911, "language_loss": 0.82241726, "learning_rate": 2.1727163034273547e-06, "loss": 0.84731305, "num_input_tokens_seen": 87469585, "step": 4061, "time_per_iteration": 4.716856479644775 }, { "auxiliary_loss_clip": 0.01346897, "auxiliary_loss_mlp": 0.01194227, "balance_loss_clip": 1.00981736, "balance_loss_mlp": 1.00034547, "epoch": 0.4884266217759875, "flos": 16763398227360.0, "grad_norm": 2.0254797592569376, "language_loss": 0.78970301, "learning_rate": 2.17194022881533e-06, "loss": 0.81511426, "num_input_tokens_seen": 87485675, "step": 4062, "time_per_iteration": 2.7175703048706055 }, { "auxiliary_loss_clip": 0.01325126, "auxiliary_loss_mlp": 0.01194737, "balance_loss_clip": 1.00912738, "balance_loss_mlp": 1.0004735, "epoch": 0.4885468646666266, "flos": 24207143575680.0, "grad_norm": 1.7147265146207014, "language_loss": 0.67778301, "learning_rate": 2.1711641281201092e-06, "loss": 0.70298159, "num_input_tokens_seen": 87505605, "step": 4063, "time_per_iteration": 2.809006452560425 }, { "auxiliary_loss_clip": 0.01336012, "auxiliary_loss_mlp": 0.01194279, "balance_loss_clip": 1.00895858, "balance_loss_mlp": 1.00039697, "epoch": 0.48866710755726567, "flos": 14610804415200.0, "grad_norm": 2.705253488041132, "language_loss": 0.78781867, "learning_rate": 2.1703880014594264e-06, "loss": 0.81312156, "num_input_tokens_seen": 87523195, "step": 4064, "time_per_iteration": 2.7312583923339844 }, { "auxiliary_loss_clip": 0.01271373, "auxiliary_loss_mlp": 0.01194304, "balance_loss_clip": 1.00816441, "balance_loss_mlp": 1.00032651, "epoch": 0.4887873504479048, "flos": 28804464663840.0, "grad_norm": 1.882602306224045, "language_loss": 0.73760629, "learning_rate": 2.1696118489510182e-06, "loss": 0.76226306, "num_input_tokens_seen": 87544125, "step": 4065, "time_per_iteration": 2.9206717014312744 }, { "auxiliary_loss_clip": 0.0130977, "auxiliary_loss_mlp": 0.00872423, "balance_loss_clip": 1.008757, "balance_loss_mlp": 1.0000813, "epoch": 0.48890759333854383, "flos": 22784398453440.0, "grad_norm": 1.6771004481516403, "language_loss": 0.72421765, "learning_rate": 2.1688356707126286e-06, "loss": 0.74603963, "num_input_tokens_seen": 87563745, "step": 4066, "time_per_iteration": 2.799508571624756 }, { "auxiliary_loss_clip": 0.01301498, "auxiliary_loss_mlp": 0.01194555, "balance_loss_clip": 1.00857008, "balance_loss_mlp": 1.00048256, "epoch": 0.48902783622918294, "flos": 17786098854720.0, "grad_norm": 1.7826411373763522, "language_loss": 0.70277953, "learning_rate": 2.168059466862001e-06, "loss": 0.72774011, "num_input_tokens_seen": 87581895, "step": 4067, "time_per_iteration": 2.7214887142181396 }, { "auxiliary_loss_clip": 0.01324133, "auxiliary_loss_mlp": 0.01194466, "balance_loss_clip": 1.00824761, "balance_loss_mlp": 1.00048888, "epoch": 0.48914807911982205, "flos": 22310305748640.0, "grad_norm": 1.8526158370285886, "language_loss": 0.82088262, "learning_rate": 2.167283237516887e-06, "loss": 0.84606862, "num_input_tokens_seen": 87600170, "step": 4068, "time_per_iteration": 2.755767583847046 }, { "auxiliary_loss_clip": 0.01322519, "auxiliary_loss_mlp": 0.01194303, "balance_loss_clip": 1.00813019, "balance_loss_mlp": 1.0003252, "epoch": 0.4892683220104611, "flos": 16363030419360.0, "grad_norm": 1.7132628995784793, "language_loss": 0.74368048, "learning_rate": 2.1665069827950383e-06, "loss": 0.76884866, "num_input_tokens_seen": 87617455, "step": 4069, "time_per_iteration": 2.7108688354492188 }, { "auxiliary_loss_clip": 0.01324502, "auxiliary_loss_mlp": 0.01194408, "balance_loss_clip": 1.00869417, "balance_loss_mlp": 1.0004307, "epoch": 0.4893885649011002, "flos": 15739152881760.0, "grad_norm": 1.7447186396136132, "language_loss": 0.86614978, "learning_rate": 2.1657307028142126e-06, "loss": 0.89133888, "num_input_tokens_seen": 87634995, "step": 4070, "time_per_iteration": 2.719672918319702 }, { "auxiliary_loss_clip": 0.01312271, "auxiliary_loss_mlp": 0.01194417, "balance_loss_clip": 1.00790477, "balance_loss_mlp": 1.0003444, "epoch": 0.48950880779173933, "flos": 28581996720960.0, "grad_norm": 1.774610950876803, "language_loss": 0.67260331, "learning_rate": 2.164954397692171e-06, "loss": 0.69767022, "num_input_tokens_seen": 87654420, "step": 4071, "time_per_iteration": 2.7928991317749023 }, { "auxiliary_loss_clip": 0.01297993, "auxiliary_loss_mlp": 0.01193958, "balance_loss_clip": 1.00288391, "balance_loss_mlp": 1.00017154, "epoch": 0.4896290506823784, "flos": 66186339847200.0, "grad_norm": 1.0712853923680559, "language_loss": 0.77407199, "learning_rate": 2.164178067546678e-06, "loss": 0.7989915, "num_input_tokens_seen": 87713585, "step": 4072, "time_per_iteration": 3.484781503677368 }, { "auxiliary_loss_clip": 0.0133244, "auxiliary_loss_mlp": 0.01194376, "balance_loss_clip": 1.00870466, "balance_loss_mlp": 1.00039864, "epoch": 0.4897492935730175, "flos": 12531073331520.0, "grad_norm": 6.0223104826281455, "language_loss": 0.90732634, "learning_rate": 2.163401712495504e-06, "loss": 0.93259454, "num_input_tokens_seen": 87731280, "step": 4073, "time_per_iteration": 2.824521064758301 }, { "auxiliary_loss_clip": 0.01272458, "auxiliary_loss_mlp": 0.01194561, "balance_loss_clip": 1.00896168, "balance_loss_mlp": 1.0004878, "epoch": 0.4898695364636566, "flos": 23476827877920.0, "grad_norm": 1.5991846962298542, "language_loss": 0.79103339, "learning_rate": 2.1626253326564194e-06, "loss": 0.81570351, "num_input_tokens_seen": 87750230, "step": 4074, "time_per_iteration": 2.8603055477142334 }, { "auxiliary_loss_clip": 0.01323142, "auxiliary_loss_mlp": 0.01194261, "balance_loss_clip": 1.00837553, "balance_loss_mlp": 1.0003792, "epoch": 0.48998977935429566, "flos": 27160221538080.0, "grad_norm": 1.716878074464582, "language_loss": 0.77019519, "learning_rate": 2.161848928147201e-06, "loss": 0.79536915, "num_input_tokens_seen": 87770500, "step": 4075, "time_per_iteration": 2.812817096710205 }, { "auxiliary_loss_clip": 0.01335305, "auxiliary_loss_mlp": 0.0119452, "balance_loss_clip": 1.008744, "balance_loss_mlp": 1.00044751, "epoch": 0.4901100222449348, "flos": 20339599330080.0, "grad_norm": 2.0428740741912916, "language_loss": 0.80945551, "learning_rate": 2.161072499085629e-06, "loss": 0.83475375, "num_input_tokens_seen": 87789495, "step": 4076, "time_per_iteration": 2.9389941692352295 }, { "auxiliary_loss_clip": 0.01312362, "auxiliary_loss_mlp": 0.01194344, "balance_loss_clip": 1.00884497, "balance_loss_mlp": 1.0005579, "epoch": 0.4902302651355739, "flos": 30446372750400.0, "grad_norm": 1.4936299625298886, "language_loss": 0.83134651, "learning_rate": 2.160296045589487e-06, "loss": 0.8564136, "num_input_tokens_seen": 87812955, "step": 4077, "time_per_iteration": 2.875866651535034 }, { "auxiliary_loss_clip": 0.01333714, "auxiliary_loss_mlp": 0.01194372, "balance_loss_clip": 1.00922561, "balance_loss_mlp": 1.00039434, "epoch": 0.49035050802621294, "flos": 19174190834880.0, "grad_norm": 1.7140169304799278, "language_loss": 0.69952625, "learning_rate": 2.159519567776562e-06, "loss": 0.72480708, "num_input_tokens_seen": 87832605, "step": 4078, "time_per_iteration": 2.726663112640381 }, { "auxiliary_loss_clip": 0.01290739, "auxiliary_loss_mlp": 0.01194583, "balance_loss_clip": 1.00827873, "balance_loss_mlp": 1.00041437, "epoch": 0.49047075091685205, "flos": 22228498023840.0, "grad_norm": 2.591537957327506, "language_loss": 0.70801139, "learning_rate": 2.1587430657646463e-06, "loss": 0.7328645, "num_input_tokens_seen": 87846040, "step": 4079, "time_per_iteration": 2.983335256576538 }, { "auxiliary_loss_clip": 0.01313229, "auxiliary_loss_mlp": 0.01194547, "balance_loss_clip": 1.00796628, "balance_loss_mlp": 1.00056982, "epoch": 0.4905909938074911, "flos": 20156526455040.0, "grad_norm": 2.577772214368012, "language_loss": 0.77975309, "learning_rate": 2.157966539671533e-06, "loss": 0.80483085, "num_input_tokens_seen": 87865680, "step": 4080, "time_per_iteration": 2.765458345413208 }, { "auxiliary_loss_clip": 0.0130567, "auxiliary_loss_mlp": 0.01194111, "balance_loss_clip": 1.00806379, "balance_loss_mlp": 1.00032461, "epoch": 0.4907112366981302, "flos": 17202227087520.0, "grad_norm": 1.9014371425775842, "language_loss": 0.66785157, "learning_rate": 2.157189989615021e-06, "loss": 0.69284934, "num_input_tokens_seen": 87884270, "step": 4081, "time_per_iteration": 2.7941713333129883 }, { "auxiliary_loss_clip": 0.01348051, "auxiliary_loss_mlp": 0.00872498, "balance_loss_clip": 1.00966477, "balance_loss_mlp": 1.00006413, "epoch": 0.4908314795887693, "flos": 21688978792320.0, "grad_norm": 1.7182477591792347, "language_loss": 0.75170279, "learning_rate": 2.156413415712913e-06, "loss": 0.77390826, "num_input_tokens_seen": 87906320, "step": 4082, "time_per_iteration": 2.7724521160125732 }, { "auxiliary_loss_clip": 0.0132378, "auxiliary_loss_mlp": 0.00872435, "balance_loss_clip": 1.00872374, "balance_loss_mlp": 1.00003505, "epoch": 0.4909517224794084, "flos": 26213688617760.0, "grad_norm": 1.6194106646753195, "language_loss": 0.78587329, "learning_rate": 2.155636818083014e-06, "loss": 0.8078354, "num_input_tokens_seen": 87927690, "step": 4083, "time_per_iteration": 3.6501457691192627 }, { "auxiliary_loss_clip": 0.01310795, "auxiliary_loss_mlp": 0.0119432, "balance_loss_clip": 1.00780749, "balance_loss_mlp": 1.00034237, "epoch": 0.4910719653700475, "flos": 23148388782720.0, "grad_norm": 1.6946546558091298, "language_loss": 0.84429193, "learning_rate": 2.154860196843134e-06, "loss": 0.8693431, "num_input_tokens_seen": 87946885, "step": 4084, "time_per_iteration": 3.746232271194458 }, { "auxiliary_loss_clip": 0.01358896, "auxiliary_loss_mlp": 0.01194372, "balance_loss_clip": 1.00893331, "balance_loss_mlp": 1.00039458, "epoch": 0.4911922082606866, "flos": 23331856818240.0, "grad_norm": 1.749307171678276, "language_loss": 0.76722467, "learning_rate": 2.154083552111085e-06, "loss": 0.79275727, "num_input_tokens_seen": 87966055, "step": 4085, "time_per_iteration": 2.7601258754730225 }, { "auxiliary_loss_clip": 0.01359079, "auxiliary_loss_mlp": 0.01194383, "balance_loss_clip": 1.00895357, "balance_loss_mlp": 1.00040519, "epoch": 0.49131245115132566, "flos": 29203251829920.0, "grad_norm": 1.7126445549300635, "language_loss": 0.81790912, "learning_rate": 2.1533068840046834e-06, "loss": 0.84344375, "num_input_tokens_seen": 87986320, "step": 4086, "time_per_iteration": 2.729865312576294 }, { "auxiliary_loss_clip": 0.01323668, "auxiliary_loss_mlp": 0.00872435, "balance_loss_clip": 1.00864017, "balance_loss_mlp": 1.00003791, "epoch": 0.49143269404196477, "flos": 20147473687680.0, "grad_norm": 2.3471629243399645, "language_loss": 0.61872333, "learning_rate": 2.152530192641749e-06, "loss": 0.64068431, "num_input_tokens_seen": 88001230, "step": 4087, "time_per_iteration": 3.8063230514526367 }, { "auxiliary_loss_clip": 0.01333017, "auxiliary_loss_mlp": 0.01194494, "balance_loss_clip": 1.00913143, "balance_loss_mlp": 1.0006125, "epoch": 0.4915529369326039, "flos": 24389821290240.0, "grad_norm": 2.158869218284767, "language_loss": 0.72523272, "learning_rate": 2.1517534781401068e-06, "loss": 0.75050783, "num_input_tokens_seen": 88019110, "step": 4088, "time_per_iteration": 3.698906183242798 }, { "auxiliary_loss_clip": 0.01335456, "auxiliary_loss_mlp": 0.01194732, "balance_loss_clip": 1.00857019, "balance_loss_mlp": 1.00056386, "epoch": 0.49167317982324293, "flos": 10524312747360.0, "grad_norm": 2.1664632453531705, "language_loss": 0.69147015, "learning_rate": 2.150976740617581e-06, "loss": 0.71677208, "num_input_tokens_seen": 88035670, "step": 4089, "time_per_iteration": 2.656425952911377 }, { "auxiliary_loss_clip": 0.0132258, "auxiliary_loss_mlp": 0.01194367, "balance_loss_clip": 1.00829816, "balance_loss_mlp": 1.00038922, "epoch": 0.49179342271388204, "flos": 25593439371840.0, "grad_norm": 1.7190731138841548, "language_loss": 0.71458405, "learning_rate": 2.150199980192006e-06, "loss": 0.73975354, "num_input_tokens_seen": 88054790, "step": 4090, "time_per_iteration": 2.8359858989715576 }, { "auxiliary_loss_clip": 0.01321688, "auxiliary_loss_mlp": 0.01194199, "balance_loss_clip": 1.00930035, "balance_loss_mlp": 1.00031674, "epoch": 0.49191366560452116, "flos": 21102053512320.0, "grad_norm": 1.5688323661320795, "language_loss": 0.80904126, "learning_rate": 2.1494231969812114e-06, "loss": 0.83420014, "num_input_tokens_seen": 88073780, "step": 4091, "time_per_iteration": 2.7387826442718506 }, { "auxiliary_loss_clip": 0.01305546, "auxiliary_loss_mlp": 0.01194651, "balance_loss_clip": 1.00830424, "balance_loss_mlp": 1.00057817, "epoch": 0.4920339084951602, "flos": 26067532076640.0, "grad_norm": 2.620897178817261, "language_loss": 0.80729634, "learning_rate": 2.1486463911030372e-06, "loss": 0.83229828, "num_input_tokens_seen": 88094430, "step": 4092, "time_per_iteration": 2.8451666831970215 }, { "auxiliary_loss_clip": 0.01321225, "auxiliary_loss_mlp": 0.01194426, "balance_loss_clip": 1.00837076, "balance_loss_mlp": 1.00054395, "epoch": 0.4921541513857993, "flos": 25081280775360.0, "grad_norm": 1.9622771890390815, "language_loss": 0.74311364, "learning_rate": 2.147869562675324e-06, "loss": 0.76827013, "num_input_tokens_seen": 88113400, "step": 4093, "time_per_iteration": 2.8142168521881104 }, { "auxiliary_loss_clip": 0.01334794, "auxiliary_loss_mlp": 0.0119452, "balance_loss_clip": 1.00873446, "balance_loss_mlp": 1.00044751, "epoch": 0.49227439427643843, "flos": 24389821290240.0, "grad_norm": 1.6149012955269995, "language_loss": 0.72455215, "learning_rate": 2.147092711815915e-06, "loss": 0.74984533, "num_input_tokens_seen": 88132750, "step": 4094, "time_per_iteration": 2.7386651039123535 }, { "auxiliary_loss_clip": 0.0128957, "auxiliary_loss_mlp": 0.01194412, "balance_loss_clip": 1.00794601, "balance_loss_mlp": 1.00033927, "epoch": 0.4923946371670775, "flos": 11363760881280.0, "grad_norm": 2.3404340205417213, "language_loss": 0.8641485, "learning_rate": 2.1463158386426593e-06, "loss": 0.88898832, "num_input_tokens_seen": 88150560, "step": 4095, "time_per_iteration": 2.7216827869415283 }, { "auxiliary_loss_clip": 0.01322254, "auxiliary_loss_mlp": 0.0119491, "balance_loss_clip": 1.00856245, "balance_loss_mlp": 1.00055146, "epoch": 0.4925148800577166, "flos": 30445977589920.0, "grad_norm": 2.1050925414780974, "language_loss": 0.77032602, "learning_rate": 2.145538943273407e-06, "loss": 0.79549766, "num_input_tokens_seen": 88170835, "step": 4096, "time_per_iteration": 2.82950496673584 }, { "auxiliary_loss_clip": 0.01358782, "auxiliary_loss_mlp": 0.01194349, "balance_loss_clip": 1.00923944, "balance_loss_mlp": 1.00037158, "epoch": 0.49263512294835565, "flos": 20850464674080.0, "grad_norm": 1.9412506389489061, "language_loss": 0.71961242, "learning_rate": 2.144762025826013e-06, "loss": 0.74514377, "num_input_tokens_seen": 88189925, "step": 4097, "time_per_iteration": 2.724717140197754 }, { "auxiliary_loss_clip": 0.01346333, "auxiliary_loss_mlp": 0.01194531, "balance_loss_clip": 1.00944912, "balance_loss_mlp": 1.0004586, "epoch": 0.49275536583899476, "flos": 23767488470880.0, "grad_norm": 2.1657839896671445, "language_loss": 0.86981875, "learning_rate": 2.143985086418334e-06, "loss": 0.89522743, "num_input_tokens_seen": 88205105, "step": 4098, "time_per_iteration": 2.856455087661743 }, { "auxiliary_loss_clip": 0.01324813, "auxiliary_loss_mlp": 0.01194254, "balance_loss_clip": 1.00792527, "balance_loss_mlp": 1.00037265, "epoch": 0.4928756087296339, "flos": 22273533109440.0, "grad_norm": 1.3447896706994145, "language_loss": 0.76469994, "learning_rate": 2.1432081251682324e-06, "loss": 0.78989059, "num_input_tokens_seen": 88225475, "step": 4099, "time_per_iteration": 2.7877087593078613 }, { "auxiliary_loss_clip": 0.01335142, "auxiliary_loss_mlp": 0.01194309, "balance_loss_clip": 1.00993919, "balance_loss_mlp": 1.00042748, "epoch": 0.49299585162027293, "flos": 19645481492640.0, "grad_norm": 1.6073565734910278, "language_loss": 0.87134832, "learning_rate": 2.142431142193572e-06, "loss": 0.89664286, "num_input_tokens_seen": 88243255, "step": 4100, "time_per_iteration": 2.671957015991211 }, { "auxiliary_loss_clip": 0.01358211, "auxiliary_loss_mlp": 0.01194508, "balance_loss_clip": 1.00928319, "balance_loss_mlp": 1.00053072, "epoch": 0.49311609451091204, "flos": 38837153950560.0, "grad_norm": 2.096208162444044, "language_loss": 0.72037619, "learning_rate": 2.1416541376122207e-06, "loss": 0.74590337, "num_input_tokens_seen": 88263435, "step": 4101, "time_per_iteration": 2.8375744819641113 }, { "auxiliary_loss_clip": 0.0135762, "auxiliary_loss_mlp": 0.01194314, "balance_loss_clip": 1.00885928, "balance_loss_mlp": 1.00033712, "epoch": 0.49323633740155115, "flos": 28329114630240.0, "grad_norm": 1.6368745768472701, "language_loss": 0.72841871, "learning_rate": 2.1408771115420496e-06, "loss": 0.75393808, "num_input_tokens_seen": 88283295, "step": 4102, "time_per_iteration": 2.7210276126861572 }, { "auxiliary_loss_clip": 0.01281053, "auxiliary_loss_mlp": 0.01194407, "balance_loss_clip": 1.0087049, "balance_loss_mlp": 1.00042915, "epoch": 0.4933565802921902, "flos": 21135593020320.0, "grad_norm": 3.1367577812989915, "language_loss": 0.64685959, "learning_rate": 2.140100064100932e-06, "loss": 0.67161417, "num_input_tokens_seen": 88299270, "step": 4103, "time_per_iteration": 2.800008773803711 }, { "auxiliary_loss_clip": 0.01333376, "auxiliary_loss_mlp": 0.01194107, "balance_loss_clip": 1.00846124, "balance_loss_mlp": 1.00032067, "epoch": 0.4934768231828293, "flos": 18039016869120.0, "grad_norm": 1.985343890085232, "language_loss": 0.76185197, "learning_rate": 2.139322995406746e-06, "loss": 0.78712684, "num_input_tokens_seen": 88316905, "step": 4104, "time_per_iteration": 2.7261905670166016 }, { "auxiliary_loss_clip": 0.01358826, "auxiliary_loss_mlp": 0.01194322, "balance_loss_clip": 1.00961995, "balance_loss_mlp": 1.00034499, "epoch": 0.4935970660734684, "flos": 23469966455040.0, "grad_norm": 1.8265331959895275, "language_loss": 0.79382914, "learning_rate": 2.1385459055773727e-06, "loss": 0.81936061, "num_input_tokens_seen": 88335095, "step": 4105, "time_per_iteration": 2.8405587673187256 }, { "auxiliary_loss_clip": 0.01286002, "auxiliary_loss_mlp": 0.00872156, "balance_loss_clip": 1.00793767, "balance_loss_mlp": 1.00005817, "epoch": 0.4937173089641075, "flos": 64479279414240.0, "grad_norm": 1.9434896345688601, "language_loss": 0.73543239, "learning_rate": 2.137768794730696e-06, "loss": 0.75701404, "num_input_tokens_seen": 88358545, "step": 4106, "time_per_iteration": 3.3277406692504883 }, { "auxiliary_loss_clip": 0.01318496, "auxiliary_loss_mlp": 0.01194658, "balance_loss_clip": 1.00863791, "balance_loss_mlp": 1.0003947, "epoch": 0.4938375518547466, "flos": 22346036601120.0, "grad_norm": 1.6093439918793633, "language_loss": 0.79925334, "learning_rate": 2.1369916629846026e-06, "loss": 0.82438487, "num_input_tokens_seen": 88378295, "step": 4107, "time_per_iteration": 2.7834601402282715 }, { "auxiliary_loss_clip": 0.01321092, "auxiliary_loss_mlp": 0.01194469, "balance_loss_clip": 1.00834942, "balance_loss_mlp": 1.00039589, "epoch": 0.4939577947453857, "flos": 17858710117440.0, "grad_norm": 1.7648070842103825, "language_loss": 0.74960345, "learning_rate": 2.136214510456983e-06, "loss": 0.77475905, "num_input_tokens_seen": 88396750, "step": 4108, "time_per_iteration": 2.776942729949951 }, { "auxiliary_loss_clip": 0.01270743, "auxiliary_loss_mlp": 0.00871892, "balance_loss_clip": 1.00356936, "balance_loss_mlp": 1.00026822, "epoch": 0.49407803763602476, "flos": 70066780693920.0, "grad_norm": 0.8847377644983266, "language_loss": 0.6317054, "learning_rate": 2.1354373372657296e-06, "loss": 0.65313172, "num_input_tokens_seen": 88455190, "step": 4109, "time_per_iteration": 4.257373809814453 }, { "auxiliary_loss_clip": 0.01357685, "auxiliary_loss_mlp": 0.0119424, "balance_loss_clip": 1.00935888, "balance_loss_mlp": 1.00035751, "epoch": 0.49419828052666387, "flos": 24317497416960.0, "grad_norm": 1.446743622229338, "language_loss": 0.70876193, "learning_rate": 2.1346601435287404e-06, "loss": 0.73428118, "num_input_tokens_seen": 88477460, "step": 4110, "time_per_iteration": 2.728848695755005 }, { "auxiliary_loss_clip": 0.01321626, "auxiliary_loss_mlp": 0.01194184, "balance_loss_clip": 1.00807822, "balance_loss_mlp": 1.00030243, "epoch": 0.494318523417303, "flos": 29386073239200.0, "grad_norm": 1.7253032484009296, "language_loss": 0.80283266, "learning_rate": 2.1338829293639144e-06, "loss": 0.82799077, "num_input_tokens_seen": 88497820, "step": 4111, "time_per_iteration": 3.773012161254883 }, { "auxiliary_loss_clip": 0.01278813, "auxiliary_loss_mlp": 0.01194434, "balance_loss_clip": 1.00809932, "balance_loss_mlp": 1.00036108, "epoch": 0.49443876630794203, "flos": 15268293308160.0, "grad_norm": 3.3006147439529885, "language_loss": 0.83118957, "learning_rate": 2.1331056948891547e-06, "loss": 0.85592204, "num_input_tokens_seen": 88514920, "step": 4112, "time_per_iteration": 2.97576904296875 }, { "auxiliary_loss_clip": 0.01309463, "auxiliary_loss_mlp": 0.01194168, "balance_loss_clip": 1.00835276, "balance_loss_mlp": 1.00038183, "epoch": 0.49455900919858115, "flos": 12347461601280.0, "grad_norm": 2.0718593940518355, "language_loss": 0.76214457, "learning_rate": 2.1323284402223666e-06, "loss": 0.78718084, "num_input_tokens_seen": 88530910, "step": 4113, "time_per_iteration": 3.747779130935669 }, { "auxiliary_loss_clip": 0.01357245, "auxiliary_loss_mlp": 0.00872141, "balance_loss_clip": 1.00975657, "balance_loss_mlp": 0.9999795, "epoch": 0.4946792520892202, "flos": 22779620604000.0, "grad_norm": 1.8228046101846724, "language_loss": 0.88077885, "learning_rate": 2.1315511654814597e-06, "loss": 0.90307271, "num_input_tokens_seen": 88549320, "step": 4114, "time_per_iteration": 3.6857240200042725 }, { "auxiliary_loss_clip": 0.01308147, "auxiliary_loss_mlp": 0.01194207, "balance_loss_clip": 1.00779915, "balance_loss_mlp": 1.00032496, "epoch": 0.4947994949798593, "flos": 23148137316960.0, "grad_norm": 2.251875998703775, "language_loss": 0.784051, "learning_rate": 2.1307738707843456e-06, "loss": 0.80907452, "num_input_tokens_seen": 88568985, "step": 4115, "time_per_iteration": 2.869737148284912 }, { "auxiliary_loss_clip": 0.01345207, "auxiliary_loss_mlp": 0.01194602, "balance_loss_clip": 1.00966239, "balance_loss_mlp": 1.00043392, "epoch": 0.4949197378704984, "flos": 23660008524000.0, "grad_norm": 1.961821970499882, "language_loss": 0.69237387, "learning_rate": 2.1299965562489385e-06, "loss": 0.71777195, "num_input_tokens_seen": 88588790, "step": 4116, "time_per_iteration": 2.7193753719329834 }, { "auxiliary_loss_clip": 0.01346527, "auxiliary_loss_mlp": 0.01194304, "balance_loss_clip": 1.00911379, "balance_loss_mlp": 1.0003264, "epoch": 0.4950399807611375, "flos": 26911506594240.0, "grad_norm": 1.366564981730617, "language_loss": 0.7887634, "learning_rate": 2.129219221993158e-06, "loss": 0.81417167, "num_input_tokens_seen": 88613575, "step": 4117, "time_per_iteration": 2.8706095218658447 }, { "auxiliary_loss_clip": 0.01278614, "auxiliary_loss_mlp": 0.01193883, "balance_loss_clip": 1.00532627, "balance_loss_mlp": 1.00009632, "epoch": 0.4951602236517766, "flos": 67315299016320.0, "grad_norm": 0.7853434672481545, "language_loss": 0.59959251, "learning_rate": 2.128441868134924e-06, "loss": 0.62431753, "num_input_tokens_seen": 88675510, "step": 4118, "time_per_iteration": 3.4488110542297363 }, { "auxiliary_loss_clip": 0.01308648, "auxiliary_loss_mlp": 0.01194321, "balance_loss_clip": 1.00815153, "balance_loss_mlp": 1.00043893, "epoch": 0.4952804665424157, "flos": 19901453019840.0, "grad_norm": 2.8833939708694962, "language_loss": 0.82455409, "learning_rate": 2.1276644947921606e-06, "loss": 0.84958375, "num_input_tokens_seen": 88694425, "step": 4119, "time_per_iteration": 2.834341287612915 }, { "auxiliary_loss_clip": 0.01334678, "auxiliary_loss_mlp": 0.01194217, "balance_loss_clip": 1.00868773, "balance_loss_mlp": 1.0003351, "epoch": 0.49540070943305475, "flos": 18806823679680.0, "grad_norm": 2.203915093476954, "language_loss": 0.82426095, "learning_rate": 2.126887102082795e-06, "loss": 0.84954989, "num_input_tokens_seen": 88714450, "step": 4120, "time_per_iteration": 2.7849490642547607 }, { "auxiliary_loss_clip": 0.01310603, "auxiliary_loss_mlp": 0.01194222, "balance_loss_clip": 1.00847018, "balance_loss_mlp": 1.00034022, "epoch": 0.49552095232369386, "flos": 24934190218560.0, "grad_norm": 1.6368943393143907, "language_loss": 0.70245063, "learning_rate": 2.126109690124757e-06, "loss": 0.72749889, "num_input_tokens_seen": 88735265, "step": 4121, "time_per_iteration": 2.884505271911621 }, { "auxiliary_loss_clip": 0.01281553, "auxiliary_loss_mlp": 0.01194216, "balance_loss_clip": 1.00739384, "balance_loss_mlp": 1.00042927, "epoch": 0.495641195214333, "flos": 22857261181920.0, "grad_norm": 1.9947895636440922, "language_loss": 0.71129143, "learning_rate": 2.1253322590359786e-06, "loss": 0.73604912, "num_input_tokens_seen": 88754600, "step": 4122, "time_per_iteration": 2.8346288204193115 }, { "auxiliary_loss_clip": 0.01346095, "auxiliary_loss_mlp": 0.0119418, "balance_loss_clip": 1.00916159, "balance_loss_mlp": 1.00029778, "epoch": 0.49576143810497203, "flos": 25769758595040.0, "grad_norm": 1.5522975565279213, "language_loss": 0.73944044, "learning_rate": 2.124554808934397e-06, "loss": 0.76484323, "num_input_tokens_seen": 88775180, "step": 4123, "time_per_iteration": 2.803515672683716 }, { "auxiliary_loss_clip": 0.01304114, "auxiliary_loss_mlp": 0.0119446, "balance_loss_clip": 1.00910115, "balance_loss_mlp": 1.0004828, "epoch": 0.49588168099561114, "flos": 22128849439200.0, "grad_norm": 1.813371769686465, "language_loss": 0.72866809, "learning_rate": 2.1237773399379496e-06, "loss": 0.75365382, "num_input_tokens_seen": 88796145, "step": 4124, "time_per_iteration": 2.9244742393493652 }, { "auxiliary_loss_clip": 0.01320304, "auxiliary_loss_mlp": 0.01194538, "balance_loss_clip": 1.00884521, "balance_loss_mlp": 1.00046539, "epoch": 0.49600192388625025, "flos": 24387342556320.0, "grad_norm": 1.68728099657643, "language_loss": 0.86979914, "learning_rate": 2.122999852164578e-06, "loss": 0.89494753, "num_input_tokens_seen": 88816765, "step": 4125, "time_per_iteration": 2.9054343700408936 }, { "auxiliary_loss_clip": 0.0127538, "auxiliary_loss_mlp": 0.01194377, "balance_loss_clip": 1.0086441, "balance_loss_mlp": 1.00039923, "epoch": 0.4961221667768893, "flos": 22857440800320.0, "grad_norm": 2.0796091511788957, "language_loss": 0.58773041, "learning_rate": 2.122222345732227e-06, "loss": 0.61242795, "num_input_tokens_seen": 88836680, "step": 4126, "time_per_iteration": 2.8263654708862305 }, { "auxiliary_loss_clip": 0.01304518, "auxiliary_loss_mlp": 0.01194395, "balance_loss_clip": 1.0082413, "balance_loss_mlp": 1.00041771, "epoch": 0.4962424096675284, "flos": 17858099414880.0, "grad_norm": 1.6754956060013115, "language_loss": 0.83100128, "learning_rate": 2.121444820758843e-06, "loss": 0.85599041, "num_input_tokens_seen": 88855320, "step": 4127, "time_per_iteration": 2.7434909343719482 }, { "auxiliary_loss_clip": 0.012708, "auxiliary_loss_mlp": 0.0119445, "balance_loss_clip": 1.00740743, "balance_loss_mlp": 1.00028181, "epoch": 0.49636265255816747, "flos": 21793620768480.0, "grad_norm": 1.9889612688598155, "language_loss": 0.78959286, "learning_rate": 2.120667277362376e-06, "loss": 0.81424534, "num_input_tokens_seen": 88874035, "step": 4128, "time_per_iteration": 2.9505727291107178 }, { "auxiliary_loss_clip": 0.01359033, "auxiliary_loss_mlp": 0.01194393, "balance_loss_clip": 1.00973606, "balance_loss_mlp": 1.00041544, "epoch": 0.4964828954488066, "flos": 16358611806720.0, "grad_norm": 2.0740960819556915, "language_loss": 0.84584612, "learning_rate": 2.1198897156607796e-06, "loss": 0.87138039, "num_input_tokens_seen": 88891390, "step": 4129, "time_per_iteration": 2.6701653003692627 }, { "auxiliary_loss_clip": 0.01333365, "auxiliary_loss_mlp": 0.01194409, "balance_loss_clip": 1.00957227, "balance_loss_mlp": 1.00043201, "epoch": 0.4966031383394457, "flos": 24711111573120.0, "grad_norm": 1.812785067238506, "language_loss": 0.73595381, "learning_rate": 2.1191121357720085e-06, "loss": 0.7612316, "num_input_tokens_seen": 88909450, "step": 4130, "time_per_iteration": 2.72912859916687 }, { "auxiliary_loss_clip": 0.01284747, "auxiliary_loss_mlp": 0.01194475, "balance_loss_clip": 1.00853789, "balance_loss_mlp": 1.00049818, "epoch": 0.49672338123008475, "flos": 22930626841920.0, "grad_norm": 1.5763704891851236, "language_loss": 0.74694908, "learning_rate": 2.1183345378140206e-06, "loss": 0.77174127, "num_input_tokens_seen": 88929195, "step": 4131, "time_per_iteration": 2.8372926712036133 }, { "auxiliary_loss_clip": 0.01315246, "auxiliary_loss_mlp": 0.01193943, "balance_loss_clip": 1.00420451, "balance_loss_mlp": 1.00015593, "epoch": 0.49684362412072386, "flos": 65976768505440.0, "grad_norm": 0.8501756601890327, "language_loss": 0.62018496, "learning_rate": 2.1175569219047783e-06, "loss": 0.64527684, "num_input_tokens_seen": 88990635, "step": 4132, "time_per_iteration": 3.3926079273223877 }, { "auxiliary_loss_clip": 0.01358319, "auxiliary_loss_mlp": 0.0119431, "balance_loss_clip": 1.00923526, "balance_loss_mlp": 1.00042844, "epoch": 0.49696386701136297, "flos": 19971298159200.0, "grad_norm": 1.472032221533895, "language_loss": 0.7336148, "learning_rate": 2.1167792881622437e-06, "loss": 0.75914115, "num_input_tokens_seen": 89009655, "step": 4133, "time_per_iteration": 2.740422248840332 }, { "auxiliary_loss_clip": 0.01308879, "auxiliary_loss_mlp": 0.01194298, "balance_loss_clip": 1.00830436, "balance_loss_mlp": 1.00041604, "epoch": 0.497084109902002, "flos": 24750829954080.0, "grad_norm": 1.4619217954472539, "language_loss": 0.80847216, "learning_rate": 2.116001636704384e-06, "loss": 0.83350396, "num_input_tokens_seen": 89030040, "step": 4134, "time_per_iteration": 2.823136568069458 }, { "auxiliary_loss_clip": 0.01285171, "auxiliary_loss_mlp": 0.01194295, "balance_loss_clip": 1.00890851, "balance_loss_mlp": 1.00041258, "epoch": 0.49720435279264114, "flos": 21871836125280.0, "grad_norm": 1.7828752937028163, "language_loss": 0.80151814, "learning_rate": 2.1152239676491685e-06, "loss": 0.82631284, "num_input_tokens_seen": 89048145, "step": 4135, "time_per_iteration": 2.9228501319885254 }, { "auxiliary_loss_clip": 0.01333569, "auxiliary_loss_mlp": 0.01194213, "balance_loss_clip": 1.00909781, "balance_loss_mlp": 1.0003314, "epoch": 0.49732459568328025, "flos": 23805805828320.0, "grad_norm": 1.785985919342857, "language_loss": 0.73120588, "learning_rate": 2.114446281114569e-06, "loss": 0.75648367, "num_input_tokens_seen": 89067165, "step": 4136, "time_per_iteration": 4.710424423217773 }, { "auxiliary_loss_clip": 0.01321675, "auxiliary_loss_mlp": 0.01194166, "balance_loss_clip": 1.0089767, "balance_loss_mlp": 1.00037968, "epoch": 0.4974448385739193, "flos": 20047753255680.0, "grad_norm": 1.8369186061141498, "language_loss": 0.76047868, "learning_rate": 2.1136685772185587e-06, "loss": 0.78563702, "num_input_tokens_seen": 89086190, "step": 4137, "time_per_iteration": 2.8514888286590576 }, { "auxiliary_loss_clip": 0.01333946, "auxiliary_loss_mlp": 0.00872318, "balance_loss_clip": 1.00915599, "balance_loss_mlp": 0.99999958, "epoch": 0.4975650814645584, "flos": 24821357643360.0, "grad_norm": 1.5859536760433197, "language_loss": 0.77936232, "learning_rate": 2.1128908560791163e-06, "loss": 0.80142492, "num_input_tokens_seen": 89106020, "step": 4138, "time_per_iteration": 3.775536060333252 }, { "auxiliary_loss_clip": 0.01358338, "auxiliary_loss_mlp": 0.01194273, "balance_loss_clip": 1.0095048, "balance_loss_mlp": 1.00039065, "epoch": 0.4976853243551975, "flos": 19829488383360.0, "grad_norm": 1.7815331060136836, "language_loss": 0.78133678, "learning_rate": 2.1121131178142203e-06, "loss": 0.80686295, "num_input_tokens_seen": 89125385, "step": 4139, "time_per_iteration": 2.7819550037384033 }, { "auxiliary_loss_clip": 0.01321676, "auxiliary_loss_mlp": 0.01194364, "balance_loss_clip": 1.00857353, "balance_loss_mlp": 1.00048196, "epoch": 0.4978055672458366, "flos": 23143000230720.0, "grad_norm": 1.6498933535357487, "language_loss": 0.82291794, "learning_rate": 2.1113353625418544e-06, "loss": 0.84807837, "num_input_tokens_seen": 89143935, "step": 4140, "time_per_iteration": 3.839609384536743 }, { "auxiliary_loss_clip": 0.01331893, "auxiliary_loss_mlp": 0.01194098, "balance_loss_clip": 1.00898552, "balance_loss_mlp": 1.00031185, "epoch": 0.4979258101364757, "flos": 15559923840480.0, "grad_norm": 1.5470352236225478, "language_loss": 0.7880255, "learning_rate": 2.1105575903800017e-06, "loss": 0.81328541, "num_input_tokens_seen": 89162655, "step": 4141, "time_per_iteration": 2.716703414916992 }, { "auxiliary_loss_clip": 0.01332416, "auxiliary_loss_mlp": 0.01194202, "balance_loss_clip": 1.00904942, "balance_loss_mlp": 1.00032032, "epoch": 0.4980460530271148, "flos": 26356180943520.0, "grad_norm": 2.0699329816974283, "language_loss": 0.84862542, "learning_rate": 2.1097798014466502e-06, "loss": 0.87389159, "num_input_tokens_seen": 89182255, "step": 4142, "time_per_iteration": 2.9728925228118896 }, { "auxiliary_loss_clip": 0.01339461, "auxiliary_loss_mlp": 0.01194321, "balance_loss_clip": 1.00926852, "balance_loss_mlp": 1.00034356, "epoch": 0.49816629591775385, "flos": 17274550960800.0, "grad_norm": 3.8617657433327874, "language_loss": 0.59576374, "learning_rate": 2.109001995859791e-06, "loss": 0.62110156, "num_input_tokens_seen": 89201155, "step": 4143, "time_per_iteration": 2.7366490364074707 }, { "auxiliary_loss_clip": 0.01287452, "auxiliary_loss_mlp": 0.01193923, "balance_loss_clip": 1.00330639, "balance_loss_mlp": 1.00013638, "epoch": 0.49828653880839296, "flos": 64930982160960.0, "grad_norm": 0.7895645444101624, "language_loss": 0.60110086, "learning_rate": 2.108224173737415e-06, "loss": 0.62591463, "num_input_tokens_seen": 89264455, "step": 4144, "time_per_iteration": 3.317809581756592 }, { "auxiliary_loss_clip": 0.01320059, "auxiliary_loss_mlp": 0.01194358, "balance_loss_clip": 1.00869823, "balance_loss_mlp": 1.00038123, "epoch": 0.498406781699032, "flos": 27484816799520.0, "grad_norm": 1.791016117851172, "language_loss": 0.76269281, "learning_rate": 2.1074463351975183e-06, "loss": 0.78783703, "num_input_tokens_seen": 89283340, "step": 4145, "time_per_iteration": 2.829063892364502 }, { "auxiliary_loss_clip": 0.01294852, "auxiliary_loss_mlp": 0.01194196, "balance_loss_clip": 1.00820255, "balance_loss_mlp": 1.00040913, "epoch": 0.49852702458967113, "flos": 31499882686080.0, "grad_norm": 2.2799411954997564, "language_loss": 0.71468341, "learning_rate": 2.106668480358098e-06, "loss": 0.73957396, "num_input_tokens_seen": 89303565, "step": 4146, "time_per_iteration": 2.861567258834839 }, { "auxiliary_loss_clip": 0.01309088, "auxiliary_loss_mlp": 0.0119452, "balance_loss_clip": 1.00982344, "balance_loss_mlp": 1.00044727, "epoch": 0.49864726748031024, "flos": 22852878492960.0, "grad_norm": 1.6856929424980491, "language_loss": 0.70800495, "learning_rate": 2.105890609337154e-06, "loss": 0.73304105, "num_input_tokens_seen": 89322080, "step": 4147, "time_per_iteration": 2.8283841609954834 }, { "auxiliary_loss_clip": 0.01326842, "auxiliary_loss_mlp": 0.0119393, "balance_loss_clip": 1.00404835, "balance_loss_mlp": 1.00014293, "epoch": 0.4987675103709493, "flos": 70405745427360.0, "grad_norm": 0.6959338018855535, "language_loss": 0.63849163, "learning_rate": 2.1051127222526883e-06, "loss": 0.66369927, "num_input_tokens_seen": 89394195, "step": 4148, "time_per_iteration": 3.4006688594818115 }, { "auxiliary_loss_clip": 0.01333513, "auxiliary_loss_mlp": 0.01194047, "balance_loss_clip": 1.00907278, "balance_loss_mlp": 1.00026011, "epoch": 0.4988877532615884, "flos": 28767584253600.0, "grad_norm": 1.4808619753293597, "language_loss": 0.80741531, "learning_rate": 2.1043348192227067e-06, "loss": 0.83269089, "num_input_tokens_seen": 89414565, "step": 4149, "time_per_iteration": 2.892106056213379 }, { "auxiliary_loss_clip": 0.01301077, "auxiliary_loss_mlp": 0.01194294, "balance_loss_clip": 1.00818157, "balance_loss_mlp": 1.00041151, "epoch": 0.4990079961522275, "flos": 16872710281920.0, "grad_norm": 1.7315598773599232, "language_loss": 0.61741519, "learning_rate": 2.1035569003652156e-06, "loss": 0.64236891, "num_input_tokens_seen": 89433195, "step": 4150, "time_per_iteration": 2.7744035720825195 }, { "auxiliary_loss_clip": 0.01285262, "auxiliary_loss_mlp": 0.01194614, "balance_loss_clip": 1.00810504, "balance_loss_mlp": 1.00054085, "epoch": 0.4991282390428666, "flos": 13291048779840.0, "grad_norm": 2.226958342762745, "language_loss": 0.81870085, "learning_rate": 2.1027789657982255e-06, "loss": 0.8434996, "num_input_tokens_seen": 89447410, "step": 4151, "time_per_iteration": 2.798586845397949 }, { "auxiliary_loss_clip": 0.01261751, "auxiliary_loss_mlp": 0.01194247, "balance_loss_clip": 1.00728917, "balance_loss_mlp": 1.00046015, "epoch": 0.4992484819335057, "flos": 21537505546560.0, "grad_norm": 1.8929179244245473, "language_loss": 0.77138948, "learning_rate": 2.1020010156397482e-06, "loss": 0.79594946, "num_input_tokens_seen": 89464630, "step": 4152, "time_per_iteration": 2.8150196075439453 }, { "auxiliary_loss_clip": 0.01344983, "auxiliary_loss_mlp": 0.01194312, "balance_loss_clip": 1.00923276, "balance_loss_mlp": 1.00042963, "epoch": 0.4993687248241448, "flos": 24860106084960.0, "grad_norm": 1.4571873300099056, "language_loss": 0.7726444, "learning_rate": 2.101223050007797e-06, "loss": 0.79803741, "num_input_tokens_seen": 89483180, "step": 4153, "time_per_iteration": 2.764308452606201 }, { "auxiliary_loss_clip": 0.01326923, "auxiliary_loss_mlp": 0.01193942, "balance_loss_clip": 1.00405777, "balance_loss_mlp": 1.00015581, "epoch": 0.49948896771478385, "flos": 62941631352480.0, "grad_norm": 0.818636693425519, "language_loss": 0.53791124, "learning_rate": 2.1004450690203904e-06, "loss": 0.56311989, "num_input_tokens_seen": 89539260, "step": 4154, "time_per_iteration": 3.3029956817626953 }, { "auxiliary_loss_clip": 0.01326802, "auxiliary_loss_mlp": 0.0119321, "balance_loss_clip": 1.00405049, "balance_loss_mlp": 1.00018656, "epoch": 0.49960921060542296, "flos": 68284248312960.0, "grad_norm": 0.8504685235967178, "language_loss": 0.63392746, "learning_rate": 2.099667072795546e-06, "loss": 0.65912747, "num_input_tokens_seen": 89601380, "step": 4155, "time_per_iteration": 3.3310413360595703 }, { "auxiliary_loss_clip": 0.01334205, "auxiliary_loss_mlp": 0.01194204, "balance_loss_clip": 1.00824678, "balance_loss_mlp": 1.0003221, "epoch": 0.49972945349606207, "flos": 23659361897760.0, "grad_norm": 1.6436739725067637, "language_loss": 0.79928184, "learning_rate": 2.0988890614512864e-06, "loss": 0.82456601, "num_input_tokens_seen": 89621270, "step": 4156, "time_per_iteration": 2.7593212127685547 }, { "auxiliary_loss_clip": 0.01317633, "auxiliary_loss_mlp": 0.01194339, "balance_loss_clip": 1.00826359, "balance_loss_mlp": 1.00045741, "epoch": 0.4998496963867011, "flos": 19755835333920.0, "grad_norm": 2.4846646393642287, "language_loss": 0.8421979, "learning_rate": 2.098111035105635e-06, "loss": 0.86731768, "num_input_tokens_seen": 89639695, "step": 4157, "time_per_iteration": 2.803178548812866 }, { "auxiliary_loss_clip": 0.01260312, "auxiliary_loss_mlp": 0.01194294, "balance_loss_clip": 1.00825787, "balance_loss_mlp": 1.0003171, "epoch": 0.49996993927734024, "flos": 22265737670880.0, "grad_norm": 1.6052068595098643, "language_loss": 0.72733182, "learning_rate": 2.0973329938766176e-06, "loss": 0.7518779, "num_input_tokens_seen": 89657125, "step": 4158, "time_per_iteration": 2.853245973587036 }, { "auxiliary_loss_clip": 0.01346829, "auxiliary_loss_mlp": 0.0119477, "balance_loss_clip": 1.00960016, "balance_loss_mlp": 1.00050592, "epoch": 0.5000901821679793, "flos": 23327222663520.0, "grad_norm": 1.7337112832945665, "language_loss": 0.79239678, "learning_rate": 2.0965549378822618e-06, "loss": 0.8178128, "num_input_tokens_seen": 89678415, "step": 4159, "time_per_iteration": 2.8873987197875977 }, { "auxiliary_loss_clip": 0.01217571, "auxiliary_loss_mlp": 0.0119428, "balance_loss_clip": 1.00746083, "balance_loss_mlp": 1.00039816, "epoch": 0.5002104250586185, "flos": 20339024551200.0, "grad_norm": 1.9046462838539926, "language_loss": 0.84063709, "learning_rate": 2.095776867240599e-06, "loss": 0.86475563, "num_input_tokens_seen": 89695405, "step": 4160, "time_per_iteration": 3.0171172618865967 }, { "auxiliary_loss_clip": 0.01307163, "auxiliary_loss_mlp": 0.01194373, "balance_loss_clip": 1.00886905, "balance_loss_mlp": 1.0004909, "epoch": 0.5003306679492575, "flos": 13991381413920.0, "grad_norm": 1.9345990565624474, "language_loss": 0.82279867, "learning_rate": 2.094998782069661e-06, "loss": 0.84781402, "num_input_tokens_seen": 89713110, "step": 4161, "time_per_iteration": 3.9567439556121826 }, { "auxiliary_loss_clip": 0.0135752, "auxiliary_loss_mlp": 0.01194354, "balance_loss_clip": 1.00927377, "balance_loss_mlp": 1.0003767, "epoch": 0.5004509108398966, "flos": 27672775295040.0, "grad_norm": 2.6911504118310607, "language_loss": 0.75211489, "learning_rate": 2.0942206824874845e-06, "loss": 0.77763361, "num_input_tokens_seen": 89735885, "step": 4162, "time_per_iteration": 3.814624547958374 }, { "auxiliary_loss_clip": 0.0133396, "auxiliary_loss_mlp": 0.0119426, "balance_loss_clip": 1.00922036, "balance_loss_mlp": 1.00037801, "epoch": 0.5005711537305357, "flos": 14976195768000.0, "grad_norm": 2.2126120849656945, "language_loss": 0.79317993, "learning_rate": 2.093442568612105e-06, "loss": 0.81846213, "num_input_tokens_seen": 89753690, "step": 4163, "time_per_iteration": 2.849036931991577 }, { "auxiliary_loss_clip": 0.01357887, "auxiliary_loss_mlp": 0.01194517, "balance_loss_clip": 1.00874007, "balance_loss_mlp": 1.00044394, "epoch": 0.5006913966211748, "flos": 26503271500320.0, "grad_norm": 1.4447973223943504, "language_loss": 0.84908664, "learning_rate": 2.0926644405615613e-06, "loss": 0.87461066, "num_input_tokens_seen": 89774590, "step": 4164, "time_per_iteration": 3.7216525077819824 }, { "auxiliary_loss_clip": 0.0128998, "auxiliary_loss_mlp": 0.01194182, "balance_loss_clip": 1.00776076, "balance_loss_mlp": 1.00039494, "epoch": 0.5008116395118138, "flos": 20449306545120.0, "grad_norm": 1.8198943114033999, "language_loss": 0.81144375, "learning_rate": 2.091886298453897e-06, "loss": 0.83628535, "num_input_tokens_seen": 89792775, "step": 4165, "time_per_iteration": 2.8137500286102295 }, { "auxiliary_loss_clip": 0.01345329, "auxiliary_loss_mlp": 0.0119439, "balance_loss_clip": 1.00956511, "balance_loss_mlp": 1.00041246, "epoch": 0.500931882402453, "flos": 21579882279840.0, "grad_norm": 2.472197910930077, "language_loss": 0.73097122, "learning_rate": 2.091108142407153e-06, "loss": 0.75636846, "num_input_tokens_seen": 89811515, "step": 4166, "time_per_iteration": 3.7978599071502686 }, { "auxiliary_loss_clip": 0.01295332, "auxiliary_loss_mlp": 0.01193265, "balance_loss_clip": 1.00777328, "balance_loss_mlp": 1.00024152, "epoch": 0.5010521252930921, "flos": 57785046217920.0, "grad_norm": 0.8548858956225123, "language_loss": 0.62416631, "learning_rate": 2.090329972539377e-06, "loss": 0.64905232, "num_input_tokens_seen": 89870080, "step": 4167, "time_per_iteration": 3.3848226070404053 }, { "auxiliary_loss_clip": 0.01217233, "auxiliary_loss_mlp": 0.01194266, "balance_loss_clip": 1.00663674, "balance_loss_mlp": 1.00038409, "epoch": 0.5011723681837311, "flos": 18625511064960.0, "grad_norm": 1.8706238629928658, "language_loss": 0.68638057, "learning_rate": 2.089551788968616e-06, "loss": 0.71049559, "num_input_tokens_seen": 89888045, "step": 4168, "time_per_iteration": 3.0779223442077637 }, { "auxiliary_loss_clip": 0.01326638, "auxiliary_loss_mlp": 0.01193222, "balance_loss_clip": 1.00377345, "balance_loss_mlp": 1.0001986, "epoch": 0.5012926110743702, "flos": 55883179075680.0, "grad_norm": 0.841739917453133, "language_loss": 0.60866082, "learning_rate": 2.08877359181292e-06, "loss": 0.6338594, "num_input_tokens_seen": 89944610, "step": 4169, "time_per_iteration": 3.657398223876953 }, { "auxiliary_loss_clip": 0.01319295, "auxiliary_loss_mlp": 0.01194186, "balance_loss_clip": 1.00866866, "balance_loss_mlp": 1.00039911, "epoch": 0.5014128539650093, "flos": 24238276197120.0, "grad_norm": 2.1568790647567067, "language_loss": 0.8574028, "learning_rate": 2.0879953811903396e-06, "loss": 0.8825376, "num_input_tokens_seen": 89959495, "step": 4170, "time_per_iteration": 2.8113207817077637 }, { "auxiliary_loss_clip": 0.01333352, "auxiliary_loss_mlp": 0.01194146, "balance_loss_clip": 1.00859058, "balance_loss_mlp": 1.00035965, "epoch": 0.5015330968556484, "flos": 27527480922240.0, "grad_norm": 1.6831076170817671, "language_loss": 0.78469944, "learning_rate": 2.08721715721893e-06, "loss": 0.80997443, "num_input_tokens_seen": 89978820, "step": 4171, "time_per_iteration": 2.875877618789673 }, { "auxiliary_loss_clip": 0.01334062, "auxiliary_loss_mlp": 0.0119417, "balance_loss_clip": 1.00844097, "balance_loss_mlp": 1.00028777, "epoch": 0.5016533397462875, "flos": 23800812436800.0, "grad_norm": 1.7601696000108624, "language_loss": 0.76892126, "learning_rate": 2.0864389200167477e-06, "loss": 0.79420352, "num_input_tokens_seen": 89997075, "step": 4172, "time_per_iteration": 2.7841360569000244 }, { "auxiliary_loss_clip": 0.01345151, "auxiliary_loss_mlp": 0.0087236, "balance_loss_clip": 1.00928807, "balance_loss_mlp": 1.00014102, "epoch": 0.5017735826369266, "flos": 25295019264000.0, "grad_norm": 1.6398045380449093, "language_loss": 0.78963089, "learning_rate": 2.0856606697018504e-06, "loss": 0.81180596, "num_input_tokens_seen": 90015085, "step": 4173, "time_per_iteration": 2.8172521591186523 }, { "auxiliary_loss_clip": 0.01320438, "auxiliary_loss_mlp": 0.01194321, "balance_loss_clip": 1.00852776, "balance_loss_mlp": 1.00034404, "epoch": 0.5018938255275657, "flos": 16873213213440.0, "grad_norm": 2.1032349497018075, "language_loss": 0.7368632, "learning_rate": 2.084882406392297e-06, "loss": 0.76201081, "num_input_tokens_seen": 90033045, "step": 4174, "time_per_iteration": 2.8494679927825928 }, { "auxiliary_loss_clip": 0.01339024, "auxiliary_loss_mlp": 0.01194241, "balance_loss_clip": 1.00887108, "balance_loss_mlp": 1.00035894, "epoch": 0.5020140684182047, "flos": 25515439557120.0, "grad_norm": 3.2883575087811514, "language_loss": 0.71197772, "learning_rate": 2.0841041302061496e-06, "loss": 0.73731041, "num_input_tokens_seen": 90052505, "step": 4175, "time_per_iteration": 2.7355310916900635 }, { "auxiliary_loss_clip": 0.01332784, "auxiliary_loss_mlp": 0.0119415, "balance_loss_clip": 1.00885701, "balance_loss_mlp": 1.00036383, "epoch": 0.5021343113088439, "flos": 23659290050400.0, "grad_norm": 1.7800453309630166, "language_loss": 0.75736976, "learning_rate": 2.083325841261473e-06, "loss": 0.78263909, "num_input_tokens_seen": 90071565, "step": 4176, "time_per_iteration": 2.7646377086639404 }, { "auxiliary_loss_clip": 0.01333403, "auxiliary_loss_mlp": 0.01194101, "balance_loss_clip": 1.00904584, "balance_loss_mlp": 1.00031471, "epoch": 0.502254554199483, "flos": 24534684578880.0, "grad_norm": 1.974145472890506, "language_loss": 0.66444981, "learning_rate": 2.0825475396763322e-06, "loss": 0.68972492, "num_input_tokens_seen": 90092215, "step": 4177, "time_per_iteration": 2.760646343231201 }, { "auxiliary_loss_clip": 0.01236806, "auxiliary_loss_mlp": 0.01194212, "balance_loss_clip": 1.0074755, "balance_loss_mlp": 1.00042534, "epoch": 0.502374797090122, "flos": 34240299870240.0, "grad_norm": 1.3777621082948026, "language_loss": 0.65874517, "learning_rate": 2.081769225568796e-06, "loss": 0.6830554, "num_input_tokens_seen": 90114665, "step": 4178, "time_per_iteration": 3.236637830734253 }, { "auxiliary_loss_clip": 0.01345508, "auxiliary_loss_mlp": 0.01194359, "balance_loss_clip": 1.00895, "balance_loss_mlp": 1.00047684, "epoch": 0.5024950399807612, "flos": 26031118674240.0, "grad_norm": 1.364291004406385, "language_loss": 0.76117754, "learning_rate": 2.0809908990569327e-06, "loss": 0.78657627, "num_input_tokens_seen": 90136445, "step": 4179, "time_per_iteration": 3.5282678604125977 }, { "auxiliary_loss_clip": 0.01323694, "auxiliary_loss_mlp": 0.01194391, "balance_loss_clip": 1.00872207, "balance_loss_mlp": 1.00041389, "epoch": 0.5026152828714002, "flos": 21252449047680.0, "grad_norm": 1.7134409551662733, "language_loss": 0.79105526, "learning_rate": 2.0802125602588146e-06, "loss": 0.81623614, "num_input_tokens_seen": 90155710, "step": 4180, "time_per_iteration": 2.7992703914642334 }, { "auxiliary_loss_clip": 0.01357624, "auxiliary_loss_mlp": 0.01194195, "balance_loss_clip": 1.00933313, "balance_loss_mlp": 1.00031257, "epoch": 0.5027355257620393, "flos": 30956124460320.0, "grad_norm": 1.7308443443288852, "language_loss": 0.66739047, "learning_rate": 2.0794342092925146e-06, "loss": 0.69290864, "num_input_tokens_seen": 90176845, "step": 4181, "time_per_iteration": 2.7859861850738525 }, { "auxiliary_loss_clip": 0.01343563, "auxiliary_loss_mlp": 0.01194372, "balance_loss_clip": 1.00894296, "balance_loss_mlp": 1.0003947, "epoch": 0.5028557686526784, "flos": 24791159037600.0, "grad_norm": 1.8240729252779762, "language_loss": 0.6783855, "learning_rate": 2.078655846276108e-06, "loss": 0.7037648, "num_input_tokens_seen": 90197175, "step": 4182, "time_per_iteration": 2.8409905433654785 }, { "auxiliary_loss_clip": 0.01325317, "auxiliary_loss_mlp": 0.01194274, "balance_loss_clip": 1.0088619, "balance_loss_mlp": 1.00039232, "epoch": 0.5029760115433175, "flos": 22967004320640.0, "grad_norm": 1.7878472385809092, "language_loss": 0.68676603, "learning_rate": 2.0778774713276727e-06, "loss": 0.71196193, "num_input_tokens_seen": 90216650, "step": 4183, "time_per_iteration": 2.826186418533325 }, { "auxiliary_loss_clip": 0.01346214, "auxiliary_loss_mlp": 0.01194433, "balance_loss_clip": 1.00946736, "balance_loss_mlp": 1.00045609, "epoch": 0.5030962544339566, "flos": 15305173718400.0, "grad_norm": 2.6054322332844975, "language_loss": 0.67580408, "learning_rate": 2.077099084565287e-06, "loss": 0.70121056, "num_input_tokens_seen": 90234055, "step": 4184, "time_per_iteration": 2.7037558555603027 }, { "auxiliary_loss_clip": 0.01332984, "auxiliary_loss_mlp": 0.01194413, "balance_loss_clip": 1.00922418, "balance_loss_mlp": 1.00053072, "epoch": 0.5032164973245957, "flos": 24494858426880.0, "grad_norm": 1.9894697139784117, "language_loss": 0.65113699, "learning_rate": 2.0763206861070313e-06, "loss": 0.67641091, "num_input_tokens_seen": 90253115, "step": 4185, "time_per_iteration": 2.798614263534546 }, { "auxiliary_loss_clip": 0.01357275, "auxiliary_loss_mlp": 0.01194115, "balance_loss_clip": 1.0089705, "balance_loss_mlp": 1.00032806, "epoch": 0.5033367402152348, "flos": 16213461128640.0, "grad_norm": 1.8470893266420305, "language_loss": 0.75079805, "learning_rate": 2.0755422760709876e-06, "loss": 0.77631199, "num_input_tokens_seen": 90270515, "step": 4186, "time_per_iteration": 2.6481544971466064 }, { "auxiliary_loss_clip": 0.01282306, "auxiliary_loss_mlp": 0.01194184, "balance_loss_clip": 1.00903845, "balance_loss_mlp": 1.00039744, "epoch": 0.5034569831058738, "flos": 21391384929120.0, "grad_norm": 1.738160067257703, "language_loss": 0.76974726, "learning_rate": 2.0747638545752417e-06, "loss": 0.79451215, "num_input_tokens_seen": 90289075, "step": 4187, "time_per_iteration": 2.856394052505493 }, { "auxiliary_loss_clip": 0.01311387, "auxiliary_loss_mlp": 0.01194091, "balance_loss_clip": 1.00847697, "balance_loss_mlp": 1.00030398, "epoch": 0.503577225996513, "flos": 20558762294400.0, "grad_norm": 1.8374218718296262, "language_loss": 0.83347255, "learning_rate": 2.073985421737878e-06, "loss": 0.85852736, "num_input_tokens_seen": 90306385, "step": 4188, "time_per_iteration": 3.687849283218384 }, { "auxiliary_loss_clip": 0.01337358, "auxiliary_loss_mlp": 0.01194411, "balance_loss_clip": 1.00847435, "balance_loss_mlp": 1.00043392, "epoch": 0.5036974688871521, "flos": 27229168585440.0, "grad_norm": 2.1304487633800457, "language_loss": 0.73683947, "learning_rate": 2.0732069776769844e-06, "loss": 0.76215708, "num_input_tokens_seen": 90323795, "step": 4189, "time_per_iteration": 3.946291923522949 }, { "auxiliary_loss_clip": 0.01357322, "auxiliary_loss_mlp": 0.01194477, "balance_loss_clip": 1.00928771, "balance_loss_mlp": 1.00040472, "epoch": 0.5038177117777911, "flos": 20412174669120.0, "grad_norm": 1.876443319347382, "language_loss": 0.72910726, "learning_rate": 2.072428522510651e-06, "loss": 0.75462532, "num_input_tokens_seen": 90340360, "step": 4190, "time_per_iteration": 3.6677892208099365 }, { "auxiliary_loss_clip": 0.012957, "auxiliary_loss_mlp": 0.01194193, "balance_loss_clip": 1.00898862, "balance_loss_mlp": 1.00031126, "epoch": 0.5039379546684303, "flos": 21907998061920.0, "grad_norm": 2.324793096632163, "language_loss": 0.76210237, "learning_rate": 2.071650056356968e-06, "loss": 0.78700125, "num_input_tokens_seen": 90357900, "step": 4191, "time_per_iteration": 2.832777500152588 }, { "auxiliary_loss_clip": 0.01357936, "auxiliary_loss_mlp": 0.01194157, "balance_loss_clip": 1.00923038, "balance_loss_mlp": 1.00036991, "epoch": 0.5040581975590693, "flos": 20010729150720.0, "grad_norm": 3.6971089208485877, "language_loss": 0.79831618, "learning_rate": 2.070871579334028e-06, "loss": 0.8238371, "num_input_tokens_seen": 90377010, "step": 4192, "time_per_iteration": 3.6823971271514893 }, { "auxiliary_loss_clip": 0.01356275, "auxiliary_loss_mlp": 0.01194136, "balance_loss_clip": 1.00841916, "balance_loss_mlp": 1.00034952, "epoch": 0.5041784404497084, "flos": 20959848576000.0, "grad_norm": 1.8779550731058907, "language_loss": 0.71865821, "learning_rate": 2.0700930915599264e-06, "loss": 0.74416232, "num_input_tokens_seen": 90396740, "step": 4193, "time_per_iteration": 2.6958229541778564 }, { "auxiliary_loss_clip": 0.013569, "auxiliary_loss_mlp": 0.0119414, "balance_loss_clip": 1.00880861, "balance_loss_mlp": 1.00035369, "epoch": 0.5042986833403476, "flos": 12495091013280.0, "grad_norm": 1.8842619176607538, "language_loss": 0.7853018, "learning_rate": 2.0693145931527583e-06, "loss": 0.81081223, "num_input_tokens_seen": 90413220, "step": 4194, "time_per_iteration": 2.7120654582977295 }, { "auxiliary_loss_clip": 0.01308729, "auxiliary_loss_mlp": 0.0119421, "balance_loss_clip": 1.00804162, "balance_loss_mlp": 1.00042331, "epoch": 0.5044189262309866, "flos": 29202317814240.0, "grad_norm": 1.4970708639796964, "language_loss": 0.7809577, "learning_rate": 2.068536084230622e-06, "loss": 0.80598712, "num_input_tokens_seen": 90435085, "step": 4195, "time_per_iteration": 2.810640811920166 }, { "auxiliary_loss_clip": 0.01334024, "auxiliary_loss_mlp": 0.01194198, "balance_loss_clip": 1.00900602, "balance_loss_mlp": 1.00031638, "epoch": 0.5045391691216257, "flos": 23873208157440.0, "grad_norm": 1.857726910948309, "language_loss": 0.88765109, "learning_rate": 2.067757564911616e-06, "loss": 0.91293329, "num_input_tokens_seen": 90453660, "step": 4196, "time_per_iteration": 2.7784204483032227 }, { "auxiliary_loss_clip": 0.01327993, "auxiliary_loss_mlp": 0.00872412, "balance_loss_clip": 1.00861835, "balance_loss_mlp": 1.00015676, "epoch": 0.5046594120122648, "flos": 24645002496480.0, "grad_norm": 1.9771991210199547, "language_loss": 0.92506498, "learning_rate": 2.0669790353138407e-06, "loss": 0.94706905, "num_input_tokens_seen": 90472625, "step": 4197, "time_per_iteration": 2.791018486022949 }, { "auxiliary_loss_clip": 0.0128488, "auxiliary_loss_mlp": 0.0087244, "balance_loss_clip": 1.00760484, "balance_loss_mlp": 1.00019503, "epoch": 0.5047796549029039, "flos": 23362845744960.0, "grad_norm": 2.231513941641781, "language_loss": 0.73089623, "learning_rate": 2.0662004955553995e-06, "loss": 0.75246942, "num_input_tokens_seen": 90492325, "step": 4198, "time_per_iteration": 2.859910726547241 }, { "auxiliary_loss_clip": 0.01332026, "auxiliary_loss_mlp": 0.01194085, "balance_loss_clip": 1.00887728, "balance_loss_mlp": 1.00029802, "epoch": 0.5048998977935429, "flos": 17304102940320.0, "grad_norm": 1.8706371611764352, "language_loss": 0.77131879, "learning_rate": 2.065421945754395e-06, "loss": 0.7965799, "num_input_tokens_seen": 90510055, "step": 4199, "time_per_iteration": 2.777092933654785 }, { "auxiliary_loss_clip": 0.01274725, "auxiliary_loss_mlp": 0.01194295, "balance_loss_clip": 1.00701833, "balance_loss_mlp": 1.00041342, "epoch": 0.505020140684182, "flos": 34856992671840.0, "grad_norm": 1.6523932748124481, "language_loss": 0.78140903, "learning_rate": 2.0646433860289344e-06, "loss": 0.80609918, "num_input_tokens_seen": 90528980, "step": 4200, "time_per_iteration": 2.9120383262634277 }, { "auxiliary_loss_clip": 0.01339768, "auxiliary_loss_mlp": 0.00872479, "balance_loss_clip": 1.00822353, "balance_loss_mlp": 1.00019026, "epoch": 0.5051403835748212, "flos": 24863985842400.0, "grad_norm": 1.8047830511568255, "language_loss": 0.82587492, "learning_rate": 2.0638648164971233e-06, "loss": 0.84799743, "num_input_tokens_seen": 90547445, "step": 4201, "time_per_iteration": 2.7775774002075195 }, { "auxiliary_loss_clip": 0.01317267, "auxiliary_loss_mlp": 0.01194197, "balance_loss_clip": 1.00833321, "balance_loss_mlp": 1.0004108, "epoch": 0.5052606264654602, "flos": 20959704881280.0, "grad_norm": 1.731856270297662, "language_loss": 0.88577914, "learning_rate": 2.06308623727707e-06, "loss": 0.91089374, "num_input_tokens_seen": 90567545, "step": 4202, "time_per_iteration": 2.799386978149414 }, { "auxiliary_loss_clip": 0.01331732, "auxiliary_loss_mlp": 0.01194111, "balance_loss_clip": 1.00867283, "balance_loss_mlp": 1.00032401, "epoch": 0.5053808693560993, "flos": 19642392056160.0, "grad_norm": 2.340056124672816, "language_loss": 0.76510543, "learning_rate": 2.0623076484868846e-06, "loss": 0.79036385, "num_input_tokens_seen": 90585000, "step": 4203, "time_per_iteration": 2.7273268699645996 }, { "auxiliary_loss_clip": 0.01286606, "auxiliary_loss_mlp": 0.01193155, "balance_loss_clip": 1.0045011, "balance_loss_mlp": 1.00013149, "epoch": 0.5055011122467384, "flos": 67504945924800.0, "grad_norm": 0.8397343346910658, "language_loss": 0.60651374, "learning_rate": 2.061529050244679e-06, "loss": 0.63131142, "num_input_tokens_seen": 90644745, "step": 4204, "time_per_iteration": 3.326385736465454 }, { "auxiliary_loss_clip": 0.01295195, "auxiliary_loss_mlp": 0.01194245, "balance_loss_clip": 1.00846028, "balance_loss_mlp": 1.00036287, "epoch": 0.5056213551373775, "flos": 16872961747680.0, "grad_norm": 1.916806646935754, "language_loss": 0.74101841, "learning_rate": 2.060750442668565e-06, "loss": 0.76591277, "num_input_tokens_seen": 90662500, "step": 4205, "time_per_iteration": 2.8052620887756348 }, { "auxiliary_loss_clip": 0.01335459, "auxiliary_loss_mlp": 0.01194267, "balance_loss_clip": 1.00881028, "balance_loss_mlp": 1.00038493, "epoch": 0.5057415980280165, "flos": 15334186842720.0, "grad_norm": 2.100601442000107, "language_loss": 0.63882232, "learning_rate": 2.059971825876657e-06, "loss": 0.6641196, "num_input_tokens_seen": 90677010, "step": 4206, "time_per_iteration": 2.7738192081451416 }, { "auxiliary_loss_clip": 0.01344543, "auxiliary_loss_mlp": 0.01194315, "balance_loss_clip": 1.00904107, "balance_loss_mlp": 1.00033724, "epoch": 0.5058618409186557, "flos": 19025986644000.0, "grad_norm": 1.7487695270643826, "language_loss": 0.76224947, "learning_rate": 2.0591931999870713e-06, "loss": 0.78763807, "num_input_tokens_seen": 90695935, "step": 4207, "time_per_iteration": 2.733273983001709 }, { "auxiliary_loss_clip": 0.01301949, "auxiliary_loss_mlp": 0.01193115, "balance_loss_clip": 1.00443864, "balance_loss_mlp": 1.0000912, "epoch": 0.5059820838092948, "flos": 63453143322720.0, "grad_norm": 0.8202882394263962, "language_loss": 0.57613456, "learning_rate": 2.0584145651179234e-06, "loss": 0.60108519, "num_input_tokens_seen": 90751645, "step": 4208, "time_per_iteration": 3.379183769226074 }, { "auxiliary_loss_clip": 0.01313151, "auxiliary_loss_mlp": 0.00872361, "balance_loss_clip": 1.00810719, "balance_loss_mlp": 1.00014246, "epoch": 0.5061023266999338, "flos": 15441810484320.0, "grad_norm": 7.297206240831616, "language_loss": 0.79939091, "learning_rate": 2.0576359213873327e-06, "loss": 0.82124603, "num_input_tokens_seen": 90766795, "step": 4209, "time_per_iteration": 2.7594239711761475 }, { "auxiliary_loss_clip": 0.01327699, "auxiliary_loss_mlp": 0.01194595, "balance_loss_clip": 1.00813818, "balance_loss_mlp": 1.00042677, "epoch": 0.506222569590573, "flos": 22451073737760.0, "grad_norm": 2.5117318879677883, "language_loss": 0.70064652, "learning_rate": 2.056857268913419e-06, "loss": 0.72586942, "num_input_tokens_seen": 90786845, "step": 4210, "time_per_iteration": 2.8572328090667725 }, { "auxiliary_loss_clip": 0.01335861, "auxiliary_loss_mlp": 0.01194326, "balance_loss_clip": 1.00856137, "balance_loss_mlp": 1.00044429, "epoch": 0.506342812481212, "flos": 17558673444000.0, "grad_norm": 2.502651906761516, "language_loss": 0.84045196, "learning_rate": 2.056078607814303e-06, "loss": 0.86575383, "num_input_tokens_seen": 90802630, "step": 4211, "time_per_iteration": 2.65848708152771 }, { "auxiliary_loss_clip": 0.01333869, "auxiliary_loss_mlp": 0.01194219, "balance_loss_clip": 1.00833082, "balance_loss_mlp": 1.00033736, "epoch": 0.5064630553718511, "flos": 23402061194400.0, "grad_norm": 1.9261288408674024, "language_loss": 0.78315663, "learning_rate": 2.055299938208106e-06, "loss": 0.80843759, "num_input_tokens_seen": 90823620, "step": 4212, "time_per_iteration": 2.8836166858673096 }, { "auxiliary_loss_clip": 0.01337038, "auxiliary_loss_mlp": 0.0119441, "balance_loss_clip": 1.00902414, "balance_loss_mlp": 1.0003376, "epoch": 0.5065832982624903, "flos": 23987046595680.0, "grad_norm": 1.624596457982727, "language_loss": 0.86205471, "learning_rate": 2.0545212602129526e-06, "loss": 0.88736916, "num_input_tokens_seen": 90843475, "step": 4213, "time_per_iteration": 2.76300048828125 }, { "auxiliary_loss_clip": 0.01319383, "auxiliary_loss_mlp": 0.01194307, "balance_loss_clip": 1.00899637, "balance_loss_mlp": 1.00032997, "epoch": 0.5067035411531293, "flos": 21503067946560.0, "grad_norm": 2.48464660934112, "language_loss": 0.66610622, "learning_rate": 2.0537425739469673e-06, "loss": 0.69124317, "num_input_tokens_seen": 90862410, "step": 4214, "time_per_iteration": 3.6851208209991455 }, { "auxiliary_loss_clip": 0.01312976, "auxiliary_loss_mlp": 0.01193234, "balance_loss_clip": 1.00315094, "balance_loss_mlp": 1.00021076, "epoch": 0.5068237840437684, "flos": 65934427695840.0, "grad_norm": 0.8408270196148633, "language_loss": 0.59464127, "learning_rate": 2.052963879528276e-06, "loss": 0.61970341, "num_input_tokens_seen": 90922280, "step": 4215, "time_per_iteration": 4.265066385269165 }, { "auxiliary_loss_clip": 0.01334843, "auxiliary_loss_mlp": 0.01194482, "balance_loss_clip": 1.00900269, "balance_loss_mlp": 1.00050414, "epoch": 0.5069440269344075, "flos": 27264216888000.0, "grad_norm": 2.1650134974014965, "language_loss": 0.76569939, "learning_rate": 2.052185177075007e-06, "loss": 0.79099268, "num_input_tokens_seen": 90941850, "step": 4216, "time_per_iteration": 3.6964569091796875 }, { "auxiliary_loss_clip": 0.01345707, "auxiliary_loss_mlp": 0.01194399, "balance_loss_clip": 1.00924492, "balance_loss_mlp": 1.00042212, "epoch": 0.5070642698250466, "flos": 23366330341920.0, "grad_norm": 1.7630050182348713, "language_loss": 0.82895112, "learning_rate": 2.051406466705288e-06, "loss": 0.85435224, "num_input_tokens_seen": 90961390, "step": 4217, "time_per_iteration": 2.7698633670806885 }, { "auxiliary_loss_clip": 0.01357013, "auxiliary_loss_mlp": 0.01194123, "balance_loss_clip": 1.00875783, "balance_loss_mlp": 1.00033689, "epoch": 0.5071845127156857, "flos": 20340138185280.0, "grad_norm": 2.3045725388303637, "language_loss": 0.81193006, "learning_rate": 2.0506277485372486e-06, "loss": 0.83744144, "num_input_tokens_seen": 90980215, "step": 4218, "time_per_iteration": 2.7054731845855713 }, { "auxiliary_loss_clip": 0.01346339, "auxiliary_loss_mlp": 0.0119428, "balance_loss_clip": 1.00906372, "balance_loss_mlp": 1.00030303, "epoch": 0.5073047556063248, "flos": 12092962944960.0, "grad_norm": 1.977918169357073, "language_loss": 0.67346781, "learning_rate": 2.04984902268902e-06, "loss": 0.698874, "num_input_tokens_seen": 90997415, "step": 4219, "time_per_iteration": 3.6356661319732666 }, { "auxiliary_loss_clip": 0.01345619, "auxiliary_loss_mlp": 0.01194415, "balance_loss_clip": 1.00937903, "balance_loss_mlp": 1.00043797, "epoch": 0.5074249984969639, "flos": 19682864834400.0, "grad_norm": 1.9522878852890784, "language_loss": 0.75818485, "learning_rate": 2.0490702892787345e-06, "loss": 0.78358525, "num_input_tokens_seen": 91016475, "step": 4220, "time_per_iteration": 2.724698781967163 }, { "auxiliary_loss_clip": 0.01344695, "auxiliary_loss_mlp": 0.01194048, "balance_loss_clip": 1.00882006, "balance_loss_mlp": 1.00026131, "epoch": 0.5075452413876029, "flos": 28765716222240.0, "grad_norm": 1.65571864136421, "language_loss": 0.62224627, "learning_rate": 2.0482915484245246e-06, "loss": 0.64763367, "num_input_tokens_seen": 91038095, "step": 4221, "time_per_iteration": 2.800781488418579 }, { "auxiliary_loss_clip": 0.01286964, "auxiliary_loss_mlp": 0.01194334, "balance_loss_clip": 1.00906014, "balance_loss_mlp": 1.00045168, "epoch": 0.5076654842782421, "flos": 20339455635360.0, "grad_norm": 2.0904026843799888, "language_loss": 0.84239364, "learning_rate": 2.047512800244526e-06, "loss": 0.86720669, "num_input_tokens_seen": 91053360, "step": 4222, "time_per_iteration": 2.892885446548462 }, { "auxiliary_loss_clip": 0.01335263, "auxiliary_loss_mlp": 0.01194242, "balance_loss_clip": 1.0085578, "balance_loss_mlp": 1.00035977, "epoch": 0.5077857271688812, "flos": 26359665540480.0, "grad_norm": 1.7658941887858859, "language_loss": 0.79137719, "learning_rate": 2.046734044856873e-06, "loss": 0.81667233, "num_input_tokens_seen": 91072770, "step": 4223, "time_per_iteration": 2.8076767921447754 }, { "auxiliary_loss_clip": 0.01333142, "auxiliary_loss_mlp": 0.01194117, "balance_loss_clip": 1.0080148, "balance_loss_mlp": 1.00032997, "epoch": 0.5079059700595202, "flos": 21798973396800.0, "grad_norm": 2.428723968132525, "language_loss": 0.81193614, "learning_rate": 2.045955282379702e-06, "loss": 0.83720869, "num_input_tokens_seen": 91091430, "step": 4224, "time_per_iteration": 2.754925012588501 }, { "auxiliary_loss_clip": 0.01345114, "auxiliary_loss_mlp": 0.0119424, "balance_loss_clip": 1.00904262, "balance_loss_mlp": 1.00035763, "epoch": 0.5080262129501594, "flos": 13187951521920.0, "grad_norm": 2.3788881794651155, "language_loss": 0.75677502, "learning_rate": 2.045176512931152e-06, "loss": 0.78216851, "num_input_tokens_seen": 91106060, "step": 4225, "time_per_iteration": 2.747750997543335 }, { "auxiliary_loss_clip": 0.01294027, "auxiliary_loss_mlp": 0.01194071, "balance_loss_clip": 1.00863123, "balance_loss_mlp": 1.00028467, "epoch": 0.5081464558407984, "flos": 25301485526400.0, "grad_norm": 2.1150245704637185, "language_loss": 0.76072168, "learning_rate": 2.0443977366293604e-06, "loss": 0.78560263, "num_input_tokens_seen": 91124100, "step": 4226, "time_per_iteration": 2.9014883041381836 }, { "auxiliary_loss_clip": 0.01271585, "auxiliary_loss_mlp": 0.01194392, "balance_loss_clip": 1.00801528, "balance_loss_mlp": 1.00041485, "epoch": 0.5082666987314375, "flos": 30951238839840.0, "grad_norm": 1.5216030072045108, "language_loss": 0.76868337, "learning_rate": 2.043618953592468e-06, "loss": 0.79334319, "num_input_tokens_seen": 91146555, "step": 4227, "time_per_iteration": 3.1399199962615967 }, { "auxiliary_loss_clip": 0.01310874, "auxiliary_loss_mlp": 0.01194267, "balance_loss_clip": 1.00780153, "balance_loss_mlp": 1.00028968, "epoch": 0.5083869416220766, "flos": 19682505597600.0, "grad_norm": 1.4249462349109223, "language_loss": 0.80974722, "learning_rate": 2.0428401639386144e-06, "loss": 0.83479863, "num_input_tokens_seen": 91167120, "step": 4228, "time_per_iteration": 2.9730138778686523 }, { "auxiliary_loss_clip": 0.01282547, "auxiliary_loss_mlp": 0.01193152, "balance_loss_clip": 1.00275755, "balance_loss_mlp": 1.00012815, "epoch": 0.5085071845127157, "flos": 71817569750880.0, "grad_norm": 0.8772265331661554, "language_loss": 0.5812313, "learning_rate": 2.042061367785943e-06, "loss": 0.60598832, "num_input_tokens_seen": 91220260, "step": 4229, "time_per_iteration": 3.3233399391174316 }, { "auxiliary_loss_clip": 0.01310226, "auxiliary_loss_mlp": 0.01194327, "balance_loss_clip": 1.00809503, "balance_loss_mlp": 1.00034952, "epoch": 0.5086274274033548, "flos": 35951622012000.0, "grad_norm": 2.0606250544686624, "language_loss": 0.75177813, "learning_rate": 2.041282565252594e-06, "loss": 0.77682364, "num_input_tokens_seen": 91240425, "step": 4230, "time_per_iteration": 2.890125036239624 }, { "auxiliary_loss_clip": 0.01314113, "auxiliary_loss_mlp": 0.01194308, "balance_loss_clip": 1.00801897, "balance_loss_mlp": 1.00042605, "epoch": 0.5087476702939938, "flos": 23513744211840.0, "grad_norm": 1.6601735402824354, "language_loss": 0.7717731, "learning_rate": 2.040503756456714e-06, "loss": 0.79685736, "num_input_tokens_seen": 91259635, "step": 4231, "time_per_iteration": 2.8505351543426514 }, { "auxiliary_loss_clip": 0.01344806, "auxiliary_loss_mlp": 0.01194183, "balance_loss_clip": 1.00870872, "balance_loss_mlp": 1.00039625, "epoch": 0.508867913184633, "flos": 15122100843360.0, "grad_norm": 1.963489645874498, "language_loss": 0.78651702, "learning_rate": 2.0397249415164456e-06, "loss": 0.81190693, "num_input_tokens_seen": 91276990, "step": 4232, "time_per_iteration": 2.6777756214141846 }, { "auxiliary_loss_clip": 0.0133213, "auxiliary_loss_mlp": 0.01194341, "balance_loss_clip": 1.00925326, "balance_loss_mlp": 1.00036407, "epoch": 0.508988156075272, "flos": 25885321369920.0, "grad_norm": 1.539539963784199, "language_loss": 0.79991698, "learning_rate": 2.0389461205499354e-06, "loss": 0.82518172, "num_input_tokens_seen": 91296125, "step": 4233, "time_per_iteration": 2.9303340911865234 }, { "auxiliary_loss_clip": 0.01302763, "auxiliary_loss_mlp": 0.01194074, "balance_loss_clip": 1.00772929, "balance_loss_mlp": 1.00028718, "epoch": 0.5091083989659111, "flos": 13844865636000.0, "grad_norm": 2.240475816712426, "language_loss": 0.7368933, "learning_rate": 2.03816729367533e-06, "loss": 0.76186168, "num_input_tokens_seen": 91314280, "step": 4234, "time_per_iteration": 2.777521848678589 }, { "auxiliary_loss_clip": 0.01321343, "auxiliary_loss_mlp": 0.01194273, "balance_loss_clip": 1.00926125, "balance_loss_mlp": 1.00020039, "epoch": 0.5092286418565503, "flos": 21104891483040.0, "grad_norm": 2.085636966342053, "language_loss": 0.71761453, "learning_rate": 2.0373884610107765e-06, "loss": 0.74277067, "num_input_tokens_seen": 91334595, "step": 4235, "time_per_iteration": 2.7604405879974365 }, { "auxiliary_loss_clip": 0.01344271, "auxiliary_loss_mlp": 0.01194142, "balance_loss_clip": 1.00820565, "balance_loss_mlp": 1.00035572, "epoch": 0.5093488847471893, "flos": 18621307994400.0, "grad_norm": 2.283674108403933, "language_loss": 0.69825566, "learning_rate": 2.0366096226744225e-06, "loss": 0.72363973, "num_input_tokens_seen": 91349790, "step": 4236, "time_per_iteration": 2.677830457687378 }, { "auxiliary_loss_clip": 0.01344417, "auxiliary_loss_mlp": 0.01194213, "balance_loss_clip": 1.00855851, "balance_loss_mlp": 1.00042629, "epoch": 0.5094691276378284, "flos": 23803794102240.0, "grad_norm": 1.6525081887357629, "language_loss": 0.76876074, "learning_rate": 2.035830778784418e-06, "loss": 0.79414701, "num_input_tokens_seen": 91370465, "step": 4237, "time_per_iteration": 2.743771553039551 }, { "auxiliary_loss_clip": 0.01305667, "auxiliary_loss_mlp": 0.01194212, "balance_loss_clip": 1.00948238, "balance_loss_mlp": 1.00032961, "epoch": 0.5095893705284675, "flos": 17420420112480.0, "grad_norm": 1.8387375689030394, "language_loss": 0.79823875, "learning_rate": 2.0350519294589134e-06, "loss": 0.82323754, "num_input_tokens_seen": 91388505, "step": 4238, "time_per_iteration": 2.654873847961426 }, { "auxiliary_loss_clip": 0.01275327, "auxiliary_loss_mlp": 0.01194289, "balance_loss_clip": 1.00727677, "balance_loss_mlp": 1.00040746, "epoch": 0.5097096134191066, "flos": 25849374975360.0, "grad_norm": 1.6467965786674597, "language_loss": 0.82668465, "learning_rate": 2.0342730748160588e-06, "loss": 0.85138083, "num_input_tokens_seen": 91408970, "step": 4239, "time_per_iteration": 2.8655247688293457 }, { "auxiliary_loss_clip": 0.0131918, "auxiliary_loss_mlp": 0.01194267, "balance_loss_clip": 1.008605, "balance_loss_mlp": 1.00038528, "epoch": 0.5098298563097456, "flos": 27745135092000.0, "grad_norm": 2.560440677176127, "language_loss": 0.70867777, "learning_rate": 2.033494214974006e-06, "loss": 0.73381221, "num_input_tokens_seen": 91430115, "step": 4240, "time_per_iteration": 3.802447557449341 }, { "auxiliary_loss_clip": 0.013216, "auxiliary_loss_mlp": 0.01194116, "balance_loss_clip": 1.00826502, "balance_loss_mlp": 1.00032902, "epoch": 0.5099500992003848, "flos": 21358923131520.0, "grad_norm": 1.7795169926144698, "language_loss": 0.83850175, "learning_rate": 2.0327153500509067e-06, "loss": 0.86365891, "num_input_tokens_seen": 91449140, "step": 4241, "time_per_iteration": 3.7610743045806885 }, { "auxiliary_loss_clip": 0.01313178, "auxiliary_loss_mlp": 0.01194466, "balance_loss_clip": 1.00792623, "balance_loss_mlp": 1.00039303, "epoch": 0.5100703420910239, "flos": 19866009556800.0, "grad_norm": 2.0780928461471064, "language_loss": 0.85033691, "learning_rate": 2.031936480164916e-06, "loss": 0.8754133, "num_input_tokens_seen": 91466880, "step": 4242, "time_per_iteration": 3.7526047229766846 }, { "auxiliary_loss_clip": 0.01306179, "auxiliary_loss_mlp": 0.01194137, "balance_loss_clip": 1.00792825, "balance_loss_mlp": 1.00035048, "epoch": 0.5101905849816629, "flos": 24648810406560.0, "grad_norm": 1.7251185150834338, "language_loss": 0.80489278, "learning_rate": 2.0311576054341857e-06, "loss": 0.82989597, "num_input_tokens_seen": 91487495, "step": 4243, "time_per_iteration": 2.9023759365081787 }, { "auxiliary_loss_clip": 0.01357667, "auxiliary_loss_mlp": 0.01194247, "balance_loss_clip": 1.00939596, "balance_loss_mlp": 1.00026941, "epoch": 0.5103108278723021, "flos": 22930087986720.0, "grad_norm": 1.5406413887071109, "language_loss": 0.6270622, "learning_rate": 2.0303787259768715e-06, "loss": 0.65258133, "num_input_tokens_seen": 91508395, "step": 4244, "time_per_iteration": 2.715620279312134 }, { "auxiliary_loss_clip": 0.013178, "auxiliary_loss_mlp": 0.01194426, "balance_loss_clip": 1.00832129, "balance_loss_mlp": 1.00044823, "epoch": 0.5104310707629411, "flos": 21506624390880.0, "grad_norm": 2.368459636795571, "language_loss": 0.69054317, "learning_rate": 2.0295998419111294e-06, "loss": 0.71566546, "num_input_tokens_seen": 91525685, "step": 4245, "time_per_iteration": 3.702000379562378 }, { "auxiliary_loss_clip": 0.01282019, "auxiliary_loss_mlp": 0.01194075, "balance_loss_clip": 1.00830293, "balance_loss_mlp": 1.00028801, "epoch": 0.5105513136535802, "flos": 14903189344800.0, "grad_norm": 2.1356348063279205, "language_loss": 0.73951638, "learning_rate": 2.028820953355115e-06, "loss": 0.76427734, "num_input_tokens_seen": 91543785, "step": 4246, "time_per_iteration": 2.9048054218292236 }, { "auxiliary_loss_clip": 0.01324222, "auxiliary_loss_mlp": 0.01194266, "balance_loss_clip": 1.00856578, "balance_loss_mlp": 1.00038409, "epoch": 0.5106715565442194, "flos": 22602223670400.0, "grad_norm": 1.6813406838569438, "language_loss": 0.78440619, "learning_rate": 2.0280420604269834e-06, "loss": 0.80959111, "num_input_tokens_seen": 91563325, "step": 4247, "time_per_iteration": 2.801862955093384 }, { "auxiliary_loss_clip": 0.01307033, "auxiliary_loss_mlp": 0.01193114, "balance_loss_clip": 1.00322545, "balance_loss_mlp": 1.00009084, "epoch": 0.5107917994348584, "flos": 71027683086240.0, "grad_norm": 0.7018419521800385, "language_loss": 0.58965278, "learning_rate": 2.027263163244895e-06, "loss": 0.61465418, "num_input_tokens_seen": 91632450, "step": 4248, "time_per_iteration": 3.4483628273010254 }, { "auxiliary_loss_clip": 0.01332796, "auxiliary_loss_mlp": 0.01194232, "balance_loss_clip": 1.00832915, "balance_loss_mlp": 1.00035024, "epoch": 0.5109120423254975, "flos": 24827428745280.0, "grad_norm": 1.5034219206429342, "language_loss": 0.74618411, "learning_rate": 2.026484261927005e-06, "loss": 0.77145445, "num_input_tokens_seen": 91651945, "step": 4249, "time_per_iteration": 2.7377002239227295 }, { "auxiliary_loss_clip": 0.01339724, "auxiliary_loss_mlp": 0.01194344, "balance_loss_clip": 1.00870728, "balance_loss_mlp": 1.00036693, "epoch": 0.5110322852161366, "flos": 21247671198240.0, "grad_norm": 1.9848078330815935, "language_loss": 0.74149823, "learning_rate": 2.025705356591475e-06, "loss": 0.76683891, "num_input_tokens_seen": 91669635, "step": 4250, "time_per_iteration": 2.71581768989563 }, { "auxiliary_loss_clip": 0.01270152, "auxiliary_loss_mlp": 0.00871959, "balance_loss_clip": 1.0028131, "balance_loss_mlp": 1.00038195, "epoch": 0.5111525281067757, "flos": 66457147854240.0, "grad_norm": 0.7632023176170292, "language_loss": 0.58014238, "learning_rate": 2.024926447356462e-06, "loss": 0.60156351, "num_input_tokens_seen": 91731920, "step": 4251, "time_per_iteration": 3.265477180480957 }, { "auxiliary_loss_clip": 0.01331871, "auxiliary_loss_mlp": 0.01194627, "balance_loss_clip": 1.00890791, "balance_loss_mlp": 1.00036395, "epoch": 0.5112727709974147, "flos": 14866740018720.0, "grad_norm": 1.7543471780826663, "language_loss": 0.78657824, "learning_rate": 2.024147534340127e-06, "loss": 0.81184316, "num_input_tokens_seen": 91749780, "step": 4252, "time_per_iteration": 2.680349111557007 }, { "auxiliary_loss_clip": 0.01319085, "auxiliary_loss_mlp": 0.01194104, "balance_loss_clip": 1.00882578, "balance_loss_mlp": 1.00031734, "epoch": 0.5113930138880539, "flos": 21177610516800.0, "grad_norm": 1.5837315511392125, "language_loss": 0.79882914, "learning_rate": 2.02336861766063e-06, "loss": 0.82396102, "num_input_tokens_seen": 91768840, "step": 4253, "time_per_iteration": 2.8442800045013428 }, { "auxiliary_loss_clip": 0.01345454, "auxiliary_loss_mlp": 0.01194509, "balance_loss_clip": 1.00926948, "balance_loss_mlp": 1.00043631, "epoch": 0.511513256778693, "flos": 20409121156320.0, "grad_norm": 1.6695815301290142, "language_loss": 0.7864179, "learning_rate": 2.0225896974361327e-06, "loss": 0.81181753, "num_input_tokens_seen": 91788945, "step": 4254, "time_per_iteration": 2.7987124919891357 }, { "auxiliary_loss_clip": 0.01271963, "auxiliary_loss_mlp": 0.01193153, "balance_loss_clip": 1.00331545, "balance_loss_mlp": 1.00012922, "epoch": 0.511633499669332, "flos": 69880007679840.0, "grad_norm": 0.854689897280824, "language_loss": 0.59951282, "learning_rate": 2.0218107737847962e-06, "loss": 0.62416399, "num_input_tokens_seen": 91850990, "step": 4255, "time_per_iteration": 3.3673274517059326 }, { "auxiliary_loss_clip": 0.01356673, "auxiliary_loss_mlp": 0.01194189, "balance_loss_clip": 1.0089879, "balance_loss_mlp": 1.0003072, "epoch": 0.5117537425599712, "flos": 24097867444800.0, "grad_norm": 1.8066692732936278, "language_loss": 0.74720263, "learning_rate": 2.0210318468247826e-06, "loss": 0.77271128, "num_input_tokens_seen": 91869960, "step": 4256, "time_per_iteration": 2.7071239948272705 }, { "auxiliary_loss_clip": 0.01320746, "auxiliary_loss_mlp": 0.01194201, "balance_loss_clip": 1.00756884, "balance_loss_mlp": 1.00031877, "epoch": 0.5118739854506102, "flos": 20959561186560.0, "grad_norm": 1.8998463620643633, "language_loss": 0.81675947, "learning_rate": 2.020252916674255e-06, "loss": 0.84190893, "num_input_tokens_seen": 91889075, "step": 4257, "time_per_iteration": 2.759079933166504 }, { "auxiliary_loss_clip": 0.01344913, "auxiliary_loss_mlp": 0.01194282, "balance_loss_clip": 1.00909019, "balance_loss_mlp": 1.00030494, "epoch": 0.5119942283412493, "flos": 17457336446400.0, "grad_norm": 1.7392110012857511, "language_loss": 0.81414843, "learning_rate": 2.019473983451375e-06, "loss": 0.83954036, "num_input_tokens_seen": 91907495, "step": 4258, "time_per_iteration": 2.724201202392578 }, { "auxiliary_loss_clip": 0.01299117, "auxiliary_loss_mlp": 0.01194284, "balance_loss_clip": 1.00892448, "balance_loss_mlp": 1.00040185, "epoch": 0.5121144712318885, "flos": 21066753744000.0, "grad_norm": 1.7321772471209258, "language_loss": 0.71644384, "learning_rate": 2.0186950472743076e-06, "loss": 0.74137783, "num_input_tokens_seen": 91927400, "step": 4259, "time_per_iteration": 2.779510021209717 }, { "auxiliary_loss_clip": 0.01357622, "auxiliary_loss_mlp": 0.01194244, "balance_loss_clip": 1.00913358, "balance_loss_mlp": 1.00036216, "epoch": 0.5122347141225275, "flos": 19860800623200.0, "grad_norm": 2.188537098757429, "language_loss": 0.74115384, "learning_rate": 2.0179161082612162e-06, "loss": 0.76667249, "num_input_tokens_seen": 91946790, "step": 4260, "time_per_iteration": 2.7675764560699463 }, { "auxiliary_loss_clip": 0.01320346, "auxiliary_loss_mlp": 0.01194058, "balance_loss_clip": 1.00807881, "balance_loss_mlp": 1.00027156, "epoch": 0.5123549570131666, "flos": 22528498773600.0, "grad_norm": 1.9130925157620662, "language_loss": 0.72897863, "learning_rate": 2.017137166530266e-06, "loss": 0.75412267, "num_input_tokens_seen": 91966325, "step": 4261, "time_per_iteration": 2.7829837799072266 }, { "auxiliary_loss_clip": 0.01326205, "auxiliary_loss_mlp": 0.01194316, "balance_loss_clip": 1.00840998, "balance_loss_mlp": 1.00043428, "epoch": 0.5124751999038056, "flos": 20333384533440.0, "grad_norm": 1.948605866367934, "language_loss": 0.7998786, "learning_rate": 2.0163582221996213e-06, "loss": 0.82508385, "num_input_tokens_seen": 91984700, "step": 4262, "time_per_iteration": 2.847703695297241 }, { "auxiliary_loss_clip": 0.01313194, "auxiliary_loss_mlp": 0.01194208, "balance_loss_clip": 1.00784922, "balance_loss_mlp": 1.00032616, "epoch": 0.5125954427944448, "flos": 39785985986400.0, "grad_norm": 1.7503098977318532, "language_loss": 0.68011671, "learning_rate": 2.015579275387446e-06, "loss": 0.70519078, "num_input_tokens_seen": 92010020, "step": 4263, "time_per_iteration": 2.9678797721862793 }, { "auxiliary_loss_clip": 0.01320986, "auxiliary_loss_mlp": 0.01194343, "balance_loss_clip": 1.0083425, "balance_loss_mlp": 1.00046098, "epoch": 0.5127156856850839, "flos": 29205407250720.0, "grad_norm": 1.8663193638509956, "language_loss": 0.69027585, "learning_rate": 2.0148003262119085e-06, "loss": 0.71542919, "num_input_tokens_seen": 92030990, "step": 4264, "time_per_iteration": 2.8355319499969482 }, { "auxiliary_loss_clip": 0.01295335, "auxiliary_loss_mlp": 0.01194433, "balance_loss_clip": 1.00828743, "balance_loss_mlp": 1.00045586, "epoch": 0.5128359285757229, "flos": 13553702111520.0, "grad_norm": 1.9496086451886185, "language_loss": 0.76690924, "learning_rate": 2.0140213747911728e-06, "loss": 0.79180682, "num_input_tokens_seen": 92049525, "step": 4265, "time_per_iteration": 2.9323489665985107 }, { "auxiliary_loss_clip": 0.0128778, "auxiliary_loss_mlp": 0.01194312, "balance_loss_clip": 1.00751758, "balance_loss_mlp": 1.00033474, "epoch": 0.5129561714663621, "flos": 25192101624480.0, "grad_norm": 1.984783286745118, "language_loss": 0.80835164, "learning_rate": 2.013242421243406e-06, "loss": 0.83317256, "num_input_tokens_seen": 92068430, "step": 4266, "time_per_iteration": 3.7836287021636963 }, { "auxiliary_loss_clip": 0.01275116, "auxiliary_loss_mlp": 0.01194152, "balance_loss_clip": 1.00733805, "balance_loss_mlp": 1.00026989, "epoch": 0.5130764143570011, "flos": 18150232878720.0, "grad_norm": 1.4831468344349652, "language_loss": 0.791462, "learning_rate": 2.012463465686774e-06, "loss": 0.81615472, "num_input_tokens_seen": 92088180, "step": 4267, "time_per_iteration": 3.775254011154175 }, { "auxiliary_loss_clip": 0.01254799, "auxiliary_loss_mlp": 0.01193151, "balance_loss_clip": 1.0085299, "balance_loss_mlp": 1.00012708, "epoch": 0.5131966572476402, "flos": 59794932162240.0, "grad_norm": 0.7615561046298779, "language_loss": 0.54755795, "learning_rate": 2.0116845082394446e-06, "loss": 0.5720374, "num_input_tokens_seen": 92153015, "step": 4268, "time_per_iteration": 4.360478401184082 }, { "auxiliary_loss_clip": 0.01345995, "auxiliary_loss_mlp": 0.01194113, "balance_loss_clip": 1.00924659, "balance_loss_mlp": 1.00032628, "epoch": 0.5133169001382794, "flos": 18515229071040.0, "grad_norm": 2.309232016435751, "language_loss": 0.78582627, "learning_rate": 2.0109055490195836e-06, "loss": 0.81122732, "num_input_tokens_seen": 92171470, "step": 4269, "time_per_iteration": 2.8377859592437744 }, { "auxiliary_loss_clip": 0.0128506, "auxiliary_loss_mlp": 0.01194135, "balance_loss_clip": 1.00812364, "balance_loss_mlp": 1.00034857, "epoch": 0.5134371430289184, "flos": 15523546361760.0, "grad_norm": 1.8249516965790487, "language_loss": 0.64169765, "learning_rate": 2.0101265881453605e-06, "loss": 0.66648966, "num_input_tokens_seen": 92189945, "step": 4270, "time_per_iteration": 3.075244426727295 }, { "auxiliary_loss_clip": 0.01322672, "auxiliary_loss_mlp": 0.01194078, "balance_loss_clip": 1.00909507, "balance_loss_mlp": 1.00029159, "epoch": 0.5135573859195575, "flos": 21433797586080.0, "grad_norm": 1.9166502241728354, "language_loss": 0.78401959, "learning_rate": 2.009347625734941e-06, "loss": 0.80918705, "num_input_tokens_seen": 92209855, "step": 4271, "time_per_iteration": 3.8152995109558105 }, { "auxiliary_loss_clip": 0.01358339, "auxiliary_loss_mlp": 0.01194277, "balance_loss_clip": 1.00961757, "balance_loss_mlp": 1.0003947, "epoch": 0.5136776288101966, "flos": 17712661347360.0, "grad_norm": 2.2599241095416094, "language_loss": 0.75413072, "learning_rate": 2.0085686619064954e-06, "loss": 0.77965689, "num_input_tokens_seen": 92226295, "step": 4272, "time_per_iteration": 2.694101572036743 }, { "auxiliary_loss_clip": 0.01343721, "auxiliary_loss_mlp": 0.01194448, "balance_loss_clip": 1.0093472, "balance_loss_mlp": 1.00047088, "epoch": 0.5137978717008357, "flos": 16581690452160.0, "grad_norm": 1.7907533770198631, "language_loss": 0.8299731, "learning_rate": 2.00778969677819e-06, "loss": 0.85535479, "num_input_tokens_seen": 92243330, "step": 4273, "time_per_iteration": 2.7339258193969727 }, { "auxiliary_loss_clip": 0.0132241, "auxiliary_loss_mlp": 0.01194253, "balance_loss_clip": 1.00877714, "balance_loss_mlp": 1.00037074, "epoch": 0.5139181145914747, "flos": 20668253967360.0, "grad_norm": 1.660017763756602, "language_loss": 0.63767266, "learning_rate": 2.0070107304681934e-06, "loss": 0.66283929, "num_input_tokens_seen": 92262285, "step": 4274, "time_per_iteration": 2.8519747257232666 }, { "auxiliary_loss_clip": 0.01293036, "auxiliary_loss_mlp": 0.01194162, "balance_loss_clip": 1.00757396, "balance_loss_mlp": 1.00037479, "epoch": 0.5140383574821139, "flos": 32927010497280.0, "grad_norm": 1.634969145752131, "language_loss": 0.78087211, "learning_rate": 2.006231763094675e-06, "loss": 0.80574405, "num_input_tokens_seen": 92283305, "step": 4275, "time_per_iteration": 2.8837897777557373 }, { "auxiliary_loss_clip": 0.01309171, "auxiliary_loss_mlp": 0.01194184, "balance_loss_clip": 1.00867212, "balance_loss_mlp": 1.00039768, "epoch": 0.514158600372753, "flos": 19537103453760.0, "grad_norm": 1.8625694563894895, "language_loss": 0.87504888, "learning_rate": 2.0054527947758027e-06, "loss": 0.90008235, "num_input_tokens_seen": 92302105, "step": 4276, "time_per_iteration": 2.7495057582855225 }, { "auxiliary_loss_clip": 0.01306536, "auxiliary_loss_mlp": 0.01193098, "balance_loss_clip": 1.00329411, "balance_loss_mlp": 1.00007427, "epoch": 0.514278843263392, "flos": 62523889692480.0, "grad_norm": 0.7216382499937165, "language_loss": 0.55910134, "learning_rate": 2.004673825629746e-06, "loss": 0.58409774, "num_input_tokens_seen": 92362885, "step": 4277, "time_per_iteration": 3.2857613563537598 }, { "auxiliary_loss_clip": 0.0133239, "auxiliary_loss_mlp": 0.01194224, "balance_loss_clip": 1.0089314, "balance_loss_mlp": 1.00034249, "epoch": 0.5143990861540312, "flos": 25882339704480.0, "grad_norm": 1.455388369227777, "language_loss": 0.72225165, "learning_rate": 2.0038948557746744e-06, "loss": 0.7475177, "num_input_tokens_seen": 92384740, "step": 4278, "time_per_iteration": 2.8348915576934814 }, { "auxiliary_loss_clip": 0.01331899, "auxiliary_loss_mlp": 0.01194214, "balance_loss_clip": 1.00854778, "balance_loss_mlp": 1.000332, "epoch": 0.5145193290446702, "flos": 23330671336800.0, "grad_norm": 1.5570047813297683, "language_loss": 0.75014889, "learning_rate": 2.0031158853287558e-06, "loss": 0.77541, "num_input_tokens_seen": 92405175, "step": 4279, "time_per_iteration": 2.800262451171875 }, { "auxiliary_loss_clip": 0.01310311, "auxiliary_loss_mlp": 0.01194249, "balance_loss_clip": 1.00769329, "balance_loss_mlp": 1.0003674, "epoch": 0.5146395719353093, "flos": 22856614555680.0, "grad_norm": 1.9247552386544007, "language_loss": 0.70187086, "learning_rate": 2.0023369144101593e-06, "loss": 0.72691649, "num_input_tokens_seen": 92423345, "step": 4280, "time_per_iteration": 2.8310470581054688 }, { "auxiliary_loss_clip": 0.01320053, "auxiliary_loss_mlp": 0.01194322, "balance_loss_clip": 1.00825024, "balance_loss_mlp": 1.00034428, "epoch": 0.5147598148259485, "flos": 26391588482880.0, "grad_norm": 1.5363349022476862, "language_loss": 0.77043998, "learning_rate": 2.0015579431370555e-06, "loss": 0.79558378, "num_input_tokens_seen": 92445025, "step": 4281, "time_per_iteration": 2.861954689025879 }, { "auxiliary_loss_clip": 0.013324, "auxiliary_loss_mlp": 0.01194222, "balance_loss_clip": 1.00810766, "balance_loss_mlp": 1.00043535, "epoch": 0.5148800577165875, "flos": 29965706012160.0, "grad_norm": 2.8491471703588602, "language_loss": 0.70128453, "learning_rate": 2.000778971627612e-06, "loss": 0.72655082, "num_input_tokens_seen": 92464490, "step": 4282, "time_per_iteration": 2.859969139099121 }, { "auxiliary_loss_clip": 0.01332491, "auxiliary_loss_mlp": 0.0119406, "balance_loss_clip": 1.00914121, "balance_loss_mlp": 1.0002737, "epoch": 0.5150003006072266, "flos": 17931393227520.0, "grad_norm": 1.8991814306831718, "language_loss": 0.90192962, "learning_rate": 2e-06, "loss": 0.92719513, "num_input_tokens_seen": 92482085, "step": 4283, "time_per_iteration": 2.7992005348205566 }, { "auxiliary_loss_clip": 0.01356382, "auxiliary_loss_mlp": 0.01194223, "balance_loss_clip": 1.0091244, "balance_loss_mlp": 1.00034142, "epoch": 0.5151205434978657, "flos": 18478743821280.0, "grad_norm": 1.7991963613295587, "language_loss": 0.8519361, "learning_rate": 1.9992210283723878e-06, "loss": 0.87744218, "num_input_tokens_seen": 92499325, "step": 4284, "time_per_iteration": 2.702756404876709 }, { "auxiliary_loss_clip": 0.0135607, "auxiliary_loss_mlp": 0.01194047, "balance_loss_clip": 1.00914073, "balance_loss_mlp": 1.00026059, "epoch": 0.5152407863885048, "flos": 25341275754720.0, "grad_norm": 1.474011860653112, "language_loss": 0.79377806, "learning_rate": 1.9984420568629448e-06, "loss": 0.81927931, "num_input_tokens_seen": 92522090, "step": 4285, "time_per_iteration": 2.774486541748047 }, { "auxiliary_loss_clip": 0.01338048, "auxiliary_loss_mlp": 0.01194172, "balance_loss_clip": 1.00824583, "balance_loss_mlp": 1.00029016, "epoch": 0.5153610292791438, "flos": 18329749309440.0, "grad_norm": 2.174889216874076, "language_loss": 0.78476012, "learning_rate": 1.9976630855898405e-06, "loss": 0.81008226, "num_input_tokens_seen": 92539845, "step": 4286, "time_per_iteration": 2.6993298530578613 }, { "auxiliary_loss_clip": 0.01330934, "auxiliary_loss_mlp": 0.01194071, "balance_loss_clip": 1.00863528, "balance_loss_mlp": 1.00028479, "epoch": 0.515481272169783, "flos": 30409959348000.0, "grad_norm": 3.0332275155470794, "language_loss": 0.7429322, "learning_rate": 1.9968841146712445e-06, "loss": 0.76818222, "num_input_tokens_seen": 92559460, "step": 4287, "time_per_iteration": 2.827138900756836 }, { "auxiliary_loss_clip": 0.01263675, "auxiliary_loss_mlp": 0.00872405, "balance_loss_clip": 1.00768387, "balance_loss_mlp": 1.00020766, "epoch": 0.5156015150604221, "flos": 23037316467840.0, "grad_norm": 1.5121865226196038, "language_loss": 0.71498054, "learning_rate": 1.996105144225326e-06, "loss": 0.73634136, "num_input_tokens_seen": 92579695, "step": 4288, "time_per_iteration": 2.874824047088623 }, { "auxiliary_loss_clip": 0.01334526, "auxiliary_loss_mlp": 0.01194099, "balance_loss_clip": 1.00826859, "balance_loss_mlp": 1.00031209, "epoch": 0.5157217579510611, "flos": 17858566422720.0, "grad_norm": 2.401948348049542, "language_loss": 0.78690958, "learning_rate": 1.995326174370254e-06, "loss": 0.81219584, "num_input_tokens_seen": 92598795, "step": 4289, "time_per_iteration": 2.75219988822937 }, { "auxiliary_loss_clip": 0.01336097, "auxiliary_loss_mlp": 0.0087223, "balance_loss_clip": 1.00804234, "balance_loss_mlp": 1.00015855, "epoch": 0.5158420008417003, "flos": 19171496558880.0, "grad_norm": 1.5210739292504154, "language_loss": 0.73096752, "learning_rate": 1.994547205224197e-06, "loss": 0.75305074, "num_input_tokens_seen": 92617700, "step": 4290, "time_per_iteration": 2.7560341358184814 }, { "auxiliary_loss_clip": 0.01308028, "auxiliary_loss_mlp": 0.01194162, "balance_loss_clip": 1.00814736, "balance_loss_mlp": 1.00037491, "epoch": 0.5159622437323393, "flos": 22419545955840.0, "grad_norm": 12.180842786287101, "language_loss": 0.67525882, "learning_rate": 1.993768236905325e-06, "loss": 0.70028073, "num_input_tokens_seen": 92638370, "step": 4291, "time_per_iteration": 2.7540080547332764 }, { "auxiliary_loss_clip": 0.01322561, "auxiliary_loss_mlp": 0.01194268, "balance_loss_clip": 1.00828409, "balance_loss_mlp": 1.00038624, "epoch": 0.5160824866229784, "flos": 24603020923680.0, "grad_norm": 2.2508362302410143, "language_loss": 0.65316981, "learning_rate": 1.992989269531807e-06, "loss": 0.67833811, "num_input_tokens_seen": 92657180, "step": 4292, "time_per_iteration": 3.723304510116577 }, { "auxiliary_loss_clip": 0.01322454, "auxiliary_loss_mlp": 0.01194247, "balance_loss_clip": 1.00844455, "balance_loss_mlp": 1.00036502, "epoch": 0.5162027295136175, "flos": 18002747161440.0, "grad_norm": 10.237253194087687, "language_loss": 0.67737877, "learning_rate": 1.99221030322181e-06, "loss": 0.70254582, "num_input_tokens_seen": 92673985, "step": 4293, "time_per_iteration": 3.776556968688965 }, { "auxiliary_loss_clip": 0.01323698, "auxiliary_loss_mlp": 0.01194238, "balance_loss_clip": 1.00793886, "balance_loss_mlp": 1.00035572, "epoch": 0.5163229724042566, "flos": 27344623589280.0, "grad_norm": 1.6405636338810063, "language_loss": 0.80822015, "learning_rate": 1.991431338093505e-06, "loss": 0.83339947, "num_input_tokens_seen": 92696340, "step": 4294, "time_per_iteration": 3.817089319229126 }, { "auxiliary_loss_clip": 0.0131272, "auxiliary_loss_mlp": 0.01194154, "balance_loss_clip": 1.00792944, "balance_loss_mlp": 1.00036705, "epoch": 0.5164432152948957, "flos": 21762775536480.0, "grad_norm": 1.562523352030292, "language_loss": 0.79282814, "learning_rate": 1.9906523742650587e-06, "loss": 0.81789696, "num_input_tokens_seen": 92715200, "step": 4295, "time_per_iteration": 2.7743773460388184 }, { "auxiliary_loss_clip": 0.01356962, "auxiliary_loss_mlp": 0.01194268, "balance_loss_clip": 1.00859046, "balance_loss_mlp": 1.00029087, "epoch": 0.5165634581855347, "flos": 25550344164960.0, "grad_norm": 2.227364830658072, "language_loss": 0.77495748, "learning_rate": 1.9898734118546397e-06, "loss": 0.80046976, "num_input_tokens_seen": 92735150, "step": 4296, "time_per_iteration": 2.7802278995513916 }, { "auxiliary_loss_clip": 0.01244909, "auxiliary_loss_mlp": 0.0119416, "balance_loss_clip": 1.00710857, "balance_loss_mlp": 1.00037301, "epoch": 0.5166837010761739, "flos": 19901201554080.0, "grad_norm": 1.4456545019696672, "language_loss": 0.80541921, "learning_rate": 1.989094450980416e-06, "loss": 0.8298099, "num_input_tokens_seen": 92755250, "step": 4297, "time_per_iteration": 4.161147117614746 }, { "auxiliary_loss_clip": 0.01333621, "auxiliary_loss_mlp": 0.0119411, "balance_loss_clip": 1.00812936, "balance_loss_mlp": 1.00032377, "epoch": 0.516803943966813, "flos": 26646087139200.0, "grad_norm": 2.6000850871924475, "language_loss": 0.76550561, "learning_rate": 1.9883154917605556e-06, "loss": 0.79078287, "num_input_tokens_seen": 92774460, "step": 4298, "time_per_iteration": 3.2950689792633057 }, { "auxiliary_loss_clip": 0.01355682, "auxiliary_loss_mlp": 0.01194068, "balance_loss_clip": 1.0086261, "balance_loss_mlp": 1.0002811, "epoch": 0.516924186857452, "flos": 19682864834400.0, "grad_norm": 1.776873703125382, "language_loss": 0.83108175, "learning_rate": 1.9875365343132262e-06, "loss": 0.8565793, "num_input_tokens_seen": 92791580, "step": 4299, "time_per_iteration": 2.7135188579559326 }, { "auxiliary_loss_clip": 0.01333185, "auxiliary_loss_mlp": 0.00872318, "balance_loss_clip": 1.00874221, "balance_loss_mlp": 1.0002594, "epoch": 0.5170444297480912, "flos": 15956591509440.0, "grad_norm": 1.9413381337568383, "language_loss": 0.84497535, "learning_rate": 1.9867575787565946e-06, "loss": 0.86703038, "num_input_tokens_seen": 92806240, "step": 4300, "time_per_iteration": 2.6961216926574707 }, { "auxiliary_loss_clip": 0.01335068, "auxiliary_loss_mlp": 0.01194168, "balance_loss_clip": 1.00870395, "balance_loss_mlp": 1.00038099, "epoch": 0.5171646726387302, "flos": 14174166899520.0, "grad_norm": 1.9560605990312194, "language_loss": 0.86072862, "learning_rate": 1.9859786252088275e-06, "loss": 0.88602102, "num_input_tokens_seen": 92823420, "step": 4301, "time_per_iteration": 2.70395827293396 }, { "auxiliary_loss_clip": 0.0129845, "auxiliary_loss_mlp": 0.01194141, "balance_loss_clip": 1.00797582, "balance_loss_mlp": 1.00035489, "epoch": 0.5172849155293693, "flos": 23578560036000.0, "grad_norm": 3.166233442648754, "language_loss": 0.66649747, "learning_rate": 1.9851996737880914e-06, "loss": 0.69142336, "num_input_tokens_seen": 92838605, "step": 4302, "time_per_iteration": 2.820277690887451 }, { "auxiliary_loss_clip": 0.01338229, "auxiliary_loss_mlp": 0.01194202, "balance_loss_clip": 1.00894809, "balance_loss_mlp": 1.00031972, "epoch": 0.5174051584200084, "flos": 14283550801440.0, "grad_norm": 2.190885249947324, "language_loss": 0.74384356, "learning_rate": 1.9844207246125537e-06, "loss": 0.76916778, "num_input_tokens_seen": 92855185, "step": 4303, "time_per_iteration": 2.720825433731079 }, { "auxiliary_loss_clip": 0.01325922, "auxiliary_loss_mlp": 0.01194052, "balance_loss_clip": 1.00899112, "balance_loss_mlp": 1.00026512, "epoch": 0.5175254013106475, "flos": 37889399625120.0, "grad_norm": 1.68110193547312, "language_loss": 0.68822837, "learning_rate": 1.983641777800379e-06, "loss": 0.71342814, "num_input_tokens_seen": 92877830, "step": 4304, "time_per_iteration": 2.9050023555755615 }, { "auxiliary_loss_clip": 0.01300927, "auxiliary_loss_mlp": 0.01193083, "balance_loss_clip": 1.00339842, "balance_loss_mlp": 1.00005937, "epoch": 0.5176456442012866, "flos": 68549775539040.0, "grad_norm": 0.7406908097766498, "language_loss": 0.58739746, "learning_rate": 1.9828628334697343e-06, "loss": 0.61233747, "num_input_tokens_seen": 92945040, "step": 4305, "time_per_iteration": 3.47706937789917 }, { "auxiliary_loss_clip": 0.01300515, "auxiliary_loss_mlp": 0.01193099, "balance_loss_clip": 1.00334549, "balance_loss_mlp": 1.00007498, "epoch": 0.5177658870919257, "flos": 64084169672640.0, "grad_norm": 0.7599856124972564, "language_loss": 0.54670399, "learning_rate": 1.982083891738784e-06, "loss": 0.57164007, "num_input_tokens_seen": 93005910, "step": 4306, "time_per_iteration": 3.4025425910949707 }, { "auxiliary_loss_clip": 0.01308833, "auxiliary_loss_mlp": 0.0119429, "balance_loss_clip": 1.00870419, "balance_loss_mlp": 1.00031245, "epoch": 0.5178861299825648, "flos": 26651260149120.0, "grad_norm": 1.4252050057588324, "language_loss": 0.82923603, "learning_rate": 1.9813049527256923e-06, "loss": 0.85426736, "num_input_tokens_seen": 93026305, "step": 4307, "time_per_iteration": 2.815336227416992 }, { "auxiliary_loss_clip": 0.01312173, "auxiliary_loss_mlp": 0.01194075, "balance_loss_clip": 1.00841403, "balance_loss_mlp": 1.00028861, "epoch": 0.5180063728732038, "flos": 17931896159040.0, "grad_norm": 2.4973969880902938, "language_loss": 0.8175745, "learning_rate": 1.9805260165486252e-06, "loss": 0.84263694, "num_input_tokens_seen": 93045675, "step": 4308, "time_per_iteration": 2.8880300521850586 }, { "auxiliary_loss_clip": 0.01332615, "auxiliary_loss_mlp": 0.01194125, "balance_loss_clip": 1.00842655, "balance_loss_mlp": 1.00033891, "epoch": 0.518126615763843, "flos": 19500905593440.0, "grad_norm": 1.9491064536244664, "language_loss": 0.85952407, "learning_rate": 1.9797470833257457e-06, "loss": 0.88479143, "num_input_tokens_seen": 93065375, "step": 4309, "time_per_iteration": 2.8911962509155273 }, { "auxiliary_loss_clip": 0.01334633, "auxiliary_loss_mlp": 0.01194115, "balance_loss_clip": 1.0092628, "balance_loss_mlp": 1.00032878, "epoch": 0.5182468586544821, "flos": 20704092590880.0, "grad_norm": 2.0407734068199397, "language_loss": 0.77413207, "learning_rate": 1.9789681531752177e-06, "loss": 0.79941952, "num_input_tokens_seen": 93085595, "step": 4310, "time_per_iteration": 2.749994993209839 }, { "auxiliary_loss_clip": 0.01274504, "auxiliary_loss_mlp": 0.01194215, "balance_loss_clip": 1.0072161, "balance_loss_mlp": 1.00042844, "epoch": 0.5183671015451211, "flos": 23112119075040.0, "grad_norm": 1.45841024248139, "language_loss": 0.72234946, "learning_rate": 1.978189226215204e-06, "loss": 0.7470367, "num_input_tokens_seen": 93106140, "step": 4311, "time_per_iteration": 2.901582717895508 }, { "auxiliary_loss_clip": 0.01356423, "auxiliary_loss_mlp": 0.01194076, "balance_loss_clip": 1.00880182, "balance_loss_mlp": 1.00028896, "epoch": 0.5184873444357603, "flos": 17597098572480.0, "grad_norm": 1.664088202672732, "language_loss": 0.77107614, "learning_rate": 1.9774103025638675e-06, "loss": 0.79658115, "num_input_tokens_seen": 93124265, "step": 4312, "time_per_iteration": 2.646230459213257 }, { "auxiliary_loss_clip": 0.01271247, "auxiliary_loss_mlp": 0.01194314, "balance_loss_clip": 1.00738406, "balance_loss_mlp": 1.00043178, "epoch": 0.5186075873263993, "flos": 24936812647200.0, "grad_norm": 1.4898095401053193, "language_loss": 0.76039016, "learning_rate": 1.9766313823393696e-06, "loss": 0.78504574, "num_input_tokens_seen": 93145130, "step": 4313, "time_per_iteration": 2.8630576133728027 }, { "auxiliary_loss_clip": 0.01287217, "auxiliary_loss_mlp": 0.01194084, "balance_loss_clip": 1.00748348, "balance_loss_mlp": 1.00029767, "epoch": 0.5187278302170384, "flos": 15190113875040.0, "grad_norm": 1.963229707691745, "language_loss": 0.69005829, "learning_rate": 1.975852465659873e-06, "loss": 0.71487129, "num_input_tokens_seen": 93161110, "step": 4314, "time_per_iteration": 2.8296449184417725 }, { "auxiliary_loss_clip": 0.01333412, "auxiliary_loss_mlp": 0.01194187, "balance_loss_clip": 1.00928998, "balance_loss_mlp": 1.0003047, "epoch": 0.5188480731076776, "flos": 25009423909920.0, "grad_norm": 2.159314259193829, "language_loss": 0.70184958, "learning_rate": 1.9750735526435377e-06, "loss": 0.72712553, "num_input_tokens_seen": 93178055, "step": 4315, "time_per_iteration": 2.7783007621765137 }, { "auxiliary_loss_clip": 0.01311099, "auxiliary_loss_mlp": 0.01194269, "balance_loss_clip": 1.00840473, "balance_loss_mlp": 1.00038719, "epoch": 0.5189683159983166, "flos": 24790153174560.0, "grad_norm": 2.5110405079816185, "language_loss": 0.79060155, "learning_rate": 1.974294643408525e-06, "loss": 0.81565523, "num_input_tokens_seen": 93195850, "step": 4316, "time_per_iteration": 2.8507161140441895 }, { "auxiliary_loss_clip": 0.0134531, "auxiliary_loss_mlp": 0.01194398, "balance_loss_clip": 1.00916719, "balance_loss_mlp": 1.00042009, "epoch": 0.5190885588889557, "flos": 24754278627360.0, "grad_norm": 1.7278866288794643, "language_loss": 0.66823095, "learning_rate": 1.9735157380729947e-06, "loss": 0.69362807, "num_input_tokens_seen": 93216260, "step": 4317, "time_per_iteration": 2.8779489994049072 }, { "auxiliary_loss_clip": 0.01321246, "auxiliary_loss_mlp": 0.01194054, "balance_loss_clip": 1.0079689, "balance_loss_mlp": 1.00026703, "epoch": 0.5192088017795948, "flos": 24712656291360.0, "grad_norm": 1.7151222590002548, "language_loss": 0.84000611, "learning_rate": 1.9727368367551053e-06, "loss": 0.86515915, "num_input_tokens_seen": 93234810, "step": 4318, "time_per_iteration": 3.857381582260132 }, { "auxiliary_loss_clip": 0.01319528, "auxiliary_loss_mlp": 0.01194135, "balance_loss_clip": 1.0087024, "balance_loss_mlp": 1.00034881, "epoch": 0.5193290446702339, "flos": 27229599669600.0, "grad_norm": 1.7543815409104704, "language_loss": 0.6835351, "learning_rate": 1.9719579395730164e-06, "loss": 0.70867175, "num_input_tokens_seen": 93254185, "step": 4319, "time_per_iteration": 4.000210523605347 }, { "auxiliary_loss_clip": 0.01357208, "auxiliary_loss_mlp": 0.01194384, "balance_loss_clip": 1.00952685, "balance_loss_mlp": 1.00040638, "epoch": 0.5194492875608729, "flos": 11473360325280.0, "grad_norm": 2.955682694109862, "language_loss": 0.93534011, "learning_rate": 1.9711790466448854e-06, "loss": 0.96085602, "num_input_tokens_seen": 93268205, "step": 4320, "time_per_iteration": 3.6326773166656494 }, { "auxiliary_loss_clip": 0.01281287, "auxiliary_loss_mlp": 0.01194194, "balance_loss_clip": 1.00790989, "balance_loss_mlp": 1.00031209, "epoch": 0.5195695304515121, "flos": 20338916780160.0, "grad_norm": 2.6135647998230143, "language_loss": 0.71602118, "learning_rate": 1.9704001580888704e-06, "loss": 0.740776, "num_input_tokens_seen": 93286945, "step": 4321, "time_per_iteration": 2.9107449054718018 }, { "auxiliary_loss_clip": 0.01321319, "auxiliary_loss_mlp": 0.00872421, "balance_loss_clip": 1.0080471, "balance_loss_mlp": 1.00019956, "epoch": 0.5196897733421512, "flos": 20048328034560.0, "grad_norm": 1.7417634390093844, "language_loss": 0.86922801, "learning_rate": 1.9696212740231283e-06, "loss": 0.89116538, "num_input_tokens_seen": 93305595, "step": 4322, "time_per_iteration": 2.758765697479248 }, { "auxiliary_loss_clip": 0.01345328, "auxiliary_loss_mlp": 0.01194202, "balance_loss_clip": 1.00908303, "balance_loss_mlp": 1.00032043, "epoch": 0.5198100162327902, "flos": 23805518438880.0, "grad_norm": 1.894329610060088, "language_loss": 0.82371062, "learning_rate": 1.9688423945658146e-06, "loss": 0.84910595, "num_input_tokens_seen": 93326460, "step": 4323, "time_per_iteration": 3.7355728149414062 }, { "auxiliary_loss_clip": 0.01297415, "auxiliary_loss_mlp": 0.0119426, "balance_loss_clip": 1.00834608, "balance_loss_mlp": 1.00037789, "epoch": 0.5199302591234293, "flos": 24023962929600.0, "grad_norm": 2.59428944742977, "language_loss": 0.72188365, "learning_rate": 1.9680635198350845e-06, "loss": 0.74680036, "num_input_tokens_seen": 93346170, "step": 4324, "time_per_iteration": 2.8752031326293945 }, { "auxiliary_loss_clip": 0.01345674, "auxiliary_loss_mlp": 0.01194222, "balance_loss_clip": 1.00953352, "balance_loss_mlp": 1.00034022, "epoch": 0.5200505020140684, "flos": 26359378151040.0, "grad_norm": 2.1112100272040637, "language_loss": 0.72794604, "learning_rate": 1.967284649949093e-06, "loss": 0.75334501, "num_input_tokens_seen": 93365380, "step": 4325, "time_per_iteration": 2.739107131958008 }, { "auxiliary_loss_clip": 0.01308764, "auxiliary_loss_mlp": 0.01194064, "balance_loss_clip": 1.00918627, "balance_loss_mlp": 1.00027764, "epoch": 0.5201707449047075, "flos": 39604278211200.0, "grad_norm": 2.3725549321551935, "language_loss": 0.72423452, "learning_rate": 1.966505785025994e-06, "loss": 0.74926275, "num_input_tokens_seen": 93387285, "step": 4326, "time_per_iteration": 2.967538356781006 }, { "auxiliary_loss_clip": 0.01280971, "auxiliary_loss_mlp": 0.01194253, "balance_loss_clip": 1.00779462, "balance_loss_mlp": 1.00046635, "epoch": 0.5202909877953465, "flos": 53682806769120.0, "grad_norm": 1.6198709629548025, "language_loss": 0.75938594, "learning_rate": 1.965726925183941e-06, "loss": 0.7841382, "num_input_tokens_seen": 93410390, "step": 4327, "time_per_iteration": 3.098250150680542 }, { "auxiliary_loss_clip": 0.0135616, "auxiliary_loss_mlp": 0.01194115, "balance_loss_clip": 1.00902987, "balance_loss_mlp": 1.00032794, "epoch": 0.5204112306859857, "flos": 19537031606400.0, "grad_norm": 2.095740776509277, "language_loss": 0.84686804, "learning_rate": 1.964948070541087e-06, "loss": 0.87237072, "num_input_tokens_seen": 93429050, "step": 4328, "time_per_iteration": 2.7050321102142334 }, { "auxiliary_loss_clip": 0.01344373, "auxiliary_loss_mlp": 0.01194174, "balance_loss_clip": 1.00874996, "balance_loss_mlp": 1.00029206, "epoch": 0.5205314735766248, "flos": 15304706710560.0, "grad_norm": 2.0642776309135242, "language_loss": 0.69521189, "learning_rate": 1.9641692212155816e-06, "loss": 0.72059739, "num_input_tokens_seen": 93446815, "step": 4329, "time_per_iteration": 2.728076934814453 }, { "auxiliary_loss_clip": 0.01273359, "auxiliary_loss_mlp": 0.01194092, "balance_loss_clip": 1.00799382, "balance_loss_mlp": 1.00030506, "epoch": 0.5206517164672638, "flos": 59263720806240.0, "grad_norm": 1.822058814452704, "language_loss": 0.72733599, "learning_rate": 1.9633903773255777e-06, "loss": 0.75201046, "num_input_tokens_seen": 93469130, "step": 4330, "time_per_iteration": 3.148989200592041 }, { "auxiliary_loss_clip": 0.01356416, "auxiliary_loss_mlp": 0.01194073, "balance_loss_clip": 1.00853479, "balance_loss_mlp": 1.00028682, "epoch": 0.520771959357903, "flos": 26871141587040.0, "grad_norm": 1.5538841913855828, "language_loss": 0.74833548, "learning_rate": 1.9626115389892237e-06, "loss": 0.77384043, "num_input_tokens_seen": 93489920, "step": 4331, "time_per_iteration": 2.765904188156128 }, { "auxiliary_loss_clip": 0.01296985, "auxiliary_loss_mlp": 0.01194352, "balance_loss_clip": 1.00885713, "balance_loss_mlp": 1.00037503, "epoch": 0.520892202248542, "flos": 26907087981600.0, "grad_norm": 2.134207843549913, "language_loss": 0.85472941, "learning_rate": 1.96183270632467e-06, "loss": 0.87964284, "num_input_tokens_seen": 93509770, "step": 4332, "time_per_iteration": 2.7877838611602783 }, { "auxiliary_loss_clip": 0.01292637, "auxiliary_loss_mlp": 0.00872494, "balance_loss_clip": 1.00860965, "balance_loss_mlp": 1.00020933, "epoch": 0.5210124451391811, "flos": 25849446822720.0, "grad_norm": 1.5693836104542322, "language_loss": 0.7891742, "learning_rate": 1.9610538794500644e-06, "loss": 0.81082541, "num_input_tokens_seen": 93529320, "step": 4333, "time_per_iteration": 2.8509559631347656 }, { "auxiliary_loss_clip": 0.01282758, "auxiliary_loss_mlp": 0.01193096, "balance_loss_clip": 1.00316012, "balance_loss_mlp": 1.00007236, "epoch": 0.5211326880298203, "flos": 70553841847200.0, "grad_norm": 0.7680041421814577, "language_loss": 0.59498966, "learning_rate": 1.9602750584835542e-06, "loss": 0.61974823, "num_input_tokens_seen": 93595255, "step": 4334, "time_per_iteration": 3.4836432933807373 }, { "auxiliary_loss_clip": 0.01332433, "auxiliary_loss_mlp": 0.01194142, "balance_loss_clip": 1.0091567, "balance_loss_mlp": 1.00035536, "epoch": 0.5212529309204593, "flos": 15628906811520.0, "grad_norm": 1.834738327465592, "language_loss": 0.82581896, "learning_rate": 1.959496243543286e-06, "loss": 0.85108465, "num_input_tokens_seen": 93613135, "step": 4335, "time_per_iteration": 2.784804344177246 }, { "auxiliary_loss_clip": 0.01334326, "auxiliary_loss_mlp": 0.01194306, "balance_loss_clip": 1.00877059, "balance_loss_mlp": 1.00032854, "epoch": 0.5213731738110984, "flos": 26242665818400.0, "grad_norm": 2.064453272452841, "language_loss": 0.79408163, "learning_rate": 1.9587174347474057e-06, "loss": 0.819368, "num_input_tokens_seen": 93629645, "step": 4336, "time_per_iteration": 2.802687644958496 }, { "auxiliary_loss_clip": 0.01284292, "auxiliary_loss_mlp": 0.01194267, "balance_loss_clip": 1.00793147, "balance_loss_mlp": 1.00038469, "epoch": 0.5214934167017375, "flos": 19418415318720.0, "grad_norm": 2.1432143214568082, "language_loss": 0.8219198, "learning_rate": 1.9579386322140574e-06, "loss": 0.84670544, "num_input_tokens_seen": 93645325, "step": 4337, "time_per_iteration": 2.9139881134033203 }, { "auxiliary_loss_clip": 0.01358153, "auxiliary_loss_mlp": 0.00872415, "balance_loss_clip": 1.00946689, "balance_loss_mlp": 1.00028133, "epoch": 0.5216136595923766, "flos": 30955801147200.0, "grad_norm": 2.958817460517779, "language_loss": 0.80732369, "learning_rate": 1.9571598360613854e-06, "loss": 0.8296293, "num_input_tokens_seen": 93668200, "step": 4338, "time_per_iteration": 2.851040840148926 }, { "auxiliary_loss_clip": 0.013329, "auxiliary_loss_mlp": 0.01194052, "balance_loss_clip": 1.0088948, "balance_loss_mlp": 1.0002656, "epoch": 0.5217339024830157, "flos": 21945058090560.0, "grad_norm": 2.0829257268321433, "language_loss": 0.69806564, "learning_rate": 1.956381046407532e-06, "loss": 0.72333515, "num_input_tokens_seen": 93688495, "step": 4339, "time_per_iteration": 2.8064966201782227 }, { "auxiliary_loss_clip": 0.01306586, "auxiliary_loss_mlp": 0.01194179, "balance_loss_clip": 1.00812709, "balance_loss_mlp": 1.00029683, "epoch": 0.5218541453736548, "flos": 20923219631520.0, "grad_norm": 1.6105121991733335, "language_loss": 0.86304075, "learning_rate": 1.9556022633706394e-06, "loss": 0.88804841, "num_input_tokens_seen": 93707285, "step": 4340, "time_per_iteration": 2.821944236755371 }, { "auxiliary_loss_clip": 0.01320572, "auxiliary_loss_mlp": 0.01194262, "balance_loss_clip": 1.00870037, "balance_loss_mlp": 1.00038028, "epoch": 0.5219743882642939, "flos": 23951674980000.0, "grad_norm": 1.593735357627888, "language_loss": 0.79799509, "learning_rate": 1.954823487068848e-06, "loss": 0.82314342, "num_input_tokens_seen": 93727495, "step": 4341, "time_per_iteration": 2.8216285705566406 }, { "auxiliary_loss_clip": 0.01335005, "auxiliary_loss_mlp": 0.01194098, "balance_loss_clip": 1.00940037, "balance_loss_mlp": 1.0003109, "epoch": 0.5220946311549329, "flos": 28799291653920.0, "grad_norm": 1.5943718623456635, "language_loss": 0.81281215, "learning_rate": 1.9540447176202976e-06, "loss": 0.83810318, "num_input_tokens_seen": 93748740, "step": 4342, "time_per_iteration": 2.766693592071533 }, { "auxiliary_loss_clip": 0.01304193, "auxiliary_loss_mlp": 0.01193058, "balance_loss_clip": 1.0029695, "balance_loss_mlp": 1.00003397, "epoch": 0.5222148740455721, "flos": 67189403430720.0, "grad_norm": 0.8690258440092137, "language_loss": 0.60806179, "learning_rate": 1.9532659551431272e-06, "loss": 0.63303423, "num_input_tokens_seen": 93815770, "step": 4343, "time_per_iteration": 3.479050636291504 }, { "auxiliary_loss_clip": 0.01344438, "auxiliary_loss_mlp": 0.01194119, "balance_loss_clip": 1.009058, "balance_loss_mlp": 1.00033283, "epoch": 0.5223351169362112, "flos": 61856185265280.0, "grad_norm": 1.6027470230055685, "language_loss": 0.67356408, "learning_rate": 1.9524871997554744e-06, "loss": 0.69894969, "num_input_tokens_seen": 93843530, "step": 4344, "time_per_iteration": 4.783653736114502 }, { "auxiliary_loss_clip": 0.01334906, "auxiliary_loss_mlp": 0.01194202, "balance_loss_clip": 1.00846469, "balance_loss_mlp": 1.00032032, "epoch": 0.5224553598268502, "flos": 14647397436000.0, "grad_norm": 2.3899072806763644, "language_loss": 0.80407155, "learning_rate": 1.951708451575475e-06, "loss": 0.82936263, "num_input_tokens_seen": 93860595, "step": 4345, "time_per_iteration": 2.71614408493042 }, { "auxiliary_loss_clip": 0.01319806, "auxiliary_loss_mlp": 0.01194217, "balance_loss_clip": 1.00892019, "balance_loss_mlp": 1.00043011, "epoch": 0.5225756027174894, "flos": 14826051698400.0, "grad_norm": 1.6886563110227866, "language_loss": 0.82141685, "learning_rate": 1.9509297107212657e-06, "loss": 0.84655708, "num_input_tokens_seen": 93877365, "step": 4346, "time_per_iteration": 3.752295732498169 }, { "auxiliary_loss_clip": 0.01356886, "auxiliary_loss_mlp": 0.0119418, "balance_loss_clip": 1.00930202, "balance_loss_mlp": 1.00039363, "epoch": 0.5226958456081284, "flos": 23512019875200.0, "grad_norm": 1.5461554088112595, "language_loss": 0.79169559, "learning_rate": 1.95015097731098e-06, "loss": 0.81720632, "num_input_tokens_seen": 93896855, "step": 4347, "time_per_iteration": 2.8040263652801514 }, { "auxiliary_loss_clip": 0.01356844, "auxiliary_loss_mlp": 0.01194216, "balance_loss_clip": 1.00897038, "balance_loss_mlp": 1.00033402, "epoch": 0.5228160884987675, "flos": 19062938901600.0, "grad_norm": 2.007754265766467, "language_loss": 0.81743634, "learning_rate": 1.949372251462751e-06, "loss": 0.84294689, "num_input_tokens_seen": 93914270, "step": 4348, "time_per_iteration": 2.7781600952148438 }, { "auxiliary_loss_clip": 0.013008, "auxiliary_loss_mlp": 0.00872258, "balance_loss_clip": 1.00849795, "balance_loss_mlp": 1.00013733, "epoch": 0.5229363313894067, "flos": 21063233223360.0, "grad_norm": 6.159474345860424, "language_loss": 0.82869703, "learning_rate": 1.9485935332947124e-06, "loss": 0.85042757, "num_input_tokens_seen": 93932180, "step": 4349, "time_per_iteration": 3.8345248699188232 }, { "auxiliary_loss_clip": 0.01311355, "auxiliary_loss_mlp": 0.01194117, "balance_loss_clip": 1.00769138, "balance_loss_mlp": 1.00033045, "epoch": 0.5230565742800457, "flos": 14830398463680.0, "grad_norm": 2.3249019610135564, "language_loss": 0.83735287, "learning_rate": 1.947814822924993e-06, "loss": 0.86240762, "num_input_tokens_seen": 93949690, "step": 4350, "time_per_iteration": 2.7745931148529053 }, { "auxiliary_loss_clip": 0.01356372, "auxiliary_loss_mlp": 0.01194087, "balance_loss_clip": 1.00918818, "balance_loss_mlp": 1.00030065, "epoch": 0.5231768171706848, "flos": 25813033420320.0, "grad_norm": 1.7293718634131534, "language_loss": 0.82616103, "learning_rate": 1.9470361204717236e-06, "loss": 0.85166562, "num_input_tokens_seen": 93968830, "step": 4351, "time_per_iteration": 2.7091736793518066 }, { "auxiliary_loss_clip": 0.01308324, "auxiliary_loss_mlp": 0.00872404, "balance_loss_clip": 1.00853252, "balance_loss_mlp": 1.00021553, "epoch": 0.5232970600613239, "flos": 22743817904160.0, "grad_norm": 1.5356097868161969, "language_loss": 0.80718368, "learning_rate": 1.9462574260530326e-06, "loss": 0.82899094, "num_input_tokens_seen": 93989110, "step": 4352, "time_per_iteration": 2.81622576713562 }, { "auxiliary_loss_clip": 0.013437, "auxiliary_loss_mlp": 0.01194136, "balance_loss_clip": 1.00870109, "balance_loss_mlp": 1.0003494, "epoch": 0.523417302951963, "flos": 17310712897440.0, "grad_norm": 1.702032079293965, "language_loss": 0.81001365, "learning_rate": 1.9454787397870472e-06, "loss": 0.835392, "num_input_tokens_seen": 94006430, "step": 4353, "time_per_iteration": 2.774703025817871 }, { "auxiliary_loss_clip": 0.0124954, "auxiliary_loss_mlp": 0.01194086, "balance_loss_clip": 1.00843668, "balance_loss_mlp": 1.00029969, "epoch": 0.523537545842602, "flos": 18551750244480.0, "grad_norm": 2.04340632375722, "language_loss": 0.71596843, "learning_rate": 1.944700061791894e-06, "loss": 0.74040473, "num_input_tokens_seen": 94024825, "step": 4354, "time_per_iteration": 2.9665393829345703 }, { "auxiliary_loss_clip": 0.01331859, "auxiliary_loss_mlp": 0.01194081, "balance_loss_clip": 1.00810969, "balance_loss_mlp": 1.00029445, "epoch": 0.5236577887332411, "flos": 19719278236800.0, "grad_norm": 1.9241365896578815, "language_loss": 0.65161777, "learning_rate": 1.943921392185698e-06, "loss": 0.67687714, "num_input_tokens_seen": 94043450, "step": 4355, "time_per_iteration": 3.366699457168579 }, { "auxiliary_loss_clip": 0.0131938, "auxiliary_loss_mlp": 0.01194151, "balance_loss_clip": 1.00908899, "balance_loss_mlp": 1.00036466, "epoch": 0.5237780316238803, "flos": 23550229461600.0, "grad_norm": 1.7877753167157866, "language_loss": 0.77179432, "learning_rate": 1.9431427310865814e-06, "loss": 0.79692972, "num_input_tokens_seen": 94063055, "step": 4356, "time_per_iteration": 2.7652299404144287 }, { "auxiliary_loss_clip": 0.01306871, "auxiliary_loss_mlp": 0.01194061, "balance_loss_clip": 1.00892448, "balance_loss_mlp": 1.00027418, "epoch": 0.5238982745145193, "flos": 22491905752800.0, "grad_norm": 1.8148371962712406, "language_loss": 0.79036599, "learning_rate": 1.942364078612667e-06, "loss": 0.81537533, "num_input_tokens_seen": 94081785, "step": 4357, "time_per_iteration": 2.8889520168304443 }, { "auxiliary_loss_clip": 0.01295101, "auxiliary_loss_mlp": 0.01194256, "balance_loss_clip": 1.00806046, "balance_loss_mlp": 1.00037432, "epoch": 0.5240185174051584, "flos": 27088939451520.0, "grad_norm": 1.6594385238471339, "language_loss": 0.75223452, "learning_rate": 1.9415854348820765e-06, "loss": 0.7771281, "num_input_tokens_seen": 94101635, "step": 4358, "time_per_iteration": 2.8486990928649902 }, { "auxiliary_loss_clip": 0.01331557, "auxiliary_loss_mlp": 0.01194186, "balance_loss_clip": 1.00920451, "balance_loss_mlp": 1.00039959, "epoch": 0.5241387602957975, "flos": 22674691238400.0, "grad_norm": 2.1972477241404933, "language_loss": 0.68308866, "learning_rate": 1.940806800012929e-06, "loss": 0.70834607, "num_input_tokens_seen": 94121705, "step": 4359, "time_per_iteration": 2.7412548065185547 }, { "auxiliary_loss_clip": 0.01273159, "auxiliary_loss_mlp": 0.00872402, "balance_loss_clip": 1.00808191, "balance_loss_mlp": 1.00023556, "epoch": 0.5242590031864366, "flos": 40553469483840.0, "grad_norm": 1.4150054563065368, "language_loss": 0.63376796, "learning_rate": 1.9400281741233432e-06, "loss": 0.65522355, "num_input_tokens_seen": 94146595, "step": 4360, "time_per_iteration": 3.0713958740234375 }, { "auxiliary_loss_clip": 0.01266443, "auxiliary_loss_mlp": 0.01193028, "balance_loss_clip": 1.00378394, "balance_loss_mlp": 1.00000465, "epoch": 0.5243792460770756, "flos": 66676346742240.0, "grad_norm": 0.6553044128825096, "language_loss": 0.52569044, "learning_rate": 1.939249557331435e-06, "loss": 0.5502851, "num_input_tokens_seen": 94212410, "step": 4361, "time_per_iteration": 3.357480525970459 }, { "auxiliary_loss_clip": 0.01312318, "auxiliary_loss_mlp": 0.01194128, "balance_loss_clip": 1.00806963, "balance_loss_mlp": 1.00034189, "epoch": 0.5244994889677148, "flos": 28183676562720.0, "grad_norm": 2.021102343000797, "language_loss": 0.72910005, "learning_rate": 1.938470949755321e-06, "loss": 0.75416446, "num_input_tokens_seen": 94232290, "step": 4362, "time_per_iteration": 2.9187328815460205 }, { "auxiliary_loss_clip": 0.01279331, "auxiliary_loss_mlp": 0.01193089, "balance_loss_clip": 1.00344896, "balance_loss_mlp": 1.00006568, "epoch": 0.5246197318583539, "flos": 65950413733440.0, "grad_norm": 0.8129326181168139, "language_loss": 0.55703199, "learning_rate": 1.937692351513115e-06, "loss": 0.58175623, "num_input_tokens_seen": 94291285, "step": 4363, "time_per_iteration": 3.3673806190490723 }, { "auxiliary_loss_clip": 0.0134559, "auxiliary_loss_mlp": 0.01194207, "balance_loss_clip": 1.00937581, "balance_loss_mlp": 1.00032473, "epoch": 0.5247399747489929, "flos": 21033501625440.0, "grad_norm": 1.5929199031524228, "language_loss": 0.8053298, "learning_rate": 1.9369137627229297e-06, "loss": 0.83072782, "num_input_tokens_seen": 94309685, "step": 4364, "time_per_iteration": 2.8985507488250732 }, { "auxiliary_loss_clip": 0.01332968, "auxiliary_loss_mlp": 0.01194219, "balance_loss_clip": 1.00845003, "balance_loss_mlp": 1.0003376, "epoch": 0.5248602176396321, "flos": 19025950720320.0, "grad_norm": 2.070337705041995, "language_loss": 0.87950784, "learning_rate": 1.936135183502877e-06, "loss": 0.90477967, "num_input_tokens_seen": 94326985, "step": 4365, "time_per_iteration": 2.8186967372894287 }, { "auxiliary_loss_clip": 0.01293998, "auxiliary_loss_mlp": 0.0119433, "balance_loss_clip": 1.00859272, "balance_loss_mlp": 1.00044787, "epoch": 0.5249804605302711, "flos": 22200095602080.0, "grad_norm": 1.9645053531529906, "language_loss": 0.80281365, "learning_rate": 1.935356613971066e-06, "loss": 0.82769692, "num_input_tokens_seen": 94347645, "step": 4366, "time_per_iteration": 2.939656972885132 }, { "auxiliary_loss_clip": 0.01311751, "auxiliary_loss_mlp": 0.00872403, "balance_loss_clip": 1.00740898, "balance_loss_mlp": 1.00021148, "epoch": 0.5251007034209102, "flos": 23805698057280.0, "grad_norm": 1.6334054982525061, "language_loss": 0.76462346, "learning_rate": 1.9345780542456047e-06, "loss": 0.78646499, "num_input_tokens_seen": 94367020, "step": 4367, "time_per_iteration": 2.9625542163848877 }, { "auxiliary_loss_clip": 0.01343765, "auxiliary_loss_mlp": 0.01194223, "balance_loss_clip": 1.0086925, "balance_loss_mlp": 1.00034094, "epoch": 0.5252209463115494, "flos": 23294617171200.0, "grad_norm": 2.446171399392137, "language_loss": 0.71610761, "learning_rate": 1.9337995044446007e-06, "loss": 0.74148744, "num_input_tokens_seen": 94385860, "step": 4368, "time_per_iteration": 2.7881484031677246 }, { "auxiliary_loss_clip": 0.01344508, "auxiliary_loss_mlp": 0.0119406, "balance_loss_clip": 1.00896549, "balance_loss_mlp": 1.00027299, "epoch": 0.5253411892021884, "flos": 19828698062400.0, "grad_norm": 2.134882017795904, "language_loss": 0.79528153, "learning_rate": 1.9330209646861596e-06, "loss": 0.82066721, "num_input_tokens_seen": 94405010, "step": 4369, "time_per_iteration": 2.9599781036376953 }, { "auxiliary_loss_clip": 0.01311834, "auxiliary_loss_mlp": 0.01194078, "balance_loss_clip": 1.00779116, "balance_loss_mlp": 1.00029123, "epoch": 0.5254614320928275, "flos": 24133741992000.0, "grad_norm": 1.552983246914771, "language_loss": 0.77471662, "learning_rate": 1.9322424350883843e-06, "loss": 0.79977572, "num_input_tokens_seen": 94426845, "step": 4370, "time_per_iteration": 3.7464582920074463 }, { "auxiliary_loss_clip": 0.01323885, "auxiliary_loss_mlp": 0.01194117, "balance_loss_clip": 1.00838661, "balance_loss_mlp": 1.00033009, "epoch": 0.5255816749834666, "flos": 24644966572800.0, "grad_norm": 1.5851933322635756, "language_loss": 0.78642511, "learning_rate": 1.931463915769379e-06, "loss": 0.8116051, "num_input_tokens_seen": 94446960, "step": 4371, "time_per_iteration": 2.980898380279541 }, { "auxiliary_loss_clip": 0.01285314, "auxiliary_loss_mlp": 0.01194172, "balance_loss_clip": 1.00782037, "balance_loss_mlp": 1.00029004, "epoch": 0.5257019178741057, "flos": 14136603939360.0, "grad_norm": 2.0375946280224624, "language_loss": 0.73709309, "learning_rate": 1.930685406847242e-06, "loss": 0.76188797, "num_input_tokens_seen": 94461535, "step": 4372, "time_per_iteration": 3.9872188568115234 }, { "auxiliary_loss_clip": 0.01312766, "auxiliary_loss_mlp": 0.0119413, "balance_loss_clip": 1.00743639, "balance_loss_mlp": 1.00034332, "epoch": 0.5258221607647448, "flos": 23548972132800.0, "grad_norm": 1.4301309514632334, "language_loss": 0.81596303, "learning_rate": 1.9299069084400734e-06, "loss": 0.84103191, "num_input_tokens_seen": 94482395, "step": 4373, "time_per_iteration": 2.9923033714294434 }, { "auxiliary_loss_clip": 0.01283859, "auxiliary_loss_mlp": 0.01194138, "balance_loss_clip": 1.00710261, "balance_loss_mlp": 1.00035095, "epoch": 0.5259424036553839, "flos": 24966113160960.0, "grad_norm": 1.8463013963765982, "language_loss": 0.69891179, "learning_rate": 1.9291284206659717e-06, "loss": 0.72369182, "num_input_tokens_seen": 94500580, "step": 4374, "time_per_iteration": 2.9350624084472656 }, { "auxiliary_loss_clip": 0.01357103, "auxiliary_loss_mlp": 0.01194173, "balance_loss_clip": 1.00942874, "balance_loss_mlp": 1.000386, "epoch": 0.526062646546023, "flos": 28763920038240.0, "grad_norm": 1.9182523463119943, "language_loss": 0.71454632, "learning_rate": 1.928349943643032e-06, "loss": 0.74005902, "num_input_tokens_seen": 94519680, "step": 4375, "time_per_iteration": 2.8318562507629395 }, { "auxiliary_loss_clip": 0.01331371, "auxiliary_loss_mlp": 0.01194146, "balance_loss_clip": 1.00899494, "balance_loss_mlp": 1.0003593, "epoch": 0.526182889436662, "flos": 22821386634720.0, "grad_norm": 1.8272712444807644, "language_loss": 0.81728339, "learning_rate": 1.9275714774893493e-06, "loss": 0.84253854, "num_input_tokens_seen": 94539135, "step": 4376, "time_per_iteration": 3.844594717025757 }, { "auxiliary_loss_clip": 0.01306805, "auxiliary_loss_mlp": 0.01194227, "balance_loss_clip": 1.00882685, "balance_loss_mlp": 1.00034463, "epoch": 0.5263031323273012, "flos": 22929477284160.0, "grad_norm": 2.7859191501148723, "language_loss": 0.73106301, "learning_rate": 1.9267930223230154e-06, "loss": 0.75607336, "num_input_tokens_seen": 94557610, "step": 4377, "time_per_iteration": 2.8622519969940186 }, { "auxiliary_loss_clip": 0.01321511, "auxiliary_loss_mlp": 0.01194277, "balance_loss_clip": 1.00884235, "balance_loss_mlp": 1.00029957, "epoch": 0.5264233752179402, "flos": 17748643665600.0, "grad_norm": 2.263492548674601, "language_loss": 0.78191036, "learning_rate": 1.9260145782621224e-06, "loss": 0.80706823, "num_input_tokens_seen": 94575390, "step": 4378, "time_per_iteration": 2.7113871574401855 }, { "auxiliary_loss_clip": 0.01307681, "auxiliary_loss_mlp": 0.01194073, "balance_loss_clip": 1.00780165, "balance_loss_mlp": 1.00028586, "epoch": 0.5265436181085793, "flos": 24421636461600.0, "grad_norm": 1.7126515669401772, "language_loss": 0.87816727, "learning_rate": 1.925236145424758e-06, "loss": 0.90318477, "num_input_tokens_seen": 94594210, "step": 4379, "time_per_iteration": 2.8277547359466553 }, { "auxiliary_loss_clip": 0.01313164, "auxiliary_loss_mlp": 0.01193098, "balance_loss_clip": 1.00413704, "balance_loss_mlp": 1.00007463, "epoch": 0.5266638609992185, "flos": 69207336279360.0, "grad_norm": 0.6970163153630239, "language_loss": 0.57604355, "learning_rate": 1.924457723929012e-06, "loss": 0.60110617, "num_input_tokens_seen": 94665020, "step": 4380, "time_per_iteration": 3.523108959197998 }, { "auxiliary_loss_clip": 0.01335372, "auxiliary_loss_mlp": 0.01194149, "balance_loss_clip": 1.00872087, "balance_loss_mlp": 1.00036216, "epoch": 0.5267841038898575, "flos": 20738709809280.0, "grad_norm": 1.4286795808827246, "language_loss": 0.8271274, "learning_rate": 1.9236793138929685e-06, "loss": 0.8524226, "num_input_tokens_seen": 94684290, "step": 4381, "time_per_iteration": 2.775629758834839 }, { "auxiliary_loss_clip": 0.01345964, "auxiliary_loss_mlp": 0.0119409, "balance_loss_clip": 1.00960851, "balance_loss_mlp": 1.00030303, "epoch": 0.5269043467804966, "flos": 17234401495680.0, "grad_norm": 1.9958307772029196, "language_loss": 0.81216645, "learning_rate": 1.9229009154347133e-06, "loss": 0.83756697, "num_input_tokens_seen": 94701880, "step": 4382, "time_per_iteration": 2.753343105316162 }, { "auxiliary_loss_clip": 0.01289491, "auxiliary_loss_mlp": 0.00872193, "balance_loss_clip": 1.00802565, "balance_loss_mlp": 1.00023818, "epoch": 0.5270245896711357, "flos": 18223167454560.0, "grad_norm": 1.9893569899805124, "language_loss": 0.80750698, "learning_rate": 1.922122528672327e-06, "loss": 0.82912374, "num_input_tokens_seen": 94720545, "step": 4383, "time_per_iteration": 2.8072352409362793 }, { "auxiliary_loss_clip": 0.01356102, "auxiliary_loss_mlp": 0.0119404, "balance_loss_clip": 1.00896299, "balance_loss_mlp": 1.0002532, "epoch": 0.5271448325617748, "flos": 21287569197600.0, "grad_norm": 2.27713106528476, "language_loss": 0.78070748, "learning_rate": 1.9213441537238914e-06, "loss": 0.80620885, "num_input_tokens_seen": 94737420, "step": 4384, "time_per_iteration": 2.7039456367492676 }, { "auxiliary_loss_clip": 0.01247958, "auxiliary_loss_mlp": 0.01193029, "balance_loss_clip": 1.00515556, "balance_loss_mlp": 1.00000513, "epoch": 0.5272650754524139, "flos": 65495526988320.0, "grad_norm": 0.8377995506411964, "language_loss": 0.57429063, "learning_rate": 1.920565790707485e-06, "loss": 0.59870052, "num_input_tokens_seen": 94802810, "step": 4385, "time_per_iteration": 3.540740489959717 }, { "auxiliary_loss_clip": 0.01282227, "auxiliary_loss_mlp": 0.01194302, "balance_loss_clip": 1.00866973, "balance_loss_mlp": 1.00042009, "epoch": 0.527385318343053, "flos": 19676434495680.0, "grad_norm": 2.1208922245793853, "language_loss": 0.65970492, "learning_rate": 1.9197874397411853e-06, "loss": 0.68447018, "num_input_tokens_seen": 94819440, "step": 4386, "time_per_iteration": 3.2707901000976562 }, { "auxiliary_loss_clip": 0.01320223, "auxiliary_loss_mlp": 0.01194154, "balance_loss_clip": 1.00858474, "balance_loss_mlp": 1.00036705, "epoch": 0.5275055612336921, "flos": 12712026709440.0, "grad_norm": 17.344030163139013, "language_loss": 0.6685763, "learning_rate": 1.919009100943067e-06, "loss": 0.6937201, "num_input_tokens_seen": 94835130, "step": 4387, "time_per_iteration": 2.806400775909424 }, { "auxiliary_loss_clip": 0.01287342, "auxiliary_loss_mlp": 0.01194376, "balance_loss_clip": 1.00813651, "balance_loss_mlp": 1.00039899, "epoch": 0.5276258041243311, "flos": 17749038826080.0, "grad_norm": 1.833906417529326, "language_loss": 0.65741438, "learning_rate": 1.9182307744312043e-06, "loss": 0.68223161, "num_input_tokens_seen": 94852235, "step": 4388, "time_per_iteration": 2.8615269660949707 }, { "auxiliary_loss_clip": 0.01325909, "auxiliary_loss_mlp": 0.01194214, "balance_loss_clip": 1.00862622, "balance_loss_mlp": 1.00033188, "epoch": 0.5277460470149702, "flos": 22710458014560.0, "grad_norm": 1.861442380824879, "language_loss": 0.76444358, "learning_rate": 1.9174524603236676e-06, "loss": 0.78964484, "num_input_tokens_seen": 94871185, "step": 4389, "time_per_iteration": 2.8490283489227295 }, { "auxiliary_loss_clip": 0.0130965, "auxiliary_loss_mlp": 0.01194154, "balance_loss_clip": 1.00793314, "balance_loss_mlp": 1.00036728, "epoch": 0.5278662899056094, "flos": 19902710348640.0, "grad_norm": 1.787518744311169, "language_loss": 0.76281154, "learning_rate": 1.916674158738527e-06, "loss": 0.78784955, "num_input_tokens_seen": 94890090, "step": 4390, "time_per_iteration": 2.8519320487976074 }, { "auxiliary_loss_clip": 0.0128354, "auxiliary_loss_mlp": 0.00872438, "balance_loss_clip": 1.00818586, "balance_loss_mlp": 1.0003643, "epoch": 0.5279865327962484, "flos": 18005225895360.0, "grad_norm": 1.680612802599845, "language_loss": 0.60097277, "learning_rate": 1.9158958697938506e-06, "loss": 0.62253249, "num_input_tokens_seen": 94908470, "step": 4391, "time_per_iteration": 2.800798177719116 }, { "auxiliary_loss_clip": 0.0132269, "auxiliary_loss_mlp": 0.01194078, "balance_loss_clip": 1.00877285, "balance_loss_mlp": 1.00029182, "epoch": 0.5281067756868875, "flos": 15924453024960.0, "grad_norm": 2.878998497554582, "language_loss": 0.86093348, "learning_rate": 1.9151175936077032e-06, "loss": 0.88610119, "num_input_tokens_seen": 94923440, "step": 4392, "time_per_iteration": 2.7967817783355713 }, { "auxiliary_loss_clip": 0.01330742, "auxiliary_loss_mlp": 0.01194074, "balance_loss_clip": 1.00854051, "balance_loss_mlp": 1.00028729, "epoch": 0.5282270185775266, "flos": 19426498146720.0, "grad_norm": 1.636114023826098, "language_loss": 0.79352158, "learning_rate": 1.9143393302981507e-06, "loss": 0.81876969, "num_input_tokens_seen": 94941125, "step": 4393, "time_per_iteration": 2.7057244777679443 }, { "auxiliary_loss_clip": 0.01323147, "auxiliary_loss_mlp": 0.01194154, "balance_loss_clip": 1.00890064, "balance_loss_mlp": 1.00036693, "epoch": 0.5283472614681657, "flos": 16399623440160.0, "grad_norm": 1.639022591814587, "language_loss": 0.83339673, "learning_rate": 1.913561079983252e-06, "loss": 0.85856974, "num_input_tokens_seen": 94959950, "step": 4394, "time_per_iteration": 2.7484428882598877 }, { "auxiliary_loss_clip": 0.01319869, "auxiliary_loss_mlp": 0.01194308, "balance_loss_clip": 1.00929713, "balance_loss_mlp": 1.00042641, "epoch": 0.5284675043588047, "flos": 26760536280000.0, "grad_norm": 2.179611404185246, "language_loss": 0.74701846, "learning_rate": 1.9127828427810693e-06, "loss": 0.77216023, "num_input_tokens_seen": 94980515, "step": 4395, "time_per_iteration": 2.7495341300964355 }, { "auxiliary_loss_clip": 0.01307043, "auxiliary_loss_mlp": 0.01194102, "balance_loss_clip": 1.0084722, "balance_loss_mlp": 1.00031519, "epoch": 0.5285877472494439, "flos": 19899908301600.0, "grad_norm": 1.8979046194017577, "language_loss": 0.80767369, "learning_rate": 1.9120046188096607e-06, "loss": 0.83268511, "num_input_tokens_seen": 94998560, "step": 4396, "time_per_iteration": 3.944892406463623 }, { "auxiliary_loss_clip": 0.01311456, "auxiliary_loss_mlp": 0.01194131, "balance_loss_clip": 1.0090555, "balance_loss_mlp": 1.00034451, "epoch": 0.528707990140083, "flos": 20011268005920.0, "grad_norm": 1.7456391975471928, "language_loss": 0.74024498, "learning_rate": 1.9112264081870804e-06, "loss": 0.76530087, "num_input_tokens_seen": 95016950, "step": 4397, "time_per_iteration": 3.6336238384246826 }, { "auxiliary_loss_clip": 0.01284261, "auxiliary_loss_mlp": 0.01194167, "balance_loss_clip": 1.00768363, "balance_loss_mlp": 1.00028539, "epoch": 0.528828233030722, "flos": 20667966577920.0, "grad_norm": 1.8690691912501511, "language_loss": 0.76006842, "learning_rate": 1.9104482110313843e-06, "loss": 0.78485274, "num_input_tokens_seen": 95036540, "step": 4398, "time_per_iteration": 3.756770372390747 }, { "auxiliary_loss_clip": 0.01334058, "auxiliary_loss_mlp": 0.01194134, "balance_loss_clip": 1.00858212, "balance_loss_mlp": 1.00034738, "epoch": 0.5289484759213612, "flos": 25192460861280.0, "grad_norm": 1.7152111640169099, "language_loss": 0.74171275, "learning_rate": 1.909670027460623e-06, "loss": 0.76699466, "num_input_tokens_seen": 95053840, "step": 4399, "time_per_iteration": 2.7354736328125 }, { "auxiliary_loss_clip": 0.01334705, "auxiliary_loss_mlp": 0.01194244, "balance_loss_clip": 1.00891733, "balance_loss_mlp": 1.00036216, "epoch": 0.5290687188120002, "flos": 31139269182720.0, "grad_norm": 1.9787151097045175, "language_loss": 0.71929377, "learning_rate": 1.908891857592847e-06, "loss": 0.74458325, "num_input_tokens_seen": 95074910, "step": 4400, "time_per_iteration": 2.8900563716888428 }, { "auxiliary_loss_clip": 0.01282424, "auxiliary_loss_mlp": 0.01194179, "balance_loss_clip": 1.00726593, "balance_loss_mlp": 1.0003922, "epoch": 0.5291889617026393, "flos": 20119861586880.0, "grad_norm": 2.190954818972243, "language_loss": 0.9008553, "learning_rate": 1.9081137015461034e-06, "loss": 0.92562139, "num_input_tokens_seen": 95090985, "step": 4401, "time_per_iteration": 3.725399971008301 }, { "auxiliary_loss_clip": 0.01273814, "auxiliary_loss_mlp": 0.01194166, "balance_loss_clip": 1.0080061, "balance_loss_mlp": 1.00028419, "epoch": 0.5293092045932785, "flos": 19643757156000.0, "grad_norm": 1.8521510361422622, "language_loss": 0.9041791, "learning_rate": 1.9073355594384383e-06, "loss": 0.92885894, "num_input_tokens_seen": 95109225, "step": 4402, "time_per_iteration": 2.8349061012268066 }, { "auxiliary_loss_clip": 0.0130722, "auxiliary_loss_mlp": 0.01194065, "balance_loss_clip": 1.00922036, "balance_loss_mlp": 1.00027859, "epoch": 0.5294294474839175, "flos": 24317748882720.0, "grad_norm": 1.737427625795011, "language_loss": 0.80449414, "learning_rate": 1.906557431387895e-06, "loss": 0.82950699, "num_input_tokens_seen": 95128215, "step": 4403, "time_per_iteration": 2.840989589691162 }, { "auxiliary_loss_clip": 0.01286824, "auxiliary_loss_mlp": 0.01194267, "balance_loss_clip": 1.0085423, "balance_loss_mlp": 1.00038469, "epoch": 0.5295496903745566, "flos": 18875950345440.0, "grad_norm": 1.9850613834895523, "language_loss": 0.78970802, "learning_rate": 1.905779317512516e-06, "loss": 0.81451893, "num_input_tokens_seen": 95145760, "step": 4404, "time_per_iteration": 2.718012809753418 }, { "auxiliary_loss_clip": 0.01332102, "auxiliary_loss_mlp": 0.01194048, "balance_loss_clip": 1.00815356, "balance_loss_mlp": 1.00026155, "epoch": 0.5296699332651957, "flos": 20923111860480.0, "grad_norm": 1.9512107696023864, "language_loss": 0.80330217, "learning_rate": 1.9050012179303385e-06, "loss": 0.82856369, "num_input_tokens_seen": 95164270, "step": 4405, "time_per_iteration": 2.776043653488159 }, { "auxiliary_loss_clip": 0.01346736, "auxiliary_loss_mlp": 0.01194223, "balance_loss_clip": 1.00975609, "balance_loss_mlp": 1.00034118, "epoch": 0.5297901761558348, "flos": 22046754324960.0, "grad_norm": 2.2688611945264676, "language_loss": 0.68998462, "learning_rate": 1.904223132759401e-06, "loss": 0.7153942, "num_input_tokens_seen": 95182870, "step": 4406, "time_per_iteration": 2.758941411972046 }, { "auxiliary_loss_clip": 0.01333936, "auxiliary_loss_mlp": 0.01194135, "balance_loss_clip": 1.00868177, "balance_loss_mlp": 1.00034881, "epoch": 0.5299104190464738, "flos": 21798506388960.0, "grad_norm": 2.2496525778314074, "language_loss": 0.6912601, "learning_rate": 1.9034450621177383e-06, "loss": 0.71654087, "num_input_tokens_seen": 95201190, "step": 4407, "time_per_iteration": 2.795776128768921 }, { "auxiliary_loss_clip": 0.01331787, "auxiliary_loss_mlp": 0.0119426, "balance_loss_clip": 1.0090934, "balance_loss_mlp": 1.00037825, "epoch": 0.530030661937113, "flos": 14720799019680.0, "grad_norm": 2.2059229842971746, "language_loss": 0.70440727, "learning_rate": 1.9026670061233824e-06, "loss": 0.72966772, "num_input_tokens_seen": 95218625, "step": 4408, "time_per_iteration": 2.7059812545776367 }, { "auxiliary_loss_clip": 0.01309463, "auxiliary_loss_mlp": 0.01194173, "balance_loss_clip": 1.00872183, "balance_loss_mlp": 1.00038648, "epoch": 0.5301509048277521, "flos": 21251515032000.0, "grad_norm": 1.55461580622016, "language_loss": 0.80474192, "learning_rate": 1.901888964894365e-06, "loss": 0.82977831, "num_input_tokens_seen": 95237665, "step": 4409, "time_per_iteration": 2.7197840213775635 }, { "auxiliary_loss_clip": 0.01357588, "auxiliary_loss_mlp": 0.01194245, "balance_loss_clip": 1.00916815, "balance_loss_mlp": 1.00036311, "epoch": 0.5302711477183911, "flos": 25957070464320.0, "grad_norm": 1.8087162859579138, "language_loss": 0.67407846, "learning_rate": 1.9011109385487134e-06, "loss": 0.69959676, "num_input_tokens_seen": 95258915, "step": 4410, "time_per_iteration": 2.7389988899230957 }, { "auxiliary_loss_clip": 0.01357519, "auxiliary_loss_mlp": 0.01194222, "balance_loss_clip": 1.00956237, "balance_loss_mlp": 1.00034046, "epoch": 0.5303913906090303, "flos": 22273137948960.0, "grad_norm": 2.461251291045473, "language_loss": 0.66565025, "learning_rate": 1.900332927204454e-06, "loss": 0.69116765, "num_input_tokens_seen": 95277365, "step": 4411, "time_per_iteration": 2.7588918209075928 }, { "auxiliary_loss_clip": 0.01311074, "auxiliary_loss_mlp": 0.01194182, "balance_loss_clip": 1.00885749, "balance_loss_mlp": 1.00030005, "epoch": 0.5305116334996693, "flos": 24936022326240.0, "grad_norm": 1.6431309261776914, "language_loss": 0.76798034, "learning_rate": 1.8995549309796097e-06, "loss": 0.79303288, "num_input_tokens_seen": 95296670, "step": 4412, "time_per_iteration": 2.8254146575927734 }, { "auxiliary_loss_clip": 0.0133127, "auxiliary_loss_mlp": 0.01194298, "balance_loss_clip": 1.00994813, "balance_loss_mlp": 1.0004158, "epoch": 0.5306318763903084, "flos": 20189347489440.0, "grad_norm": 4.210948631930586, "language_loss": 0.76288033, "learning_rate": 1.8987769499922028e-06, "loss": 0.78813601, "num_input_tokens_seen": 95315640, "step": 4413, "time_per_iteration": 2.709216594696045 }, { "auxiliary_loss_clip": 0.01333664, "auxiliary_loss_mlp": 0.00872482, "balance_loss_clip": 1.00914085, "balance_loss_mlp": 1.00032365, "epoch": 0.5307521192809476, "flos": 20266377364800.0, "grad_norm": 2.484181512926721, "language_loss": 0.71043313, "learning_rate": 1.897998984360252e-06, "loss": 0.73249459, "num_input_tokens_seen": 95334610, "step": 4414, "time_per_iteration": 2.7496962547302246 }, { "auxiliary_loss_clip": 0.01320348, "auxiliary_loss_mlp": 0.01194138, "balance_loss_clip": 1.00841713, "balance_loss_mlp": 1.00035167, "epoch": 0.5308723621715866, "flos": 28844290815840.0, "grad_norm": 1.4497510989055293, "language_loss": 0.78522354, "learning_rate": 1.897221034201775e-06, "loss": 0.81036842, "num_input_tokens_seen": 95358350, "step": 4415, "time_per_iteration": 2.813633441925049 }, { "auxiliary_loss_clip": 0.01293281, "auxiliary_loss_mlp": 0.0119413, "balance_loss_clip": 1.00902581, "balance_loss_mlp": 1.0003432, "epoch": 0.5309926050622257, "flos": 27457779477600.0, "grad_norm": 1.4547256362574974, "language_loss": 0.66754431, "learning_rate": 1.8964430996347842e-06, "loss": 0.6924184, "num_input_tokens_seen": 95379900, "step": 4416, "time_per_iteration": 2.9183144569396973 }, { "auxiliary_loss_clip": 0.01321617, "auxiliary_loss_mlp": 0.01194198, "balance_loss_clip": 1.00938511, "balance_loss_mlp": 1.00031614, "epoch": 0.5311128479528648, "flos": 20514553453440.0, "grad_norm": 2.6473209046341455, "language_loss": 0.82393926, "learning_rate": 1.8956651807772931e-06, "loss": 0.84909737, "num_input_tokens_seen": 95397935, "step": 4417, "time_per_iteration": 2.758981227874756 }, { "auxiliary_loss_clip": 0.01332893, "auxiliary_loss_mlp": 0.01194071, "balance_loss_clip": 1.00887537, "balance_loss_mlp": 1.00028419, "epoch": 0.5312330908435039, "flos": 21397671573120.0, "grad_norm": 1.5710085864389252, "language_loss": 0.83829635, "learning_rate": 1.8948872777473115e-06, "loss": 0.86356604, "num_input_tokens_seen": 95415890, "step": 4418, "time_per_iteration": 2.751518487930298 }, { "auxiliary_loss_clip": 0.01317968, "auxiliary_loss_mlp": 0.01194119, "balance_loss_clip": 1.00860798, "balance_loss_mlp": 1.00033236, "epoch": 0.531353333734143, "flos": 24717362293440.0, "grad_norm": 1.6178006340863158, "language_loss": 0.63446772, "learning_rate": 1.8941093906628458e-06, "loss": 0.65958863, "num_input_tokens_seen": 95433675, "step": 4419, "time_per_iteration": 2.7986040115356445 }, { "auxiliary_loss_clip": 0.01320565, "auxiliary_loss_mlp": 0.01194145, "balance_loss_clip": 1.00932026, "balance_loss_mlp": 1.00035822, "epoch": 0.531473576624782, "flos": 30480702579360.0, "grad_norm": 1.571664951338016, "language_loss": 0.7088666, "learning_rate": 1.893331519641902e-06, "loss": 0.73401368, "num_input_tokens_seen": 95455820, "step": 4420, "time_per_iteration": 2.917480945587158 }, { "auxiliary_loss_clip": 0.0130728, "auxiliary_loss_mlp": 0.01194135, "balance_loss_clip": 1.00890851, "balance_loss_mlp": 1.00034893, "epoch": 0.5315938195154212, "flos": 23002986638880.0, "grad_norm": 2.356832610324127, "language_loss": 0.73771536, "learning_rate": 1.8925536648024815e-06, "loss": 0.76272953, "num_input_tokens_seen": 95473240, "step": 4421, "time_per_iteration": 2.8245723247528076 }, { "auxiliary_loss_clip": 0.0135736, "auxiliary_loss_mlp": 0.01194177, "balance_loss_clip": 1.00915766, "balance_loss_mlp": 1.00039053, "epoch": 0.5317140624060602, "flos": 22748595753600.0, "grad_norm": 1.7249113998514392, "language_loss": 0.75876677, "learning_rate": 1.8917758262625849e-06, "loss": 0.78428209, "num_input_tokens_seen": 95493480, "step": 4422, "time_per_iteration": 3.7365105152130127 }, { "auxiliary_loss_clip": 0.01307995, "auxiliary_loss_mlp": 0.01194152, "balance_loss_clip": 1.00737977, "balance_loss_mlp": 1.00036573, "epoch": 0.5318343052966993, "flos": 22821099245280.0, "grad_norm": 1.584541064355735, "language_loss": 0.80819285, "learning_rate": 1.8909980041402089e-06, "loss": 0.83321428, "num_input_tokens_seen": 95512075, "step": 4423, "time_per_iteration": 3.593585968017578 }, { "auxiliary_loss_clip": 0.01343829, "auxiliary_loss_mlp": 0.01194088, "balance_loss_clip": 1.00915599, "balance_loss_mlp": 1.00030124, "epoch": 0.5319545481873384, "flos": 13626097832160.0, "grad_norm": 2.095782423725448, "language_loss": 0.6549086, "learning_rate": 1.8902201985533494e-06, "loss": 0.68028772, "num_input_tokens_seen": 95529340, "step": 4424, "time_per_iteration": 3.634479522705078 }, { "auxiliary_loss_clip": 0.01312373, "auxiliary_loss_mlp": 0.01194071, "balance_loss_clip": 1.00810158, "balance_loss_mlp": 1.00028419, "epoch": 0.5320747910779775, "flos": 22162532641920.0, "grad_norm": 1.709820595623081, "language_loss": 0.74949145, "learning_rate": 1.8894424096199983e-06, "loss": 0.77455592, "num_input_tokens_seen": 95548545, "step": 4425, "time_per_iteration": 2.711080312728882 }, { "auxiliary_loss_clip": 0.01332127, "auxiliary_loss_mlp": 0.01194079, "balance_loss_clip": 1.00831079, "balance_loss_mlp": 1.00029278, "epoch": 0.5321950339686166, "flos": 18588091799520.0, "grad_norm": 1.8118703474522906, "language_loss": 0.85675156, "learning_rate": 1.8886646374581463e-06, "loss": 0.88201368, "num_input_tokens_seen": 95567770, "step": 4426, "time_per_iteration": 3.7317051887512207 }, { "auxiliary_loss_clip": 0.01343971, "auxiliary_loss_mlp": 0.01194208, "balance_loss_clip": 1.00910282, "balance_loss_mlp": 1.0003258, "epoch": 0.5323152768592557, "flos": 22856830097760.0, "grad_norm": 1.5744340707643305, "language_loss": 0.71252251, "learning_rate": 1.8878868821857795e-06, "loss": 0.73790437, "num_input_tokens_seen": 95587420, "step": 4427, "time_per_iteration": 2.8005876541137695 }, { "auxiliary_loss_clip": 0.01290877, "auxiliary_loss_mlp": 0.01194151, "balance_loss_clip": 1.00897825, "balance_loss_mlp": 1.00026953, "epoch": 0.5324355197498948, "flos": 33948705261600.0, "grad_norm": 1.8105562803854816, "language_loss": 0.75157231, "learning_rate": 1.8871091439208838e-06, "loss": 0.7764225, "num_input_tokens_seen": 95609030, "step": 4428, "time_per_iteration": 2.9305636882781982 }, { "auxiliary_loss_clip": 0.01285666, "auxiliary_loss_mlp": 0.01194248, "balance_loss_clip": 1.00904417, "balance_loss_mlp": 1.00036621, "epoch": 0.5325557626405338, "flos": 23256730897920.0, "grad_norm": 1.9834221004925057, "language_loss": 0.76921916, "learning_rate": 1.8863314227814414e-06, "loss": 0.79401833, "num_input_tokens_seen": 95627340, "step": 4429, "time_per_iteration": 2.862731695175171 }, { "auxiliary_loss_clip": 0.01340777, "auxiliary_loss_mlp": 0.01194349, "balance_loss_clip": 1.00974643, "balance_loss_mlp": 1.00037205, "epoch": 0.532676005531173, "flos": 26718698401920.0, "grad_norm": 2.3091117441460867, "language_loss": 0.49322253, "learning_rate": 1.8855537188854313e-06, "loss": 0.51857382, "num_input_tokens_seen": 95646315, "step": 4430, "time_per_iteration": 2.7608866691589355 }, { "auxiliary_loss_clip": 0.01345118, "auxiliary_loss_mlp": 0.01194228, "balance_loss_clip": 1.00927329, "balance_loss_mlp": 1.00044179, "epoch": 0.5327962484218121, "flos": 17894620588320.0, "grad_norm": 1.9575459597178468, "language_loss": 0.782004, "learning_rate": 1.8847760323508315e-06, "loss": 0.80739748, "num_input_tokens_seen": 95665220, "step": 4431, "time_per_iteration": 2.781360149383545 }, { "auxiliary_loss_clip": 0.01308846, "auxiliary_loss_mlp": 0.01194233, "balance_loss_clip": 1.00797808, "balance_loss_mlp": 1.00035107, "epoch": 0.5329164913124511, "flos": 17925393972960.0, "grad_norm": 1.621243487591933, "language_loss": 0.75762916, "learning_rate": 1.883998363295616e-06, "loss": 0.78265989, "num_input_tokens_seen": 95682700, "step": 4432, "time_per_iteration": 2.7771570682525635 }, { "auxiliary_loss_clip": 0.01290929, "auxiliary_loss_mlp": 0.01193082, "balance_loss_clip": 1.00359678, "balance_loss_mlp": 1.00005865, "epoch": 0.5330367342030903, "flos": 57254207307840.0, "grad_norm": 0.900011554408611, "language_loss": 0.62650323, "learning_rate": 1.8832207118377565e-06, "loss": 0.65134329, "num_input_tokens_seen": 95738070, "step": 4433, "time_per_iteration": 3.26589298248291 }, { "auxiliary_loss_clip": 0.01356048, "auxiliary_loss_mlp": 0.01194116, "balance_loss_clip": 1.00908387, "balance_loss_mlp": 1.00032926, "epoch": 0.5331569770937293, "flos": 17420527883520.0, "grad_norm": 1.7947416763559747, "language_loss": 0.69622469, "learning_rate": 1.882443078095222e-06, "loss": 0.7217263, "num_input_tokens_seen": 95756950, "step": 4434, "time_per_iteration": 2.6813085079193115 }, { "auxiliary_loss_clip": 0.01262237, "auxiliary_loss_mlp": 0.01193073, "balance_loss_clip": 1.00371099, "balance_loss_mlp": 1.00004888, "epoch": 0.5332772199843684, "flos": 56750814089280.0, "grad_norm": 0.8617169861436729, "language_loss": 0.66836184, "learning_rate": 1.8816654621859794e-06, "loss": 0.6929149, "num_input_tokens_seen": 95816615, "step": 4435, "time_per_iteration": 3.2283239364624023 }, { "auxiliary_loss_clip": 0.01355504, "auxiliary_loss_mlp": 0.01194134, "balance_loss_clip": 1.00948811, "balance_loss_mlp": 1.00025177, "epoch": 0.5333974628750076, "flos": 18697763090880.0, "grad_norm": 2.2280291536943673, "language_loss": 0.72596824, "learning_rate": 1.8808878642279915e-06, "loss": 0.75146466, "num_input_tokens_seen": 95832020, "step": 4436, "time_per_iteration": 2.898549795150757 }, { "auxiliary_loss_clip": 0.01320603, "auxiliary_loss_mlp": 0.01194229, "balance_loss_clip": 1.00925767, "balance_loss_mlp": 1.00034738, "epoch": 0.5335177057656466, "flos": 23805518438880.0, "grad_norm": 2.151064658467998, "language_loss": 0.65020764, "learning_rate": 1.8801102843392209e-06, "loss": 0.67535603, "num_input_tokens_seen": 95851425, "step": 4437, "time_per_iteration": 2.8772644996643066 }, { "auxiliary_loss_clip": 0.01300234, "auxiliary_loss_mlp": 0.01194069, "balance_loss_clip": 1.00831389, "balance_loss_mlp": 1.00028253, "epoch": 0.5336379486562857, "flos": 25078694270400.0, "grad_norm": 1.5163014650677231, "language_loss": 0.85228938, "learning_rate": 1.8793327226376238e-06, "loss": 0.87723243, "num_input_tokens_seen": 95870745, "step": 4438, "time_per_iteration": 2.8196651935577393 }, { "auxiliary_loss_clip": 0.01318325, "auxiliary_loss_mlp": 0.01194042, "balance_loss_clip": 1.00960851, "balance_loss_mlp": 1.00025558, "epoch": 0.5337581915469248, "flos": 21396701633760.0, "grad_norm": 1.5770542568782708, "language_loss": 0.80298245, "learning_rate": 1.8785551792411569e-06, "loss": 0.82810616, "num_input_tokens_seen": 95889755, "step": 4439, "time_per_iteration": 2.871671199798584 }, { "auxiliary_loss_clip": 0.01315239, "auxiliary_loss_mlp": 0.01194242, "balance_loss_clip": 1.00771213, "balance_loss_mlp": 1.00035965, "epoch": 0.5338784344375639, "flos": 14865913774080.0, "grad_norm": 2.0533286511297164, "language_loss": 0.826168, "learning_rate": 1.8777776542677733e-06, "loss": 0.85126281, "num_input_tokens_seen": 95907805, "step": 4440, "time_per_iteration": 2.696627616882324 }, { "auxiliary_loss_clip": 0.01308564, "auxiliary_loss_mlp": 0.01194139, "balance_loss_clip": 1.0084666, "balance_loss_mlp": 1.00035214, "epoch": 0.5339986773282029, "flos": 20813512416480.0, "grad_norm": 1.927646369367946, "language_loss": 0.73309249, "learning_rate": 1.8770001478354216e-06, "loss": 0.75811952, "num_input_tokens_seen": 95927480, "step": 4441, "time_per_iteration": 2.8028109073638916 }, { "auxiliary_loss_clip": 0.0134418, "auxiliary_loss_mlp": 0.01194244, "balance_loss_clip": 1.00936627, "balance_loss_mlp": 1.0003624, "epoch": 0.5341189202188421, "flos": 17969099882400.0, "grad_norm": 2.1646325768851136, "language_loss": 0.83812189, "learning_rate": 1.8762226600620504e-06, "loss": 0.8635062, "num_input_tokens_seen": 95946095, "step": 4442, "time_per_iteration": 2.7423620223999023 }, { "auxiliary_loss_clip": 0.01333573, "auxiliary_loss_mlp": 0.01194225, "balance_loss_clip": 1.00961196, "balance_loss_mlp": 1.00034261, "epoch": 0.5342391631094812, "flos": 11031873112800.0, "grad_norm": 2.189990274000482, "language_loss": 0.58702767, "learning_rate": 1.8754451910656031e-06, "loss": 0.61230564, "num_input_tokens_seen": 95959995, "step": 4443, "time_per_iteration": 2.8021740913391113 }, { "auxiliary_loss_clip": 0.01286677, "auxiliary_loss_mlp": 0.01194229, "balance_loss_clip": 1.00801277, "balance_loss_mlp": 1.00034714, "epoch": 0.5343594060001202, "flos": 15339144310560.0, "grad_norm": 1.9802415157041542, "language_loss": 0.82854396, "learning_rate": 1.8746677409640212e-06, "loss": 0.85335302, "num_input_tokens_seen": 95977095, "step": 4444, "time_per_iteration": 2.7903926372528076 }, { "auxiliary_loss_clip": 0.01336566, "auxiliary_loss_mlp": 0.0119423, "balance_loss_clip": 1.00886166, "balance_loss_mlp": 1.00034845, "epoch": 0.5344796488907594, "flos": 26900909108640.0, "grad_norm": 1.577086193976601, "language_loss": 0.8465817, "learning_rate": 1.8738903098752432e-06, "loss": 0.87188965, "num_input_tokens_seen": 95996225, "step": 4445, "time_per_iteration": 2.8145482540130615 }, { "auxiliary_loss_clip": 0.01327099, "auxiliary_loss_mlp": 0.01194253, "balance_loss_clip": 1.00949204, "balance_loss_mlp": 1.00037074, "epoch": 0.5345998917813984, "flos": 25411228665120.0, "grad_norm": 2.203718012544729, "language_loss": 0.7308926, "learning_rate": 1.8731128979172052e-06, "loss": 0.75610614, "num_input_tokens_seen": 96015425, "step": 4446, "time_per_iteration": 2.823920488357544 }, { "auxiliary_loss_clip": 0.01310508, "auxiliary_loss_mlp": 0.01194108, "balance_loss_clip": 1.00792766, "balance_loss_mlp": 1.00032091, "epoch": 0.5347201346720375, "flos": 32853393371520.0, "grad_norm": 2.1688780907521754, "language_loss": 0.67192578, "learning_rate": 1.8723355052078394e-06, "loss": 0.69697201, "num_input_tokens_seen": 96035460, "step": 4447, "time_per_iteration": 2.839693546295166 }, { "auxiliary_loss_clip": 0.01345127, "auxiliary_loss_mlp": 0.01194304, "balance_loss_clip": 1.00934744, "balance_loss_mlp": 1.0003264, "epoch": 0.5348403775626767, "flos": 17967950324640.0, "grad_norm": 2.301300811967658, "language_loss": 0.77148098, "learning_rate": 1.8715581318650765e-06, "loss": 0.7968753, "num_input_tokens_seen": 96054515, "step": 4448, "time_per_iteration": 3.769974946975708 }, { "auxiliary_loss_clip": 0.01311599, "auxiliary_loss_mlp": 0.01194199, "balance_loss_clip": 1.00945854, "balance_loss_mlp": 1.0003171, "epoch": 0.5349606204533157, "flos": 17603349292800.0, "grad_norm": 2.064445029131435, "language_loss": 0.81213129, "learning_rate": 1.8707807780068422e-06, "loss": 0.83718926, "num_input_tokens_seen": 96072330, "step": 4449, "time_per_iteration": 3.684234619140625 }, { "auxiliary_loss_clip": 0.01313029, "auxiliary_loss_mlp": 0.01194081, "balance_loss_clip": 1.00795174, "balance_loss_mlp": 1.00029492, "epoch": 0.5350808633439548, "flos": 29167844290560.0, "grad_norm": 1.8444405969563333, "language_loss": 0.66251111, "learning_rate": 1.8700034437510611e-06, "loss": 0.68758225, "num_input_tokens_seen": 96092425, "step": 4450, "time_per_iteration": 3.7500452995300293 }, { "auxiliary_loss_clip": 0.01306275, "auxiliary_loss_mlp": 0.01194224, "balance_loss_clip": 1.00935459, "balance_loss_mlp": 1.00034165, "epoch": 0.5352011062345938, "flos": 19500007501440.0, "grad_norm": 2.531029167777275, "language_loss": 0.81480795, "learning_rate": 1.8692261292156549e-06, "loss": 0.83981299, "num_input_tokens_seen": 96111660, "step": 4451, "time_per_iteration": 2.799025058746338 }, { "auxiliary_loss_clip": 0.01355419, "auxiliary_loss_mlp": 0.01194056, "balance_loss_clip": 1.00944352, "balance_loss_mlp": 1.00026917, "epoch": 0.535321349125233, "flos": 23477654122560.0, "grad_norm": 1.7903592525846084, "language_loss": 0.81413406, "learning_rate": 1.8684488345185401e-06, "loss": 0.83962882, "num_input_tokens_seen": 96131835, "step": 4452, "time_per_iteration": 3.7620797157287598 }, { "auxiliary_loss_clip": 0.01356739, "auxiliary_loss_mlp": 0.01194257, "balance_loss_clip": 1.00977492, "balance_loss_mlp": 1.00037479, "epoch": 0.535441592015872, "flos": 20478068203680.0, "grad_norm": 2.1614816233094376, "language_loss": 0.78842384, "learning_rate": 1.8676715597776332e-06, "loss": 0.81393379, "num_input_tokens_seen": 96150180, "step": 4453, "time_per_iteration": 2.6891140937805176 }, { "auxiliary_loss_clip": 0.01286806, "auxiliary_loss_mlp": 0.01194142, "balance_loss_clip": 1.00819039, "balance_loss_mlp": 1.00035501, "epoch": 0.5355618349065111, "flos": 19573157619360.0, "grad_norm": 1.8094041543953068, "language_loss": 0.76411343, "learning_rate": 1.8668943051108455e-06, "loss": 0.78892297, "num_input_tokens_seen": 96167485, "step": 4454, "time_per_iteration": 2.830768585205078 }, { "auxiliary_loss_clip": 0.01319951, "auxiliary_loss_mlp": 0.01194159, "balance_loss_clip": 1.00827622, "balance_loss_mlp": 1.00037265, "epoch": 0.5356820777971503, "flos": 24024645479520.0, "grad_norm": 1.7606430703586335, "language_loss": 0.7625339, "learning_rate": 1.8661170706360856e-06, "loss": 0.78767502, "num_input_tokens_seen": 96186650, "step": 4455, "time_per_iteration": 2.7712278366088867 }, { "auxiliary_loss_clip": 0.01333137, "auxiliary_loss_mlp": 0.01194103, "balance_loss_clip": 1.00820708, "balance_loss_mlp": 1.0003159, "epoch": 0.5358023206877893, "flos": 20884686732000.0, "grad_norm": 1.533366378138068, "language_loss": 0.81658196, "learning_rate": 1.8653398564712594e-06, "loss": 0.84185433, "num_input_tokens_seen": 96205595, "step": 4456, "time_per_iteration": 2.7973599433898926 }, { "auxiliary_loss_clip": 0.01331169, "auxiliary_loss_mlp": 0.01194085, "balance_loss_clip": 1.0083313, "balance_loss_mlp": 1.00029802, "epoch": 0.5359225635784284, "flos": 22419007100640.0, "grad_norm": 1.508001766530694, "language_loss": 0.82211387, "learning_rate": 1.8645626627342704e-06, "loss": 0.84736633, "num_input_tokens_seen": 96226360, "step": 4457, "time_per_iteration": 2.7259511947631836 }, { "auxiliary_loss_clip": 0.01343643, "auxiliary_loss_mlp": 0.01194218, "balance_loss_clip": 1.00914848, "balance_loss_mlp": 1.0004313, "epoch": 0.5360428064690675, "flos": 24097795597440.0, "grad_norm": 1.9823132255467169, "language_loss": 0.80908561, "learning_rate": 1.8637854895430172e-06, "loss": 0.83446419, "num_input_tokens_seen": 96245625, "step": 4458, "time_per_iteration": 2.905561685562134 }, { "auxiliary_loss_clip": 0.01308662, "auxiliary_loss_mlp": 0.01194294, "balance_loss_clip": 1.00927734, "balance_loss_mlp": 1.00041199, "epoch": 0.5361630493597066, "flos": 21434516059680.0, "grad_norm": 1.9956754499813154, "language_loss": 0.69161576, "learning_rate": 1.8630083370153978e-06, "loss": 0.71664536, "num_input_tokens_seen": 96265265, "step": 4459, "time_per_iteration": 2.882565975189209 }, { "auxiliary_loss_clip": 0.01248931, "auxiliary_loss_mlp": 0.01193071, "balance_loss_clip": 1.00374377, "balance_loss_mlp": 1.00004697, "epoch": 0.5362832922503457, "flos": 68888727063360.0, "grad_norm": 0.7449477414532264, "language_loss": 0.55434978, "learning_rate": 1.8622312052693041e-06, "loss": 0.5787698, "num_input_tokens_seen": 96326445, "step": 4460, "time_per_iteration": 3.646787405014038 }, { "auxiliary_loss_clip": 0.01343235, "auxiliary_loss_mlp": 0.01194075, "balance_loss_clip": 1.00871611, "balance_loss_mlp": 1.00028825, "epoch": 0.5364035351409848, "flos": 9793709660160.0, "grad_norm": 2.1778174916307997, "language_loss": 0.71845573, "learning_rate": 1.8614540944226267e-06, "loss": 0.74382877, "num_input_tokens_seen": 96343115, "step": 4461, "time_per_iteration": 3.6003997325897217 }, { "auxiliary_loss_clip": 0.01310346, "auxiliary_loss_mlp": 0.01194026, "balance_loss_clip": 1.0083611, "balance_loss_mlp": 1.00023985, "epoch": 0.5365237780316239, "flos": 23290090787520.0, "grad_norm": 1.7994961330278552, "language_loss": 0.67662686, "learning_rate": 1.8606770045932537e-06, "loss": 0.70167059, "num_input_tokens_seen": 96362230, "step": 4462, "time_per_iteration": 2.8289084434509277 }, { "auxiliary_loss_clip": 0.01322176, "auxiliary_loss_mlp": 0.01194076, "balance_loss_clip": 1.00986385, "balance_loss_mlp": 1.00028968, "epoch": 0.5366440209222629, "flos": 26578145954880.0, "grad_norm": 2.012703304551425, "language_loss": 0.81632507, "learning_rate": 1.859899935899068e-06, "loss": 0.84148753, "num_input_tokens_seen": 96382085, "step": 4463, "time_per_iteration": 2.8835697174072266 }, { "auxiliary_loss_clip": 0.01310185, "auxiliary_loss_mlp": 0.01194074, "balance_loss_clip": 1.0086844, "balance_loss_mlp": 1.00028694, "epoch": 0.5367642638129021, "flos": 19608062227200.0, "grad_norm": 1.6075321455720981, "language_loss": 0.79112577, "learning_rate": 1.8591228884579506e-06, "loss": 0.81616831, "num_input_tokens_seen": 96400580, "step": 4464, "time_per_iteration": 2.787776231765747 }, { "auxiliary_loss_clip": 0.01302846, "auxiliary_loss_mlp": 0.01194075, "balance_loss_clip": 1.0078088, "balance_loss_mlp": 1.00028849, "epoch": 0.5368845067035412, "flos": 23915225653920.0, "grad_norm": 1.863246276158427, "language_loss": 0.82114464, "learning_rate": 1.8583458623877795e-06, "loss": 0.84611386, "num_input_tokens_seen": 96419680, "step": 4465, "time_per_iteration": 2.8681087493896484 }, { "auxiliary_loss_clip": 0.01338168, "auxiliary_loss_mlp": 0.01194082, "balance_loss_clip": 1.00893748, "balance_loss_mlp": 1.00029564, "epoch": 0.5370047495941802, "flos": 16873141366080.0, "grad_norm": 1.6703040382604168, "language_loss": 0.74411052, "learning_rate": 1.8575688578064281e-06, "loss": 0.76943302, "num_input_tokens_seen": 96437805, "step": 4466, "time_per_iteration": 2.871248722076416 }, { "auxiliary_loss_clip": 0.01336149, "auxiliary_loss_mlp": 0.01194187, "balance_loss_clip": 1.00869632, "balance_loss_mlp": 1.00040078, "epoch": 0.5371249924848194, "flos": 20740937077440.0, "grad_norm": 1.5483255132616716, "language_loss": 0.76719862, "learning_rate": 1.8567918748317674e-06, "loss": 0.79250193, "num_input_tokens_seen": 96457155, "step": 4467, "time_per_iteration": 2.7634332180023193 }, { "auxiliary_loss_clip": 0.01318979, "auxiliary_loss_mlp": 0.01194134, "balance_loss_clip": 1.00940514, "balance_loss_mlp": 1.0003469, "epoch": 0.5372452353754584, "flos": 17968129943040.0, "grad_norm": 1.795091833706356, "language_loss": 0.82706207, "learning_rate": 1.8560149135816659e-06, "loss": 0.85219324, "num_input_tokens_seen": 96473990, "step": 4468, "time_per_iteration": 2.8039662837982178 }, { "auxiliary_loss_clip": 0.01343933, "auxiliary_loss_mlp": 0.01194084, "balance_loss_clip": 1.00894225, "balance_loss_mlp": 1.00029695, "epoch": 0.5373654782660975, "flos": 15377030583840.0, "grad_norm": 2.094710852679654, "language_loss": 0.84349167, "learning_rate": 1.8552379741739873e-06, "loss": 0.86887181, "num_input_tokens_seen": 96491335, "step": 4469, "time_per_iteration": 2.779738187789917 }, { "auxiliary_loss_clip": 0.012738, "auxiliary_loss_mlp": 0.00871624, "balance_loss_clip": 1.00303268, "balance_loss_mlp": 1.00002992, "epoch": 0.5374857211567367, "flos": 69000122691360.0, "grad_norm": 0.8929854862522738, "language_loss": 0.55652761, "learning_rate": 1.8544610567265935e-06, "loss": 0.57798183, "num_input_tokens_seen": 96545275, "step": 4470, "time_per_iteration": 3.351020336151123 }, { "auxiliary_loss_clip": 0.01311177, "auxiliary_loss_mlp": 0.00872386, "balance_loss_clip": 1.00815821, "balance_loss_mlp": 1.00030637, "epoch": 0.5376059640473757, "flos": 15085364127840.0, "grad_norm": 1.7139996665217647, "language_loss": 0.83172619, "learning_rate": 1.853684161357341e-06, "loss": 0.85356176, "num_input_tokens_seen": 96562935, "step": 4471, "time_per_iteration": 2.8741416931152344 }, { "auxiliary_loss_clip": 0.01331661, "auxiliary_loss_mlp": 0.00872547, "balance_loss_clip": 1.00841784, "balance_loss_mlp": 1.00033224, "epoch": 0.5377262069380148, "flos": 19792607973120.0, "grad_norm": 2.025621154822686, "language_loss": 0.7692858, "learning_rate": 1.852907288184085e-06, "loss": 0.79132783, "num_input_tokens_seen": 96581820, "step": 4472, "time_per_iteration": 2.723551034927368 }, { "auxiliary_loss_clip": 0.01280738, "auxiliary_loss_mlp": 0.01194073, "balance_loss_clip": 1.00812876, "balance_loss_mlp": 1.0002867, "epoch": 0.5378464498286539, "flos": 30003089353920.0, "grad_norm": 1.803433310916411, "language_loss": 0.69956738, "learning_rate": 1.8521304373246762e-06, "loss": 0.72431552, "num_input_tokens_seen": 96602865, "step": 4473, "time_per_iteration": 2.912738084793091 }, { "auxiliary_loss_clip": 0.01344873, "auxiliary_loss_mlp": 0.01194173, "balance_loss_clip": 1.0094173, "balance_loss_mlp": 1.00029111, "epoch": 0.537966692719293, "flos": 21251227642560.0, "grad_norm": 2.5207591384022217, "language_loss": 0.88766211, "learning_rate": 1.8513536088969626e-06, "loss": 0.91305256, "num_input_tokens_seen": 96620530, "step": 4474, "time_per_iteration": 4.567909240722656 }, { "auxiliary_loss_clip": 0.0133297, "auxiliary_loss_mlp": 0.01194179, "balance_loss_clip": 1.0089035, "balance_loss_mlp": 1.00029719, "epoch": 0.538086935609932, "flos": 21543181488000.0, "grad_norm": 2.098934000578767, "language_loss": 0.80207992, "learning_rate": 1.8505768030187884e-06, "loss": 0.82735145, "num_input_tokens_seen": 96640660, "step": 4475, "time_per_iteration": 2.700092077255249 }, { "auxiliary_loss_clip": 0.01307631, "auxiliary_loss_mlp": 0.01194087, "balance_loss_clip": 1.0081749, "balance_loss_mlp": 1.00030065, "epoch": 0.5382071785005712, "flos": 22747230653760.0, "grad_norm": 1.436211762288441, "language_loss": 0.79801792, "learning_rate": 1.849800019807995e-06, "loss": 0.82303512, "num_input_tokens_seen": 96661885, "step": 4476, "time_per_iteration": 3.7837958335876465 }, { "auxiliary_loss_clip": 0.01288716, "auxiliary_loss_mlp": 0.01194181, "balance_loss_clip": 1.00761235, "balance_loss_mlp": 1.0002991, "epoch": 0.5383274213912103, "flos": 24934585379040.0, "grad_norm": 1.8483671314504788, "language_loss": 0.71032703, "learning_rate": 1.8490232593824186e-06, "loss": 0.735156, "num_input_tokens_seen": 96678340, "step": 4477, "time_per_iteration": 2.852809190750122 }, { "auxiliary_loss_clip": 0.01311373, "auxiliary_loss_mlp": 0.01194069, "balance_loss_clip": 1.0077337, "balance_loss_mlp": 1.00028205, "epoch": 0.5384476642818493, "flos": 22310197977600.0, "grad_norm": 1.5630143302869104, "language_loss": 0.8487432, "learning_rate": 1.8482465218598935e-06, "loss": 0.87379766, "num_input_tokens_seen": 96698285, "step": 4478, "time_per_iteration": 3.718903064727783 }, { "auxiliary_loss_clip": 0.01310031, "auxiliary_loss_mlp": 0.01194129, "balance_loss_clip": 1.00888073, "balance_loss_mlp": 1.00034285, "epoch": 0.5385679071724885, "flos": 22711032793440.0, "grad_norm": 1.694698097353707, "language_loss": 0.83558333, "learning_rate": 1.8474698073582508e-06, "loss": 0.86062491, "num_input_tokens_seen": 96719655, "step": 4479, "time_per_iteration": 2.8119821548461914 }, { "auxiliary_loss_clip": 0.01309296, "auxiliary_loss_mlp": 0.01194047, "balance_loss_clip": 1.00872803, "balance_loss_mlp": 1.00026071, "epoch": 0.5386881500631275, "flos": 15953753538720.0, "grad_norm": 2.0243828534518244, "language_loss": 0.87877822, "learning_rate": 1.8466931159953166e-06, "loss": 0.90381169, "num_input_tokens_seen": 96736290, "step": 4480, "time_per_iteration": 2.788553476333618 }, { "auxiliary_loss_clip": 0.01315049, "auxiliary_loss_mlp": 0.01194308, "balance_loss_clip": 1.0085361, "balance_loss_mlp": 1.00042617, "epoch": 0.5388083929537666, "flos": 24060053018880.0, "grad_norm": 2.260948342099232, "language_loss": 0.84180999, "learning_rate": 1.8459164478889158e-06, "loss": 0.86690354, "num_input_tokens_seen": 96757685, "step": 4481, "time_per_iteration": 2.770106792449951 }, { "auxiliary_loss_clip": 0.01306462, "auxiliary_loss_mlp": 0.01194069, "balance_loss_clip": 1.00913692, "balance_loss_mlp": 1.00028229, "epoch": 0.5389286358444056, "flos": 22236904164960.0, "grad_norm": 1.584859332877939, "language_loss": 0.76039112, "learning_rate": 1.8451398031568663e-06, "loss": 0.7853964, "num_input_tokens_seen": 96777310, "step": 4482, "time_per_iteration": 2.8296844959259033 }, { "auxiliary_loss_clip": 0.01299299, "auxiliary_loss_mlp": 0.01194104, "balance_loss_clip": 1.00861359, "balance_loss_mlp": 1.00031745, "epoch": 0.5390488787350448, "flos": 24281730640800.0, "grad_norm": 1.4847064962749998, "language_loss": 0.74687541, "learning_rate": 1.844363181916986e-06, "loss": 0.77180934, "num_input_tokens_seen": 96798035, "step": 4483, "time_per_iteration": 2.8664190769195557 }, { "auxiliary_loss_clip": 0.01345055, "auxiliary_loss_mlp": 0.0119435, "balance_loss_clip": 1.00901365, "balance_loss_mlp": 1.00046778, "epoch": 0.5391691216256839, "flos": 16581403062720.0, "grad_norm": 4.304427376930824, "language_loss": 0.82985842, "learning_rate": 1.8435865842870868e-06, "loss": 0.8552525, "num_input_tokens_seen": 96815975, "step": 4484, "time_per_iteration": 2.723557949066162 }, { "auxiliary_loss_clip": 0.01331712, "auxiliary_loss_mlp": 0.00872597, "balance_loss_clip": 1.0087831, "balance_loss_mlp": 1.00044286, "epoch": 0.5392893645163229, "flos": 23330060634240.0, "grad_norm": 1.7825031186259663, "language_loss": 0.71988249, "learning_rate": 1.8428100103849787e-06, "loss": 0.7419256, "num_input_tokens_seen": 96835770, "step": 4485, "time_per_iteration": 2.8180294036865234 }, { "auxiliary_loss_clip": 0.01310745, "auxiliary_loss_mlp": 0.0119423, "balance_loss_clip": 1.00855255, "balance_loss_mlp": 1.00034821, "epoch": 0.5394096074069621, "flos": 15669810673920.0, "grad_norm": 2.1358982391963335, "language_loss": 0.73161983, "learning_rate": 1.842033460328467e-06, "loss": 0.75666952, "num_input_tokens_seen": 96854490, "step": 4486, "time_per_iteration": 2.798478841781616 }, { "auxiliary_loss_clip": 0.01330202, "auxiliary_loss_mlp": 0.00872537, "balance_loss_clip": 1.00929701, "balance_loss_mlp": 1.00040102, "epoch": 0.5395298502976011, "flos": 22893459042240.0, "grad_norm": 1.6310632627430732, "language_loss": 0.7495299, "learning_rate": 1.8412569342353541e-06, "loss": 0.77155733, "num_input_tokens_seen": 96874645, "step": 4487, "time_per_iteration": 2.7807555198669434 }, { "auxiliary_loss_clip": 0.01313569, "auxiliary_loss_mlp": 0.01194164, "balance_loss_clip": 1.00842011, "balance_loss_mlp": 1.00037742, "epoch": 0.5396500931882402, "flos": 23842147383360.0, "grad_norm": 2.5524959968764933, "language_loss": 0.84607863, "learning_rate": 1.840480432223438e-06, "loss": 0.87115598, "num_input_tokens_seen": 96893650, "step": 4488, "time_per_iteration": 2.803157329559326 }, { "auxiliary_loss_clip": 0.0132424, "auxiliary_loss_mlp": 0.01194296, "balance_loss_clip": 1.00882316, "balance_loss_mlp": 1.00050902, "epoch": 0.5397703360788794, "flos": 26323000672320.0, "grad_norm": 1.7808189822256648, "language_loss": 0.77659887, "learning_rate": 1.8397039544105131e-06, "loss": 0.80178422, "num_input_tokens_seen": 96912735, "step": 4489, "time_per_iteration": 2.8201699256896973 }, { "auxiliary_loss_clip": 0.0133301, "auxiliary_loss_mlp": 0.0119407, "balance_loss_clip": 1.00909317, "balance_loss_mlp": 1.00028324, "epoch": 0.5398905789695184, "flos": 21214598698080.0, "grad_norm": 1.7778048888286946, "language_loss": 0.69599009, "learning_rate": 1.8389275009143711e-06, "loss": 0.72126091, "num_input_tokens_seen": 96932475, "step": 4490, "time_per_iteration": 2.743199348449707 }, { "auxiliary_loss_clip": 0.01355629, "auxiliary_loss_mlp": 0.01194197, "balance_loss_clip": 1.00885916, "balance_loss_mlp": 1.00041044, "epoch": 0.5400108218601575, "flos": 25080346759680.0, "grad_norm": 1.7108488391643626, "language_loss": 0.73363143, "learning_rate": 1.8381510718527988e-06, "loss": 0.75912964, "num_input_tokens_seen": 96952085, "step": 4491, "time_per_iteration": 2.755974769592285 }, { "auxiliary_loss_clip": 0.01331913, "auxiliary_loss_mlp": 0.01194149, "balance_loss_clip": 1.00920904, "balance_loss_mlp": 1.00026727, "epoch": 0.5401310647507966, "flos": 26357510119680.0, "grad_norm": 1.730987265244456, "language_loss": 0.63811553, "learning_rate": 1.8373746673435812e-06, "loss": 0.66337615, "num_input_tokens_seen": 96973110, "step": 4492, "time_per_iteration": 2.767549991607666 }, { "auxiliary_loss_clip": 0.01355157, "auxiliary_loss_mlp": 0.01194392, "balance_loss_clip": 1.00881982, "balance_loss_mlp": 1.00041509, "epoch": 0.5402513076414357, "flos": 27855345238560.0, "grad_norm": 1.8059387208291537, "language_loss": 0.79028791, "learning_rate": 1.8365982875044964e-06, "loss": 0.81578338, "num_input_tokens_seen": 96993420, "step": 4493, "time_per_iteration": 2.7183127403259277 }, { "auxiliary_loss_clip": 0.01342851, "auxiliary_loss_mlp": 0.00872796, "balance_loss_clip": 1.00912094, "balance_loss_mlp": 1.00057602, "epoch": 0.5403715505320748, "flos": 22893782355360.0, "grad_norm": 2.66822342322777, "language_loss": 0.75689322, "learning_rate": 1.8358219324533217e-06, "loss": 0.77904963, "num_input_tokens_seen": 97013685, "step": 4494, "time_per_iteration": 2.7496612071990967 }, { "auxiliary_loss_clip": 0.01321426, "auxiliary_loss_mlp": 0.01194023, "balance_loss_clip": 1.00858283, "balance_loss_mlp": 1.00023675, "epoch": 0.5404917934227139, "flos": 30224156273280.0, "grad_norm": 1.5101171834010474, "language_loss": 0.70069826, "learning_rate": 1.8350456023078292e-06, "loss": 0.72585273, "num_input_tokens_seen": 97036060, "step": 4495, "time_per_iteration": 2.85491681098938 }, { "auxiliary_loss_clip": 0.01358104, "auxiliary_loss_mlp": 0.01194009, "balance_loss_clip": 1.00927341, "balance_loss_mlp": 1.00022233, "epoch": 0.540612036313353, "flos": 19938513048480.0, "grad_norm": 2.0684218037491537, "language_loss": 0.78095001, "learning_rate": 1.8342692971857874e-06, "loss": 0.80647117, "num_input_tokens_seen": 97055260, "step": 4496, "time_per_iteration": 2.718566417694092 }, { "auxiliary_loss_clip": 0.01310705, "auxiliary_loss_mlp": 0.01194041, "balance_loss_clip": 1.00780082, "balance_loss_mlp": 1.00025392, "epoch": 0.540732279203992, "flos": 24279611143680.0, "grad_norm": 2.198129935407931, "language_loss": 0.71530008, "learning_rate": 1.833493017204962e-06, "loss": 0.74034756, "num_input_tokens_seen": 97075365, "step": 4497, "time_per_iteration": 2.8230338096618652 }, { "auxiliary_loss_clip": 0.0135666, "auxiliary_loss_mlp": 0.01194029, "balance_loss_clip": 1.00938082, "balance_loss_mlp": 1.00024247, "epoch": 0.5408525220946312, "flos": 20193227246880.0, "grad_norm": 1.6802276369581877, "language_loss": 0.77690661, "learning_rate": 1.8327167624831134e-06, "loss": 0.80241346, "num_input_tokens_seen": 97093095, "step": 4498, "time_per_iteration": 2.7211391925811768 }, { "auxiliary_loss_clip": 0.01356271, "auxiliary_loss_mlp": 0.01194144, "balance_loss_clip": 1.00947428, "balance_loss_mlp": 1.00026202, "epoch": 0.5409727649852702, "flos": 24134460465600.0, "grad_norm": 1.7152617197951079, "language_loss": 0.70688725, "learning_rate": 1.831940533137999e-06, "loss": 0.73239148, "num_input_tokens_seen": 97112000, "step": 4499, "time_per_iteration": 2.847348928451538 }, { "auxiliary_loss_clip": 0.0133217, "auxiliary_loss_mlp": 0.01194035, "balance_loss_clip": 1.00914788, "balance_loss_mlp": 1.00024796, "epoch": 0.5410930078759093, "flos": 23912710996320.0, "grad_norm": 1.7487840127625185, "language_loss": 0.72685039, "learning_rate": 1.8311643292873718e-06, "loss": 0.75211245, "num_input_tokens_seen": 97130820, "step": 4500, "time_per_iteration": 3.65415096282959 }, { "auxiliary_loss_clip": 0.01332127, "auxiliary_loss_mlp": 0.01194094, "balance_loss_clip": 1.00812602, "balance_loss_mlp": 1.00030744, "epoch": 0.5412132507665485, "flos": 21105142948800.0, "grad_norm": 1.7039297689905448, "language_loss": 0.87974524, "learning_rate": 1.8303881510489818e-06, "loss": 0.90500748, "num_input_tokens_seen": 97149210, "step": 4501, "time_per_iteration": 3.760613441467285 }, { "auxiliary_loss_clip": 0.01311834, "auxiliary_loss_mlp": 0.01194264, "balance_loss_clip": 1.0078212, "balance_loss_mlp": 1.00038171, "epoch": 0.5413334936571875, "flos": 30227353480800.0, "grad_norm": 2.24580264613822, "language_loss": 0.6891486, "learning_rate": 1.829611998540574e-06, "loss": 0.71420956, "num_input_tokens_seen": 97170415, "step": 4502, "time_per_iteration": 3.695603609085083 }, { "auxiliary_loss_clip": 0.01333016, "auxiliary_loss_mlp": 0.00872595, "balance_loss_clip": 1.00835562, "balance_loss_mlp": 1.00049615, "epoch": 0.5414537365478266, "flos": 24279647067360.0, "grad_norm": 1.961905638554664, "language_loss": 0.79916036, "learning_rate": 1.8288358718798914e-06, "loss": 0.82121646, "num_input_tokens_seen": 97189605, "step": 4503, "time_per_iteration": 2.7905895709991455 }, { "auxiliary_loss_clip": 0.01329738, "auxiliary_loss_mlp": 0.00872559, "balance_loss_clip": 1.00879812, "balance_loss_mlp": 1.00052714, "epoch": 0.5415739794384657, "flos": 16654553180640.0, "grad_norm": 1.7416604540170157, "language_loss": 0.72356558, "learning_rate": 1.8280597711846703e-06, "loss": 0.7455886, "num_input_tokens_seen": 97207845, "step": 4504, "time_per_iteration": 2.653479814529419 }, { "auxiliary_loss_clip": 0.01332679, "auxiliary_loss_mlp": 0.01194052, "balance_loss_clip": 1.00934875, "balance_loss_mlp": 1.00026512, "epoch": 0.5416942223291048, "flos": 23185736200800.0, "grad_norm": 2.4021437750291224, "language_loss": 0.83128202, "learning_rate": 1.8272836965726455e-06, "loss": 0.85654926, "num_input_tokens_seen": 97226780, "step": 4505, "time_per_iteration": 3.712817430496216 }, { "auxiliary_loss_clip": 0.01273903, "auxiliary_loss_mlp": 0.01194031, "balance_loss_clip": 1.00776076, "balance_loss_mlp": 1.0002439, "epoch": 0.5418144652197439, "flos": 20303257775040.0, "grad_norm": 1.63131325211035, "language_loss": 0.78184921, "learning_rate": 1.8265076481615461e-06, "loss": 0.80652851, "num_input_tokens_seen": 97246695, "step": 4506, "time_per_iteration": 2.99153995513916 }, { "auxiliary_loss_clip": 0.01307941, "auxiliary_loss_mlp": 0.01194272, "balance_loss_clip": 1.00895667, "balance_loss_mlp": 1.00038981, "epoch": 0.541934708110383, "flos": 12458641687200.0, "grad_norm": 1.9112420969686144, "language_loss": 0.87323987, "learning_rate": 1.8257316260690987e-06, "loss": 0.89826196, "num_input_tokens_seen": 97264480, "step": 4507, "time_per_iteration": 2.873023509979248 }, { "auxiliary_loss_clip": 0.01328552, "auxiliary_loss_mlp": 0.01194122, "balance_loss_clip": 1.00894928, "balance_loss_mlp": 1.0003351, "epoch": 0.5420549510010221, "flos": 21253814147520.0, "grad_norm": 1.4547533287770043, "language_loss": 0.76012361, "learning_rate": 1.8249556304130254e-06, "loss": 0.78535032, "num_input_tokens_seen": 97285760, "step": 4508, "time_per_iteration": 2.750671625137329 }, { "auxiliary_loss_clip": 0.01319807, "auxiliary_loss_mlp": 0.01194108, "balance_loss_clip": 1.00806475, "balance_loss_mlp": 1.00032163, "epoch": 0.5421751938916611, "flos": 29490535596960.0, "grad_norm": 2.020638658026127, "language_loss": 0.6865654, "learning_rate": 1.824179661311044e-06, "loss": 0.71170455, "num_input_tokens_seen": 97304510, "step": 4509, "time_per_iteration": 2.7994937896728516 }, { "auxiliary_loss_clip": 0.01294295, "auxiliary_loss_mlp": 0.01194082, "balance_loss_clip": 1.00819683, "balance_loss_mlp": 1.00029516, "epoch": 0.5422954367823003, "flos": 18734248340640.0, "grad_norm": 1.790696422029966, "language_loss": 0.79526567, "learning_rate": 1.823403718880868e-06, "loss": 0.82014948, "num_input_tokens_seen": 97323270, "step": 4510, "time_per_iteration": 2.8712782859802246 }, { "auxiliary_loss_clip": 0.01332578, "auxiliary_loss_mlp": 0.01194045, "balance_loss_clip": 1.00906551, "balance_loss_mlp": 1.00025868, "epoch": 0.5424156796729394, "flos": 39969022937760.0, "grad_norm": 1.6289050798708071, "language_loss": 0.66462612, "learning_rate": 1.822627803240207e-06, "loss": 0.68989229, "num_input_tokens_seen": 97345600, "step": 4511, "time_per_iteration": 2.9527828693389893 }, { "auxiliary_loss_clip": 0.01301268, "auxiliary_loss_mlp": 0.01194053, "balance_loss_clip": 1.00796819, "balance_loss_mlp": 1.00026608, "epoch": 0.5425359225635784, "flos": 11546546366880.0, "grad_norm": 1.9776567876695608, "language_loss": 0.84686589, "learning_rate": 1.8218519145067675e-06, "loss": 0.87181914, "num_input_tokens_seen": 97361220, "step": 4512, "time_per_iteration": 2.7532527446746826 }, { "auxiliary_loss_clip": 0.01304558, "auxiliary_loss_mlp": 0.01194103, "balance_loss_clip": 1.00858641, "balance_loss_mlp": 1.00031638, "epoch": 0.5426561654542175, "flos": 20229712496640.0, "grad_norm": 1.7014136028993765, "language_loss": 0.89596838, "learning_rate": 1.8210760527982508e-06, "loss": 0.920955, "num_input_tokens_seen": 97381505, "step": 4513, "time_per_iteration": 2.8133058547973633 }, { "auxiliary_loss_clip": 0.01319507, "auxiliary_loss_mlp": 0.00872506, "balance_loss_clip": 1.00917435, "balance_loss_mlp": 1.00046182, "epoch": 0.5427764083448566, "flos": 21871692430560.0, "grad_norm": 1.7155560878009486, "language_loss": 0.75288641, "learning_rate": 1.8203002182323552e-06, "loss": 0.77480662, "num_input_tokens_seen": 97399060, "step": 4514, "time_per_iteration": 2.772447347640991 }, { "auxiliary_loss_clip": 0.01321081, "auxiliary_loss_mlp": 0.01194105, "balance_loss_clip": 1.00946593, "balance_loss_mlp": 1.00031817, "epoch": 0.5428966512354957, "flos": 19640955108960.0, "grad_norm": 1.7714143872401, "language_loss": 0.75507963, "learning_rate": 1.819524410926773e-06, "loss": 0.78023154, "num_input_tokens_seen": 97416740, "step": 4515, "time_per_iteration": 2.827343463897705 }, { "auxiliary_loss_clip": 0.01252883, "auxiliary_loss_mlp": 0.0119415, "balance_loss_clip": 1.00696301, "balance_loss_mlp": 1.00036311, "epoch": 0.5430168941261347, "flos": 22382198537760.0, "grad_norm": 1.5016516759236123, "language_loss": 0.77226281, "learning_rate": 1.8187486309991944e-06, "loss": 0.79673314, "num_input_tokens_seen": 97437620, "step": 4516, "time_per_iteration": 2.9046292304992676 }, { "auxiliary_loss_clip": 0.01331044, "auxiliary_loss_mlp": 0.01194124, "balance_loss_clip": 1.00969958, "balance_loss_mlp": 1.00033748, "epoch": 0.5431371370167739, "flos": 18764195480640.0, "grad_norm": 1.652761315477666, "language_loss": 0.77411509, "learning_rate": 1.817972878567304e-06, "loss": 0.79936677, "num_input_tokens_seen": 97456275, "step": 4517, "time_per_iteration": 2.7818708419799805 }, { "auxiliary_loss_clip": 0.01329589, "auxiliary_loss_mlp": 0.01194095, "balance_loss_clip": 1.00880909, "balance_loss_mlp": 1.00030804, "epoch": 0.543257379907413, "flos": 18806033358720.0, "grad_norm": 1.8657578158723265, "language_loss": 0.7651698, "learning_rate": 1.8171971537487834e-06, "loss": 0.79040664, "num_input_tokens_seen": 97474925, "step": 4518, "time_per_iteration": 2.80759596824646 }, { "auxiliary_loss_clip": 0.01356733, "auxiliary_loss_mlp": 0.01194127, "balance_loss_clip": 1.00893164, "balance_loss_mlp": 1.00034022, "epoch": 0.543377622798052, "flos": 17493390612000.0, "grad_norm": 1.8414734575505187, "language_loss": 0.80639684, "learning_rate": 1.8164214566613093e-06, "loss": 0.83190548, "num_input_tokens_seen": 97493550, "step": 4519, "time_per_iteration": 2.6629488468170166 }, { "auxiliary_loss_clip": 0.01355101, "auxiliary_loss_mlp": 0.01194164, "balance_loss_clip": 1.00889254, "balance_loss_mlp": 1.00028193, "epoch": 0.5434978656886912, "flos": 18989321775840.0, "grad_norm": 3.3603721132468274, "language_loss": 0.65674353, "learning_rate": 1.8156457874225547e-06, "loss": 0.68223619, "num_input_tokens_seen": 97512010, "step": 4520, "time_per_iteration": 2.7436282634735107 }, { "auxiliary_loss_clip": 0.0130307, "auxiliary_loss_mlp": 0.0119409, "balance_loss_clip": 1.00810456, "balance_loss_mlp": 1.00030291, "epoch": 0.5436181085793302, "flos": 17274946121280.0, "grad_norm": 1.7358018901313612, "language_loss": 0.80609429, "learning_rate": 1.814870146150187e-06, "loss": 0.83106589, "num_input_tokens_seen": 97530120, "step": 4521, "time_per_iteration": 2.743481159210205 }, { "auxiliary_loss_clip": 0.01317745, "auxiliary_loss_mlp": 0.01194286, "balance_loss_clip": 1.00912106, "balance_loss_mlp": 1.00040388, "epoch": 0.5437383514699693, "flos": 19098597906720.0, "grad_norm": 2.000988607353097, "language_loss": 0.78871953, "learning_rate": 1.814094532961871e-06, "loss": 0.81383985, "num_input_tokens_seen": 97548695, "step": 4522, "time_per_iteration": 2.759453296661377 }, { "auxiliary_loss_clip": 0.0129081, "auxiliary_loss_mlp": 0.01194217, "balance_loss_clip": 1.00785756, "balance_loss_mlp": 1.00043011, "epoch": 0.5438585943606085, "flos": 22602726601920.0, "grad_norm": 1.7687722519016855, "language_loss": 0.83890641, "learning_rate": 1.8133189479752666e-06, "loss": 0.86375666, "num_input_tokens_seen": 97567625, "step": 4523, "time_per_iteration": 2.853360652923584 }, { "auxiliary_loss_clip": 0.01356113, "auxiliary_loss_mlp": 0.01194063, "balance_loss_clip": 1.00934076, "balance_loss_mlp": 1.00027609, "epoch": 0.5439788372512475, "flos": 21798506388960.0, "grad_norm": 1.7997343570783657, "language_loss": 0.81642365, "learning_rate": 1.8125433913080292e-06, "loss": 0.84192538, "num_input_tokens_seen": 97585325, "step": 4524, "time_per_iteration": 2.7445671558380127 }, { "auxiliary_loss_clip": 0.01214473, "auxiliary_loss_mlp": 0.01194279, "balance_loss_clip": 1.00785708, "balance_loss_mlp": 1.00039697, "epoch": 0.5440990801418866, "flos": 16399371974400.0, "grad_norm": 2.3478536318146954, "language_loss": 0.82913089, "learning_rate": 1.811767863077811e-06, "loss": 0.85321844, "num_input_tokens_seen": 97604275, "step": 4525, "time_per_iteration": 3.3844799995422363 }, { "auxiliary_loss_clip": 0.01244131, "auxiliary_loss_mlp": 0.01194026, "balance_loss_clip": 1.0068295, "balance_loss_mlp": 1.00023937, "epoch": 0.5442193230325257, "flos": 21615649056000.0, "grad_norm": 1.5657842398738606, "language_loss": 0.78221798, "learning_rate": 1.8109923634022577e-06, "loss": 0.80659956, "num_input_tokens_seen": 97624300, "step": 4526, "time_per_iteration": 4.813599109649658 }, { "auxiliary_loss_clip": 0.01356787, "auxiliary_loss_mlp": 0.01194108, "balance_loss_clip": 1.00907207, "balance_loss_mlp": 1.00032151, "epoch": 0.5443395659231648, "flos": 15481205552160.0, "grad_norm": 1.8565725303141611, "language_loss": 0.86043596, "learning_rate": 1.8102168923990128e-06, "loss": 0.88594496, "num_input_tokens_seen": 97637845, "step": 4527, "time_per_iteration": 3.591278553009033 }, { "auxiliary_loss_clip": 0.01337614, "auxiliary_loss_mlp": 0.00872493, "balance_loss_clip": 1.00849795, "balance_loss_mlp": 1.00061131, "epoch": 0.5444598088138038, "flos": 18770446200960.0, "grad_norm": 1.6741085800717244, "language_loss": 0.79975271, "learning_rate": 1.809441450185714e-06, "loss": 0.82185376, "num_input_tokens_seen": 97656330, "step": 4528, "time_per_iteration": 2.970460891723633 }, { "auxiliary_loss_clip": 0.01330996, "auxiliary_loss_mlp": 0.01194086, "balance_loss_clip": 1.00879812, "balance_loss_mlp": 1.00029898, "epoch": 0.544580051704443, "flos": 21142346672160.0, "grad_norm": 2.1375334498420036, "language_loss": 0.73402095, "learning_rate": 1.8086660368799958e-06, "loss": 0.75927174, "num_input_tokens_seen": 97674380, "step": 4529, "time_per_iteration": 3.7266736030578613 }, { "auxiliary_loss_clip": 0.01311704, "auxiliary_loss_mlp": 0.01194125, "balance_loss_clip": 1.00786269, "balance_loss_mlp": 1.00033844, "epoch": 0.5447002945950821, "flos": 32491522539360.0, "grad_norm": 1.5928569374106665, "language_loss": 0.77812201, "learning_rate": 1.807890652599488e-06, "loss": 0.80318034, "num_input_tokens_seen": 97698765, "step": 4530, "time_per_iteration": 2.853870153427124 }, { "auxiliary_loss_clip": 0.01354321, "auxiliary_loss_mlp": 0.01194057, "balance_loss_clip": 1.00882792, "balance_loss_mlp": 1.00027037, "epoch": 0.5448205374857211, "flos": 11798314823520.0, "grad_norm": 1.7681023383664862, "language_loss": 0.82249963, "learning_rate": 1.8071152974618156e-06, "loss": 0.84798336, "num_input_tokens_seen": 97716565, "step": 4531, "time_per_iteration": 3.6253607273101807 }, { "auxiliary_loss_clip": 0.0130559, "auxiliary_loss_mlp": 0.00872554, "balance_loss_clip": 1.00775707, "balance_loss_mlp": 1.00054538, "epoch": 0.5449407803763603, "flos": 24133777915680.0, "grad_norm": 3.1488678177690144, "language_loss": 0.78464961, "learning_rate": 1.806339971584599e-06, "loss": 0.80643106, "num_input_tokens_seen": 97733225, "step": 4532, "time_per_iteration": 2.907571792602539 }, { "auxiliary_loss_clip": 0.01355735, "auxiliary_loss_mlp": 0.01194073, "balance_loss_clip": 1.009004, "balance_loss_mlp": 1.00028658, "epoch": 0.5450610232669993, "flos": 23258563005600.0, "grad_norm": 1.7015463971439904, "language_loss": 0.85215569, "learning_rate": 1.8055646750854546e-06, "loss": 0.87765372, "num_input_tokens_seen": 97752735, "step": 4533, "time_per_iteration": 2.740323066711426 }, { "auxiliary_loss_clip": 0.01322003, "auxiliary_loss_mlp": 0.01194144, "balance_loss_clip": 1.00857639, "balance_loss_mlp": 1.00035703, "epoch": 0.5451812661576384, "flos": 17785092991680.0, "grad_norm": 2.118273687261153, "language_loss": 0.82003379, "learning_rate": 1.8047894080819945e-06, "loss": 0.84519529, "num_input_tokens_seen": 97769985, "step": 4534, "time_per_iteration": 2.741393804550171 }, { "auxiliary_loss_clip": 0.0132409, "auxiliary_loss_mlp": 0.01193054, "balance_loss_clip": 1.00399446, "balance_loss_mlp": 1.00003052, "epoch": 0.5453015090482776, "flos": 71062623617760.0, "grad_norm": 0.7224666475901231, "language_loss": 0.63186276, "learning_rate": 1.8040141706918258e-06, "loss": 0.65703416, "num_input_tokens_seen": 97831225, "step": 4535, "time_per_iteration": 3.430396318435669 }, { "auxiliary_loss_clip": 0.01304482, "auxiliary_loss_mlp": 0.01194075, "balance_loss_clip": 1.00915956, "balance_loss_mlp": 1.00028813, "epoch": 0.5454217519389166, "flos": 25552212196320.0, "grad_norm": 1.6324675518603664, "language_loss": 0.76982576, "learning_rate": 1.8032389630325525e-06, "loss": 0.79481131, "num_input_tokens_seen": 97849975, "step": 4536, "time_per_iteration": 2.8337252140045166 }, { "auxiliary_loss_clip": 0.01331032, "auxiliary_loss_mlp": 0.01194168, "balance_loss_clip": 1.00856256, "balance_loss_mlp": 1.00038171, "epoch": 0.5455419948295557, "flos": 23658356034720.0, "grad_norm": 1.5329512972297668, "language_loss": 0.75679076, "learning_rate": 1.8024637852217707e-06, "loss": 0.78204274, "num_input_tokens_seen": 97869700, "step": 4537, "time_per_iteration": 2.8108510971069336 }, { "auxiliary_loss_clip": 0.01315052, "auxiliary_loss_mlp": 0.01194108, "balance_loss_clip": 1.00787163, "balance_loss_mlp": 1.00032139, "epoch": 0.5456622377201948, "flos": 23404001073120.0, "grad_norm": 1.6797002941690486, "language_loss": 0.84453326, "learning_rate": 1.8016886373770766e-06, "loss": 0.86962485, "num_input_tokens_seen": 97888215, "step": 4538, "time_per_iteration": 2.871006727218628 }, { "auxiliary_loss_clip": 0.01330975, "auxiliary_loss_mlp": 0.0119415, "balance_loss_clip": 1.00945652, "balance_loss_mlp": 1.00036323, "epoch": 0.5457824806108339, "flos": 23988052458720.0, "grad_norm": 1.5380643799525948, "language_loss": 0.78803915, "learning_rate": 1.8009135196160579e-06, "loss": 0.81329042, "num_input_tokens_seen": 97907090, "step": 4539, "time_per_iteration": 2.7897889614105225 }, { "auxiliary_loss_clip": 0.01307928, "auxiliary_loss_mlp": 0.0119412, "balance_loss_clip": 1.0084765, "balance_loss_mlp": 1.00033379, "epoch": 0.545902723501473, "flos": 22565882115360.0, "grad_norm": 1.5703089991181678, "language_loss": 0.84085482, "learning_rate": 1.8001384320563e-06, "loss": 0.86587524, "num_input_tokens_seen": 97927345, "step": 4540, "time_per_iteration": 2.8126025199890137 }, { "auxiliary_loss_clip": 0.01323938, "auxiliary_loss_mlp": 0.0119304, "balance_loss_clip": 1.00389922, "balance_loss_mlp": 1.00001669, "epoch": 0.5460229663921121, "flos": 55198424109600.0, "grad_norm": 0.7725781839863084, "language_loss": 0.57817912, "learning_rate": 1.7993633748153833e-06, "loss": 0.60334885, "num_input_tokens_seen": 97981950, "step": 4541, "time_per_iteration": 3.164498805999756 }, { "auxiliary_loss_clip": 0.01343367, "auxiliary_loss_mlp": 0.01194109, "balance_loss_clip": 1.00896335, "balance_loss_mlp": 1.00032222, "epoch": 0.5461432092827512, "flos": 15413875070400.0, "grad_norm": 1.6520606701057114, "language_loss": 0.73148686, "learning_rate": 1.7985883480108834e-06, "loss": 0.75686157, "num_input_tokens_seen": 97999585, "step": 4542, "time_per_iteration": 2.6794424057006836 }, { "auxiliary_loss_clip": 0.0134362, "auxiliary_loss_mlp": 0.01194097, "balance_loss_clip": 1.0088073, "balance_loss_mlp": 1.00031066, "epoch": 0.5462634521733902, "flos": 24024932868960.0, "grad_norm": 1.5243846890071482, "language_loss": 0.71884739, "learning_rate": 1.797813351760371e-06, "loss": 0.74422455, "num_input_tokens_seen": 98021290, "step": 4543, "time_per_iteration": 2.768784284591675 }, { "auxiliary_loss_clip": 0.01356175, "auxiliary_loss_mlp": 0.0119422, "balance_loss_clip": 1.00888252, "balance_loss_mlp": 1.00033844, "epoch": 0.5463836950640293, "flos": 22820955550560.0, "grad_norm": 1.6275224239140873, "language_loss": 0.78099263, "learning_rate": 1.7970383861814116e-06, "loss": 0.80649662, "num_input_tokens_seen": 98041060, "step": 4544, "time_per_iteration": 2.7085344791412354 }, { "auxiliary_loss_clip": 0.01331494, "auxiliary_loss_mlp": 0.01194043, "balance_loss_clip": 1.0084796, "balance_loss_mlp": 1.0002563, "epoch": 0.5465039379546685, "flos": 20448300682080.0, "grad_norm": 1.984036652648734, "language_loss": 0.73677081, "learning_rate": 1.7962634513915684e-06, "loss": 0.76202619, "num_input_tokens_seen": 98058410, "step": 4545, "time_per_iteration": 2.7131223678588867 }, { "auxiliary_loss_clip": 0.01354852, "auxiliary_loss_mlp": 0.01194211, "balance_loss_clip": 1.00843859, "balance_loss_mlp": 1.00032949, "epoch": 0.5466241808453075, "flos": 17343318389760.0, "grad_norm": 1.6236202745534358, "language_loss": 0.79248625, "learning_rate": 1.7954885475083969e-06, "loss": 0.81797689, "num_input_tokens_seen": 98076080, "step": 4546, "time_per_iteration": 2.7117185592651367 }, { "auxiliary_loss_clip": 0.01356874, "auxiliary_loss_mlp": 0.0119417, "balance_loss_clip": 1.00952983, "balance_loss_mlp": 1.00038373, "epoch": 0.5467444237359466, "flos": 21617050079520.0, "grad_norm": 2.2625996925255336, "language_loss": 0.72587943, "learning_rate": 1.7947136746494513e-06, "loss": 0.75138992, "num_input_tokens_seen": 98096995, "step": 4547, "time_per_iteration": 2.686861038208008 }, { "auxiliary_loss_clip": 0.01330996, "auxiliary_loss_mlp": 0.01194179, "balance_loss_clip": 1.00820994, "balance_loss_mlp": 1.00029659, "epoch": 0.5468646666265857, "flos": 24170478707520.0, "grad_norm": 1.9143368542451393, "language_loss": 0.87817478, "learning_rate": 1.793938832932277e-06, "loss": 0.90342653, "num_input_tokens_seen": 98115105, "step": 4548, "time_per_iteration": 2.7416350841522217 }, { "auxiliary_loss_clip": 0.01355989, "auxiliary_loss_mlp": 0.01194058, "balance_loss_clip": 1.00911856, "balance_loss_mlp": 1.0002718, "epoch": 0.5469849095172248, "flos": 27527013914400.0, "grad_norm": 1.790560290334889, "language_loss": 0.70761633, "learning_rate": 1.7931640224744185e-06, "loss": 0.73311675, "num_input_tokens_seen": 98135655, "step": 4549, "time_per_iteration": 2.894162178039551 }, { "auxiliary_loss_clip": 0.01311232, "auxiliary_loss_mlp": 0.01194047, "balance_loss_clip": 1.00821984, "balance_loss_mlp": 1.00026035, "epoch": 0.5471051524078638, "flos": 27964693216800.0, "grad_norm": 1.473361803032005, "language_loss": 0.73426187, "learning_rate": 1.7923892433934127e-06, "loss": 0.7593146, "num_input_tokens_seen": 98156730, "step": 4550, "time_per_iteration": 2.8962507247924805 }, { "auxiliary_loss_clip": 0.01319573, "auxiliary_loss_mlp": 0.00872704, "balance_loss_clip": 1.00860941, "balance_loss_mlp": 1.00073123, "epoch": 0.547225395298503, "flos": 18150520268160.0, "grad_norm": 1.4977183185734815, "language_loss": 0.78824031, "learning_rate": 1.7916144958067939e-06, "loss": 0.81016308, "num_input_tokens_seen": 98174590, "step": 4551, "time_per_iteration": 2.9179344177246094 }, { "auxiliary_loss_clip": 0.01335428, "auxiliary_loss_mlp": 0.01194105, "balance_loss_clip": 1.00773668, "balance_loss_mlp": 1.000319, "epoch": 0.5473456381891421, "flos": 21361509636480.0, "grad_norm": 1.6064264211542305, "language_loss": 0.791731, "learning_rate": 1.7908397798320905e-06, "loss": 0.81702638, "num_input_tokens_seen": 98194325, "step": 4552, "time_per_iteration": 2.836223602294922 }, { "auxiliary_loss_clip": 0.01345962, "auxiliary_loss_mlp": 0.00872589, "balance_loss_clip": 1.01028919, "balance_loss_mlp": 1.00045824, "epoch": 0.5474658810797811, "flos": 19932154557120.0, "grad_norm": 1.7835349388482928, "language_loss": 0.74877214, "learning_rate": 1.7900650955868265e-06, "loss": 0.77095771, "num_input_tokens_seen": 98213970, "step": 4553, "time_per_iteration": 4.8294525146484375 }, { "auxiliary_loss_clip": 0.0133428, "auxiliary_loss_mlp": 0.00872545, "balance_loss_clip": 1.00914741, "balance_loss_mlp": 1.00055504, "epoch": 0.5475861239704203, "flos": 50476236013440.0, "grad_norm": 1.418716982156317, "language_loss": 0.76774102, "learning_rate": 1.7892904431885202e-06, "loss": 0.78980923, "num_input_tokens_seen": 98241145, "step": 4554, "time_per_iteration": 3.9654226303100586 }, { "auxiliary_loss_clip": 0.01295299, "auxiliary_loss_mlp": 0.01194121, "balance_loss_clip": 1.00826228, "balance_loss_mlp": 1.00033426, "epoch": 0.5477063668610593, "flos": 20705134377600.0, "grad_norm": 1.8077582726233397, "language_loss": 0.75277829, "learning_rate": 1.788515822754686e-06, "loss": 0.77767253, "num_input_tokens_seen": 98261565, "step": 4555, "time_per_iteration": 2.8134162425994873 }, { "auxiliary_loss_clip": 0.01319275, "auxiliary_loss_mlp": 0.0119411, "balance_loss_clip": 1.00914049, "balance_loss_mlp": 1.0003233, "epoch": 0.5478266097516984, "flos": 19609750640160.0, "grad_norm": 2.030040963647543, "language_loss": 0.78409189, "learning_rate": 1.7877412344028335e-06, "loss": 0.8092258, "num_input_tokens_seen": 98281370, "step": 4556, "time_per_iteration": 2.7892651557922363 }, { "auxiliary_loss_clip": 0.01343928, "auxiliary_loss_mlp": 0.0119412, "balance_loss_clip": 1.00908148, "balance_loss_mlp": 1.00033355, "epoch": 0.5479468526423376, "flos": 12896608379040.0, "grad_norm": 2.0945441853726243, "language_loss": 0.77324694, "learning_rate": 1.7869666782504668e-06, "loss": 0.79862738, "num_input_tokens_seen": 98297950, "step": 4557, "time_per_iteration": 3.7253119945526123 }, { "auxiliary_loss_clip": 0.01320828, "auxiliary_loss_mlp": 0.01194085, "balance_loss_clip": 1.00826907, "balance_loss_mlp": 1.00029874, "epoch": 0.5480670955329766, "flos": 18588810273120.0, "grad_norm": 1.653165892294444, "language_loss": 0.68506861, "learning_rate": 1.7861921544150867e-06, "loss": 0.71021771, "num_input_tokens_seen": 98316800, "step": 4558, "time_per_iteration": 2.8150689601898193 }, { "auxiliary_loss_clip": 0.01250517, "auxiliary_loss_mlp": 0.00872568, "balance_loss_clip": 1.00665689, "balance_loss_mlp": 1.00062752, "epoch": 0.5481873384236157, "flos": 15954615707040.0, "grad_norm": 1.6743861904691437, "language_loss": 0.766945, "learning_rate": 1.7854176630141856e-06, "loss": 0.78817582, "num_input_tokens_seen": 98333935, "step": 4559, "time_per_iteration": 3.1056947708129883 }, { "auxiliary_loss_clip": 0.01356936, "auxiliary_loss_mlp": 0.01194219, "balance_loss_clip": 1.00929725, "balance_loss_mlp": 1.00033689, "epoch": 0.5483075813142548, "flos": 22783823674560.0, "grad_norm": 2.019942383929009, "language_loss": 0.83988392, "learning_rate": 1.784643204165255e-06, "loss": 0.86539549, "num_input_tokens_seen": 98353255, "step": 4560, "time_per_iteration": 2.865360736846924 }, { "auxiliary_loss_clip": 0.01329096, "auxiliary_loss_mlp": 0.0119404, "balance_loss_clip": 1.00883126, "balance_loss_mlp": 1.00025344, "epoch": 0.5484278242048939, "flos": 19317222015840.0, "grad_norm": 1.8446321704881479, "language_loss": 0.77404875, "learning_rate": 1.7838687779857783e-06, "loss": 0.79928005, "num_input_tokens_seen": 98371130, "step": 4561, "time_per_iteration": 2.7260324954986572 }, { "auxiliary_loss_clip": 0.01320283, "auxiliary_loss_mlp": 0.011941, "balance_loss_clip": 1.00822902, "balance_loss_mlp": 1.0003134, "epoch": 0.5485480670955329, "flos": 22816034006400.0, "grad_norm": 1.7637307795568846, "language_loss": 0.64129364, "learning_rate": 1.7830943845932366e-06, "loss": 0.66643751, "num_input_tokens_seen": 98390455, "step": 4562, "time_per_iteration": 2.7010552883148193 }, { "auxiliary_loss_clip": 0.01322909, "auxiliary_loss_mlp": 0.01194149, "balance_loss_clip": 1.00870228, "balance_loss_mlp": 1.00036216, "epoch": 0.5486683099861721, "flos": 22671314412480.0, "grad_norm": 1.6399069191303859, "language_loss": 0.75033683, "learning_rate": 1.7823200241051044e-06, "loss": 0.77550745, "num_input_tokens_seen": 98409370, "step": 4563, "time_per_iteration": 2.8057281970977783 }, { "auxiliary_loss_clip": 0.01355731, "auxiliary_loss_mlp": 0.0119409, "balance_loss_clip": 1.00887918, "balance_loss_mlp": 1.00030303, "epoch": 0.5487885528768112, "flos": 23149394645760.0, "grad_norm": 1.8095991801528144, "language_loss": 0.80799878, "learning_rate": 1.7815456966388513e-06, "loss": 0.83349705, "num_input_tokens_seen": 98428465, "step": 4564, "time_per_iteration": 2.7157790660858154 }, { "auxiliary_loss_clip": 0.01292902, "auxiliary_loss_mlp": 0.01194076, "balance_loss_clip": 1.00832486, "balance_loss_mlp": 1.00028968, "epoch": 0.5489087957674502, "flos": 22053939060960.0, "grad_norm": 2.1882800666910103, "language_loss": 0.8099345, "learning_rate": 1.780771402311943e-06, "loss": 0.8348043, "num_input_tokens_seen": 98447300, "step": 4565, "time_per_iteration": 2.794811248779297 }, { "auxiliary_loss_clip": 0.01317443, "auxiliary_loss_mlp": 0.0119419, "balance_loss_clip": 1.0090152, "balance_loss_mlp": 1.00040388, "epoch": 0.5490290386580894, "flos": 24315988622400.0, "grad_norm": 1.576803681171542, "language_loss": 0.78448737, "learning_rate": 1.7799971412418374e-06, "loss": 0.80960369, "num_input_tokens_seen": 98468695, "step": 4566, "time_per_iteration": 2.82473087310791 }, { "auxiliary_loss_clip": 0.01293337, "auxiliary_loss_mlp": 0.01194168, "balance_loss_clip": 1.00846827, "balance_loss_mlp": 1.00028598, "epoch": 0.5491492815487284, "flos": 18294952472640.0, "grad_norm": 2.0329426051323467, "language_loss": 0.74202907, "learning_rate": 1.7792229135459918e-06, "loss": 0.76690412, "num_input_tokens_seen": 98485345, "step": 4567, "time_per_iteration": 2.7544970512390137 }, { "auxiliary_loss_clip": 0.01248036, "auxiliary_loss_mlp": 0.01193115, "balance_loss_clip": 1.008376, "balance_loss_mlp": 1.00009155, "epoch": 0.5492695244393675, "flos": 64550287320480.0, "grad_norm": 0.7287322463292363, "language_loss": 0.61657083, "learning_rate": 1.7784487193418538e-06, "loss": 0.64098239, "num_input_tokens_seen": 98543195, "step": 4568, "time_per_iteration": 3.49853777885437 }, { "auxiliary_loss_clip": 0.01310714, "auxiliary_loss_mlp": 0.0119408, "balance_loss_clip": 1.00849283, "balance_loss_mlp": 1.00029373, "epoch": 0.5493897673300067, "flos": 17379588097440.0, "grad_norm": 1.919600342290223, "language_loss": 0.6098249, "learning_rate": 1.7776745587468698e-06, "loss": 0.63487279, "num_input_tokens_seen": 98560620, "step": 4569, "time_per_iteration": 3.1652626991271973 }, { "auxiliary_loss_clip": 0.01355258, "auxiliary_loss_mlp": 0.01194046, "balance_loss_clip": 1.00857091, "balance_loss_mlp": 1.00025892, "epoch": 0.5495100102206457, "flos": 19901776332960.0, "grad_norm": 3.0839048555498914, "language_loss": 0.81724161, "learning_rate": 1.7769004318784776e-06, "loss": 0.84273469, "num_input_tokens_seen": 98578265, "step": 4570, "time_per_iteration": 2.6930553913116455 }, { "auxiliary_loss_clip": 0.01333314, "auxiliary_loss_mlp": 0.01194065, "balance_loss_clip": 1.00811517, "balance_loss_mlp": 1.000278, "epoch": 0.5496302531112848, "flos": 16727200367040.0, "grad_norm": 1.564081972796322, "language_loss": 0.80604297, "learning_rate": 1.776126338854113e-06, "loss": 0.83131677, "num_input_tokens_seen": 98596055, "step": 4571, "time_per_iteration": 2.714869260787964 }, { "auxiliary_loss_clip": 0.01331775, "auxiliary_loss_mlp": 0.01194037, "balance_loss_clip": 1.00914812, "balance_loss_mlp": 1.00025046, "epoch": 0.5497504960019239, "flos": 24572355310080.0, "grad_norm": 1.7939147826938413, "language_loss": 0.84627724, "learning_rate": 1.7753522797912044e-06, "loss": 0.8715353, "num_input_tokens_seen": 98616140, "step": 4572, "time_per_iteration": 2.7700936794281006 }, { "auxiliary_loss_clip": 0.01329752, "auxiliary_loss_mlp": 0.01194035, "balance_loss_clip": 1.00862586, "balance_loss_mlp": 1.00024879, "epoch": 0.549870738892563, "flos": 15450504014880.0, "grad_norm": 2.1588676851207897, "language_loss": 0.70194125, "learning_rate": 1.7745782548071765e-06, "loss": 0.72717917, "num_input_tokens_seen": 98633035, "step": 4573, "time_per_iteration": 2.684242010116577 }, { "auxiliary_loss_clip": 0.01286174, "auxiliary_loss_mlp": 0.01194155, "balance_loss_clip": 1.00816071, "balance_loss_mlp": 1.00036824, "epoch": 0.549990981783202, "flos": 21069124706880.0, "grad_norm": 1.8515488602536483, "language_loss": 0.74127698, "learning_rate": 1.7738042640194482e-06, "loss": 0.7660802, "num_input_tokens_seen": 98652700, "step": 4574, "time_per_iteration": 2.758561372756958 }, { "auxiliary_loss_clip": 0.01355178, "auxiliary_loss_mlp": 0.01194026, "balance_loss_clip": 1.008973, "balance_loss_mlp": 1.00023901, "epoch": 0.5501112246738411, "flos": 21395911312800.0, "grad_norm": 1.5243050852721547, "language_loss": 0.70625716, "learning_rate": 1.7730303075454335e-06, "loss": 0.73174918, "num_input_tokens_seen": 98671590, "step": 4575, "time_per_iteration": 2.7535078525543213 }, { "auxiliary_loss_clip": 0.01304669, "auxiliary_loss_mlp": 0.01194096, "balance_loss_clip": 1.00773716, "balance_loss_mlp": 1.00030899, "epoch": 0.5502314675644803, "flos": 17456941285920.0, "grad_norm": 2.3936332264072573, "language_loss": 0.85069484, "learning_rate": 1.7722563855025402e-06, "loss": 0.87568247, "num_input_tokens_seen": 98689620, "step": 4576, "time_per_iteration": 2.8184590339660645 }, { "auxiliary_loss_clip": 0.01331692, "auxiliary_loss_mlp": 0.01194194, "balance_loss_clip": 1.00880361, "balance_loss_mlp": 1.00040758, "epoch": 0.5503517104551193, "flos": 24310420452000.0, "grad_norm": 2.0426617596729386, "language_loss": 0.70523584, "learning_rate": 1.7714824980081721e-06, "loss": 0.73049474, "num_input_tokens_seen": 98708915, "step": 4577, "time_per_iteration": 2.833355188369751 }, { "auxiliary_loss_clip": 0.01330776, "auxiliary_loss_mlp": 0.01194141, "balance_loss_clip": 1.00862861, "balance_loss_mlp": 1.00035405, "epoch": 0.5504719533457584, "flos": 22419438184800.0, "grad_norm": 1.6978444789592468, "language_loss": 0.73607981, "learning_rate": 1.7707086451797276e-06, "loss": 0.76132905, "num_input_tokens_seen": 98729790, "step": 4578, "time_per_iteration": 2.8149795532226562 }, { "auxiliary_loss_clip": 0.01277338, "auxiliary_loss_mlp": 0.0119307, "balance_loss_clip": 1.0031476, "balance_loss_mlp": 1.00004661, "epoch": 0.5505921962363975, "flos": 67294189101600.0, "grad_norm": 0.7166896950188375, "language_loss": 0.52362669, "learning_rate": 1.7699348271345993e-06, "loss": 0.54833078, "num_input_tokens_seen": 98792415, "step": 4579, "time_per_iteration": 5.123655796051025 }, { "auxiliary_loss_clip": 0.01272262, "auxiliary_loss_mlp": 0.01193101, "balance_loss_clip": 1.00399542, "balance_loss_mlp": 1.00007689, "epoch": 0.5507124391270366, "flos": 45685150002720.0, "grad_norm": 0.7046958454548994, "language_loss": 0.54423016, "learning_rate": 1.7691610439901753e-06, "loss": 0.56888378, "num_input_tokens_seen": 98855350, "step": 4580, "time_per_iteration": 3.4369654655456543 }, { "auxiliary_loss_clip": 0.01337825, "auxiliary_loss_mlp": 0.01194043, "balance_loss_clip": 1.00834656, "balance_loss_mlp": 1.00025606, "epoch": 0.5508326820176757, "flos": 22273856422560.0, "grad_norm": 1.9663938986923086, "language_loss": 0.75270593, "learning_rate": 1.7683872958638367e-06, "loss": 0.77802467, "num_input_tokens_seen": 98874230, "step": 4581, "time_per_iteration": 3.734623908996582 }, { "auxiliary_loss_clip": 0.01319801, "auxiliary_loss_mlp": 0.0119407, "balance_loss_clip": 1.00798345, "balance_loss_mlp": 1.00028384, "epoch": 0.5509529249083148, "flos": 20012453487360.0, "grad_norm": 1.8489166805983326, "language_loss": 0.84208381, "learning_rate": 1.7676135828729614e-06, "loss": 0.86722255, "num_input_tokens_seen": 98893940, "step": 4582, "time_per_iteration": 2.761776924133301 }, { "auxiliary_loss_clip": 0.01332805, "auxiliary_loss_mlp": 0.01194059, "balance_loss_clip": 1.00838113, "balance_loss_mlp": 1.00027239, "epoch": 0.5510731677989539, "flos": 21834812020320.0, "grad_norm": 1.8862988966402061, "language_loss": 0.82618499, "learning_rate": 1.7668399051349205e-06, "loss": 0.8514536, "num_input_tokens_seen": 98913620, "step": 4583, "time_per_iteration": 3.7367987632751465 }, { "auxiliary_loss_clip": 0.01300386, "auxiliary_loss_mlp": 0.01193993, "balance_loss_clip": 1.00830424, "balance_loss_mlp": 1.00020647, "epoch": 0.5511934106895929, "flos": 21467911872960.0, "grad_norm": 1.8498251069888143, "language_loss": 0.83078122, "learning_rate": 1.766066262767081e-06, "loss": 0.85572499, "num_input_tokens_seen": 98931460, "step": 4584, "time_per_iteration": 2.829005479812622 }, { "auxiliary_loss_clip": 0.01306152, "auxiliary_loss_mlp": 0.01194084, "balance_loss_clip": 1.00727928, "balance_loss_mlp": 1.00029755, "epoch": 0.5513136535802321, "flos": 21068945088480.0, "grad_norm": 2.019735811307215, "language_loss": 0.77094197, "learning_rate": 1.765292655886803e-06, "loss": 0.79594433, "num_input_tokens_seen": 98950105, "step": 4585, "time_per_iteration": 2.7431042194366455 }, { "auxiliary_loss_clip": 0.01310366, "auxiliary_loss_mlp": 0.01194112, "balance_loss_clip": 1.00861633, "balance_loss_mlp": 1.00032496, "epoch": 0.5514338964708712, "flos": 27815016155040.0, "grad_norm": 1.732399995591696, "language_loss": 0.7031666, "learning_rate": 1.764519084611443e-06, "loss": 0.7282114, "num_input_tokens_seen": 98970560, "step": 4586, "time_per_iteration": 2.8904199600219727 }, { "auxiliary_loss_clip": 0.01319626, "auxiliary_loss_mlp": 0.01194218, "balance_loss_clip": 1.00835204, "balance_loss_mlp": 1.00033605, "epoch": 0.5515541393615102, "flos": 21908536917120.0, "grad_norm": 2.9108348622335467, "language_loss": 0.78029692, "learning_rate": 1.7637455490583505e-06, "loss": 0.8054353, "num_input_tokens_seen": 98989885, "step": 4587, "time_per_iteration": 2.7129783630371094 }, { "auxiliary_loss_clip": 0.01335867, "auxiliary_loss_mlp": 0.01194117, "balance_loss_clip": 1.00794959, "balance_loss_mlp": 1.00033069, "epoch": 0.5516743822521494, "flos": 20485432558080.0, "grad_norm": 1.9101561716282278, "language_loss": 0.7746588, "learning_rate": 1.7629720493448701e-06, "loss": 0.79995871, "num_input_tokens_seen": 99007180, "step": 4588, "time_per_iteration": 2.7416858673095703 }, { "auxiliary_loss_clip": 0.01329824, "auxiliary_loss_mlp": 0.01194095, "balance_loss_clip": 1.00911188, "balance_loss_mlp": 1.00030875, "epoch": 0.5517946251427884, "flos": 14940393068160.0, "grad_norm": 1.6855452340940629, "language_loss": 0.85127687, "learning_rate": 1.7621985855883418e-06, "loss": 0.8765161, "num_input_tokens_seen": 99023880, "step": 4589, "time_per_iteration": 2.7092771530151367 }, { "auxiliary_loss_clip": 0.01308163, "auxiliary_loss_mlp": 0.01194036, "balance_loss_clip": 1.00807929, "balance_loss_mlp": 1.00024927, "epoch": 0.5519148680334275, "flos": 18404875229760.0, "grad_norm": 1.8700144776546002, "language_loss": 0.72420937, "learning_rate": 1.7614251579060983e-06, "loss": 0.74923134, "num_input_tokens_seen": 99042475, "step": 4590, "time_per_iteration": 2.783404588699341 }, { "auxiliary_loss_clip": 0.01282796, "auxiliary_loss_mlp": 0.0119399, "balance_loss_clip": 1.0073514, "balance_loss_mlp": 1.00020349, "epoch": 0.5520351109240667, "flos": 25113347412480.0, "grad_norm": 1.5974683910549263, "language_loss": 0.84665251, "learning_rate": 1.76065176641547e-06, "loss": 0.87142044, "num_input_tokens_seen": 99065185, "step": 4591, "time_per_iteration": 2.867126941680908 }, { "auxiliary_loss_clip": 0.0134214, "auxiliary_loss_mlp": 0.01194085, "balance_loss_clip": 1.00820374, "balance_loss_mlp": 1.0002985, "epoch": 0.5521553538147057, "flos": 21069555791040.0, "grad_norm": 1.8721463597003225, "language_loss": 0.77751899, "learning_rate": 1.759878411233777e-06, "loss": 0.8028813, "num_input_tokens_seen": 99083645, "step": 4592, "time_per_iteration": 2.733180046081543 }, { "auxiliary_loss_clip": 0.01332511, "auxiliary_loss_mlp": 0.01194113, "balance_loss_clip": 1.00810719, "balance_loss_mlp": 1.00032616, "epoch": 0.5522755967053448, "flos": 18880009721280.0, "grad_norm": 2.124656958569336, "language_loss": 0.75771272, "learning_rate": 1.7591050924783388e-06, "loss": 0.78297895, "num_input_tokens_seen": 99100835, "step": 4593, "time_per_iteration": 2.7535018920898438 }, { "auxiliary_loss_clip": 0.01254027, "auxiliary_loss_mlp": 0.01193046, "balance_loss_clip": 1.00334966, "balance_loss_mlp": 1.00002241, "epoch": 0.5523958395959839, "flos": 64675657260000.0, "grad_norm": 0.8341099089609094, "language_loss": 0.57962936, "learning_rate": 1.7583318102664661e-06, "loss": 0.60410011, "num_input_tokens_seen": 99168400, "step": 4594, "time_per_iteration": 3.4074127674102783 }, { "auxiliary_loss_clip": 0.01342872, "auxiliary_loss_mlp": 0.01194059, "balance_loss_clip": 1.00822878, "balance_loss_mlp": 1.00027251, "epoch": 0.552516082486623, "flos": 10889740023840.0, "grad_norm": 1.7681381945355583, "language_loss": 0.79000783, "learning_rate": 1.757558564715466e-06, "loss": 0.81537712, "num_input_tokens_seen": 99186475, "step": 4595, "time_per_iteration": 2.693230628967285 }, { "auxiliary_loss_clip": 0.01332891, "auxiliary_loss_mlp": 0.0119418, "balance_loss_clip": 1.00827873, "balance_loss_mlp": 1.00020254, "epoch": 0.552636325377262, "flos": 22199808212640.0, "grad_norm": 2.265450159922329, "language_loss": 0.74326742, "learning_rate": 1.7567853559426386e-06, "loss": 0.76853806, "num_input_tokens_seen": 99203525, "step": 4596, "time_per_iteration": 2.706254482269287 }, { "auxiliary_loss_clip": 0.01337634, "auxiliary_loss_mlp": 0.01194087, "balance_loss_clip": 1.00841999, "balance_loss_mlp": 1.00030029, "epoch": 0.5527565682679012, "flos": 23988196153440.0, "grad_norm": 1.9011420977043674, "language_loss": 0.75230074, "learning_rate": 1.7560121840652797e-06, "loss": 0.77761793, "num_input_tokens_seen": 99222910, "step": 4597, "time_per_iteration": 2.732609987258911 }, { "auxiliary_loss_clip": 0.0129425, "auxiliary_loss_mlp": 0.01194082, "balance_loss_clip": 1.00794578, "balance_loss_mlp": 1.0002954, "epoch": 0.5528768111585403, "flos": 19719278236800.0, "grad_norm": 1.72871020518973, "language_loss": 0.69541824, "learning_rate": 1.7552390492006782e-06, "loss": 0.72030151, "num_input_tokens_seen": 99241230, "step": 4598, "time_per_iteration": 2.822892665863037 }, { "auxiliary_loss_clip": 0.01297426, "auxiliary_loss_mlp": 0.00872675, "balance_loss_clip": 1.00861263, "balance_loss_mlp": 1.00050759, "epoch": 0.5529970540491793, "flos": 26215987733280.0, "grad_norm": 1.600417412766015, "language_loss": 0.65301973, "learning_rate": 1.7544659514661184e-06, "loss": 0.67472076, "num_input_tokens_seen": 99264320, "step": 4599, "time_per_iteration": 2.861492156982422 }, { "auxiliary_loss_clip": 0.01331065, "auxiliary_loss_mlp": 0.01194093, "balance_loss_clip": 1.00883746, "balance_loss_mlp": 1.00030673, "epoch": 0.5531172969398185, "flos": 24425983226880.0, "grad_norm": 1.8737323564343709, "language_loss": 0.7964946, "learning_rate": 1.7536928909788786e-06, "loss": 0.82174611, "num_input_tokens_seen": 99283625, "step": 4600, "time_per_iteration": 2.8354721069335938 }, { "auxiliary_loss_clip": 0.01252946, "auxiliary_loss_mlp": 0.01193067, "balance_loss_clip": 1.00390553, "balance_loss_mlp": 1.0000428, "epoch": 0.5532375398304575, "flos": 64907344379520.0, "grad_norm": 0.8769856066047548, "language_loss": 0.62012428, "learning_rate": 1.752919867856231e-06, "loss": 0.64458436, "num_input_tokens_seen": 99335270, "step": 4601, "time_per_iteration": 3.204984664916992 }, { "auxiliary_loss_clip": 0.01321771, "auxiliary_loss_mlp": 0.01194153, "balance_loss_clip": 1.0078826, "balance_loss_mlp": 1.00036645, "epoch": 0.5533577827210966, "flos": 19683116300160.0, "grad_norm": 3.3421401455646396, "language_loss": 0.78674674, "learning_rate": 1.7521468822154436e-06, "loss": 0.81190592, "num_input_tokens_seen": 99354185, "step": 4602, "time_per_iteration": 2.7937958240509033 }, { "auxiliary_loss_clip": 0.01307699, "auxiliary_loss_mlp": 0.01194088, "balance_loss_clip": 1.00812614, "balance_loss_mlp": 1.00030112, "epoch": 0.5534780256117358, "flos": 32306509785600.0, "grad_norm": 1.7463490840826836, "language_loss": 0.74928206, "learning_rate": 1.751373934173777e-06, "loss": 0.77429992, "num_input_tokens_seen": 99376930, "step": 4603, "time_per_iteration": 2.8452329635620117 }, { "auxiliary_loss_clip": 0.01356686, "auxiliary_loss_mlp": 0.01194114, "balance_loss_clip": 1.0089376, "balance_loss_mlp": 1.00032711, "epoch": 0.5535982685023748, "flos": 23222436992640.0, "grad_norm": 1.6124734519627153, "language_loss": 0.73260593, "learning_rate": 1.750601023848487e-06, "loss": 0.75811392, "num_input_tokens_seen": 99397655, "step": 4604, "time_per_iteration": 2.771988868713379 }, { "auxiliary_loss_clip": 0.01354376, "auxiliary_loss_mlp": 0.00872563, "balance_loss_clip": 1.00874496, "balance_loss_mlp": 1.00043631, "epoch": 0.5537185113930139, "flos": 24352545719520.0, "grad_norm": 1.9578660523405385, "language_loss": 0.74139726, "learning_rate": 1.749828151356823e-06, "loss": 0.76366663, "num_input_tokens_seen": 99417850, "step": 4605, "time_per_iteration": 4.690926790237427 }, { "auxiliary_loss_clip": 0.01314988, "auxiliary_loss_mlp": 0.01194047, "balance_loss_clip": 1.00755608, "balance_loss_mlp": 1.00026011, "epoch": 0.553838754283653, "flos": 23549079903840.0, "grad_norm": 1.783181057123668, "language_loss": 0.7545231, "learning_rate": 1.7490553168160297e-06, "loss": 0.77961344, "num_input_tokens_seen": 99438920, "step": 4606, "time_per_iteration": 2.9408609867095947 }, { "auxiliary_loss_clip": 0.0131659, "auxiliary_loss_mlp": 0.01194086, "balance_loss_clip": 1.00799751, "balance_loss_mlp": 1.00029898, "epoch": 0.5539589971742921, "flos": 17275053892320.0, "grad_norm": 1.871063978811194, "language_loss": 0.76525617, "learning_rate": 1.748282520343345e-06, "loss": 0.79036289, "num_input_tokens_seen": 99457950, "step": 4607, "time_per_iteration": 3.85746431350708 }, { "auxiliary_loss_clip": 0.01344404, "auxiliary_loss_mlp": 0.01194424, "balance_loss_clip": 1.00915575, "balance_loss_mlp": 1.00035155, "epoch": 0.5540792400649311, "flos": 27564181714080.0, "grad_norm": 1.9502441970124234, "language_loss": 0.7879979, "learning_rate": 1.7475097620560023e-06, "loss": 0.8133862, "num_input_tokens_seen": 99478015, "step": 4608, "time_per_iteration": 2.7925164699554443 }, { "auxiliary_loss_clip": 0.01355563, "auxiliary_loss_mlp": 0.01194086, "balance_loss_clip": 1.00898886, "balance_loss_mlp": 1.00029969, "epoch": 0.5541994829555702, "flos": 23878668556800.0, "grad_norm": 1.6404932164061334, "language_loss": 0.71062672, "learning_rate": 1.746737042071228e-06, "loss": 0.7361232, "num_input_tokens_seen": 99496520, "step": 4609, "time_per_iteration": 3.6236212253570557 }, { "auxiliary_loss_clip": 0.01308389, "auxiliary_loss_mlp": 0.01194051, "balance_loss_clip": 1.00729108, "balance_loss_mlp": 1.00026441, "epoch": 0.5543197258462094, "flos": 20115730363680.0, "grad_norm": 1.715162167922159, "language_loss": 0.79085845, "learning_rate": 1.7459643605062424e-06, "loss": 0.8158828, "num_input_tokens_seen": 99513780, "step": 4610, "time_per_iteration": 2.7321858406066895 }, { "auxiliary_loss_clip": 0.01262632, "auxiliary_loss_mlp": 0.01194164, "balance_loss_clip": 1.00653613, "balance_loss_mlp": 1.00037766, "epoch": 0.5544399687368484, "flos": 20916573750720.0, "grad_norm": 1.6309598026395475, "language_loss": 0.80764139, "learning_rate": 1.745191717478262e-06, "loss": 0.83220935, "num_input_tokens_seen": 99532360, "step": 4611, "time_per_iteration": 3.01560115814209 }, { "auxiliary_loss_clip": 0.01318763, "auxiliary_loss_mlp": 0.01193966, "balance_loss_clip": 1.00894165, "balance_loss_mlp": 1.00027514, "epoch": 0.5545602116274875, "flos": 25518672688320.0, "grad_norm": 1.7140636646604674, "language_loss": 0.79500121, "learning_rate": 1.7444191131044948e-06, "loss": 0.82012856, "num_input_tokens_seen": 99552635, "step": 4612, "time_per_iteration": 2.778944969177246 }, { "auxiliary_loss_clip": 0.01308027, "auxiliary_loss_mlp": 0.01194067, "balance_loss_clip": 1.00786328, "balance_loss_mlp": 1.00028014, "epoch": 0.5546804545181266, "flos": 20995579428480.0, "grad_norm": 1.8211869546697381, "language_loss": 0.73092717, "learning_rate": 1.7436465475021456e-06, "loss": 0.75594813, "num_input_tokens_seen": 99572685, "step": 4613, "time_per_iteration": 2.7496767044067383 }, { "auxiliary_loss_clip": 0.01297013, "auxiliary_loss_mlp": 0.01194029, "balance_loss_clip": 1.00764632, "balance_loss_mlp": 1.00024223, "epoch": 0.5548006974087657, "flos": 26833650474240.0, "grad_norm": 2.031481615155788, "language_loss": 0.71342844, "learning_rate": 1.7428740207884111e-06, "loss": 0.73833883, "num_input_tokens_seen": 99593565, "step": 4614, "time_per_iteration": 2.856550693511963 }, { "auxiliary_loss_clip": 0.01261667, "auxiliary_loss_mlp": 0.01194042, "balance_loss_clip": 1.00752449, "balance_loss_mlp": 1.00025511, "epoch": 0.5549209402994048, "flos": 33656428103040.0, "grad_norm": 2.0314799185583152, "language_loss": 0.61062038, "learning_rate": 1.7421015330804833e-06, "loss": 0.63517749, "num_input_tokens_seen": 99613485, "step": 4615, "time_per_iteration": 2.8939011096954346 }, { "auxiliary_loss_clip": 0.0135528, "auxiliary_loss_mlp": 0.01194076, "balance_loss_clip": 1.00887346, "balance_loss_mlp": 1.0002892, "epoch": 0.5550411831900439, "flos": 23769536120640.0, "grad_norm": 1.8874719714116441, "language_loss": 0.72227079, "learning_rate": 1.7413290844955475e-06, "loss": 0.74776435, "num_input_tokens_seen": 99633515, "step": 4616, "time_per_iteration": 2.7015607357025146 }, { "auxiliary_loss_clip": 0.01343041, "auxiliary_loss_mlp": 0.01194058, "balance_loss_clip": 1.00962615, "balance_loss_mlp": 1.0002718, "epoch": 0.555161426080683, "flos": 21651200290080.0, "grad_norm": 1.6924468483427058, "language_loss": 0.78355324, "learning_rate": 1.7405566751507843e-06, "loss": 0.8089242, "num_input_tokens_seen": 99651560, "step": 4617, "time_per_iteration": 2.7488720417022705 }, { "auxiliary_loss_clip": 0.01297266, "auxiliary_loss_mlp": 0.0119408, "balance_loss_clip": 1.00777185, "balance_loss_mlp": 1.00029349, "epoch": 0.555281668971322, "flos": 49563134830080.0, "grad_norm": 1.6556047630079547, "language_loss": 0.67432666, "learning_rate": 1.7397843051633668e-06, "loss": 0.69924009, "num_input_tokens_seen": 99674255, "step": 4618, "time_per_iteration": 3.0470025539398193 }, { "auxiliary_loss_clip": 0.01328539, "auxiliary_loss_mlp": 0.01194158, "balance_loss_clip": 1.00815988, "balance_loss_mlp": 1.00037181, "epoch": 0.5554019118619612, "flos": 20741619627360.0, "grad_norm": 1.595070241627213, "language_loss": 0.71160233, "learning_rate": 1.739011974650464e-06, "loss": 0.73682928, "num_input_tokens_seen": 99693585, "step": 4619, "time_per_iteration": 2.7887446880340576 }, { "auxiliary_loss_clip": 0.0127048, "auxiliary_loss_mlp": 0.01194012, "balance_loss_clip": 1.00691891, "balance_loss_mlp": 1.0002259, "epoch": 0.5555221547526003, "flos": 25483229225280.0, "grad_norm": 2.148488346074867, "language_loss": 0.76666069, "learning_rate": 1.7382396837292365e-06, "loss": 0.79130566, "num_input_tokens_seen": 99714045, "step": 4620, "time_per_iteration": 2.874948024749756 }, { "auxiliary_loss_clip": 0.01355028, "auxiliary_loss_mlp": 0.01194211, "balance_loss_clip": 1.00874376, "balance_loss_mlp": 1.00042462, "epoch": 0.5556423976432393, "flos": 21762524070720.0, "grad_norm": 1.654762665483788, "language_loss": 0.73419559, "learning_rate": 1.737467432516841e-06, "loss": 0.75968802, "num_input_tokens_seen": 99734145, "step": 4621, "time_per_iteration": 2.6009955406188965 }, { "auxiliary_loss_clip": 0.01330516, "auxiliary_loss_mlp": 0.0119419, "balance_loss_clip": 1.00881958, "balance_loss_mlp": 1.0003078, "epoch": 0.5557626405338785, "flos": 24900183702720.0, "grad_norm": 2.184818859029535, "language_loss": 0.74110758, "learning_rate": 1.7366952211304274e-06, "loss": 0.76635468, "num_input_tokens_seen": 99751990, "step": 4622, "time_per_iteration": 2.7580909729003906 }, { "auxiliary_loss_clip": 0.01317238, "auxiliary_loss_mlp": 0.0119407, "balance_loss_clip": 1.00800681, "balance_loss_mlp": 1.00028396, "epoch": 0.5558828834245175, "flos": 18697511625120.0, "grad_norm": 1.9471049354492511, "language_loss": 0.83222985, "learning_rate": 1.735923049687139e-06, "loss": 0.85734284, "num_input_tokens_seen": 99768565, "step": 4623, "time_per_iteration": 2.6457793712615967 }, { "auxiliary_loss_clip": 0.01330971, "auxiliary_loss_mlp": 0.01194067, "balance_loss_clip": 1.00882626, "balance_loss_mlp": 1.00028038, "epoch": 0.5560031263151566, "flos": 27272191944960.0, "grad_norm": 1.5277442606471618, "language_loss": 0.73998547, "learning_rate": 1.7351509183041144e-06, "loss": 0.76523584, "num_input_tokens_seen": 99788895, "step": 4624, "time_per_iteration": 2.7887744903564453 }, { "auxiliary_loss_clip": 0.01356092, "auxiliary_loss_mlp": 0.01194092, "balance_loss_clip": 1.00934637, "balance_loss_mlp": 1.00030518, "epoch": 0.5561233692057957, "flos": 23403749607360.0, "grad_norm": 1.7484273903784533, "language_loss": 0.71547818, "learning_rate": 1.7343788270984852e-06, "loss": 0.74098003, "num_input_tokens_seen": 99808035, "step": 4625, "time_per_iteration": 2.677715539932251 }, { "auxiliary_loss_clip": 0.01309279, "auxiliary_loss_mlp": 0.01194118, "balance_loss_clip": 1.00838006, "balance_loss_mlp": 1.00033164, "epoch": 0.5562436120964348, "flos": 37670883287040.0, "grad_norm": 1.747469053010907, "language_loss": 0.74753016, "learning_rate": 1.7336067761873764e-06, "loss": 0.77256405, "num_input_tokens_seen": 99830460, "step": 4626, "time_per_iteration": 2.871142625808716 }, { "auxiliary_loss_clip": 0.01343015, "auxiliary_loss_mlp": 0.01194344, "balance_loss_clip": 1.00854015, "balance_loss_mlp": 1.00036669, "epoch": 0.5563638549870739, "flos": 25155257137920.0, "grad_norm": 1.8840203823609503, "language_loss": 0.76630735, "learning_rate": 1.7328347656879076e-06, "loss": 0.79168099, "num_input_tokens_seen": 99850320, "step": 4627, "time_per_iteration": 2.7875123023986816 }, { "auxiliary_loss_clip": 0.01318406, "auxiliary_loss_mlp": 0.01194256, "balance_loss_clip": 1.00903308, "balance_loss_mlp": 1.00046897, "epoch": 0.556484097877713, "flos": 13581817143840.0, "grad_norm": 2.4115384730828784, "language_loss": 0.68098128, "learning_rate": 1.7320627957171927e-06, "loss": 0.70610785, "num_input_tokens_seen": 99864980, "step": 4628, "time_per_iteration": 2.762860059738159 }, { "auxiliary_loss_clip": 0.01354834, "auxiliary_loss_mlp": 0.0119418, "balance_loss_clip": 1.00903177, "balance_loss_mlp": 1.00039363, "epoch": 0.5566043407683521, "flos": 24681379975200.0, "grad_norm": 1.7838153833873758, "language_loss": 0.81259865, "learning_rate": 1.7312908663923382e-06, "loss": 0.83808875, "num_input_tokens_seen": 99881155, "step": 4629, "time_per_iteration": 2.683422803878784 }, { "auxiliary_loss_clip": 0.01342304, "auxiliary_loss_mlp": 0.01194095, "balance_loss_clip": 1.00880349, "balance_loss_mlp": 1.00030851, "epoch": 0.5567245836589911, "flos": 20588170579200.0, "grad_norm": 1.8936782530925893, "language_loss": 0.6743626, "learning_rate": 1.7305189778304463e-06, "loss": 0.69972658, "num_input_tokens_seen": 99899330, "step": 4630, "time_per_iteration": 2.65407657623291 }, { "auxiliary_loss_clip": 0.01311238, "auxiliary_loss_mlp": 0.0119416, "balance_loss_clip": 1.00830293, "balance_loss_mlp": 1.00027776, "epoch": 0.5568448265496303, "flos": 20704200361920.0, "grad_norm": 2.053005442863684, "language_loss": 0.79750669, "learning_rate": 1.729747130148611e-06, "loss": 0.82256061, "num_input_tokens_seen": 99918525, "step": 4631, "time_per_iteration": 3.630000352859497 }, { "auxiliary_loss_clip": 0.01302083, "auxiliary_loss_mlp": 0.01194223, "balance_loss_clip": 1.0083164, "balance_loss_mlp": 1.00034082, "epoch": 0.5569650694402694, "flos": 25302922473600.0, "grad_norm": 2.950677646862584, "language_loss": 0.77014637, "learning_rate": 1.7289753234639208e-06, "loss": 0.79510945, "num_input_tokens_seen": 99937500, "step": 4632, "time_per_iteration": 3.8374578952789307 }, { "auxiliary_loss_clip": 0.01341801, "auxiliary_loss_mlp": 0.01194094, "balance_loss_clip": 1.00883937, "balance_loss_mlp": 1.00030792, "epoch": 0.5570853123309084, "flos": 19712632356000.0, "grad_norm": 1.6529225866271755, "language_loss": 0.76039779, "learning_rate": 1.7282035578934592e-06, "loss": 0.78575677, "num_input_tokens_seen": 99955665, "step": 4633, "time_per_iteration": 3.615178346633911 }, { "auxiliary_loss_clip": 0.01303662, "auxiliary_loss_mlp": 0.01194076, "balance_loss_clip": 1.00861359, "balance_loss_mlp": 1.00028944, "epoch": 0.5572055552215476, "flos": 16108100678880.0, "grad_norm": 1.8530938229016736, "language_loss": 0.78468215, "learning_rate": 1.727431833554301e-06, "loss": 0.80965948, "num_input_tokens_seen": 99974140, "step": 4634, "time_per_iteration": 2.7236177921295166 }, { "auxiliary_loss_clip": 0.01266226, "auxiliary_loss_mlp": 0.01194057, "balance_loss_clip": 1.00781488, "balance_loss_mlp": 1.00027025, "epoch": 0.5573257981121866, "flos": 17128825503840.0, "grad_norm": 1.6438722152711283, "language_loss": 0.77470076, "learning_rate": 1.7266601505635175e-06, "loss": 0.79930359, "num_input_tokens_seen": 99991480, "step": 4635, "time_per_iteration": 2.852226495742798 }, { "auxiliary_loss_clip": 0.01332404, "auxiliary_loss_mlp": 0.01194049, "balance_loss_clip": 1.00804842, "balance_loss_mlp": 1.00026286, "epoch": 0.5574460410028257, "flos": 18807039221760.0, "grad_norm": 2.0513249417508037, "language_loss": 0.75254679, "learning_rate": 1.7258885090381717e-06, "loss": 0.77781129, "num_input_tokens_seen": 100009520, "step": 4636, "time_per_iteration": 3.605403423309326 }, { "auxiliary_loss_clip": 0.01321795, "auxiliary_loss_mlp": 0.01194092, "balance_loss_clip": 1.00788558, "balance_loss_mlp": 1.00030518, "epoch": 0.5575662838934649, "flos": 29642691392640.0, "grad_norm": 12.756059722927736, "language_loss": 0.78638256, "learning_rate": 1.7251169090953213e-06, "loss": 0.81154138, "num_input_tokens_seen": 100029995, "step": 4637, "time_per_iteration": 2.8242180347442627 }, { "auxiliary_loss_clip": 0.01332437, "auxiliary_loss_mlp": 0.01194187, "balance_loss_clip": 1.00789595, "balance_loss_mlp": 1.00030541, "epoch": 0.5576865267841039, "flos": 22054477916160.0, "grad_norm": 3.152547634888369, "language_loss": 0.76396888, "learning_rate": 1.7243453508520168e-06, "loss": 0.78923512, "num_input_tokens_seen": 100046980, "step": 4638, "time_per_iteration": 2.728626251220703 }, { "auxiliary_loss_clip": 0.01317449, "auxiliary_loss_mlp": 0.01194116, "balance_loss_clip": 1.00832248, "balance_loss_mlp": 1.00032985, "epoch": 0.557806769674743, "flos": 17196048214560.0, "grad_norm": 2.133882337280147, "language_loss": 0.84459507, "learning_rate": 1.7235738344253038e-06, "loss": 0.86971068, "num_input_tokens_seen": 100060610, "step": 4639, "time_per_iteration": 2.8318467140197754 }, { "auxiliary_loss_clip": 0.01331385, "auxiliary_loss_mlp": 0.01194113, "balance_loss_clip": 1.0090729, "balance_loss_mlp": 1.00032687, "epoch": 0.557927012565382, "flos": 24712728138720.0, "grad_norm": 1.7536653764907197, "language_loss": 0.82681751, "learning_rate": 1.72280235993222e-06, "loss": 0.8520726, "num_input_tokens_seen": 100078915, "step": 4640, "time_per_iteration": 2.770831823348999 }, { "auxiliary_loss_clip": 0.01329107, "auxiliary_loss_mlp": 0.00872623, "balance_loss_clip": 1.00868893, "balance_loss_mlp": 1.00046992, "epoch": 0.5580472554560212, "flos": 16983100046880.0, "grad_norm": 2.2800622790117075, "language_loss": 0.69945467, "learning_rate": 1.722030927489798e-06, "loss": 0.72147191, "num_input_tokens_seen": 100096195, "step": 4641, "time_per_iteration": 2.704127311706543 }, { "auxiliary_loss_clip": 0.0128842, "auxiliary_loss_mlp": 0.01194078, "balance_loss_clip": 1.0066185, "balance_loss_mlp": 1.00029182, "epoch": 0.5581674983466602, "flos": 23509110057120.0, "grad_norm": 1.6302189210251878, "language_loss": 0.73962152, "learning_rate": 1.7212595372150634e-06, "loss": 0.76444656, "num_input_tokens_seen": 100116175, "step": 4642, "time_per_iteration": 2.8306422233581543 }, { "auxiliary_loss_clip": 0.01354701, "auxiliary_loss_mlp": 0.01194123, "balance_loss_clip": 1.0089097, "balance_loss_mlp": 1.00033641, "epoch": 0.5582877412372993, "flos": 13480300527840.0, "grad_norm": 2.0004007275231643, "language_loss": 0.72554755, "learning_rate": 1.720488189225035e-06, "loss": 0.75103581, "num_input_tokens_seen": 100133875, "step": 4643, "time_per_iteration": 2.6806650161743164 }, { "auxiliary_loss_clip": 0.01336859, "auxiliary_loss_mlp": 0.01194082, "balance_loss_clip": 1.00828171, "balance_loss_mlp": 1.0002954, "epoch": 0.5584079841279385, "flos": 21903615372960.0, "grad_norm": 2.292923031270637, "language_loss": 0.79274714, "learning_rate": 1.7197168836367265e-06, "loss": 0.81805658, "num_input_tokens_seen": 100150685, "step": 4644, "time_per_iteration": 2.840423822402954 }, { "auxiliary_loss_clip": 0.01331178, "auxiliary_loss_mlp": 0.00872582, "balance_loss_clip": 1.00766969, "balance_loss_mlp": 1.00053072, "epoch": 0.5585282270185775, "flos": 18843560395200.0, "grad_norm": 1.8582191937824024, "language_loss": 0.81743991, "learning_rate": 1.7189456205671433e-06, "loss": 0.83947754, "num_input_tokens_seen": 100169530, "step": 4645, "time_per_iteration": 2.6802799701690674 }, { "auxiliary_loss_clip": 0.01329492, "auxiliary_loss_mlp": 0.01194121, "balance_loss_clip": 1.00933194, "balance_loss_mlp": 1.00033402, "epoch": 0.5586484699092166, "flos": 21868459299360.0, "grad_norm": 1.8152963924692158, "language_loss": 0.8218565, "learning_rate": 1.7181744001332866e-06, "loss": 0.84709251, "num_input_tokens_seen": 100188140, "step": 4646, "time_per_iteration": 2.7396082878112793 }, { "auxiliary_loss_clip": 0.01353777, "auxiliary_loss_mlp": 0.01194092, "balance_loss_clip": 1.00879741, "balance_loss_mlp": 1.00030506, "epoch": 0.5587687127998557, "flos": 22893243500160.0, "grad_norm": 1.825781969841318, "language_loss": 0.63536143, "learning_rate": 1.7174032224521493e-06, "loss": 0.66084015, "num_input_tokens_seen": 100206850, "step": 4647, "time_per_iteration": 2.688123941421509 }, { "auxiliary_loss_clip": 0.01336228, "auxiliary_loss_mlp": 0.01194121, "balance_loss_clip": 1.00846505, "balance_loss_mlp": 1.00033474, "epoch": 0.5588889556904948, "flos": 20303078156640.0, "grad_norm": 1.6128168332295851, "language_loss": 0.69700819, "learning_rate": 1.7166320876407184e-06, "loss": 0.72231174, "num_input_tokens_seen": 100226270, "step": 4648, "time_per_iteration": 2.7937347888946533 }, { "auxiliary_loss_clip": 0.01355209, "auxiliary_loss_mlp": 0.0087266, "balance_loss_clip": 1.00909781, "balance_loss_mlp": 1.00059247, "epoch": 0.5590091985811338, "flos": 16472162855520.0, "grad_norm": 1.890421488646106, "language_loss": 0.67976975, "learning_rate": 1.7158609958159742e-06, "loss": 0.70204842, "num_input_tokens_seen": 100243675, "step": 4649, "time_per_iteration": 2.6793417930603027 }, { "auxiliary_loss_clip": 0.01268803, "auxiliary_loss_mlp": 0.01194179, "balance_loss_clip": 1.00948572, "balance_loss_mlp": 1.00039279, "epoch": 0.559129441471773, "flos": 14532193897920.0, "grad_norm": 2.314718083788282, "language_loss": 0.77939034, "learning_rate": 1.7150899470948911e-06, "loss": 0.80402023, "num_input_tokens_seen": 100258940, "step": 4650, "time_per_iteration": 2.8181099891662598 }, { "auxiliary_loss_clip": 0.01289988, "auxiliary_loss_mlp": 0.01193063, "balance_loss_clip": 1.004089, "balance_loss_mlp": 1.00003946, "epoch": 0.5592496843624121, "flos": 60521044789440.0, "grad_norm": 0.815107996714162, "language_loss": 0.5664221, "learning_rate": 1.7143189415944365e-06, "loss": 0.59125257, "num_input_tokens_seen": 100323400, "step": 4651, "time_per_iteration": 3.313976287841797 }, { "auxiliary_loss_clip": 0.0133023, "auxiliary_loss_mlp": 0.01194077, "balance_loss_clip": 1.00842416, "balance_loss_mlp": 1.00029063, "epoch": 0.5593699272530511, "flos": 20886267373920.0, "grad_norm": 1.6189179568954555, "language_loss": 0.76444131, "learning_rate": 1.7135479794315714e-06, "loss": 0.78968441, "num_input_tokens_seen": 100340355, "step": 4652, "time_per_iteration": 2.717693328857422 }, { "auxiliary_loss_clip": 0.01287512, "auxiliary_loss_mlp": 0.01194071, "balance_loss_clip": 1.00691223, "balance_loss_mlp": 1.00028491, "epoch": 0.5594901701436903, "flos": 12896752073760.0, "grad_norm": 1.9588870467308865, "language_loss": 0.78789496, "learning_rate": 1.7127770607232502e-06, "loss": 0.81271076, "num_input_tokens_seen": 100358900, "step": 4653, "time_per_iteration": 2.7973999977111816 }, { "auxiliary_loss_clip": 0.01306065, "auxiliary_loss_mlp": 0.01194003, "balance_loss_clip": 1.00744176, "balance_loss_mlp": 1.00031209, "epoch": 0.5596104130343293, "flos": 23112119075040.0, "grad_norm": 1.897104218749906, "language_loss": 0.79684621, "learning_rate": 1.7120061855864204e-06, "loss": 0.82184696, "num_input_tokens_seen": 100378910, "step": 4654, "time_per_iteration": 2.822190999984741 }, { "auxiliary_loss_clip": 0.01330706, "auxiliary_loss_mlp": 0.01194059, "balance_loss_clip": 1.00846124, "balance_loss_mlp": 1.00027227, "epoch": 0.5597306559249684, "flos": 25957824861600.0, "grad_norm": 1.814984493602972, "language_loss": 0.71104085, "learning_rate": 1.7112353541380233e-06, "loss": 0.73628855, "num_input_tokens_seen": 100398770, "step": 4655, "time_per_iteration": 2.77685546875 }, { "auxiliary_loss_clip": 0.01319906, "auxiliary_loss_mlp": 0.01194138, "balance_loss_clip": 1.00857294, "balance_loss_mlp": 1.00035083, "epoch": 0.5598508988156076, "flos": 22492300913280.0, "grad_norm": 1.3681791973539286, "language_loss": 0.72395152, "learning_rate": 1.7104645664949931e-06, "loss": 0.74909198, "num_input_tokens_seen": 100421240, "step": 4656, "time_per_iteration": 2.9268062114715576 }, { "auxiliary_loss_clip": 0.01320273, "auxiliary_loss_mlp": 0.01194147, "balance_loss_clip": 1.00791931, "balance_loss_mlp": 1.00036001, "epoch": 0.5599711417062466, "flos": 23112550159200.0, "grad_norm": 1.6649754341489278, "language_loss": 0.717857, "learning_rate": 1.7096938227742584e-06, "loss": 0.74300122, "num_input_tokens_seen": 100442370, "step": 4657, "time_per_iteration": 3.7086944580078125 }, { "auxiliary_loss_clip": 0.01354151, "auxiliary_loss_mlp": 0.01194287, "balance_loss_clip": 1.00880766, "balance_loss_mlp": 1.00040507, "epoch": 0.5600913845968857, "flos": 22339354796640.0, "grad_norm": 2.8024333124473477, "language_loss": 0.84191322, "learning_rate": 1.70892312309274e-06, "loss": 0.86739755, "num_input_tokens_seen": 100460260, "step": 4658, "time_per_iteration": 3.6752870082855225 }, { "auxiliary_loss_clip": 0.01330857, "auxiliary_loss_mlp": 0.01194049, "balance_loss_clip": 1.00856817, "balance_loss_mlp": 1.00026214, "epoch": 0.5602116274875248, "flos": 17633799364320.0, "grad_norm": 2.4446232491823627, "language_loss": 0.67768347, "learning_rate": 1.7081524675673523e-06, "loss": 0.7029326, "num_input_tokens_seen": 100475750, "step": 4659, "time_per_iteration": 3.7410073280334473 }, { "auxiliary_loss_clip": 0.0129252, "auxiliary_loss_mlp": 0.01193066, "balance_loss_clip": 1.00385725, "balance_loss_mlp": 1.00004256, "epoch": 0.5603318703781639, "flos": 70115982926400.0, "grad_norm": 0.7709092593020863, "language_loss": 0.59592205, "learning_rate": 1.7073818563150026e-06, "loss": 0.6207779, "num_input_tokens_seen": 100537830, "step": 4660, "time_per_iteration": 3.4649741649627686 }, { "auxiliary_loss_clip": 0.01342407, "auxiliary_loss_mlp": 0.01194011, "balance_loss_clip": 1.00854909, "balance_loss_mlp": 1.00022423, "epoch": 0.560452113268803, "flos": 18545858760960.0, "grad_norm": 2.558406678955042, "language_loss": 0.86540353, "learning_rate": 1.7066112894525935e-06, "loss": 0.89076769, "num_input_tokens_seen": 100555910, "step": 4661, "time_per_iteration": 2.7429935932159424 }, { "auxiliary_loss_clip": 0.01319855, "auxiliary_loss_mlp": 0.011941, "balance_loss_clip": 1.00816512, "balance_loss_mlp": 1.00031328, "epoch": 0.5605723561594421, "flos": 25264676963520.0, "grad_norm": 1.579133831108032, "language_loss": 0.72691143, "learning_rate": 1.7058407670970177e-06, "loss": 0.752051, "num_input_tokens_seen": 100577385, "step": 4662, "time_per_iteration": 3.7482383251190186 }, { "auxiliary_loss_clip": 0.01328604, "auxiliary_loss_mlp": 0.01194108, "balance_loss_clip": 1.00920701, "balance_loss_mlp": 1.00032163, "epoch": 0.5606925990500812, "flos": 20594960154720.0, "grad_norm": 1.5391280053997285, "language_loss": 0.6098057, "learning_rate": 1.7050702893651643e-06, "loss": 0.63503289, "num_input_tokens_seen": 100596965, "step": 4663, "time_per_iteration": 2.706037998199463 }, { "auxiliary_loss_clip": 0.01329557, "auxiliary_loss_mlp": 0.01194081, "balance_loss_clip": 1.00775862, "balance_loss_mlp": 1.00029409, "epoch": 0.5608128419407202, "flos": 35006059031040.0, "grad_norm": 2.054289149368047, "language_loss": 0.75329196, "learning_rate": 1.7042998563739134e-06, "loss": 0.77852833, "num_input_tokens_seen": 100615315, "step": 4664, "time_per_iteration": 2.9016072750091553 }, { "auxiliary_loss_clip": 0.01325014, "auxiliary_loss_mlp": 0.01194038, "balance_loss_clip": 1.00889945, "balance_loss_mlp": 1.00025165, "epoch": 0.5609330848313594, "flos": 24639829486560.0, "grad_norm": 2.4091972572173126, "language_loss": 0.71832168, "learning_rate": 1.703529468240139e-06, "loss": 0.74351221, "num_input_tokens_seen": 100634185, "step": 4665, "time_per_iteration": 2.8660449981689453 }, { "auxiliary_loss_clip": 0.01305017, "auxiliary_loss_mlp": 0.01194004, "balance_loss_clip": 1.00792456, "balance_loss_mlp": 1.00031292, "epoch": 0.5610533277219985, "flos": 18762902228160.0, "grad_norm": 2.4227269284872293, "language_loss": 0.73297262, "learning_rate": 1.7027591250807088e-06, "loss": 0.75796276, "num_input_tokens_seen": 100651360, "step": 4666, "time_per_iteration": 2.710561513900757 }, { "auxiliary_loss_clip": 0.01355357, "auxiliary_loss_mlp": 0.01194054, "balance_loss_clip": 1.00925934, "balance_loss_mlp": 1.00026751, "epoch": 0.5611735706126375, "flos": 15012393628320.0, "grad_norm": 2.1005629253417566, "language_loss": 0.8448109, "learning_rate": 1.7019888270124825e-06, "loss": 0.87030494, "num_input_tokens_seen": 100668525, "step": 4667, "time_per_iteration": 2.661161422729492 }, { "auxiliary_loss_clip": 0.01342544, "auxiliary_loss_mlp": 0.01194094, "balance_loss_clip": 1.00924134, "balance_loss_mlp": 1.00030756, "epoch": 0.5612938135032767, "flos": 16468175327040.0, "grad_norm": 5.3658248615319515, "language_loss": 0.82033151, "learning_rate": 1.7012185741523147e-06, "loss": 0.84569788, "num_input_tokens_seen": 100684850, "step": 4668, "time_per_iteration": 2.690484046936035 }, { "auxiliary_loss_clip": 0.01354634, "auxiliary_loss_mlp": 0.01194154, "balance_loss_clip": 1.00914931, "balance_loss_mlp": 1.0003674, "epoch": 0.5614140563939157, "flos": 25666445795040.0, "grad_norm": 1.9088686881186365, "language_loss": 0.62435818, "learning_rate": 1.7004483666170514e-06, "loss": 0.64984608, "num_input_tokens_seen": 100705345, "step": 4669, "time_per_iteration": 2.7284634113311768 }, { "auxiliary_loss_clip": 0.01331599, "auxiliary_loss_mlp": 0.01194038, "balance_loss_clip": 1.00793898, "balance_loss_mlp": 1.00025117, "epoch": 0.5615342992845548, "flos": 24717577835520.0, "grad_norm": 1.870519371478979, "language_loss": 0.80677545, "learning_rate": 1.699678204523533e-06, "loss": 0.83203185, "num_input_tokens_seen": 100725210, "step": 4670, "time_per_iteration": 2.775916814804077 }, { "auxiliary_loss_clip": 0.01313708, "auxiliary_loss_mlp": 0.01194144, "balance_loss_clip": 1.00820196, "balance_loss_mlp": 1.00035775, "epoch": 0.5616545421751938, "flos": 22015945016640.0, "grad_norm": 3.035943470420151, "language_loss": 0.68473536, "learning_rate": 1.6989080879885918e-06, "loss": 0.70981389, "num_input_tokens_seen": 100743070, "step": 4671, "time_per_iteration": 2.7602059841156006 }, { "auxiliary_loss_clip": 0.01272977, "auxiliary_loss_mlp": 0.01193146, "balance_loss_clip": 1.00290632, "balance_loss_mlp": 1.00012207, "epoch": 0.561774785065833, "flos": 53760388708800.0, "grad_norm": 0.9027594598214865, "language_loss": 0.61054718, "learning_rate": 1.6981380171290544e-06, "loss": 0.63520837, "num_input_tokens_seen": 100804095, "step": 4672, "time_per_iteration": 3.294301748275757 }, { "auxiliary_loss_clip": 0.01316426, "auxiliary_loss_mlp": 0.01194095, "balance_loss_clip": 1.00893342, "balance_loss_mlp": 1.00030863, "epoch": 0.5618950279564721, "flos": 19750015697760.0, "grad_norm": 1.8246900775042352, "language_loss": 0.74300194, "learning_rate": 1.6973679920617396e-06, "loss": 0.76810712, "num_input_tokens_seen": 100821630, "step": 4673, "time_per_iteration": 2.751845121383667 }, { "auxiliary_loss_clip": 0.01309048, "auxiliary_loss_mlp": 0.01194054, "balance_loss_clip": 1.00859523, "balance_loss_mlp": 1.00026774, "epoch": 0.5620152708471111, "flos": 16800601950720.0, "grad_norm": 2.4361627750415837, "language_loss": 0.85356098, "learning_rate": 1.6965980129034603e-06, "loss": 0.87859201, "num_input_tokens_seen": 100839015, "step": 4674, "time_per_iteration": 2.7549049854278564 }, { "auxiliary_loss_clip": 0.0131102, "auxiliary_loss_mlp": 0.01194077, "balance_loss_clip": 1.00751781, "balance_loss_mlp": 1.00029016, "epoch": 0.5621355137377503, "flos": 26797811850720.0, "grad_norm": 1.5865434696520284, "language_loss": 0.76548219, "learning_rate": 1.6958280797710209e-06, "loss": 0.79053319, "num_input_tokens_seen": 100860940, "step": 4675, "time_per_iteration": 2.8104913234710693 }, { "auxiliary_loss_clip": 0.0128744, "auxiliary_loss_mlp": 0.01193169, "balance_loss_clip": 1.0032475, "balance_loss_mlp": 1.00014532, "epoch": 0.5622557566283893, "flos": 61207079798880.0, "grad_norm": 0.8159753337515239, "language_loss": 0.54768586, "learning_rate": 1.6950581927812198e-06, "loss": 0.572492, "num_input_tokens_seen": 100920510, "step": 4676, "time_per_iteration": 3.2584829330444336 }, { "auxiliary_loss_clip": 0.0133472, "auxiliary_loss_mlp": 0.01194114, "balance_loss_clip": 1.00818086, "balance_loss_mlp": 1.00032699, "epoch": 0.5623759995190284, "flos": 26468546510880.0, "grad_norm": 2.6819728451208777, "language_loss": 0.7905103, "learning_rate": 1.6942883520508486e-06, "loss": 0.8157987, "num_input_tokens_seen": 100939245, "step": 4677, "time_per_iteration": 2.7554478645324707 }, { "auxiliary_loss_clip": 0.01326515, "auxiliary_loss_mlp": 0.0119403, "balance_loss_clip": 1.00890136, "balance_loss_mlp": 1.00024307, "epoch": 0.5624962424096676, "flos": 19390910988960.0, "grad_norm": 1.9981225265574223, "language_loss": 0.76979834, "learning_rate": 1.693518557696691e-06, "loss": 0.79500377, "num_input_tokens_seen": 100958385, "step": 4678, "time_per_iteration": 2.7411506175994873 }, { "auxiliary_loss_clip": 0.01341399, "auxiliary_loss_mlp": 0.01194101, "balance_loss_clip": 1.00836635, "balance_loss_mlp": 1.00031424, "epoch": 0.5626164853003066, "flos": 20667355875360.0, "grad_norm": 1.924629566947462, "language_loss": 0.8916893, "learning_rate": 1.6927488098355252e-06, "loss": 0.91704422, "num_input_tokens_seen": 100976015, "step": 4679, "time_per_iteration": 2.691751718521118 }, { "auxiliary_loss_clip": 0.01259358, "auxiliary_loss_mlp": 0.01193099, "balance_loss_clip": 1.0038476, "balance_loss_mlp": 1.00007534, "epoch": 0.5627367281909457, "flos": 62766102450240.0, "grad_norm": 1.0231430813812459, "language_loss": 0.63226557, "learning_rate": 1.6919791085841201e-06, "loss": 0.65679014, "num_input_tokens_seen": 101033425, "step": 4680, "time_per_iteration": 3.3007664680480957 }, { "auxiliary_loss_clip": 0.01342474, "auxiliary_loss_mlp": 0.01194087, "balance_loss_clip": 1.00857162, "balance_loss_mlp": 1.00030065, "epoch": 0.5628569710815848, "flos": 12787152629760.0, "grad_norm": 2.992510800071718, "language_loss": 0.78700399, "learning_rate": 1.6912094540592396e-06, "loss": 0.81236959, "num_input_tokens_seen": 101048945, "step": 4681, "time_per_iteration": 2.7105278968811035 }, { "auxiliary_loss_clip": 0.01330993, "auxiliary_loss_mlp": 0.01194044, "balance_loss_clip": 1.00780773, "balance_loss_mlp": 1.00025761, "epoch": 0.5629772139722239, "flos": 13762087971840.0, "grad_norm": 2.6015622387537864, "language_loss": 0.8160131, "learning_rate": 1.6904398463776393e-06, "loss": 0.84126353, "num_input_tokens_seen": 101062745, "step": 4682, "time_per_iteration": 2.7267630100250244 }, { "auxiliary_loss_clip": 0.01341687, "auxiliary_loss_mlp": 0.01194007, "balance_loss_clip": 1.00826597, "balance_loss_mlp": 1.00022006, "epoch": 0.5630974568628629, "flos": 21467840025600.0, "grad_norm": 1.6325267627794902, "language_loss": 0.72767758, "learning_rate": 1.6896702856560683e-06, "loss": 0.75303453, "num_input_tokens_seen": 101081840, "step": 4683, "time_per_iteration": 3.6559786796569824 }, { "auxiliary_loss_clip": 0.01312052, "auxiliary_loss_mlp": 0.01193955, "balance_loss_clip": 1.00822234, "balance_loss_mlp": 1.00026345, "epoch": 0.5632176997535021, "flos": 14245915993920.0, "grad_norm": 3.035637112743828, "language_loss": 0.69843417, "learning_rate": 1.6889007720112677e-06, "loss": 0.72349429, "num_input_tokens_seen": 101099585, "step": 4684, "time_per_iteration": 3.7386090755462646 }, { "auxiliary_loss_clip": 0.01336836, "auxiliary_loss_mlp": 0.01194115, "balance_loss_clip": 1.00831759, "balance_loss_mlp": 1.00023305, "epoch": 0.5633379426441412, "flos": 20812255087680.0, "grad_norm": 1.5978974157451713, "language_loss": 0.76857245, "learning_rate": 1.6881313055599734e-06, "loss": 0.79388201, "num_input_tokens_seen": 101119515, "step": 4685, "time_per_iteration": 3.7378673553466797 }, { "auxiliary_loss_clip": 0.01330659, "auxiliary_loss_mlp": 0.01194116, "balance_loss_clip": 1.00871325, "balance_loss_mlp": 1.00032914, "epoch": 0.5634581855347802, "flos": 22600894494240.0, "grad_norm": 2.120062291482735, "language_loss": 0.82586026, "learning_rate": 1.6873618864189117e-06, "loss": 0.85110795, "num_input_tokens_seen": 101135285, "step": 4686, "time_per_iteration": 2.8455569744110107 }, { "auxiliary_loss_clip": 0.01331803, "auxiliary_loss_mlp": 0.01194114, "balance_loss_clip": 1.00787973, "balance_loss_mlp": 1.00032711, "epoch": 0.5635784284254194, "flos": 21506983627680.0, "grad_norm": 2.055495769067778, "language_loss": 0.77790475, "learning_rate": 1.686592514704803e-06, "loss": 0.80316389, "num_input_tokens_seen": 101152680, "step": 4687, "time_per_iteration": 2.7384166717529297 }, { "auxiliary_loss_clip": 0.01309021, "auxiliary_loss_mlp": 0.01194161, "balance_loss_clip": 1.00705624, "balance_loss_mlp": 1.00037479, "epoch": 0.5636986713160584, "flos": 19827476657280.0, "grad_norm": 2.053425838627494, "language_loss": 0.70790815, "learning_rate": 1.685823190534361e-06, "loss": 0.73293996, "num_input_tokens_seen": 101170920, "step": 4688, "time_per_iteration": 3.839524984359741 }, { "auxiliary_loss_clip": 0.01355435, "auxiliary_loss_mlp": 0.01193973, "balance_loss_clip": 1.00877762, "balance_loss_mlp": 1.00028157, "epoch": 0.5638189142066975, "flos": 19792464278400.0, "grad_norm": 1.6947257927809611, "language_loss": 0.83314693, "learning_rate": 1.6850539140242907e-06, "loss": 0.85864097, "num_input_tokens_seen": 101190180, "step": 4689, "time_per_iteration": 2.7260894775390625 }, { "auxiliary_loss_clip": 0.01327498, "auxiliary_loss_mlp": 0.01194089, "balance_loss_clip": 1.00870156, "balance_loss_mlp": 1.00030279, "epoch": 0.5639391570973367, "flos": 22893782355360.0, "grad_norm": 1.632608502351765, "language_loss": 0.82310593, "learning_rate": 1.684284685291292e-06, "loss": 0.8483218, "num_input_tokens_seen": 101211825, "step": 4690, "time_per_iteration": 2.782132148742676 }, { "auxiliary_loss_clip": 0.01354297, "auxiliary_loss_mlp": 0.01194031, "balance_loss_clip": 1.0087595, "balance_loss_mlp": 1.00024414, "epoch": 0.5640593999879757, "flos": 23727087540000.0, "grad_norm": 2.2490021331714747, "language_loss": 0.81180346, "learning_rate": 1.683515504452055e-06, "loss": 0.83728677, "num_input_tokens_seen": 101229200, "step": 4691, "time_per_iteration": 2.7914822101593018 }, { "auxiliary_loss_clip": 0.01295437, "auxiliary_loss_mlp": 0.01194209, "balance_loss_clip": 1.00751352, "balance_loss_mlp": 1.00042224, "epoch": 0.5641796428786148, "flos": 22710134701440.0, "grad_norm": 1.7531017456197817, "language_loss": 0.66528392, "learning_rate": 1.6827463716232648e-06, "loss": 0.69018036, "num_input_tokens_seen": 101249860, "step": 4692, "time_per_iteration": 2.800583600997925 }, { "auxiliary_loss_clip": 0.01342787, "auxiliary_loss_mlp": 0.00872477, "balance_loss_clip": 1.0091393, "balance_loss_mlp": 1.00051892, "epoch": 0.5642998857692539, "flos": 19791997270560.0, "grad_norm": 1.5838428661653106, "language_loss": 0.75674385, "learning_rate": 1.6819772869215972e-06, "loss": 0.77889651, "num_input_tokens_seen": 101268940, "step": 4693, "time_per_iteration": 2.809255599975586 }, { "auxiliary_loss_clip": 0.01322972, "auxiliary_loss_mlp": 0.01194054, "balance_loss_clip": 1.00807214, "balance_loss_mlp": 1.00026739, "epoch": 0.564420128659893, "flos": 23185915819200.0, "grad_norm": 1.7120230545441122, "language_loss": 0.82207024, "learning_rate": 1.6812082504637228e-06, "loss": 0.84724057, "num_input_tokens_seen": 101290260, "step": 4694, "time_per_iteration": 2.7076590061187744 }, { "auxiliary_loss_clip": 0.01330905, "auxiliary_loss_mlp": 0.0119404, "balance_loss_clip": 1.00769901, "balance_loss_mlp": 1.00025296, "epoch": 0.564540371550532, "flos": 23258275616160.0, "grad_norm": 1.4090924347173226, "language_loss": 0.74219912, "learning_rate": 1.6804392623663025e-06, "loss": 0.76744854, "num_input_tokens_seen": 101311465, "step": 4695, "time_per_iteration": 2.8596808910369873 }, { "auxiliary_loss_clip": 0.01328698, "auxiliary_loss_mlp": 0.01194081, "balance_loss_clip": 1.00793183, "balance_loss_mlp": 1.00029397, "epoch": 0.5646606144411712, "flos": 25010070536160.0, "grad_norm": 1.6410774577586766, "language_loss": 0.78279173, "learning_rate": 1.6796703227459935e-06, "loss": 0.80801952, "num_input_tokens_seen": 101329420, "step": 4696, "time_per_iteration": 2.7798593044281006 }, { "auxiliary_loss_clip": 0.01281523, "auxiliary_loss_mlp": 0.01194032, "balance_loss_clip": 1.00701773, "balance_loss_mlp": 1.00024509, "epoch": 0.5647808573318103, "flos": 36539660926080.0, "grad_norm": 1.770289785997901, "language_loss": 0.75706911, "learning_rate": 1.6789014317194407e-06, "loss": 0.78182465, "num_input_tokens_seen": 101350900, "step": 4697, "time_per_iteration": 2.9713191986083984 }, { "auxiliary_loss_clip": 0.01305834, "auxiliary_loss_mlp": 0.0119413, "balance_loss_clip": 1.00864017, "balance_loss_mlp": 1.0003432, "epoch": 0.5649011002224493, "flos": 22528462849920.0, "grad_norm": 2.301689774101918, "language_loss": 0.72799969, "learning_rate": 1.6781325894032853e-06, "loss": 0.75299931, "num_input_tokens_seen": 101369860, "step": 4698, "time_per_iteration": 2.7679860591888428 }, { "auxiliary_loss_clip": 0.01306258, "auxiliary_loss_mlp": 0.01194104, "balance_loss_clip": 1.00841761, "balance_loss_mlp": 1.0003171, "epoch": 0.5650213431130885, "flos": 18515157223680.0, "grad_norm": 2.5002377426534546, "language_loss": 0.91572618, "learning_rate": 1.6773637959141608e-06, "loss": 0.9407298, "num_input_tokens_seen": 101386835, "step": 4699, "time_per_iteration": 2.7169899940490723 }, { "auxiliary_loss_clip": 0.01316466, "auxiliary_loss_mlp": 0.011939, "balance_loss_clip": 1.00783324, "balance_loss_mlp": 1.00020838, "epoch": 0.5651415860037275, "flos": 17526319417440.0, "grad_norm": 2.1390508255225735, "language_loss": 0.65996611, "learning_rate": 1.6765950513686915e-06, "loss": 0.6850698, "num_input_tokens_seen": 101404945, "step": 4700, "time_per_iteration": 2.7438011169433594 }, { "auxiliary_loss_clip": 0.01279305, "auxiliary_loss_mlp": 0.01193967, "balance_loss_clip": 1.00817895, "balance_loss_mlp": 1.00027609, "epoch": 0.5652618288943666, "flos": 25520037788160.0, "grad_norm": 1.5602274411610302, "language_loss": 0.7614609, "learning_rate": 1.675826355883496e-06, "loss": 0.78619361, "num_input_tokens_seen": 101424160, "step": 4701, "time_per_iteration": 2.816842555999756 }, { "auxiliary_loss_clip": 0.01304397, "auxiliary_loss_mlp": 0.01194062, "balance_loss_clip": 1.00797689, "balance_loss_mlp": 1.00027573, "epoch": 0.5653820717850057, "flos": 19683116300160.0, "grad_norm": 1.839901816861005, "language_loss": 0.79133236, "learning_rate": 1.6750577095751848e-06, "loss": 0.81631696, "num_input_tokens_seen": 101443270, "step": 4702, "time_per_iteration": 2.7743875980377197 }, { "auxiliary_loss_clip": 0.01354199, "auxiliary_loss_mlp": 0.01194027, "balance_loss_clip": 1.00881124, "balance_loss_mlp": 1.00024056, "epoch": 0.5655023146756448, "flos": 26979735168000.0, "grad_norm": 1.7697472613854093, "language_loss": 0.73148596, "learning_rate": 1.6742891125603605e-06, "loss": 0.7569682, "num_input_tokens_seen": 101464175, "step": 4703, "time_per_iteration": 2.737213611602783 }, { "auxiliary_loss_clip": 0.01329665, "auxiliary_loss_mlp": 0.01193985, "balance_loss_clip": 1.00833952, "balance_loss_mlp": 1.00029373, "epoch": 0.5656225575662839, "flos": 27669362545440.0, "grad_norm": 2.0646366218599805, "language_loss": 0.72098386, "learning_rate": 1.6735205649556185e-06, "loss": 0.74622035, "num_input_tokens_seen": 101484045, "step": 4704, "time_per_iteration": 2.766650915145874 }, { "auxiliary_loss_clip": 0.0129058, "auxiliary_loss_mlp": 0.01194141, "balance_loss_clip": 1.00813138, "balance_loss_mlp": 1.00035489, "epoch": 0.5657428004569229, "flos": 24349743672480.0, "grad_norm": 1.463242758066805, "language_loss": 0.84628606, "learning_rate": 1.6727520668775476e-06, "loss": 0.87113327, "num_input_tokens_seen": 101504330, "step": 4705, "time_per_iteration": 2.868288516998291 }, { "auxiliary_loss_clip": 0.01355494, "auxiliary_loss_mlp": 0.01194071, "balance_loss_clip": 1.00880837, "balance_loss_mlp": 1.00028419, "epoch": 0.5658630433475621, "flos": 21944052227520.0, "grad_norm": 1.91748683848654, "language_loss": 0.74857694, "learning_rate": 1.6719836184427275e-06, "loss": 0.77407253, "num_input_tokens_seen": 101524635, "step": 4706, "time_per_iteration": 2.665252685546875 }, { "auxiliary_loss_clip": 0.01318957, "auxiliary_loss_mlp": 0.01194029, "balance_loss_clip": 1.00771236, "balance_loss_mlp": 1.00024247, "epoch": 0.5659832862382012, "flos": 30409025332320.0, "grad_norm": 1.7828240706060363, "language_loss": 0.64425814, "learning_rate": 1.671215219767733e-06, "loss": 0.669388, "num_input_tokens_seen": 101544095, "step": 4707, "time_per_iteration": 2.802304983139038 }, { "auxiliary_loss_clip": 0.01297016, "auxiliary_loss_mlp": 0.01194097, "balance_loss_clip": 1.00795412, "balance_loss_mlp": 1.00031042, "epoch": 0.5661035291288402, "flos": 13188202987680.0, "grad_norm": 2.0997492122167167, "language_loss": 0.76425827, "learning_rate": 1.670446870969127e-06, "loss": 0.78916943, "num_input_tokens_seen": 101561760, "step": 4708, "time_per_iteration": 2.8028464317321777 }, { "auxiliary_loss_clip": 0.01303652, "auxiliary_loss_mlp": 0.01194157, "balance_loss_clip": 1.00795448, "balance_loss_mlp": 1.00036979, "epoch": 0.5662237720194794, "flos": 16143041210400.0, "grad_norm": 2.2671486975165145, "language_loss": 0.80000561, "learning_rate": 1.6696785721634685e-06, "loss": 0.82498372, "num_input_tokens_seen": 101576245, "step": 4709, "time_per_iteration": 2.7997725009918213 }, { "auxiliary_loss_clip": 0.01343316, "auxiliary_loss_mlp": 0.01194128, "balance_loss_clip": 1.00946331, "balance_loss_mlp": 1.00034106, "epoch": 0.5663440149101184, "flos": 17676858647520.0, "grad_norm": 1.7306198079929578, "language_loss": 0.73480368, "learning_rate": 1.6689103234673086e-06, "loss": 0.76017815, "num_input_tokens_seen": 101594565, "step": 4710, "time_per_iteration": 4.584435939788818 }, { "auxiliary_loss_clip": 0.01306576, "auxiliary_loss_mlp": 0.0119409, "balance_loss_clip": 1.00778937, "balance_loss_mlp": 1.00030351, "epoch": 0.5664642578007575, "flos": 23368342068000.0, "grad_norm": 1.825463515036085, "language_loss": 0.7692402, "learning_rate": 1.668142124997189e-06, "loss": 0.79424691, "num_input_tokens_seen": 101614225, "step": 4711, "time_per_iteration": 2.743863105773926 }, { "auxiliary_loss_clip": 0.0127454, "auxiliary_loss_mlp": 0.01193085, "balance_loss_clip": 1.00354314, "balance_loss_mlp": 1.00006092, "epoch": 0.5665845006913967, "flos": 65516506417440.0, "grad_norm": 0.7262954833758622, "language_loss": 0.59811753, "learning_rate": 1.6673739768696453e-06, "loss": 0.62279379, "num_input_tokens_seen": 101680795, "step": 4712, "time_per_iteration": 4.300537347793579 }, { "auxiliary_loss_clip": 0.01328747, "auxiliary_loss_mlp": 0.01194092, "balance_loss_clip": 1.00837147, "balance_loss_mlp": 1.00030565, "epoch": 0.5667047435820357, "flos": 26140897736640.0, "grad_norm": 1.6832092726099166, "language_loss": 0.7762922, "learning_rate": 1.6666058792012052e-06, "loss": 0.80152065, "num_input_tokens_seen": 101701680, "step": 4713, "time_per_iteration": 2.854226589202881 }, { "auxiliary_loss_clip": 0.01311325, "auxiliary_loss_mlp": 0.01193136, "balance_loss_clip": 1.00387001, "balance_loss_mlp": 1.00011277, "epoch": 0.5668249864726748, "flos": 71866987525440.0, "grad_norm": 1.008589573610431, "language_loss": 0.68844205, "learning_rate": 1.6658378321083878e-06, "loss": 0.71348667, "num_input_tokens_seen": 101766010, "step": 4714, "time_per_iteration": 4.329184532165527 }, { "auxiliary_loss_clip": 0.01249404, "auxiliary_loss_mlp": 0.01193934, "balance_loss_clip": 1.00752902, "balance_loss_mlp": 1.00024247, "epoch": 0.5669452293633139, "flos": 22195676989440.0, "grad_norm": 2.126718573142454, "language_loss": 0.82536876, "learning_rate": 1.6650698357077055e-06, "loss": 0.84980214, "num_input_tokens_seen": 101783055, "step": 4715, "time_per_iteration": 2.8739516735076904 }, { "auxiliary_loss_clip": 0.01322232, "auxiliary_loss_mlp": 0.01194044, "balance_loss_clip": 1.00799489, "balance_loss_mlp": 1.00025725, "epoch": 0.567065472253953, "flos": 18223203378240.0, "grad_norm": 2.582953870326406, "language_loss": 0.80929327, "learning_rate": 1.6643018901156632e-06, "loss": 0.83445609, "num_input_tokens_seen": 101802150, "step": 4716, "time_per_iteration": 2.7732057571411133 }, { "auxiliary_loss_clip": 0.01323902, "auxiliary_loss_mlp": 0.01194008, "balance_loss_clip": 1.00830841, "balance_loss_mlp": 1.00022173, "epoch": 0.567185715144592, "flos": 20371558196160.0, "grad_norm": 2.6302392890821826, "language_loss": 0.79193568, "learning_rate": 1.6635339954487566e-06, "loss": 0.81711471, "num_input_tokens_seen": 101818025, "step": 4717, "time_per_iteration": 2.7404894828796387 }, { "auxiliary_loss_clip": 0.01322243, "auxiliary_loss_mlp": 0.01194171, "balance_loss_clip": 1.0085789, "balance_loss_mlp": 1.00038445, "epoch": 0.5673059580352312, "flos": 23221359282240.0, "grad_norm": 1.7386020268205433, "language_loss": 0.8198697, "learning_rate": 1.6627661518234765e-06, "loss": 0.84503388, "num_input_tokens_seen": 101837280, "step": 4718, "time_per_iteration": 2.935739517211914 }, { "auxiliary_loss_clip": 0.01277541, "auxiliary_loss_mlp": 0.0119419, "balance_loss_clip": 1.00815189, "balance_loss_mlp": 1.00040364, "epoch": 0.5674262009258703, "flos": 21719608482240.0, "grad_norm": 2.1819348132978673, "language_loss": 0.85362285, "learning_rate": 1.661998359356302e-06, "loss": 0.87834018, "num_input_tokens_seen": 101856310, "step": 4719, "time_per_iteration": 2.7934114933013916 }, { "auxiliary_loss_clip": 0.01323287, "auxiliary_loss_mlp": 0.01193159, "balance_loss_clip": 1.00384116, "balance_loss_mlp": 1.00013506, "epoch": 0.5675464438165093, "flos": 67470400553760.0, "grad_norm": 0.7519282276779738, "language_loss": 0.55821884, "learning_rate": 1.6612306181637077e-06, "loss": 0.58338332, "num_input_tokens_seen": 101915635, "step": 4720, "time_per_iteration": 3.2222278118133545 }, { "auxiliary_loss_clip": 0.01300751, "auxiliary_loss_mlp": 0.01194121, "balance_loss_clip": 1.00742078, "balance_loss_mlp": 1.00033474, "epoch": 0.5676666867071485, "flos": 18879183476640.0, "grad_norm": 2.406845065523109, "language_loss": 0.65747476, "learning_rate": 1.6604629283621598e-06, "loss": 0.68242347, "num_input_tokens_seen": 101933565, "step": 4721, "time_per_iteration": 2.862276077270508 }, { "auxiliary_loss_clip": 0.01355669, "auxiliary_loss_mlp": 0.01194146, "balance_loss_clip": 1.00923634, "balance_loss_mlp": 1.0003593, "epoch": 0.5677869295977875, "flos": 33546792735360.0, "grad_norm": 1.61731267363373, "language_loss": 0.73947603, "learning_rate": 1.6596952900681152e-06, "loss": 0.76497424, "num_input_tokens_seen": 101954325, "step": 4722, "time_per_iteration": 2.787179470062256 }, { "auxiliary_loss_clip": 0.01281337, "auxiliary_loss_mlp": 0.01194086, "balance_loss_clip": 1.00965071, "balance_loss_mlp": 1.00029933, "epoch": 0.5679071724884266, "flos": 28037268555840.0, "grad_norm": 2.3072031104430657, "language_loss": 0.81861377, "learning_rate": 1.658927703398025e-06, "loss": 0.84336799, "num_input_tokens_seen": 101974390, "step": 4723, "time_per_iteration": 2.945866107940674 }, { "auxiliary_loss_clip": 0.01295077, "auxiliary_loss_mlp": 0.01194052, "balance_loss_clip": 1.00776505, "balance_loss_mlp": 1.0002656, "epoch": 0.5680274153790658, "flos": 23550121690560.0, "grad_norm": 2.148073967129001, "language_loss": 0.78129447, "learning_rate": 1.6581601684683309e-06, "loss": 0.80618572, "num_input_tokens_seen": 101994815, "step": 4724, "time_per_iteration": 2.822469711303711 }, { "auxiliary_loss_clip": 0.01333021, "auxiliary_loss_mlp": 0.01194274, "balance_loss_clip": 1.00888145, "balance_loss_mlp": 1.00039196, "epoch": 0.5681476582697048, "flos": 22455169037280.0, "grad_norm": 2.341015420127663, "language_loss": 0.68241787, "learning_rate": 1.6573926853954674e-06, "loss": 0.70769083, "num_input_tokens_seen": 102012400, "step": 4725, "time_per_iteration": 2.72602915763855 }, { "auxiliary_loss_clip": 0.01329367, "auxiliary_loss_mlp": 0.01194093, "balance_loss_clip": 1.00867343, "balance_loss_mlp": 1.00030637, "epoch": 0.5682679011603439, "flos": 19536923835360.0, "grad_norm": 2.386105732850347, "language_loss": 0.83000606, "learning_rate": 1.6566252542958608e-06, "loss": 0.8552407, "num_input_tokens_seen": 102031900, "step": 4726, "time_per_iteration": 2.743060827255249 }, { "auxiliary_loss_clip": 0.01296233, "auxiliary_loss_mlp": 0.01194126, "balance_loss_clip": 1.00764656, "balance_loss_mlp": 1.0002439, "epoch": 0.568388144050983, "flos": 28765500680160.0, "grad_norm": 1.638511633063298, "language_loss": 0.78381562, "learning_rate": 1.6558578752859305e-06, "loss": 0.80871916, "num_input_tokens_seen": 102050860, "step": 4727, "time_per_iteration": 2.83323073387146 }, { "auxiliary_loss_clip": 0.01309356, "auxiliary_loss_mlp": 0.01194035, "balance_loss_clip": 1.00767875, "balance_loss_mlp": 1.00024831, "epoch": 0.5685083869416221, "flos": 21209461611840.0, "grad_norm": 1.8827102586072613, "language_loss": 0.78853381, "learning_rate": 1.6550905484820865e-06, "loss": 0.8135677, "num_input_tokens_seen": 102069320, "step": 4728, "time_per_iteration": 2.797577142715454 }, { "auxiliary_loss_clip": 0.01354277, "auxiliary_loss_mlp": 0.01194058, "balance_loss_clip": 1.00847256, "balance_loss_mlp": 1.00027192, "epoch": 0.5686286298322611, "flos": 24827033584800.0, "grad_norm": 2.0321704873579796, "language_loss": 0.78533626, "learning_rate": 1.6543232740007328e-06, "loss": 0.81081963, "num_input_tokens_seen": 102086435, "step": 4729, "time_per_iteration": 2.7818496227264404 }, { "auxiliary_loss_clip": 0.01334894, "auxiliary_loss_mlp": 0.01194053, "balance_loss_clip": 1.0085907, "balance_loss_mlp": 1.00026655, "epoch": 0.5687488727229003, "flos": 26615134136160.0, "grad_norm": 3.0021247408562743, "language_loss": 0.66963315, "learning_rate": 1.653556051958263e-06, "loss": 0.69492257, "num_input_tokens_seen": 102106115, "step": 4730, "time_per_iteration": 2.876598834991455 }, { "auxiliary_loss_clip": 0.01260218, "auxiliary_loss_mlp": 0.01194028, "balance_loss_clip": 1.00733733, "balance_loss_mlp": 1.00024104, "epoch": 0.5688691156135394, "flos": 20808734567040.0, "grad_norm": 1.756290138666158, "language_loss": 0.73582906, "learning_rate": 1.6527888824710642e-06, "loss": 0.76037157, "num_input_tokens_seen": 102125715, "step": 4731, "time_per_iteration": 2.903557062149048 }, { "auxiliary_loss_clip": 0.01308896, "auxiliary_loss_mlp": 0.0119415, "balance_loss_clip": 1.0082345, "balance_loss_mlp": 1.00036383, "epoch": 0.5689893585041784, "flos": 25880974604640.0, "grad_norm": 1.9536431422009468, "language_loss": 0.76764607, "learning_rate": 1.6520217656555166e-06, "loss": 0.79267657, "num_input_tokens_seen": 102145005, "step": 4732, "time_per_iteration": 2.87123441696167 }, { "auxiliary_loss_clip": 0.01329956, "auxiliary_loss_mlp": 0.01193975, "balance_loss_clip": 1.00890708, "balance_loss_mlp": 1.00028384, "epoch": 0.5691096013948175, "flos": 23477474504160.0, "grad_norm": 1.5282358387590105, "language_loss": 0.70975018, "learning_rate": 1.65125470162799e-06, "loss": 0.73498952, "num_input_tokens_seen": 102165360, "step": 4733, "time_per_iteration": 2.893054962158203 }, { "auxiliary_loss_clip": 0.01300612, "auxiliary_loss_mlp": 0.01194091, "balance_loss_clip": 1.00763988, "balance_loss_mlp": 1.0003041, "epoch": 0.5692298442854566, "flos": 18075609889920.0, "grad_norm": 2.0579244679954978, "language_loss": 0.6958825, "learning_rate": 1.6504876905048485e-06, "loss": 0.72082955, "num_input_tokens_seen": 102182320, "step": 4734, "time_per_iteration": 2.7573294639587402 }, { "auxiliary_loss_clip": 0.01354157, "auxiliary_loss_mlp": 0.0119412, "balance_loss_clip": 1.00848699, "balance_loss_mlp": 1.00033331, "epoch": 0.5693500871760957, "flos": 23039328193920.0, "grad_norm": 1.6165934623334974, "language_loss": 0.71967942, "learning_rate": 1.6497207324024464e-06, "loss": 0.74516225, "num_input_tokens_seen": 102201220, "step": 4735, "time_per_iteration": 2.682211399078369 }, { "auxiliary_loss_clip": 0.01315703, "auxiliary_loss_mlp": 0.01194047, "balance_loss_clip": 1.00870383, "balance_loss_mlp": 1.00026071, "epoch": 0.5694703300667348, "flos": 18989681012640.0, "grad_norm": 1.8096951647303854, "language_loss": 0.82706106, "learning_rate": 1.6489538274371305e-06, "loss": 0.85215855, "num_input_tokens_seen": 102219825, "step": 4736, "time_per_iteration": 4.667155027389526 }, { "auxiliary_loss_clip": 0.01328093, "auxiliary_loss_mlp": 0.01194067, "balance_loss_clip": 1.00835872, "balance_loss_mlp": 1.00028014, "epoch": 0.5695905729573739, "flos": 21908716535520.0, "grad_norm": 1.7013679143545097, "language_loss": 0.82967776, "learning_rate": 1.6481869757252396e-06, "loss": 0.85489935, "num_input_tokens_seen": 102238160, "step": 4737, "time_per_iteration": 2.725717544555664 }, { "auxiliary_loss_clip": 0.01333987, "auxiliary_loss_mlp": 0.01194045, "balance_loss_clip": 1.00824237, "balance_loss_mlp": 1.00025797, "epoch": 0.569710815848013, "flos": 28476672194880.0, "grad_norm": 1.39266697426233, "language_loss": 0.71718192, "learning_rate": 1.647420177383105e-06, "loss": 0.74246228, "num_input_tokens_seen": 102261030, "step": 4738, "time_per_iteration": 3.7281482219696045 }, { "auxiliary_loss_clip": 0.0132905, "auxiliary_loss_mlp": 0.01194113, "balance_loss_clip": 1.00817776, "balance_loss_mlp": 1.00032663, "epoch": 0.569831058738652, "flos": 28366174658880.0, "grad_norm": 2.0764454663532073, "language_loss": 0.72609514, "learning_rate": 1.646653432527049e-06, "loss": 0.75132674, "num_input_tokens_seen": 102281670, "step": 4739, "time_per_iteration": 2.8957083225250244 }, { "auxiliary_loss_clip": 0.01298843, "auxiliary_loss_mlp": 0.01194225, "balance_loss_clip": 1.00722969, "balance_loss_mlp": 1.00034308, "epoch": 0.5699513016292912, "flos": 25849985677920.0, "grad_norm": 1.4447117096619404, "language_loss": 0.74352956, "learning_rate": 1.645886741273387e-06, "loss": 0.76846021, "num_input_tokens_seen": 102303485, "step": 4740, "time_per_iteration": 3.8819570541381836 }, { "auxiliary_loss_clip": 0.01282716, "auxiliary_loss_mlp": 0.01194133, "balance_loss_clip": 1.00830054, "balance_loss_mlp": 1.00034678, "epoch": 0.5700715445199303, "flos": 18037867311360.0, "grad_norm": 2.3452005047729965, "language_loss": 0.73980665, "learning_rate": 1.645120103738424e-06, "loss": 0.76457512, "num_input_tokens_seen": 102320995, "step": 4741, "time_per_iteration": 2.928461790084839 }, { "auxiliary_loss_clip": 0.01340898, "auxiliary_loss_mlp": 0.00872493, "balance_loss_clip": 1.0086354, "balance_loss_mlp": 1.00049591, "epoch": 0.5701917874105693, "flos": 11473360325280.0, "grad_norm": 1.9720442065609112, "language_loss": 0.8369295, "learning_rate": 1.6443535200384591e-06, "loss": 0.85906339, "num_input_tokens_seen": 102339170, "step": 4742, "time_per_iteration": 2.699954032897949 }, { "auxiliary_loss_clip": 0.01353889, "auxiliary_loss_mlp": 0.01194072, "balance_loss_clip": 1.00870156, "balance_loss_mlp": 1.00028539, "epoch": 0.5703120303012085, "flos": 21761769673440.0, "grad_norm": 1.7696915486830174, "language_loss": 0.7041623, "learning_rate": 1.6435869902897827e-06, "loss": 0.72964191, "num_input_tokens_seen": 102357750, "step": 4743, "time_per_iteration": 2.8315014839172363 }, { "auxiliary_loss_clip": 0.01276832, "auxiliary_loss_mlp": 0.0119313, "balance_loss_clip": 1.00396633, "balance_loss_mlp": 1.00010633, "epoch": 0.5704322731918475, "flos": 56746287705600.0, "grad_norm": 0.7950334705315897, "language_loss": 0.62025166, "learning_rate": 1.6428205146086764e-06, "loss": 0.64495122, "num_input_tokens_seen": 102419730, "step": 4744, "time_per_iteration": 3.4411563873291016 }, { "auxiliary_loss_clip": 0.01328387, "auxiliary_loss_mlp": 0.01194151, "balance_loss_clip": 1.00843143, "balance_loss_mlp": 1.0003643, "epoch": 0.5705525160824866, "flos": 20741152619520.0, "grad_norm": 1.6306197793815793, "language_loss": 0.70808905, "learning_rate": 1.6420540931114142e-06, "loss": 0.73331445, "num_input_tokens_seen": 102440320, "step": 4745, "time_per_iteration": 2.799156665802002 }, { "auxiliary_loss_clip": 0.01314655, "auxiliary_loss_mlp": 0.01194068, "balance_loss_clip": 1.00766361, "balance_loss_mlp": 1.0002811, "epoch": 0.5706727589731257, "flos": 18771272445600.0, "grad_norm": 1.4992435912511972, "language_loss": 0.79016459, "learning_rate": 1.6412877259142616e-06, "loss": 0.81525183, "num_input_tokens_seen": 102460240, "step": 4746, "time_per_iteration": 2.7919442653656006 }, { "auxiliary_loss_clip": 0.01322003, "auxiliary_loss_mlp": 0.01194128, "balance_loss_clip": 1.00888252, "balance_loss_mlp": 1.00034165, "epoch": 0.5707930018637648, "flos": 27634745327040.0, "grad_norm": 1.9880525648263467, "language_loss": 0.73846054, "learning_rate": 1.6405214131334757e-06, "loss": 0.76362193, "num_input_tokens_seen": 102478765, "step": 4747, "time_per_iteration": 2.7912282943725586 }, { "auxiliary_loss_clip": 0.01269837, "auxiliary_loss_mlp": 0.01194079, "balance_loss_clip": 1.00697052, "balance_loss_mlp": 1.00029242, "epoch": 0.5709132447544039, "flos": 27597685298400.0, "grad_norm": 1.7175286418179583, "language_loss": 0.79710054, "learning_rate": 1.6397551548853052e-06, "loss": 0.82173967, "num_input_tokens_seen": 102496930, "step": 4748, "time_per_iteration": 2.9111204147338867 }, { "auxiliary_loss_clip": 0.0130748, "auxiliary_loss_mlp": 0.01194034, "balance_loss_clip": 1.00760365, "balance_loss_mlp": 1.00024772, "epoch": 0.571033487645043, "flos": 21686104897920.0, "grad_norm": 1.5257489243692786, "language_loss": 0.70696884, "learning_rate": 1.6389889512859917e-06, "loss": 0.73198402, "num_input_tokens_seen": 102516590, "step": 4749, "time_per_iteration": 2.7861504554748535 }, { "auxiliary_loss_clip": 0.01289264, "auxiliary_loss_mlp": 0.01193189, "balance_loss_clip": 1.00358176, "balance_loss_mlp": 1.00016582, "epoch": 0.5711537305356821, "flos": 70181481300480.0, "grad_norm": 0.8287096764362172, "language_loss": 0.60381716, "learning_rate": 1.638222802451767e-06, "loss": 0.62864172, "num_input_tokens_seen": 102578070, "step": 4750, "time_per_iteration": 3.4895212650299072 }, { "auxiliary_loss_clip": 0.01328158, "auxiliary_loss_mlp": 0.01194159, "balance_loss_clip": 1.0083195, "balance_loss_mlp": 1.00037289, "epoch": 0.5712739734263211, "flos": 24717505988160.0, "grad_norm": 1.6165067225203726, "language_loss": 0.75603676, "learning_rate": 1.6374567084988561e-06, "loss": 0.78125989, "num_input_tokens_seen": 102599255, "step": 4751, "time_per_iteration": 2.8207290172576904 }, { "auxiliary_loss_clip": 0.01304282, "auxiliary_loss_mlp": 0.01194059, "balance_loss_clip": 1.00912738, "balance_loss_mlp": 1.00027204, "epoch": 0.5713942163169603, "flos": 26578181878560.0, "grad_norm": 2.2623808390006634, "language_loss": 0.76843083, "learning_rate": 1.6366906695434738e-06, "loss": 0.79341424, "num_input_tokens_seen": 102621775, "step": 4752, "time_per_iteration": 2.782163143157959 }, { "auxiliary_loss_clip": 0.01331327, "auxiliary_loss_mlp": 0.01194144, "balance_loss_clip": 1.00810552, "balance_loss_mlp": 1.00026166, "epoch": 0.5715144592075994, "flos": 21142454443200.0, "grad_norm": 1.815264275821937, "language_loss": 0.85913157, "learning_rate": 1.6359246857018275e-06, "loss": 0.88438624, "num_input_tokens_seen": 102639305, "step": 4753, "time_per_iteration": 2.8313040733337402 }, { "auxiliary_loss_clip": 0.01278055, "auxiliary_loss_mlp": 0.01194129, "balance_loss_clip": 1.00789273, "balance_loss_mlp": 1.00034189, "epoch": 0.5716347020982384, "flos": 23330276176320.0, "grad_norm": 1.7265339057731557, "language_loss": 0.781636, "learning_rate": 1.6351587570901178e-06, "loss": 0.8063578, "num_input_tokens_seen": 102659430, "step": 4754, "time_per_iteration": 2.8539621829986572 }, { "auxiliary_loss_clip": 0.01294344, "auxiliary_loss_mlp": 0.01194006, "balance_loss_clip": 1.0082612, "balance_loss_mlp": 1.00021911, "epoch": 0.5717549449888776, "flos": 17009562589920.0, "grad_norm": 2.2645624220604668, "language_loss": 0.75844729, "learning_rate": 1.634392883824534e-06, "loss": 0.7833308, "num_input_tokens_seen": 102671430, "step": 4755, "time_per_iteration": 2.7802627086639404 }, { "auxiliary_loss_clip": 0.01268368, "auxiliary_loss_mlp": 0.01194065, "balance_loss_clip": 1.00790548, "balance_loss_mlp": 1.00027823, "epoch": 0.5718751878795166, "flos": 35518145780160.0, "grad_norm": 1.6041292481849752, "language_loss": 0.67439878, "learning_rate": 1.6336270660212595e-06, "loss": 0.69902313, "num_input_tokens_seen": 102693025, "step": 4756, "time_per_iteration": 3.1214635372161865 }, { "auxiliary_loss_clip": 0.0130618, "auxiliary_loss_mlp": 0.01194368, "balance_loss_clip": 1.00944054, "balance_loss_mlp": 1.00048566, "epoch": 0.5719954307701557, "flos": 38614003457760.0, "grad_norm": 4.052943648582252, "language_loss": 0.6585694, "learning_rate": 1.6328613037964676e-06, "loss": 0.68357486, "num_input_tokens_seen": 102716090, "step": 4757, "time_per_iteration": 2.910491943359375 }, { "auxiliary_loss_clip": 0.01333382, "auxiliary_loss_mlp": 0.01194058, "balance_loss_clip": 1.00765967, "balance_loss_mlp": 1.00027108, "epoch": 0.5721156736607949, "flos": 20631122091360.0, "grad_norm": 1.9176463681932923, "language_loss": 0.67945158, "learning_rate": 1.6320955972663241e-06, "loss": 0.70472598, "num_input_tokens_seen": 102735685, "step": 4758, "time_per_iteration": 2.703477382659912 }, { "auxiliary_loss_clip": 0.01332258, "auxiliary_loss_mlp": 0.01194072, "balance_loss_clip": 1.00788152, "balance_loss_mlp": 1.00028515, "epoch": 0.5722359165514339, "flos": 37415091378240.0, "grad_norm": 1.8968072187141418, "language_loss": 0.65443909, "learning_rate": 1.6313299465469857e-06, "loss": 0.67970234, "num_input_tokens_seen": 102758415, "step": 4759, "time_per_iteration": 2.894151210784912 }, { "auxiliary_loss_clip": 0.01343016, "auxiliary_loss_mlp": 0.01194166, "balance_loss_clip": 1.00925064, "balance_loss_mlp": 1.0003792, "epoch": 0.572356159442073, "flos": 21972885733440.0, "grad_norm": 2.5584284690125387, "language_loss": 0.79390544, "learning_rate": 1.6305643517546014e-06, "loss": 0.81927729, "num_input_tokens_seen": 102773795, "step": 4760, "time_per_iteration": 2.82342529296875 }, { "auxiliary_loss_clip": 0.01354085, "auxiliary_loss_mlp": 0.01194055, "balance_loss_clip": 1.00884235, "balance_loss_mlp": 1.00026846, "epoch": 0.5724764023327121, "flos": 19135550164320.0, "grad_norm": 1.7510483678527484, "language_loss": 0.84665948, "learning_rate": 1.629798813005311e-06, "loss": 0.87214088, "num_input_tokens_seen": 102793515, "step": 4761, "time_per_iteration": 2.689394235610962 }, { "auxiliary_loss_clip": 0.01270039, "auxiliary_loss_mlp": 0.01194072, "balance_loss_clip": 1.00665462, "balance_loss_mlp": 1.00038028, "epoch": 0.5725966452233512, "flos": 22819770069120.0, "grad_norm": 1.83614307898986, "language_loss": 0.70932341, "learning_rate": 1.6290333304152473e-06, "loss": 0.73396456, "num_input_tokens_seen": 102813390, "step": 4762, "time_per_iteration": 4.7999351024627686 }, { "auxiliary_loss_clip": 0.01306365, "auxiliary_loss_mlp": 0.01194098, "balance_loss_clip": 1.00857139, "balance_loss_mlp": 1.00031114, "epoch": 0.5727168881139902, "flos": 41496625578240.0, "grad_norm": 1.8065209126970603, "language_loss": 0.56999391, "learning_rate": 1.6282679041005314e-06, "loss": 0.59499854, "num_input_tokens_seen": 102838980, "step": 4763, "time_per_iteration": 2.9909768104553223 }, { "auxiliary_loss_clip": 0.01319557, "auxiliary_loss_mlp": 0.01194103, "balance_loss_clip": 1.00786841, "balance_loss_mlp": 1.00031602, "epoch": 0.5728371310046293, "flos": 14647684825440.0, "grad_norm": 1.957065398425568, "language_loss": 0.86923349, "learning_rate": 1.6275025341772789e-06, "loss": 0.89437008, "num_input_tokens_seen": 102855285, "step": 4764, "time_per_iteration": 3.750445604324341 }, { "auxiliary_loss_clip": 0.01322598, "auxiliary_loss_mlp": 0.01194086, "balance_loss_clip": 1.00832868, "balance_loss_mlp": 1.00029898, "epoch": 0.5729573738952685, "flos": 21506624390880.0, "grad_norm": 6.598541237932939, "language_loss": 0.81848115, "learning_rate": 1.626737220761596e-06, "loss": 0.84364796, "num_input_tokens_seen": 102872750, "step": 4765, "time_per_iteration": 2.7825231552124023 }, { "auxiliary_loss_clip": 0.01328655, "auxiliary_loss_mlp": 0.01194123, "balance_loss_clip": 1.00854766, "balance_loss_mlp": 1.00033641, "epoch": 0.5730776167859075, "flos": 23621691166560.0, "grad_norm": 1.9915283158944392, "language_loss": 0.78972125, "learning_rate": 1.62597196396958e-06, "loss": 0.81494904, "num_input_tokens_seen": 102890920, "step": 4766, "time_per_iteration": 3.71368145942688 }, { "auxiliary_loss_clip": 0.01329908, "auxiliary_loss_mlp": 0.01194074, "balance_loss_clip": 1.00878954, "balance_loss_mlp": 1.00028777, "epoch": 0.5731978596765466, "flos": 25739236676160.0, "grad_norm": 1.8022020140413915, "language_loss": 0.85652208, "learning_rate": 1.6252067639173197e-06, "loss": 0.88176191, "num_input_tokens_seen": 102912830, "step": 4767, "time_per_iteration": 2.7947018146514893 }, { "auxiliary_loss_clip": 0.0133365, "auxiliary_loss_mlp": 0.01194138, "balance_loss_clip": 1.00782943, "balance_loss_mlp": 1.00035191, "epoch": 0.5733181025671857, "flos": 26359521845760.0, "grad_norm": 1.650292751817084, "language_loss": 0.69956988, "learning_rate": 1.6244416207208956e-06, "loss": 0.72484779, "num_input_tokens_seen": 102933765, "step": 4768, "time_per_iteration": 2.80659556388855 }, { "auxiliary_loss_clip": 0.01294637, "auxiliary_loss_mlp": 0.01194101, "balance_loss_clip": 1.00771403, "balance_loss_mlp": 1.00031495, "epoch": 0.5734383454578248, "flos": 29423887665120.0, "grad_norm": 1.5372453287374095, "language_loss": 0.73097908, "learning_rate": 1.6236765344963787e-06, "loss": 0.75586653, "num_input_tokens_seen": 102955025, "step": 4769, "time_per_iteration": 2.858705997467041 }, { "auxiliary_loss_clip": 0.0130909, "auxiliary_loss_mlp": 0.01194105, "balance_loss_clip": 1.00744498, "balance_loss_mlp": 1.00031853, "epoch": 0.5735585883484638, "flos": 34969968941760.0, "grad_norm": 2.1878774556521465, "language_loss": 0.6908139, "learning_rate": 1.6229115053598322e-06, "loss": 0.71584582, "num_input_tokens_seen": 102976780, "step": 4770, "time_per_iteration": 2.9357962608337402 }, { "auxiliary_loss_clip": 0.01331449, "auxiliary_loss_mlp": 0.0119424, "balance_loss_clip": 1.00853252, "balance_loss_mlp": 1.00035834, "epoch": 0.573678831239103, "flos": 18770769514080.0, "grad_norm": 1.6747897529904356, "language_loss": 0.72432822, "learning_rate": 1.6221465334273108e-06, "loss": 0.74958509, "num_input_tokens_seen": 102995990, "step": 4771, "time_per_iteration": 2.771439552307129 }, { "auxiliary_loss_clip": 0.01298948, "auxiliary_loss_mlp": 0.01194119, "balance_loss_clip": 1.00727534, "balance_loss_mlp": 1.00033259, "epoch": 0.5737990741297421, "flos": 25702895121120.0, "grad_norm": 2.156654443459267, "language_loss": 0.61630678, "learning_rate": 1.6213816188148593e-06, "loss": 0.64123738, "num_input_tokens_seen": 103014695, "step": 4772, "time_per_iteration": 2.8186087608337402 }, { "auxiliary_loss_clip": 0.01318042, "auxiliary_loss_mlp": 0.0119412, "balance_loss_clip": 1.00945783, "balance_loss_mlp": 1.00033331, "epoch": 0.5739193170203811, "flos": 27269246203200.0, "grad_norm": 1.6933183622303547, "language_loss": 0.77441978, "learning_rate": 1.6206167616385162e-06, "loss": 0.79954141, "num_input_tokens_seen": 103035760, "step": 4773, "time_per_iteration": 2.838080883026123 }, { "auxiliary_loss_clip": 0.01320308, "auxiliary_loss_mlp": 0.01194154, "balance_loss_clip": 1.00860572, "balance_loss_mlp": 1.00036716, "epoch": 0.5740395599110203, "flos": 12239730188640.0, "grad_norm": 1.8797999145998587, "language_loss": 0.73564923, "learning_rate": 1.6198519620143078e-06, "loss": 0.76079381, "num_input_tokens_seen": 103052915, "step": 4774, "time_per_iteration": 2.7693276405334473 }, { "auxiliary_loss_clip": 0.01303647, "auxiliary_loss_mlp": 0.01194125, "balance_loss_clip": 1.00810087, "balance_loss_mlp": 1.00033879, "epoch": 0.5741598028016593, "flos": 25921411459200.0, "grad_norm": 1.5537534954155958, "language_loss": 0.78233743, "learning_rate": 1.6190872200582546e-06, "loss": 0.80731517, "num_input_tokens_seen": 103074655, "step": 4775, "time_per_iteration": 2.8697545528411865 }, { "auxiliary_loss_clip": 0.01323979, "auxiliary_loss_mlp": 0.00872527, "balance_loss_clip": 1.00840342, "balance_loss_mlp": 1.00043631, "epoch": 0.5742800456922984, "flos": 19244143745280.0, "grad_norm": 2.3695489043636813, "language_loss": 0.78167582, "learning_rate": 1.6183225358863676e-06, "loss": 0.80364084, "num_input_tokens_seen": 103091550, "step": 4776, "time_per_iteration": 2.7887613773345947 }, { "auxiliary_loss_clip": 0.01316428, "auxiliary_loss_mlp": 0.01194107, "balance_loss_clip": 1.00821328, "balance_loss_mlp": 1.00032032, "epoch": 0.5744002885829376, "flos": 30920501378880.0, "grad_norm": 2.025580256760218, "language_loss": 0.71621692, "learning_rate": 1.617557909614648e-06, "loss": 0.74132222, "num_input_tokens_seen": 103110985, "step": 4777, "time_per_iteration": 2.792794704437256 }, { "auxiliary_loss_clip": 0.01295581, "auxiliary_loss_mlp": 0.01194108, "balance_loss_clip": 1.00716591, "balance_loss_mlp": 1.00032091, "epoch": 0.5745205314735766, "flos": 23840027886240.0, "grad_norm": 2.8187395076792314, "language_loss": 0.86367774, "learning_rate": 1.6167933413590899e-06, "loss": 0.88857466, "num_input_tokens_seen": 103129890, "step": 4778, "time_per_iteration": 2.801318883895874 }, { "auxiliary_loss_clip": 0.01341858, "auxiliary_loss_mlp": 0.01194226, "balance_loss_clip": 1.0088861, "balance_loss_mlp": 1.00043964, "epoch": 0.5746407743642157, "flos": 12311910367200.0, "grad_norm": 1.9368310242860554, "language_loss": 0.90692574, "learning_rate": 1.6160288312356773e-06, "loss": 0.93228656, "num_input_tokens_seen": 103147020, "step": 4779, "time_per_iteration": 2.717440366744995 }, { "auxiliary_loss_clip": 0.01341692, "auxiliary_loss_mlp": 0.01194053, "balance_loss_clip": 1.00866532, "balance_loss_mlp": 1.00026667, "epoch": 0.5747610172548548, "flos": 24133670144640.0, "grad_norm": 1.5530767247093433, "language_loss": 0.81431293, "learning_rate": 1.6152643793603857e-06, "loss": 0.83967036, "num_input_tokens_seen": 103167370, "step": 4780, "time_per_iteration": 2.778062105178833 }, { "auxiliary_loss_clip": 0.01354048, "auxiliary_loss_mlp": 0.01194062, "balance_loss_clip": 1.00886786, "balance_loss_mlp": 1.00027514, "epoch": 0.5748812601454939, "flos": 25408462541760.0, "grad_norm": 2.45761401546399, "language_loss": 0.87761843, "learning_rate": 1.6144999858491815e-06, "loss": 0.90309954, "num_input_tokens_seen": 103186000, "step": 4781, "time_per_iteration": 2.7286739349365234 }, { "auxiliary_loss_clip": 0.01319588, "auxiliary_loss_mlp": 0.01194069, "balance_loss_clip": 1.00777304, "balance_loss_mlp": 1.00028217, "epoch": 0.575001503036133, "flos": 30624955165440.0, "grad_norm": 1.574380952816825, "language_loss": 0.85705316, "learning_rate": 1.6137356508180232e-06, "loss": 0.88218963, "num_input_tokens_seen": 103207710, "step": 4782, "time_per_iteration": 2.882723331451416 }, { "auxiliary_loss_clip": 0.0135454, "auxiliary_loss_mlp": 0.00872559, "balance_loss_clip": 1.00889874, "balance_loss_mlp": 1.00038314, "epoch": 0.5751217459267721, "flos": 21726577676160.0, "grad_norm": 2.163754259356062, "language_loss": 0.81583321, "learning_rate": 1.6129713743828593e-06, "loss": 0.83810419, "num_input_tokens_seen": 103226720, "step": 4783, "time_per_iteration": 2.6717705726623535 }, { "auxiliary_loss_clip": 0.01329462, "auxiliary_loss_mlp": 0.0119407, "balance_loss_clip": 1.00863135, "balance_loss_mlp": 1.00028324, "epoch": 0.5752419888174112, "flos": 21651631374240.0, "grad_norm": 1.4330424058065987, "language_loss": 0.75298625, "learning_rate": 1.6122071566596306e-06, "loss": 0.77822161, "num_input_tokens_seen": 103246995, "step": 4784, "time_per_iteration": 2.7872700691223145 }, { "auxiliary_loss_clip": 0.01333083, "auxiliary_loss_mlp": 0.011941, "balance_loss_clip": 1.00803089, "balance_loss_mlp": 1.00031352, "epoch": 0.5753622317080502, "flos": 17776004300640.0, "grad_norm": 4.038831454852141, "language_loss": 0.83167326, "learning_rate": 1.6114429977642674e-06, "loss": 0.8569451, "num_input_tokens_seen": 103261500, "step": 4785, "time_per_iteration": 2.7023215293884277 }, { "auxiliary_loss_clip": 0.01330261, "auxiliary_loss_mlp": 0.01194067, "balance_loss_clip": 1.00845754, "balance_loss_mlp": 1.00028026, "epoch": 0.5754824745986894, "flos": 19789626307680.0, "grad_norm": 5.138450433619441, "language_loss": 0.73493099, "learning_rate": 1.6106788978126926e-06, "loss": 0.76017427, "num_input_tokens_seen": 103280475, "step": 4786, "time_per_iteration": 2.746784210205078 }, { "auxiliary_loss_clip": 0.01282694, "auxiliary_loss_mlp": 0.01194092, "balance_loss_clip": 1.00721693, "balance_loss_mlp": 1.00030494, "epoch": 0.5756027174893285, "flos": 30985676439840.0, "grad_norm": 2.0504057156251747, "language_loss": 0.78952634, "learning_rate": 1.6099148569208196e-06, "loss": 0.81429422, "num_input_tokens_seen": 103297695, "step": 4787, "time_per_iteration": 2.9096333980560303 }, { "auxiliary_loss_clip": 0.01307009, "auxiliary_loss_mlp": 0.01194112, "balance_loss_clip": 1.00833189, "balance_loss_mlp": 1.0003252, "epoch": 0.5757229603799675, "flos": 28546876571040.0, "grad_norm": 1.5009086676885957, "language_loss": 0.62668467, "learning_rate": 1.6091508752045523e-06, "loss": 0.65169585, "num_input_tokens_seen": 103318575, "step": 4788, "time_per_iteration": 4.784578323364258 }, { "auxiliary_loss_clip": 0.01304252, "auxiliary_loss_mlp": 0.01193868, "balance_loss_clip": 1.00790036, "balance_loss_mlp": 1.00017679, "epoch": 0.5758432032706067, "flos": 22999035034080.0, "grad_norm": 1.543779820094684, "language_loss": 0.86497647, "learning_rate": 1.608386952779787e-06, "loss": 0.88995767, "num_input_tokens_seen": 103337945, "step": 4789, "time_per_iteration": 2.8145813941955566 }, { "auxiliary_loss_clip": 0.01327514, "auxiliary_loss_mlp": 0.01194158, "balance_loss_clip": 1.0084964, "balance_loss_mlp": 1.00037146, "epoch": 0.5759634461612457, "flos": 25739739607680.0, "grad_norm": 1.6109144397539952, "language_loss": 0.7455337, "learning_rate": 1.6076230897624098e-06, "loss": 0.7707504, "num_input_tokens_seen": 103360150, "step": 4790, "time_per_iteration": 2.838859796524048 }, { "auxiliary_loss_clip": 0.01341604, "auxiliary_loss_mlp": 0.0119404, "balance_loss_clip": 1.00832844, "balance_loss_mlp": 1.00025356, "epoch": 0.5760836890518848, "flos": 30591774894240.0, "grad_norm": 2.006390180050205, "language_loss": 0.77294588, "learning_rate": 1.6068592862682974e-06, "loss": 0.79830235, "num_input_tokens_seen": 103378305, "step": 4791, "time_per_iteration": 3.7434475421905518 }, { "auxiliary_loss_clip": 0.01314552, "auxiliary_loss_mlp": 0.01194113, "balance_loss_clip": 1.00786352, "balance_loss_mlp": 1.00032663, "epoch": 0.576203931942524, "flos": 36538978376160.0, "grad_norm": 1.7347441261811938, "language_loss": 0.73439002, "learning_rate": 1.6060955424133187e-06, "loss": 0.75947666, "num_input_tokens_seen": 103399230, "step": 4792, "time_per_iteration": 3.9183685779571533 }, { "auxiliary_loss_clip": 0.01330219, "auxiliary_loss_mlp": 0.01194134, "balance_loss_clip": 1.00827849, "balance_loss_mlp": 1.00034785, "epoch": 0.576324174833163, "flos": 25516948351680.0, "grad_norm": 1.5683148180596298, "language_loss": 0.89372385, "learning_rate": 1.6053318583133332e-06, "loss": 0.91896737, "num_input_tokens_seen": 103420100, "step": 4793, "time_per_iteration": 2.814063549041748 }, { "auxiliary_loss_clip": 0.01329969, "auxiliary_loss_mlp": 0.01194074, "balance_loss_clip": 1.00753832, "balance_loss_mlp": 1.00028777, "epoch": 0.5764444177238021, "flos": 25119274819680.0, "grad_norm": 2.115513957808429, "language_loss": 0.75196856, "learning_rate": 1.6045682340841907e-06, "loss": 0.77720904, "num_input_tokens_seen": 103439025, "step": 4794, "time_per_iteration": 2.7877252101898193 }, { "auxiliary_loss_clip": 0.01274096, "auxiliary_loss_mlp": 0.00871826, "balance_loss_clip": 1.00354159, "balance_loss_mlp": 1.00008249, "epoch": 0.5765646606144411, "flos": 62212716678240.0, "grad_norm": 0.7473601805767102, "language_loss": 0.58005065, "learning_rate": 1.6038046698417336e-06, "loss": 0.60150987, "num_input_tokens_seen": 103499920, "step": 4795, "time_per_iteration": 3.344280481338501 }, { "auxiliary_loss_clip": 0.01342209, "auxiliary_loss_mlp": 0.01194257, "balance_loss_clip": 1.00858748, "balance_loss_mlp": 1.00037515, "epoch": 0.5766849035050803, "flos": 25118771888160.0, "grad_norm": 2.1373246291106125, "language_loss": 0.68869448, "learning_rate": 1.6030411657017919e-06, "loss": 0.71405917, "num_input_tokens_seen": 103519575, "step": 4796, "time_per_iteration": 2.845021963119507 }, { "auxiliary_loss_clip": 0.01342901, "auxiliary_loss_mlp": 0.01193989, "balance_loss_clip": 1.00879622, "balance_loss_mlp": 1.00029755, "epoch": 0.5768051463957193, "flos": 15991100956800.0, "grad_norm": 1.6419365599120692, "language_loss": 0.84503579, "learning_rate": 1.6022777217801903e-06, "loss": 0.87040472, "num_input_tokens_seen": 103536530, "step": 4797, "time_per_iteration": 2.7038393020629883 }, { "auxiliary_loss_clip": 0.01289681, "auxiliary_loss_mlp": 0.01194024, "balance_loss_clip": 1.00803089, "balance_loss_mlp": 1.0002377, "epoch": 0.5769253892863584, "flos": 22163646276000.0, "grad_norm": 1.7766840905487453, "language_loss": 0.73504597, "learning_rate": 1.601514338192742e-06, "loss": 0.75988305, "num_input_tokens_seen": 103556460, "step": 4798, "time_per_iteration": 2.84456467628479 }, { "auxiliary_loss_clip": 0.01352839, "auxiliary_loss_mlp": 0.01194119, "balance_loss_clip": 1.0083431, "balance_loss_mlp": 1.000332, "epoch": 0.5770456321769976, "flos": 22856399013600.0, "grad_norm": 1.9485702225474153, "language_loss": 0.71324688, "learning_rate": 1.6007510150552514e-06, "loss": 0.73871648, "num_input_tokens_seen": 103574520, "step": 4799, "time_per_iteration": 2.7167627811431885 }, { "auxiliary_loss_clip": 0.01343023, "auxiliary_loss_mlp": 0.01194179, "balance_loss_clip": 1.00891376, "balance_loss_mlp": 1.00029731, "epoch": 0.5771658750676366, "flos": 46353690180000.0, "grad_norm": 1.5342370228556874, "language_loss": 0.62041414, "learning_rate": 1.599987752483515e-06, "loss": 0.64578623, "num_input_tokens_seen": 103598965, "step": 4800, "time_per_iteration": 2.945390462875366 }, { "auxiliary_loss_clip": 0.01308242, "auxiliary_loss_mlp": 0.0119406, "balance_loss_clip": 1.00823009, "balance_loss_mlp": 1.0002737, "epoch": 0.5772861179582757, "flos": 22159982060640.0, "grad_norm": 1.6154217364059915, "language_loss": 0.67896283, "learning_rate": 1.5992245505933184e-06, "loss": 0.70398581, "num_input_tokens_seen": 103618665, "step": 4801, "time_per_iteration": 2.8479182720184326 }, { "auxiliary_loss_clip": 0.01355028, "auxiliary_loss_mlp": 0.01194064, "balance_loss_clip": 1.00904346, "balance_loss_mlp": 1.00027716, "epoch": 0.5774063608489148, "flos": 31248940474080.0, "grad_norm": 1.8581906409452746, "language_loss": 0.71140003, "learning_rate": 1.5984614095004388e-06, "loss": 0.73689091, "num_input_tokens_seen": 103639800, "step": 4802, "time_per_iteration": 2.7836945056915283 }, { "auxiliary_loss_clip": 0.01343465, "auxiliary_loss_mlp": 0.01194041, "balance_loss_clip": 1.00914407, "balance_loss_mlp": 1.00025403, "epoch": 0.5775266037395539, "flos": 22527133673760.0, "grad_norm": 1.881026279967883, "language_loss": 0.80751312, "learning_rate": 1.5976983293206438e-06, "loss": 0.83288819, "num_input_tokens_seen": 103655605, "step": 4803, "time_per_iteration": 2.7797625064849854 }, { "auxiliary_loss_clip": 0.01330292, "auxiliary_loss_mlp": 0.01194094, "balance_loss_clip": 1.00867462, "balance_loss_mlp": 1.00030768, "epoch": 0.577646846630193, "flos": 21068801393760.0, "grad_norm": 1.6189746167087105, "language_loss": 0.71593869, "learning_rate": 1.5969353101696928e-06, "loss": 0.74118257, "num_input_tokens_seen": 103674045, "step": 4804, "time_per_iteration": 2.853663682937622 }, { "auxiliary_loss_clip": 0.01340427, "auxiliary_loss_mlp": 0.01194094, "balance_loss_clip": 1.00825047, "balance_loss_mlp": 1.00030792, "epoch": 0.5777670895208321, "flos": 29714296792320.0, "grad_norm": 1.6125528880219413, "language_loss": 0.79934692, "learning_rate": 1.5961723521633341e-06, "loss": 0.82469213, "num_input_tokens_seen": 103695285, "step": 4805, "time_per_iteration": 2.8095810413360596 }, { "auxiliary_loss_clip": 0.01315276, "auxiliary_loss_mlp": 0.01194111, "balance_loss_clip": 1.00862551, "balance_loss_mlp": 1.00032389, "epoch": 0.5778873324114712, "flos": 19500438585600.0, "grad_norm": 2.24171731501393, "language_loss": 0.90778357, "learning_rate": 1.5954094554173097e-06, "loss": 0.93287742, "num_input_tokens_seen": 103713275, "step": 4806, "time_per_iteration": 2.734954357147217 }, { "auxiliary_loss_clip": 0.01320718, "auxiliary_loss_mlp": 0.01194141, "balance_loss_clip": 1.00762331, "balance_loss_mlp": 1.00035429, "epoch": 0.5780075753021102, "flos": 14136855405120.0, "grad_norm": 1.8474082967921726, "language_loss": 0.79286504, "learning_rate": 1.5946466200473482e-06, "loss": 0.81801361, "num_input_tokens_seen": 103731185, "step": 4807, "time_per_iteration": 2.788379669189453 }, { "auxiliary_loss_clip": 0.01328516, "auxiliary_loss_mlp": 0.01194032, "balance_loss_clip": 1.00820804, "balance_loss_mlp": 1.00024533, "epoch": 0.5781278181927494, "flos": 15262186282560.0, "grad_norm": 1.6562148843440958, "language_loss": 0.83156264, "learning_rate": 1.5938838461691723e-06, "loss": 0.85678816, "num_input_tokens_seen": 103748095, "step": 4808, "time_per_iteration": 2.7184436321258545 }, { "auxiliary_loss_clip": 0.01353818, "auxiliary_loss_mlp": 0.01194168, "balance_loss_clip": 1.00867772, "balance_loss_mlp": 1.00028539, "epoch": 0.5782480610833884, "flos": 16726841130240.0, "grad_norm": 4.697838172971001, "language_loss": 0.82779217, "learning_rate": 1.593121133898494e-06, "loss": 0.85327196, "num_input_tokens_seen": 103765300, "step": 4809, "time_per_iteration": 2.6502304077148438 }, { "auxiliary_loss_clip": 0.01341576, "auxiliary_loss_mlp": 0.01194072, "balance_loss_clip": 1.00857711, "balance_loss_mlp": 1.00028515, "epoch": 0.5783683039740275, "flos": 25482151514880.0, "grad_norm": 6.55364632999284, "language_loss": 0.79156005, "learning_rate": 1.592358483351016e-06, "loss": 0.81691653, "num_input_tokens_seen": 103785475, "step": 4810, "time_per_iteration": 2.7104334831237793 }, { "auxiliary_loss_clip": 0.01331275, "auxiliary_loss_mlp": 0.01194117, "balance_loss_clip": 1.00774968, "balance_loss_mlp": 1.00033033, "epoch": 0.5784885468646667, "flos": 18405845169120.0, "grad_norm": 1.9152281784750695, "language_loss": 0.72103161, "learning_rate": 1.5915958946424326e-06, "loss": 0.74628556, "num_input_tokens_seen": 103804160, "step": 4811, "time_per_iteration": 2.7581121921539307 }, { "auxiliary_loss_clip": 0.01304795, "auxiliary_loss_mlp": 0.0087261, "balance_loss_clip": 1.00845218, "balance_loss_mlp": 1.00050175, "epoch": 0.5786087897553057, "flos": 46100736241920.0, "grad_norm": 1.756154599561227, "language_loss": 0.74767387, "learning_rate": 1.5908333678884271e-06, "loss": 0.76944792, "num_input_tokens_seen": 103830580, "step": 4812, "time_per_iteration": 3.0293681621551514 }, { "auxiliary_loss_clip": 0.01332299, "auxiliary_loss_mlp": 0.01194026, "balance_loss_clip": 1.00819993, "balance_loss_mlp": 1.00023949, "epoch": 0.5787290326459448, "flos": 12385958577120.0, "grad_norm": 1.9839143670126922, "language_loss": 0.73967361, "learning_rate": 1.5900709032046743e-06, "loss": 0.76493692, "num_input_tokens_seen": 103848655, "step": 4813, "time_per_iteration": 2.7255594730377197 }, { "auxiliary_loss_clip": 0.01305849, "auxiliary_loss_mlp": 0.01194102, "balance_loss_clip": 1.00798404, "balance_loss_mlp": 1.00031519, "epoch": 0.5788492755365839, "flos": 23290342253280.0, "grad_norm": 2.0263568317147276, "language_loss": 0.7794596, "learning_rate": 1.5893085007068391e-06, "loss": 0.8044591, "num_input_tokens_seen": 103866215, "step": 4814, "time_per_iteration": 4.751266002655029 }, { "auxiliary_loss_clip": 0.01329231, "auxiliary_loss_mlp": 0.01194002, "balance_loss_clip": 1.00887597, "balance_loss_mlp": 1.00031042, "epoch": 0.578969518427223, "flos": 24061058881920.0, "grad_norm": 2.0912832856104937, "language_loss": 0.70713603, "learning_rate": 1.5885461605105786e-06, "loss": 0.73236835, "num_input_tokens_seen": 103887815, "step": 4815, "time_per_iteration": 2.8455917835235596 }, { "auxiliary_loss_clip": 0.01318261, "auxiliary_loss_mlp": 0.01194083, "balance_loss_clip": 1.00871134, "balance_loss_mlp": 1.00029671, "epoch": 0.579089761317862, "flos": 21871836125280.0, "grad_norm": 2.3583099703374053, "language_loss": 0.76608562, "learning_rate": 1.5877838827315375e-06, "loss": 0.79120904, "num_input_tokens_seen": 103906360, "step": 4816, "time_per_iteration": 2.7726633548736572 }, { "auxiliary_loss_clip": 0.01354118, "auxiliary_loss_mlp": 0.01194094, "balance_loss_clip": 1.00919366, "balance_loss_mlp": 1.00030708, "epoch": 0.5792100042085012, "flos": 22929692826240.0, "grad_norm": 3.1358304947916293, "language_loss": 0.70149499, "learning_rate": 1.587021667485355e-06, "loss": 0.72697711, "num_input_tokens_seen": 103925730, "step": 4817, "time_per_iteration": 3.771359443664551 }, { "auxiliary_loss_clip": 0.01324197, "auxiliary_loss_mlp": 0.01194081, "balance_loss_clip": 1.00790024, "balance_loss_mlp": 1.00029457, "epoch": 0.5793302470991403, "flos": 21470067293760.0, "grad_norm": 1.6246802965457177, "language_loss": 0.78322828, "learning_rate": 1.5862595148876559e-06, "loss": 0.80841106, "num_input_tokens_seen": 103945835, "step": 4818, "time_per_iteration": 2.838669538497925 }, { "auxiliary_loss_clip": 0.01284615, "auxiliary_loss_mlp": 0.01194012, "balance_loss_clip": 1.00854635, "balance_loss_mlp": 1.00032067, "epoch": 0.5794504899897793, "flos": 12711020846400.0, "grad_norm": 2.1248501513598903, "language_loss": 0.76241541, "learning_rate": 1.58549742505406e-06, "loss": 0.78720164, "num_input_tokens_seen": 103960580, "step": 4819, "time_per_iteration": 3.8281428813934326 }, { "auxiliary_loss_clip": 0.01353734, "auxiliary_loss_mlp": 0.01194047, "balance_loss_clip": 1.00869489, "balance_loss_mlp": 1.00026059, "epoch": 0.5795707328804185, "flos": 14867063331840.0, "grad_norm": 2.1343191349148336, "language_loss": 0.756549, "learning_rate": 1.5847353981001747e-06, "loss": 0.78202689, "num_input_tokens_seen": 103977760, "step": 4820, "time_per_iteration": 2.6982171535491943 }, { "auxiliary_loss_clip": 0.01329656, "auxiliary_loss_mlp": 0.0119409, "balance_loss_clip": 1.0087558, "balance_loss_mlp": 1.00030351, "epoch": 0.5796909757710575, "flos": 36430061482080.0, "grad_norm": 1.4992897504542555, "language_loss": 0.70209396, "learning_rate": 1.5839734341415993e-06, "loss": 0.7273314, "num_input_tokens_seen": 103999960, "step": 4821, "time_per_iteration": 2.9132750034332275 }, { "auxiliary_loss_clip": 0.01327138, "auxiliary_loss_mlp": 0.01194085, "balance_loss_clip": 1.00872326, "balance_loss_mlp": 1.00029838, "epoch": 0.5798112186616966, "flos": 23039902972800.0, "grad_norm": 1.5953011300627027, "language_loss": 0.76593566, "learning_rate": 1.5832115332939238e-06, "loss": 0.79114789, "num_input_tokens_seen": 104018400, "step": 4822, "time_per_iteration": 2.8419911861419678 }, { "auxiliary_loss_clip": 0.01334505, "auxiliary_loss_mlp": 0.01194088, "balance_loss_clip": 1.00818396, "balance_loss_mlp": 1.000301, "epoch": 0.5799314615523358, "flos": 16652613301920.0, "grad_norm": 1.6307610274694762, "language_loss": 0.74903601, "learning_rate": 1.5824496956727272e-06, "loss": 0.77432197, "num_input_tokens_seen": 104035605, "step": 4823, "time_per_iteration": 2.7215075492858887 }, { "auxiliary_loss_clip": 0.01317702, "auxiliary_loss_mlp": 0.01193996, "balance_loss_clip": 1.00822055, "balance_loss_mlp": 1.0003047, "epoch": 0.5800517044429748, "flos": 20485684023840.0, "grad_norm": 1.9387195583784538, "language_loss": 0.73281574, "learning_rate": 1.5816879213935797e-06, "loss": 0.75793278, "num_input_tokens_seen": 104054415, "step": 4824, "time_per_iteration": 2.844574451446533 }, { "auxiliary_loss_clip": 0.01330173, "auxiliary_loss_mlp": 0.01194015, "balance_loss_clip": 1.0083282, "balance_loss_mlp": 1.00022805, "epoch": 0.5801719473336139, "flos": 31538271890880.0, "grad_norm": 1.7055743033241386, "language_loss": 0.79270518, "learning_rate": 1.5809262105720416e-06, "loss": 0.81794703, "num_input_tokens_seen": 104075455, "step": 4825, "time_per_iteration": 2.8983983993530273 }, { "auxiliary_loss_clip": 0.01352927, "auxiliary_loss_mlp": 0.01193929, "balance_loss_clip": 1.00858212, "balance_loss_mlp": 1.00023794, "epoch": 0.580292190224253, "flos": 20375976808800.0, "grad_norm": 1.4863660981811122, "language_loss": 0.7977972, "learning_rate": 1.5801645633236644e-06, "loss": 0.82326579, "num_input_tokens_seen": 104096440, "step": 4826, "time_per_iteration": 2.7154576778411865 }, { "auxiliary_loss_clip": 0.01322514, "auxiliary_loss_mlp": 0.01194173, "balance_loss_clip": 1.00794947, "balance_loss_mlp": 1.00038648, "epoch": 0.5804124331148921, "flos": 26615385601920.0, "grad_norm": 1.7955188492403273, "language_loss": 0.77253234, "learning_rate": 1.579402979763989e-06, "loss": 0.79769927, "num_input_tokens_seen": 104116775, "step": 4827, "time_per_iteration": 2.8377885818481445 }, { "auxiliary_loss_clip": 0.01260149, "auxiliary_loss_mlp": 0.01194076, "balance_loss_clip": 1.00833511, "balance_loss_mlp": 1.00028968, "epoch": 0.5805326760055312, "flos": 13478504343840.0, "grad_norm": 2.225642293105547, "language_loss": 0.81332231, "learning_rate": 1.578641460008548e-06, "loss": 0.83786452, "num_input_tokens_seen": 104134510, "step": 4828, "time_per_iteration": 2.822319984436035 }, { "auxiliary_loss_clip": 0.01331428, "auxiliary_loss_mlp": 0.0119397, "balance_loss_clip": 1.00775957, "balance_loss_mlp": 1.00027871, "epoch": 0.5806529188961702, "flos": 12091382303040.0, "grad_norm": 1.8770869363774085, "language_loss": 0.67671537, "learning_rate": 1.5778800041728613e-06, "loss": 0.70196933, "num_input_tokens_seen": 104150800, "step": 4829, "time_per_iteration": 2.7475595474243164 }, { "auxiliary_loss_clip": 0.01328219, "auxiliary_loss_mlp": 0.01194004, "balance_loss_clip": 1.0078932, "balance_loss_mlp": 1.00021756, "epoch": 0.5807731617868094, "flos": 26214119701920.0, "grad_norm": 1.440337434849392, "language_loss": 0.66297692, "learning_rate": 1.577118612372443e-06, "loss": 0.68819916, "num_input_tokens_seen": 104172640, "step": 4830, "time_per_iteration": 2.774890899658203 }, { "auxiliary_loss_clip": 0.01321154, "auxiliary_loss_mlp": 0.00872571, "balance_loss_clip": 1.00848842, "balance_loss_mlp": 1.00051558, "epoch": 0.5808934046774484, "flos": 37962154582560.0, "grad_norm": 1.6892808855121497, "language_loss": 0.7054652, "learning_rate": 1.5763572847227943e-06, "loss": 0.72740245, "num_input_tokens_seen": 104193525, "step": 4831, "time_per_iteration": 2.953286647796631 }, { "auxiliary_loss_clip": 0.01333401, "auxiliary_loss_mlp": 0.01194028, "balance_loss_clip": 1.00752521, "balance_loss_mlp": 1.00024176, "epoch": 0.5810136475680875, "flos": 20485863642240.0, "grad_norm": 1.668079848195538, "language_loss": 0.81137067, "learning_rate": 1.5755960213394091e-06, "loss": 0.83664501, "num_input_tokens_seen": 104210625, "step": 4832, "time_per_iteration": 2.6912808418273926 }, { "auxiliary_loss_clip": 0.01297941, "auxiliary_loss_mlp": 0.01194127, "balance_loss_clip": 1.00904584, "balance_loss_mlp": 1.00033998, "epoch": 0.5811338904587267, "flos": 17530163251200.0, "grad_norm": 1.7589773441653294, "language_loss": 0.78145468, "learning_rate": 1.5748348223377703e-06, "loss": 0.80637538, "num_input_tokens_seen": 104228180, "step": 4833, "time_per_iteration": 2.8626389503479004 }, { "auxiliary_loss_clip": 0.01308407, "auxiliary_loss_mlp": 0.01194035, "balance_loss_clip": 1.00804293, "balance_loss_mlp": 1.00024831, "epoch": 0.5812541333493657, "flos": 19458025928640.0, "grad_norm": 1.4469527548662977, "language_loss": 0.77942598, "learning_rate": 1.5740736878333507e-06, "loss": 0.80445039, "num_input_tokens_seen": 104246020, "step": 4834, "time_per_iteration": 2.8212194442749023 }, { "auxiliary_loss_clip": 0.01318677, "auxiliary_loss_mlp": 0.01194033, "balance_loss_clip": 1.00784469, "balance_loss_mlp": 1.00024605, "epoch": 0.5813743762400048, "flos": 20594960154720.0, "grad_norm": 5.90727594551089, "language_loss": 0.777704, "learning_rate": 1.5733126179416143e-06, "loss": 0.80283105, "num_input_tokens_seen": 104260505, "step": 4835, "time_per_iteration": 2.7142281532287598 }, { "auxiliary_loss_clip": 0.01331495, "auxiliary_loss_mlp": 0.01194022, "balance_loss_clip": 1.0080409, "balance_loss_mlp": 1.0002358, "epoch": 0.5814946191306439, "flos": 33178240098720.0, "grad_norm": 2.4389374218938555, "language_loss": 0.72636604, "learning_rate": 1.5725516127780137e-06, "loss": 0.75162125, "num_input_tokens_seen": 104282640, "step": 4836, "time_per_iteration": 2.8946340084075928 }, { "auxiliary_loss_clip": 0.01335507, "auxiliary_loss_mlp": 0.01194025, "balance_loss_clip": 1.00787759, "balance_loss_mlp": 1.00023806, "epoch": 0.581614862021283, "flos": 16143292676160.0, "grad_norm": 1.9975554457802043, "language_loss": 0.8855226, "learning_rate": 1.5717906724579943e-06, "loss": 0.91081798, "num_input_tokens_seen": 104299700, "step": 4837, "time_per_iteration": 2.709411859512329 }, { "auxiliary_loss_clip": 0.01317108, "auxiliary_loss_mlp": 0.01194002, "balance_loss_clip": 1.00882602, "balance_loss_mlp": 1.00021577, "epoch": 0.581735104911922, "flos": 33802656491520.0, "grad_norm": 1.9290505604106292, "language_loss": 0.68072367, "learning_rate": 1.571029797096989e-06, "loss": 0.70583475, "num_input_tokens_seen": 104320805, "step": 4838, "time_per_iteration": 3.0038652420043945 }, { "auxiliary_loss_clip": 0.01352208, "auxiliary_loss_mlp": 0.01193986, "balance_loss_clip": 1.00781631, "balance_loss_mlp": 1.00019896, "epoch": 0.5818553478025612, "flos": 23331174268320.0, "grad_norm": 2.3309505970327056, "language_loss": 0.79304433, "learning_rate": 1.570268986810423e-06, "loss": 0.81850624, "num_input_tokens_seen": 104340700, "step": 4839, "time_per_iteration": 2.7107629776000977 }, { "auxiliary_loss_clip": 0.01316661, "auxiliary_loss_mlp": 0.01193895, "balance_loss_clip": 1.00882196, "balance_loss_mlp": 1.00029898, "epoch": 0.5819755906932003, "flos": 20996154207360.0, "grad_norm": 1.7561270408801157, "language_loss": 0.74779475, "learning_rate": 1.5695082417137096e-06, "loss": 0.77290022, "num_input_tokens_seen": 104358575, "step": 4840, "time_per_iteration": 4.645503282546997 }, { "auxiliary_loss_clip": 0.01322572, "auxiliary_loss_mlp": 0.01193872, "balance_loss_clip": 1.00774002, "balance_loss_mlp": 1.00027585, "epoch": 0.5820958335838393, "flos": 21431678088960.0, "grad_norm": 1.5946192760867277, "language_loss": 0.75068355, "learning_rate": 1.5687475619222539e-06, "loss": 0.77584797, "num_input_tokens_seen": 104378530, "step": 4841, "time_per_iteration": 2.8404266834259033 }, { "auxiliary_loss_clip": 0.01329371, "auxiliary_loss_mlp": 0.01194113, "balance_loss_clip": 1.00881791, "balance_loss_mlp": 1.00032675, "epoch": 0.5822160764744785, "flos": 17967483316800.0, "grad_norm": 2.1843906343596635, "language_loss": 0.73371536, "learning_rate": 1.5679869475514496e-06, "loss": 0.75895017, "num_input_tokens_seen": 104395465, "step": 4842, "time_per_iteration": 2.6871728897094727 }, { "auxiliary_loss_clip": 0.01331485, "auxiliary_loss_mlp": 0.0119401, "balance_loss_clip": 1.00824571, "balance_loss_mlp": 1.00031877, "epoch": 0.5823363193651175, "flos": 23033867794560.0, "grad_norm": 2.047360947461312, "language_loss": 0.80762547, "learning_rate": 1.567226398716682e-06, "loss": 0.83288044, "num_input_tokens_seen": 104415380, "step": 4843, "time_per_iteration": 3.7182343006134033 }, { "auxiliary_loss_clip": 0.01322899, "auxiliary_loss_mlp": 0.01193975, "balance_loss_clip": 1.00801468, "balance_loss_mlp": 1.00018871, "epoch": 0.5824565622557566, "flos": 32891854423680.0, "grad_norm": 1.6008725858954764, "language_loss": 0.61770141, "learning_rate": 1.566465915533326e-06, "loss": 0.64287019, "num_input_tokens_seen": 104437410, "step": 4844, "time_per_iteration": 2.921088933944702 }, { "auxiliary_loss_clip": 0.01328824, "auxiliary_loss_mlp": 0.0119397, "balance_loss_clip": 1.0077467, "balance_loss_mlp": 1.00018358, "epoch": 0.5825768051463958, "flos": 22229683505280.0, "grad_norm": 1.9294526421264542, "language_loss": 0.88065505, "learning_rate": 1.5657054981167458e-06, "loss": 0.90588301, "num_input_tokens_seen": 104456305, "step": 4845, "time_per_iteration": 3.7266175746917725 }, { "auxiliary_loss_clip": 0.01340936, "auxiliary_loss_mlp": 0.01194036, "balance_loss_clip": 1.00892806, "balance_loss_mlp": 1.00024939, "epoch": 0.5826970480370348, "flos": 28001573627040.0, "grad_norm": 1.9623362029339895, "language_loss": 0.6731894, "learning_rate": 1.5649451465822965e-06, "loss": 0.69853914, "num_input_tokens_seen": 104477695, "step": 4846, "time_per_iteration": 2.765896797180176 }, { "auxiliary_loss_clip": 0.01270962, "auxiliary_loss_mlp": 0.01193959, "balance_loss_clip": 1.00760198, "balance_loss_mlp": 1.00026774, "epoch": 0.5828172909276739, "flos": 17858063491200.0, "grad_norm": 1.6557083925276075, "language_loss": 0.83846247, "learning_rate": 1.5641848610453218e-06, "loss": 0.86311173, "num_input_tokens_seen": 104496355, "step": 4847, "time_per_iteration": 2.7815732955932617 }, { "auxiliary_loss_clip": 0.01327662, "auxiliary_loss_mlp": 0.01194096, "balance_loss_clip": 1.00842094, "balance_loss_mlp": 1.00030923, "epoch": 0.582937533818313, "flos": 19865255159520.0, "grad_norm": 2.3444923282732777, "language_loss": 0.85728586, "learning_rate": 1.563424641621158e-06, "loss": 0.88250339, "num_input_tokens_seen": 104515535, "step": 4848, "time_per_iteration": 2.7192234992980957 }, { "auxiliary_loss_clip": 0.01319336, "auxiliary_loss_mlp": 0.01194154, "balance_loss_clip": 1.00802684, "balance_loss_mlp": 1.00027227, "epoch": 0.5830577767089521, "flos": 26870746426560.0, "grad_norm": 1.9340219463739727, "language_loss": 0.69591182, "learning_rate": 1.5626644884251282e-06, "loss": 0.72104675, "num_input_tokens_seen": 104535055, "step": 4849, "time_per_iteration": 2.819443702697754 }, { "auxiliary_loss_clip": 0.01353041, "auxiliary_loss_mlp": 0.01193962, "balance_loss_clip": 1.00846636, "balance_loss_mlp": 1.00027084, "epoch": 0.5831780195995911, "flos": 25298216471520.0, "grad_norm": 1.5547113232248746, "language_loss": 0.87885952, "learning_rate": 1.5619044015725488e-06, "loss": 0.9043296, "num_input_tokens_seen": 104554745, "step": 4850, "time_per_iteration": 2.7338528633117676 }, { "auxiliary_loss_clip": 0.01356015, "auxiliary_loss_mlp": 0.01194098, "balance_loss_clip": 1.01017761, "balance_loss_mlp": 1.00031126, "epoch": 0.5832982624902303, "flos": 14756996880000.0, "grad_norm": 4.036287381540867, "language_loss": 0.87190413, "learning_rate": 1.5611443811787224e-06, "loss": 0.89740521, "num_input_tokens_seen": 104568870, "step": 4851, "time_per_iteration": 2.7791733741760254 }, { "auxiliary_loss_clip": 0.01331141, "auxiliary_loss_mlp": 0.01193889, "balance_loss_clip": 1.00819707, "balance_loss_mlp": 1.00029314, "epoch": 0.5834185053808694, "flos": 20444564619360.0, "grad_norm": 2.040874888561565, "language_loss": 0.69221628, "learning_rate": 1.560384427358945e-06, "loss": 0.71746659, "num_input_tokens_seen": 104588415, "step": 4852, "time_per_iteration": 2.7043495178222656 }, { "auxiliary_loss_clip": 0.01329953, "auxiliary_loss_mlp": 0.01194044, "balance_loss_clip": 1.00865817, "balance_loss_mlp": 1.00025749, "epoch": 0.5835387482715084, "flos": 27200406926880.0, "grad_norm": 1.418690017631696, "language_loss": 0.72804558, "learning_rate": 1.5596245402284998e-06, "loss": 0.75328553, "num_input_tokens_seen": 104611940, "step": 4853, "time_per_iteration": 2.8574554920196533 }, { "auxiliary_loss_clip": 0.0133241, "auxiliary_loss_mlp": 0.01194084, "balance_loss_clip": 1.00871491, "balance_loss_mlp": 1.00029743, "epoch": 0.5836589911621476, "flos": 16654625028000.0, "grad_norm": 1.5718883608933525, "language_loss": 0.81854033, "learning_rate": 1.5588647199026619e-06, "loss": 0.84380525, "num_input_tokens_seen": 104629675, "step": 4854, "time_per_iteration": 2.7203054428100586 }, { "auxiliary_loss_clip": 0.01354511, "auxiliary_loss_mlp": 0.01194066, "balance_loss_clip": 1.00928068, "balance_loss_mlp": 1.00027919, "epoch": 0.5837792340527866, "flos": 20446827811200.0, "grad_norm": 1.9591843717520567, "language_loss": 0.87085396, "learning_rate": 1.5581049664966956e-06, "loss": 0.89633977, "num_input_tokens_seen": 104647435, "step": 4855, "time_per_iteration": 2.6926052570343018 }, { "auxiliary_loss_clip": 0.01247556, "auxiliary_loss_mlp": 0.01193153, "balance_loss_clip": 1.00314045, "balance_loss_mlp": 1.00012898, "epoch": 0.5838994769434257, "flos": 65995507457280.0, "grad_norm": 1.191320410689625, "language_loss": 0.65170479, "learning_rate": 1.5573452801258545e-06, "loss": 0.67611182, "num_input_tokens_seen": 104694605, "step": 4856, "time_per_iteration": 3.187858819961548 }, { "auxiliary_loss_clip": 0.01342647, "auxiliary_loss_mlp": 0.0119409, "balance_loss_clip": 1.00918269, "balance_loss_mlp": 1.00030315, "epoch": 0.5840197198340649, "flos": 21470534301600.0, "grad_norm": 1.8646236923641835, "language_loss": 0.63135636, "learning_rate": 1.5565856609053824e-06, "loss": 0.65672368, "num_input_tokens_seen": 104713400, "step": 4857, "time_per_iteration": 2.7059168815612793 }, { "auxiliary_loss_clip": 0.01353862, "auxiliary_loss_mlp": 0.01194004, "balance_loss_clip": 1.00921679, "balance_loss_mlp": 1.00021696, "epoch": 0.5841399627247039, "flos": 19135155003840.0, "grad_norm": 1.7277094216826656, "language_loss": 0.80685163, "learning_rate": 1.5558261089505127e-06, "loss": 0.83233023, "num_input_tokens_seen": 104732130, "step": 4858, "time_per_iteration": 2.6805078983306885 }, { "auxiliary_loss_clip": 0.01330478, "auxiliary_loss_mlp": 0.01194069, "balance_loss_clip": 1.00843298, "balance_loss_mlp": 1.00028253, "epoch": 0.584260205615343, "flos": 26425702769760.0, "grad_norm": 1.78055839433252, "language_loss": 0.79672933, "learning_rate": 1.5550666243764697e-06, "loss": 0.82197475, "num_input_tokens_seen": 104750290, "step": 4859, "time_per_iteration": 2.7897660732269287 }, { "auxiliary_loss_clip": 0.01330162, "auxiliary_loss_mlp": 0.01194114, "balance_loss_clip": 1.00807321, "balance_loss_mlp": 1.00032699, "epoch": 0.584380448505982, "flos": 13881817893600.0, "grad_norm": 1.8706093124047165, "language_loss": 0.77108502, "learning_rate": 1.554307207298465e-06, "loss": 0.79632771, "num_input_tokens_seen": 104768550, "step": 4860, "time_per_iteration": 2.789146900177002 }, { "auxiliary_loss_clip": 0.01354435, "auxiliary_loss_mlp": 0.01194065, "balance_loss_clip": 1.0088079, "balance_loss_mlp": 1.00027859, "epoch": 0.5845006913966212, "flos": 21543720343200.0, "grad_norm": 1.9294889603866523, "language_loss": 0.7835924, "learning_rate": 1.553547857831704e-06, "loss": 0.8090775, "num_input_tokens_seen": 104785060, "step": 4861, "time_per_iteration": 2.673522710800171 }, { "auxiliary_loss_clip": 0.01322797, "auxiliary_loss_mlp": 0.01193132, "balance_loss_clip": 1.00410056, "balance_loss_mlp": 1.0001086, "epoch": 0.5846209342872603, "flos": 58375478809440.0, "grad_norm": 0.8775763199761007, "language_loss": 0.64202839, "learning_rate": 1.5527885760913771e-06, "loss": 0.66718769, "num_input_tokens_seen": 104834950, "step": 4862, "time_per_iteration": 3.1157467365264893 }, { "auxiliary_loss_clip": 0.01316433, "auxiliary_loss_mlp": 0.01194078, "balance_loss_clip": 1.00806189, "balance_loss_mlp": 1.00029123, "epoch": 0.5847411771778993, "flos": 18588055875840.0, "grad_norm": 1.532161438233513, "language_loss": 0.76670229, "learning_rate": 1.552029362192668e-06, "loss": 0.79180741, "num_input_tokens_seen": 104854210, "step": 4863, "time_per_iteration": 2.7450294494628906 }, { "auxiliary_loss_clip": 0.01295362, "auxiliary_loss_mlp": 0.01194039, "balance_loss_clip": 1.00756133, "balance_loss_mlp": 1.00034797, "epoch": 0.5848614200685385, "flos": 24240790854720.0, "grad_norm": 1.70211099339287, "language_loss": 0.72657514, "learning_rate": 1.5512702162507478e-06, "loss": 0.75146914, "num_input_tokens_seen": 104874525, "step": 4864, "time_per_iteration": 2.837824821472168 }, { "auxiliary_loss_clip": 0.01290198, "auxiliary_loss_mlp": 0.01193122, "balance_loss_clip": 1.00423026, "balance_loss_mlp": 1.00009811, "epoch": 0.5849816629591775, "flos": 71660276868960.0, "grad_norm": 1.107753709358401, "language_loss": 0.55714232, "learning_rate": 1.5505111383807792e-06, "loss": 0.58197552, "num_input_tokens_seen": 104937195, "step": 4865, "time_per_iteration": 3.388772487640381 }, { "auxiliary_loss_clip": 0.01281463, "auxiliary_loss_mlp": 0.01194078, "balance_loss_clip": 1.00759208, "balance_loss_mlp": 1.00029159, "epoch": 0.5851019058498166, "flos": 23802105689280.0, "grad_norm": 1.6705455675205636, "language_loss": 0.80705088, "learning_rate": 1.5497521286979138e-06, "loss": 0.8318063, "num_input_tokens_seen": 104957435, "step": 4866, "time_per_iteration": 3.8006861209869385 }, { "auxiliary_loss_clip": 0.01298727, "auxiliary_loss_mlp": 0.0119409, "balance_loss_clip": 1.00795496, "balance_loss_mlp": 1.00030327, "epoch": 0.5852221487404557, "flos": 24388528037760.0, "grad_norm": 1.6852124546780862, "language_loss": 0.74218011, "learning_rate": 1.5489931873172927e-06, "loss": 0.76710832, "num_input_tokens_seen": 104978755, "step": 4867, "time_per_iteration": 3.8071608543395996 }, { "auxiliary_loss_clip": 0.0126991, "auxiliary_loss_mlp": 0.0119397, "balance_loss_clip": 1.00784957, "balance_loss_mlp": 1.00018299, "epoch": 0.5853423916310948, "flos": 27271437547680.0, "grad_norm": 1.6285422867448882, "language_loss": 0.78969461, "learning_rate": 1.5482343143540467e-06, "loss": 0.81433344, "num_input_tokens_seen": 105000020, "step": 4868, "time_per_iteration": 2.943540573120117 }, { "auxiliary_loss_clip": 0.01283204, "auxiliary_loss_mlp": 0.00872427, "balance_loss_clip": 1.00759196, "balance_loss_mlp": 1.00050008, "epoch": 0.5854626345217339, "flos": 11983794585120.0, "grad_norm": 2.1183164945838477, "language_loss": 0.82922149, "learning_rate": 1.547475509923295e-06, "loss": 0.85077775, "num_input_tokens_seen": 105017060, "step": 4869, "time_per_iteration": 3.7358875274658203 }, { "auxiliary_loss_clip": 0.01250683, "auxiliary_loss_mlp": 0.01193179, "balance_loss_clip": 1.00405669, "balance_loss_mlp": 1.00015569, "epoch": 0.585582877412373, "flos": 64342368468000.0, "grad_norm": 0.7262411024948655, "language_loss": 0.56054783, "learning_rate": 1.5467167741401495e-06, "loss": 0.58498645, "num_input_tokens_seen": 105078540, "step": 4870, "time_per_iteration": 3.3650102615356445 }, { "auxiliary_loss_clip": 0.01329076, "auxiliary_loss_mlp": 0.01194058, "balance_loss_clip": 1.00881648, "balance_loss_mlp": 1.00027168, "epoch": 0.5857031203030121, "flos": 17011933552800.0, "grad_norm": 2.690642985810729, "language_loss": 0.71216995, "learning_rate": 1.5459581071197083e-06, "loss": 0.73740131, "num_input_tokens_seen": 105094200, "step": 4871, "time_per_iteration": 3.7544875144958496 }, { "auxiliary_loss_clip": 0.01331135, "auxiliary_loss_mlp": 0.01194109, "balance_loss_clip": 1.0086776, "balance_loss_mlp": 1.0003221, "epoch": 0.5858233631936511, "flos": 20885692595040.0, "grad_norm": 2.252447152154863, "language_loss": 0.83301568, "learning_rate": 1.5451995089770624e-06, "loss": 0.85826808, "num_input_tokens_seen": 105113985, "step": 4872, "time_per_iteration": 2.7569644451141357 }, { "auxiliary_loss_clip": 0.01353459, "auxiliary_loss_mlp": 0.01193721, "balance_loss_clip": 1.00835776, "balance_loss_mlp": 1.00022101, "epoch": 0.5859436060842903, "flos": 23191915073760.0, "grad_norm": 1.4222619284585865, "language_loss": 0.71875173, "learning_rate": 1.5444409798272885e-06, "loss": 0.74422348, "num_input_tokens_seen": 105138075, "step": 4873, "time_per_iteration": 2.7986176013946533 }, { "auxiliary_loss_clip": 0.01301344, "auxiliary_loss_mlp": 0.01194086, "balance_loss_clip": 1.00776303, "balance_loss_mlp": 1.0002991, "epoch": 0.5860638489749294, "flos": 22492660150080.0, "grad_norm": 1.6794403373189026, "language_loss": 0.80609024, "learning_rate": 1.543682519785456e-06, "loss": 0.83104455, "num_input_tokens_seen": 105156555, "step": 4874, "time_per_iteration": 2.8863754272460938 }, { "auxiliary_loss_clip": 0.01324072, "auxiliary_loss_mlp": 0.01193941, "balance_loss_clip": 1.00826454, "balance_loss_mlp": 1.0002501, "epoch": 0.5861840918655684, "flos": 17566253340480.0, "grad_norm": 2.24896465733972, "language_loss": 0.80517924, "learning_rate": 1.5429241289666219e-06, "loss": 0.83035934, "num_input_tokens_seen": 105174055, "step": 4875, "time_per_iteration": 2.8541088104248047 }, { "auxiliary_loss_clip": 0.0131991, "auxiliary_loss_mlp": 0.0119403, "balance_loss_clip": 1.00821829, "balance_loss_mlp": 1.00024319, "epoch": 0.5863043347562076, "flos": 25556163801120.0, "grad_norm": 2.0974978158118525, "language_loss": 0.69657272, "learning_rate": 1.5421658074858342e-06, "loss": 0.72171211, "num_input_tokens_seen": 105192160, "step": 4876, "time_per_iteration": 2.8211398124694824 }, { "auxiliary_loss_clip": 0.01319343, "auxiliary_loss_mlp": 0.0119399, "balance_loss_clip": 1.00890601, "balance_loss_mlp": 1.00029826, "epoch": 0.5864245776468466, "flos": 20667535493760.0, "grad_norm": 2.2475662929494593, "language_loss": 0.66405404, "learning_rate": 1.5414075554581298e-06, "loss": 0.68918735, "num_input_tokens_seen": 105210205, "step": 4877, "time_per_iteration": 2.8852486610412598 }, { "auxiliary_loss_clip": 0.01354676, "auxiliary_loss_mlp": 0.01194062, "balance_loss_clip": 1.00852728, "balance_loss_mlp": 1.00027585, "epoch": 0.5865448205374857, "flos": 28913920413120.0, "grad_norm": 2.819130682436931, "language_loss": 0.78571123, "learning_rate": 1.5406493729985348e-06, "loss": 0.81119859, "num_input_tokens_seen": 105229400, "step": 4878, "time_per_iteration": 2.93577241897583 }, { "auxiliary_loss_clip": 0.01263798, "auxiliary_loss_mlp": 0.00872499, "balance_loss_clip": 1.00764644, "balance_loss_mlp": 1.00041676, "epoch": 0.5866650634281249, "flos": 25842585399840.0, "grad_norm": 2.016995193395331, "language_loss": 0.72419691, "learning_rate": 1.5398912602220644e-06, "loss": 0.74555993, "num_input_tokens_seen": 105248675, "step": 4879, "time_per_iteration": 2.8957433700561523 }, { "auxiliary_loss_clip": 0.01279906, "auxiliary_loss_mlp": 0.01194026, "balance_loss_clip": 1.00817382, "balance_loss_mlp": 1.00023961, "epoch": 0.5867853063187639, "flos": 17052334483680.0, "grad_norm": 1.9211554830725446, "language_loss": 0.79164755, "learning_rate": 1.539133217243724e-06, "loss": 0.81638688, "num_input_tokens_seen": 105265695, "step": 4880, "time_per_iteration": 2.8568685054779053 }, { "auxiliary_loss_clip": 0.01299939, "auxiliary_loss_mlp": 0.01194078, "balance_loss_clip": 1.00796294, "balance_loss_mlp": 1.00029182, "epoch": 0.586905549209403, "flos": 24645038420160.0, "grad_norm": 2.20591602040767, "language_loss": 0.7595498, "learning_rate": 1.5383752441785081e-06, "loss": 0.78448999, "num_input_tokens_seen": 105284920, "step": 4881, "time_per_iteration": 2.8295211791992188 }, { "auxiliary_loss_clip": 0.01341941, "auxiliary_loss_mlp": 0.01194099, "balance_loss_clip": 1.00919855, "balance_loss_mlp": 1.00031209, "epoch": 0.5870257921000421, "flos": 14720547553920.0, "grad_norm": 2.2331683097808273, "language_loss": 0.85703933, "learning_rate": 1.5376173411414003e-06, "loss": 0.88239968, "num_input_tokens_seen": 105302960, "step": 4882, "time_per_iteration": 2.669562816619873 }, { "auxiliary_loss_clip": 0.01329335, "auxiliary_loss_mlp": 0.01194026, "balance_loss_clip": 1.00874174, "balance_loss_mlp": 1.0002389, "epoch": 0.5871460349906812, "flos": 23915010111840.0, "grad_norm": 1.8074800252766512, "language_loss": 0.78938192, "learning_rate": 1.5368595082473753e-06, "loss": 0.81461549, "num_input_tokens_seen": 105321260, "step": 4883, "time_per_iteration": 2.8093996047973633 }, { "auxiliary_loss_clip": 0.01339626, "auxiliary_loss_mlp": 0.01194078, "balance_loss_clip": 1.00826168, "balance_loss_mlp": 1.00029159, "epoch": 0.5872662778813202, "flos": 22164185131200.0, "grad_norm": 2.264930511146097, "language_loss": 0.77948779, "learning_rate": 1.5361017456113935e-06, "loss": 0.80482483, "num_input_tokens_seen": 105341610, "step": 4884, "time_per_iteration": 2.7525367736816406 }, { "auxiliary_loss_clip": 0.01341064, "auxiliary_loss_mlp": 0.01193993, "balance_loss_clip": 1.00843298, "balance_loss_mlp": 1.00020683, "epoch": 0.5873865207719594, "flos": 18441935258400.0, "grad_norm": 1.926875758936225, "language_loss": 0.85742867, "learning_rate": 1.5353440533484085e-06, "loss": 0.8827793, "num_input_tokens_seen": 105360465, "step": 4885, "time_per_iteration": 2.7701759338378906 }, { "auxiliary_loss_clip": 0.01316289, "auxiliary_loss_mlp": 0.01194123, "balance_loss_clip": 1.00771546, "balance_loss_mlp": 1.00033605, "epoch": 0.5875067636625985, "flos": 54015341163840.0, "grad_norm": 2.074151766548895, "language_loss": 0.65894985, "learning_rate": 1.534586431573361e-06, "loss": 0.68405402, "num_input_tokens_seen": 105385405, "step": 4886, "time_per_iteration": 3.1227567195892334 }, { "auxiliary_loss_clip": 0.01271044, "auxiliary_loss_mlp": 0.01194183, "balance_loss_clip": 1.00775218, "balance_loss_mlp": 1.00030077, "epoch": 0.5876270065532375, "flos": 27995718067200.0, "grad_norm": 1.8155311598169492, "language_loss": 0.7913276, "learning_rate": 1.5338288804011817e-06, "loss": 0.81597984, "num_input_tokens_seen": 105404905, "step": 4887, "time_per_iteration": 2.895209550857544 }, { "auxiliary_loss_clip": 0.01329218, "auxiliary_loss_mlp": 0.01194074, "balance_loss_clip": 1.00820374, "balance_loss_mlp": 1.00028694, "epoch": 0.5877472494438767, "flos": 21361473712800.0, "grad_norm": 1.873347325227204, "language_loss": 0.70893443, "learning_rate": 1.533071399946791e-06, "loss": 0.73416734, "num_input_tokens_seen": 105423650, "step": 4888, "time_per_iteration": 2.9328854084014893 }, { "auxiliary_loss_clip": 0.01304179, "auxiliary_loss_mlp": 0.0119409, "balance_loss_clip": 1.00824845, "balance_loss_mlp": 1.00030351, "epoch": 0.5878674923345157, "flos": 22383024782400.0, "grad_norm": 1.9877479822914492, "language_loss": 0.5780763, "learning_rate": 1.5323139903250977e-06, "loss": 0.60305905, "num_input_tokens_seen": 105444255, "step": 4889, "time_per_iteration": 2.8139030933380127 }, { "auxiliary_loss_clip": 0.0131586, "auxiliary_loss_mlp": 0.01194118, "balance_loss_clip": 1.00807941, "balance_loss_mlp": 1.0003314, "epoch": 0.5879877352251548, "flos": 21868674841440.0, "grad_norm": 1.4800556540208767, "language_loss": 0.76891565, "learning_rate": 1.5315566516510002e-06, "loss": 0.79401541, "num_input_tokens_seen": 105462425, "step": 4890, "time_per_iteration": 2.764843702316284 }, { "auxiliary_loss_clip": 0.01353541, "auxiliary_loss_mlp": 0.01193917, "balance_loss_clip": 1.00863743, "balance_loss_mlp": 1.0002259, "epoch": 0.5881079781157939, "flos": 17493821696160.0, "grad_norm": 1.5975691775311258, "language_loss": 0.67668349, "learning_rate": 1.5307993840393857e-06, "loss": 0.70215809, "num_input_tokens_seen": 105480505, "step": 4891, "time_per_iteration": 2.7041685581207275 }, { "auxiliary_loss_clip": 0.0135408, "auxiliary_loss_mlp": 0.01193899, "balance_loss_clip": 1.00812578, "balance_loss_mlp": 1.00030363, "epoch": 0.588228221006433, "flos": 22601864433600.0, "grad_norm": 1.7254971652346913, "language_loss": 0.80464792, "learning_rate": 1.530042187605132e-06, "loss": 0.83012772, "num_input_tokens_seen": 105499760, "step": 4892, "time_per_iteration": 3.682729721069336 }, { "auxiliary_loss_clip": 0.0133504, "auxiliary_loss_mlp": 0.00872334, "balance_loss_clip": 1.00784981, "balance_loss_mlp": 1.0004462, "epoch": 0.5883484638970721, "flos": 26176951902240.0, "grad_norm": 1.3578273283904398, "language_loss": 0.84337342, "learning_rate": 1.5292850624631044e-06, "loss": 0.8654471, "num_input_tokens_seen": 105521955, "step": 4893, "time_per_iteration": 3.778362274169922 }, { "auxiliary_loss_clip": 0.01327819, "auxiliary_loss_mlp": 0.01193987, "balance_loss_clip": 1.00834095, "balance_loss_mlp": 1.00029564, "epoch": 0.5884687067877111, "flos": 30443750321760.0, "grad_norm": 1.850057536736765, "language_loss": 0.80428177, "learning_rate": 1.5285280087281593e-06, "loss": 0.82949984, "num_input_tokens_seen": 105542685, "step": 4894, "time_per_iteration": 2.899012565612793 }, { "auxiliary_loss_clip": 0.01293685, "auxiliary_loss_mlp": 0.01193081, "balance_loss_clip": 1.00430369, "balance_loss_mlp": 1.0000571, "epoch": 0.5885889496783503, "flos": 70507585356480.0, "grad_norm": 0.6371782063664903, "language_loss": 0.56603599, "learning_rate": 1.5277710265151398e-06, "loss": 0.59090364, "num_input_tokens_seen": 105612165, "step": 4895, "time_per_iteration": 4.432432174682617 }, { "auxiliary_loss_clip": 0.01343081, "auxiliary_loss_mlp": 0.01194035, "balance_loss_clip": 1.00921988, "balance_loss_mlp": 1.00024796, "epoch": 0.5887091925689893, "flos": 19098777525120.0, "grad_norm": 3.7173564629167015, "language_loss": 0.77302086, "learning_rate": 1.5270141159388803e-06, "loss": 0.798392, "num_input_tokens_seen": 105629185, "step": 4896, "time_per_iteration": 2.7925117015838623 }, { "auxiliary_loss_clip": 0.01352674, "auxiliary_loss_mlp": 0.01193986, "balance_loss_clip": 1.00797355, "balance_loss_mlp": 1.00019896, "epoch": 0.5888294354596284, "flos": 23294293858080.0, "grad_norm": 1.5312859791642217, "language_loss": 0.80402637, "learning_rate": 1.526257277114203e-06, "loss": 0.82949293, "num_input_tokens_seen": 105650260, "step": 4897, "time_per_iteration": 3.602114200592041 }, { "auxiliary_loss_clip": 0.01305692, "auxiliary_loss_mlp": 0.01193949, "balance_loss_clip": 1.00739408, "balance_loss_mlp": 1.00025821, "epoch": 0.5889496783502676, "flos": 21981543340320.0, "grad_norm": 1.9628443790375703, "language_loss": 0.79558378, "learning_rate": 1.5255005101559201e-06, "loss": 0.82058018, "num_input_tokens_seen": 105667870, "step": 4898, "time_per_iteration": 2.7717349529266357 }, { "auxiliary_loss_clip": 0.01325718, "auxiliary_loss_mlp": 0.01194045, "balance_loss_clip": 1.00837696, "balance_loss_mlp": 1.00025845, "epoch": 0.5890699212409066, "flos": 21685242729600.0, "grad_norm": 2.124526277643593, "language_loss": 0.76604795, "learning_rate": 1.524743815178833e-06, "loss": 0.79124558, "num_input_tokens_seen": 105685830, "step": 4899, "time_per_iteration": 2.892598867416382 }, { "auxiliary_loss_clip": 0.01322574, "auxiliary_loss_mlp": 0.0119398, "balance_loss_clip": 1.00797045, "balance_loss_mlp": 1.00028861, "epoch": 0.5891901641315457, "flos": 19464564038400.0, "grad_norm": 1.7020442273496286, "language_loss": 0.80395782, "learning_rate": 1.5239871922977315e-06, "loss": 0.82912338, "num_input_tokens_seen": 105705745, "step": 4900, "time_per_iteration": 2.836977243423462 }, { "auxiliary_loss_clip": 0.01328431, "auxiliary_loss_mlp": 0.0119402, "balance_loss_clip": 1.00873327, "balance_loss_mlp": 1.00023353, "epoch": 0.5893104070221848, "flos": 19609894334880.0, "grad_norm": 1.7568073706467204, "language_loss": 0.89418304, "learning_rate": 1.523230641627394e-06, "loss": 0.91940755, "num_input_tokens_seen": 105724730, "step": 4901, "time_per_iteration": 2.8230788707733154 }, { "auxiliary_loss_clip": 0.01295137, "auxiliary_loss_mlp": 0.01194038, "balance_loss_clip": 1.00833845, "balance_loss_mlp": 1.00025177, "epoch": 0.5894306499128239, "flos": 29060076954240.0, "grad_norm": 1.897108732514862, "language_loss": 0.72525918, "learning_rate": 1.5224741632825888e-06, "loss": 0.75015092, "num_input_tokens_seen": 105744920, "step": 4902, "time_per_iteration": 2.9286718368530273 }, { "auxiliary_loss_clip": 0.01354215, "auxiliary_loss_mlp": 0.01194027, "balance_loss_clip": 1.00881565, "balance_loss_mlp": 1.00023997, "epoch": 0.589550892803463, "flos": 42298905912480.0, "grad_norm": 1.7395198098065257, "language_loss": 0.69272363, "learning_rate": 1.521717757378074e-06, "loss": 0.71820605, "num_input_tokens_seen": 105765465, "step": 4903, "time_per_iteration": 2.8960444927215576 }, { "auxiliary_loss_clip": 0.01336846, "auxiliary_loss_mlp": 0.01194108, "balance_loss_clip": 1.0082283, "balance_loss_mlp": 1.00032187, "epoch": 0.5896711356941021, "flos": 14137070947200.0, "grad_norm": 2.5935928923812694, "language_loss": 0.69026124, "learning_rate": 1.5209614240285943e-06, "loss": 0.71557081, "num_input_tokens_seen": 105783120, "step": 4904, "time_per_iteration": 2.7355265617370605 }, { "auxiliary_loss_clip": 0.01353556, "auxiliary_loss_mlp": 0.00872407, "balance_loss_clip": 1.00884295, "balance_loss_mlp": 1.00044298, "epoch": 0.5897913785847412, "flos": 17201364919200.0, "grad_norm": 4.117081754724969, "language_loss": 0.85026795, "learning_rate": 1.520205163348887e-06, "loss": 0.8725276, "num_input_tokens_seen": 105801055, "step": 4905, "time_per_iteration": 2.6988112926483154 }, { "auxiliary_loss_clip": 0.01285763, "auxiliary_loss_mlp": 0.01193147, "balance_loss_clip": 1.00459671, "balance_loss_mlp": 1.0001229, "epoch": 0.5899116214753802, "flos": 48794191670880.0, "grad_norm": 0.7197619668953077, "language_loss": 0.56951696, "learning_rate": 1.519448975453674e-06, "loss": 0.59430605, "num_input_tokens_seen": 105856155, "step": 4906, "time_per_iteration": 3.2141640186309814 }, { "auxiliary_loss_clip": 0.01332509, "auxiliary_loss_mlp": 0.0087245, "balance_loss_clip": 1.00879705, "balance_loss_mlp": 1.0004257, "epoch": 0.5900318643660194, "flos": 21103670077920.0, "grad_norm": 1.94462042550718, "language_loss": 0.75806069, "learning_rate": 1.5186928604576696e-06, "loss": 0.78011024, "num_input_tokens_seen": 105873350, "step": 4907, "time_per_iteration": 2.7352092266082764 }, { "auxiliary_loss_clip": 0.01322573, "auxiliary_loss_mlp": 0.01193986, "balance_loss_clip": 1.00826764, "balance_loss_mlp": 1.00029421, "epoch": 0.5901521072566585, "flos": 21178400837760.0, "grad_norm": 1.9487280676541912, "language_loss": 0.76742077, "learning_rate": 1.5179368184755752e-06, "loss": 0.79258633, "num_input_tokens_seen": 105891435, "step": 4908, "time_per_iteration": 2.7820181846618652 }, { "auxiliary_loss_clip": 0.01310841, "auxiliary_loss_mlp": 0.01193978, "balance_loss_clip": 1.00763106, "balance_loss_mlp": 1.00019157, "epoch": 0.5902723501472975, "flos": 20225976433920.0, "grad_norm": 1.6139398403436787, "language_loss": 0.82608128, "learning_rate": 1.5171808496220821e-06, "loss": 0.85112947, "num_input_tokens_seen": 105910190, "step": 4909, "time_per_iteration": 2.7878096103668213 }, { "auxiliary_loss_clip": 0.01323556, "auxiliary_loss_mlp": 0.01194062, "balance_loss_clip": 1.00811958, "balance_loss_mlp": 1.00027561, "epoch": 0.5903925930379367, "flos": 22964417815680.0, "grad_norm": 1.5802754098704483, "language_loss": 0.81017089, "learning_rate": 1.5164249540118708e-06, "loss": 0.83534706, "num_input_tokens_seen": 105929315, "step": 4910, "time_per_iteration": 3.10772705078125 }, { "auxiliary_loss_clip": 0.0126588, "auxiliary_loss_mlp": 0.0119405, "balance_loss_clip": 1.00730479, "balance_loss_mlp": 1.00026298, "epoch": 0.5905128359285757, "flos": 23367731365440.0, "grad_norm": 1.5581946170257601, "language_loss": 0.83176792, "learning_rate": 1.5156691317596093e-06, "loss": 0.85636723, "num_input_tokens_seen": 105950740, "step": 4911, "time_per_iteration": 2.966357707977295 }, { "auxiliary_loss_clip": 0.01341688, "auxiliary_loss_mlp": 0.00872406, "balance_loss_clip": 1.00886774, "balance_loss_mlp": 1.00051069, "epoch": 0.5906330788192148, "flos": 28032347011680.0, "grad_norm": 1.9510748328883138, "language_loss": 0.66810113, "learning_rate": 1.5149133829799556e-06, "loss": 0.69024205, "num_input_tokens_seen": 105968735, "step": 4912, "time_per_iteration": 2.7675490379333496 }, { "auxiliary_loss_clip": 0.01314868, "auxiliary_loss_mlp": 0.01194007, "balance_loss_clip": 1.00900984, "balance_loss_mlp": 1.00022018, "epoch": 0.590753321709854, "flos": 18477953500320.0, "grad_norm": 1.899432443290379, "language_loss": 0.80464095, "learning_rate": 1.5141577077875556e-06, "loss": 0.82972968, "num_input_tokens_seen": 105986060, "step": 4913, "time_per_iteration": 2.740469455718994 }, { "auxiliary_loss_clip": 0.01341579, "auxiliary_loss_mlp": 0.01194038, "balance_loss_clip": 1.00929379, "balance_loss_mlp": 1.00025177, "epoch": 0.590873564600493, "flos": 16873716144960.0, "grad_norm": 1.8547256283742923, "language_loss": 0.72434521, "learning_rate": 1.5134021062970451e-06, "loss": 0.74970138, "num_input_tokens_seen": 106004440, "step": 4914, "time_per_iteration": 2.711934804916382 }, { "auxiliary_loss_clip": 0.01292248, "auxiliary_loss_mlp": 0.01194019, "balance_loss_clip": 1.00872254, "balance_loss_mlp": 1.0002327, "epoch": 0.5909938074911321, "flos": 13516175075040.0, "grad_norm": 1.8018797457352547, "language_loss": 0.80783349, "learning_rate": 1.5126465786230483e-06, "loss": 0.8326962, "num_input_tokens_seen": 106021215, "step": 4915, "time_per_iteration": 2.8139803409576416 }, { "auxiliary_loss_clip": 0.01352957, "auxiliary_loss_mlp": 0.01194077, "balance_loss_clip": 1.00856233, "balance_loss_mlp": 1.00038576, "epoch": 0.5911140503817712, "flos": 26024077632960.0, "grad_norm": 1.877279116972281, "language_loss": 0.82190126, "learning_rate": 1.5118911248801787e-06, "loss": 0.84737158, "num_input_tokens_seen": 106039225, "step": 4916, "time_per_iteration": 2.7100212574005127 }, { "auxiliary_loss_clip": 0.01328577, "auxiliary_loss_mlp": 0.01193996, "balance_loss_clip": 1.00796938, "balance_loss_mlp": 1.00020909, "epoch": 0.5912342932724103, "flos": 23258742624000.0, "grad_norm": 1.9568735052143562, "language_loss": 0.79553556, "learning_rate": 1.5111357451830364e-06, "loss": 0.82076126, "num_input_tokens_seen": 106057920, "step": 4917, "time_per_iteration": 2.777730703353882 }, { "auxiliary_loss_clip": 0.01338953, "auxiliary_loss_mlp": 0.01193958, "balance_loss_clip": 1.0084368, "balance_loss_mlp": 1.00026715, "epoch": 0.5913545361630493, "flos": 19573049848320.0, "grad_norm": 2.1169347961654554, "language_loss": 0.71156597, "learning_rate": 1.5103804396462131e-06, "loss": 0.73689508, "num_input_tokens_seen": 106077855, "step": 4918, "time_per_iteration": 3.6524925231933594 }, { "auxiliary_loss_clip": 0.01342107, "auxiliary_loss_mlp": 0.01194113, "balance_loss_clip": 1.00870466, "balance_loss_mlp": 1.00032675, "epoch": 0.5914747790536885, "flos": 26213544923040.0, "grad_norm": 2.0648493754024817, "language_loss": 0.80301899, "learning_rate": 1.5096252083842877e-06, "loss": 0.82838118, "num_input_tokens_seen": 106097065, "step": 4919, "time_per_iteration": 3.764249086380005 }, { "auxiliary_loss_clip": 0.01342191, "auxiliary_loss_mlp": 0.01194088, "balance_loss_clip": 1.00866222, "balance_loss_mlp": 1.00030088, "epoch": 0.5915950219443276, "flos": 27417558165120.0, "grad_norm": 1.6956286900406063, "language_loss": 0.85489833, "learning_rate": 1.5088700515118285e-06, "loss": 0.88026118, "num_input_tokens_seen": 106116385, "step": 4920, "time_per_iteration": 2.8047077655792236 }, { "auxiliary_loss_clip": 0.01299855, "auxiliary_loss_mlp": 0.01193982, "balance_loss_clip": 1.0078665, "balance_loss_mlp": 1.00029111, "epoch": 0.5917152648349666, "flos": 21907890290880.0, "grad_norm": 1.572603331957203, "language_loss": 0.66627663, "learning_rate": 1.508114969143392e-06, "loss": 0.69121504, "num_input_tokens_seen": 106136370, "step": 4921, "time_per_iteration": 3.954040288925171 }, { "auxiliary_loss_clip": 0.01327583, "auxiliary_loss_mlp": 0.01193887, "balance_loss_clip": 1.00813174, "balance_loss_mlp": 1.00019526, "epoch": 0.5918355077256057, "flos": 28109197268640.0, "grad_norm": 1.4281535944465018, "language_loss": 0.77740109, "learning_rate": 1.5073599613935238e-06, "loss": 0.80261582, "num_input_tokens_seen": 106158490, "step": 4922, "time_per_iteration": 2.880490303039551 }, { "auxiliary_loss_clip": 0.01317915, "auxiliary_loss_mlp": 0.01193987, "balance_loss_clip": 1.00809872, "balance_loss_mlp": 1.00029612, "epoch": 0.5919557506162448, "flos": 28183820257440.0, "grad_norm": 1.7675218151724077, "language_loss": 0.5731585, "learning_rate": 1.5066050283767574e-06, "loss": 0.59827745, "num_input_tokens_seen": 106179170, "step": 4923, "time_per_iteration": 2.837491750717163 }, { "auxiliary_loss_clip": 0.01314814, "auxiliary_loss_mlp": 0.01194049, "balance_loss_clip": 1.00902963, "balance_loss_mlp": 1.00026274, "epoch": 0.5920759935068839, "flos": 12094363968480.0, "grad_norm": 1.8064069089865527, "language_loss": 0.82894206, "learning_rate": 1.505850170207616e-06, "loss": 0.85403073, "num_input_tokens_seen": 106196035, "step": 4924, "time_per_iteration": 3.679241180419922 }, { "auxiliary_loss_clip": 0.01316423, "auxiliary_loss_mlp": 0.0119399, "balance_loss_clip": 1.00762367, "balance_loss_mlp": 1.00029838, "epoch": 0.592196236397523, "flos": 29424785757120.0, "grad_norm": 1.910032202020166, "language_loss": 0.78012097, "learning_rate": 1.505095387000611e-06, "loss": 0.80522513, "num_input_tokens_seen": 106218335, "step": 4925, "time_per_iteration": 2.801513910293579 }, { "auxiliary_loss_clip": 0.01317214, "auxiliary_loss_mlp": 0.01193976, "balance_loss_clip": 1.00812244, "balance_loss_mlp": 1.00028467, "epoch": 0.5923164792881621, "flos": 24384720127680.0, "grad_norm": 1.8150518901367316, "language_loss": 0.73894835, "learning_rate": 1.504340678870242e-06, "loss": 0.76406026, "num_input_tokens_seen": 106236550, "step": 4926, "time_per_iteration": 2.8048253059387207 }, { "auxiliary_loss_clip": 0.0132865, "auxiliary_loss_mlp": 0.01194095, "balance_loss_clip": 1.00780582, "balance_loss_mlp": 1.00030828, "epoch": 0.5924367221788012, "flos": 24024250319040.0, "grad_norm": 2.0480666313916744, "language_loss": 0.89686072, "learning_rate": 1.5035860459309989e-06, "loss": 0.92208815, "num_input_tokens_seen": 106254265, "step": 4927, "time_per_iteration": 2.684485673904419 }, { "auxiliary_loss_clip": 0.01316513, "auxiliary_loss_mlp": 0.01194019, "balance_loss_clip": 1.00843835, "balance_loss_mlp": 1.0002327, "epoch": 0.5925569650694402, "flos": 26870602731840.0, "grad_norm": 1.7631782015109825, "language_loss": 0.63810635, "learning_rate": 1.5028314882973568e-06, "loss": 0.6632117, "num_input_tokens_seen": 106274670, "step": 4928, "time_per_iteration": 2.847378730773926 }, { "auxiliary_loss_clip": 0.01311049, "auxiliary_loss_mlp": 0.01194076, "balance_loss_clip": 1.00781393, "balance_loss_mlp": 1.00028908, "epoch": 0.5926772079600794, "flos": 22302797699520.0, "grad_norm": 2.072414382620941, "language_loss": 0.84358096, "learning_rate": 1.502077006083783e-06, "loss": 0.8686322, "num_input_tokens_seen": 106293330, "step": 4929, "time_per_iteration": 2.809885263442993 }, { "auxiliary_loss_clip": 0.01326856, "auxiliary_loss_mlp": 0.00872468, "balance_loss_clip": 1.00887823, "balance_loss_mlp": 1.00043762, "epoch": 0.5927974508507184, "flos": 19865255159520.0, "grad_norm": 1.7571101318744085, "language_loss": 0.7680943, "learning_rate": 1.5013225994047315e-06, "loss": 0.79008758, "num_input_tokens_seen": 106310960, "step": 4930, "time_per_iteration": 2.7190334796905518 }, { "auxiliary_loss_clip": 0.0133599, "auxiliary_loss_mlp": 0.00872447, "balance_loss_clip": 1.00846815, "balance_loss_mlp": 1.00034058, "epoch": 0.5929176937413575, "flos": 15776751765600.0, "grad_norm": 1.6747354842129714, "language_loss": 0.80733556, "learning_rate": 1.5005682683746452e-06, "loss": 0.82941991, "num_input_tokens_seen": 106329475, "step": 4931, "time_per_iteration": 2.8237838745117188 }, { "auxiliary_loss_clip": 0.01330541, "auxiliary_loss_mlp": 0.01193924, "balance_loss_clip": 1.00839698, "balance_loss_mlp": 1.0003283, "epoch": 0.5930379366319967, "flos": 17601481261440.0, "grad_norm": 2.2070154784606957, "language_loss": 0.72173887, "learning_rate": 1.4998140131079553e-06, "loss": 0.74698353, "num_input_tokens_seen": 106345565, "step": 4932, "time_per_iteration": 2.727531909942627 }, { "auxiliary_loss_clip": 0.01259077, "auxiliary_loss_mlp": 0.00872479, "balance_loss_clip": 1.00755799, "balance_loss_mlp": 1.00038397, "epoch": 0.5931581795226357, "flos": 17704291129920.0, "grad_norm": 1.654882277173948, "language_loss": 0.73120487, "learning_rate": 1.4990598337190821e-06, "loss": 0.75252044, "num_input_tokens_seen": 106361920, "step": 4933, "time_per_iteration": 2.8391590118408203 }, { "auxiliary_loss_clip": 0.01353829, "auxiliary_loss_mlp": 0.00872533, "balance_loss_clip": 1.00866294, "balance_loss_mlp": 1.0004127, "epoch": 0.5932784224132748, "flos": 24280114075200.0, "grad_norm": 1.628242187473401, "language_loss": 0.67551941, "learning_rate": 1.4983057303224338e-06, "loss": 0.69778299, "num_input_tokens_seen": 106381735, "step": 4934, "time_per_iteration": 2.714588165283203 }, { "auxiliary_loss_clip": 0.01280819, "auxiliary_loss_mlp": 0.01194066, "balance_loss_clip": 1.00728428, "balance_loss_mlp": 1.00027955, "epoch": 0.5933986653039139, "flos": 22926711160800.0, "grad_norm": 1.5803865657907155, "language_loss": 0.8758589, "learning_rate": 1.4975517030324072e-06, "loss": 0.90060771, "num_input_tokens_seen": 106399745, "step": 4935, "time_per_iteration": 2.8301100730895996 }, { "auxiliary_loss_clip": 0.01322556, "auxiliary_loss_mlp": 0.00871802, "balance_loss_clip": 1.00480592, "balance_loss_mlp": 0.99994814, "epoch": 0.593518908194553, "flos": 71121763500480.0, "grad_norm": 0.7863274676231645, "language_loss": 0.61857605, "learning_rate": 1.4967977519633882e-06, "loss": 0.64051962, "num_input_tokens_seen": 106457205, "step": 4936, "time_per_iteration": 3.3689801692962646 }, { "auxiliary_loss_clip": 0.01298392, "auxiliary_loss_mlp": 0.01193931, "balance_loss_clip": 1.00764847, "balance_loss_mlp": 1.00023937, "epoch": 0.593639151085192, "flos": 20448659918880.0, "grad_norm": 1.9654297851584817, "language_loss": 0.77919042, "learning_rate": 1.4960438772297494e-06, "loss": 0.80411363, "num_input_tokens_seen": 106474250, "step": 4937, "time_per_iteration": 2.7619314193725586 }, { "auxiliary_loss_clip": 0.01329826, "auxiliary_loss_mlp": 0.0119426, "balance_loss_clip": 1.00843883, "balance_loss_mlp": 1.00037837, "epoch": 0.5937593939758312, "flos": 30883441350240.0, "grad_norm": 2.25386093104118, "language_loss": 0.73624253, "learning_rate": 1.495290078945855e-06, "loss": 0.76148343, "num_input_tokens_seen": 106494015, "step": 4938, "time_per_iteration": 2.8336262702941895 }, { "auxiliary_loss_clip": 0.01352612, "auxiliary_loss_mlp": 0.01194051, "balance_loss_clip": 1.0086441, "balance_loss_mlp": 1.00036001, "epoch": 0.5938796368664703, "flos": 36898083084960.0, "grad_norm": 1.8386219872392993, "language_loss": 0.74335951, "learning_rate": 1.4945363572260529e-06, "loss": 0.76882613, "num_input_tokens_seen": 106515010, "step": 4939, "time_per_iteration": 2.8001394271850586 }, { "auxiliary_loss_clip": 0.01331309, "auxiliary_loss_mlp": 0.01193911, "balance_loss_clip": 1.00787473, "balance_loss_mlp": 1.00021946, "epoch": 0.5939998797571093, "flos": 23842937704320.0, "grad_norm": 2.0795416296246207, "language_loss": 0.67978704, "learning_rate": 1.4937827121846845e-06, "loss": 0.7050392, "num_input_tokens_seen": 106535265, "step": 4940, "time_per_iteration": 2.7623798847198486 }, { "auxiliary_loss_clip": 0.01291711, "auxiliary_loss_mlp": 0.01194054, "balance_loss_clip": 1.00778842, "balance_loss_mlp": 1.00026691, "epoch": 0.5941201226477485, "flos": 25191419074560.0, "grad_norm": 1.7358770643439239, "language_loss": 0.73351347, "learning_rate": 1.4930291439360755e-06, "loss": 0.75837111, "num_input_tokens_seen": 106557830, "step": 4941, "time_per_iteration": 2.787485122680664 }, { "auxiliary_loss_clip": 0.01330299, "auxiliary_loss_mlp": 0.01194052, "balance_loss_clip": 1.00793839, "balance_loss_mlp": 1.00026512, "epoch": 0.5942403655383875, "flos": 22419007100640.0, "grad_norm": 1.9756618224968558, "language_loss": 0.79180092, "learning_rate": 1.4922756525945427e-06, "loss": 0.81704444, "num_input_tokens_seen": 106577140, "step": 4942, "time_per_iteration": 2.7484381198883057 }, { "auxiliary_loss_clip": 0.01294853, "auxiliary_loss_mlp": 0.01193131, "balance_loss_clip": 1.00502145, "balance_loss_mlp": 1.00010705, "epoch": 0.5943606084290266, "flos": 67629345924960.0, "grad_norm": 0.7670257829380364, "language_loss": 0.59591895, "learning_rate": 1.4915222382743894e-06, "loss": 0.62079877, "num_input_tokens_seen": 106635975, "step": 4943, "time_per_iteration": 3.3505725860595703 }, { "auxiliary_loss_clip": 0.01329426, "auxiliary_loss_mlp": 0.01193983, "balance_loss_clip": 1.00845778, "balance_loss_mlp": 1.00038743, "epoch": 0.5944808513196658, "flos": 18223167454560.0, "grad_norm": 2.3215024688460737, "language_loss": 0.72074777, "learning_rate": 1.4907689010899085e-06, "loss": 0.74598187, "num_input_tokens_seen": 106653555, "step": 4944, "time_per_iteration": 2.6943423748016357 }, { "auxiliary_loss_clip": 0.01309355, "auxiliary_loss_mlp": 0.01193982, "balance_loss_clip": 1.00708365, "balance_loss_mlp": 1.00019503, "epoch": 0.5946010942103048, "flos": 24790835724480.0, "grad_norm": 2.0263450521503934, "language_loss": 0.61898541, "learning_rate": 1.4900156411553804e-06, "loss": 0.64401877, "num_input_tokens_seen": 106673385, "step": 4945, "time_per_iteration": 3.8254799842834473 }, { "auxiliary_loss_clip": 0.01318153, "auxiliary_loss_mlp": 0.01194038, "balance_loss_clip": 1.00809264, "balance_loss_mlp": 1.00025129, "epoch": 0.5947213371009439, "flos": 15231628440000.0, "grad_norm": 1.9997653820346941, "language_loss": 0.85191739, "learning_rate": 1.4892624585850739e-06, "loss": 0.87703925, "num_input_tokens_seen": 106691740, "step": 4946, "time_per_iteration": 2.8123037815093994 }, { "auxiliary_loss_clip": 0.01354097, "auxiliary_loss_mlp": 0.01194035, "balance_loss_clip": 1.00926435, "balance_loss_mlp": 1.00024796, "epoch": 0.594841579991583, "flos": 25848081722880.0, "grad_norm": 2.487892635174045, "language_loss": 0.79820406, "learning_rate": 1.4885093534932465e-06, "loss": 0.82368541, "num_input_tokens_seen": 106709705, "step": 4947, "time_per_iteration": 3.602156639099121 }, { "auxiliary_loss_clip": 0.01306148, "auxiliary_loss_mlp": 0.0119412, "balance_loss_clip": 1.00804305, "balance_loss_mlp": 1.00033331, "epoch": 0.5949618228822221, "flos": 23981119188480.0, "grad_norm": 2.410576985297702, "language_loss": 0.71128452, "learning_rate": 1.4877563259941433e-06, "loss": 0.73628712, "num_input_tokens_seen": 106727560, "step": 4948, "time_per_iteration": 2.7838447093963623 }, { "auxiliary_loss_clip": 0.01341499, "auxiliary_loss_mlp": 0.01194146, "balance_loss_clip": 1.00922883, "balance_loss_mlp": 1.00035989, "epoch": 0.5950820657728612, "flos": 40547506152960.0, "grad_norm": 1.825532274627545, "language_loss": 0.67840838, "learning_rate": 1.4870033762019988e-06, "loss": 0.7037648, "num_input_tokens_seen": 106747725, "step": 4949, "time_per_iteration": 3.823777914047241 }, { "auxiliary_loss_clip": 0.01323582, "auxiliary_loss_mlp": 0.01194022, "balance_loss_clip": 1.00850177, "balance_loss_mlp": 1.00033057, "epoch": 0.5952023086635003, "flos": 23184478872000.0, "grad_norm": 1.4841294052354403, "language_loss": 0.73248571, "learning_rate": 1.4862505042310334e-06, "loss": 0.7576617, "num_input_tokens_seen": 106767010, "step": 4950, "time_per_iteration": 2.793156147003174 }, { "auxiliary_loss_clip": 0.01319132, "auxiliary_loss_mlp": 0.01193848, "balance_loss_clip": 1.00832796, "balance_loss_mlp": 1.00025177, "epoch": 0.5953225515541394, "flos": 33653302742880.0, "grad_norm": 1.5030356644540828, "language_loss": 0.69481641, "learning_rate": 1.4854977101954587e-06, "loss": 0.71994615, "num_input_tokens_seen": 106789230, "step": 4951, "time_per_iteration": 2.8638546466827393 }, { "auxiliary_loss_clip": 0.01340829, "auxiliary_loss_mlp": 0.01194056, "balance_loss_clip": 1.00839305, "balance_loss_mlp": 1.00026941, "epoch": 0.5954427944447784, "flos": 24459630505920.0, "grad_norm": 1.784457064080595, "language_loss": 0.86476409, "learning_rate": 1.4847449942094716e-06, "loss": 0.890113, "num_input_tokens_seen": 106808110, "step": 4952, "time_per_iteration": 2.740752696990967 }, { "auxiliary_loss_clip": 0.01306507, "auxiliary_loss_mlp": 0.01193975, "balance_loss_clip": 1.00767577, "balance_loss_mlp": 1.00028396, "epoch": 0.5955630373354175, "flos": 18551858015520.0, "grad_norm": 1.9239460807950173, "language_loss": 0.86231863, "learning_rate": 1.4839923563872598e-06, "loss": 0.8873235, "num_input_tokens_seen": 106826650, "step": 4953, "time_per_iteration": 2.768749237060547 }, { "auxiliary_loss_clip": 0.012952, "auxiliary_loss_mlp": 0.01194077, "balance_loss_clip": 1.00827217, "balance_loss_mlp": 1.00029075, "epoch": 0.5956832802260567, "flos": 19791709881120.0, "grad_norm": 1.8587392043534923, "language_loss": 0.7574805, "learning_rate": 1.483239796842997e-06, "loss": 0.78237331, "num_input_tokens_seen": 106844680, "step": 4954, "time_per_iteration": 2.761862277984619 }, { "auxiliary_loss_clip": 0.01281682, "auxiliary_loss_mlp": 0.01194084, "balance_loss_clip": 1.00792122, "balance_loss_mlp": 1.00029731, "epoch": 0.5958035231166957, "flos": 19750877866080.0, "grad_norm": 1.8297169043473678, "language_loss": 0.83903766, "learning_rate": 1.4824873156908462e-06, "loss": 0.86379534, "num_input_tokens_seen": 106862605, "step": 4955, "time_per_iteration": 2.8046987056732178 }, { "auxiliary_loss_clip": 0.01331845, "auxiliary_loss_mlp": 0.00872678, "balance_loss_clip": 1.00838804, "balance_loss_mlp": 1.00048971, "epoch": 0.5959237660073348, "flos": 21652816855680.0, "grad_norm": 1.533740984642102, "language_loss": 0.75458682, "learning_rate": 1.4817349130449584e-06, "loss": 0.77663201, "num_input_tokens_seen": 106882325, "step": 4956, "time_per_iteration": 2.7808446884155273 }, { "auxiliary_loss_clip": 0.01327768, "auxiliary_loss_mlp": 0.01194016, "balance_loss_clip": 1.00799537, "balance_loss_mlp": 1.00022984, "epoch": 0.5960440088979739, "flos": 21171216101760.0, "grad_norm": 1.9470048964041045, "language_loss": 0.83056217, "learning_rate": 1.4809825890194717e-06, "loss": 0.85578001, "num_input_tokens_seen": 106900995, "step": 4957, "time_per_iteration": 2.816413164138794 }, { "auxiliary_loss_clip": 0.01327278, "auxiliary_loss_mlp": 0.01193782, "balance_loss_clip": 1.00842166, "balance_loss_mlp": 1.00018644, "epoch": 0.596164251788613, "flos": 14757535735200.0, "grad_norm": 1.7691372559043106, "language_loss": 0.76992786, "learning_rate": 1.4802303437285139e-06, "loss": 0.79513848, "num_input_tokens_seen": 106918265, "step": 4958, "time_per_iteration": 2.7663278579711914 }, { "auxiliary_loss_clip": 0.01316291, "auxiliary_loss_mlp": 0.01194026, "balance_loss_clip": 1.00786948, "balance_loss_mlp": 1.00023961, "epoch": 0.596284494679252, "flos": 20485935489600.0, "grad_norm": 2.1131925449883355, "language_loss": 0.80761105, "learning_rate": 1.4794781772861994e-06, "loss": 0.8327142, "num_input_tokens_seen": 106934760, "step": 4959, "time_per_iteration": 2.7759134769439697 }, { "auxiliary_loss_clip": 0.01320213, "auxiliary_loss_mlp": 0.00872516, "balance_loss_clip": 1.00816274, "balance_loss_mlp": 1.00050247, "epoch": 0.5964047375698912, "flos": 31212275605920.0, "grad_norm": 2.045825923356579, "language_loss": 0.66604257, "learning_rate": 1.4787260898066324e-06, "loss": 0.6879698, "num_input_tokens_seen": 106954760, "step": 4960, "time_per_iteration": 2.854562282562256 }, { "auxiliary_loss_clip": 0.013525, "auxiliary_loss_mlp": 0.01194034, "balance_loss_clip": 1.00889325, "balance_loss_mlp": 1.000247, "epoch": 0.5965249804605303, "flos": 27483631318080.0, "grad_norm": 1.8984135731690888, "language_loss": 0.85650074, "learning_rate": 1.4779740814039023e-06, "loss": 0.88196611, "num_input_tokens_seen": 106974845, "step": 4961, "time_per_iteration": 2.7405247688293457 }, { "auxiliary_loss_clip": 0.01353167, "auxiliary_loss_mlp": 0.01194057, "balance_loss_clip": 1.00852537, "balance_loss_mlp": 1.00027049, "epoch": 0.5966452233511693, "flos": 30773949677280.0, "grad_norm": 1.8259546010314853, "language_loss": 0.68647659, "learning_rate": 1.4772221521920894e-06, "loss": 0.71194887, "num_input_tokens_seen": 106994870, "step": 4962, "time_per_iteration": 2.7696754932403564 }, { "auxiliary_loss_clip": 0.01316923, "auxiliary_loss_mlp": 0.01193962, "balance_loss_clip": 1.00894845, "balance_loss_mlp": 1.00027061, "epoch": 0.5967654662418085, "flos": 25481181575520.0, "grad_norm": 1.8986521195219415, "language_loss": 0.74019992, "learning_rate": 1.4764703022852598e-06, "loss": 0.76530874, "num_input_tokens_seen": 107015390, "step": 4963, "time_per_iteration": 2.8093690872192383 }, { "auxiliary_loss_clip": 0.01254512, "auxiliary_loss_mlp": 0.01193955, "balance_loss_clip": 1.00738883, "balance_loss_mlp": 1.00026369, "epoch": 0.5968857091324475, "flos": 19099136761920.0, "grad_norm": 1.7491546511357865, "language_loss": 0.76956809, "learning_rate": 1.4757185317974696e-06, "loss": 0.79405272, "num_input_tokens_seen": 107033775, "step": 4964, "time_per_iteration": 2.871574878692627 }, { "auxiliary_loss_clip": 0.01330386, "auxiliary_loss_mlp": 0.01194029, "balance_loss_clip": 1.00782919, "balance_loss_mlp": 1.00024295, "epoch": 0.5970059520230866, "flos": 23692721787360.0, "grad_norm": 2.498005544343288, "language_loss": 0.71016812, "learning_rate": 1.474966840842761e-06, "loss": 0.73541224, "num_input_tokens_seen": 107053355, "step": 4965, "time_per_iteration": 2.6984610557556152 }, { "auxiliary_loss_clip": 0.01341566, "auxiliary_loss_mlp": 0.01193957, "balance_loss_clip": 1.00909662, "balance_loss_mlp": 1.00026608, "epoch": 0.5971261949137258, "flos": 23185556582400.0, "grad_norm": 1.7828129355800775, "language_loss": 0.86900073, "learning_rate": 1.4742152295351655e-06, "loss": 0.89435589, "num_input_tokens_seen": 107072510, "step": 4966, "time_per_iteration": 2.7528810501098633 }, { "auxiliary_loss_clip": 0.01330423, "auxiliary_loss_mlp": 0.00872572, "balance_loss_clip": 1.00776863, "balance_loss_mlp": 1.00049734, "epoch": 0.5972464378043648, "flos": 20557720507680.0, "grad_norm": 2.33830527908484, "language_loss": 0.64026672, "learning_rate": 1.4734636979887016e-06, "loss": 0.66229665, "num_input_tokens_seen": 107089970, "step": 4967, "time_per_iteration": 2.682983875274658 }, { "auxiliary_loss_clip": 0.01303907, "auxiliary_loss_mlp": 0.01194027, "balance_loss_clip": 1.00811291, "balance_loss_mlp": 1.00024056, "epoch": 0.5973666806950039, "flos": 29387043178560.0, "grad_norm": 1.9637933638555034, "language_loss": 0.90041524, "learning_rate": 1.4727122463173755e-06, "loss": 0.92539454, "num_input_tokens_seen": 107108500, "step": 4968, "time_per_iteration": 2.8831629753112793 }, { "auxiliary_loss_clip": 0.01309999, "auxiliary_loss_mlp": 0.01194046, "balance_loss_clip": 1.00739336, "balance_loss_mlp": 1.00025904, "epoch": 0.597486923585643, "flos": 22273533109440.0, "grad_norm": 1.81058820081461, "language_loss": 0.64532995, "learning_rate": 1.471960874635183e-06, "loss": 0.67037034, "num_input_tokens_seen": 107128060, "step": 4969, "time_per_iteration": 2.802180051803589 }, { "auxiliary_loss_clip": 0.01319497, "auxiliary_loss_mlp": 0.01193901, "balance_loss_clip": 1.00827134, "balance_loss_mlp": 1.00020957, "epoch": 0.5976071664762821, "flos": 13772469915360.0, "grad_norm": 2.294476597446828, "language_loss": 0.70971805, "learning_rate": 1.4712095830561055e-06, "loss": 0.73485202, "num_input_tokens_seen": 107146550, "step": 4970, "time_per_iteration": 2.7168080806732178 }, { "auxiliary_loss_clip": 0.01328513, "auxiliary_loss_mlp": 0.01193867, "balance_loss_clip": 1.008762, "balance_loss_mlp": 1.00027096, "epoch": 0.5977274093669211, "flos": 19098633830400.0, "grad_norm": 1.7111456358772001, "language_loss": 0.80727559, "learning_rate": 1.4704583716941147e-06, "loss": 0.83249938, "num_input_tokens_seen": 107165415, "step": 4971, "time_per_iteration": 4.731455087661743 }, { "auxiliary_loss_clip": 0.01341328, "auxiliary_loss_mlp": 0.01193964, "balance_loss_clip": 1.00960469, "balance_loss_mlp": 1.00027263, "epoch": 0.5978476522575603, "flos": 20376012732480.0, "grad_norm": 1.7070405980361918, "language_loss": 0.72397971, "learning_rate": 1.4697072406631672e-06, "loss": 0.74933261, "num_input_tokens_seen": 107185320, "step": 4972, "time_per_iteration": 2.7926318645477295 }, { "auxiliary_loss_clip": 0.01274148, "auxiliary_loss_mlp": 0.01194066, "balance_loss_clip": 1.00797915, "balance_loss_mlp": 1.00027919, "epoch": 0.5979678951481994, "flos": 29023160620320.0, "grad_norm": 1.7531006490111847, "language_loss": 0.72534335, "learning_rate": 1.4689561900772097e-06, "loss": 0.75002551, "num_input_tokens_seen": 107205380, "step": 4973, "time_per_iteration": 3.905297040939331 }, { "auxiliary_loss_clip": 0.01315983, "auxiliary_loss_mlp": 0.01193956, "balance_loss_clip": 1.00818932, "balance_loss_mlp": 1.00026512, "epoch": 0.5980881380388384, "flos": 17967698858880.0, "grad_norm": 2.2015726460947995, "language_loss": 0.72734249, "learning_rate": 1.4682052200501758e-06, "loss": 0.75244188, "num_input_tokens_seen": 107222585, "step": 4974, "time_per_iteration": 2.933945655822754 }, { "auxiliary_loss_clip": 0.01352523, "auxiliary_loss_mlp": 0.01194075, "balance_loss_clip": 1.00865829, "balance_loss_mlp": 1.00028825, "epoch": 0.5982083809294776, "flos": 22962837173760.0, "grad_norm": 1.7496225016986935, "language_loss": 0.80155241, "learning_rate": 1.4674543306959876e-06, "loss": 0.82701832, "num_input_tokens_seen": 107242055, "step": 4975, "time_per_iteration": 3.6320323944091797 }, { "auxiliary_loss_clip": 0.01317269, "auxiliary_loss_mlp": 0.01194148, "balance_loss_clip": 1.0079267, "balance_loss_mlp": 1.0003612, "epoch": 0.5983286238201166, "flos": 20991951136800.0, "grad_norm": 2.408033973817239, "language_loss": 0.84685802, "learning_rate": 1.4667035221285535e-06, "loss": 0.8719722, "num_input_tokens_seen": 107259695, "step": 4976, "time_per_iteration": 2.700655937194824 }, { "auxiliary_loss_clip": 0.01327877, "auxiliary_loss_mlp": 0.01194138, "balance_loss_clip": 1.00835848, "balance_loss_mlp": 1.00035143, "epoch": 0.5984488667107557, "flos": 28183461020640.0, "grad_norm": 5.486659805964297, "language_loss": 0.74025053, "learning_rate": 1.4659527944617715e-06, "loss": 0.76547074, "num_input_tokens_seen": 107279640, "step": 4977, "time_per_iteration": 2.773442029953003 }, { "auxiliary_loss_clip": 0.01258113, "auxiliary_loss_mlp": 0.01194013, "balance_loss_clip": 1.00698042, "balance_loss_mlp": 1.00022674, "epoch": 0.5985691096013949, "flos": 16471803618720.0, "grad_norm": 1.5731624845445258, "language_loss": 0.75591242, "learning_rate": 1.465202147809526e-06, "loss": 0.78043377, "num_input_tokens_seen": 107298135, "step": 4978, "time_per_iteration": 2.8241591453552246 }, { "auxiliary_loss_clip": 0.01354409, "auxiliary_loss_mlp": 0.01193924, "balance_loss_clip": 1.00907874, "balance_loss_mlp": 1.00023317, "epoch": 0.5986893524920339, "flos": 26719057638720.0, "grad_norm": 1.890380909426091, "language_loss": 0.75869215, "learning_rate": 1.4644515822856888e-06, "loss": 0.78417546, "num_input_tokens_seen": 107316570, "step": 4979, "time_per_iteration": 2.7539501190185547 }, { "auxiliary_loss_clip": 0.01276272, "auxiliary_loss_mlp": 0.01193139, "balance_loss_clip": 1.00461948, "balance_loss_mlp": 1.00011504, "epoch": 0.598809595382673, "flos": 61608094233120.0, "grad_norm": 0.8430613588278902, "language_loss": 0.56569028, "learning_rate": 1.4637010980041215e-06, "loss": 0.59038436, "num_input_tokens_seen": 107378680, "step": 4980, "time_per_iteration": 3.3576672077178955 }, { "auxiliary_loss_clip": 0.01353171, "auxiliary_loss_mlp": 0.01194018, "balance_loss_clip": 1.00863314, "balance_loss_mlp": 1.00032687, "epoch": 0.5989298382733121, "flos": 11801727573120.0, "grad_norm": 3.3056746620129744, "language_loss": 0.89469594, "learning_rate": 1.4629506950786707e-06, "loss": 0.9201678, "num_input_tokens_seen": 107394860, "step": 4981, "time_per_iteration": 2.6389687061309814 }, { "auxiliary_loss_clip": 0.01323067, "auxiliary_loss_mlp": 0.01193111, "balance_loss_clip": 1.00557113, "balance_loss_mlp": 1.00008714, "epoch": 0.5990500811639512, "flos": 60025828960800.0, "grad_norm": 0.8077436850595431, "language_loss": 0.56078541, "learning_rate": 1.4622003736231733e-06, "loss": 0.58594716, "num_input_tokens_seen": 107453850, "step": 4982, "time_per_iteration": 3.3177478313446045 }, { "auxiliary_loss_clip": 0.0133029, "auxiliary_loss_mlp": 0.0119409, "balance_loss_clip": 1.00873375, "balance_loss_mlp": 1.00030339, "epoch": 0.5991703240545903, "flos": 18222736370400.0, "grad_norm": 1.8884489193566192, "language_loss": 0.80525565, "learning_rate": 1.461450133751451e-06, "loss": 0.83049941, "num_input_tokens_seen": 107471920, "step": 4983, "time_per_iteration": 2.710559368133545 }, { "auxiliary_loss_clip": 0.01326271, "auxiliary_loss_mlp": 0.01193944, "balance_loss_clip": 1.00908256, "balance_loss_mlp": 1.00025296, "epoch": 0.5992905669452293, "flos": 27709907171040.0, "grad_norm": 1.6745156438111075, "language_loss": 0.75885588, "learning_rate": 1.4606999755773153e-06, "loss": 0.78405809, "num_input_tokens_seen": 107493125, "step": 4984, "time_per_iteration": 2.7501282691955566 }, { "auxiliary_loss_clip": 0.01353008, "auxiliary_loss_mlp": 0.01193826, "balance_loss_clip": 1.00882471, "balance_loss_mlp": 1.0003258, "epoch": 0.5994108098358685, "flos": 20449019155680.0, "grad_norm": 1.5753832424614065, "language_loss": 0.82191336, "learning_rate": 1.4599498992145643e-06, "loss": 0.84738171, "num_input_tokens_seen": 107513150, "step": 4985, "time_per_iteration": 2.8723697662353516 }, { "auxiliary_loss_clip": 0.01323021, "auxiliary_loss_mlp": 0.0087245, "balance_loss_clip": 1.00868118, "balance_loss_mlp": 1.00036645, "epoch": 0.5995310527265075, "flos": 22269976665120.0, "grad_norm": 1.857759585284333, "language_loss": 0.70653987, "learning_rate": 1.4591999047769846e-06, "loss": 0.72849452, "num_input_tokens_seen": 107532005, "step": 4986, "time_per_iteration": 2.9493660926818848 }, { "auxiliary_loss_clip": 0.01276428, "auxiliary_loss_mlp": 0.01194042, "balance_loss_clip": 1.00824511, "balance_loss_mlp": 1.00025547, "epoch": 0.5996512956171466, "flos": 18916961978880.0, "grad_norm": 1.742885042426783, "language_loss": 0.7501657, "learning_rate": 1.4584499923783486e-06, "loss": 0.7748704, "num_input_tokens_seen": 107550585, "step": 4987, "time_per_iteration": 2.8761630058288574 }, { "auxiliary_loss_clip": 0.01312996, "auxiliary_loss_mlp": 0.0119393, "balance_loss_clip": 1.00773597, "balance_loss_mlp": 1.0002383, "epoch": 0.5997715385077858, "flos": 15370923558240.0, "grad_norm": 4.051440301339804, "language_loss": 0.76184797, "learning_rate": 1.457700162132419e-06, "loss": 0.78691727, "num_input_tokens_seen": 107567575, "step": 4988, "time_per_iteration": 2.7338263988494873 }, { "auxiliary_loss_clip": 0.01262968, "auxiliary_loss_mlp": 0.01193948, "balance_loss_clip": 1.0065577, "balance_loss_mlp": 1.0002569, "epoch": 0.5998917813984248, "flos": 25264856581920.0, "grad_norm": 1.8876105643582781, "language_loss": 0.72612286, "learning_rate": 1.4569504141529433e-06, "loss": 0.75069201, "num_input_tokens_seen": 107585410, "step": 4989, "time_per_iteration": 2.872764825820923 }, { "auxiliary_loss_clip": 0.01329938, "auxiliary_loss_mlp": 0.01194085, "balance_loss_clip": 1.00893188, "balance_loss_mlp": 1.00029862, "epoch": 0.6000120242890639, "flos": 22054513839840.0, "grad_norm": 1.9933038640711664, "language_loss": 0.71918356, "learning_rate": 1.456200748553658e-06, "loss": 0.74442375, "num_input_tokens_seen": 107603405, "step": 4990, "time_per_iteration": 2.6960079669952393 }, { "auxiliary_loss_clip": 0.0135367, "auxiliary_loss_mlp": 0.01193809, "balance_loss_clip": 1.00896263, "balance_loss_mlp": 1.00021338, "epoch": 0.600132267179703, "flos": 29863434998880.0, "grad_norm": 1.4595705179056864, "language_loss": 0.78531575, "learning_rate": 1.455451165448287e-06, "loss": 0.81079054, "num_input_tokens_seen": 107626060, "step": 4991, "time_per_iteration": 2.79118013381958 }, { "auxiliary_loss_clip": 0.0130745, "auxiliary_loss_mlp": 0.01194056, "balance_loss_clip": 1.00821066, "balance_loss_mlp": 1.00026941, "epoch": 0.6002525100703421, "flos": 25045370304480.0, "grad_norm": 3.1775601772078117, "language_loss": 0.73548841, "learning_rate": 1.4547016649505407e-06, "loss": 0.76050341, "num_input_tokens_seen": 107644070, "step": 4992, "time_per_iteration": 2.763378858566284 }, { "auxiliary_loss_clip": 0.0130879, "auxiliary_loss_mlp": 0.01194029, "balance_loss_clip": 1.0081861, "balance_loss_mlp": 1.00033772, "epoch": 0.6003727529609811, "flos": 20849602505760.0, "grad_norm": 1.7829093427446718, "language_loss": 0.8450619, "learning_rate": 1.4539522471741193e-06, "loss": 0.87009007, "num_input_tokens_seen": 107661495, "step": 4993, "time_per_iteration": 286.0197563171387 }, { "auxiliary_loss_clip": 0.01332201, "auxiliary_loss_mlp": 0.01194085, "balance_loss_clip": 1.00811315, "balance_loss_mlp": 1.00029826, "epoch": 0.6004929958516203, "flos": 15594612906240.0, "grad_norm": 2.0390454550096124, "language_loss": 0.70962977, "learning_rate": 1.4532029122327067e-06, "loss": 0.73489273, "num_input_tokens_seen": 107678280, "step": 4994, "time_per_iteration": 2.80143666267395 }, { "auxiliary_loss_clip": 0.01281837, "auxiliary_loss_mlp": 0.01193962, "balance_loss_clip": 1.00785947, "balance_loss_mlp": 1.00027084, "epoch": 0.6006132387422594, "flos": 21763278468000.0, "grad_norm": 2.1622056896020454, "language_loss": 0.75592697, "learning_rate": 1.4524536602399783e-06, "loss": 0.78068501, "num_input_tokens_seen": 107697370, "step": 4995, "time_per_iteration": 2.9089694023132324 }, { "auxiliary_loss_clip": 0.01306623, "auxiliary_loss_mlp": 0.0119397, "balance_loss_clip": 1.00831521, "balance_loss_mlp": 1.00027847, "epoch": 0.6007334816328984, "flos": 22858554434400.0, "grad_norm": 1.5070885960169624, "language_loss": 0.77377582, "learning_rate": 1.4517044913095938e-06, "loss": 0.79878175, "num_input_tokens_seen": 107717790, "step": 4996, "time_per_iteration": 4.679309368133545 }, { "auxiliary_loss_clip": 0.0133142, "auxiliary_loss_mlp": 0.01193926, "balance_loss_clip": 1.00834239, "balance_loss_mlp": 1.00023437, "epoch": 0.6008537245235376, "flos": 28324588246560.0, "grad_norm": 1.5830645735011526, "language_loss": 0.81510055, "learning_rate": 1.4509554055552022e-06, "loss": 0.84035397, "num_input_tokens_seen": 107738020, "step": 4997, "time_per_iteration": 2.734004259109497 }, { "auxiliary_loss_clip": 0.01316365, "auxiliary_loss_mlp": 0.01193897, "balance_loss_clip": 1.00827014, "balance_loss_mlp": 1.00030124, "epoch": 0.6009739674141766, "flos": 20886123679200.0, "grad_norm": 2.5914217272363733, "language_loss": 0.83841443, "learning_rate": 1.450206403090439e-06, "loss": 0.86351705, "num_input_tokens_seen": 107756215, "step": 4998, "time_per_iteration": 3.7153940200805664 }, { "auxiliary_loss_clip": 0.01330081, "auxiliary_loss_mlp": 0.01193818, "balance_loss_clip": 1.00850606, "balance_loss_mlp": 1.00022244, "epoch": 0.6010942103048157, "flos": 20481013945440.0, "grad_norm": 1.9448334246335701, "language_loss": 0.86169201, "learning_rate": 1.4494574840289274e-06, "loss": 0.88693106, "num_input_tokens_seen": 107773330, "step": 4999, "time_per_iteration": 2.7201924324035645 }, { "auxiliary_loss_clip": 0.01337089, "auxiliary_loss_mlp": 0.01194121, "balance_loss_clip": 1.00860226, "balance_loss_mlp": 1.00033379, "epoch": 0.6012144531954549, "flos": 23805985446720.0, "grad_norm": 1.6884300951510358, "language_loss": 0.73985445, "learning_rate": 1.4487086484842782e-06, "loss": 0.76516658, "num_input_tokens_seen": 107791975, "step": 5000, "time_per_iteration": 2.7441582679748535 }, { "auxiliary_loss_clip": 0.01353311, "auxiliary_loss_mlp": 0.01194008, "balance_loss_clip": 1.0089215, "balance_loss_mlp": 1.00022197, "epoch": 0.6013346960860939, "flos": 18988387760160.0, "grad_norm": 2.098750295450691, "language_loss": 0.60167825, "learning_rate": 1.4479598965700878e-06, "loss": 0.62715137, "num_input_tokens_seen": 107809240, "step": 5001, "time_per_iteration": 3.6671931743621826 }, { "auxiliary_loss_clip": 0.01304029, "auxiliary_loss_mlp": 0.01193817, "balance_loss_clip": 1.00849366, "balance_loss_mlp": 1.00022089, "epoch": 0.601454938976733, "flos": 24025328029440.0, "grad_norm": 2.3484214510673875, "language_loss": 0.68841887, "learning_rate": 1.4472112283999427e-06, "loss": 0.71339726, "num_input_tokens_seen": 107827895, "step": 5002, "time_per_iteration": 2.7955753803253174 }, { "auxiliary_loss_clip": 0.0132883, "auxiliary_loss_mlp": 0.01193904, "balance_loss_clip": 1.00890398, "balance_loss_mlp": 1.00021291, "epoch": 0.6015751818673721, "flos": 26427139716960.0, "grad_norm": 1.9047349631532349, "language_loss": 0.69295514, "learning_rate": 1.4464626440874143e-06, "loss": 0.71818244, "num_input_tokens_seen": 107847010, "step": 5003, "time_per_iteration": 2.852463960647583 }, { "auxiliary_loss_clip": 0.0128534, "auxiliary_loss_mlp": 0.01193908, "balance_loss_clip": 1.0086683, "balance_loss_mlp": 1.00021696, "epoch": 0.6016954247580112, "flos": 13115268411840.0, "grad_norm": 2.253707612290708, "language_loss": 0.74285996, "learning_rate": 1.4457141437460636e-06, "loss": 0.76765245, "num_input_tokens_seen": 107864235, "step": 5004, "time_per_iteration": 2.7934792041778564 }, { "auxiliary_loss_clip": 0.01308385, "auxiliary_loss_mlp": 0.01194004, "balance_loss_clip": 1.00785851, "balance_loss_mlp": 1.00031233, "epoch": 0.6018156676486502, "flos": 23768458410240.0, "grad_norm": 1.733420362908534, "language_loss": 0.73383015, "learning_rate": 1.444965727489436e-06, "loss": 0.75885403, "num_input_tokens_seen": 107883680, "step": 5005, "time_per_iteration": 2.8950634002685547 }, { "auxiliary_loss_clip": 0.01307614, "auxiliary_loss_mlp": 0.01193961, "balance_loss_clip": 1.00840139, "balance_loss_mlp": 1.00027013, "epoch": 0.6019359105392894, "flos": 26469372755520.0, "grad_norm": 1.992484393108838, "language_loss": 0.63034397, "learning_rate": 1.444217395431066e-06, "loss": 0.65535975, "num_input_tokens_seen": 107906220, "step": 5006, "time_per_iteration": 2.8896045684814453 }, { "auxiliary_loss_clip": 0.01268069, "auxiliary_loss_mlp": 0.01193077, "balance_loss_clip": 1.00601113, "balance_loss_mlp": 1.00005329, "epoch": 0.6020561534299285, "flos": 69190883233920.0, "grad_norm": 0.7982533245991825, "language_loss": 0.55872953, "learning_rate": 1.4434691476844755e-06, "loss": 0.583341, "num_input_tokens_seen": 107967195, "step": 5007, "time_per_iteration": 3.2809457778930664 }, { "auxiliary_loss_clip": 0.01308901, "auxiliary_loss_mlp": 0.01193816, "balance_loss_clip": 1.00826919, "balance_loss_mlp": 1.00022006, "epoch": 0.6021763963205675, "flos": 21835314951840.0, "grad_norm": 1.9362041581654272, "language_loss": 0.6694656, "learning_rate": 1.4427209843631729e-06, "loss": 0.69449276, "num_input_tokens_seen": 107984245, "step": 5008, "time_per_iteration": 2.7915561199188232 }, { "auxiliary_loss_clip": 0.01351967, "auxiliary_loss_mlp": 0.00872516, "balance_loss_clip": 1.00898147, "balance_loss_mlp": 1.00045669, "epoch": 0.6022966392112067, "flos": 26578648886400.0, "grad_norm": 2.3726477557051493, "language_loss": 0.81213164, "learning_rate": 1.4419729055806534e-06, "loss": 0.83437645, "num_input_tokens_seen": 108003680, "step": 5009, "time_per_iteration": 2.872568368911743 }, { "auxiliary_loss_clip": 0.01305272, "auxiliary_loss_mlp": 0.0087258, "balance_loss_clip": 1.00814199, "balance_loss_mlp": 1.00046921, "epoch": 0.6024168821018457, "flos": 20703733354080.0, "grad_norm": 1.8360161809664237, "language_loss": 0.82102537, "learning_rate": 1.441224911450401e-06, "loss": 0.8428039, "num_input_tokens_seen": 108019635, "step": 5010, "time_per_iteration": 2.789170742034912 }, { "auxiliary_loss_clip": 0.01342484, "auxiliary_loss_mlp": 0.011939, "balance_loss_clip": 1.00971222, "balance_loss_mlp": 1.00020838, "epoch": 0.6025371249924848, "flos": 24680984814720.0, "grad_norm": 1.7115914678345476, "language_loss": 0.82346594, "learning_rate": 1.4404770020858851e-06, "loss": 0.84882975, "num_input_tokens_seen": 108039120, "step": 5011, "time_per_iteration": 2.76924204826355 }, { "auxiliary_loss_clip": 0.01326421, "auxiliary_loss_mlp": 0.01193805, "balance_loss_clip": 1.00829268, "balance_loss_mlp": 1.00020945, "epoch": 0.602657367883124, "flos": 25955813135520.0, "grad_norm": 1.5672901191297346, "language_loss": 0.85950953, "learning_rate": 1.439729177600563e-06, "loss": 0.88471186, "num_input_tokens_seen": 108059615, "step": 5012, "time_per_iteration": 2.7532098293304443 }, { "auxiliary_loss_clip": 0.01328873, "auxiliary_loss_mlp": 0.01193811, "balance_loss_clip": 1.00815094, "balance_loss_mlp": 1.00021482, "epoch": 0.602777610773763, "flos": 16690643269920.0, "grad_norm": 16.225542054685036, "language_loss": 0.72844368, "learning_rate": 1.4389814381078793e-06, "loss": 0.75367057, "num_input_tokens_seen": 108078855, "step": 5013, "time_per_iteration": 2.747556686401367 }, { "auxiliary_loss_clip": 0.01196551, "auxiliary_loss_mlp": 0.01194059, "balance_loss_clip": 1.00618935, "balance_loss_mlp": 1.00036776, "epoch": 0.6028978536644021, "flos": 13334251757760.0, "grad_norm": 2.0065172693330786, "language_loss": 0.79557794, "learning_rate": 1.438233783721265e-06, "loss": 0.81948406, "num_input_tokens_seen": 108095020, "step": 5014, "time_per_iteration": 3.217526435852051 }, { "auxiliary_loss_clip": 0.0130663, "auxiliary_loss_mlp": 0.01193724, "balance_loss_clip": 1.00866687, "balance_loss_mlp": 1.00022352, "epoch": 0.6030180965550412, "flos": 19644834866400.0, "grad_norm": 3.169486858999628, "language_loss": 0.77834666, "learning_rate": 1.43748621455414e-06, "loss": 0.80335015, "num_input_tokens_seen": 108111455, "step": 5015, "time_per_iteration": 3.527101755142212 }, { "auxiliary_loss_clip": 0.01306937, "auxiliary_loss_mlp": 0.01194062, "balance_loss_clip": 1.00843668, "balance_loss_mlp": 1.00027537, "epoch": 0.6031383394456803, "flos": 14458397153760.0, "grad_norm": 2.548315221736934, "language_loss": 0.8068794, "learning_rate": 1.4367387307199082e-06, "loss": 0.83188945, "num_input_tokens_seen": 108128305, "step": 5016, "time_per_iteration": 2.8825860023498535 }, { "auxiliary_loss_clip": 0.01341239, "auxiliary_loss_mlp": 0.01193934, "balance_loss_clip": 1.00917697, "balance_loss_mlp": 1.00024307, "epoch": 0.6032585823363193, "flos": 13917800211840.0, "grad_norm": 2.1601554491370343, "language_loss": 0.82639003, "learning_rate": 1.4359913323319632e-06, "loss": 0.85174173, "num_input_tokens_seen": 108145475, "step": 5017, "time_per_iteration": 2.7836177349090576 }, { "auxiliary_loss_clip": 0.01266386, "auxiliary_loss_mlp": 0.01193809, "balance_loss_clip": 1.00719166, "balance_loss_mlp": 1.00021338, "epoch": 0.6033788252269584, "flos": 24353264193120.0, "grad_norm": 1.6503955066844345, "language_loss": 0.77494478, "learning_rate": 1.4352440195036847e-06, "loss": 0.79954678, "num_input_tokens_seen": 108165650, "step": 5018, "time_per_iteration": 3.003160238265991 }, { "auxiliary_loss_clip": 0.01241118, "auxiliary_loss_mlp": 0.01193995, "balance_loss_clip": 1.0074898, "balance_loss_mlp": 1.0003041, "epoch": 0.6034990681175976, "flos": 25521259193280.0, "grad_norm": 1.5295441688955946, "language_loss": 0.79751778, "learning_rate": 1.4344967923484395e-06, "loss": 0.8218689, "num_input_tokens_seen": 108187620, "step": 5019, "time_per_iteration": 2.97271990776062 }, { "auxiliary_loss_clip": 0.0132837, "auxiliary_loss_mlp": 0.01193916, "balance_loss_clip": 1.00803566, "balance_loss_mlp": 1.00022495, "epoch": 0.6036193110082366, "flos": 25958399640480.0, "grad_norm": 2.0071594364685357, "language_loss": 0.72498268, "learning_rate": 1.433749650979581e-06, "loss": 0.75020552, "num_input_tokens_seen": 108207605, "step": 5020, "time_per_iteration": 2.8738532066345215 }, { "auxiliary_loss_clip": 0.01284298, "auxiliary_loss_mlp": 0.01194051, "balance_loss_clip": 1.00730312, "balance_loss_mlp": 1.00026464, "epoch": 0.6037395538988757, "flos": 25593439371840.0, "grad_norm": 1.7454188913911985, "language_loss": 0.67830068, "learning_rate": 1.433002595510451e-06, "loss": 0.70308417, "num_input_tokens_seen": 108226385, "step": 5021, "time_per_iteration": 3.0458993911743164 }, { "auxiliary_loss_clip": 0.01320345, "auxiliary_loss_mlp": 0.00872579, "balance_loss_clip": 1.00840354, "balance_loss_mlp": 1.00050306, "epoch": 0.6038597967895148, "flos": 17816261536800.0, "grad_norm": 1.6684413309390622, "language_loss": 0.72031504, "learning_rate": 1.4322556260543757e-06, "loss": 0.7422443, "num_input_tokens_seen": 108242960, "step": 5022, "time_per_iteration": 5.1628453731536865 }, { "auxiliary_loss_clip": 0.01275081, "auxiliary_loss_mlp": 0.01193121, "balance_loss_clip": 1.00559902, "balance_loss_mlp": 1.00009692, "epoch": 0.6039800396801539, "flos": 65169243313920.0, "grad_norm": 0.8920856686189739, "language_loss": 0.62747765, "learning_rate": 1.4315087427246703e-06, "loss": 0.65215969, "num_input_tokens_seen": 108296785, "step": 5023, "time_per_iteration": 3.2747750282287598 }, { "auxiliary_loss_clip": 0.01323112, "auxiliary_loss_mlp": 0.01193142, "balance_loss_clip": 1.00544643, "balance_loss_mlp": 1.00011814, "epoch": 0.604100282570793, "flos": 67386437408160.0, "grad_norm": 0.8662421442722346, "language_loss": 0.58500725, "learning_rate": 1.4307619456346372e-06, "loss": 0.61016977, "num_input_tokens_seen": 108341090, "step": 5024, "time_per_iteration": 3.9551124572753906 }, { "auxiliary_loss_clip": 0.01340511, "auxiliary_loss_mlp": 0.01193976, "balance_loss_clip": 1.00844812, "balance_loss_mlp": 1.00028491, "epoch": 0.6042205254614321, "flos": 35297509944960.0, "grad_norm": 1.7102827853294318, "language_loss": 0.7432524, "learning_rate": 1.430015234897564e-06, "loss": 0.7685973, "num_input_tokens_seen": 108364370, "step": 5025, "time_per_iteration": 2.8998937606811523 }, { "auxiliary_loss_clip": 0.01352695, "auxiliary_loss_mlp": 0.00872517, "balance_loss_clip": 1.00880694, "balance_loss_mlp": 1.00055122, "epoch": 0.6043407683520712, "flos": 45658279090080.0, "grad_norm": 1.5719578934020146, "language_loss": 0.66214204, "learning_rate": 1.4292686106267274e-06, "loss": 0.68439424, "num_input_tokens_seen": 108387220, "step": 5026, "time_per_iteration": 2.9076008796691895 }, { "auxiliary_loss_clip": 0.01341671, "auxiliary_loss_mlp": 0.01194191, "balance_loss_clip": 1.00914109, "balance_loss_mlp": 1.00040388, "epoch": 0.6044610112427102, "flos": 16180029391680.0, "grad_norm": 1.591796335075175, "language_loss": 0.77056354, "learning_rate": 1.4285220729353876e-06, "loss": 0.79592216, "num_input_tokens_seen": 108405760, "step": 5027, "time_per_iteration": 3.644073486328125 }, { "auxiliary_loss_clip": 0.01327962, "auxiliary_loss_mlp": 0.01194019, "balance_loss_clip": 1.00870717, "balance_loss_mlp": 1.00023293, "epoch": 0.6045812541333494, "flos": 13804069544640.0, "grad_norm": 1.7679172860862633, "language_loss": 0.77663946, "learning_rate": 1.4277756219367957e-06, "loss": 0.80185932, "num_input_tokens_seen": 108422785, "step": 5028, "time_per_iteration": 2.7390592098236084 }, { "auxiliary_loss_clip": 0.01303352, "auxiliary_loss_mlp": 0.0119399, "balance_loss_clip": 1.0083735, "balance_loss_mlp": 1.0002985, "epoch": 0.6047014970239885, "flos": 19975070145600.0, "grad_norm": 2.0614845766020005, "language_loss": 0.79689157, "learning_rate": 1.4270292577441864e-06, "loss": 0.82186496, "num_input_tokens_seen": 108442290, "step": 5029, "time_per_iteration": 2.8214640617370605 }, { "auxiliary_loss_clip": 0.01341795, "auxiliary_loss_mlp": 0.01194021, "balance_loss_clip": 1.00924993, "balance_loss_mlp": 1.00023484, "epoch": 0.6048217399146275, "flos": 25337108607840.0, "grad_norm": 1.447866546011582, "language_loss": 0.71707702, "learning_rate": 1.4262829804707836e-06, "loss": 0.74243528, "num_input_tokens_seen": 108464280, "step": 5030, "time_per_iteration": 2.8583474159240723 }, { "auxiliary_loss_clip": 0.01341702, "auxiliary_loss_mlp": 0.01194003, "balance_loss_clip": 1.00886965, "balance_loss_mlp": 1.00021625, "epoch": 0.6049419828052667, "flos": 26030831284800.0, "grad_norm": 1.4179396627716863, "language_loss": 0.69859326, "learning_rate": 1.4255367902297958e-06, "loss": 0.72395033, "num_input_tokens_seen": 108485610, "step": 5031, "time_per_iteration": 2.784862995147705 }, { "auxiliary_loss_clip": 0.01352874, "auxiliary_loss_mlp": 0.01193749, "balance_loss_clip": 1.00888717, "balance_loss_mlp": 1.00024867, "epoch": 0.6050622256959057, "flos": 14648115909600.0, "grad_norm": 2.1984671128899764, "language_loss": 0.78445053, "learning_rate": 1.4247906871344215e-06, "loss": 0.80991673, "num_input_tokens_seen": 108501005, "step": 5032, "time_per_iteration": 2.770947217941284 }, { "auxiliary_loss_clip": 0.01319467, "auxiliary_loss_mlp": 0.01193899, "balance_loss_clip": 1.00829053, "balance_loss_mlp": 1.00020766, "epoch": 0.6051824685865448, "flos": 23331461657760.0, "grad_norm": 2.0261738705896546, "language_loss": 0.75414318, "learning_rate": 1.4240446712978415e-06, "loss": 0.77927685, "num_input_tokens_seen": 108519990, "step": 5033, "time_per_iteration": 2.8303725719451904 }, { "auxiliary_loss_clip": 0.01339795, "auxiliary_loss_mlp": 0.01194077, "balance_loss_clip": 1.00866222, "balance_loss_mlp": 1.00029063, "epoch": 0.605302711477184, "flos": 27563319545760.0, "grad_norm": 1.7287283460107035, "language_loss": 0.74385184, "learning_rate": 1.423298742833227e-06, "loss": 0.76919055, "num_input_tokens_seen": 108538650, "step": 5034, "time_per_iteration": 2.882648229598999 }, { "auxiliary_loss_clip": 0.01304926, "auxiliary_loss_mlp": 0.01194124, "balance_loss_clip": 1.00810671, "balance_loss_mlp": 1.00033712, "epoch": 0.605422954367823, "flos": 15154706335680.0, "grad_norm": 1.831421037709756, "language_loss": 0.71668756, "learning_rate": 1.4225529018537352e-06, "loss": 0.741678, "num_input_tokens_seen": 108554155, "step": 5035, "time_per_iteration": 2.775317668914795 }, { "auxiliary_loss_clip": 0.01352601, "auxiliary_loss_mlp": 0.01193907, "balance_loss_clip": 1.00870407, "balance_loss_mlp": 1.00021601, "epoch": 0.6055431972584621, "flos": 27673924852800.0, "grad_norm": 1.4871553100670396, "language_loss": 0.77467394, "learning_rate": 1.4218071484725082e-06, "loss": 0.80013895, "num_input_tokens_seen": 108576275, "step": 5036, "time_per_iteration": 2.8116228580474854 }, { "auxiliary_loss_clip": 0.01303768, "auxiliary_loss_mlp": 0.01193992, "balance_loss_clip": 1.00822639, "balance_loss_mlp": 1.00030065, "epoch": 0.6056634401491012, "flos": 19387498239360.0, "grad_norm": 1.78562717179629, "language_loss": 0.76138866, "learning_rate": 1.4210614828026786e-06, "loss": 0.78636622, "num_input_tokens_seen": 108594125, "step": 5037, "time_per_iteration": 2.7942538261413574 }, { "auxiliary_loss_clip": 0.01352806, "auxiliary_loss_mlp": 0.01193975, "balance_loss_clip": 1.00826001, "balance_loss_mlp": 1.00028372, "epoch": 0.6057836830397403, "flos": 24789470624640.0, "grad_norm": 1.3897738298457292, "language_loss": 0.74254346, "learning_rate": 1.4203159049573605e-06, "loss": 0.76801127, "num_input_tokens_seen": 108615360, "step": 5038, "time_per_iteration": 2.8648557662963867 }, { "auxiliary_loss_clip": 0.0131345, "auxiliary_loss_mlp": 0.01194029, "balance_loss_clip": 1.00878215, "balance_loss_mlp": 1.00024235, "epoch": 0.6059039259303793, "flos": 20558259362880.0, "grad_norm": 1.8806462219081201, "language_loss": 0.87248993, "learning_rate": 1.4195704150496593e-06, "loss": 0.89756477, "num_input_tokens_seen": 108633075, "step": 5039, "time_per_iteration": 2.795210123062134 }, { "auxiliary_loss_clip": 0.01313671, "auxiliary_loss_mlp": 0.01193872, "balance_loss_clip": 1.00855589, "balance_loss_mlp": 1.00027609, "epoch": 0.6060241688210185, "flos": 21069735409440.0, "grad_norm": 1.718470088000945, "language_loss": 0.73706514, "learning_rate": 1.4188250131926639e-06, "loss": 0.76214057, "num_input_tokens_seen": 108651875, "step": 5040, "time_per_iteration": 2.740788459777832 }, { "auxiliary_loss_clip": 0.01315639, "auxiliary_loss_mlp": 0.01194061, "balance_loss_clip": 1.00835466, "balance_loss_mlp": 1.00027418, "epoch": 0.6061444117116576, "flos": 16361090540640.0, "grad_norm": 1.8801760453432559, "language_loss": 0.808411, "learning_rate": 1.4180796994994525e-06, "loss": 0.83350801, "num_input_tokens_seen": 108669290, "step": 5041, "time_per_iteration": 2.834766387939453 }, { "auxiliary_loss_clip": 0.01316019, "auxiliary_loss_mlp": 0.01194018, "balance_loss_clip": 1.00823331, "balance_loss_mlp": 1.0002315, "epoch": 0.6062646546022966, "flos": 21507306940800.0, "grad_norm": 1.725557413493686, "language_loss": 0.72208995, "learning_rate": 1.4173344740830877e-06, "loss": 0.7471903, "num_input_tokens_seen": 108688420, "step": 5042, "time_per_iteration": 2.725911855697632 }, { "auxiliary_loss_clip": 0.01316878, "auxiliary_loss_mlp": 0.01194049, "balance_loss_clip": 1.00893021, "balance_loss_mlp": 1.00026214, "epoch": 0.6063848974929358, "flos": 38983166796960.0, "grad_norm": 1.8365918193148265, "language_loss": 0.70461762, "learning_rate": 1.4165893370566206e-06, "loss": 0.72972691, "num_input_tokens_seen": 108712175, "step": 5043, "time_per_iteration": 2.9247493743896484 }, { "auxiliary_loss_clip": 0.0134105, "auxiliary_loss_mlp": 0.01194032, "balance_loss_clip": 1.00875771, "balance_loss_mlp": 1.00024581, "epoch": 0.6065051403835748, "flos": 19646595126720.0, "grad_norm": 1.6445717144267429, "language_loss": 0.77604949, "learning_rate": 1.4158442885330865e-06, "loss": 0.8014003, "num_input_tokens_seen": 108730745, "step": 5044, "time_per_iteration": 2.7502546310424805 }, { "auxiliary_loss_clip": 0.01340685, "auxiliary_loss_mlp": 0.01193948, "balance_loss_clip": 1.00867248, "balance_loss_mlp": 1.00025702, "epoch": 0.6066253832742139, "flos": 23513097585600.0, "grad_norm": 1.9137315801046393, "language_loss": 0.78595698, "learning_rate": 1.4150993286255094e-06, "loss": 0.81130332, "num_input_tokens_seen": 108749995, "step": 5045, "time_per_iteration": 2.8648338317871094 }, { "auxiliary_loss_clip": 0.01352246, "auxiliary_loss_mlp": 0.01194024, "balance_loss_clip": 1.00842929, "balance_loss_mlp": 1.00023782, "epoch": 0.6067456261648531, "flos": 19133718056640.0, "grad_norm": 2.0438046459140176, "language_loss": 0.795111, "learning_rate": 1.4143544574468993e-06, "loss": 0.82057369, "num_input_tokens_seen": 108768355, "step": 5046, "time_per_iteration": 2.831113815307617 }, { "auxiliary_loss_clip": 0.01327978, "auxiliary_loss_mlp": 0.01194019, "balance_loss_clip": 1.00809407, "balance_loss_mlp": 1.00023234, "epoch": 0.6068658690554921, "flos": 20520624555360.0, "grad_norm": 1.5324587597924242, "language_loss": 0.82452202, "learning_rate": 1.4136096751102523e-06, "loss": 0.84974194, "num_input_tokens_seen": 108786685, "step": 5047, "time_per_iteration": 2.849978446960449 }, { "auxiliary_loss_clip": 0.01314012, "auxiliary_loss_mlp": 0.01193923, "balance_loss_clip": 1.00785589, "balance_loss_mlp": 1.00023127, "epoch": 0.6069861119461312, "flos": 27374570729280.0, "grad_norm": 2.0586687255374305, "language_loss": 0.83219373, "learning_rate": 1.4128649817285516e-06, "loss": 0.8572731, "num_input_tokens_seen": 108804820, "step": 5048, "time_per_iteration": 3.8784193992614746 }, { "auxiliary_loss_clip": 0.01327521, "auxiliary_loss_mlp": 0.0119416, "balance_loss_clip": 1.00884223, "balance_loss_mlp": 1.00027835, "epoch": 0.6071063548367702, "flos": 25626511872000.0, "grad_norm": 2.3726043513010397, "language_loss": 0.63282508, "learning_rate": 1.412120377414766e-06, "loss": 0.65804189, "num_input_tokens_seen": 108825010, "step": 5049, "time_per_iteration": 4.319952487945557 }, { "auxiliary_loss_clip": 0.01351821, "auxiliary_loss_mlp": 0.01194009, "balance_loss_clip": 1.0087986, "balance_loss_mlp": 1.00022256, "epoch": 0.6072265977274094, "flos": 24460528597920.0, "grad_norm": 1.4780765004568193, "language_loss": 0.71133757, "learning_rate": 1.4113758622818522e-06, "loss": 0.7367959, "num_input_tokens_seen": 108845075, "step": 5050, "time_per_iteration": 3.697436809539795 }, { "auxiliary_loss_clip": 0.01321589, "auxiliary_loss_mlp": 0.00872392, "balance_loss_clip": 1.00808501, "balance_loss_mlp": 1.00051117, "epoch": 0.6073468406180484, "flos": 18149262939360.0, "grad_norm": 1.8887616758171157, "language_loss": 0.830778, "learning_rate": 1.410631436442751e-06, "loss": 0.85271776, "num_input_tokens_seen": 108863870, "step": 5051, "time_per_iteration": 2.748147964477539 }, { "auxiliary_loss_clip": 0.01341085, "auxiliary_loss_mlp": 0.01193928, "balance_loss_clip": 1.00887287, "balance_loss_mlp": 1.00023651, "epoch": 0.6074670835086875, "flos": 20697626328480.0, "grad_norm": 1.8858819508646345, "language_loss": 0.86593169, "learning_rate": 1.4098871000103936e-06, "loss": 0.89128184, "num_input_tokens_seen": 108882470, "step": 5052, "time_per_iteration": 2.7794301509857178 }, { "auxiliary_loss_clip": 0.01312579, "auxiliary_loss_mlp": 0.01193872, "balance_loss_clip": 1.00866604, "balance_loss_mlp": 1.00027657, "epoch": 0.6075873263993267, "flos": 23769967204800.0, "grad_norm": 1.6575809421454413, "language_loss": 0.82484466, "learning_rate": 1.409142853097693e-06, "loss": 0.84990919, "num_input_tokens_seen": 108902710, "step": 5053, "time_per_iteration": 3.7302944660186768 }, { "auxiliary_loss_clip": 0.01309533, "auxiliary_loss_mlp": 0.01193997, "balance_loss_clip": 1.00753999, "balance_loss_mlp": 1.00021088, "epoch": 0.6077075692899657, "flos": 24454493419680.0, "grad_norm": 2.848344534730638, "language_loss": 0.79578316, "learning_rate": 1.408398695817553e-06, "loss": 0.82081842, "num_input_tokens_seen": 108919935, "step": 5054, "time_per_iteration": 2.8396809101104736 }, { "auxiliary_loss_clip": 0.01317515, "auxiliary_loss_mlp": 0.01194076, "balance_loss_clip": 1.00836086, "balance_loss_mlp": 1.00028968, "epoch": 0.6078278121806048, "flos": 27382114702080.0, "grad_norm": 1.7151874558410898, "language_loss": 0.70205921, "learning_rate": 1.4076546282828593e-06, "loss": 0.72717512, "num_input_tokens_seen": 108942790, "step": 5055, "time_per_iteration": 2.847160577774048 }, { "auxiliary_loss_clip": 0.01329027, "auxiliary_loss_mlp": 0.01194065, "balance_loss_clip": 1.00832653, "balance_loss_mlp": 1.00027823, "epoch": 0.6079480550712439, "flos": 38436462829440.0, "grad_norm": 2.4363982130810062, "language_loss": 0.66430122, "learning_rate": 1.4069106506064874e-06, "loss": 0.68953216, "num_input_tokens_seen": 108964215, "step": 5056, "time_per_iteration": 2.889225721359253 }, { "auxiliary_loss_clip": 0.01304374, "auxiliary_loss_mlp": 0.01193756, "balance_loss_clip": 1.00716555, "balance_loss_mlp": 1.00025558, "epoch": 0.608068297961883, "flos": 25336282363200.0, "grad_norm": 1.5888544348356035, "language_loss": 0.78018236, "learning_rate": 1.4061667629012989e-06, "loss": 0.80516368, "num_input_tokens_seen": 108984885, "step": 5057, "time_per_iteration": 2.8689355850219727 }, { "auxiliary_loss_clip": 0.01317706, "auxiliary_loss_mlp": 0.011939, "balance_loss_clip": 1.00831938, "balance_loss_mlp": 1.00020874, "epoch": 0.608188540852522, "flos": 24202473497280.0, "grad_norm": 1.6032189926114806, "language_loss": 0.82778466, "learning_rate": 1.40542296528014e-06, "loss": 0.85290062, "num_input_tokens_seen": 109004545, "step": 5058, "time_per_iteration": 2.7977454662323 }, { "auxiliary_loss_clip": 0.01341459, "auxiliary_loss_mlp": 0.01193955, "balance_loss_clip": 1.00882554, "balance_loss_mlp": 1.00035882, "epoch": 0.6083087837431612, "flos": 21284156448000.0, "grad_norm": 1.8938845837380076, "language_loss": 0.76080096, "learning_rate": 1.4046792578558452e-06, "loss": 0.7861551, "num_input_tokens_seen": 109022440, "step": 5059, "time_per_iteration": 2.811033248901367 }, { "auxiliary_loss_clip": 0.01321492, "auxiliary_loss_mlp": 0.01193828, "balance_loss_clip": 1.00787687, "balance_loss_mlp": 1.0002321, "epoch": 0.6084290266338003, "flos": 16471444381920.0, "grad_norm": 2.2644222585807654, "language_loss": 0.75961769, "learning_rate": 1.4039356407412325e-06, "loss": 0.78477085, "num_input_tokens_seen": 109035680, "step": 5060, "time_per_iteration": 2.9582254886627197 }, { "auxiliary_loss_clip": 0.01308978, "auxiliary_loss_mlp": 0.01193104, "balance_loss_clip": 1.0044539, "balance_loss_mlp": 1.00007999, "epoch": 0.6085492695244393, "flos": 66443604626880.0, "grad_norm": 0.7821920446760785, "language_loss": 0.5715102, "learning_rate": 1.40319211404911e-06, "loss": 0.59653103, "num_input_tokens_seen": 109090680, "step": 5061, "time_per_iteration": 3.3187193870544434 }, { "auxiliary_loss_clip": 0.01352306, "auxiliary_loss_mlp": 0.01193955, "balance_loss_clip": 1.00842023, "balance_loss_mlp": 1.00026357, "epoch": 0.6086695124150785, "flos": 23618996890560.0, "grad_norm": 1.699163896964262, "language_loss": 0.90539801, "learning_rate": 1.4024486778922691e-06, "loss": 0.93086058, "num_input_tokens_seen": 109108995, "step": 5062, "time_per_iteration": 2.744723320007324 }, { "auxiliary_loss_clip": 0.01326937, "auxiliary_loss_mlp": 0.01193999, "balance_loss_clip": 1.00805151, "balance_loss_mlp": 1.00021267, "epoch": 0.6087897553057176, "flos": 20157065310240.0, "grad_norm": 2.07933933001993, "language_loss": 0.77578145, "learning_rate": 1.4017053323834884e-06, "loss": 0.80099082, "num_input_tokens_seen": 109128825, "step": 5063, "time_per_iteration": 2.7794387340545654 }, { "auxiliary_loss_clip": 0.01320685, "auxiliary_loss_mlp": 0.0119394, "balance_loss_clip": 1.00798893, "balance_loss_mlp": 1.00024915, "epoch": 0.6089099981963566, "flos": 25482546675360.0, "grad_norm": 2.008355087053009, "language_loss": 0.75785863, "learning_rate": 1.4009620776355333e-06, "loss": 0.78300488, "num_input_tokens_seen": 109150425, "step": 5064, "time_per_iteration": 2.8004448413848877 }, { "auxiliary_loss_clip": 0.01341468, "auxiliary_loss_mlp": 0.01193981, "balance_loss_clip": 1.00846648, "balance_loss_mlp": 1.00019383, "epoch": 0.6090302410869958, "flos": 25332905537280.0, "grad_norm": 1.6576530394464533, "language_loss": 0.79171789, "learning_rate": 1.4002189137611553e-06, "loss": 0.81707239, "num_input_tokens_seen": 109169765, "step": 5065, "time_per_iteration": 2.7300610542297363 }, { "auxiliary_loss_clip": 0.01340233, "auxiliary_loss_mlp": 0.01194109, "balance_loss_clip": 1.00843096, "balance_loss_mlp": 1.00032282, "epoch": 0.6091504839776348, "flos": 23987369908800.0, "grad_norm": 1.7540416857518089, "language_loss": 0.6947304, "learning_rate": 1.3994758408730901e-06, "loss": 0.72007382, "num_input_tokens_seen": 109188950, "step": 5066, "time_per_iteration": 2.7746355533599854 }, { "auxiliary_loss_clip": 0.01306995, "auxiliary_loss_mlp": 0.01193927, "balance_loss_clip": 1.00772214, "balance_loss_mlp": 1.00023603, "epoch": 0.6092707268682739, "flos": 29643050629440.0, "grad_norm": 2.02125060304702, "language_loss": 0.76353812, "learning_rate": 1.3987328590840629e-06, "loss": 0.78854734, "num_input_tokens_seen": 109209895, "step": 5067, "time_per_iteration": 2.787785768508911 }, { "auxiliary_loss_clip": 0.01341014, "auxiliary_loss_mlp": 0.01193938, "balance_loss_clip": 1.00839508, "balance_loss_mlp": 1.00024712, "epoch": 0.609390969758913, "flos": 24024968792640.0, "grad_norm": 1.790133905380673, "language_loss": 0.8622719, "learning_rate": 1.397989968506783e-06, "loss": 0.8876214, "num_input_tokens_seen": 109228905, "step": 5068, "time_per_iteration": 2.771394968032837 }, { "auxiliary_loss_clip": 0.01354279, "auxiliary_loss_mlp": 0.01193964, "balance_loss_clip": 1.00905073, "balance_loss_mlp": 1.00027239, "epoch": 0.6095112126495521, "flos": 11102149336320.0, "grad_norm": 1.9278491786093, "language_loss": 0.72018158, "learning_rate": 1.3972471692539458e-06, "loss": 0.745664, "num_input_tokens_seen": 109243620, "step": 5069, "time_per_iteration": 2.6095352172851562 }, { "auxiliary_loss_clip": 0.0131929, "auxiliary_loss_mlp": 0.01194012, "balance_loss_clip": 1.00837231, "balance_loss_mlp": 1.00022507, "epoch": 0.6096314555401912, "flos": 17265498193440.0, "grad_norm": 1.934881926146032, "language_loss": 0.75175595, "learning_rate": 1.3965044614382348e-06, "loss": 0.77688891, "num_input_tokens_seen": 109259070, "step": 5070, "time_per_iteration": 2.7770538330078125 }, { "auxiliary_loss_clip": 0.01352858, "auxiliary_loss_mlp": 0.01194055, "balance_loss_clip": 1.00931978, "balance_loss_mlp": 1.00026822, "epoch": 0.6097516984308303, "flos": 21645919509120.0, "grad_norm": 3.760422181447774, "language_loss": 0.75537711, "learning_rate": 1.3957618451723162e-06, "loss": 0.78084624, "num_input_tokens_seen": 109275100, "step": 5071, "time_per_iteration": 2.7183172702789307 }, { "auxiliary_loss_clip": 0.01318689, "auxiliary_loss_mlp": 0.01194042, "balance_loss_clip": 1.00798821, "balance_loss_mlp": 1.00025558, "epoch": 0.6098719413214694, "flos": 27199221445440.0, "grad_norm": 2.1759105774714214, "language_loss": 0.71084762, "learning_rate": 1.3950193205688457e-06, "loss": 0.73597503, "num_input_tokens_seen": 109294825, "step": 5072, "time_per_iteration": 2.763890266418457 }, { "auxiliary_loss_clip": 0.0130904, "auxiliary_loss_mlp": 0.01194021, "balance_loss_clip": 1.00772548, "balance_loss_mlp": 1.00023472, "epoch": 0.6099921842121084, "flos": 20412964990080.0, "grad_norm": 4.341240863503528, "language_loss": 0.83986217, "learning_rate": 1.3942768877404627e-06, "loss": 0.86489284, "num_input_tokens_seen": 109313790, "step": 5073, "time_per_iteration": 2.7607226371765137 }, { "auxiliary_loss_clip": 0.01352695, "auxiliary_loss_mlp": 0.01194058, "balance_loss_clip": 1.00835502, "balance_loss_mlp": 1.00027156, "epoch": 0.6101124271027476, "flos": 23366150723520.0, "grad_norm": 1.4625950916547579, "language_loss": 0.73479617, "learning_rate": 1.393534546799795e-06, "loss": 0.76026374, "num_input_tokens_seen": 109333490, "step": 5074, "time_per_iteration": 3.73758602142334 }, { "auxiliary_loss_clip": 0.01314355, "auxiliary_loss_mlp": 0.01193833, "balance_loss_clip": 1.00870037, "balance_loss_mlp": 1.00023663, "epoch": 0.6102326699933867, "flos": 26687853169920.0, "grad_norm": 1.4663355189347471, "language_loss": 0.68069655, "learning_rate": 1.3927922978594536e-06, "loss": 0.70577848, "num_input_tokens_seen": 109354575, "step": 5075, "time_per_iteration": 3.726630449295044 }, { "auxiliary_loss_clip": 0.01299598, "auxiliary_loss_mlp": 0.0119318, "balance_loss_clip": 1.00403547, "balance_loss_mlp": 1.0001564, "epoch": 0.6103529128840257, "flos": 60644641259520.0, "grad_norm": 0.7708376237385455, "language_loss": 0.57453227, "learning_rate": 1.3920501410320387e-06, "loss": 0.59946001, "num_input_tokens_seen": 109410690, "step": 5076, "time_per_iteration": 3.254791021347046 }, { "auxiliary_loss_clip": 0.01315608, "auxiliary_loss_mlp": 0.01194101, "balance_loss_clip": 1.00847447, "balance_loss_mlp": 1.00031459, "epoch": 0.6104731557746649, "flos": 19021316565600.0, "grad_norm": 2.0655339626821396, "language_loss": 0.76194537, "learning_rate": 1.3913080764301333e-06, "loss": 0.7870425, "num_input_tokens_seen": 109427650, "step": 5077, "time_per_iteration": 3.747685670852661 }, { "auxiliary_loss_clip": 0.01298334, "auxiliary_loss_mlp": 0.01194052, "balance_loss_clip": 1.00790167, "balance_loss_mlp": 1.00026512, "epoch": 0.6105933986653039, "flos": 23366905120800.0, "grad_norm": 2.0978884751522884, "language_loss": 0.70924699, "learning_rate": 1.3905661041663085e-06, "loss": 0.73417091, "num_input_tokens_seen": 109448835, "step": 5078, "time_per_iteration": 3.7901487350463867 }, { "auxiliary_loss_clip": 0.01328031, "auxiliary_loss_mlp": 0.01194079, "balance_loss_clip": 1.00825119, "balance_loss_mlp": 1.0002923, "epoch": 0.610713641555943, "flos": 34637578241760.0, "grad_norm": 1.8366750623658457, "language_loss": 0.6473242, "learning_rate": 1.389824224353122e-06, "loss": 0.67254531, "num_input_tokens_seen": 109470425, "step": 5079, "time_per_iteration": 2.807830572128296 }, { "auxiliary_loss_clip": 0.01327722, "auxiliary_loss_mlp": 0.01194155, "balance_loss_clip": 1.00793242, "balance_loss_mlp": 1.00046396, "epoch": 0.610833884446582, "flos": 26646482299680.0, "grad_norm": 1.4421638407935957, "language_loss": 0.76664144, "learning_rate": 1.389082437103115e-06, "loss": 0.79186022, "num_input_tokens_seen": 109489695, "step": 5080, "time_per_iteration": 2.758880376815796 }, { "auxiliary_loss_clip": 0.01304369, "auxiliary_loss_mlp": 0.0119387, "balance_loss_clip": 1.00882483, "balance_loss_mlp": 1.00027394, "epoch": 0.6109541273372212, "flos": 21215137553280.0, "grad_norm": 3.4308516473309214, "language_loss": 0.77627397, "learning_rate": 1.3883407425288172e-06, "loss": 0.80125636, "num_input_tokens_seen": 109510030, "step": 5081, "time_per_iteration": 2.830305814743042 }, { "auxiliary_loss_clip": 0.01322174, "auxiliary_loss_mlp": 0.01194051, "balance_loss_clip": 1.00835478, "balance_loss_mlp": 1.00026441, "epoch": 0.6110743702278603, "flos": 20084094810720.0, "grad_norm": 4.098224911478228, "language_loss": 0.79703474, "learning_rate": 1.3875991407427417e-06, "loss": 0.82219702, "num_input_tokens_seen": 109528255, "step": 5082, "time_per_iteration": 2.7674782276153564 }, { "auxiliary_loss_clip": 0.01280299, "auxiliary_loss_mlp": 0.01193184, "balance_loss_clip": 1.00450158, "balance_loss_mlp": 1.0001601, "epoch": 0.6111946131184993, "flos": 68302987264800.0, "grad_norm": 0.78371413938583, "language_loss": 0.58219558, "learning_rate": 1.38685763185739e-06, "loss": 0.60693043, "num_input_tokens_seen": 109581915, "step": 5083, "time_per_iteration": 3.370037794113159 }, { "auxiliary_loss_clip": 0.01351794, "auxiliary_loss_mlp": 0.01193992, "balance_loss_clip": 1.00831449, "balance_loss_mlp": 1.00020504, "epoch": 0.6113148560091385, "flos": 19937686803840.0, "grad_norm": 2.3122645799452477, "language_loss": 0.68026435, "learning_rate": 1.3861162159852476e-06, "loss": 0.70572221, "num_input_tokens_seen": 109600050, "step": 5084, "time_per_iteration": 2.7148871421813965 }, { "auxiliary_loss_clip": 0.01316754, "auxiliary_loss_mlp": 0.01194057, "balance_loss_clip": 1.00826621, "balance_loss_mlp": 1.00027037, "epoch": 0.6114350988997775, "flos": 23731865389440.0, "grad_norm": 1.761420817012645, "language_loss": 0.79576033, "learning_rate": 1.3853748932387875e-06, "loss": 0.82086843, "num_input_tokens_seen": 109620690, "step": 5085, "time_per_iteration": 2.7762129306793213 }, { "auxiliary_loss_clip": 0.0131558, "auxiliary_loss_mlp": 0.01194014, "balance_loss_clip": 1.00827146, "balance_loss_mlp": 1.00022769, "epoch": 0.6115553417904166, "flos": 24023711463840.0, "grad_norm": 2.3438207717876467, "language_loss": 0.74763536, "learning_rate": 1.3846336637304671e-06, "loss": 0.77273136, "num_input_tokens_seen": 109638960, "step": 5086, "time_per_iteration": 2.8958897590637207 }, { "auxiliary_loss_clip": 0.01316848, "auxiliary_loss_mlp": 0.01194035, "balance_loss_clip": 1.00868607, "balance_loss_mlp": 1.00024796, "epoch": 0.6116755846810558, "flos": 23733553802400.0, "grad_norm": 1.8476203692323554, "language_loss": 0.83005893, "learning_rate": 1.3838925275727316e-06, "loss": 0.85516775, "num_input_tokens_seen": 109659700, "step": 5087, "time_per_iteration": 3.053229570388794 }, { "auxiliary_loss_clip": 0.01352515, "auxiliary_loss_mlp": 0.01194011, "balance_loss_clip": 1.00888288, "balance_loss_mlp": 1.00022459, "epoch": 0.6117958275716948, "flos": 18661637077920.0, "grad_norm": 1.6443826319046506, "language_loss": 0.79227638, "learning_rate": 1.3831514848780089e-06, "loss": 0.81774169, "num_input_tokens_seen": 109679275, "step": 5088, "time_per_iteration": 2.8584702014923096 }, { "auxiliary_loss_clip": 0.01340131, "auxiliary_loss_mlp": 0.01193947, "balance_loss_clip": 1.0087707, "balance_loss_mlp": 1.00025558, "epoch": 0.6119160704623339, "flos": 16471192916160.0, "grad_norm": 2.2060873711312077, "language_loss": 0.91730952, "learning_rate": 1.3824105357587152e-06, "loss": 0.94265032, "num_input_tokens_seen": 109696380, "step": 5089, "time_per_iteration": 2.7314400672912598 }, { "auxiliary_loss_clip": 0.01315966, "auxiliary_loss_mlp": 0.01194057, "balance_loss_clip": 1.00776029, "balance_loss_mlp": 1.00027001, "epoch": 0.612036313352973, "flos": 23915477119680.0, "grad_norm": 1.548009511503114, "language_loss": 0.82509971, "learning_rate": 1.381669680327253e-06, "loss": 0.8502, "num_input_tokens_seen": 109718060, "step": 5090, "time_per_iteration": 2.795274257659912 }, { "auxiliary_loss_clip": 0.01319622, "auxiliary_loss_mlp": 0.01193943, "balance_loss_clip": 1.00893474, "balance_loss_mlp": 1.00025177, "epoch": 0.6121565562436121, "flos": 26974777700160.0, "grad_norm": 2.1055435639130784, "language_loss": 0.70512581, "learning_rate": 1.380928918696008e-06, "loss": 0.73026145, "num_input_tokens_seen": 109736830, "step": 5091, "time_per_iteration": 2.8491430282592773 }, { "auxiliary_loss_clip": 0.01329937, "auxiliary_loss_mlp": 0.01194031, "balance_loss_clip": 1.00763965, "balance_loss_mlp": 1.00024486, "epoch": 0.6122767991342511, "flos": 15668876658240.0, "grad_norm": 2.136865772941721, "language_loss": 0.71649754, "learning_rate": 1.3801882509773548e-06, "loss": 0.74173725, "num_input_tokens_seen": 109754690, "step": 5092, "time_per_iteration": 2.7588868141174316 }, { "auxiliary_loss_clip": 0.01340387, "auxiliary_loss_mlp": 0.01193719, "balance_loss_clip": 1.00837684, "balance_loss_mlp": 1.00021875, "epoch": 0.6123970420248903, "flos": 27964333980000.0, "grad_norm": 1.8432434544114402, "language_loss": 0.81710744, "learning_rate": 1.3794476772836503e-06, "loss": 0.84244847, "num_input_tokens_seen": 109775790, "step": 5093, "time_per_iteration": 2.8132145404815674 }, { "auxiliary_loss_clip": 0.01293051, "auxiliary_loss_mlp": 0.01194135, "balance_loss_clip": 1.00820804, "balance_loss_mlp": 1.00034833, "epoch": 0.6125172849155294, "flos": 21468738117600.0, "grad_norm": 1.5723589112259917, "language_loss": 0.84594607, "learning_rate": 1.3787071977272402e-06, "loss": 0.87081796, "num_input_tokens_seen": 109795050, "step": 5094, "time_per_iteration": 2.784496784210205 }, { "auxiliary_loss_clip": 0.01257632, "auxiliary_loss_mlp": 0.01193966, "balance_loss_clip": 1.00705314, "balance_loss_mlp": 1.0002749, "epoch": 0.6126375278061684, "flos": 16248329812800.0, "grad_norm": 4.237385845419839, "language_loss": 0.71495998, "learning_rate": 1.3779668124204535e-06, "loss": 0.73947597, "num_input_tokens_seen": 109811465, "step": 5095, "time_per_iteration": 2.8019495010375977 }, { "auxiliary_loss_clip": 0.01302159, "auxiliary_loss_mlp": 0.01193837, "balance_loss_clip": 1.00786662, "balance_loss_mlp": 1.00024104, "epoch": 0.6127577706968076, "flos": 20448875460960.0, "grad_norm": 1.6773518343397622, "language_loss": 0.80786645, "learning_rate": 1.3772265214756074e-06, "loss": 0.83282644, "num_input_tokens_seen": 109831225, "step": 5096, "time_per_iteration": 2.754194974899292 }, { "auxiliary_loss_clip": 0.0134104, "auxiliary_loss_mlp": 0.0119388, "balance_loss_clip": 1.00810277, "balance_loss_mlp": 1.00028419, "epoch": 0.6128780135874466, "flos": 18260407101600.0, "grad_norm": 1.6355518203616297, "language_loss": 0.74896944, "learning_rate": 1.3764863250050025e-06, "loss": 0.77431858, "num_input_tokens_seen": 109849465, "step": 5097, "time_per_iteration": 2.712932825088501 }, { "auxiliary_loss_clip": 0.01303191, "auxiliary_loss_mlp": 0.01193915, "balance_loss_clip": 1.0079397, "balance_loss_mlp": 1.0002234, "epoch": 0.6129982564780857, "flos": 24937100036640.0, "grad_norm": 1.6503601142097195, "language_loss": 0.80409718, "learning_rate": 1.3757462231209272e-06, "loss": 0.82906818, "num_input_tokens_seen": 109869770, "step": 5098, "time_per_iteration": 2.9258744716644287 }, { "auxiliary_loss_clip": 0.01317871, "auxiliary_loss_mlp": 0.01194012, "balance_loss_clip": 1.00786662, "balance_loss_mlp": 1.00032032, "epoch": 0.6131184993687249, "flos": 22492049447520.0, "grad_norm": 1.9471532373502252, "language_loss": 0.88591892, "learning_rate": 1.3750062159356525e-06, "loss": 0.91103768, "num_input_tokens_seen": 109889120, "step": 5099, "time_per_iteration": 2.869265079498291 }, { "auxiliary_loss_clip": 0.01296183, "auxiliary_loss_mlp": 0.01194067, "balance_loss_clip": 1.0073421, "balance_loss_mlp": 1.0002799, "epoch": 0.6132387422593639, "flos": 15885848278080.0, "grad_norm": 1.732554709351943, "language_loss": 0.82927018, "learning_rate": 1.3742663035614382e-06, "loss": 0.85417271, "num_input_tokens_seen": 109906490, "step": 5100, "time_per_iteration": 2.8929312229156494 }, { "auxiliary_loss_clip": 0.0135273, "auxiliary_loss_mlp": 0.01193927, "balance_loss_clip": 1.00858963, "balance_loss_mlp": 1.00023603, "epoch": 0.613358985150003, "flos": 25411551978240.0, "grad_norm": 1.5870033789894513, "language_loss": 0.80299175, "learning_rate": 1.3735264861105283e-06, "loss": 0.82845843, "num_input_tokens_seen": 109927130, "step": 5101, "time_per_iteration": 4.668025016784668 }, { "auxiliary_loss_clip": 0.0129792, "auxiliary_loss_mlp": 0.01194009, "balance_loss_clip": 1.00721872, "balance_loss_mlp": 1.00031793, "epoch": 0.6134792280406421, "flos": 21361294094400.0, "grad_norm": 1.9010470551388154, "language_loss": 0.78515136, "learning_rate": 1.372786763695152e-06, "loss": 0.81007063, "num_input_tokens_seen": 109945890, "step": 5102, "time_per_iteration": 3.8563730716705322 }, { "auxiliary_loss_clip": 0.01341097, "auxiliary_loss_mlp": 0.01194122, "balance_loss_clip": 1.00860238, "balance_loss_mlp": 1.00033522, "epoch": 0.6135994709312812, "flos": 21211257795840.0, "grad_norm": 1.8676888928253212, "language_loss": 0.77592981, "learning_rate": 1.3720471364275257e-06, "loss": 0.80128199, "num_input_tokens_seen": 109965535, "step": 5103, "time_per_iteration": 2.7388792037963867 }, { "auxiliary_loss_clip": 0.01304336, "auxiliary_loss_mlp": 0.00872593, "balance_loss_clip": 1.00776029, "balance_loss_mlp": 1.00043094, "epoch": 0.6137197138219203, "flos": 14794056908640.0, "grad_norm": 1.7915853545411897, "language_loss": 0.77916026, "learning_rate": 1.3713076044198486e-06, "loss": 0.80092955, "num_input_tokens_seen": 109982345, "step": 5104, "time_per_iteration": 3.6702404022216797 }, { "auxiliary_loss_clip": 0.01317316, "auxiliary_loss_mlp": 0.01193992, "balance_loss_clip": 1.00818014, "balance_loss_mlp": 1.00030112, "epoch": 0.6138399567125594, "flos": 20084525894880.0, "grad_norm": 2.1137602067667642, "language_loss": 0.80578047, "learning_rate": 1.3705681677843086e-06, "loss": 0.83089352, "num_input_tokens_seen": 110000940, "step": 5105, "time_per_iteration": 2.8033182621002197 }, { "auxiliary_loss_clip": 0.01321818, "auxiliary_loss_mlp": 0.01193108, "balance_loss_clip": 1.00459027, "balance_loss_mlp": 1.00008416, "epoch": 0.6139601996031985, "flos": 60123874188960.0, "grad_norm": 0.7888106933567398, "language_loss": 0.60596204, "learning_rate": 1.3698288266330768e-06, "loss": 0.63111126, "num_input_tokens_seen": 110061565, "step": 5106, "time_per_iteration": 3.329770088195801 }, { "auxiliary_loss_clip": 0.01303124, "auxiliary_loss_mlp": 0.01193736, "balance_loss_clip": 1.00792038, "balance_loss_mlp": 1.0002358, "epoch": 0.6140804424938375, "flos": 23586714711360.0, "grad_norm": 2.0990654542093075, "language_loss": 0.72741812, "learning_rate": 1.3690895810783113e-06, "loss": 0.75238669, "num_input_tokens_seen": 110080360, "step": 5107, "time_per_iteration": 2.8848111629486084 }, { "auxiliary_loss_clip": 0.01254849, "auxiliary_loss_mlp": 0.00872643, "balance_loss_clip": 1.00751042, "balance_loss_mlp": 1.00055194, "epoch": 0.6142006853844767, "flos": 21398210428320.0, "grad_norm": 2.115465497971087, "language_loss": 0.71622407, "learning_rate": 1.3683504312321543e-06, "loss": 0.73749906, "num_input_tokens_seen": 110100695, "step": 5108, "time_per_iteration": 2.8360748291015625 }, { "auxiliary_loss_clip": 0.01340217, "auxiliary_loss_mlp": 0.01193837, "balance_loss_clip": 1.00837469, "balance_loss_mlp": 1.0002414, "epoch": 0.6143209282751158, "flos": 12057375787200.0, "grad_norm": 2.0355203636951336, "language_loss": 0.79992491, "learning_rate": 1.3676113772067355e-06, "loss": 0.82526541, "num_input_tokens_seen": 110117750, "step": 5109, "time_per_iteration": 2.804703950881958 }, { "auxiliary_loss_clip": 0.01278366, "auxiliary_loss_mlp": 0.01194012, "balance_loss_clip": 1.00688589, "balance_loss_mlp": 1.00032032, "epoch": 0.6144411711657548, "flos": 25082286638400.0, "grad_norm": 1.9517336424623524, "language_loss": 0.72311109, "learning_rate": 1.3668724191141671e-06, "loss": 0.7478348, "num_input_tokens_seen": 110137020, "step": 5110, "time_per_iteration": 2.899977207183838 }, { "auxiliary_loss_clip": 0.0128271, "auxiliary_loss_mlp": 0.01193967, "balance_loss_clip": 1.00837898, "balance_loss_mlp": 1.00027549, "epoch": 0.6145614140563939, "flos": 20114077874400.0, "grad_norm": 3.428079581790409, "language_loss": 0.66698813, "learning_rate": 1.3661335570665493e-06, "loss": 0.69175494, "num_input_tokens_seen": 110154930, "step": 5111, "time_per_iteration": 2.846686363220215 }, { "auxiliary_loss_clip": 0.01316083, "auxiliary_loss_mlp": 0.01193962, "balance_loss_clip": 1.00830305, "balance_loss_mlp": 1.00027025, "epoch": 0.614681656947033, "flos": 16800386408640.0, "grad_norm": 3.2351539839300987, "language_loss": 0.69912541, "learning_rate": 1.3653947911759676e-06, "loss": 0.72422588, "num_input_tokens_seen": 110172480, "step": 5112, "time_per_iteration": 2.8184988498687744 }, { "auxiliary_loss_clip": 0.01282559, "auxiliary_loss_mlp": 0.01193902, "balance_loss_clip": 1.00764275, "balance_loss_mlp": 1.00030661, "epoch": 0.6148018998376721, "flos": 38801602716480.0, "grad_norm": 1.5298756023167694, "language_loss": 0.74529988, "learning_rate": 1.3646561215544904e-06, "loss": 0.77006447, "num_input_tokens_seen": 110197120, "step": 5113, "time_per_iteration": 2.971740484237671 }, { "auxiliary_loss_clip": 0.01329702, "auxiliary_loss_mlp": 0.01193766, "balance_loss_clip": 1.00834894, "balance_loss_mlp": 1.00017023, "epoch": 0.6149221427283111, "flos": 23327043045120.0, "grad_norm": 1.9479807722733193, "language_loss": 0.79333615, "learning_rate": 1.363917548314176e-06, "loss": 0.81857085, "num_input_tokens_seen": 110216385, "step": 5114, "time_per_iteration": 2.744471788406372 }, { "auxiliary_loss_clip": 0.01326006, "auxiliary_loss_mlp": 0.01194011, "balance_loss_clip": 1.00855923, "balance_loss_mlp": 1.00031996, "epoch": 0.6150423856189503, "flos": 22379504261760.0, "grad_norm": 1.5557063977257293, "language_loss": 0.73349762, "learning_rate": 1.3631790715670626e-06, "loss": 0.75869775, "num_input_tokens_seen": 110234790, "step": 5115, "time_per_iteration": 2.6813254356384277 }, { "auxiliary_loss_clip": 0.01230864, "auxiliary_loss_mlp": 0.0119365, "balance_loss_clip": 1.00595021, "balance_loss_mlp": 1.00024509, "epoch": 0.6151626285095894, "flos": 18692087149440.0, "grad_norm": 1.7713696337626135, "language_loss": 0.85681629, "learning_rate": 1.3624406914251783e-06, "loss": 0.88106143, "num_input_tokens_seen": 110251910, "step": 5116, "time_per_iteration": 2.9864327907562256 }, { "auxiliary_loss_clip": 0.01340217, "auxiliary_loss_mlp": 0.01193866, "balance_loss_clip": 1.00857449, "balance_loss_mlp": 1.00026989, "epoch": 0.6152828714002284, "flos": 15851698067520.0, "grad_norm": 1.8764996527202904, "language_loss": 0.8858211, "learning_rate": 1.3617024080005335e-06, "loss": 0.9111619, "num_input_tokens_seen": 110268810, "step": 5117, "time_per_iteration": 2.920562505722046 }, { "auxiliary_loss_clip": 0.01304204, "auxiliary_loss_mlp": 0.00872622, "balance_loss_clip": 1.00810432, "balance_loss_mlp": 1.00051093, "epoch": 0.6154031142908676, "flos": 24869805478560.0, "grad_norm": 1.6012524192532123, "language_loss": 0.74096453, "learning_rate": 1.3609642214051266e-06, "loss": 0.7627328, "num_input_tokens_seen": 110293035, "step": 5118, "time_per_iteration": 2.9062328338623047 }, { "auxiliary_loss_clip": 0.01317138, "auxiliary_loss_mlp": 0.01194041, "balance_loss_clip": 1.00906181, "balance_loss_mlp": 1.00025439, "epoch": 0.6155233571815066, "flos": 19244754447840.0, "grad_norm": 2.0334853773212225, "language_loss": 0.65755808, "learning_rate": 1.3602261317509385e-06, "loss": 0.68266988, "num_input_tokens_seen": 110309695, "step": 5119, "time_per_iteration": 2.7400710582733154 }, { "auxiliary_loss_clip": 0.01332358, "auxiliary_loss_mlp": 0.01193837, "balance_loss_clip": 1.00814962, "balance_loss_mlp": 1.00024092, "epoch": 0.6156436000721457, "flos": 18770086964160.0, "grad_norm": 11.834442367027409, "language_loss": 0.83037221, "learning_rate": 1.3594881391499387e-06, "loss": 0.85563409, "num_input_tokens_seen": 110328610, "step": 5120, "time_per_iteration": 2.6952016353607178 }, { "auxiliary_loss_clip": 0.01318538, "auxiliary_loss_mlp": 0.01193902, "balance_loss_clip": 1.00846291, "balance_loss_mlp": 1.00030661, "epoch": 0.6157638429627849, "flos": 18041208213600.0, "grad_norm": 1.6734828023228474, "language_loss": 0.791219, "learning_rate": 1.3587502437140778e-06, "loss": 0.81634337, "num_input_tokens_seen": 110346775, "step": 5121, "time_per_iteration": 2.726499319076538 }, { "auxiliary_loss_clip": 0.01319406, "auxiliary_loss_mlp": 0.01193951, "balance_loss_clip": 1.00831926, "balance_loss_mlp": 1.00026023, "epoch": 0.6158840858534239, "flos": 25556738580000.0, "grad_norm": 2.802233267863766, "language_loss": 0.85160244, "learning_rate": 1.3580124455552952e-06, "loss": 0.87673599, "num_input_tokens_seen": 110366140, "step": 5122, "time_per_iteration": 2.7977592945098877 }, { "auxiliary_loss_clip": 0.01331185, "auxiliary_loss_mlp": 0.00872642, "balance_loss_clip": 1.0086205, "balance_loss_mlp": 1.00060415, "epoch": 0.616004328744063, "flos": 24640799425920.0, "grad_norm": 1.6464884931689174, "language_loss": 0.8747105, "learning_rate": 1.3572747447855148e-06, "loss": 0.89674878, "num_input_tokens_seen": 110386550, "step": 5123, "time_per_iteration": 2.899125099182129 }, { "auxiliary_loss_clip": 0.0135321, "auxiliary_loss_mlp": 0.01193928, "balance_loss_clip": 1.00915432, "balance_loss_mlp": 1.00023699, "epoch": 0.6161245716347021, "flos": 21689697265920.0, "grad_norm": 1.7567862076187557, "language_loss": 0.69034308, "learning_rate": 1.356537141516644e-06, "loss": 0.71581447, "num_input_tokens_seen": 110403970, "step": 5124, "time_per_iteration": 2.6454689502716064 }, { "auxiliary_loss_clip": 0.01328926, "auxiliary_loss_mlp": 0.01193867, "balance_loss_clip": 1.00848341, "balance_loss_mlp": 1.00027084, "epoch": 0.6162448145253412, "flos": 35189239677120.0, "grad_norm": 1.8754804438796495, "language_loss": 0.62220049, "learning_rate": 1.3557996358605775e-06, "loss": 0.64742839, "num_input_tokens_seen": 110423890, "step": 5125, "time_per_iteration": 2.8904330730438232 }, { "auxiliary_loss_clip": 0.01334295, "auxiliary_loss_mlp": 0.01193856, "balance_loss_clip": 1.00774193, "balance_loss_mlp": 1.00026035, "epoch": 0.6163650574159802, "flos": 21615289819200.0, "grad_norm": 1.9061167075210859, "language_loss": 0.70189226, "learning_rate": 1.3550622279291941e-06, "loss": 0.72717381, "num_input_tokens_seen": 110442035, "step": 5126, "time_per_iteration": 2.981999158859253 }, { "auxiliary_loss_clip": 0.01277073, "auxiliary_loss_mlp": 0.01193899, "balance_loss_clip": 1.00722051, "balance_loss_mlp": 1.00030351, "epoch": 0.6164853003066194, "flos": 24572175691680.0, "grad_norm": 2.0961264305854486, "language_loss": 0.83249259, "learning_rate": 1.354324917834358e-06, "loss": 0.85720229, "num_input_tokens_seen": 110463280, "step": 5127, "time_per_iteration": 4.7870941162109375 }, { "auxiliary_loss_clip": 0.0125629, "auxiliary_loss_mlp": 0.00872591, "balance_loss_clip": 1.00664818, "balance_loss_mlp": 1.00052679, "epoch": 0.6166055431972585, "flos": 21835997501760.0, "grad_norm": 4.4883233996601835, "language_loss": 0.76663649, "learning_rate": 1.353587705687918e-06, "loss": 0.78792536, "num_input_tokens_seen": 110481455, "step": 5128, "time_per_iteration": 3.8892409801483154 }, { "auxiliary_loss_clip": 0.01316599, "auxiliary_loss_mlp": 0.01194071, "balance_loss_clip": 1.00882244, "balance_loss_mlp": 1.00028408, "epoch": 0.6167257860878975, "flos": 17785272610080.0, "grad_norm": 2.5177113181693915, "language_loss": 0.72475672, "learning_rate": 1.3528505916017096e-06, "loss": 0.74986339, "num_input_tokens_seen": 110499155, "step": 5129, "time_per_iteration": 2.878056526184082 }, { "auxiliary_loss_clip": 0.01341749, "auxiliary_loss_mlp": 0.01194104, "balance_loss_clip": 1.00865197, "balance_loss_mlp": 1.00031757, "epoch": 0.6168460289785367, "flos": 23214821172480.0, "grad_norm": 2.015877196540846, "language_loss": 0.8855238, "learning_rate": 1.3521135756875514e-06, "loss": 0.91088229, "num_input_tokens_seen": 110515470, "step": 5130, "time_per_iteration": 2.725667715072632 }, { "auxiliary_loss_clip": 0.01261615, "auxiliary_loss_mlp": 0.01193937, "balance_loss_clip": 1.00709176, "balance_loss_mlp": 1.00024557, "epoch": 0.6169662718691757, "flos": 26213293457280.0, "grad_norm": 1.4189959185055292, "language_loss": 0.86353278, "learning_rate": 1.3513766580572496e-06, "loss": 0.88808835, "num_input_tokens_seen": 110538290, "step": 5131, "time_per_iteration": 4.0351762771606445 }, { "auxiliary_loss_clip": 0.0133309, "auxiliary_loss_mlp": 0.01193578, "balance_loss_clip": 1.00796306, "balance_loss_mlp": 1.00026822, "epoch": 0.6170865147598148, "flos": 19026130338720.0, "grad_norm": 1.979639880569778, "language_loss": 0.77178133, "learning_rate": 1.3506398388225924e-06, "loss": 0.79704797, "num_input_tokens_seen": 110555610, "step": 5132, "time_per_iteration": 2.7446787357330322 }, { "auxiliary_loss_clip": 0.01351904, "auxiliary_loss_mlp": 0.01193745, "balance_loss_clip": 1.00869477, "balance_loss_mlp": 1.00024414, "epoch": 0.617206757650454, "flos": 18260371177920.0, "grad_norm": 1.7283684036334033, "language_loss": 0.71603304, "learning_rate": 1.349903118095355e-06, "loss": 0.74148953, "num_input_tokens_seen": 110574745, "step": 5133, "time_per_iteration": 2.650312662124634 }, { "auxiliary_loss_clip": 0.01340945, "auxiliary_loss_mlp": 0.01194094, "balance_loss_clip": 1.00855255, "balance_loss_mlp": 1.00030732, "epoch": 0.617327000541093, "flos": 18186969594240.0, "grad_norm": 1.8798060123718434, "language_loss": 0.73344612, "learning_rate": 1.349166495987298e-06, "loss": 0.75879651, "num_input_tokens_seen": 110593310, "step": 5134, "time_per_iteration": 2.745400905609131 }, { "auxiliary_loss_clip": 0.01292175, "auxiliary_loss_mlp": 0.01193085, "balance_loss_clip": 1.00933146, "balance_loss_mlp": 1.00006092, "epoch": 0.6174472434317321, "flos": 61833831230880.0, "grad_norm": 0.8174223144725513, "language_loss": 0.60910338, "learning_rate": 1.348429972610166e-06, "loss": 0.63395596, "num_input_tokens_seen": 110657615, "step": 5135, "time_per_iteration": 3.423095703125 }, { "auxiliary_loss_clip": 0.0124862, "auxiliary_loss_mlp": 0.01193062, "balance_loss_clip": 1.0072937, "balance_loss_mlp": 1.00003815, "epoch": 0.6175674863223712, "flos": 71231003707680.0, "grad_norm": 0.8371688139484157, "language_loss": 0.5783478, "learning_rate": 1.3476935480756897e-06, "loss": 0.60276467, "num_input_tokens_seen": 110714365, "step": 5136, "time_per_iteration": 3.262916326522827 }, { "auxiliary_loss_clip": 0.01315662, "auxiliary_loss_mlp": 0.01194083, "balance_loss_clip": 1.00907087, "balance_loss_mlp": 1.00029588, "epoch": 0.6176877292130103, "flos": 21835458646560.0, "grad_norm": 2.2365926110376972, "language_loss": 0.75320816, "learning_rate": 1.346957222495583e-06, "loss": 0.77830553, "num_input_tokens_seen": 110732160, "step": 5137, "time_per_iteration": 2.872610330581665 }, { "auxiliary_loss_clip": 0.01304287, "auxiliary_loss_mlp": 0.0087263, "balance_loss_clip": 1.00845766, "balance_loss_mlp": 1.00069427, "epoch": 0.6178079721036493, "flos": 17741746319040.0, "grad_norm": 2.2393533656438342, "language_loss": 0.71099687, "learning_rate": 1.3462209959815466e-06, "loss": 0.73276603, "num_input_tokens_seen": 110746900, "step": 5138, "time_per_iteration": 2.7368509769439697 }, { "auxiliary_loss_clip": 0.01318515, "auxiliary_loss_mlp": 0.01193772, "balance_loss_clip": 1.00839913, "balance_loss_mlp": 1.00027156, "epoch": 0.6179282149942885, "flos": 22633140749760.0, "grad_norm": 1.8117021785035816, "language_loss": 0.74338275, "learning_rate": 1.345484868645265e-06, "loss": 0.76850563, "num_input_tokens_seen": 110765710, "step": 5139, "time_per_iteration": 2.8686702251434326 }, { "auxiliary_loss_clip": 0.012912, "auxiliary_loss_mlp": 0.01194017, "balance_loss_clip": 1.00791836, "balance_loss_mlp": 1.00032568, "epoch": 0.6180484578849276, "flos": 22310341672320.0, "grad_norm": 1.8328300448275912, "language_loss": 0.78821695, "learning_rate": 1.3447488405984088e-06, "loss": 0.81306916, "num_input_tokens_seen": 110783970, "step": 5140, "time_per_iteration": 3.046884536743164 }, { "auxiliary_loss_clip": 0.01306134, "auxiliary_loss_mlp": 0.01194038, "balance_loss_clip": 1.0076685, "balance_loss_mlp": 1.00025165, "epoch": 0.6181687007755666, "flos": 35225473461120.0, "grad_norm": 1.8633178739363514, "language_loss": 0.69803882, "learning_rate": 1.3440129119526322e-06, "loss": 0.72304058, "num_input_tokens_seen": 110806395, "step": 5141, "time_per_iteration": 2.954888343811035 }, { "auxiliary_loss_clip": 0.01321846, "auxiliary_loss_mlp": 0.01193153, "balance_loss_clip": 1.00471711, "balance_loss_mlp": 1.00012922, "epoch": 0.6182889436662057, "flos": 61547396423040.0, "grad_norm": 0.8100241866583724, "language_loss": 0.51249856, "learning_rate": 1.3432770828195762e-06, "loss": 0.53764862, "num_input_tokens_seen": 110867380, "step": 5142, "time_per_iteration": 3.4355618953704834 }, { "auxiliary_loss_clip": 0.01302977, "auxiliary_loss_mlp": 0.01193971, "balance_loss_clip": 1.00817096, "balance_loss_mlp": 1.00027955, "epoch": 0.6184091865568448, "flos": 19610002105920.0, "grad_norm": 2.4887791399650387, "language_loss": 0.70495671, "learning_rate": 1.3425413533108635e-06, "loss": 0.72992623, "num_input_tokens_seen": 110885980, "step": 5143, "time_per_iteration": 2.7956650257110596 }, { "auxiliary_loss_clip": 0.01267501, "auxiliary_loss_mlp": 0.01193995, "balance_loss_clip": 1.00695539, "balance_loss_mlp": 1.00020814, "epoch": 0.6185294294474839, "flos": 23586894329760.0, "grad_norm": 2.1384064531444946, "language_loss": 0.70461273, "learning_rate": 1.341805723538105e-06, "loss": 0.72922772, "num_input_tokens_seen": 110906085, "step": 5144, "time_per_iteration": 2.8610613346099854 }, { "auxiliary_loss_clip": 0.0130746, "auxiliary_loss_mlp": 0.01194007, "balance_loss_clip": 1.00803304, "balance_loss_mlp": 1.0003159, "epoch": 0.618649672338123, "flos": 26762045074560.0, "grad_norm": 1.6668311467746213, "language_loss": 0.77124083, "learning_rate": 1.3410701936128948e-06, "loss": 0.79625547, "num_input_tokens_seen": 110928865, "step": 5145, "time_per_iteration": 2.8108625411987305 }, { "auxiliary_loss_clip": 0.01328353, "auxiliary_loss_mlp": 0.01194018, "balance_loss_clip": 1.00829458, "balance_loss_mlp": 1.00032711, "epoch": 0.6187699152287621, "flos": 14456636893440.0, "grad_norm": 3.5382241302774444, "language_loss": 0.84881401, "learning_rate": 1.340334763646812e-06, "loss": 0.87403774, "num_input_tokens_seen": 110943000, "step": 5146, "time_per_iteration": 2.723051071166992 }, { "auxiliary_loss_clip": 0.01352777, "auxiliary_loss_mlp": 0.01194009, "balance_loss_clip": 1.00865769, "balance_loss_mlp": 1.00022221, "epoch": 0.6188901581194012, "flos": 20084777360640.0, "grad_norm": 1.7042657296525332, "language_loss": 0.74393117, "learning_rate": 1.3395994337514218e-06, "loss": 0.76939905, "num_input_tokens_seen": 110963170, "step": 5147, "time_per_iteration": 2.6667873859405518 }, { "auxiliary_loss_clip": 0.01338742, "auxiliary_loss_mlp": 0.01193862, "balance_loss_clip": 1.00826192, "balance_loss_mlp": 1.00026608, "epoch": 0.6190104010100402, "flos": 25700739700320.0, "grad_norm": 2.875990495882811, "language_loss": 0.78831613, "learning_rate": 1.3388642040382725e-06, "loss": 0.81364214, "num_input_tokens_seen": 110983595, "step": 5148, "time_per_iteration": 2.791705846786499 }, { "auxiliary_loss_clip": 0.01315859, "auxiliary_loss_mlp": 0.01193988, "balance_loss_clip": 1.00876176, "balance_loss_mlp": 1.00029659, "epoch": 0.6191306439006794, "flos": 30442385221920.0, "grad_norm": 2.0000206057439116, "language_loss": 0.84220678, "learning_rate": 1.3381290746188975e-06, "loss": 0.86730522, "num_input_tokens_seen": 111002965, "step": 5149, "time_per_iteration": 2.824955701828003 }, { "auxiliary_loss_clip": 0.01330703, "auxiliary_loss_mlp": 0.01194151, "balance_loss_clip": 1.00825047, "balance_loss_mlp": 1.00036478, "epoch": 0.6192508867913185, "flos": 26685805520160.0, "grad_norm": 1.6552696274774754, "language_loss": 0.67435271, "learning_rate": 1.3373940456048152e-06, "loss": 0.69960129, "num_input_tokens_seen": 111022990, "step": 5150, "time_per_iteration": 2.785792589187622 }, { "auxiliary_loss_clip": 0.01351975, "auxiliary_loss_mlp": 0.01193952, "balance_loss_clip": 1.00879049, "balance_loss_mlp": 1.00026035, "epoch": 0.6193711296819575, "flos": 36722051251200.0, "grad_norm": 1.4676485327015787, "language_loss": 0.59186494, "learning_rate": 1.3366591171075299e-06, "loss": 0.61732423, "num_input_tokens_seen": 111046495, "step": 5151, "time_per_iteration": 2.852828025817871 }, { "auxiliary_loss_clip": 0.01306784, "auxiliary_loss_mlp": 0.0119374, "balance_loss_clip": 1.00737834, "balance_loss_mlp": 1.00023961, "epoch": 0.6194913725725967, "flos": 25192568632320.0, "grad_norm": 2.898108247187488, "language_loss": 0.91357857, "learning_rate": 1.335924289238529e-06, "loss": 0.93858379, "num_input_tokens_seen": 111065705, "step": 5152, "time_per_iteration": 2.809114933013916 }, { "auxiliary_loss_clip": 0.01330033, "auxiliary_loss_mlp": 0.00872613, "balance_loss_clip": 1.00938368, "balance_loss_mlp": 1.00051796, "epoch": 0.6196116154632357, "flos": 21178831921920.0, "grad_norm": 1.6052163275561993, "language_loss": 0.76805317, "learning_rate": 1.3351895621092859e-06, "loss": 0.79007965, "num_input_tokens_seen": 111086050, "step": 5153, "time_per_iteration": 4.6419854164123535 }, { "auxiliary_loss_clip": 0.01247676, "auxiliary_loss_mlp": 0.011937, "balance_loss_clip": 1.00769305, "balance_loss_mlp": 1.00019908, "epoch": 0.6197318583538748, "flos": 16253754288480.0, "grad_norm": 1.7459946087704514, "language_loss": 0.76121461, "learning_rate": 1.3344549358312567e-06, "loss": 0.78562832, "num_input_tokens_seen": 111104450, "step": 5154, "time_per_iteration": 4.058720588684082 }, { "auxiliary_loss_clip": 0.01332452, "auxiliary_loss_mlp": 0.01193794, "balance_loss_clip": 1.00845695, "balance_loss_mlp": 1.00019801, "epoch": 0.619852101244514, "flos": 24425623990080.0, "grad_norm": 1.9306000060169501, "language_loss": 0.7796706, "learning_rate": 1.3337204105158852e-06, "loss": 0.80493307, "num_input_tokens_seen": 111123320, "step": 5155, "time_per_iteration": 2.97412371635437 }, { "auxiliary_loss_clip": 0.01315635, "auxiliary_loss_mlp": 0.01193654, "balance_loss_clip": 1.00776815, "balance_loss_mlp": 1.00024855, "epoch": 0.619972344135153, "flos": 16727308138080.0, "grad_norm": 1.9294424208559695, "language_loss": 0.7290293, "learning_rate": 1.332985986274597e-06, "loss": 0.75412214, "num_input_tokens_seen": 111140950, "step": 5156, "time_per_iteration": 2.815842628479004 }, { "auxiliary_loss_clip": 0.01252091, "auxiliary_loss_mlp": 0.00872434, "balance_loss_clip": 1.00688934, "balance_loss_mlp": 1.00047743, "epoch": 0.6200925870257921, "flos": 12495198784320.0, "grad_norm": 1.8881269842062662, "language_loss": 0.75163227, "learning_rate": 1.3322516632188047e-06, "loss": 0.77287757, "num_input_tokens_seen": 111157845, "step": 5157, "time_per_iteration": 4.0763099193573 }, { "auxiliary_loss_clip": 0.01294677, "auxiliary_loss_mlp": 0.01193794, "balance_loss_clip": 1.00730538, "balance_loss_mlp": 1.00029314, "epoch": 0.6202128299164312, "flos": 26539361589600.0, "grad_norm": 1.6312128774918793, "language_loss": 0.66796064, "learning_rate": 1.3315174414599045e-06, "loss": 0.69284534, "num_input_tokens_seen": 111179165, "step": 5158, "time_per_iteration": 2.830919027328491 }, { "auxiliary_loss_clip": 0.01340311, "auxiliary_loss_mlp": 0.01193805, "balance_loss_clip": 1.00869679, "balance_loss_mlp": 1.00020862, "epoch": 0.6203330728070703, "flos": 18770518048320.0, "grad_norm": 2.652186000643513, "language_loss": 0.75429738, "learning_rate": 1.3307833211092768e-06, "loss": 0.77963853, "num_input_tokens_seen": 111197830, "step": 5159, "time_per_iteration": 2.726872205734253 }, { "auxiliary_loss_clip": 0.01352399, "auxiliary_loss_mlp": 0.01193817, "balance_loss_clip": 1.00911307, "balance_loss_mlp": 1.00022125, "epoch": 0.6204533156977093, "flos": 20629792915200.0, "grad_norm": 1.4555701384115234, "language_loss": 0.75150007, "learning_rate": 1.3300493022782873e-06, "loss": 0.77696222, "num_input_tokens_seen": 111218400, "step": 5160, "time_per_iteration": 2.6625163555145264 }, { "auxiliary_loss_clip": 0.01270301, "auxiliary_loss_mlp": 0.00872646, "balance_loss_clip": 1.00712061, "balance_loss_mlp": 1.00046062, "epoch": 0.6205735585883485, "flos": 17348060315520.0, "grad_norm": 1.8176702311829793, "language_loss": 0.72443354, "learning_rate": 1.3293153850782855e-06, "loss": 0.74586302, "num_input_tokens_seen": 111236720, "step": 5161, "time_per_iteration": 2.8525595664978027 }, { "auxiliary_loss_clip": 0.01292241, "auxiliary_loss_mlp": 0.01193882, "balance_loss_clip": 1.00761104, "balance_loss_mlp": 1.0002861, "epoch": 0.6206938014789876, "flos": 22965028518240.0, "grad_norm": 1.7208236019755418, "language_loss": 0.71060288, "learning_rate": 1.3285815696206069e-06, "loss": 0.73546416, "num_input_tokens_seen": 111258265, "step": 5162, "time_per_iteration": 2.8728091716766357 }, { "auxiliary_loss_clip": 0.01298064, "auxiliary_loss_mlp": 0.01194059, "balance_loss_clip": 1.00744176, "balance_loss_mlp": 1.00027204, "epoch": 0.6208140443696266, "flos": 23983202761920.0, "grad_norm": 1.9619797529764718, "language_loss": 0.7705785, "learning_rate": 1.32784785601657e-06, "loss": 0.79549974, "num_input_tokens_seen": 111277675, "step": 5163, "time_per_iteration": 2.7605676651000977 }, { "auxiliary_loss_clip": 0.01321795, "auxiliary_loss_mlp": 0.01193865, "balance_loss_clip": 1.00803173, "balance_loss_mlp": 1.00026906, "epoch": 0.6209342872602658, "flos": 35077305193920.0, "grad_norm": 1.6126220063863537, "language_loss": 0.73854268, "learning_rate": 1.3271142443774798e-06, "loss": 0.76369923, "num_input_tokens_seen": 111299910, "step": 5164, "time_per_iteration": 2.9248459339141846 }, { "auxiliary_loss_clip": 0.01305748, "auxiliary_loss_mlp": 0.01193837, "balance_loss_clip": 1.00743294, "balance_loss_mlp": 1.0002408, "epoch": 0.6210545301509048, "flos": 26979340007520.0, "grad_norm": 1.8094541167324603, "language_loss": 0.81387329, "learning_rate": 1.3263807348146228e-06, "loss": 0.83886909, "num_input_tokens_seen": 111319765, "step": 5165, "time_per_iteration": 2.766150712966919 }, { "auxiliary_loss_clip": 0.0132853, "auxiliary_loss_mlp": 0.01194103, "balance_loss_clip": 1.0089643, "balance_loss_mlp": 1.00031638, "epoch": 0.6211747730415439, "flos": 33618254440320.0, "grad_norm": 2.1529312597282466, "language_loss": 0.73432851, "learning_rate": 1.3256473274392733e-06, "loss": 0.75955486, "num_input_tokens_seen": 111341110, "step": 5166, "time_per_iteration": 2.874368190765381 }, { "auxiliary_loss_clip": 0.01351733, "auxiliary_loss_mlp": 0.01193903, "balance_loss_clip": 1.0087049, "balance_loss_mlp": 1.00021124, "epoch": 0.6212950159321831, "flos": 34167113828640.0, "grad_norm": 1.8819191835486855, "language_loss": 0.69783759, "learning_rate": 1.3249140223626873e-06, "loss": 0.72329402, "num_input_tokens_seen": 111362730, "step": 5167, "time_per_iteration": 2.7735562324523926 }, { "auxiliary_loss_clip": 0.01329277, "auxiliary_loss_mlp": 0.01193877, "balance_loss_clip": 1.00780082, "balance_loss_mlp": 1.0002811, "epoch": 0.6214152588228221, "flos": 27965770927200.0, "grad_norm": 1.6692456678553629, "language_loss": 0.75286579, "learning_rate": 1.3241808196961077e-06, "loss": 0.77809733, "num_input_tokens_seen": 111383855, "step": 5168, "time_per_iteration": 2.8485522270202637 }, { "auxiliary_loss_clip": 0.01313829, "auxiliary_loss_mlp": 0.01193682, "balance_loss_clip": 1.00743937, "balance_loss_mlp": 1.00018156, "epoch": 0.6215355017134612, "flos": 20230215428160.0, "grad_norm": 1.7170563074664789, "language_loss": 0.70912904, "learning_rate": 1.3234477195507608e-06, "loss": 0.73420417, "num_input_tokens_seen": 111402685, "step": 5169, "time_per_iteration": 2.7407612800598145 }, { "auxiliary_loss_clip": 0.01287599, "auxiliary_loss_mlp": 0.01193691, "balance_loss_clip": 1.00666094, "balance_loss_mlp": 1.00028563, "epoch": 0.6216557446041003, "flos": 41428145538720.0, "grad_norm": 1.964339095314041, "language_loss": 0.62528831, "learning_rate": 1.322714722037857e-06, "loss": 0.65010118, "num_input_tokens_seen": 111424130, "step": 5170, "time_per_iteration": 2.929640293121338 }, { "auxiliary_loss_clip": 0.01309838, "auxiliary_loss_mlp": 0.01194101, "balance_loss_clip": 1.00811887, "balance_loss_mlp": 1.000314, "epoch": 0.6217759874947394, "flos": 27928782745920.0, "grad_norm": 2.3192373373570523, "language_loss": 0.77559531, "learning_rate": 1.321981827268591e-06, "loss": 0.80063474, "num_input_tokens_seen": 111444785, "step": 5171, "time_per_iteration": 2.850576639175415 }, { "auxiliary_loss_clip": 0.01319954, "auxiliary_loss_mlp": 0.01193619, "balance_loss_clip": 1.00831985, "balance_loss_mlp": 1.0002141, "epoch": 0.6218962303853784, "flos": 21765685354560.0, "grad_norm": 1.599624171928059, "language_loss": 0.81481707, "learning_rate": 1.3212490353541426e-06, "loss": 0.83995283, "num_input_tokens_seen": 111467045, "step": 5172, "time_per_iteration": 2.7133851051330566 }, { "auxiliary_loss_clip": 0.01352798, "auxiliary_loss_mlp": 0.01193918, "balance_loss_clip": 1.0086416, "balance_loss_mlp": 1.00022674, "epoch": 0.6220164732760175, "flos": 21246270174720.0, "grad_norm": 1.94686940458413, "language_loss": 0.8024832, "learning_rate": 1.3205163464056762e-06, "loss": 0.82795036, "num_input_tokens_seen": 111483650, "step": 5173, "time_per_iteration": 2.7607452869415283 }, { "auxiliary_loss_clip": 0.01327779, "auxiliary_loss_mlp": 0.01193773, "balance_loss_clip": 1.00772977, "balance_loss_mlp": 1.00017703, "epoch": 0.6221367161666567, "flos": 26136371352960.0, "grad_norm": 1.6683509524106344, "language_loss": 0.73067808, "learning_rate": 1.319783760534339e-06, "loss": 0.75589359, "num_input_tokens_seen": 111502895, "step": 5174, "time_per_iteration": 2.7536139488220215 }, { "auxiliary_loss_clip": 0.01328854, "auxiliary_loss_mlp": 0.01193923, "balance_loss_clip": 1.00797224, "balance_loss_mlp": 1.00032735, "epoch": 0.6222569590572957, "flos": 16284204360000.0, "grad_norm": 2.3081565160885247, "language_loss": 0.75201583, "learning_rate": 1.319051277851266e-06, "loss": 0.77724361, "num_input_tokens_seen": 111519180, "step": 5175, "time_per_iteration": 2.7511582374572754 }, { "auxiliary_loss_clip": 0.0132437, "auxiliary_loss_mlp": 0.01194068, "balance_loss_clip": 1.0087173, "balance_loss_mlp": 1.00028169, "epoch": 0.6223772019479348, "flos": 18223850004480.0, "grad_norm": 1.7763893877067005, "language_loss": 0.83836168, "learning_rate": 1.3183188984675716e-06, "loss": 0.86354601, "num_input_tokens_seen": 111537545, "step": 5176, "time_per_iteration": 2.703672409057617 }, { "auxiliary_loss_clip": 0.01304293, "auxiliary_loss_mlp": 0.01194053, "balance_loss_clip": 1.00771677, "balance_loss_mlp": 1.00026655, "epoch": 0.6224974448385739, "flos": 27489810191040.0, "grad_norm": 2.9505031152385155, "language_loss": 0.71397793, "learning_rate": 1.3175866224943586e-06, "loss": 0.7389614, "num_input_tokens_seen": 111556265, "step": 5177, "time_per_iteration": 2.7869441509246826 }, { "auxiliary_loss_clip": 0.01311804, "auxiliary_loss_mlp": 0.01193835, "balance_loss_clip": 1.00749123, "balance_loss_mlp": 1.00023866, "epoch": 0.622617687729213, "flos": 19791961346880.0, "grad_norm": 2.0657288535775327, "language_loss": 0.73601794, "learning_rate": 1.316854450042712e-06, "loss": 0.7610743, "num_input_tokens_seen": 111574205, "step": 5178, "time_per_iteration": 2.8059160709381104 }, { "auxiliary_loss_clip": 0.01335378, "auxiliary_loss_mlp": 0.01193987, "balance_loss_clip": 1.00815177, "balance_loss_mlp": 1.00020063, "epoch": 0.622737930619852, "flos": 23038897109760.0, "grad_norm": 2.267168239926566, "language_loss": 0.74222475, "learning_rate": 1.3161223812237024e-06, "loss": 0.7675184, "num_input_tokens_seen": 111593560, "step": 5179, "time_per_iteration": 3.659921407699585 }, { "auxiliary_loss_clip": 0.01351645, "auxiliary_loss_mlp": 0.01193891, "balance_loss_clip": 1.0084157, "balance_loss_mlp": 1.00019944, "epoch": 0.6228581735104912, "flos": 12634278360480.0, "grad_norm": 2.141215826027485, "language_loss": 0.85398751, "learning_rate": 1.3153904161483842e-06, "loss": 0.87944281, "num_input_tokens_seen": 111608860, "step": 5180, "time_per_iteration": 3.7219460010528564 }, { "auxiliary_loss_clip": 0.01305405, "auxiliary_loss_mlp": 0.01193892, "balance_loss_clip": 1.00767374, "balance_loss_mlp": 1.00020051, "epoch": 0.6229784164011303, "flos": 23802824162880.0, "grad_norm": 1.9803901892288696, "language_loss": 0.85308534, "learning_rate": 1.3146585549277953e-06, "loss": 0.87807828, "num_input_tokens_seen": 111627500, "step": 5181, "time_per_iteration": 2.8492636680603027 }, { "auxiliary_loss_clip": 0.01312527, "auxiliary_loss_mlp": 0.01193868, "balance_loss_clip": 1.00873518, "balance_loss_mlp": 1.0002718, "epoch": 0.6230986592917693, "flos": 22414229251200.0, "grad_norm": 2.1948572733329397, "language_loss": 0.78242826, "learning_rate": 1.3139267976729591e-06, "loss": 0.80749226, "num_input_tokens_seen": 111647690, "step": 5182, "time_per_iteration": 2.742490768432617 }, { "auxiliary_loss_clip": 0.01334639, "auxiliary_loss_mlp": 0.01193824, "balance_loss_clip": 1.00842261, "balance_loss_mlp": 1.00022852, "epoch": 0.6232189021824085, "flos": 34528230263520.0, "grad_norm": 1.6989098044252378, "language_loss": 0.71448261, "learning_rate": 1.3131951444948815e-06, "loss": 0.73976725, "num_input_tokens_seen": 111667090, "step": 5183, "time_per_iteration": 3.8716964721679688 }, { "auxiliary_loss_clip": 0.01313095, "auxiliary_loss_mlp": 0.0119378, "balance_loss_clip": 1.00796115, "balance_loss_mlp": 1.00027943, "epoch": 0.6233391450730476, "flos": 22237011936000.0, "grad_norm": 1.7979324277621498, "language_loss": 0.76386237, "learning_rate": 1.3124635955045546e-06, "loss": 0.78893113, "num_input_tokens_seen": 111686905, "step": 5184, "time_per_iteration": 2.8181047439575195 }, { "auxiliary_loss_clip": 0.01276639, "auxiliary_loss_mlp": 0.00872426, "balance_loss_clip": 1.00793409, "balance_loss_mlp": 1.00049758, "epoch": 0.6234593879636866, "flos": 20332701983520.0, "grad_norm": 1.8478591590067392, "language_loss": 0.84320241, "learning_rate": 1.3117321508129537e-06, "loss": 0.86469305, "num_input_tokens_seen": 111704985, "step": 5185, "time_per_iteration": 2.835568904876709 }, { "auxiliary_loss_clip": 0.0131559, "auxiliary_loss_mlp": 0.01193956, "balance_loss_clip": 1.00830436, "balance_loss_mlp": 1.00026429, "epoch": 0.6235796308543258, "flos": 20664913065120.0, "grad_norm": 1.4700562578507284, "language_loss": 0.7637403, "learning_rate": 1.3110008105310388e-06, "loss": 0.78883582, "num_input_tokens_seen": 111724805, "step": 5186, "time_per_iteration": 2.80831241607666 }, { "auxiliary_loss_clip": 0.01351961, "auxiliary_loss_mlp": 0.01194033, "balance_loss_clip": 1.00809741, "balance_loss_mlp": 1.00024676, "epoch": 0.6236998737449648, "flos": 26618654656800.0, "grad_norm": 1.5472950776952819, "language_loss": 0.78089607, "learning_rate": 1.3102695747697526e-06, "loss": 0.80635595, "num_input_tokens_seen": 111747675, "step": 5187, "time_per_iteration": 2.74873685836792 }, { "auxiliary_loss_clip": 0.01248055, "auxiliary_loss_mlp": 0.0119397, "balance_loss_clip": 1.00721109, "balance_loss_mlp": 1.00027895, "epoch": 0.6238201166356039, "flos": 12674607444000.0, "grad_norm": 2.243585291629029, "language_loss": 0.9039588, "learning_rate": 1.3095384436400237e-06, "loss": 0.92837906, "num_input_tokens_seen": 111759205, "step": 5188, "time_per_iteration": 2.8425185680389404 }, { "auxiliary_loss_clip": 0.01311628, "auxiliary_loss_mlp": 0.01193879, "balance_loss_clip": 1.0084064, "balance_loss_mlp": 1.00028288, "epoch": 0.623940359526243, "flos": 10452168492480.0, "grad_norm": 1.9634656028601898, "language_loss": 0.82353973, "learning_rate": 1.3088074172527633e-06, "loss": 0.84859478, "num_input_tokens_seen": 111776335, "step": 5189, "time_per_iteration": 2.6780285835266113 }, { "auxiliary_loss_clip": 0.01316498, "auxiliary_loss_mlp": 0.01194016, "balance_loss_clip": 1.00772023, "balance_loss_mlp": 1.00032473, "epoch": 0.6240606024168821, "flos": 29059538099040.0, "grad_norm": 3.9281689263137207, "language_loss": 0.71422303, "learning_rate": 1.3080764957188684e-06, "loss": 0.73932821, "num_input_tokens_seen": 111796580, "step": 5190, "time_per_iteration": 2.8093369007110596 }, { "auxiliary_loss_clip": 0.0128198, "auxiliary_loss_mlp": 0.01193988, "balance_loss_clip": 1.0083195, "balance_loss_mlp": 1.00020134, "epoch": 0.6241808453075212, "flos": 22018100437440.0, "grad_norm": 1.738003413839924, "language_loss": 0.70619929, "learning_rate": 1.3073456791492192e-06, "loss": 0.730959, "num_input_tokens_seen": 111816290, "step": 5191, "time_per_iteration": 2.8232269287109375 }, { "auxiliary_loss_clip": 0.01318208, "auxiliary_loss_mlp": 0.011937, "balance_loss_clip": 1.00755191, "balance_loss_mlp": 1.00019944, "epoch": 0.6243010881981603, "flos": 21138718380480.0, "grad_norm": 1.6901120700875658, "language_loss": 0.78186512, "learning_rate": 1.3066149676546801e-06, "loss": 0.80698419, "num_input_tokens_seen": 111834470, "step": 5192, "time_per_iteration": 2.7858383655548096 }, { "auxiliary_loss_clip": 0.01304804, "auxiliary_loss_mlp": 0.01194033, "balance_loss_clip": 1.00817943, "balance_loss_mlp": 1.00024641, "epoch": 0.6244213310887994, "flos": 22344958890720.0, "grad_norm": 1.6822231477934804, "language_loss": 0.66273224, "learning_rate": 1.3058843613460985e-06, "loss": 0.6877206, "num_input_tokens_seen": 111852410, "step": 5193, "time_per_iteration": 2.866260051727295 }, { "auxiliary_loss_clip": 0.01288547, "auxiliary_loss_mlp": 0.01194083, "balance_loss_clip": 1.00788903, "balance_loss_mlp": 1.00029647, "epoch": 0.6245415739794384, "flos": 15231987676800.0, "grad_norm": 1.722641190725361, "language_loss": 0.74062485, "learning_rate": 1.3051538603343075e-06, "loss": 0.76545113, "num_input_tokens_seen": 111870340, "step": 5194, "time_per_iteration": 2.7702996730804443 }, { "auxiliary_loss_clip": 0.01330502, "auxiliary_loss_mlp": 0.01193794, "balance_loss_clip": 1.00808048, "balance_loss_mlp": 1.00029325, "epoch": 0.6246618168700776, "flos": 18879901950240.0, "grad_norm": 2.9482280681449797, "language_loss": 0.67447257, "learning_rate": 1.3044234647301235e-06, "loss": 0.6997155, "num_input_tokens_seen": 111888365, "step": 5195, "time_per_iteration": 2.8112294673919678 }, { "auxiliary_loss_clip": 0.01326775, "auxiliary_loss_mlp": 0.01193987, "balance_loss_clip": 1.00779033, "balance_loss_mlp": 1.000296, "epoch": 0.6247820597607167, "flos": 14319209806560.0, "grad_norm": 1.8227051877826812, "language_loss": 0.72355497, "learning_rate": 1.303693174644347e-06, "loss": 0.74876255, "num_input_tokens_seen": 111905840, "step": 5196, "time_per_iteration": 2.684971332550049 }, { "auxiliary_loss_clip": 0.01318592, "auxiliary_loss_mlp": 0.01194037, "balance_loss_clip": 1.00805712, "balance_loss_mlp": 1.0002507, "epoch": 0.6249023026513557, "flos": 22637990446560.0, "grad_norm": 1.7692147746247173, "language_loss": 0.80570376, "learning_rate": 1.3029629901877625e-06, "loss": 0.83083004, "num_input_tokens_seen": 111925215, "step": 5197, "time_per_iteration": 2.7525594234466553 }, { "auxiliary_loss_clip": 0.01325612, "auxiliary_loss_mlp": 0.01193919, "balance_loss_clip": 1.00920296, "balance_loss_mlp": 1.00032282, "epoch": 0.6250225455419949, "flos": 20266700677920.0, "grad_norm": 3.0100255163114324, "language_loss": 0.77143133, "learning_rate": 1.3022329114711376e-06, "loss": 0.79662663, "num_input_tokens_seen": 111943925, "step": 5198, "time_per_iteration": 2.632596492767334 }, { "auxiliary_loss_clip": 0.01315457, "auxiliary_loss_mlp": 0.01193722, "balance_loss_clip": 1.00799584, "balance_loss_mlp": 1.00022197, "epoch": 0.6251427884326339, "flos": 23437863894240.0, "grad_norm": 1.920599795975153, "language_loss": 0.69712651, "learning_rate": 1.3015029386052256e-06, "loss": 0.72221822, "num_input_tokens_seen": 111964095, "step": 5199, "time_per_iteration": 2.7176036834716797 }, { "auxiliary_loss_clip": 0.01289165, "auxiliary_loss_mlp": 0.0119421, "balance_loss_clip": 1.00773311, "balance_loss_mlp": 1.00042379, "epoch": 0.625263031323273, "flos": 31723069102560.0, "grad_norm": 1.715914937712217, "language_loss": 0.72848445, "learning_rate": 1.3007730717007622e-06, "loss": 0.75331819, "num_input_tokens_seen": 111984910, "step": 5200, "time_per_iteration": 2.776437282562256 }, { "auxiliary_loss_clip": 0.01352961, "auxiliary_loss_mlp": 0.01193934, "balance_loss_clip": 1.00900102, "balance_loss_mlp": 1.00024307, "epoch": 0.6253832742139122, "flos": 24134352694560.0, "grad_norm": 1.6163213327488593, "language_loss": 0.75666118, "learning_rate": 1.3000433108684676e-06, "loss": 0.78213012, "num_input_tokens_seen": 112005410, "step": 5201, "time_per_iteration": 2.6507985591888428 }, { "auxiliary_loss_clip": 0.01325949, "auxiliary_loss_mlp": 0.01193933, "balance_loss_clip": 1.00816679, "balance_loss_mlp": 1.0002414, "epoch": 0.6255035171045512, "flos": 27668823690240.0, "grad_norm": 2.0810602022069036, "language_loss": 0.80097002, "learning_rate": 1.2993136562190467e-06, "loss": 0.82616889, "num_input_tokens_seen": 112024530, "step": 5202, "time_per_iteration": 2.664829969406128 }, { "auxiliary_loss_clip": 0.01318153, "auxiliary_loss_mlp": 0.01193961, "balance_loss_clip": 1.00789392, "balance_loss_mlp": 1.00026989, "epoch": 0.6256237599951903, "flos": 20227808541600.0, "grad_norm": 1.588536011056006, "language_loss": 0.70205426, "learning_rate": 1.2985841078631871e-06, "loss": 0.72717535, "num_input_tokens_seen": 112043850, "step": 5203, "time_per_iteration": 2.8349390029907227 }, { "auxiliary_loss_clip": 0.01263458, "auxiliary_loss_mlp": 0.01193927, "balance_loss_clip": 1.00629866, "balance_loss_mlp": 1.00023568, "epoch": 0.6257440028858293, "flos": 24170586478560.0, "grad_norm": 1.6501003928482296, "language_loss": 0.77977067, "learning_rate": 1.2978546659115608e-06, "loss": 0.80434453, "num_input_tokens_seen": 112061930, "step": 5204, "time_per_iteration": 2.953105926513672 }, { "auxiliary_loss_clip": 0.01318629, "auxiliary_loss_mlp": 0.01193657, "balance_loss_clip": 1.0082984, "balance_loss_mlp": 1.00025177, "epoch": 0.6258642457764685, "flos": 15851949533280.0, "grad_norm": 7.779916584864135, "language_loss": 0.85345852, "learning_rate": 1.2971253304748228e-06, "loss": 0.87858146, "num_input_tokens_seen": 112079645, "step": 5205, "time_per_iteration": 3.8958113193511963 }, { "auxiliary_loss_clip": 0.01330237, "auxiliary_loss_mlp": 0.01193903, "balance_loss_clip": 1.00892019, "balance_loss_mlp": 1.00021183, "epoch": 0.6259844886671075, "flos": 11911362940800.0, "grad_norm": 1.5628436585370322, "language_loss": 0.74887109, "learning_rate": 1.296396101663614e-06, "loss": 0.77411246, "num_input_tokens_seen": 112096205, "step": 5206, "time_per_iteration": 3.723518133163452 }, { "auxiliary_loss_clip": 0.01339413, "auxiliary_loss_mlp": 0.01193875, "balance_loss_clip": 1.008793, "balance_loss_mlp": 1.00018382, "epoch": 0.6261047315577466, "flos": 15887967775200.0, "grad_norm": 1.9910846243688356, "language_loss": 0.84368002, "learning_rate": 1.2956669795885565e-06, "loss": 0.86901295, "num_input_tokens_seen": 112112835, "step": 5207, "time_per_iteration": 3.80971360206604 }, { "auxiliary_loss_clip": 0.01293238, "auxiliary_loss_mlp": 0.01193867, "balance_loss_clip": 1.0080862, "balance_loss_mlp": 1.00027108, "epoch": 0.6262249744483858, "flos": 31248940474080.0, "grad_norm": 1.6654637341083518, "language_loss": 0.68170696, "learning_rate": 1.294937964360259e-06, "loss": 0.70657802, "num_input_tokens_seen": 112133105, "step": 5208, "time_per_iteration": 2.8594014644622803 }, { "auxiliary_loss_clip": 0.01304995, "auxiliary_loss_mlp": 0.01194052, "balance_loss_clip": 1.00803375, "balance_loss_mlp": 1.000265, "epoch": 0.6263452173390248, "flos": 27198610742880.0, "grad_norm": 2.0226009143559, "language_loss": 0.71175623, "learning_rate": 1.2942090560893108e-06, "loss": 0.73674673, "num_input_tokens_seen": 112152510, "step": 5209, "time_per_iteration": 3.7421226501464844 }, { "auxiliary_loss_clip": 0.01352489, "auxiliary_loss_mlp": 0.01193951, "balance_loss_clip": 1.00895977, "balance_loss_mlp": 1.00025988, "epoch": 0.6264654602296639, "flos": 37342084955040.0, "grad_norm": 1.6597650291487647, "language_loss": 0.60457098, "learning_rate": 1.2934802548862882e-06, "loss": 0.6300354, "num_input_tokens_seen": 112175295, "step": 5210, "time_per_iteration": 2.8686435222625732 }, { "auxiliary_loss_clip": 0.01303922, "auxiliary_loss_mlp": 0.01193731, "balance_loss_clip": 1.00663567, "balance_loss_mlp": 1.00023019, "epoch": 0.626585703120303, "flos": 14756960956320.0, "grad_norm": 1.9252855229211825, "language_loss": 0.82722509, "learning_rate": 1.292751560861749e-06, "loss": 0.85220158, "num_input_tokens_seen": 112190200, "step": 5211, "time_per_iteration": 2.8594555854797363 }, { "auxiliary_loss_clip": 0.01352812, "auxiliary_loss_mlp": 0.01193834, "balance_loss_clip": 1.00891304, "balance_loss_mlp": 1.00023794, "epoch": 0.6267059460109421, "flos": 22347329853600.0, "grad_norm": 2.132404213234613, "language_loss": 0.79495108, "learning_rate": 1.2920229741262354e-06, "loss": 0.82041752, "num_input_tokens_seen": 112208205, "step": 5212, "time_per_iteration": 2.78145432472229 }, { "auxiliary_loss_clip": 0.01317293, "auxiliary_loss_mlp": 0.01193685, "balance_loss_clip": 1.00795794, "balance_loss_mlp": 1.0001843, "epoch": 0.6268261889015811, "flos": 17748823284000.0, "grad_norm": 1.9654892134603352, "language_loss": 0.75501513, "learning_rate": 1.2912944947902739e-06, "loss": 0.7801249, "num_input_tokens_seen": 112224690, "step": 5213, "time_per_iteration": 2.870033025741577 }, { "auxiliary_loss_clip": 0.01320496, "auxiliary_loss_mlp": 0.0119403, "balance_loss_clip": 1.00809908, "balance_loss_mlp": 1.00024295, "epoch": 0.6269464317922203, "flos": 32846496024960.0, "grad_norm": 6.419028069632002, "language_loss": 0.71648031, "learning_rate": 1.2905661229643742e-06, "loss": 0.74162555, "num_input_tokens_seen": 112244450, "step": 5214, "time_per_iteration": 2.859229326248169 }, { "auxiliary_loss_clip": 0.01352272, "auxiliary_loss_mlp": 0.01193773, "balance_loss_clip": 1.00868964, "balance_loss_mlp": 1.00017703, "epoch": 0.6270666746828594, "flos": 17929201883040.0, "grad_norm": 2.2805477407080335, "language_loss": 0.83876866, "learning_rate": 1.2898378587590299e-06, "loss": 0.86422914, "num_input_tokens_seen": 112261050, "step": 5215, "time_per_iteration": 2.681528329849243 }, { "auxiliary_loss_clip": 0.01326538, "auxiliary_loss_mlp": 0.01193816, "balance_loss_clip": 1.00805068, "balance_loss_mlp": 1.00022018, "epoch": 0.6271869175734984, "flos": 17457336446400.0, "grad_norm": 1.7862568070737341, "language_loss": 0.8740989, "learning_rate": 1.2891097022847173e-06, "loss": 0.89930248, "num_input_tokens_seen": 112278395, "step": 5216, "time_per_iteration": 2.789303779602051 }, { "auxiliary_loss_clip": 0.01305852, "auxiliary_loss_mlp": 0.01193974, "balance_loss_clip": 1.00717092, "balance_loss_mlp": 1.00037849, "epoch": 0.6273071604641376, "flos": 26868626929440.0, "grad_norm": 1.7405522601025414, "language_loss": 0.66461301, "learning_rate": 1.2883816536518978e-06, "loss": 0.68961132, "num_input_tokens_seen": 112299535, "step": 5217, "time_per_iteration": 2.8250176906585693 }, { "auxiliary_loss_clip": 0.01329501, "auxiliary_loss_mlp": 0.01193503, "balance_loss_clip": 1.00787497, "balance_loss_mlp": 1.00019312, "epoch": 0.6274274033547766, "flos": 26062394990400.0, "grad_norm": 1.655641936493639, "language_loss": 0.81912982, "learning_rate": 1.2876537129710155e-06, "loss": 0.84435987, "num_input_tokens_seen": 112317265, "step": 5218, "time_per_iteration": 2.772900342941284 }, { "auxiliary_loss_clip": 0.01303503, "auxiliary_loss_mlp": 0.01193985, "balance_loss_clip": 1.00917244, "balance_loss_mlp": 1.00029397, "epoch": 0.6275476462454157, "flos": 20266269593760.0, "grad_norm": 2.1838471869865375, "language_loss": 0.74990684, "learning_rate": 1.286925880352499e-06, "loss": 0.77488172, "num_input_tokens_seen": 112336125, "step": 5219, "time_per_iteration": 2.848653793334961 }, { "auxiliary_loss_clip": 0.0131637, "auxiliary_loss_mlp": 0.01193835, "balance_loss_clip": 1.00850177, "balance_loss_mlp": 1.00023913, "epoch": 0.6276678891360549, "flos": 26320414167360.0, "grad_norm": 1.5264902685458535, "language_loss": 0.71177882, "learning_rate": 1.2861981559067592e-06, "loss": 0.7368809, "num_input_tokens_seen": 112356730, "step": 5220, "time_per_iteration": 2.778984546661377 }, { "auxiliary_loss_clip": 0.01277649, "auxiliary_loss_mlp": 0.01193824, "balance_loss_clip": 1.00711441, "balance_loss_mlp": 1.0002284, "epoch": 0.6277881320266939, "flos": 13912519430880.0, "grad_norm": 2.289930397229792, "language_loss": 0.80011272, "learning_rate": 1.2854705397441917e-06, "loss": 0.82482743, "num_input_tokens_seen": 112372270, "step": 5221, "time_per_iteration": 2.9408648014068604 }, { "auxiliary_loss_clip": 0.01295569, "auxiliary_loss_mlp": 0.0119358, "balance_loss_clip": 1.007514, "balance_loss_mlp": 1.00017536, "epoch": 0.627908374917333, "flos": 27048933681120.0, "grad_norm": 2.47948968060986, "language_loss": 0.77378595, "learning_rate": 1.2847430319751747e-06, "loss": 0.79867744, "num_input_tokens_seen": 112390365, "step": 5222, "time_per_iteration": 3.2566637992858887 }, { "auxiliary_loss_clip": 0.01324728, "auxiliary_loss_mlp": 0.0119389, "balance_loss_clip": 1.00823975, "balance_loss_mlp": 1.00029397, "epoch": 0.6280286178079721, "flos": 23769212807520.0, "grad_norm": 2.1672287969569806, "language_loss": 0.67532742, "learning_rate": 1.2840156327100712e-06, "loss": 0.7005136, "num_input_tokens_seen": 112407490, "step": 5223, "time_per_iteration": 2.7072601318359375 }, { "auxiliary_loss_clip": 0.01351633, "auxiliary_loss_mlp": 0.01193638, "balance_loss_clip": 1.00853276, "balance_loss_mlp": 1.00023293, "epoch": 0.6281488606986112, "flos": 26359162608960.0, "grad_norm": 1.8453394093871787, "language_loss": 0.72492468, "learning_rate": 1.2832883420592272e-06, "loss": 0.75037742, "num_input_tokens_seen": 112426385, "step": 5224, "time_per_iteration": 2.732135772705078 }, { "auxiliary_loss_clip": 0.01316065, "auxiliary_loss_mlp": 0.01194006, "balance_loss_clip": 1.00847125, "balance_loss_mlp": 1.00031447, "epoch": 0.6282691035892503, "flos": 36137209544640.0, "grad_norm": 2.257859651561038, "language_loss": 0.64130026, "learning_rate": 1.282561160132972e-06, "loss": 0.66640097, "num_input_tokens_seen": 112446905, "step": 5225, "time_per_iteration": 2.80578351020813 }, { "auxiliary_loss_clip": 0.0132775, "auxiliary_loss_mlp": 0.01193646, "balance_loss_clip": 1.00828028, "balance_loss_mlp": 1.00024056, "epoch": 0.6283893464798894, "flos": 26537242092480.0, "grad_norm": 1.965777252163769, "language_loss": 0.81063312, "learning_rate": 1.2818340870416186e-06, "loss": 0.83584708, "num_input_tokens_seen": 112468040, "step": 5226, "time_per_iteration": 2.8189454078674316 }, { "auxiliary_loss_clip": 0.01308237, "auxiliary_loss_mlp": 0.01193876, "balance_loss_clip": 1.00781178, "balance_loss_mlp": 1.0002799, "epoch": 0.6285095893705285, "flos": 22237227478080.0, "grad_norm": 1.7773528149079747, "language_loss": 0.75731289, "learning_rate": 1.2811071228954626e-06, "loss": 0.78233409, "num_input_tokens_seen": 112486675, "step": 5227, "time_per_iteration": 2.814932346343994 }, { "auxiliary_loss_clip": 0.01312831, "auxiliary_loss_mlp": 0.01193627, "balance_loss_clip": 1.00831449, "balance_loss_mlp": 1.00022173, "epoch": 0.6286298322611675, "flos": 26542271407680.0, "grad_norm": 2.2719827148285887, "language_loss": 0.80875206, "learning_rate": 1.2803802678047846e-06, "loss": 0.83381665, "num_input_tokens_seen": 112506825, "step": 5228, "time_per_iteration": 2.793628692626953 }, { "auxiliary_loss_clip": 0.01311374, "auxiliary_loss_mlp": 0.01193891, "balance_loss_clip": 1.00788081, "balance_loss_mlp": 1.00029504, "epoch": 0.6287500751518067, "flos": 21795237334080.0, "grad_norm": 1.7303569023031082, "language_loss": 0.74117154, "learning_rate": 1.279653521879848e-06, "loss": 0.76622415, "num_input_tokens_seen": 112526890, "step": 5229, "time_per_iteration": 2.7908945083618164 }, { "auxiliary_loss_clip": 0.01229683, "auxiliary_loss_mlp": 0.01193676, "balance_loss_clip": 1.006145, "balance_loss_mlp": 1.00017595, "epoch": 0.6288703180424458, "flos": 20009615516640.0, "grad_norm": 2.60634105155874, "language_loss": 0.84144545, "learning_rate": 1.2789268852308997e-06, "loss": 0.86567909, "num_input_tokens_seen": 112542100, "step": 5230, "time_per_iteration": 2.9349827766418457 }, { "auxiliary_loss_clip": 0.01339718, "auxiliary_loss_mlp": 0.01193755, "balance_loss_clip": 1.00857425, "balance_loss_mlp": 1.00025451, "epoch": 0.6289905609330848, "flos": 22124933758080.0, "grad_norm": 1.6656754293011038, "language_loss": 0.70527458, "learning_rate": 1.2782003579681688e-06, "loss": 0.73060942, "num_input_tokens_seen": 112561630, "step": 5231, "time_per_iteration": 3.841170310974121 }, { "auxiliary_loss_clip": 0.0135286, "auxiliary_loss_mlp": 0.01193804, "balance_loss_clip": 1.00878537, "balance_loss_mlp": 1.00020766, "epoch": 0.629110803823724, "flos": 25518493069920.0, "grad_norm": 2.395509158257499, "language_loss": 0.73849183, "learning_rate": 1.2774739402018701e-06, "loss": 0.76395845, "num_input_tokens_seen": 112582465, "step": 5232, "time_per_iteration": 2.688089370727539 }, { "auxiliary_loss_clip": 0.01328767, "auxiliary_loss_mlp": 0.01193919, "balance_loss_clip": 1.00899935, "balance_loss_mlp": 1.00022769, "epoch": 0.629231046714363, "flos": 20886626610720.0, "grad_norm": 1.5716780370908792, "language_loss": 0.72756112, "learning_rate": 1.2767476320422002e-06, "loss": 0.75278795, "num_input_tokens_seen": 112602390, "step": 5233, "time_per_iteration": 3.6803500652313232 }, { "auxiliary_loss_clip": 0.0128495, "auxiliary_loss_mlp": 0.01193138, "balance_loss_clip": 1.00538278, "balance_loss_mlp": 1.00011432, "epoch": 0.6293512896050021, "flos": 65050065456480.0, "grad_norm": 0.6750353575139554, "language_loss": 0.57173574, "learning_rate": 1.2760214335993392e-06, "loss": 0.59651661, "num_input_tokens_seen": 112669035, "step": 5234, "time_per_iteration": 3.4639132022857666 }, { "auxiliary_loss_clip": 0.01338953, "auxiliary_loss_mlp": 0.01193478, "balance_loss_clip": 1.00860095, "balance_loss_mlp": 1.00016809, "epoch": 0.6294715324956413, "flos": 34677871401600.0, "grad_norm": 1.8990369848528565, "language_loss": 0.58687973, "learning_rate": 1.2752953449834514e-06, "loss": 0.61220402, "num_input_tokens_seen": 112691485, "step": 5235, "time_per_iteration": 4.095459938049316 }, { "auxiliary_loss_clip": 0.01351707, "auxiliary_loss_mlp": 0.01193489, "balance_loss_clip": 1.0084393, "balance_loss_mlp": 1.0002749, "epoch": 0.6295917753862803, "flos": 22784218835040.0, "grad_norm": 1.5105529999427656, "language_loss": 0.80203009, "learning_rate": 1.2745693663046836e-06, "loss": 0.8274821, "num_input_tokens_seen": 112710555, "step": 5236, "time_per_iteration": 2.6794540882110596 }, { "auxiliary_loss_clip": 0.01325417, "auxiliary_loss_mlp": 0.01193623, "balance_loss_clip": 1.00727379, "balance_loss_mlp": 1.00021815, "epoch": 0.6297120182769194, "flos": 20850464674080.0, "grad_norm": 1.7171323925142703, "language_loss": 0.80272287, "learning_rate": 1.2738434976731662e-06, "loss": 0.82791328, "num_input_tokens_seen": 112728740, "step": 5237, "time_per_iteration": 2.7901268005371094 }, { "auxiliary_loss_clip": 0.0131735, "auxiliary_loss_mlp": 0.01193976, "balance_loss_clip": 1.00870752, "balance_loss_mlp": 1.00028467, "epoch": 0.6298322611675584, "flos": 19497672462240.0, "grad_norm": 1.6162717647837663, "language_loss": 0.75079799, "learning_rate": 1.2731177391990125e-06, "loss": 0.77591127, "num_input_tokens_seen": 112748665, "step": 5238, "time_per_iteration": 2.7925314903259277 }, { "auxiliary_loss_clip": 0.01315853, "auxiliary_loss_mlp": 0.01193703, "balance_loss_clip": 1.00747275, "balance_loss_mlp": 1.00020266, "epoch": 0.6299525040581976, "flos": 12604474915200.0, "grad_norm": 1.8500526119376617, "language_loss": 0.81792963, "learning_rate": 1.2723920909923203e-06, "loss": 0.84302515, "num_input_tokens_seen": 112764410, "step": 5239, "time_per_iteration": 2.8596889972686768 }, { "auxiliary_loss_clip": 0.01321997, "auxiliary_loss_mlp": 0.011931, "balance_loss_clip": 1.00551867, "balance_loss_mlp": 1.00007665, "epoch": 0.6300727469488366, "flos": 57725713507680.0, "grad_norm": 0.8567507085390835, "language_loss": 0.60479188, "learning_rate": 1.2716665531631688e-06, "loss": 0.62994277, "num_input_tokens_seen": 112818695, "step": 5240, "time_per_iteration": 3.2785634994506836 }, { "auxiliary_loss_clip": 0.01324405, "auxiliary_loss_mlp": 0.01193759, "balance_loss_clip": 1.00839138, "balance_loss_mlp": 1.00025892, "epoch": 0.6301929898394757, "flos": 22527313292160.0, "grad_norm": 1.5268908834035912, "language_loss": 0.77120495, "learning_rate": 1.270941125821623e-06, "loss": 0.79638666, "num_input_tokens_seen": 112839120, "step": 5241, "time_per_iteration": 2.9113950729370117 }, { "auxiliary_loss_clip": 0.01339055, "auxiliary_loss_mlp": 0.01193614, "balance_loss_clip": 1.00806832, "balance_loss_mlp": 1.00020897, "epoch": 0.6303132327301149, "flos": 28293563396160.0, "grad_norm": 1.531073186073975, "language_loss": 0.75037146, "learning_rate": 1.2702158090777278e-06, "loss": 0.77569818, "num_input_tokens_seen": 112860210, "step": 5242, "time_per_iteration": 2.9411494731903076 }, { "auxiliary_loss_clip": 0.01305457, "auxiliary_loss_mlp": 0.01193514, "balance_loss_clip": 1.00800347, "balance_loss_mlp": 1.0002991, "epoch": 0.6304334756207539, "flos": 25264533268800.0, "grad_norm": 1.9426693616539688, "language_loss": 0.74821204, "learning_rate": 1.2694906030415148e-06, "loss": 0.77320176, "num_input_tokens_seen": 112877955, "step": 5243, "time_per_iteration": 2.885894536972046 }, { "auxiliary_loss_clip": 0.01322842, "auxiliary_loss_mlp": 0.01194095, "balance_loss_clip": 1.00823843, "balance_loss_mlp": 1.00030804, "epoch": 0.630553718511393, "flos": 18033556469760.0, "grad_norm": 2.516501201309324, "language_loss": 0.82103479, "learning_rate": 1.2687655078229958e-06, "loss": 0.84620416, "num_input_tokens_seen": 112892285, "step": 5244, "time_per_iteration": 2.7842140197753906 }, { "auxiliary_loss_clip": 0.01303182, "auxiliary_loss_mlp": 0.01193604, "balance_loss_clip": 1.00734437, "balance_loss_mlp": 1.00019884, "epoch": 0.6306739614020321, "flos": 27304114887360.0, "grad_norm": 1.9750157865699793, "language_loss": 0.68990874, "learning_rate": 1.2680405235321678e-06, "loss": 0.71487653, "num_input_tokens_seen": 112913620, "step": 5245, "time_per_iteration": 2.768622875213623 }, { "auxiliary_loss_clip": 0.01309466, "auxiliary_loss_mlp": 0.00872641, "balance_loss_clip": 1.00806189, "balance_loss_mlp": 1.00069535, "epoch": 0.6307942042926712, "flos": 15341443426080.0, "grad_norm": 2.1704277253996382, "language_loss": 0.78835768, "learning_rate": 1.267315650279011e-06, "loss": 0.81017876, "num_input_tokens_seen": 112932090, "step": 5246, "time_per_iteration": 2.8146510124206543 }, { "auxiliary_loss_clip": 0.01277931, "auxiliary_loss_mlp": 0.01193777, "balance_loss_clip": 1.00748396, "balance_loss_mlp": 1.00027621, "epoch": 0.6309144471833102, "flos": 19606409737920.0, "grad_norm": 1.8111490808341948, "language_loss": 0.73821813, "learning_rate": 1.2665908881734874e-06, "loss": 0.76293522, "num_input_tokens_seen": 112950925, "step": 5247, "time_per_iteration": 2.821554183959961 }, { "auxiliary_loss_clip": 0.01331445, "auxiliary_loss_mlp": 0.01193613, "balance_loss_clip": 1.0080862, "balance_loss_mlp": 1.00020754, "epoch": 0.6310346900739494, "flos": 17493354688320.0, "grad_norm": 2.3778849362872987, "language_loss": 0.85113883, "learning_rate": 1.2658662373255432e-06, "loss": 0.87638938, "num_input_tokens_seen": 112969315, "step": 5248, "time_per_iteration": 2.707848072052002 }, { "auxiliary_loss_clip": 0.01288544, "auxiliary_loss_mlp": 0.01193126, "balance_loss_clip": 1.0055182, "balance_loss_mlp": 1.00010252, "epoch": 0.6311549329645885, "flos": 55070193484800.0, "grad_norm": 0.710566651066983, "language_loss": 0.52320975, "learning_rate": 1.2651416978451063e-06, "loss": 0.54802644, "num_input_tokens_seen": 113034700, "step": 5249, "time_per_iteration": 3.4041202068328857 }, { "auxiliary_loss_clip": 0.0135274, "auxiliary_loss_mlp": 0.01193937, "balance_loss_clip": 1.00916982, "balance_loss_mlp": 1.00024545, "epoch": 0.6312751758552275, "flos": 41902561556640.0, "grad_norm": 1.8103424855181651, "language_loss": 0.65133595, "learning_rate": 1.2644172698420903e-06, "loss": 0.67680275, "num_input_tokens_seen": 113056805, "step": 5250, "time_per_iteration": 2.911851406097412 }, { "auxiliary_loss_clip": 0.01291104, "auxiliary_loss_mlp": 0.01193624, "balance_loss_clip": 1.00720739, "balance_loss_mlp": 1.00021839, "epoch": 0.6313954187458667, "flos": 19646810668800.0, "grad_norm": 1.9143500057153757, "language_loss": 0.84703815, "learning_rate": 1.2636929534263892e-06, "loss": 0.87188548, "num_input_tokens_seen": 113075790, "step": 5251, "time_per_iteration": 2.801192283630371 }, { "auxiliary_loss_clip": 0.01313449, "auxiliary_loss_mlp": 0.01193517, "balance_loss_clip": 1.00795996, "balance_loss_mlp": 1.00020683, "epoch": 0.6315156616365057, "flos": 22894285286880.0, "grad_norm": 1.5754609625519664, "language_loss": 0.77280831, "learning_rate": 1.2629687487078821e-06, "loss": 0.79787791, "num_input_tokens_seen": 113094600, "step": 5252, "time_per_iteration": 2.7870726585388184 }, { "auxiliary_loss_clip": 0.01338825, "auxiliary_loss_mlp": 0.01193816, "balance_loss_clip": 1.00841451, "balance_loss_mlp": 1.00021982, "epoch": 0.6316359045271448, "flos": 23726261295360.0, "grad_norm": 2.241907837756374, "language_loss": 0.76294553, "learning_rate": 1.2622446557964293e-06, "loss": 0.7882719, "num_input_tokens_seen": 113112605, "step": 5253, "time_per_iteration": 2.7503578662872314 }, { "auxiliary_loss_clip": 0.01326789, "auxiliary_loss_mlp": 0.01193785, "balance_loss_clip": 1.00816464, "balance_loss_mlp": 1.00018859, "epoch": 0.631756147417784, "flos": 33108430883040.0, "grad_norm": 1.6334046033638654, "language_loss": 0.71362805, "learning_rate": 1.261520674801876e-06, "loss": 0.73883379, "num_input_tokens_seen": 113133200, "step": 5254, "time_per_iteration": 2.8850011825561523 }, { "auxiliary_loss_clip": 0.01303734, "auxiliary_loss_mlp": 0.01193741, "balance_loss_clip": 1.00815856, "balance_loss_mlp": 1.0002408, "epoch": 0.631876390308423, "flos": 31248437542560.0, "grad_norm": 2.587376827788761, "language_loss": 0.72210729, "learning_rate": 1.2607968058340488e-06, "loss": 0.74708211, "num_input_tokens_seen": 113152895, "step": 5255, "time_per_iteration": 2.8354015350341797 }, { "auxiliary_loss_clip": 0.01315223, "auxiliary_loss_mlp": 0.01193671, "balance_loss_clip": 1.0078311, "balance_loss_mlp": 1.00026548, "epoch": 0.6319966331990621, "flos": 24681164433120.0, "grad_norm": 1.6647828245424625, "language_loss": 0.7332108, "learning_rate": 1.2600730490027583e-06, "loss": 0.75829971, "num_input_tokens_seen": 113173135, "step": 5256, "time_per_iteration": 2.8252506256103516 }, { "auxiliary_loss_clip": 0.01299605, "auxiliary_loss_mlp": 0.01193741, "balance_loss_clip": 1.00715446, "balance_loss_mlp": 1.0002408, "epoch": 0.6321168760897012, "flos": 17491774046400.0, "grad_norm": 1.594663424591621, "language_loss": 0.80222261, "learning_rate": 1.2593494044177984e-06, "loss": 0.82715613, "num_input_tokens_seen": 113191440, "step": 5257, "time_per_iteration": 2.7590363025665283 }, { "auxiliary_loss_clip": 0.01352521, "auxiliary_loss_mlp": 0.01194013, "balance_loss_clip": 1.0082531, "balance_loss_mlp": 1.00032163, "epoch": 0.6322371189803403, "flos": 18295383556800.0, "grad_norm": 2.3535785501020707, "language_loss": 0.80505216, "learning_rate": 1.2586258721889448e-06, "loss": 0.83051747, "num_input_tokens_seen": 113208790, "step": 5258, "time_per_iteration": 3.5721230506896973 }, { "auxiliary_loss_clip": 0.01269991, "auxiliary_loss_mlp": 0.01193656, "balance_loss_clip": 1.00690007, "balance_loss_mlp": 1.00025058, "epoch": 0.6323573618709794, "flos": 20157280852320.0, "grad_norm": 1.8585234675447995, "language_loss": 0.8153497, "learning_rate": 1.2579024524259573e-06, "loss": 0.83998621, "num_input_tokens_seen": 113225050, "step": 5259, "time_per_iteration": 3.8013269901275635 }, { "auxiliary_loss_clip": 0.01327045, "auxiliary_loss_mlp": 0.01193747, "balance_loss_clip": 1.00850868, "balance_loss_mlp": 1.00024652, "epoch": 0.6324776047616185, "flos": 20042400627360.0, "grad_norm": 1.9323194814520988, "language_loss": 0.91230345, "learning_rate": 1.2571791452385768e-06, "loss": 0.93751132, "num_input_tokens_seen": 113242315, "step": 5260, "time_per_iteration": 2.832610845565796 }, { "auxiliary_loss_clip": 0.01314526, "auxiliary_loss_mlp": 0.01193529, "balance_loss_clip": 1.00744617, "balance_loss_mlp": 1.00021875, "epoch": 0.6325978476522576, "flos": 30848249352960.0, "grad_norm": 1.5264573825332757, "language_loss": 0.77340186, "learning_rate": 1.2564559507365301e-06, "loss": 0.79848242, "num_input_tokens_seen": 113264720, "step": 5261, "time_per_iteration": 3.7493865489959717 }, { "auxiliary_loss_clip": 0.01314587, "auxiliary_loss_mlp": 0.0119368, "balance_loss_clip": 1.00889099, "balance_loss_mlp": 1.00017977, "epoch": 0.6327180905428966, "flos": 24535115663040.0, "grad_norm": 2.081707968919898, "language_loss": 0.78808105, "learning_rate": 1.2557328690295244e-06, "loss": 0.81316376, "num_input_tokens_seen": 113282910, "step": 5262, "time_per_iteration": 2.7973570823669434 }, { "auxiliary_loss_clip": 0.01283872, "auxiliary_loss_mlp": 0.01193491, "balance_loss_clip": 1.00744605, "balance_loss_mlp": 1.00018108, "epoch": 0.6328383334335358, "flos": 21575284048800.0, "grad_norm": 1.8315500329946266, "language_loss": 0.76243293, "learning_rate": 1.255009900227251e-06, "loss": 0.78720653, "num_input_tokens_seen": 113301935, "step": 5263, "time_per_iteration": 2.8192708492279053 }, { "auxiliary_loss_clip": 0.01350298, "auxiliary_loss_mlp": 0.0119347, "balance_loss_clip": 1.00866747, "balance_loss_mlp": 1.00015986, "epoch": 0.6329585763241748, "flos": 22929872444640.0, "grad_norm": 1.7848413745472402, "language_loss": 0.79996169, "learning_rate": 1.254287044439383e-06, "loss": 0.82539928, "num_input_tokens_seen": 113321540, "step": 5264, "time_per_iteration": 2.7326433658599854 }, { "auxiliary_loss_clip": 0.01322137, "auxiliary_loss_mlp": 0.01193121, "balance_loss_clip": 1.00553155, "balance_loss_mlp": 1.00009727, "epoch": 0.6330788192148139, "flos": 70936930365120.0, "grad_norm": 0.77512626307403, "language_loss": 0.54463112, "learning_rate": 1.2535643017755776e-06, "loss": 0.56978369, "num_input_tokens_seen": 113383730, "step": 5265, "time_per_iteration": 3.2759108543395996 }, { "auxiliary_loss_clip": 0.01290297, "auxiliary_loss_mlp": 0.01193492, "balance_loss_clip": 1.00790405, "balance_loss_mlp": 1.00018179, "epoch": 0.6331990621054531, "flos": 21244509914400.0, "grad_norm": 2.2257727070040976, "language_loss": 0.7194277, "learning_rate": 1.2528416723454737e-06, "loss": 0.74426556, "num_input_tokens_seen": 113400400, "step": 5266, "time_per_iteration": 2.757051706314087 }, { "auxiliary_loss_clip": 0.01350384, "auxiliary_loss_mlp": 0.01193665, "balance_loss_clip": 1.00852013, "balance_loss_mlp": 1.0002594, "epoch": 0.6333193049960921, "flos": 34459426910880.0, "grad_norm": 1.3463073442568025, "language_loss": 0.70957619, "learning_rate": 1.2521191562586945e-06, "loss": 0.73501664, "num_input_tokens_seen": 113424050, "step": 5267, "time_per_iteration": 2.8042795658111572 }, { "auxiliary_loss_clip": 0.01350995, "auxiliary_loss_mlp": 0.0087251, "balance_loss_clip": 1.00841951, "balance_loss_mlp": 1.00053275, "epoch": 0.6334395478867312, "flos": 18329893004160.0, "grad_norm": 2.0558977833212673, "language_loss": 0.76683235, "learning_rate": 1.2513967536248445e-06, "loss": 0.78906739, "num_input_tokens_seen": 113440370, "step": 5268, "time_per_iteration": 2.6465113162994385 }, { "auxiliary_loss_clip": 0.01324982, "auxiliary_loss_mlp": 0.01193641, "balance_loss_clip": 1.00789309, "balance_loss_mlp": 1.00023603, "epoch": 0.6335597907773702, "flos": 23623164037440.0, "grad_norm": 1.6593667305460482, "language_loss": 0.80906355, "learning_rate": 1.2506744645535117e-06, "loss": 0.83424973, "num_input_tokens_seen": 113460800, "step": 5269, "time_per_iteration": 2.745974540710449 }, { "auxiliary_loss_clip": 0.01327331, "auxiliary_loss_mlp": 0.01193558, "balance_loss_clip": 1.00840688, "balance_loss_mlp": 1.00015283, "epoch": 0.6336800336680094, "flos": 22710925022400.0, "grad_norm": 2.0824057853889495, "language_loss": 0.60867643, "learning_rate": 1.249952289154267e-06, "loss": 0.63388532, "num_input_tokens_seen": 113480840, "step": 5270, "time_per_iteration": 2.7940783500671387 }, { "auxiliary_loss_clip": 0.01264432, "auxiliary_loss_mlp": 0.01193499, "balance_loss_clip": 1.00772715, "balance_loss_mlp": 1.0001893, "epoch": 0.6338002765586485, "flos": 23622768876960.0, "grad_norm": 1.8816194880985508, "language_loss": 0.76419848, "learning_rate": 1.2492302275366635e-06, "loss": 0.78877783, "num_input_tokens_seen": 113500515, "step": 5271, "time_per_iteration": 3.088773250579834 }, { "auxiliary_loss_clip": 0.01340545, "auxiliary_loss_mlp": 0.0119381, "balance_loss_clip": 1.00884902, "balance_loss_mlp": 1.00021386, "epoch": 0.6339205194492875, "flos": 26505462844800.0, "grad_norm": 2.024170029608802, "language_loss": 0.64759785, "learning_rate": 1.2485082798102377e-06, "loss": 0.67294145, "num_input_tokens_seen": 113520930, "step": 5272, "time_per_iteration": 2.8728580474853516 }, { "auxiliary_loss_clip": 0.01310247, "auxiliary_loss_mlp": 0.01193929, "balance_loss_clip": 1.00815225, "balance_loss_mlp": 1.00023818, "epoch": 0.6340407623399267, "flos": 18544314042720.0, "grad_norm": 1.9015398138750002, "language_loss": 0.68332392, "learning_rate": 1.2477864460845084e-06, "loss": 0.70836568, "num_input_tokens_seen": 113537330, "step": 5273, "time_per_iteration": 2.8735618591308594 }, { "auxiliary_loss_clip": 0.0131998, "auxiliary_loss_mlp": 0.01193836, "balance_loss_clip": 1.00839961, "balance_loss_mlp": 1.00023973, "epoch": 0.6341610052305657, "flos": 17712589500000.0, "grad_norm": 2.3277597442775977, "language_loss": 0.73377347, "learning_rate": 1.2470647264689776e-06, "loss": 0.75891161, "num_input_tokens_seen": 113555810, "step": 5274, "time_per_iteration": 2.84664249420166 }, { "auxiliary_loss_clip": 0.01264933, "auxiliary_loss_mlp": 0.01193665, "balance_loss_clip": 1.00675106, "balance_loss_mlp": 1.00016475, "epoch": 0.6342812481212048, "flos": 23587038024480.0, "grad_norm": 1.9196445094183383, "language_loss": 0.71411961, "learning_rate": 1.2463431210731282e-06, "loss": 0.73870564, "num_input_tokens_seen": 113575395, "step": 5275, "time_per_iteration": 2.9451751708984375 }, { "auxiliary_loss_clip": 0.01282897, "auxiliary_loss_mlp": 0.01193745, "balance_loss_clip": 1.00826883, "balance_loss_mlp": 1.00024474, "epoch": 0.634401491011844, "flos": 17821937478240.0, "grad_norm": 2.2837745215583864, "language_loss": 0.76375031, "learning_rate": 1.2456216300064289e-06, "loss": 0.78851676, "num_input_tokens_seen": 113592945, "step": 5276, "time_per_iteration": 2.8665997982025146 }, { "auxiliary_loss_clip": 0.01328819, "auxiliary_loss_mlp": 0.01193651, "balance_loss_clip": 1.00904751, "balance_loss_mlp": 1.00024605, "epoch": 0.634521733902483, "flos": 21358168734240.0, "grad_norm": 1.5461257631593923, "language_loss": 0.78162777, "learning_rate": 1.244900253378328e-06, "loss": 0.80685246, "num_input_tokens_seen": 113613000, "step": 5277, "time_per_iteration": 2.7511565685272217 }, { "auxiliary_loss_clip": 0.01196379, "auxiliary_loss_mlp": 0.01193718, "balance_loss_clip": 1.00586271, "balance_loss_mlp": 1.0002178, "epoch": 0.6346419767931221, "flos": 16545061507680.0, "grad_norm": 4.19578730650931, "language_loss": 0.69485509, "learning_rate": 1.2441789912982583e-06, "loss": 0.71875608, "num_input_tokens_seen": 113630085, "step": 5278, "time_per_iteration": 2.88854718208313 }, { "auxiliary_loss_clip": 0.01330119, "auxiliary_loss_mlp": 0.01193783, "balance_loss_clip": 1.00802875, "balance_loss_mlp": 1.000283, "epoch": 0.6347622196837612, "flos": 24351001001280.0, "grad_norm": 1.7244040981769615, "language_loss": 0.64583635, "learning_rate": 1.2434578438756346e-06, "loss": 0.6710754, "num_input_tokens_seen": 113650515, "step": 5279, "time_per_iteration": 2.7465133666992188 }, { "auxiliary_loss_clip": 0.01337446, "auxiliary_loss_mlp": 0.01193547, "balance_loss_clip": 1.00789213, "balance_loss_mlp": 1.0002377, "epoch": 0.6348824625744003, "flos": 64523200865760.0, "grad_norm": 1.889492460110651, "language_loss": 0.77873683, "learning_rate": 1.242736811219855e-06, "loss": 0.80404675, "num_input_tokens_seen": 113676475, "step": 5280, "time_per_iteration": 3.1123082637786865 }, { "auxiliary_loss_clip": 0.01338743, "auxiliary_loss_mlp": 0.01193563, "balance_loss_clip": 1.00829244, "balance_loss_mlp": 1.00025308, "epoch": 0.6350027054650393, "flos": 28622146186080.0, "grad_norm": 1.6685105911435008, "language_loss": 0.8170743, "learning_rate": 1.2420158934402988e-06, "loss": 0.84239739, "num_input_tokens_seen": 113697090, "step": 5281, "time_per_iteration": 2.764974594116211 }, { "auxiliary_loss_clip": 0.0130554, "auxiliary_loss_mlp": 0.01193691, "balance_loss_clip": 1.00776827, "balance_loss_mlp": 1.00019062, "epoch": 0.6351229483556785, "flos": 23002555554720.0, "grad_norm": 1.808749285509652, "language_loss": 0.84553188, "learning_rate": 1.2412950906463286e-06, "loss": 0.87052417, "num_input_tokens_seen": 113714395, "step": 5282, "time_per_iteration": 2.8061363697052 }, { "auxiliary_loss_clip": 0.01271766, "auxiliary_loss_mlp": 0.01193452, "balance_loss_clip": 1.00650024, "balance_loss_mlp": 1.00023735, "epoch": 0.6352431912463176, "flos": 21939310301760.0, "grad_norm": 1.7128073228251426, "language_loss": 0.89920151, "learning_rate": 1.2405744029472902e-06, "loss": 0.92385375, "num_input_tokens_seen": 113733880, "step": 5283, "time_per_iteration": 2.8746731281280518 }, { "auxiliary_loss_clip": 0.01315744, "auxiliary_loss_mlp": 0.011937, "balance_loss_clip": 1.00808048, "balance_loss_mlp": 1.00019908, "epoch": 0.6353634341369566, "flos": 13735266192000.0, "grad_norm": 1.7142386996866303, "language_loss": 0.76334453, "learning_rate": 1.2398538304525108e-06, "loss": 0.78843898, "num_input_tokens_seen": 113752505, "step": 5284, "time_per_iteration": 3.6881039142608643 }, { "auxiliary_loss_clip": 0.01297817, "auxiliary_loss_mlp": 0.01193899, "balance_loss_clip": 1.00812161, "balance_loss_mlp": 1.00020766, "epoch": 0.6354836770275958, "flos": 19316180229120.0, "grad_norm": 2.8205872461675083, "language_loss": 0.75439882, "learning_rate": 1.2391333732713016e-06, "loss": 0.77931601, "num_input_tokens_seen": 113770310, "step": 5285, "time_per_iteration": 4.096388339996338 }, { "auxiliary_loss_clip": 0.01304557, "auxiliary_loss_mlp": 0.01193877, "balance_loss_clip": 1.0085988, "balance_loss_mlp": 1.00028133, "epoch": 0.6356039199182348, "flos": 21613385864160.0, "grad_norm": 1.7937603773296975, "language_loss": 0.78792351, "learning_rate": 1.2384130315129543e-06, "loss": 0.81290781, "num_input_tokens_seen": 113788635, "step": 5286, "time_per_iteration": 2.851036787033081 }, { "auxiliary_loss_clip": 0.01207193, "auxiliary_loss_mlp": 0.0119372, "balance_loss_clip": 1.00639105, "balance_loss_mlp": 1.0002197, "epoch": 0.6357241628088739, "flos": 18111987368640.0, "grad_norm": 2.168029876381796, "language_loss": 0.73754013, "learning_rate": 1.2376928052867447e-06, "loss": 0.76154923, "num_input_tokens_seen": 113807755, "step": 5287, "time_per_iteration": 4.232834339141846 }, { "auxiliary_loss_clip": 0.01308909, "auxiliary_loss_mlp": 0.01193667, "balance_loss_clip": 1.00812292, "balance_loss_mlp": 1.00026155, "epoch": 0.6358444056995131, "flos": 24935267928960.0, "grad_norm": 1.8490728211183374, "language_loss": 0.77550685, "learning_rate": 1.2369726947019299e-06, "loss": 0.80053264, "num_input_tokens_seen": 113828230, "step": 5288, "time_per_iteration": 3.1504883766174316 }, { "auxiliary_loss_clip": 0.01336385, "auxiliary_loss_mlp": 0.01193404, "balance_loss_clip": 1.00781429, "balance_loss_mlp": 1.0001893, "epoch": 0.6359646485901521, "flos": 23293359842400.0, "grad_norm": 2.0431448942368546, "language_loss": 0.66751111, "learning_rate": 1.2362526998677511e-06, "loss": 0.69280899, "num_input_tokens_seen": 113844595, "step": 5289, "time_per_iteration": 2.793808937072754 }, { "auxiliary_loss_clip": 0.01321512, "auxiliary_loss_mlp": 0.01193759, "balance_loss_clip": 1.007918, "balance_loss_mlp": 1.00025845, "epoch": 0.6360848914807912, "flos": 20887452855360.0, "grad_norm": 1.667107867780263, "language_loss": 0.8416394, "learning_rate": 1.2355328208934301e-06, "loss": 0.86679214, "num_input_tokens_seen": 113863470, "step": 5290, "time_per_iteration": 2.809208869934082 }, { "auxiliary_loss_clip": 0.01338441, "auxiliary_loss_mlp": 0.00872494, "balance_loss_clip": 1.00848699, "balance_loss_mlp": 1.00052047, "epoch": 0.6362051343714303, "flos": 18479785608000.0, "grad_norm": 2.1245713763124083, "language_loss": 0.72469401, "learning_rate": 1.2348130578881728e-06, "loss": 0.7468034, "num_input_tokens_seen": 113881690, "step": 5291, "time_per_iteration": 2.7633657455444336 }, { "auxiliary_loss_clip": 0.01352744, "auxiliary_loss_mlp": 0.01193759, "balance_loss_clip": 1.0090735, "balance_loss_mlp": 1.00025868, "epoch": 0.6363253772620694, "flos": 24389605748160.0, "grad_norm": 2.2301334724333732, "language_loss": 0.76198602, "learning_rate": 1.2340934109611664e-06, "loss": 0.78745103, "num_input_tokens_seen": 113902450, "step": 5292, "time_per_iteration": 2.8321359157562256 }, { "auxiliary_loss_clip": 0.01316914, "auxiliary_loss_mlp": 0.01193806, "balance_loss_clip": 1.00892842, "balance_loss_mlp": 1.00020957, "epoch": 0.6364456201527084, "flos": 25958255945760.0, "grad_norm": 3.0371018684365794, "language_loss": 0.68733555, "learning_rate": 1.2333738802215798e-06, "loss": 0.71244276, "num_input_tokens_seen": 113922670, "step": 5293, "time_per_iteration": 2.795189380645752 }, { "auxiliary_loss_clip": 0.01290221, "auxiliary_loss_mlp": 0.01193629, "balance_loss_clip": 1.00791728, "balance_loss_mlp": 1.00022364, "epoch": 0.6365658630433476, "flos": 20740721535360.0, "grad_norm": 2.280354926387515, "language_loss": 0.81036258, "learning_rate": 1.2326544657785668e-06, "loss": 0.83520114, "num_input_tokens_seen": 113942360, "step": 5294, "time_per_iteration": 2.8768551349639893 }, { "auxiliary_loss_clip": 0.01315308, "auxiliary_loss_mlp": 0.01193736, "balance_loss_clip": 1.00935996, "balance_loss_mlp": 1.00033128, "epoch": 0.6366861059339867, "flos": 21434156822880.0, "grad_norm": 2.350597798260692, "language_loss": 0.7441982, "learning_rate": 1.2319351677412608e-06, "loss": 0.76928872, "num_input_tokens_seen": 113959405, "step": 5295, "time_per_iteration": 2.7681384086608887 }, { "auxiliary_loss_clip": 0.01302314, "auxiliary_loss_mlp": 0.01193596, "balance_loss_clip": 1.00768554, "balance_loss_mlp": 1.00019073, "epoch": 0.6368063488246257, "flos": 22267102770720.0, "grad_norm": 2.0161563284776416, "language_loss": 0.73971498, "learning_rate": 1.2312159862187796e-06, "loss": 0.76467413, "num_input_tokens_seen": 113977815, "step": 5296, "time_per_iteration": 2.8358888626098633 }, { "auxiliary_loss_clip": 0.0135337, "auxiliary_loss_mlp": 0.01193847, "balance_loss_clip": 1.00929666, "balance_loss_mlp": 1.00025082, "epoch": 0.6369265917152649, "flos": 22420731437280.0, "grad_norm": 1.485753368604954, "language_loss": 0.76090801, "learning_rate": 1.2304969213202217e-06, "loss": 0.78638017, "num_input_tokens_seen": 113999075, "step": 5297, "time_per_iteration": 2.6846508979797363 }, { "auxiliary_loss_clip": 0.01319386, "auxiliary_loss_mlp": 0.011936, "balance_loss_clip": 1.00787115, "balance_loss_mlp": 1.00019455, "epoch": 0.6370468346059039, "flos": 24718188538080.0, "grad_norm": 2.5068413829880836, "language_loss": 0.79070544, "learning_rate": 1.2297779731546692e-06, "loss": 0.81583536, "num_input_tokens_seen": 114018170, "step": 5298, "time_per_iteration": 2.891202211380005 }, { "auxiliary_loss_clip": 0.01303829, "auxiliary_loss_mlp": 0.01193523, "balance_loss_clip": 1.00744891, "balance_loss_mlp": 1.00021374, "epoch": 0.637167077496543, "flos": 25296599905920.0, "grad_norm": 2.2936508688722763, "language_loss": 0.77656865, "learning_rate": 1.2290591418311853e-06, "loss": 0.80154216, "num_input_tokens_seen": 114035565, "step": 5299, "time_per_iteration": 3.611515522003174 }, { "auxiliary_loss_clip": 0.01328593, "auxiliary_loss_mlp": 0.01193919, "balance_loss_clip": 1.00885201, "balance_loss_mlp": 1.00022817, "epoch": 0.637287320387182, "flos": 27671122805760.0, "grad_norm": 1.532242997497762, "language_loss": 0.72293508, "learning_rate": 1.2283404274588172e-06, "loss": 0.74816024, "num_input_tokens_seen": 114054510, "step": 5300, "time_per_iteration": 2.78005051612854 }, { "auxiliary_loss_clip": 0.01205554, "auxiliary_loss_mlp": 0.01193071, "balance_loss_clip": 1.0035696, "balance_loss_mlp": 1.00004697, "epoch": 0.6374075632778212, "flos": 63173439452160.0, "grad_norm": 0.734822511362718, "language_loss": 0.52817893, "learning_rate": 1.227621830146592e-06, "loss": 0.55216515, "num_input_tokens_seen": 114109875, "step": 5301, "time_per_iteration": 3.4123289585113525 }, { "auxiliary_loss_clip": 0.01298356, "auxiliary_loss_mlp": 0.01193845, "balance_loss_clip": 1.00819111, "balance_loss_mlp": 1.00024915, "epoch": 0.6375278061684603, "flos": 25558139603520.0, "grad_norm": 1.7620834044589821, "language_loss": 0.79190254, "learning_rate": 1.2269033500035217e-06, "loss": 0.81682456, "num_input_tokens_seen": 114130010, "step": 5302, "time_per_iteration": 2.821378469467163 }, { "auxiliary_loss_clip": 0.01276341, "auxiliary_loss_mlp": 0.01193445, "balance_loss_clip": 1.00742722, "balance_loss_mlp": 1.00023043, "epoch": 0.6376480490590993, "flos": 25666373947680.0, "grad_norm": 1.7750957860468266, "language_loss": 0.7365936, "learning_rate": 1.2261849871385988e-06, "loss": 0.76129138, "num_input_tokens_seen": 114151115, "step": 5303, "time_per_iteration": 2.9614646434783936 }, { "auxiliary_loss_clip": 0.01351944, "auxiliary_loss_mlp": 0.01193814, "balance_loss_clip": 1.00858951, "balance_loss_mlp": 1.00021863, "epoch": 0.6377682919497385, "flos": 31537697112000.0, "grad_norm": 1.8828371648820934, "language_loss": 0.62467802, "learning_rate": 1.2254667416607972e-06, "loss": 0.65013552, "num_input_tokens_seen": 114172715, "step": 5304, "time_per_iteration": 2.7901015281677246 }, { "auxiliary_loss_clip": 0.01326524, "auxiliary_loss_mlp": 0.0119379, "balance_loss_clip": 1.00759661, "balance_loss_mlp": 1.00019419, "epoch": 0.6378885348403776, "flos": 23039220422880.0, "grad_norm": 2.0165090972066895, "language_loss": 0.83131731, "learning_rate": 1.2247486136790756e-06, "loss": 0.85652041, "num_input_tokens_seen": 114192195, "step": 5305, "time_per_iteration": 2.735191822052002 }, { "auxiliary_loss_clip": 0.01334377, "auxiliary_loss_mlp": 0.0119353, "balance_loss_clip": 1.00827384, "balance_loss_mlp": 1.00022054, "epoch": 0.6380087777310166, "flos": 18697080540960.0, "grad_norm": 2.9734059871541287, "language_loss": 0.80992007, "learning_rate": 1.2240306033023726e-06, "loss": 0.83519912, "num_input_tokens_seen": 114210020, "step": 5306, "time_per_iteration": 2.760603904724121 }, { "auxiliary_loss_clip": 0.01315337, "auxiliary_loss_mlp": 0.01193702, "balance_loss_clip": 1.00846291, "balance_loss_mlp": 1.00029695, "epoch": 0.6381290206216558, "flos": 23331569428800.0, "grad_norm": 1.5781896733527692, "language_loss": 0.71714652, "learning_rate": 1.223312710639611e-06, "loss": 0.74223697, "num_input_tokens_seen": 114228740, "step": 5307, "time_per_iteration": 2.8258118629455566 }, { "auxiliary_loss_clip": 0.01317223, "auxiliary_loss_mlp": 0.01193714, "balance_loss_clip": 1.00871825, "balance_loss_mlp": 1.00021386, "epoch": 0.6382492635122948, "flos": 18880476729120.0, "grad_norm": 1.9844979662934388, "language_loss": 0.86875266, "learning_rate": 1.2225949357996928e-06, "loss": 0.89386207, "num_input_tokens_seen": 114246865, "step": 5308, "time_per_iteration": 2.9564034938812256 }, { "auxiliary_loss_clip": 0.01326337, "auxiliary_loss_mlp": 0.01193718, "balance_loss_clip": 1.00859928, "balance_loss_mlp": 1.00021756, "epoch": 0.6383695064029339, "flos": 27819147378240.0, "grad_norm": 1.468738696265725, "language_loss": 0.80296302, "learning_rate": 1.221877278891505e-06, "loss": 0.82816362, "num_input_tokens_seen": 114266120, "step": 5309, "time_per_iteration": 4.107955694198608 }, { "auxiliary_loss_clip": 0.01324165, "auxiliary_loss_mlp": 0.01193865, "balance_loss_clip": 1.00907922, "balance_loss_mlp": 1.00026929, "epoch": 0.638489749293573, "flos": 26395647858720.0, "grad_norm": 2.1393652135118764, "language_loss": 0.7162509, "learning_rate": 1.221159740023915e-06, "loss": 0.74143124, "num_input_tokens_seen": 114285950, "step": 5310, "time_per_iteration": 3.808661699295044 }, { "auxiliary_loss_clip": 0.01305609, "auxiliary_loss_mlp": 0.00872624, "balance_loss_clip": 1.0085994, "balance_loss_mlp": 1.00050044, "epoch": 0.6386099921842121, "flos": 23988339848160.0, "grad_norm": 2.711468036877022, "language_loss": 0.72683865, "learning_rate": 1.2204423193057735e-06, "loss": 0.74862099, "num_input_tokens_seen": 114304780, "step": 5311, "time_per_iteration": 3.8361449241638184 }, { "auxiliary_loss_clip": 0.01291298, "auxiliary_loss_mlp": 0.01193081, "balance_loss_clip": 1.00569642, "balance_loss_mlp": 1.0000577, "epoch": 0.6387302350748512, "flos": 71731199718720.0, "grad_norm": 0.8548664899591584, "language_loss": 0.63363862, "learning_rate": 1.2197250168459122e-06, "loss": 0.65848243, "num_input_tokens_seen": 114361180, "step": 5312, "time_per_iteration": 3.4218974113464355 }, { "auxiliary_loss_clip": 0.01339001, "auxiliary_loss_mlp": 0.01193318, "balance_loss_clip": 1.0085876, "balance_loss_mlp": 1.00019884, "epoch": 0.6388504779654903, "flos": 14535786265920.0, "grad_norm": 1.8238342164876715, "language_loss": 0.74246156, "learning_rate": 1.2190078327531454e-06, "loss": 0.76778471, "num_input_tokens_seen": 114377425, "step": 5313, "time_per_iteration": 2.7232954502105713 }, { "auxiliary_loss_clip": 0.01338449, "auxiliary_loss_mlp": 0.01193333, "balance_loss_clip": 1.00832927, "balance_loss_mlp": 1.0002141, "epoch": 0.6389707208561294, "flos": 22346144372160.0, "grad_norm": 1.484476130918111, "language_loss": 0.72442293, "learning_rate": 1.2182907671362697e-06, "loss": 0.74974066, "num_input_tokens_seen": 114398120, "step": 5314, "time_per_iteration": 3.706153154373169 }, { "auxiliary_loss_clip": 0.01329133, "auxiliary_loss_mlp": 0.01193763, "balance_loss_clip": 1.00811076, "balance_loss_mlp": 1.00016713, "epoch": 0.6390909637467684, "flos": 19426893307200.0, "grad_norm": 1.8409017935819838, "language_loss": 0.78424835, "learning_rate": 1.2175738201040626e-06, "loss": 0.80947727, "num_input_tokens_seen": 114415160, "step": 5315, "time_per_iteration": 2.811811685562134 }, { "auxiliary_loss_clip": 0.01327899, "auxiliary_loss_mlp": 0.01193703, "balance_loss_clip": 1.00764275, "balance_loss_mlp": 1.0002023, "epoch": 0.6392112066374076, "flos": 24090862327200.0, "grad_norm": 2.0030706566874925, "language_loss": 0.78496182, "learning_rate": 1.2168569917652855e-06, "loss": 0.8101778, "num_input_tokens_seen": 114435015, "step": 5316, "time_per_iteration": 2.844532012939453 }, { "auxiliary_loss_clip": 0.01329889, "auxiliary_loss_mlp": 0.01193574, "balance_loss_clip": 1.00807858, "balance_loss_mlp": 1.00016856, "epoch": 0.6393314495280467, "flos": 26795153498400.0, "grad_norm": 1.5045777348878293, "language_loss": 0.63795185, "learning_rate": 1.2161402822286797e-06, "loss": 0.66318643, "num_input_tokens_seen": 114455700, "step": 5317, "time_per_iteration": 2.874868154525757 }, { "auxiliary_loss_clip": 0.01296196, "auxiliary_loss_mlp": 0.01193672, "balance_loss_clip": 1.0078944, "balance_loss_mlp": 1.00026703, "epoch": 0.6394516924186857, "flos": 20260701423360.0, "grad_norm": 1.8313594979458907, "language_loss": 0.78841734, "learning_rate": 1.2154236916029703e-06, "loss": 0.81331599, "num_input_tokens_seen": 114473675, "step": 5318, "time_per_iteration": 2.768815517425537 }, { "auxiliary_loss_clip": 0.01287459, "auxiliary_loss_mlp": 0.01193699, "balance_loss_clip": 1.00730109, "balance_loss_mlp": 1.00019896, "epoch": 0.6395719353093249, "flos": 18368856987840.0, "grad_norm": 2.492157394743118, "language_loss": 0.73148447, "learning_rate": 1.2147072199968627e-06, "loss": 0.75629616, "num_input_tokens_seen": 114492310, "step": 5319, "time_per_iteration": 2.9214303493499756 }, { "auxiliary_loss_clip": 0.01327973, "auxiliary_loss_mlp": 0.01193188, "balance_loss_clip": 1.00773072, "balance_loss_mlp": 1.00016439, "epoch": 0.6396921781999639, "flos": 17566325187840.0, "grad_norm": 1.5778222687932841, "language_loss": 0.71141118, "learning_rate": 1.2139908675190454e-06, "loss": 0.73662275, "num_input_tokens_seen": 114511520, "step": 5320, "time_per_iteration": 2.745939016342163 }, { "auxiliary_loss_clip": 0.01267577, "auxiliary_loss_mlp": 0.01193482, "balance_loss_clip": 1.00744689, "balance_loss_mlp": 1.00017262, "epoch": 0.639812421090603, "flos": 21251263566240.0, "grad_norm": 2.199902082831215, "language_loss": 0.75006235, "learning_rate": 1.2132746342781883e-06, "loss": 0.77467299, "num_input_tokens_seen": 114532680, "step": 5321, "time_per_iteration": 2.9126203060150146 }, { "auxiliary_loss_clip": 0.01352324, "auxiliary_loss_mlp": 0.01193646, "balance_loss_clip": 1.008793, "balance_loss_mlp": 1.00024056, "epoch": 0.6399326639812422, "flos": 11180975395680.0, "grad_norm": 2.655282058902409, "language_loss": 0.80067503, "learning_rate": 1.2125585203829442e-06, "loss": 0.82613462, "num_input_tokens_seen": 114548320, "step": 5322, "time_per_iteration": 2.815253734588623 }, { "auxiliary_loss_clip": 0.01289382, "auxiliary_loss_mlp": 0.0119371, "balance_loss_clip": 1.00776529, "balance_loss_mlp": 1.00030494, "epoch": 0.6400529068718812, "flos": 23911058507040.0, "grad_norm": 1.8138068667946108, "language_loss": 0.73854089, "learning_rate": 1.211842525941946e-06, "loss": 0.76337183, "num_input_tokens_seen": 114568115, "step": 5323, "time_per_iteration": 2.855163812637329 }, { "auxiliary_loss_clip": 0.01261678, "auxiliary_loss_mlp": 0.01193491, "balance_loss_clip": 1.00595486, "balance_loss_mlp": 1.00018144, "epoch": 0.6401731497625203, "flos": 44018741966400.0, "grad_norm": 3.022511697075766, "language_loss": 0.78873873, "learning_rate": 1.2111266510638105e-06, "loss": 0.81329048, "num_input_tokens_seen": 114591040, "step": 5324, "time_per_iteration": 3.0923900604248047 }, { "auxiliary_loss_clip": 0.01259228, "auxiliary_loss_mlp": 0.01193914, "balance_loss_clip": 1.00753784, "balance_loss_mlp": 1.00022256, "epoch": 0.6402933926531594, "flos": 20662218789120.0, "grad_norm": 1.5400120805612467, "language_loss": 0.79934311, "learning_rate": 1.2104108958571346e-06, "loss": 0.82387447, "num_input_tokens_seen": 114609310, "step": 5325, "time_per_iteration": 2.8508479595184326 }, { "auxiliary_loss_clip": 0.01328484, "auxiliary_loss_mlp": 0.01193668, "balance_loss_clip": 1.00788951, "balance_loss_mlp": 1.00026321, "epoch": 0.6404136355437985, "flos": 24863339216160.0, "grad_norm": 1.6226931115494116, "language_loss": 0.75874305, "learning_rate": 1.2096952604304975e-06, "loss": 0.78396457, "num_input_tokens_seen": 114629740, "step": 5326, "time_per_iteration": 2.823415756225586 }, { "auxiliary_loss_clip": 0.01330724, "auxiliary_loss_mlp": 0.01193805, "balance_loss_clip": 1.00796509, "balance_loss_mlp": 1.00020921, "epoch": 0.6405338784344375, "flos": 40479564968640.0, "grad_norm": 1.8576780590687876, "language_loss": 0.70490402, "learning_rate": 1.2089797448924616e-06, "loss": 0.73014933, "num_input_tokens_seen": 114653615, "step": 5327, "time_per_iteration": 2.9680287837982178 }, { "auxiliary_loss_clip": 0.0128885, "auxiliary_loss_mlp": 0.01193688, "balance_loss_clip": 1.00908947, "balance_loss_mlp": 1.00018764, "epoch": 0.6406541213250767, "flos": 20886051831840.0, "grad_norm": 1.9626961768797306, "language_loss": 0.66158223, "learning_rate": 1.2082643493515692e-06, "loss": 0.68640757, "num_input_tokens_seen": 114671935, "step": 5328, "time_per_iteration": 2.9263083934783936 }, { "auxiliary_loss_clip": 0.01329789, "auxiliary_loss_mlp": 0.01193705, "balance_loss_clip": 1.00795889, "balance_loss_mlp": 1.00030041, "epoch": 0.6407743642157158, "flos": 23295982271040.0, "grad_norm": 1.6752105716056833, "language_loss": 0.81548011, "learning_rate": 1.207549073916346e-06, "loss": 0.84071505, "num_input_tokens_seen": 114692870, "step": 5329, "time_per_iteration": 2.812654733657837 }, { "auxiliary_loss_clip": 0.01316766, "auxiliary_loss_mlp": 0.01193588, "balance_loss_clip": 1.00861597, "balance_loss_mlp": 1.00037384, "epoch": 0.6408946071063548, "flos": 15012645094080.0, "grad_norm": 2.6952967391558857, "language_loss": 0.7792303, "learning_rate": 1.2068339186952976e-06, "loss": 0.80433381, "num_input_tokens_seen": 114710410, "step": 5330, "time_per_iteration": 2.79609751701355 }, { "auxiliary_loss_clip": 0.01333143, "auxiliary_loss_mlp": 0.01193903, "balance_loss_clip": 1.00849676, "balance_loss_mlp": 1.00030661, "epoch": 0.6410148499969939, "flos": 22528606544640.0, "grad_norm": 2.866860871498619, "language_loss": 0.73119485, "learning_rate": 1.2061188837969136e-06, "loss": 0.75646532, "num_input_tokens_seen": 114730020, "step": 5331, "time_per_iteration": 2.709047317504883 }, { "auxiliary_loss_clip": 0.01306426, "auxiliary_loss_mlp": 0.0119372, "balance_loss_clip": 1.00801635, "balance_loss_mlp": 1.00021887, "epoch": 0.641135092887633, "flos": 12422012742720.0, "grad_norm": 2.703983058144261, "language_loss": 0.83791518, "learning_rate": 1.2054039693296631e-06, "loss": 0.86291671, "num_input_tokens_seen": 114748015, "step": 5332, "time_per_iteration": 2.716811418533325 }, { "auxiliary_loss_clip": 0.01303303, "auxiliary_loss_mlp": 0.01193693, "balance_loss_clip": 1.0086509, "balance_loss_mlp": 1.00028777, "epoch": 0.6412553357782721, "flos": 22127340644640.0, "grad_norm": 1.829593754130177, "language_loss": 0.81527025, "learning_rate": 1.2046891754019992e-06, "loss": 0.84024024, "num_input_tokens_seen": 114768625, "step": 5333, "time_per_iteration": 2.8545172214508057 }, { "auxiliary_loss_clip": 0.01338586, "auxiliary_loss_mlp": 0.01193717, "balance_loss_clip": 1.00866735, "balance_loss_mlp": 1.00031233, "epoch": 0.6413755786689112, "flos": 15888614401440.0, "grad_norm": 1.8365912331344674, "language_loss": 0.82501578, "learning_rate": 1.2039745021223548e-06, "loss": 0.85033882, "num_input_tokens_seen": 114786045, "step": 5334, "time_per_iteration": 2.6970160007476807 }, { "auxiliary_loss_clip": 0.01246912, "auxiliary_loss_mlp": 0.01193039, "balance_loss_clip": 1.00540853, "balance_loss_mlp": 1.00001526, "epoch": 0.6414958215595503, "flos": 68039197584480.0, "grad_norm": 0.7874360657816418, "language_loss": 0.5707109, "learning_rate": 1.2032599495991456e-06, "loss": 0.59511042, "num_input_tokens_seen": 114850785, "step": 5335, "time_per_iteration": 4.340535640716553 }, { "auxiliary_loss_clip": 0.0133014, "auxiliary_loss_mlp": 0.01193823, "balance_loss_clip": 1.00869548, "balance_loss_mlp": 1.00022662, "epoch": 0.6416160644501894, "flos": 44091317305440.0, "grad_norm": 1.9265358872385665, "language_loss": 0.69239575, "learning_rate": 1.2025455179407685e-06, "loss": 0.71763539, "num_input_tokens_seen": 114871945, "step": 5336, "time_per_iteration": 2.8879590034484863 }, { "auxiliary_loss_clip": 0.01326514, "auxiliary_loss_mlp": 0.00872605, "balance_loss_clip": 1.00836444, "balance_loss_mlp": 1.00068438, "epoch": 0.6417363073408284, "flos": 20959848576000.0, "grad_norm": 1.9654605614707108, "language_loss": 0.73419368, "learning_rate": 1.2018312072556022e-06, "loss": 0.75618482, "num_input_tokens_seen": 114890445, "step": 5337, "time_per_iteration": 3.720912456512451 }, { "auxiliary_loss_clip": 0.01350817, "auxiliary_loss_mlp": 0.00872541, "balance_loss_clip": 1.00834537, "balance_loss_mlp": 1.00059247, "epoch": 0.6418565502314676, "flos": 22455133113600.0, "grad_norm": 1.7990590885435025, "language_loss": 0.74017799, "learning_rate": 1.2011170176520077e-06, "loss": 0.76241159, "num_input_tokens_seen": 114911360, "step": 5338, "time_per_iteration": 3.687641143798828 }, { "auxiliary_loss_clip": 0.01260362, "auxiliary_loss_mlp": 0.01193406, "balance_loss_clip": 1.00709963, "balance_loss_mlp": 1.00019169, "epoch": 0.6419767931221066, "flos": 25045513999200.0, "grad_norm": 5.817002492200253, "language_loss": 0.81373668, "learning_rate": 1.2004029492383256e-06, "loss": 0.83827436, "num_input_tokens_seen": 114932700, "step": 5339, "time_per_iteration": 3.818786859512329 }, { "auxiliary_loss_clip": 0.01327817, "auxiliary_loss_mlp": 0.01193781, "balance_loss_clip": 1.00794125, "balance_loss_mlp": 1.00028086, "epoch": 0.6420970360127457, "flos": 19463701870080.0, "grad_norm": 1.865015669427246, "language_loss": 0.73194182, "learning_rate": 1.1996890021228814e-06, "loss": 0.7571578, "num_input_tokens_seen": 114949475, "step": 5340, "time_per_iteration": 2.7304136753082275 }, { "auxiliary_loss_clip": 0.01325771, "auxiliary_loss_mlp": 0.01193519, "balance_loss_clip": 1.00832438, "balance_loss_mlp": 1.00020921, "epoch": 0.6422172789033849, "flos": 40406163384960.0, "grad_norm": 1.4119582155840813, "language_loss": 0.69672632, "learning_rate": 1.1989751764139785e-06, "loss": 0.7219193, "num_input_tokens_seen": 114973125, "step": 5341, "time_per_iteration": 2.897339344024658 }, { "auxiliary_loss_clip": 0.01293906, "auxiliary_loss_mlp": 0.01193829, "balance_loss_clip": 1.00742829, "balance_loss_mlp": 1.00023365, "epoch": 0.6423375217940239, "flos": 27672883066080.0, "grad_norm": 1.5776604397773917, "language_loss": 0.83028734, "learning_rate": 1.1982614722199044e-06, "loss": 0.85516465, "num_input_tokens_seen": 114994300, "step": 5342, "time_per_iteration": 2.9170985221862793 }, { "auxiliary_loss_clip": 0.01325598, "auxiliary_loss_mlp": 0.0119345, "balance_loss_clip": 1.00808144, "balance_loss_mlp": 1.00023532, "epoch": 0.642457764684663, "flos": 18369252148320.0, "grad_norm": 1.8222291389458578, "language_loss": 0.77591527, "learning_rate": 1.1975478896489276e-06, "loss": 0.80110574, "num_input_tokens_seen": 115012135, "step": 5343, "time_per_iteration": 2.7109506130218506 }, { "auxiliary_loss_clip": 0.01350641, "auxiliary_loss_mlp": 0.01193688, "balance_loss_clip": 1.00850153, "balance_loss_mlp": 1.00018692, "epoch": 0.6425780075753021, "flos": 19750518629280.0, "grad_norm": 1.819370605686891, "language_loss": 0.76698601, "learning_rate": 1.1968344288092981e-06, "loss": 0.79242927, "num_input_tokens_seen": 115028715, "step": 5344, "time_per_iteration": 2.7477102279663086 }, { "auxiliary_loss_clip": 0.01329994, "auxiliary_loss_mlp": 0.00872464, "balance_loss_clip": 1.00791216, "balance_loss_mlp": 1.00059319, "epoch": 0.6426982504659412, "flos": 20558546752320.0, "grad_norm": 1.6626040896755705, "language_loss": 0.64693016, "learning_rate": 1.1961210898092468e-06, "loss": 0.66895473, "num_input_tokens_seen": 115047665, "step": 5345, "time_per_iteration": 2.787113904953003 }, { "auxiliary_loss_clip": 0.01316619, "auxiliary_loss_mlp": 0.01193679, "balance_loss_clip": 1.00751114, "balance_loss_mlp": 1.0002743, "epoch": 0.6428184933565803, "flos": 17851992389280.0, "grad_norm": 2.2712617710660656, "language_loss": 0.79359645, "learning_rate": 1.1954078727569874e-06, "loss": 0.81869948, "num_input_tokens_seen": 115064965, "step": 5346, "time_per_iteration": 2.7238729000091553 }, { "auxiliary_loss_clip": 0.01303785, "auxiliary_loss_mlp": 0.00872519, "balance_loss_clip": 1.00760937, "balance_loss_mlp": 1.00056124, "epoch": 0.6429387362472194, "flos": 22456965221280.0, "grad_norm": 2.4393510452811755, "language_loss": 0.78051066, "learning_rate": 1.1946947777607141e-06, "loss": 0.80227369, "num_input_tokens_seen": 115086100, "step": 5347, "time_per_iteration": 2.869575023651123 }, { "auxiliary_loss_clip": 0.01280303, "auxiliary_loss_mlp": 0.01193642, "balance_loss_clip": 1.00754452, "balance_loss_mlp": 1.00023675, "epoch": 0.6430589791378585, "flos": 24752590214400.0, "grad_norm": 1.8551208448433036, "language_loss": 0.80400538, "learning_rate": 1.1939818049286024e-06, "loss": 0.82874489, "num_input_tokens_seen": 115104260, "step": 5348, "time_per_iteration": 2.815866231918335 }, { "auxiliary_loss_clip": 0.01255495, "auxiliary_loss_mlp": 0.01193534, "balance_loss_clip": 1.00741243, "balance_loss_mlp": 1.00022435, "epoch": 0.6431792220284975, "flos": 24901261413120.0, "grad_norm": 1.9202095529580172, "language_loss": 0.75713181, "learning_rate": 1.1932689543688101e-06, "loss": 0.78162211, "num_input_tokens_seen": 115125365, "step": 5349, "time_per_iteration": 2.977684259414673 }, { "auxiliary_loss_clip": 0.01304768, "auxiliary_loss_mlp": 0.01193586, "balance_loss_clip": 1.00772166, "balance_loss_mlp": 1.0001806, "epoch": 0.6432994649191367, "flos": 21032316144000.0, "grad_norm": 1.7807175178607313, "language_loss": 0.72373337, "learning_rate": 1.1925562261894756e-06, "loss": 0.74871695, "num_input_tokens_seen": 115144445, "step": 5350, "time_per_iteration": 2.9862873554229736 }, { "auxiliary_loss_clip": 0.01315878, "auxiliary_loss_mlp": 0.0119333, "balance_loss_clip": 1.0081265, "balance_loss_mlp": 1.00021136, "epoch": 0.6434197078097758, "flos": 30884447213280.0, "grad_norm": 1.785958033731761, "language_loss": 0.77606964, "learning_rate": 1.1918436204987207e-06, "loss": 0.80116177, "num_input_tokens_seen": 115166305, "step": 5351, "time_per_iteration": 2.8861501216888428 }, { "auxiliary_loss_clip": 0.01326847, "auxiliary_loss_mlp": 0.01193662, "balance_loss_clip": 1.00840414, "balance_loss_mlp": 1.00025654, "epoch": 0.6435399507004148, "flos": 15012501399360.0, "grad_norm": 2.379682331000494, "language_loss": 0.81715035, "learning_rate": 1.191131137404645e-06, "loss": 0.84235537, "num_input_tokens_seen": 115183045, "step": 5352, "time_per_iteration": 2.7029271125793457 }, { "auxiliary_loss_clip": 0.01307399, "auxiliary_loss_mlp": 0.01193563, "balance_loss_clip": 1.00833547, "balance_loss_mlp": 1.00025368, "epoch": 0.643660193591054, "flos": 19901991875040.0, "grad_norm": 1.9955070493216418, "language_loss": 0.77761042, "learning_rate": 1.190418777015333e-06, "loss": 0.80262005, "num_input_tokens_seen": 115201955, "step": 5353, "time_per_iteration": 2.762820243835449 }, { "auxiliary_loss_clip": 0.01313183, "auxiliary_loss_mlp": 0.01193622, "balance_loss_clip": 1.00761116, "balance_loss_mlp": 1.00021708, "epoch": 0.643780436481693, "flos": 24133634220960.0, "grad_norm": 1.4293813261110253, "language_loss": 0.73580128, "learning_rate": 1.1897065394388487e-06, "loss": 0.76086938, "num_input_tokens_seen": 115222395, "step": 5354, "time_per_iteration": 2.928030014038086 }, { "auxiliary_loss_clip": 0.01307639, "auxiliary_loss_mlp": 0.01193659, "balance_loss_clip": 1.00825512, "balance_loss_mlp": 1.00025368, "epoch": 0.6439006793723321, "flos": 23148819866880.0, "grad_norm": 1.4657749870130894, "language_loss": 0.76533175, "learning_rate": 1.1889944247832385e-06, "loss": 0.79034472, "num_input_tokens_seen": 115242635, "step": 5355, "time_per_iteration": 2.819122791290283 }, { "auxiliary_loss_clip": 0.01339113, "auxiliary_loss_mlp": 0.01193633, "balance_loss_clip": 1.00850499, "balance_loss_mlp": 1.00022829, "epoch": 0.6440209222629713, "flos": 23617919180160.0, "grad_norm": 1.7606343883236333, "language_loss": 0.70778, "learning_rate": 1.1882824331565283e-06, "loss": 0.73310745, "num_input_tokens_seen": 115262095, "step": 5356, "time_per_iteration": 2.8444888591766357 }, { "auxiliary_loss_clip": 0.01313348, "auxiliary_loss_mlp": 0.01193436, "balance_loss_clip": 1.00855184, "balance_loss_mlp": 1.00022149, "epoch": 0.6441411651536103, "flos": 16544881889280.0, "grad_norm": 2.1074274972661406, "language_loss": 0.88961554, "learning_rate": 1.1875705646667287e-06, "loss": 0.9146834, "num_input_tokens_seen": 115279985, "step": 5357, "time_per_iteration": 2.798786163330078 }, { "auxiliary_loss_clip": 0.0133928, "auxiliary_loss_mlp": 0.01193798, "balance_loss_clip": 1.00834274, "balance_loss_mlp": 1.00020242, "epoch": 0.6442614080442494, "flos": 25410977199360.0, "grad_norm": 2.003683655642697, "language_loss": 0.755512, "learning_rate": 1.1868588194218282e-06, "loss": 0.78084278, "num_input_tokens_seen": 115300365, "step": 5358, "time_per_iteration": 2.7799251079559326 }, { "auxiliary_loss_clip": 0.01322127, "auxiliary_loss_mlp": 0.01193688, "balance_loss_clip": 1.00779343, "balance_loss_mlp": 1.00028253, "epoch": 0.6443816509348885, "flos": 28294030404000.0, "grad_norm": 1.473087852133951, "language_loss": 0.73901564, "learning_rate": 1.1861471975297979e-06, "loss": 0.76417375, "num_input_tokens_seen": 115322060, "step": 5359, "time_per_iteration": 2.7859346866607666 }, { "auxiliary_loss_clip": 0.01280454, "auxiliary_loss_mlp": 0.0119369, "balance_loss_clip": 1.00736511, "balance_loss_mlp": 1.00019002, "epoch": 0.6445018938255276, "flos": 36690092385120.0, "grad_norm": 1.4972960378931695, "language_loss": 0.70852959, "learning_rate": 1.185435699098591e-06, "loss": 0.73327106, "num_input_tokens_seen": 115348255, "step": 5360, "time_per_iteration": 2.9558117389678955 }, { "auxiliary_loss_clip": 0.01318124, "auxiliary_loss_mlp": 0.01193513, "balance_loss_clip": 1.00805879, "balance_loss_mlp": 1.00020289, "epoch": 0.6446221367161666, "flos": 14501420513280.0, "grad_norm": 3.0123095847388157, "language_loss": 0.77958256, "learning_rate": 1.1847243242361403e-06, "loss": 0.80469894, "num_input_tokens_seen": 115366845, "step": 5361, "time_per_iteration": 2.7533116340637207 }, { "auxiliary_loss_clip": 0.01316013, "auxiliary_loss_mlp": 0.01193564, "balance_loss_clip": 1.00859642, "balance_loss_mlp": 1.00015926, "epoch": 0.6447423796068057, "flos": 24609379415040.0, "grad_norm": 4.2217651946625185, "language_loss": 0.78219742, "learning_rate": 1.1840130730503624e-06, "loss": 0.80729318, "num_input_tokens_seen": 115388125, "step": 5362, "time_per_iteration": 4.651172399520874 }, { "auxiliary_loss_clip": 0.01351903, "auxiliary_loss_mlp": 0.01193561, "balance_loss_clip": 1.00875998, "balance_loss_mlp": 1.00015545, "epoch": 0.6448626224974449, "flos": 25047310183200.0, "grad_norm": 1.738094014280527, "language_loss": 0.74684143, "learning_rate": 1.1833019456491518e-06, "loss": 0.77229607, "num_input_tokens_seen": 115409655, "step": 5363, "time_per_iteration": 3.6836044788360596 }, { "auxiliary_loss_clip": 0.01328624, "auxiliary_loss_mlp": 0.01193845, "balance_loss_clip": 1.00784814, "balance_loss_mlp": 1.00024962, "epoch": 0.6449828653880839, "flos": 22530366804960.0, "grad_norm": 1.80445727916498, "language_loss": 0.78833413, "learning_rate": 1.1825909421403871e-06, "loss": 0.81355888, "num_input_tokens_seen": 115428750, "step": 5364, "time_per_iteration": 2.8298606872558594 }, { "auxiliary_loss_clip": 0.01333176, "auxiliary_loss_mlp": 0.01193598, "balance_loss_clip": 1.00799036, "balance_loss_mlp": 1.00019324, "epoch": 0.645103108278723, "flos": 25695746308800.0, "grad_norm": 1.762953929563729, "language_loss": 0.76566744, "learning_rate": 1.181880062631926e-06, "loss": 0.79093516, "num_input_tokens_seen": 115448085, "step": 5365, "time_per_iteration": 3.8284378051757812 }, { "auxiliary_loss_clip": 0.01314384, "auxiliary_loss_mlp": 0.01193849, "balance_loss_clip": 1.00815964, "balance_loss_mlp": 1.0002532, "epoch": 0.6452233511693621, "flos": 27450343275840.0, "grad_norm": 8.36459433866574, "language_loss": 0.84555584, "learning_rate": 1.1811693072316093e-06, "loss": 0.87063819, "num_input_tokens_seen": 115465765, "step": 5366, "time_per_iteration": 2.8314671516418457 }, { "auxiliary_loss_clip": 0.01351301, "auxiliary_loss_mlp": 0.00872616, "balance_loss_clip": 1.00830483, "balance_loss_mlp": 1.00064874, "epoch": 0.6453435940600012, "flos": 19208628434880.0, "grad_norm": 2.1378071741404745, "language_loss": 0.84236825, "learning_rate": 1.1804586760472574e-06, "loss": 0.86460739, "num_input_tokens_seen": 115482230, "step": 5367, "time_per_iteration": 2.7229700088500977 }, { "auxiliary_loss_clip": 0.01295188, "auxiliary_loss_mlp": 0.01193468, "balance_loss_clip": 1.00727856, "balance_loss_mlp": 1.00015795, "epoch": 0.6454638369506402, "flos": 25737691957920.0, "grad_norm": 2.5179016869294046, "language_loss": 0.80387187, "learning_rate": 1.1797481691866736e-06, "loss": 0.82875848, "num_input_tokens_seen": 115499455, "step": 5368, "time_per_iteration": 2.8267030715942383 }, { "auxiliary_loss_clip": 0.01315018, "auxiliary_loss_mlp": 0.01193414, "balance_loss_clip": 1.00843072, "balance_loss_mlp": 1.0001992, "epoch": 0.6455840798412794, "flos": 20989185013440.0, "grad_norm": 1.9838439574384281, "language_loss": 0.82943553, "learning_rate": 1.1790377867576393e-06, "loss": 0.85451984, "num_input_tokens_seen": 115517205, "step": 5369, "time_per_iteration": 2.910444974899292 }, { "auxiliary_loss_clip": 0.01319149, "auxiliary_loss_mlp": 0.01193437, "balance_loss_clip": 1.00786197, "balance_loss_mlp": 1.00022221, "epoch": 0.6457043227319185, "flos": 26067568000320.0, "grad_norm": 1.7840406927722559, "language_loss": 0.76572502, "learning_rate": 1.1783275288679203e-06, "loss": 0.79085082, "num_input_tokens_seen": 115534370, "step": 5370, "time_per_iteration": 2.8709030151367188 }, { "auxiliary_loss_clip": 0.0130947, "auxiliary_loss_mlp": 0.01193082, "balance_loss_clip": 1.00549054, "balance_loss_mlp": 1.00005805, "epoch": 0.6458245656225575, "flos": 60370864796160.0, "grad_norm": 0.8415967815978626, "language_loss": 0.57176173, "learning_rate": 1.177617395625262e-06, "loss": 0.59678727, "num_input_tokens_seen": 115592345, "step": 5371, "time_per_iteration": 3.283381462097168 }, { "auxiliary_loss_clip": 0.01330194, "auxiliary_loss_mlp": 0.01193434, "balance_loss_clip": 1.008255, "balance_loss_mlp": 1.00021923, "epoch": 0.6459448085131967, "flos": 23076783383040.0, "grad_norm": 1.8770415997334198, "language_loss": 0.75490242, "learning_rate": 1.1769073871373908e-06, "loss": 0.78013867, "num_input_tokens_seen": 115612550, "step": 5372, "time_per_iteration": 2.8226757049560547 }, { "auxiliary_loss_clip": 0.01306152, "auxiliary_loss_mlp": 0.01193604, "balance_loss_clip": 1.0080626, "balance_loss_mlp": 1.0001986, "epoch": 0.6460650514038357, "flos": 22598200218240.0, "grad_norm": 1.6198775336741276, "language_loss": 0.83861327, "learning_rate": 1.176197503512015e-06, "loss": 0.8636108, "num_input_tokens_seen": 115632265, "step": 5373, "time_per_iteration": 2.808192253112793 }, { "auxiliary_loss_clip": 0.01327501, "auxiliary_loss_mlp": 0.01193506, "balance_loss_clip": 1.00928736, "balance_loss_mlp": 1.00029159, "epoch": 0.6461852942944748, "flos": 20266736601600.0, "grad_norm": 2.001623104217709, "language_loss": 0.82070005, "learning_rate": 1.1754877448568223e-06, "loss": 0.84591019, "num_input_tokens_seen": 115651720, "step": 5374, "time_per_iteration": 2.8294150829315186 }, { "auxiliary_loss_clip": 0.01315738, "auxiliary_loss_mlp": 0.01193616, "balance_loss_clip": 1.00779903, "balance_loss_mlp": 1.000211, "epoch": 0.646305537185114, "flos": 23367120662880.0, "grad_norm": 1.8385830741182423, "language_loss": 0.89638782, "learning_rate": 1.1747781112794837e-06, "loss": 0.92148143, "num_input_tokens_seen": 115668215, "step": 5375, "time_per_iteration": 2.97847843170166 }, { "auxiliary_loss_clip": 0.01306563, "auxiliary_loss_mlp": 0.01193535, "balance_loss_clip": 1.00828123, "balance_loss_mlp": 1.00022542, "epoch": 0.646425780075753, "flos": 24277491646560.0, "grad_norm": 1.5745745143198184, "language_loss": 0.82814097, "learning_rate": 1.1740686028876487e-06, "loss": 0.85314196, "num_input_tokens_seen": 115687080, "step": 5376, "time_per_iteration": 2.926591396331787 }, { "auxiliary_loss_clip": 0.0132597, "auxiliary_loss_mlp": 0.01193624, "balance_loss_clip": 1.00773466, "balance_loss_mlp": 1.00021839, "epoch": 0.6465460229663921, "flos": 20813979424320.0, "grad_norm": 2.8991141167043675, "language_loss": 0.74981868, "learning_rate": 1.1733592197889507e-06, "loss": 0.77501464, "num_input_tokens_seen": 115703990, "step": 5377, "time_per_iteration": 2.776149034500122 }, { "auxiliary_loss_clip": 0.01325412, "auxiliary_loss_mlp": 0.01193318, "balance_loss_clip": 1.00821149, "balance_loss_mlp": 1.00019884, "epoch": 0.6466662658570312, "flos": 22853309577120.0, "grad_norm": 1.727459741529915, "language_loss": 0.72591293, "learning_rate": 1.1726499620910014e-06, "loss": 0.75110018, "num_input_tokens_seen": 115724270, "step": 5378, "time_per_iteration": 2.7510552406311035 }, { "auxiliary_loss_clip": 0.01325784, "auxiliary_loss_mlp": 0.01193572, "balance_loss_clip": 1.00793529, "balance_loss_mlp": 1.00026274, "epoch": 0.6467865087476703, "flos": 15304563015840.0, "grad_norm": 1.8791896569179811, "language_loss": 0.7808485, "learning_rate": 1.1719408299013955e-06, "loss": 0.80604208, "num_input_tokens_seen": 115742995, "step": 5379, "time_per_iteration": 2.7436347007751465 }, { "auxiliary_loss_clip": 0.01350687, "auxiliary_loss_mlp": 0.01193573, "balance_loss_clip": 1.00851774, "balance_loss_mlp": 1.00016797, "epoch": 0.6469067516383094, "flos": 19573660550880.0, "grad_norm": 8.156951144344866, "language_loss": 0.75777173, "learning_rate": 1.1712318233277067e-06, "loss": 0.78321433, "num_input_tokens_seen": 115762015, "step": 5380, "time_per_iteration": 2.7274444103240967 }, { "auxiliary_loss_clip": 0.01304168, "auxiliary_loss_mlp": 0.01193091, "balance_loss_clip": 1.00537884, "balance_loss_mlp": 1.00006759, "epoch": 0.6470269945289485, "flos": 65098033074720.0, "grad_norm": 0.7558238825844253, "language_loss": 0.57939637, "learning_rate": 1.1705229424774916e-06, "loss": 0.60436898, "num_input_tokens_seen": 115816285, "step": 5381, "time_per_iteration": 3.2044105529785156 }, { "auxiliary_loss_clip": 0.01322208, "auxiliary_loss_mlp": 0.01193792, "balance_loss_clip": 1.00832129, "balance_loss_mlp": 1.00029111, "epoch": 0.6471472374195876, "flos": 30696955725600.0, "grad_norm": 1.546831950875919, "language_loss": 0.63944316, "learning_rate": 1.1698141874582867e-06, "loss": 0.66460311, "num_input_tokens_seen": 115837330, "step": 5382, "time_per_iteration": 2.8719897270202637 }, { "auxiliary_loss_clip": 0.01349949, "auxiliary_loss_mlp": 0.01193517, "balance_loss_clip": 1.00842547, "balance_loss_mlp": 1.00020742, "epoch": 0.6472674803102266, "flos": 20521845960480.0, "grad_norm": 1.6573257509162365, "language_loss": 0.72077763, "learning_rate": 1.169105558377609e-06, "loss": 0.74621224, "num_input_tokens_seen": 115857420, "step": 5383, "time_per_iteration": 2.8043289184570312 }, { "auxiliary_loss_clip": 0.0125124, "auxiliary_loss_mlp": 0.00872611, "balance_loss_clip": 1.00777054, "balance_loss_mlp": 1.00062084, "epoch": 0.6473877232008658, "flos": 24715458338400.0, "grad_norm": 1.6056526282632102, "language_loss": 0.78041017, "learning_rate": 1.1683970553429587e-06, "loss": 0.80164874, "num_input_tokens_seen": 115878875, "step": 5384, "time_per_iteration": 2.9087884426116943 }, { "auxiliary_loss_clip": 0.01294234, "auxiliary_loss_mlp": 0.01193533, "balance_loss_clip": 1.00734627, "balance_loss_mlp": 1.0002228, "epoch": 0.6475079660915048, "flos": 15885560888640.0, "grad_norm": 1.770401721877046, "language_loss": 0.82212079, "learning_rate": 1.1676886784618128e-06, "loss": 0.84699845, "num_input_tokens_seen": 115895540, "step": 5385, "time_per_iteration": 2.90682053565979 }, { "auxiliary_loss_clip": 0.01326331, "auxiliary_loss_mlp": 0.01193684, "balance_loss_clip": 1.00823319, "balance_loss_mlp": 1.00018358, "epoch": 0.6476282089821439, "flos": 17381599823520.0, "grad_norm": 2.0240527840062246, "language_loss": 0.83570045, "learning_rate": 1.1669804278416332e-06, "loss": 0.86090064, "num_input_tokens_seen": 115910265, "step": 5386, "time_per_iteration": 2.743694543838501 }, { "auxiliary_loss_clip": 0.01317885, "auxiliary_loss_mlp": 0.01193532, "balance_loss_clip": 1.00818539, "balance_loss_mlp": 1.00022244, "epoch": 0.6477484518727831, "flos": 20194089415200.0, "grad_norm": 2.2462413493055498, "language_loss": 0.7158165, "learning_rate": 1.1662723035898602e-06, "loss": 0.74093074, "num_input_tokens_seen": 115930025, "step": 5387, "time_per_iteration": 2.793874502182007 }, { "auxiliary_loss_clip": 0.01326416, "auxiliary_loss_mlp": 0.01193598, "balance_loss_clip": 1.00797391, "balance_loss_mlp": 1.00019276, "epoch": 0.6478686947634221, "flos": 25410438344160.0, "grad_norm": 1.583249245119175, "language_loss": 0.81658101, "learning_rate": 1.165564305813915e-06, "loss": 0.84178114, "num_input_tokens_seen": 115949025, "step": 5388, "time_per_iteration": 3.775649070739746 }, { "auxiliary_loss_clip": 0.01332472, "auxiliary_loss_mlp": 0.01193276, "balance_loss_clip": 1.00794256, "balance_loss_mlp": 1.00015724, "epoch": 0.6479889376540612, "flos": 20083591879200.0, "grad_norm": 1.538080829285432, "language_loss": 0.81328923, "learning_rate": 1.1648564346212019e-06, "loss": 0.83854675, "num_input_tokens_seen": 115968145, "step": 5389, "time_per_iteration": 3.7294931411743164 }, { "auxiliary_loss_clip": 0.01324804, "auxiliary_loss_mlp": 0.01193615, "balance_loss_clip": 1.00819087, "balance_loss_mlp": 1.00021017, "epoch": 0.6481091805447003, "flos": 26758093469760.0, "grad_norm": 1.8355401244852658, "language_loss": 0.76208878, "learning_rate": 1.164148690119104e-06, "loss": 0.78727293, "num_input_tokens_seen": 115989425, "step": 5390, "time_per_iteration": 3.673109292984009 }, { "auxiliary_loss_clip": 0.01350163, "auxiliary_loss_mlp": 0.01193658, "balance_loss_clip": 1.00804949, "balance_loss_mlp": 1.00025296, "epoch": 0.6482294234353394, "flos": 23952106064160.0, "grad_norm": 1.682239982321714, "language_loss": 0.74083531, "learning_rate": 1.163441072414985e-06, "loss": 0.7662735, "num_input_tokens_seen": 116009630, "step": 5391, "time_per_iteration": 2.733323097229004 }, { "auxiliary_loss_clip": 0.01328454, "auxiliary_loss_mlp": 0.01193404, "balance_loss_clip": 1.00784135, "balance_loss_mlp": 1.00018966, "epoch": 0.6483496663259785, "flos": 26209844784000.0, "grad_norm": 1.7867753128953339, "language_loss": 0.69734633, "learning_rate": 1.16273358161619e-06, "loss": 0.72256488, "num_input_tokens_seen": 116029965, "step": 5392, "time_per_iteration": 3.6807405948638916 }, { "auxiliary_loss_clip": 0.01306627, "auxiliary_loss_mlp": 0.01193639, "balance_loss_clip": 1.00843644, "balance_loss_mlp": 1.00023413, "epoch": 0.6484699092166175, "flos": 20922249692160.0, "grad_norm": 1.7231898938290662, "language_loss": 0.83056927, "learning_rate": 1.1620262178300446e-06, "loss": 0.85557193, "num_input_tokens_seen": 116048580, "step": 5393, "time_per_iteration": 2.7705862522125244 }, { "auxiliary_loss_clip": 0.01305463, "auxiliary_loss_mlp": 0.01193532, "balance_loss_clip": 1.00812078, "balance_loss_mlp": 1.00022268, "epoch": 0.6485901521072567, "flos": 33072879648960.0, "grad_norm": 1.6573969969964986, "language_loss": 0.75285476, "learning_rate": 1.1613189811638563e-06, "loss": 0.77784479, "num_input_tokens_seen": 116070305, "step": 5394, "time_per_iteration": 3.0051450729370117 }, { "auxiliary_loss_clip": 0.01337951, "auxiliary_loss_mlp": 0.0119345, "balance_loss_clip": 1.00921464, "balance_loss_mlp": 1.00023532, "epoch": 0.6487103949978957, "flos": 22274071964640.0, "grad_norm": 1.6195310417967084, "language_loss": 0.78035855, "learning_rate": 1.1606118717249117e-06, "loss": 0.80567253, "num_input_tokens_seen": 116090405, "step": 5395, "time_per_iteration": 2.775256633758545 }, { "auxiliary_loss_clip": 0.01352282, "auxiliary_loss_mlp": 0.0119372, "balance_loss_clip": 1.00814342, "balance_loss_mlp": 1.00021958, "epoch": 0.6488306378885348, "flos": 22930411299840.0, "grad_norm": 1.6985259503564332, "language_loss": 0.67598546, "learning_rate": 1.1599048896204787e-06, "loss": 0.70144546, "num_input_tokens_seen": 116110285, "step": 5396, "time_per_iteration": 2.7228481769561768 }, { "auxiliary_loss_clip": 0.01302485, "auxiliary_loss_mlp": 0.01193434, "balance_loss_clip": 1.00846803, "balance_loss_mlp": 1.00022006, "epoch": 0.648950880779174, "flos": 20376120503520.0, "grad_norm": 1.7349762750977924, "language_loss": 0.80776781, "learning_rate": 1.1591980349578061e-06, "loss": 0.83272702, "num_input_tokens_seen": 116128955, "step": 5397, "time_per_iteration": 3.0083813667297363 }, { "auxiliary_loss_clip": 0.01271091, "auxiliary_loss_mlp": 0.01193027, "balance_loss_clip": 1.00456715, "balance_loss_mlp": 1.0000037, "epoch": 0.649071123669813, "flos": 59930922301920.0, "grad_norm": 0.7329938033252554, "language_loss": 0.54298532, "learning_rate": 1.158491307844123e-06, "loss": 0.56762654, "num_input_tokens_seen": 116188875, "step": 5398, "time_per_iteration": 3.358043670654297 }, { "auxiliary_loss_clip": 0.01313052, "auxiliary_loss_mlp": 0.0119343, "balance_loss_clip": 1.00846124, "balance_loss_mlp": 1.00021601, "epoch": 0.6491913665604521, "flos": 20446576345440.0, "grad_norm": 1.6048450964943808, "language_loss": 0.83780009, "learning_rate": 1.1577847083866387e-06, "loss": 0.86286491, "num_input_tokens_seen": 116207910, "step": 5399, "time_per_iteration": 2.801020622253418 }, { "auxiliary_loss_clip": 0.01313218, "auxiliary_loss_mlp": 0.01193779, "balance_loss_clip": 1.00795352, "balance_loss_mlp": 1.00027883, "epoch": 0.6493116094510912, "flos": 16946830339200.0, "grad_norm": 1.7075005165986774, "language_loss": 0.72215265, "learning_rate": 1.1570782366925453e-06, "loss": 0.7472226, "num_input_tokens_seen": 116226425, "step": 5400, "time_per_iteration": 2.747483253479004 }, { "auxiliary_loss_clip": 0.01326914, "auxiliary_loss_mlp": 0.01193493, "balance_loss_clip": 1.0081116, "balance_loss_mlp": 1.00018311, "epoch": 0.6494318523417303, "flos": 18802943922240.0, "grad_norm": 1.6174945775429828, "language_loss": 0.75434279, "learning_rate": 1.1563718928690132e-06, "loss": 0.77954686, "num_input_tokens_seen": 116243860, "step": 5401, "time_per_iteration": 2.781893253326416 }, { "auxiliary_loss_clip": 0.01286417, "auxiliary_loss_mlp": 0.01193801, "balance_loss_clip": 1.00678325, "balance_loss_mlp": 1.00020552, "epoch": 0.6495520952323693, "flos": 18982855513440.0, "grad_norm": 1.9594697329100563, "language_loss": 0.71494836, "learning_rate": 1.1556656770231942e-06, "loss": 0.7397505, "num_input_tokens_seen": 116260055, "step": 5402, "time_per_iteration": 2.8082265853881836 }, { "auxiliary_loss_clip": 0.01338335, "auxiliary_loss_mlp": 0.01193503, "balance_loss_clip": 1.00840354, "balance_loss_mlp": 1.00028825, "epoch": 0.6496723381230085, "flos": 22745398546080.0, "grad_norm": 1.5934926420006992, "language_loss": 0.76474917, "learning_rate": 1.1549595892622207e-06, "loss": 0.79006755, "num_input_tokens_seen": 116278825, "step": 5403, "time_per_iteration": 2.7777950763702393 }, { "auxiliary_loss_clip": 0.01244026, "auxiliary_loss_mlp": 0.01193029, "balance_loss_clip": 1.00588465, "balance_loss_mlp": 1.00000536, "epoch": 0.6497925810136476, "flos": 62145314349120.0, "grad_norm": 0.8202570072343555, "language_loss": 0.59028172, "learning_rate": 1.1542536296932047e-06, "loss": 0.61465228, "num_input_tokens_seen": 116342360, "step": 5404, "time_per_iteration": 3.483473777770996 }, { "auxiliary_loss_clip": 0.01307262, "auxiliary_loss_mlp": 0.01193771, "balance_loss_clip": 1.0084753, "balance_loss_mlp": 1.00027061, "epoch": 0.6499128239042866, "flos": 20156741997120.0, "grad_norm": 1.8151406079024004, "language_loss": 0.70314538, "learning_rate": 1.1535477984232414e-06, "loss": 0.72815567, "num_input_tokens_seen": 116362235, "step": 5405, "time_per_iteration": 3.1041505336761475 }, { "auxiliary_loss_clip": 0.01276879, "auxiliary_loss_mlp": 0.01193281, "balance_loss_clip": 1.00738502, "balance_loss_mlp": 1.00016189, "epoch": 0.6500330667949258, "flos": 24462432552960.0, "grad_norm": 1.791766791198716, "language_loss": 0.76830369, "learning_rate": 1.152842095559404e-06, "loss": 0.79300529, "num_input_tokens_seen": 116382895, "step": 5406, "time_per_iteration": 2.9398441314697266 }, { "auxiliary_loss_clip": 0.01324313, "auxiliary_loss_mlp": 0.01193408, "balance_loss_clip": 1.00772011, "balance_loss_mlp": 1.00019372, "epoch": 0.6501533096855648, "flos": 25477409589120.0, "grad_norm": 1.5981154877762056, "language_loss": 0.76910162, "learning_rate": 1.1521365212087474e-06, "loss": 0.79427886, "num_input_tokens_seen": 116402880, "step": 5407, "time_per_iteration": 3.1606383323669434 }, { "auxiliary_loss_clip": 0.0132812, "auxiliary_loss_mlp": 0.01193618, "balance_loss_clip": 1.00793529, "balance_loss_mlp": 1.00021315, "epoch": 0.6502735525762039, "flos": 44819262040320.0, "grad_norm": 2.316300000406488, "language_loss": 0.70606053, "learning_rate": 1.1514310754783062e-06, "loss": 0.73127794, "num_input_tokens_seen": 116425830, "step": 5408, "time_per_iteration": 3.008758068084717 }, { "auxiliary_loss_clip": 0.01316267, "auxiliary_loss_mlp": 0.01193484, "balance_loss_clip": 1.00757599, "balance_loss_mlp": 1.00026929, "epoch": 0.6503937954668431, "flos": 28658559588480.0, "grad_norm": 3.0676088203368317, "language_loss": 0.73526192, "learning_rate": 1.1507257584750964e-06, "loss": 0.76035941, "num_input_tokens_seen": 116446010, "step": 5409, "time_per_iteration": 3.125117540359497 }, { "auxiliary_loss_clip": 0.01351291, "auxiliary_loss_mlp": 0.0119365, "balance_loss_clip": 1.00883007, "balance_loss_mlp": 1.00024509, "epoch": 0.6505140383574821, "flos": 20922573005280.0, "grad_norm": 1.8246011514549314, "language_loss": 0.7726928, "learning_rate": 1.150020570306113e-06, "loss": 0.79814225, "num_input_tokens_seen": 116465150, "step": 5410, "time_per_iteration": 2.7566986083984375 }, { "auxiliary_loss_clip": 0.01327942, "auxiliary_loss_mlp": 0.0119364, "balance_loss_clip": 1.00878954, "balance_loss_mlp": 1.0002346, "epoch": 0.6506342812481212, "flos": 20595247544160.0, "grad_norm": 1.721727103793964, "language_loss": 0.74798739, "learning_rate": 1.1493155110783338e-06, "loss": 0.77320313, "num_input_tokens_seen": 116483675, "step": 5411, "time_per_iteration": 2.7885265350341797 }, { "auxiliary_loss_clip": 0.01326798, "auxiliary_loss_mlp": 0.01193681, "balance_loss_clip": 1.00803995, "balance_loss_mlp": 1.00018072, "epoch": 0.6507545241387603, "flos": 30226491312480.0, "grad_norm": 1.8097939893493822, "language_loss": 0.70528972, "learning_rate": 1.1486105808987155e-06, "loss": 0.7304945, "num_input_tokens_seen": 116505165, "step": 5412, "time_per_iteration": 2.8430423736572266 }, { "auxiliary_loss_clip": 0.01328875, "auxiliary_loss_mlp": 0.01193742, "balance_loss_clip": 1.00822425, "balance_loss_mlp": 1.00024164, "epoch": 0.6508747670293994, "flos": 17128250724960.0, "grad_norm": 1.6831318010329852, "language_loss": 0.81026709, "learning_rate": 1.1479057798741947e-06, "loss": 0.83549333, "num_input_tokens_seen": 116523220, "step": 5413, "time_per_iteration": 2.772491931915283 }, { "auxiliary_loss_clip": 0.01289071, "auxiliary_loss_mlp": 0.01193053, "balance_loss_clip": 1.00883389, "balance_loss_mlp": 1.00002909, "epoch": 0.6509950099200384, "flos": 68559856884000.0, "grad_norm": 0.7843525693755198, "language_loss": 0.53383118, "learning_rate": 1.14720110811169e-06, "loss": 0.55865246, "num_input_tokens_seen": 116580450, "step": 5414, "time_per_iteration": 4.430028676986694 }, { "auxiliary_loss_clip": 0.01331164, "auxiliary_loss_mlp": 0.0119393, "balance_loss_clip": 1.0087502, "balance_loss_mlp": 1.00023913, "epoch": 0.6511152528106776, "flos": 22347473548320.0, "grad_norm": 2.4039846260175723, "language_loss": 0.76882041, "learning_rate": 1.146496565718098e-06, "loss": 0.79407132, "num_input_tokens_seen": 116601020, "step": 5415, "time_per_iteration": 4.750324487686157 }, { "auxiliary_loss_clip": 0.01306633, "auxiliary_loss_mlp": 0.01193724, "balance_loss_clip": 1.00852633, "balance_loss_mlp": 1.00022388, "epoch": 0.6512354957013167, "flos": 20522169273600.0, "grad_norm": 1.8227344171324746, "language_loss": 0.75782198, "learning_rate": 1.1457921528002996e-06, "loss": 0.78282559, "num_input_tokens_seen": 116619455, "step": 5416, "time_per_iteration": 2.7611982822418213 }, { "auxiliary_loss_clip": 0.01350759, "auxiliary_loss_mlp": 0.00872585, "balance_loss_clip": 1.00808597, "balance_loss_mlp": 1.00041163, "epoch": 0.6513557385919557, "flos": 32337355017600.0, "grad_norm": 2.8558006232892907, "language_loss": 0.71750581, "learning_rate": 1.1450878694651522e-06, "loss": 0.7397393, "num_input_tokens_seen": 116640020, "step": 5417, "time_per_iteration": 3.7108068466186523 }, { "auxiliary_loss_clip": 0.01280041, "auxiliary_loss_mlp": 0.01193724, "balance_loss_clip": 1.00752783, "balance_loss_mlp": 1.00022304, "epoch": 0.6514759814825949, "flos": 12093214410720.0, "grad_norm": 2.3838062300208374, "language_loss": 0.63524079, "learning_rate": 1.1443837158194954e-06, "loss": 0.65997845, "num_input_tokens_seen": 116655165, "step": 5418, "time_per_iteration": 2.887378454208374 }, { "auxiliary_loss_clip": 0.01283627, "auxiliary_loss_mlp": 0.01193271, "balance_loss_clip": 1.00719988, "balance_loss_mlp": 1.00024724, "epoch": 0.651596224373234, "flos": 22526918131680.0, "grad_norm": 1.6573611555675116, "language_loss": 0.74483478, "learning_rate": 1.1436796919701484e-06, "loss": 0.76960373, "num_input_tokens_seen": 116673880, "step": 5419, "time_per_iteration": 2.7451059818267822 }, { "auxiliary_loss_clip": 0.01309368, "auxiliary_loss_mlp": 0.01193388, "balance_loss_clip": 1.00801849, "balance_loss_mlp": 1.00017309, "epoch": 0.651716467263873, "flos": 27818967759840.0, "grad_norm": 2.116479150849915, "language_loss": 0.61684495, "learning_rate": 1.1429757980239115e-06, "loss": 0.64187253, "num_input_tokens_seen": 116694305, "step": 5420, "time_per_iteration": 2.8353469371795654 }, { "auxiliary_loss_clip": 0.01351081, "auxiliary_loss_mlp": 0.01193898, "balance_loss_clip": 1.00855327, "balance_loss_mlp": 1.00020719, "epoch": 0.6518367101545122, "flos": 24316311935520.0, "grad_norm": 2.5335484914391917, "language_loss": 0.8162756, "learning_rate": 1.1422720340875636e-06, "loss": 0.84172541, "num_input_tokens_seen": 116713055, "step": 5421, "time_per_iteration": 2.7743899822235107 }, { "auxiliary_loss_clip": 0.01339577, "auxiliary_loss_mlp": 0.01193854, "balance_loss_clip": 1.00870991, "balance_loss_mlp": 1.00025797, "epoch": 0.6519569530451512, "flos": 20011950555840.0, "grad_norm": 2.443703938113736, "language_loss": 0.79238176, "learning_rate": 1.1415684002678671e-06, "loss": 0.81771606, "num_input_tokens_seen": 116731815, "step": 5422, "time_per_iteration": 2.7897136211395264 }, { "auxiliary_loss_clip": 0.01327718, "auxiliary_loss_mlp": 0.01193885, "balance_loss_clip": 1.00887895, "balance_loss_mlp": 1.00028884, "epoch": 0.6520771959357903, "flos": 21576074369760.0, "grad_norm": 2.1475801680134827, "language_loss": 0.77489793, "learning_rate": 1.1408648966715617e-06, "loss": 0.80011398, "num_input_tokens_seen": 116749335, "step": 5423, "time_per_iteration": 2.738961696624756 }, { "auxiliary_loss_clip": 0.01328483, "auxiliary_loss_mlp": 0.01193812, "balance_loss_clip": 1.00860882, "balance_loss_mlp": 1.00021577, "epoch": 0.6521974388264293, "flos": 22711032793440.0, "grad_norm": 1.6362592396709223, "language_loss": 0.72386259, "learning_rate": 1.1401615234053683e-06, "loss": 0.74908549, "num_input_tokens_seen": 116768155, "step": 5424, "time_per_iteration": 2.7485311031341553 }, { "auxiliary_loss_clip": 0.01315038, "auxiliary_loss_mlp": 0.0119375, "balance_loss_clip": 1.00780272, "balance_loss_mlp": 1.00024986, "epoch": 0.6523176817170685, "flos": 23002950715200.0, "grad_norm": 1.6686286143713038, "language_loss": 0.75964046, "learning_rate": 1.1394582805759885e-06, "loss": 0.78472841, "num_input_tokens_seen": 116787435, "step": 5425, "time_per_iteration": 2.8307793140411377 }, { "auxiliary_loss_clip": 0.01326999, "auxiliary_loss_mlp": 0.01193837, "balance_loss_clip": 1.00808549, "balance_loss_mlp": 1.0002408, "epoch": 0.6524379246077076, "flos": 21688260318720.0, "grad_norm": 1.665014853264903, "language_loss": 0.75613391, "learning_rate": 1.1387551682901022e-06, "loss": 0.78134221, "num_input_tokens_seen": 116808040, "step": 5426, "time_per_iteration": 2.6651194095611572 }, { "auxiliary_loss_clip": 0.01290879, "auxiliary_loss_mlp": 0.01193723, "balance_loss_clip": 1.0074091, "balance_loss_mlp": 1.00022244, "epoch": 0.6525581674983466, "flos": 19390946912640.0, "grad_norm": 1.7611153348236466, "language_loss": 0.70876819, "learning_rate": 1.138052186654373e-06, "loss": 0.73361421, "num_input_tokens_seen": 116825510, "step": 5427, "time_per_iteration": 2.787097454071045 }, { "auxiliary_loss_clip": 0.01328744, "auxiliary_loss_mlp": 0.01193641, "balance_loss_clip": 1.00880504, "balance_loss_mlp": 1.00023592, "epoch": 0.6526784103889858, "flos": 17165454448320.0, "grad_norm": 2.023745845490893, "language_loss": 0.8812722, "learning_rate": 1.1373493357754417e-06, "loss": 0.90649605, "num_input_tokens_seen": 116844415, "step": 5428, "time_per_iteration": 2.7078263759613037 }, { "auxiliary_loss_clip": 0.01351477, "auxiliary_loss_mlp": 0.01193428, "balance_loss_clip": 1.0083766, "balance_loss_mlp": 1.00021374, "epoch": 0.6527986532796248, "flos": 18989178081120.0, "grad_norm": 1.5367971168111718, "language_loss": 0.76878262, "learning_rate": 1.1366466157599303e-06, "loss": 0.79423165, "num_input_tokens_seen": 116863690, "step": 5429, "time_per_iteration": 2.781337261199951 }, { "auxiliary_loss_clip": 0.01282082, "auxiliary_loss_mlp": 0.00872476, "balance_loss_clip": 1.00779533, "balance_loss_mlp": 1.00055218, "epoch": 0.6529188961702639, "flos": 14238587563200.0, "grad_norm": 2.082110863604073, "language_loss": 0.76002395, "learning_rate": 1.1359440267144412e-06, "loss": 0.78156954, "num_input_tokens_seen": 116881145, "step": 5430, "time_per_iteration": 2.7789947986602783 }, { "auxiliary_loss_clip": 0.01338459, "auxiliary_loss_mlp": 0.01193475, "balance_loss_clip": 1.00860095, "balance_loss_mlp": 1.0001651, "epoch": 0.653039139060903, "flos": 36682943572800.0, "grad_norm": 1.885190457577212, "language_loss": 0.74397331, "learning_rate": 1.1352415687455556e-06, "loss": 0.76929259, "num_input_tokens_seen": 116902405, "step": 5431, "time_per_iteration": 2.885188102722168 }, { "auxiliary_loss_clip": 0.01329623, "auxiliary_loss_mlp": 0.01193645, "balance_loss_clip": 1.00834632, "balance_loss_mlp": 1.00023973, "epoch": 0.6531593819515421, "flos": 25376288133600.0, "grad_norm": 3.7125298114567156, "language_loss": 0.63971424, "learning_rate": 1.1345392419598362e-06, "loss": 0.66494691, "num_input_tokens_seen": 116921285, "step": 5432, "time_per_iteration": 2.7633581161499023 }, { "auxiliary_loss_clip": 0.01338415, "auxiliary_loss_mlp": 0.01193481, "balance_loss_clip": 1.00817645, "balance_loss_mlp": 1.00017118, "epoch": 0.6532796248421812, "flos": 21178544532480.0, "grad_norm": 1.6132768082807973, "language_loss": 0.71559286, "learning_rate": 1.1338370464638263e-06, "loss": 0.74091178, "num_input_tokens_seen": 116940685, "step": 5433, "time_per_iteration": 2.7209208011627197 }, { "auxiliary_loss_clip": 0.0135121, "auxiliary_loss_mlp": 0.0119359, "balance_loss_clip": 1.00836647, "balance_loss_mlp": 1.00018454, "epoch": 0.6533998677328203, "flos": 17675960555520.0, "grad_norm": 1.9696988092796197, "language_loss": 0.635252, "learning_rate": 1.1331349823640474e-06, "loss": 0.66069996, "num_input_tokens_seen": 116958115, "step": 5434, "time_per_iteration": 2.719484806060791 }, { "auxiliary_loss_clip": 0.0133795, "auxiliary_loss_mlp": 0.00872413, "balance_loss_clip": 1.0082705, "balance_loss_mlp": 1.00071526, "epoch": 0.6535201106234594, "flos": 28400396716800.0, "grad_norm": 4.860407643071714, "language_loss": 0.77866411, "learning_rate": 1.132433049767003e-06, "loss": 0.80076778, "num_input_tokens_seen": 116976030, "step": 5435, "time_per_iteration": 2.797484874725342 }, { "auxiliary_loss_clip": 0.01308186, "auxiliary_loss_mlp": 0.01193506, "balance_loss_clip": 1.00743473, "balance_loss_mlp": 1.00019598, "epoch": 0.6536403535140984, "flos": 23586678787680.0, "grad_norm": 1.4860602090585413, "language_loss": 0.81206101, "learning_rate": 1.1317312487791748e-06, "loss": 0.83707792, "num_input_tokens_seen": 116997680, "step": 5436, "time_per_iteration": 2.848951816558838 }, { "auxiliary_loss_clip": 0.01338813, "auxiliary_loss_mlp": 0.01193316, "balance_loss_clip": 1.00844347, "balance_loss_mlp": 1.00019717, "epoch": 0.6537605964047376, "flos": 21579487119360.0, "grad_norm": 1.8575888775127438, "language_loss": 0.7324723, "learning_rate": 1.1310295795070253e-06, "loss": 0.75779355, "num_input_tokens_seen": 117017620, "step": 5437, "time_per_iteration": 2.7490105628967285 }, { "auxiliary_loss_clip": 0.01281709, "auxiliary_loss_mlp": 0.01193663, "balance_loss_clip": 1.0083375, "balance_loss_mlp": 1.00016236, "epoch": 0.6538808392953767, "flos": 26834009711040.0, "grad_norm": 1.6876898258774289, "language_loss": 0.80726606, "learning_rate": 1.1303280420569982e-06, "loss": 0.83201981, "num_input_tokens_seen": 117039505, "step": 5438, "time_per_iteration": 2.900179862976074 }, { "auxiliary_loss_clip": 0.01338762, "auxiliary_loss_mlp": 0.01193432, "balance_loss_clip": 1.00859666, "balance_loss_mlp": 1.00021732, "epoch": 0.6540010821860157, "flos": 30738254748480.0, "grad_norm": 1.609814237045499, "language_loss": 0.77551568, "learning_rate": 1.1296266365355158e-06, "loss": 0.80083764, "num_input_tokens_seen": 117062890, "step": 5439, "time_per_iteration": 2.7680816650390625 }, { "auxiliary_loss_clip": 0.0128805, "auxiliary_loss_mlp": 0.01193727, "balance_loss_clip": 1.00659025, "balance_loss_mlp": 1.00022674, "epoch": 0.6541213250766549, "flos": 26907159828960.0, "grad_norm": 1.8587318154869206, "language_loss": 0.73746991, "learning_rate": 1.1289253630489806e-06, "loss": 0.76228768, "num_input_tokens_seen": 117083940, "step": 5440, "time_per_iteration": 3.809595823287964 }, { "auxiliary_loss_clip": 0.01340998, "auxiliary_loss_mlp": 0.01193925, "balance_loss_clip": 1.00913656, "balance_loss_mlp": 1.00023425, "epoch": 0.6542415679672939, "flos": 19172394650880.0, "grad_norm": 2.0805504684476115, "language_loss": 0.7190367, "learning_rate": 1.1282242217037753e-06, "loss": 0.74438602, "num_input_tokens_seen": 117101440, "step": 5441, "time_per_iteration": 3.7942721843719482 }, { "auxiliary_loss_clip": 0.01294508, "auxiliary_loss_mlp": 0.01193874, "balance_loss_clip": 1.00814021, "balance_loss_mlp": 1.00027847, "epoch": 0.654361810857933, "flos": 48173534055360.0, "grad_norm": 1.7429914603699275, "language_loss": 0.62087333, "learning_rate": 1.127523212606262e-06, "loss": 0.64575708, "num_input_tokens_seen": 117124265, "step": 5442, "time_per_iteration": 3.980719566345215 }, { "auxiliary_loss_clip": 0.01326276, "auxiliary_loss_mlp": 0.01193494, "balance_loss_clip": 1.00744867, "balance_loss_mlp": 1.00018406, "epoch": 0.6544820537485722, "flos": 26943178070880.0, "grad_norm": 1.4663012442108736, "language_loss": 0.73028153, "learning_rate": 1.1268223358627835e-06, "loss": 0.75547922, "num_input_tokens_seen": 117146755, "step": 5443, "time_per_iteration": 2.7835423946380615 }, { "auxiliary_loss_clip": 0.01351433, "auxiliary_loss_mlp": 0.01193653, "balance_loss_clip": 1.00864697, "balance_loss_mlp": 1.00024772, "epoch": 0.6546022966392112, "flos": 20886339221280.0, "grad_norm": 1.9842165094911275, "language_loss": 0.71783137, "learning_rate": 1.126121591579663e-06, "loss": 0.74328226, "num_input_tokens_seen": 117165960, "step": 5444, "time_per_iteration": 3.7088537216186523 }, { "auxiliary_loss_clip": 0.01325825, "auxiliary_loss_mlp": 0.01193408, "balance_loss_clip": 1.00783885, "balance_loss_mlp": 1.00019372, "epoch": 0.6547225395298503, "flos": 24936704876160.0, "grad_norm": 1.5441290836245833, "language_loss": 0.68759811, "learning_rate": 1.1254209798632018e-06, "loss": 0.71279037, "num_input_tokens_seen": 117186980, "step": 5445, "time_per_iteration": 2.8259527683258057 }, { "auxiliary_loss_clip": 0.01258839, "auxiliary_loss_mlp": 0.01193838, "balance_loss_clip": 1.00704336, "balance_loss_mlp": 1.00024188, "epoch": 0.6548427824204894, "flos": 22565953962720.0, "grad_norm": 1.6994072240805576, "language_loss": 0.84520209, "learning_rate": 1.124720500819683e-06, "loss": 0.86972886, "num_input_tokens_seen": 117205135, "step": 5446, "time_per_iteration": 2.97895884513855 }, { "auxiliary_loss_clip": 0.0135107, "auxiliary_loss_mlp": 0.0119363, "balance_loss_clip": 1.00852621, "balance_loss_mlp": 1.00022459, "epoch": 0.6549630253111285, "flos": 18442510037280.0, "grad_norm": 1.986694485587101, "language_loss": 0.82187152, "learning_rate": 1.1240201545553682e-06, "loss": 0.84731847, "num_input_tokens_seen": 117222935, "step": 5447, "time_per_iteration": 3.1320154666900635 }, { "auxiliary_loss_clip": 0.01293816, "auxiliary_loss_mlp": 0.01193514, "balance_loss_clip": 1.00761425, "balance_loss_mlp": 1.00020432, "epoch": 0.6550832682017675, "flos": 25187323775040.0, "grad_norm": 1.721502952484636, "language_loss": 0.73354959, "learning_rate": 1.1233199411764987e-06, "loss": 0.75842291, "num_input_tokens_seen": 117242370, "step": 5448, "time_per_iteration": 2.8567323684692383 }, { "auxiliary_loss_clip": 0.01302705, "auxiliary_loss_mlp": 0.0119347, "balance_loss_clip": 1.00814867, "balance_loss_mlp": 1.00016069, "epoch": 0.6552035110924067, "flos": 22748164669440.0, "grad_norm": 2.3611281911904682, "language_loss": 0.68829787, "learning_rate": 1.1226198607892978e-06, "loss": 0.7132597, "num_input_tokens_seen": 117262930, "step": 5449, "time_per_iteration": 2.9515461921691895 }, { "auxiliary_loss_clip": 0.01273854, "auxiliary_loss_mlp": 0.01193509, "balance_loss_clip": 1.00744581, "balance_loss_mlp": 1.00019908, "epoch": 0.6553237539830458, "flos": 21799188938880.0, "grad_norm": 1.754759618867143, "language_loss": 0.79953647, "learning_rate": 1.1219199134999664e-06, "loss": 0.82421017, "num_input_tokens_seen": 117281430, "step": 5450, "time_per_iteration": 2.8594226837158203 }, { "auxiliary_loss_clip": 0.01298045, "auxiliary_loss_mlp": 0.01194043, "balance_loss_clip": 1.0085876, "balance_loss_mlp": 1.00025606, "epoch": 0.6554439968736848, "flos": 20887237313280.0, "grad_norm": 2.1334488592114718, "language_loss": 0.78923547, "learning_rate": 1.1212200994146863e-06, "loss": 0.81415641, "num_input_tokens_seen": 117299185, "step": 5451, "time_per_iteration": 2.8108479976654053 }, { "auxiliary_loss_clip": 0.01307782, "auxiliary_loss_mlp": 0.01193495, "balance_loss_clip": 1.00723863, "balance_loss_mlp": 1.00018525, "epoch": 0.655564239764324, "flos": 16139053681920.0, "grad_norm": 1.9103697452226098, "language_loss": 0.75505143, "learning_rate": 1.120520418639618e-06, "loss": 0.78006417, "num_input_tokens_seen": 117317720, "step": 5452, "time_per_iteration": 2.7977147102355957 }, { "auxiliary_loss_clip": 0.01330109, "auxiliary_loss_mlp": 0.01193575, "balance_loss_clip": 1.00854063, "balance_loss_mlp": 1.0002656, "epoch": 0.655684482654963, "flos": 29570367519360.0, "grad_norm": 1.8010087738365497, "language_loss": 0.83545291, "learning_rate": 1.119820871280903e-06, "loss": 0.86068982, "num_input_tokens_seen": 117338795, "step": 5453, "time_per_iteration": 2.7780537605285645 }, { "auxiliary_loss_clip": 0.01339084, "auxiliary_loss_mlp": 0.01193559, "balance_loss_clip": 1.00882101, "balance_loss_mlp": 1.00024879, "epoch": 0.6558047255456021, "flos": 29789422712640.0, "grad_norm": 1.9559165743471216, "language_loss": 0.73424792, "learning_rate": 1.1191214574446614e-06, "loss": 0.75957441, "num_input_tokens_seen": 117359040, "step": 5454, "time_per_iteration": 2.807270050048828 }, { "auxiliary_loss_clip": 0.01315111, "auxiliary_loss_mlp": 0.01193515, "balance_loss_clip": 1.00795615, "balance_loss_mlp": 1.00020576, "epoch": 0.6559249684362413, "flos": 29059178862240.0, "grad_norm": 1.3728954727137623, "language_loss": 0.801099, "learning_rate": 1.118422177236995e-06, "loss": 0.82618523, "num_input_tokens_seen": 117380865, "step": 5455, "time_per_iteration": 2.820798397064209 }, { "auxiliary_loss_clip": 0.01316393, "auxiliary_loss_mlp": 0.0119349, "balance_loss_clip": 1.00809038, "balance_loss_mlp": 1.00018013, "epoch": 0.6560452113268803, "flos": 20225473502400.0, "grad_norm": 1.94777981250141, "language_loss": 0.85459286, "learning_rate": 1.1177230307639835e-06, "loss": 0.87969172, "num_input_tokens_seen": 117398405, "step": 5456, "time_per_iteration": 2.7398858070373535 }, { "auxiliary_loss_clip": 0.01297968, "auxiliary_loss_mlp": 0.01193718, "balance_loss_clip": 1.00749242, "balance_loss_mlp": 1.00031269, "epoch": 0.6561654542175194, "flos": 25045549922880.0, "grad_norm": 1.644325572494065, "language_loss": 0.78593123, "learning_rate": 1.1170240181316865e-06, "loss": 0.81084812, "num_input_tokens_seen": 117419850, "step": 5457, "time_per_iteration": 2.810075283050537 }, { "auxiliary_loss_clip": 0.01305759, "auxiliary_loss_mlp": 0.01193691, "balance_loss_clip": 1.00779963, "balance_loss_mlp": 1.0002861, "epoch": 0.6562856971081584, "flos": 22856722326720.0, "grad_norm": 2.1103576480503365, "language_loss": 0.79114747, "learning_rate": 1.1163251394461442e-06, "loss": 0.81614196, "num_input_tokens_seen": 117438330, "step": 5458, "time_per_iteration": 2.813380241394043 }, { "auxiliary_loss_clip": 0.01340171, "auxiliary_loss_mlp": 0.01193938, "balance_loss_clip": 1.00927544, "balance_loss_mlp": 1.00024676, "epoch": 0.6564059399987976, "flos": 18872573519520.0, "grad_norm": 2.06963880369964, "language_loss": 0.82575393, "learning_rate": 1.1156263948133746e-06, "loss": 0.85109502, "num_input_tokens_seen": 117454985, "step": 5459, "time_per_iteration": 2.7242684364318848 }, { "auxiliary_loss_clip": 0.01270125, "auxiliary_loss_mlp": 0.00872563, "balance_loss_clip": 1.00641048, "balance_loss_mlp": 1.00047565, "epoch": 0.6565261828894366, "flos": 25484199164640.0, "grad_norm": 1.8546727558433467, "language_loss": 0.77634573, "learning_rate": 1.1149277843393787e-06, "loss": 0.79777259, "num_input_tokens_seen": 117476145, "step": 5460, "time_per_iteration": 2.841732978820801 }, { "auxiliary_loss_clip": 0.01269045, "auxiliary_loss_mlp": 0.00872477, "balance_loss_clip": 1.00820565, "balance_loss_mlp": 1.00039864, "epoch": 0.6566464257800757, "flos": 19683511460640.0, "grad_norm": 2.1097241399664117, "language_loss": 0.63522959, "learning_rate": 1.1142293081301342e-06, "loss": 0.65664482, "num_input_tokens_seen": 117494025, "step": 5461, "time_per_iteration": 2.9303483963012695 }, { "auxiliary_loss_clip": 0.01312276, "auxiliary_loss_mlp": 0.01193574, "balance_loss_clip": 1.0079937, "balance_loss_mlp": 1.00026417, "epoch": 0.6567666686707149, "flos": 23514139372320.0, "grad_norm": 2.046435524537095, "language_loss": 0.68204975, "learning_rate": 1.1135309662915995e-06, "loss": 0.70710826, "num_input_tokens_seen": 117514190, "step": 5462, "time_per_iteration": 2.893249988555908 }, { "auxiliary_loss_clip": 0.01269246, "auxiliary_loss_mlp": 0.01193473, "balance_loss_clip": 1.00709033, "balance_loss_mlp": 1.00016356, "epoch": 0.6568869115613539, "flos": 32781356887680.0, "grad_norm": 1.8949980504080888, "language_loss": 0.59816611, "learning_rate": 1.112832758929712e-06, "loss": 0.62279332, "num_input_tokens_seen": 117536800, "step": 5463, "time_per_iteration": 2.942744016647339 }, { "auxiliary_loss_clip": 0.01325779, "auxiliary_loss_mlp": 0.01193867, "balance_loss_clip": 1.00794709, "balance_loss_mlp": 1.00027132, "epoch": 0.657007154451993, "flos": 18442438189920.0, "grad_norm": 2.895115227201038, "language_loss": 0.7492103, "learning_rate": 1.11213468615039e-06, "loss": 0.77440679, "num_input_tokens_seen": 117556230, "step": 5464, "time_per_iteration": 2.8264341354370117 }, { "auxiliary_loss_clip": 0.01254662, "auxiliary_loss_mlp": 0.01193392, "balance_loss_clip": 1.00717199, "balance_loss_mlp": 1.00027275, "epoch": 0.6571273973426321, "flos": 25156730008800.0, "grad_norm": 1.5099742357559343, "language_loss": 0.75101715, "learning_rate": 1.1114367480595292e-06, "loss": 0.77549767, "num_input_tokens_seen": 117577310, "step": 5465, "time_per_iteration": 2.927420139312744 }, { "auxiliary_loss_clip": 0.01247609, "auxiliary_loss_mlp": 0.01193963, "balance_loss_clip": 1.00682449, "balance_loss_mlp": 1.0002718, "epoch": 0.6572476402332712, "flos": 17529839938080.0, "grad_norm": 1.876516636518835, "language_loss": 0.81535602, "learning_rate": 1.1107389447630086e-06, "loss": 0.83977175, "num_input_tokens_seen": 117596010, "step": 5466, "time_per_iteration": 4.694177865982056 }, { "auxiliary_loss_clip": 0.01325935, "auxiliary_loss_mlp": 0.00872413, "balance_loss_clip": 1.00851083, "balance_loss_mlp": 1.00043297, "epoch": 0.6573678831239103, "flos": 17014268592000.0, "grad_norm": 2.0080180477114635, "language_loss": 0.78213888, "learning_rate": 1.1100412763666818e-06, "loss": 0.80412233, "num_input_tokens_seen": 117611270, "step": 5467, "time_per_iteration": 3.688772439956665 }, { "auxiliary_loss_clip": 0.01314781, "auxiliary_loss_mlp": 0.01193761, "balance_loss_clip": 1.00801468, "balance_loss_mlp": 1.00026, "epoch": 0.6574881260145494, "flos": 23910088567680.0, "grad_norm": 1.5350676104959409, "language_loss": 0.80022424, "learning_rate": 1.1093437429763865e-06, "loss": 0.82530969, "num_input_tokens_seen": 117631535, "step": 5468, "time_per_iteration": 2.791799545288086 }, { "auxiliary_loss_clip": 0.01332306, "auxiliary_loss_mlp": 0.01193522, "balance_loss_clip": 1.0078609, "balance_loss_mlp": 1.00021219, "epoch": 0.6576083689051885, "flos": 11218466508480.0, "grad_norm": 1.9072438605699897, "language_loss": 0.73462343, "learning_rate": 1.1086463446979361e-06, "loss": 0.75988173, "num_input_tokens_seen": 117649885, "step": 5469, "time_per_iteration": 2.8282718658447266 }, { "auxiliary_loss_clip": 0.01328311, "auxiliary_loss_mlp": 0.01193526, "balance_loss_clip": 1.00863492, "balance_loss_mlp": 1.00021672, "epoch": 0.6577286117958275, "flos": 22455564197760.0, "grad_norm": 1.7243023530478754, "language_loss": 0.77364886, "learning_rate": 1.1079490816371277e-06, "loss": 0.79886729, "num_input_tokens_seen": 117669650, "step": 5470, "time_per_iteration": 3.819470167160034 }, { "auxiliary_loss_clip": 0.01328892, "auxiliary_loss_mlp": 0.00872539, "balance_loss_clip": 1.00757051, "balance_loss_mlp": 1.00038433, "epoch": 0.6578488546864667, "flos": 21872195362080.0, "grad_norm": 1.996453929449227, "language_loss": 0.74695837, "learning_rate": 1.1072519538997352e-06, "loss": 0.76897269, "num_input_tokens_seen": 117688790, "step": 5471, "time_per_iteration": 2.813042640686035 }, { "auxiliary_loss_clip": 0.01317353, "auxiliary_loss_mlp": 0.0119375, "balance_loss_clip": 1.00759768, "balance_loss_mlp": 1.00015438, "epoch": 0.6579690975771058, "flos": 23543763199200.0, "grad_norm": 1.5840327330632717, "language_loss": 0.82357979, "learning_rate": 1.1065549615915095e-06, "loss": 0.84869075, "num_input_tokens_seen": 117708620, "step": 5472, "time_per_iteration": 2.878617286682129 }, { "auxiliary_loss_clip": 0.01325483, "auxiliary_loss_mlp": 0.01193651, "balance_loss_clip": 1.00863814, "balance_loss_mlp": 1.00024629, "epoch": 0.6580893404677448, "flos": 32744009469600.0, "grad_norm": 2.338821969604551, "language_loss": 0.78233206, "learning_rate": 1.105858104818187e-06, "loss": 0.80752337, "num_input_tokens_seen": 117729775, "step": 5473, "time_per_iteration": 2.811389207839966 }, { "auxiliary_loss_clip": 0.01332015, "auxiliary_loss_mlp": 0.01193787, "balance_loss_clip": 1.00834918, "balance_loss_mlp": 1.00028634, "epoch": 0.658209583358384, "flos": 15888147393600.0, "grad_norm": 2.1797249835884895, "language_loss": 0.74956357, "learning_rate": 1.105161383685478e-06, "loss": 0.77482158, "num_input_tokens_seen": 117746160, "step": 5474, "time_per_iteration": 2.7671661376953125 }, { "auxiliary_loss_clip": 0.01266837, "auxiliary_loss_mlp": 0.01193046, "balance_loss_clip": 1.00611448, "balance_loss_mlp": 1.00002229, "epoch": 0.658329826249023, "flos": 62695933997760.0, "grad_norm": 0.7256139672440982, "language_loss": 0.56364357, "learning_rate": 1.1044647982990771e-06, "loss": 0.58824241, "num_input_tokens_seen": 117808045, "step": 5475, "time_per_iteration": 3.3725883960723877 }, { "auxiliary_loss_clip": 0.01307992, "auxiliary_loss_mlp": 0.01193907, "balance_loss_clip": 1.00758362, "balance_loss_mlp": 1.0003109, "epoch": 0.6584500691396621, "flos": 31722638018400.0, "grad_norm": 2.0913074179712803, "language_loss": 0.64020383, "learning_rate": 1.1037683487646536e-06, "loss": 0.66522288, "num_input_tokens_seen": 117828330, "step": 5476, "time_per_iteration": 2.8024370670318604 }, { "auxiliary_loss_clip": 0.01303219, "auxiliary_loss_mlp": 0.0087247, "balance_loss_clip": 1.00743544, "balance_loss_mlp": 1.00051188, "epoch": 0.6585703120303013, "flos": 18406096634880.0, "grad_norm": 1.7592706674219736, "language_loss": 0.77314776, "learning_rate": 1.1030720351878583e-06, "loss": 0.79490465, "num_input_tokens_seen": 117846450, "step": 5477, "time_per_iteration": 2.8519399166107178 }, { "auxiliary_loss_clip": 0.01293398, "auxiliary_loss_mlp": 0.01193054, "balance_loss_clip": 1.00647771, "balance_loss_mlp": 1.0000304, "epoch": 0.6586905549209403, "flos": 58309908588000.0, "grad_norm": 0.8042237884678264, "language_loss": 0.57641011, "learning_rate": 1.102375857674323e-06, "loss": 0.60127461, "num_input_tokens_seen": 117908365, "step": 5478, "time_per_iteration": 3.2381324768066406 }, { "auxiliary_loss_clip": 0.01327298, "auxiliary_loss_mlp": 0.01193393, "balance_loss_clip": 1.00879955, "balance_loss_mlp": 1.00017846, "epoch": 0.6588107978115794, "flos": 22782638193120.0, "grad_norm": 1.7484445713307066, "language_loss": 0.90274882, "learning_rate": 1.1016798163296561e-06, "loss": 0.92795569, "num_input_tokens_seen": 117927565, "step": 5479, "time_per_iteration": 2.840714693069458 }, { "auxiliary_loss_clip": 0.01321443, "auxiliary_loss_mlp": 0.01193442, "balance_loss_clip": 1.00823545, "balance_loss_mlp": 1.00022793, "epoch": 0.6589310407022185, "flos": 20667535493760.0, "grad_norm": 1.772114034232935, "language_loss": 0.66006267, "learning_rate": 1.1009839112594471e-06, "loss": 0.68521154, "num_input_tokens_seen": 117945590, "step": 5480, "time_per_iteration": 2.7097206115722656 }, { "auxiliary_loss_clip": 0.01331675, "auxiliary_loss_mlp": 0.01193464, "balance_loss_clip": 1.00789714, "balance_loss_mlp": 1.00024939, "epoch": 0.6590512835928576, "flos": 25630607171520.0, "grad_norm": 2.1892721665540464, "language_loss": 0.71895242, "learning_rate": 1.1002881425692638e-06, "loss": 0.74420381, "num_input_tokens_seen": 117966020, "step": 5481, "time_per_iteration": 2.7990317344665527 }, { "auxiliary_loss_clip": 0.01338574, "auxiliary_loss_mlp": 0.0119361, "balance_loss_clip": 1.0082022, "balance_loss_mlp": 1.00020504, "epoch": 0.6591715264834966, "flos": 23726117600640.0, "grad_norm": 1.5955508438069679, "language_loss": 0.75201911, "learning_rate": 1.0995925103646532e-06, "loss": 0.77734101, "num_input_tokens_seen": 117984620, "step": 5482, "time_per_iteration": 2.741016149520874 }, { "auxiliary_loss_clip": 0.01294682, "auxiliary_loss_mlp": 0.01193584, "balance_loss_clip": 1.0077256, "balance_loss_mlp": 1.00017929, "epoch": 0.6592917693741358, "flos": 35773865841600.0, "grad_norm": 1.5147054955731276, "language_loss": 0.66691512, "learning_rate": 1.0988970147511437e-06, "loss": 0.69179773, "num_input_tokens_seen": 118006500, "step": 5483, "time_per_iteration": 2.915266752243042 }, { "auxiliary_loss_clip": 0.01305951, "auxiliary_loss_mlp": 0.01193728, "balance_loss_clip": 1.00769401, "balance_loss_mlp": 1.00022793, "epoch": 0.6594120122647749, "flos": 21396845328480.0, "grad_norm": 1.8398870209747837, "language_loss": 0.80660951, "learning_rate": 1.0982016558342405e-06, "loss": 0.83160639, "num_input_tokens_seen": 118025470, "step": 5484, "time_per_iteration": 2.827070951461792 }, { "auxiliary_loss_clip": 0.01351536, "auxiliary_loss_mlp": 0.0119363, "balance_loss_clip": 1.00872159, "balance_loss_mlp": 1.00022531, "epoch": 0.6595322551554139, "flos": 19351839234240.0, "grad_norm": 1.827060445782891, "language_loss": 0.71506184, "learning_rate": 1.0975064337194291e-06, "loss": 0.7405135, "num_input_tokens_seen": 118043515, "step": 5485, "time_per_iteration": 2.673414945602417 }, { "auxiliary_loss_clip": 0.0130279, "auxiliary_loss_mlp": 0.01193476, "balance_loss_clip": 1.00840187, "balance_loss_mlp": 1.00026143, "epoch": 0.6596524980460531, "flos": 16837123124160.0, "grad_norm": 2.169773002496006, "language_loss": 0.70218509, "learning_rate": 1.0968113485121743e-06, "loss": 0.72714782, "num_input_tokens_seen": 118063105, "step": 5486, "time_per_iteration": 2.848335027694702 }, { "auxiliary_loss_clip": 0.01338359, "auxiliary_loss_mlp": 0.00872597, "balance_loss_clip": 1.00831342, "balance_loss_mlp": 1.00048268, "epoch": 0.6597727409366921, "flos": 21798578236320.0, "grad_norm": 1.8917160018692945, "language_loss": 0.80144584, "learning_rate": 1.0961164003179185e-06, "loss": 0.82355535, "num_input_tokens_seen": 118081615, "step": 5487, "time_per_iteration": 2.717416763305664 }, { "auxiliary_loss_clip": 0.01303498, "auxiliary_loss_mlp": 0.0119367, "balance_loss_clip": 1.0077765, "balance_loss_mlp": 1.00016952, "epoch": 0.6598929838273312, "flos": 23730715831680.0, "grad_norm": 1.9394439909121837, "language_loss": 0.83876526, "learning_rate": 1.0954215892420884e-06, "loss": 0.86373699, "num_input_tokens_seen": 118102315, "step": 5488, "time_per_iteration": 2.8568060398101807 }, { "auxiliary_loss_clip": 0.01291169, "auxiliary_loss_mlp": 0.01193861, "balance_loss_clip": 1.00745225, "balance_loss_mlp": 1.00036001, "epoch": 0.6600132267179702, "flos": 19974531290400.0, "grad_norm": 1.9686543723120449, "language_loss": 0.70281351, "learning_rate": 1.094726915390082e-06, "loss": 0.72766382, "num_input_tokens_seen": 118120650, "step": 5489, "time_per_iteration": 2.792928695678711 }, { "auxiliary_loss_clip": 0.01328973, "auxiliary_loss_mlp": 0.01193621, "balance_loss_clip": 1.00845718, "balance_loss_mlp": 1.00021601, "epoch": 0.6601334696086094, "flos": 22342659775200.0, "grad_norm": 1.7497628688094868, "language_loss": 0.696661, "learning_rate": 1.0940323788672836e-06, "loss": 0.72188687, "num_input_tokens_seen": 118139825, "step": 5490, "time_per_iteration": 2.75872540473938 }, { "auxiliary_loss_clip": 0.01326532, "auxiliary_loss_mlp": 0.01193391, "balance_loss_clip": 1.00814128, "balance_loss_mlp": 1.00017619, "epoch": 0.6602537124992485, "flos": 25703110663200.0, "grad_norm": 1.5173710034753414, "language_loss": 0.73537886, "learning_rate": 1.0933379797790522e-06, "loss": 0.76057804, "num_input_tokens_seen": 118159240, "step": 5491, "time_per_iteration": 2.813448190689087 }, { "auxiliary_loss_clip": 0.0135132, "auxiliary_loss_mlp": 0.01193544, "balance_loss_clip": 1.00898838, "balance_loss_mlp": 1.00023389, "epoch": 0.6603739553898875, "flos": 25848584654400.0, "grad_norm": 2.552983178137175, "language_loss": 0.70850188, "learning_rate": 1.0926437182307293e-06, "loss": 0.7339505, "num_input_tokens_seen": 118178050, "step": 5492, "time_per_iteration": 3.7165515422821045 }, { "auxiliary_loss_clip": 0.01321326, "auxiliary_loss_mlp": 0.01193601, "balance_loss_clip": 1.00800848, "balance_loss_mlp": 1.00019586, "epoch": 0.6604941982805267, "flos": 24570307660320.0, "grad_norm": 1.6848357403632952, "language_loss": 0.78124559, "learning_rate": 1.0919495943276338e-06, "loss": 0.80639482, "num_input_tokens_seen": 118199070, "step": 5493, "time_per_iteration": 4.893978118896484 }, { "auxiliary_loss_clip": 0.01315739, "auxiliary_loss_mlp": 0.01193651, "balance_loss_clip": 1.00844264, "balance_loss_mlp": 1.00024605, "epoch": 0.6606144411711657, "flos": 13261784189760.0, "grad_norm": 2.503287777354724, "language_loss": 0.76143885, "learning_rate": 1.0912556081750611e-06, "loss": 0.78653276, "num_input_tokens_seen": 118217000, "step": 5494, "time_per_iteration": 2.829400062561035 }, { "auxiliary_loss_clip": 0.01302652, "auxiliary_loss_mlp": 0.0119343, "balance_loss_clip": 1.00752795, "balance_loss_mlp": 1.00021601, "epoch": 0.6607346840618048, "flos": 25155293061600.0, "grad_norm": 1.8210749126354602, "language_loss": 0.76653039, "learning_rate": 1.0905617598782909e-06, "loss": 0.79149127, "num_input_tokens_seen": 118237205, "step": 5495, "time_per_iteration": 2.7635698318481445 }, { "auxiliary_loss_clip": 0.01280368, "auxiliary_loss_mlp": 0.01193441, "balance_loss_clip": 1.00725269, "balance_loss_mlp": 1.00022614, "epoch": 0.660854926952444, "flos": 17638038358560.0, "grad_norm": 2.285489792797689, "language_loss": 0.81001234, "learning_rate": 1.0898680495425775e-06, "loss": 0.83475041, "num_input_tokens_seen": 118255495, "step": 5496, "time_per_iteration": 3.774801254272461 }, { "auxiliary_loss_clip": 0.01315033, "auxiliary_loss_mlp": 0.01193492, "balance_loss_clip": 1.0080893, "balance_loss_mlp": 1.00027752, "epoch": 0.660975169843083, "flos": 16836009490080.0, "grad_norm": 1.5770659558330191, "language_loss": 0.80401748, "learning_rate": 1.0891744772731594e-06, "loss": 0.82910275, "num_input_tokens_seen": 118273310, "step": 5497, "time_per_iteration": 2.823228120803833 }, { "auxiliary_loss_clip": 0.01338089, "auxiliary_loss_mlp": 0.0119342, "balance_loss_clip": 1.00822699, "balance_loss_mlp": 1.00020552, "epoch": 0.6610954127337221, "flos": 26870423113440.0, "grad_norm": 1.6194665388828593, "language_loss": 0.65727258, "learning_rate": 1.088481043175248e-06, "loss": 0.68258762, "num_input_tokens_seen": 118293880, "step": 5498, "time_per_iteration": 2.755441188812256 }, { "auxiliary_loss_clip": 0.0132586, "auxiliary_loss_mlp": 0.01193497, "balance_loss_clip": 1.00825238, "balance_loss_mlp": 1.00028276, "epoch": 0.6612156556243612, "flos": 26465708540160.0, "grad_norm": 1.895184574865591, "language_loss": 0.75864589, "learning_rate": 1.0877877473540368e-06, "loss": 0.78383946, "num_input_tokens_seen": 118314465, "step": 5499, "time_per_iteration": 2.7750136852264404 }, { "auxiliary_loss_clip": 0.01351784, "auxiliary_loss_mlp": 0.0119355, "balance_loss_clip": 1.00855684, "balance_loss_mlp": 1.00024056, "epoch": 0.6613358985150003, "flos": 19791925423200.0, "grad_norm": 1.585769698362987, "language_loss": 0.72404355, "learning_rate": 1.0870945899147002e-06, "loss": 0.74949694, "num_input_tokens_seen": 118331110, "step": 5500, "time_per_iteration": 2.6711630821228027 }, { "auxiliary_loss_clip": 0.0132751, "auxiliary_loss_mlp": 0.0119334, "balance_loss_clip": 1.00763011, "balance_loss_mlp": 1.00022137, "epoch": 0.6614561414056394, "flos": 26831638748160.0, "grad_norm": 1.6941061132998878, "language_loss": 0.75906348, "learning_rate": 1.0864015709623879e-06, "loss": 0.78427196, "num_input_tokens_seen": 118351980, "step": 5501, "time_per_iteration": 2.780056953430176 }, { "auxiliary_loss_clip": 0.01338717, "auxiliary_loss_mlp": 0.01193619, "balance_loss_clip": 1.00832295, "balance_loss_mlp": 1.00021386, "epoch": 0.6615763842962785, "flos": 22894608600000.0, "grad_norm": 2.284307617141049, "language_loss": 0.80517352, "learning_rate": 1.0857086906022313e-06, "loss": 0.83049685, "num_input_tokens_seen": 118370315, "step": 5502, "time_per_iteration": 2.7523038387298584 }, { "auxiliary_loss_clip": 0.01247037, "auxiliary_loss_mlp": 0.01193695, "balance_loss_clip": 1.00740719, "balance_loss_mlp": 1.00019455, "epoch": 0.6616966271869176, "flos": 24790332792960.0, "grad_norm": 2.0607288852644845, "language_loss": 0.7268281, "learning_rate": 1.0850159489393388e-06, "loss": 0.75123543, "num_input_tokens_seen": 118389575, "step": 5503, "time_per_iteration": 2.8775973320007324 }, { "auxiliary_loss_clip": 0.01306608, "auxiliary_loss_mlp": 0.01193501, "balance_loss_clip": 1.00747466, "balance_loss_mlp": 1.00019097, "epoch": 0.6618168700775566, "flos": 17202119316480.0, "grad_norm": 1.6447849691501104, "language_loss": 0.81978506, "learning_rate": 1.0843233460787992e-06, "loss": 0.84478617, "num_input_tokens_seen": 118406790, "step": 5504, "time_per_iteration": 2.808666467666626 }, { "auxiliary_loss_clip": 0.01289152, "auxiliary_loss_mlp": 0.0119352, "balance_loss_clip": 1.00830472, "balance_loss_mlp": 1.00021005, "epoch": 0.6619371129681958, "flos": 25447103212320.0, "grad_norm": 1.99478468926653, "language_loss": 0.77740842, "learning_rate": 1.0836308821256805e-06, "loss": 0.80223513, "num_input_tokens_seen": 118427590, "step": 5505, "time_per_iteration": 2.788693904876709 }, { "auxiliary_loss_clip": 0.013281, "auxiliary_loss_mlp": 0.011937, "balance_loss_clip": 1.00744486, "balance_loss_mlp": 1.00029516, "epoch": 0.6620573558588349, "flos": 18040453816320.0, "grad_norm": 1.7864722545224083, "language_loss": 0.77738494, "learning_rate": 1.0829385571850282e-06, "loss": 0.80260289, "num_input_tokens_seen": 118444570, "step": 5506, "time_per_iteration": 2.7142767906188965 }, { "auxiliary_loss_clip": 0.01352257, "auxiliary_loss_mlp": 0.01194026, "balance_loss_clip": 1.00854921, "balance_loss_mlp": 1.00033438, "epoch": 0.6621775987494739, "flos": 17785595923200.0, "grad_norm": 2.320178637917602, "language_loss": 0.83602583, "learning_rate": 1.0822463713618679e-06, "loss": 0.86148864, "num_input_tokens_seen": 118461425, "step": 5507, "time_per_iteration": 2.6024904251098633 }, { "auxiliary_loss_clip": 0.01292451, "auxiliary_loss_mlp": 0.01193432, "balance_loss_clip": 1.00705886, "balance_loss_mlp": 1.00021791, "epoch": 0.6622978416401131, "flos": 17492600291040.0, "grad_norm": 1.8530167053795026, "language_loss": 0.84739864, "learning_rate": 1.0815543247612034e-06, "loss": 0.87225747, "num_input_tokens_seen": 118478495, "step": 5508, "time_per_iteration": 2.811999797821045 }, { "auxiliary_loss_clip": 0.01326037, "auxiliary_loss_mlp": 0.01193507, "balance_loss_clip": 1.00815368, "balance_loss_mlp": 1.00019705, "epoch": 0.6624180845307521, "flos": 21648362319360.0, "grad_norm": 1.482779193029163, "language_loss": 0.82894552, "learning_rate": 1.0808624174880168e-06, "loss": 0.85414094, "num_input_tokens_seen": 118499145, "step": 5509, "time_per_iteration": 2.8010430335998535 }, { "auxiliary_loss_clip": 0.01350079, "auxiliary_loss_mlp": 0.01193494, "balance_loss_clip": 1.0084672, "balance_loss_mlp": 1.00028002, "epoch": 0.6625383274213912, "flos": 23805913599360.0, "grad_norm": 1.5987050563300975, "language_loss": 0.80108058, "learning_rate": 1.080170649647272e-06, "loss": 0.82651627, "num_input_tokens_seen": 118518950, "step": 5510, "time_per_iteration": 2.785628318786621 }, { "auxiliary_loss_clip": 0.01350503, "auxiliary_loss_mlp": 0.01193587, "balance_loss_clip": 1.00873971, "balance_loss_mlp": 1.00027752, "epoch": 0.6626585703120303, "flos": 33262957641600.0, "grad_norm": 1.674452455431624, "language_loss": 0.67256773, "learning_rate": 1.0794790213439068e-06, "loss": 0.6980086, "num_input_tokens_seen": 118545850, "step": 5511, "time_per_iteration": 2.9334051609039307 }, { "auxiliary_loss_clip": 0.01281624, "auxiliary_loss_mlp": 0.01193678, "balance_loss_clip": 1.00742316, "balance_loss_mlp": 1.00027299, "epoch": 0.6627788132026694, "flos": 22085790156000.0, "grad_norm": 2.234933279737832, "language_loss": 0.78451586, "learning_rate": 1.078787532682843e-06, "loss": 0.80926889, "num_input_tokens_seen": 118563325, "step": 5512, "time_per_iteration": 3.014678955078125 }, { "auxiliary_loss_clip": 0.01326075, "auxiliary_loss_mlp": 0.01193514, "balance_loss_clip": 1.00747967, "balance_loss_mlp": 1.00020397, "epoch": 0.6628990560933085, "flos": 36173622947040.0, "grad_norm": 2.2161823605467172, "language_loss": 0.75739372, "learning_rate": 1.0780961837689773e-06, "loss": 0.78258955, "num_input_tokens_seen": 118582835, "step": 5513, "time_per_iteration": 2.916728973388672 }, { "auxiliary_loss_clip": 0.01315297, "auxiliary_loss_mlp": 0.01193455, "balance_loss_clip": 1.00854158, "balance_loss_mlp": 1.00024104, "epoch": 0.6630192989839476, "flos": 18513576581760.0, "grad_norm": 1.5090983219477359, "language_loss": 0.69771314, "learning_rate": 1.0774049747071883e-06, "loss": 0.72280061, "num_input_tokens_seen": 118600715, "step": 5514, "time_per_iteration": 2.7266712188720703 }, { "auxiliary_loss_clip": 0.01267754, "auxiliary_loss_mlp": 0.01193676, "balance_loss_clip": 1.00705755, "balance_loss_mlp": 1.00027132, "epoch": 0.6631395418745867, "flos": 35809524846720.0, "grad_norm": 1.6040168042859488, "language_loss": 0.68114501, "learning_rate": 1.076713905602332e-06, "loss": 0.70575929, "num_input_tokens_seen": 118621290, "step": 5515, "time_per_iteration": 2.9746968746185303 }, { "auxiliary_loss_clip": 0.01334105, "auxiliary_loss_mlp": 0.01193575, "balance_loss_clip": 1.00839615, "balance_loss_mlp": 1.00026476, "epoch": 0.6632597847652257, "flos": 20047753255680.0, "grad_norm": 1.6522116183526148, "language_loss": 0.81145072, "learning_rate": 1.07602297655924e-06, "loss": 0.8367275, "num_input_tokens_seen": 118639610, "step": 5516, "time_per_iteration": 2.713078498840332 }, { "auxiliary_loss_clip": 0.01351137, "auxiliary_loss_mlp": 0.01193491, "balance_loss_clip": 1.00888658, "balance_loss_mlp": 1.00027621, "epoch": 0.6633800276558649, "flos": 21214490927040.0, "grad_norm": 1.686578601205689, "language_loss": 0.80740154, "learning_rate": 1.0753321876827292e-06, "loss": 0.83284783, "num_input_tokens_seen": 118658895, "step": 5517, "time_per_iteration": 2.698927164077759 }, { "auxiliary_loss_clip": 0.01350397, "auxiliary_loss_mlp": 0.01193485, "balance_loss_clip": 1.00823104, "balance_loss_mlp": 1.00017512, "epoch": 0.663500270546504, "flos": 23987765069280.0, "grad_norm": 1.8579439705687861, "language_loss": 0.73914033, "learning_rate": 1.0746415390775893e-06, "loss": 0.76457918, "num_input_tokens_seen": 118677025, "step": 5518, "time_per_iteration": 3.6530933380126953 }, { "auxiliary_loss_clip": 0.01350462, "auxiliary_loss_mlp": 0.01193608, "balance_loss_clip": 1.0088203, "balance_loss_mlp": 1.00029826, "epoch": 0.663620513437143, "flos": 17932399090560.0, "grad_norm": 1.817743907272832, "language_loss": 0.76427042, "learning_rate": 1.0739510308485939e-06, "loss": 0.78971112, "num_input_tokens_seen": 118694240, "step": 5519, "time_per_iteration": 3.572232723236084 }, { "auxiliary_loss_clip": 0.01268394, "auxiliary_loss_mlp": 0.01193097, "balance_loss_clip": 1.00600553, "balance_loss_mlp": 1.00007343, "epoch": 0.6637407563277821, "flos": 57840270419520.0, "grad_norm": 0.8042433116656401, "language_loss": 0.6255675, "learning_rate": 1.07326066310049e-06, "loss": 0.65018237, "num_input_tokens_seen": 118758365, "step": 5520, "time_per_iteration": 4.399585485458374 }, { "auxiliary_loss_clip": 0.01304991, "auxiliary_loss_mlp": 0.01193544, "balance_loss_clip": 1.00790727, "balance_loss_mlp": 1.00023448, "epoch": 0.6638609992184212, "flos": 27306018842400.0, "grad_norm": 1.8211014321136323, "language_loss": 0.78786284, "learning_rate": 1.0725704359380059e-06, "loss": 0.81284815, "num_input_tokens_seen": 118778220, "step": 5521, "time_per_iteration": 2.820204496383667 }, { "auxiliary_loss_clip": 0.01350792, "auxiliary_loss_mlp": 0.01193283, "balance_loss_clip": 1.00815141, "balance_loss_mlp": 1.00025964, "epoch": 0.6639812421090603, "flos": 18624864438720.0, "grad_norm": 1.7587710231484235, "language_loss": 0.72382963, "learning_rate": 1.0718803494658497e-06, "loss": 0.74927032, "num_input_tokens_seen": 118797110, "step": 5522, "time_per_iteration": 3.566169261932373 }, { "auxiliary_loss_clip": 0.01215074, "auxiliary_loss_mlp": 0.011932, "balance_loss_clip": 1.00793791, "balance_loss_mlp": 1.00017667, "epoch": 0.6641014849996993, "flos": 15924488948640.0, "grad_norm": 2.0925030044324515, "language_loss": 0.83654225, "learning_rate": 1.071190403788707e-06, "loss": 0.86062503, "num_input_tokens_seen": 118812415, "step": 5523, "time_per_iteration": 3.173691987991333 }, { "auxiliary_loss_clip": 0.01296819, "auxiliary_loss_mlp": 0.01193516, "balance_loss_clip": 1.00779414, "balance_loss_mlp": 1.00020587, "epoch": 0.6642217278903385, "flos": 26505498768480.0, "grad_norm": 1.7559614917013808, "language_loss": 0.75898319, "learning_rate": 1.0705005990112415e-06, "loss": 0.78388655, "num_input_tokens_seen": 118832195, "step": 5524, "time_per_iteration": 3.243011713027954 }, { "auxiliary_loss_clip": 0.01267682, "auxiliary_loss_mlp": 0.01193633, "balance_loss_clip": 1.0071404, "balance_loss_mlp": 1.00022745, "epoch": 0.6643419707809776, "flos": 15377317973280.0, "grad_norm": 2.45925126660072, "language_loss": 0.74415034, "learning_rate": 1.0698109352380957e-06, "loss": 0.76876354, "num_input_tokens_seen": 118849795, "step": 5525, "time_per_iteration": 2.7738378047943115 }, { "auxiliary_loss_clip": 0.0135077, "auxiliary_loss_mlp": 0.01193486, "balance_loss_clip": 1.00852823, "balance_loss_mlp": 1.00017667, "epoch": 0.6644622136716166, "flos": 25117622330400.0, "grad_norm": 1.7691826399474933, "language_loss": 0.77726215, "learning_rate": 1.0691214125738909e-06, "loss": 0.80270469, "num_input_tokens_seen": 118870000, "step": 5526, "time_per_iteration": 2.7697384357452393 }, { "auxiliary_loss_clip": 0.01321533, "auxiliary_loss_mlp": 0.01193096, "balance_loss_clip": 1.00612855, "balance_loss_mlp": 1.00007212, "epoch": 0.6645824565622558, "flos": 66201751105920.0, "grad_norm": 0.8246619953948124, "language_loss": 0.57498503, "learning_rate": 1.0684320311232287e-06, "loss": 0.60013139, "num_input_tokens_seen": 118932905, "step": 5527, "time_per_iteration": 3.339529275894165 }, { "auxiliary_loss_clip": 0.01315591, "auxiliary_loss_mlp": 0.01193627, "balance_loss_clip": 1.00779355, "balance_loss_mlp": 1.00022221, "epoch": 0.6647026994528948, "flos": 25082142943680.0, "grad_norm": 1.561738303144462, "language_loss": 0.80994284, "learning_rate": 1.0677427909906865e-06, "loss": 0.83503509, "num_input_tokens_seen": 118953355, "step": 5528, "time_per_iteration": 2.7680580615997314 }, { "auxiliary_loss_clip": 0.01352212, "auxiliary_loss_mlp": 0.01193501, "balance_loss_clip": 1.00852942, "balance_loss_mlp": 1.00028706, "epoch": 0.6648229423435339, "flos": 18222197515200.0, "grad_norm": 1.7400246340452235, "language_loss": 0.71968579, "learning_rate": 1.0670536922808216e-06, "loss": 0.745143, "num_input_tokens_seen": 118973480, "step": 5529, "time_per_iteration": 2.7929396629333496 }, { "auxiliary_loss_clip": 0.01313777, "auxiliary_loss_mlp": 0.01193466, "balance_loss_clip": 1.00767326, "balance_loss_mlp": 1.00025165, "epoch": 0.6649431852341731, "flos": 18296892351360.0, "grad_norm": 2.3350586896259005, "language_loss": 0.71990114, "learning_rate": 1.06636473509817e-06, "loss": 0.74497354, "num_input_tokens_seen": 118989860, "step": 5530, "time_per_iteration": 2.7217652797698975 }, { "auxiliary_loss_clip": 0.01314452, "auxiliary_loss_mlp": 0.00872415, "balance_loss_clip": 1.00789905, "balance_loss_mlp": 1.00040317, "epoch": 0.6650634281248121, "flos": 17019585296640.0, "grad_norm": 2.1727277519521766, "language_loss": 0.80751944, "learning_rate": 1.0656759195472447e-06, "loss": 0.82938802, "num_input_tokens_seen": 119007150, "step": 5531, "time_per_iteration": 3.2345938682556152 }, { "auxiliary_loss_clip": 0.01283202, "auxiliary_loss_mlp": 0.01193097, "balance_loss_clip": 1.00492883, "balance_loss_mlp": 1.00007343, "epoch": 0.6651836710154512, "flos": 69294842660160.0, "grad_norm": 0.7698014798146089, "language_loss": 0.59782541, "learning_rate": 1.0649872457325414e-06, "loss": 0.6225884, "num_input_tokens_seen": 119068435, "step": 5532, "time_per_iteration": 3.328575372695923 }, { "auxiliary_loss_clip": 0.01309327, "auxiliary_loss_mlp": 0.01193068, "balance_loss_clip": 1.00619555, "balance_loss_mlp": 1.0000447, "epoch": 0.6653039139060903, "flos": 66883475273760.0, "grad_norm": 0.8510852303393843, "language_loss": 0.55188322, "learning_rate": 1.0642987137585278e-06, "loss": 0.57690716, "num_input_tokens_seen": 119127960, "step": 5533, "time_per_iteration": 3.215538501739502 }, { "auxiliary_loss_clip": 0.01315516, "auxiliary_loss_mlp": 0.01193673, "balance_loss_clip": 1.00814986, "balance_loss_mlp": 1.00026846, "epoch": 0.6654241567967294, "flos": 21470067293760.0, "grad_norm": 1.6198559777723518, "language_loss": 0.82140309, "learning_rate": 1.0636103237296561e-06, "loss": 0.84649491, "num_input_tokens_seen": 119146885, "step": 5534, "time_per_iteration": 2.8119006156921387 }, { "auxiliary_loss_clip": 0.01326669, "auxiliary_loss_mlp": 0.01193234, "balance_loss_clip": 1.00793576, "balance_loss_mlp": 1.00021005, "epoch": 0.6655443996873684, "flos": 25119526285440.0, "grad_norm": 1.8740318885564469, "language_loss": 0.84172428, "learning_rate": 1.062922075750353e-06, "loss": 0.86692333, "num_input_tokens_seen": 119166900, "step": 5535, "time_per_iteration": 2.7648069858551025 }, { "auxiliary_loss_clip": 0.01292301, "auxiliary_loss_mlp": 0.01193447, "balance_loss_clip": 1.00700665, "balance_loss_mlp": 1.00023222, "epoch": 0.6656646425780076, "flos": 17457336446400.0, "grad_norm": 2.2431307479263944, "language_loss": 0.72026378, "learning_rate": 1.0622339699250267e-06, "loss": 0.74512124, "num_input_tokens_seen": 119184820, "step": 5536, "time_per_iteration": 2.7982888221740723 }, { "auxiliary_loss_clip": 0.01295117, "auxiliary_loss_mlp": 0.01193257, "balance_loss_clip": 1.00741696, "balance_loss_mlp": 1.00023329, "epoch": 0.6657848854686467, "flos": 23434199678880.0, "grad_norm": 1.5741463461610554, "language_loss": 0.79216766, "learning_rate": 1.0615460063580624e-06, "loss": 0.81705147, "num_input_tokens_seen": 119203295, "step": 5537, "time_per_iteration": 2.8132736682891846 }, { "auxiliary_loss_clip": 0.01317317, "auxiliary_loss_mlp": 0.01193624, "balance_loss_clip": 1.00788057, "balance_loss_mlp": 1.00021851, "epoch": 0.6659051283592857, "flos": 11509917422400.0, "grad_norm": 1.988102669350013, "language_loss": 0.73157561, "learning_rate": 1.060858185153821e-06, "loss": 0.75668502, "num_input_tokens_seen": 119221395, "step": 5538, "time_per_iteration": 2.7931950092315674 }, { "auxiliary_loss_clip": 0.01316285, "auxiliary_loss_mlp": 0.01193912, "balance_loss_clip": 1.00828397, "balance_loss_mlp": 1.00022089, "epoch": 0.6660253712499249, "flos": 20594564994240.0, "grad_norm": 2.4296029183970798, "language_loss": 0.76213491, "learning_rate": 1.0601705064166474e-06, "loss": 0.78723693, "num_input_tokens_seen": 119239790, "step": 5539, "time_per_iteration": 2.760730266571045 }, { "auxiliary_loss_clip": 0.01305476, "auxiliary_loss_mlp": 0.01193564, "balance_loss_clip": 1.00800693, "balance_loss_mlp": 1.00025415, "epoch": 0.666145614140564, "flos": 21251515032000.0, "grad_norm": 2.0045271820332053, "language_loss": 0.72978687, "learning_rate": 1.0594829702508596e-06, "loss": 0.75477731, "num_input_tokens_seen": 119257505, "step": 5540, "time_per_iteration": 3.05580997467041 }, { "auxiliary_loss_clip": 0.0130272, "auxiliary_loss_mlp": 0.01193393, "balance_loss_clip": 1.00777876, "balance_loss_mlp": 1.00017834, "epoch": 0.666265857031203, "flos": 33726201395040.0, "grad_norm": 1.6148295907646444, "language_loss": 0.54904521, "learning_rate": 1.0587955767607592e-06, "loss": 0.57400632, "num_input_tokens_seen": 119279365, "step": 5541, "time_per_iteration": 2.9668831825256348 }, { "auxiliary_loss_clip": 0.01349851, "auxiliary_loss_mlp": 0.01193532, "balance_loss_clip": 1.00795174, "balance_loss_mlp": 1.00022244, "epoch": 0.6663860999218422, "flos": 17456653896480.0, "grad_norm": 2.0044947295368685, "language_loss": 0.77086592, "learning_rate": 1.0581083260506206e-06, "loss": 0.79629982, "num_input_tokens_seen": 119296150, "step": 5542, "time_per_iteration": 2.666034460067749 }, { "auxiliary_loss_clip": 0.0131387, "auxiliary_loss_mlp": 0.01193405, "balance_loss_clip": 1.00773776, "balance_loss_mlp": 1.00019085, "epoch": 0.6665063428124812, "flos": 17676750876480.0, "grad_norm": 2.1309035158062253, "language_loss": 0.76626188, "learning_rate": 1.0574212182246993e-06, "loss": 0.79133463, "num_input_tokens_seen": 119314845, "step": 5543, "time_per_iteration": 2.7913894653320312 }, { "auxiliary_loss_clip": 0.01317709, "auxiliary_loss_mlp": 0.01193825, "balance_loss_clip": 1.00787628, "balance_loss_mlp": 1.00022912, "epoch": 0.6666265857031203, "flos": 27673278226560.0, "grad_norm": 2.2244704536846625, "language_loss": 0.76195848, "learning_rate": 1.0567342533872303e-06, "loss": 0.78707385, "num_input_tokens_seen": 119334875, "step": 5544, "time_per_iteration": 3.8208248615264893 }, { "auxiliary_loss_clip": 0.01309149, "auxiliary_loss_mlp": 0.01193629, "balance_loss_clip": 1.00757313, "balance_loss_mlp": 1.00022376, "epoch": 0.6667468285937594, "flos": 25046843175360.0, "grad_norm": 1.6316860713376793, "language_loss": 0.80824018, "learning_rate": 1.0560474316424255e-06, "loss": 0.83326793, "num_input_tokens_seen": 119354635, "step": 5545, "time_per_iteration": 3.7061777114868164 }, { "auxiliary_loss_clip": 0.01321329, "auxiliary_loss_mlp": 0.01193833, "balance_loss_clip": 1.0079335, "balance_loss_mlp": 1.00023758, "epoch": 0.6668670714843985, "flos": 22780482772320.0, "grad_norm": 4.306262656557802, "language_loss": 0.73610699, "learning_rate": 1.0553607530944746e-06, "loss": 0.7612586, "num_input_tokens_seen": 119372690, "step": 5546, "time_per_iteration": 3.7193448543548584 }, { "auxiliary_loss_clip": 0.0130509, "auxiliary_loss_mlp": 0.01193273, "balance_loss_clip": 1.00793874, "balance_loss_mlp": 1.00015354, "epoch": 0.6669873143750376, "flos": 22163897741760.0, "grad_norm": 1.9227944891567483, "language_loss": 0.8943603, "learning_rate": 1.0546742178475463e-06, "loss": 0.91934395, "num_input_tokens_seen": 119391685, "step": 5547, "time_per_iteration": 2.8311736583709717 }, { "auxiliary_loss_clip": 0.01267198, "auxiliary_loss_mlp": 0.01193249, "balance_loss_clip": 1.0076077, "balance_loss_mlp": 1.00022495, "epoch": 0.6671075572656767, "flos": 20514840842880.0, "grad_norm": 1.801966470928852, "language_loss": 0.86658585, "learning_rate": 1.0539878260057868e-06, "loss": 0.89119035, "num_input_tokens_seen": 119410725, "step": 5548, "time_per_iteration": 3.8429698944091797 }, { "auxiliary_loss_clip": 0.01328756, "auxiliary_loss_mlp": 0.0119365, "balance_loss_clip": 1.0088098, "balance_loss_mlp": 1.00024509, "epoch": 0.6672278001563158, "flos": 17931213609120.0, "grad_norm": 2.371502522113864, "language_loss": 0.68374085, "learning_rate": 1.0533015776733226e-06, "loss": 0.70896494, "num_input_tokens_seen": 119426875, "step": 5549, "time_per_iteration": 2.7027270793914795 }, { "auxiliary_loss_clip": 0.0130343, "auxiliary_loss_mlp": 0.01193498, "balance_loss_clip": 1.00777292, "balance_loss_mlp": 1.00018871, "epoch": 0.6673480430469548, "flos": 22342156843680.0, "grad_norm": 2.1032187043154904, "language_loss": 0.7858839, "learning_rate": 1.0526154729542566e-06, "loss": 0.81085312, "num_input_tokens_seen": 119446935, "step": 5550, "time_per_iteration": 2.8434388637542725 }, { "auxiliary_loss_clip": 0.01291315, "auxiliary_loss_mlp": 0.0119367, "balance_loss_clip": 1.0079459, "balance_loss_mlp": 1.00026488, "epoch": 0.6674682859375939, "flos": 20703841125120.0, "grad_norm": 2.142784413648608, "language_loss": 0.80279821, "learning_rate": 1.0519295119526699e-06, "loss": 0.82764804, "num_input_tokens_seen": 119463240, "step": 5551, "time_per_iteration": 2.781102418899536 }, { "auxiliary_loss_clip": 0.01317873, "auxiliary_loss_mlp": 0.01193537, "balance_loss_clip": 1.00793004, "balance_loss_mlp": 1.00022757, "epoch": 0.667588528828233, "flos": 26206683500160.0, "grad_norm": 1.563904812893927, "language_loss": 0.83041024, "learning_rate": 1.0512436947726227e-06, "loss": 0.8555243, "num_input_tokens_seen": 119484655, "step": 5552, "time_per_iteration": 2.854081869125366 }, { "auxiliary_loss_clip": 0.01294926, "auxiliary_loss_mlp": 0.01193676, "balance_loss_clip": 1.00721204, "balance_loss_mlp": 1.00027049, "epoch": 0.6677087717188721, "flos": 23071035594240.0, "grad_norm": 2.186016721565091, "language_loss": 0.65130532, "learning_rate": 1.0505580215181517e-06, "loss": 0.67619133, "num_input_tokens_seen": 119502895, "step": 5553, "time_per_iteration": 2.7676689624786377 }, { "auxiliary_loss_clip": 0.01253392, "auxiliary_loss_mlp": 0.01192268, "balance_loss_clip": 1.00637472, "balance_loss_mlp": 1.00000691, "epoch": 0.6678290146095112, "flos": 70941348977760.0, "grad_norm": 0.7849216077720562, "language_loss": 0.56693977, "learning_rate": 1.0498724922932753e-06, "loss": 0.59139633, "num_input_tokens_seen": 119561010, "step": 5554, "time_per_iteration": 3.344888925552368 }, { "auxiliary_loss_clip": 0.01352649, "auxiliary_loss_mlp": 0.01193785, "balance_loss_clip": 1.00916362, "balance_loss_mlp": 1.00028419, "epoch": 0.6679492575001503, "flos": 18661098222720.0, "grad_norm": 1.9247718580757662, "language_loss": 0.86726826, "learning_rate": 1.0491871072019851e-06, "loss": 0.89273262, "num_input_tokens_seen": 119578900, "step": 5555, "time_per_iteration": 2.667809009552002 }, { "auxiliary_loss_clip": 0.01308062, "auxiliary_loss_mlp": 0.01193435, "balance_loss_clip": 1.00773895, "balance_loss_mlp": 1.00022078, "epoch": 0.6680695003907894, "flos": 29711997676800.0, "grad_norm": 1.683427641391412, "language_loss": 0.63762665, "learning_rate": 1.0485018663482555e-06, "loss": 0.66264164, "num_input_tokens_seen": 119598920, "step": 5556, "time_per_iteration": 2.930696487426758 }, { "auxiliary_loss_clip": 0.01339326, "auxiliary_loss_mlp": 0.01193498, "balance_loss_clip": 1.00907946, "balance_loss_mlp": 1.00018859, "epoch": 0.6681897432814284, "flos": 28218976331040.0, "grad_norm": 2.5551233395289707, "language_loss": 0.70132816, "learning_rate": 1.0478167698360354e-06, "loss": 0.72665632, "num_input_tokens_seen": 119618220, "step": 5557, "time_per_iteration": 2.798129081726074 }, { "auxiliary_loss_clip": 0.0133754, "auxiliary_loss_mlp": 0.01193356, "balance_loss_clip": 1.00831318, "balance_loss_mlp": 1.00023699, "epoch": 0.6683099861720676, "flos": 25046555785920.0, "grad_norm": 1.7920093865524989, "language_loss": 0.69918394, "learning_rate": 1.0471318177692556e-06, "loss": 0.72449297, "num_input_tokens_seen": 119638520, "step": 5558, "time_per_iteration": 2.7916102409362793 }, { "auxiliary_loss_clip": 0.01278276, "auxiliary_loss_mlp": 0.01193604, "balance_loss_clip": 1.00697029, "balance_loss_mlp": 1.0001986, "epoch": 0.6684302290627067, "flos": 22996987384320.0, "grad_norm": 2.549783087309111, "language_loss": 0.75377327, "learning_rate": 1.046447010251821e-06, "loss": 0.77849209, "num_input_tokens_seen": 119655850, "step": 5559, "time_per_iteration": 2.919196367263794 }, { "auxiliary_loss_clip": 0.01304776, "auxiliary_loss_mlp": 0.01193532, "balance_loss_clip": 1.00738025, "balance_loss_mlp": 1.00022173, "epoch": 0.6685504719533457, "flos": 26573835113280.0, "grad_norm": 1.7215659099764367, "language_loss": 0.75826108, "learning_rate": 1.0457623473876157e-06, "loss": 0.78324419, "num_input_tokens_seen": 119675355, "step": 5560, "time_per_iteration": 2.799957036972046 }, { "auxiliary_loss_clip": 0.01350764, "auxiliary_loss_mlp": 0.01193312, "balance_loss_clip": 1.0084976, "balance_loss_mlp": 1.00019324, "epoch": 0.6686707148439849, "flos": 28986100591680.0, "grad_norm": 1.7534096867127549, "language_loss": 0.71298981, "learning_rate": 1.0450778292805046e-06, "loss": 0.73843062, "num_input_tokens_seen": 119695340, "step": 5561, "time_per_iteration": 2.726243734359741 }, { "auxiliary_loss_clip": 0.01338585, "auxiliary_loss_mlp": 0.01193288, "balance_loss_clip": 1.00814116, "balance_loss_mlp": 1.00016904, "epoch": 0.6687909577346239, "flos": 23623164037440.0, "grad_norm": 1.6627986478753833, "language_loss": 0.78772432, "learning_rate": 1.0443934560343267e-06, "loss": 0.81304306, "num_input_tokens_seen": 119716750, "step": 5562, "time_per_iteration": 2.8223087787628174 }, { "auxiliary_loss_clip": 0.01287313, "auxiliary_loss_mlp": 0.01193597, "balance_loss_clip": 1.00686502, "balance_loss_mlp": 1.00028706, "epoch": 0.668911200625263, "flos": 23148604324800.0, "grad_norm": 1.7628845612693373, "language_loss": 0.777744, "learning_rate": 1.0437092277529034e-06, "loss": 0.80255306, "num_input_tokens_seen": 119736005, "step": 5563, "time_per_iteration": 2.863330125808716 }, { "auxiliary_loss_clip": 0.01316957, "auxiliary_loss_mlp": 0.01193365, "balance_loss_clip": 1.00814331, "balance_loss_mlp": 1.00024581, "epoch": 0.6690314435159022, "flos": 18551929862880.0, "grad_norm": 2.0059758242453976, "language_loss": 0.73611295, "learning_rate": 1.0430251445400292e-06, "loss": 0.76121616, "num_input_tokens_seen": 119754050, "step": 5564, "time_per_iteration": 2.7300031185150146 }, { "auxiliary_loss_clip": 0.01210745, "auxiliary_loss_mlp": 0.01193556, "balance_loss_clip": 1.00629663, "balance_loss_mlp": 1.00024652, "epoch": 0.6691516864065412, "flos": 31759554352320.0, "grad_norm": 1.8552073914395841, "language_loss": 0.62373263, "learning_rate": 1.0423412064994787e-06, "loss": 0.64777565, "num_input_tokens_seen": 119774820, "step": 5565, "time_per_iteration": 3.309324026107788 }, { "auxiliary_loss_clip": 0.01301454, "auxiliary_loss_mlp": 0.01193316, "balance_loss_clip": 1.00738454, "balance_loss_mlp": 1.0001967, "epoch": 0.6692719292971803, "flos": 34933878852480.0, "grad_norm": 1.746199871266537, "language_loss": 0.73805511, "learning_rate": 1.0416574137350064e-06, "loss": 0.76300287, "num_input_tokens_seen": 119795525, "step": 5566, "time_per_iteration": 3.211188793182373 }, { "auxiliary_loss_clip": 0.01339484, "auxiliary_loss_mlp": 0.01193446, "balance_loss_clip": 1.0086906, "balance_loss_mlp": 1.0002315, "epoch": 0.6693921721878194, "flos": 20449198774080.0, "grad_norm": 2.2937452661657938, "language_loss": 0.80888617, "learning_rate": 1.0409737663503428e-06, "loss": 0.83421546, "num_input_tokens_seen": 119813905, "step": 5567, "time_per_iteration": 2.7831292152404785 }, { "auxiliary_loss_clip": 0.01338736, "auxiliary_loss_mlp": 0.01193514, "balance_loss_clip": 1.00804782, "balance_loss_mlp": 1.00020385, "epoch": 0.6695124150784585, "flos": 16614547410240.0, "grad_norm": 1.6316047721465716, "language_loss": 0.82789838, "learning_rate": 1.040290264449196e-06, "loss": 0.85322088, "num_input_tokens_seen": 119832010, "step": 5568, "time_per_iteration": 2.778791904449463 }, { "auxiliary_loss_clip": 0.01326986, "auxiliary_loss_mlp": 0.01193312, "balance_loss_clip": 1.00769603, "balance_loss_mlp": 1.00019312, "epoch": 0.6696326579690975, "flos": 26652158241120.0, "grad_norm": 1.8817626342293827, "language_loss": 0.64107722, "learning_rate": 1.0396069081352532e-06, "loss": 0.66628021, "num_input_tokens_seen": 119851165, "step": 5569, "time_per_iteration": 2.813534736633301 }, { "auxiliary_loss_clip": 0.01320813, "auxiliary_loss_mlp": 0.01193047, "balance_loss_clip": 1.00577533, "balance_loss_mlp": 1.00002313, "epoch": 0.6697529008597367, "flos": 66964636372320.0, "grad_norm": 0.7792018079232157, "language_loss": 0.56091714, "learning_rate": 1.0389236975121782e-06, "loss": 0.58605576, "num_input_tokens_seen": 119906015, "step": 5570, "time_per_iteration": 5.746454954147339 }, { "auxiliary_loss_clip": 0.01351145, "auxiliary_loss_mlp": 0.01193617, "balance_loss_clip": 1.00844264, "balance_loss_mlp": 1.00021219, "epoch": 0.6698731437503758, "flos": 20886949923840.0, "grad_norm": 2.187906412740428, "language_loss": 0.71490788, "learning_rate": 1.0382406326836147e-06, "loss": 0.74035549, "num_input_tokens_seen": 119925160, "step": 5571, "time_per_iteration": 3.6728579998016357 }, { "auxiliary_loss_clip": 0.01334064, "auxiliary_loss_mlp": 0.0119368, "balance_loss_clip": 1.0081749, "balance_loss_mlp": 1.00017953, "epoch": 0.6699933866410148, "flos": 20409480393120.0, "grad_norm": 6.333937049351298, "language_loss": 0.76089233, "learning_rate": 1.0375577137531828e-06, "loss": 0.78616977, "num_input_tokens_seen": 119943720, "step": 5572, "time_per_iteration": 2.7843708992004395 }, { "auxiliary_loss_clip": 0.01312916, "auxiliary_loss_mlp": 0.0119358, "balance_loss_clip": 1.00741315, "balance_loss_mlp": 1.000175, "epoch": 0.670113629531654, "flos": 29023088772960.0, "grad_norm": 1.6190585410740121, "language_loss": 0.71798187, "learning_rate": 1.0368749408244802e-06, "loss": 0.74304682, "num_input_tokens_seen": 119966640, "step": 5573, "time_per_iteration": 2.8249948024749756 }, { "auxiliary_loss_clip": 0.01338687, "auxiliary_loss_mlp": 0.01193543, "balance_loss_clip": 1.0087986, "balance_loss_mlp": 1.00023317, "epoch": 0.670233872422293, "flos": 19791709881120.0, "grad_norm": 1.680400028212876, "language_loss": 0.78755802, "learning_rate": 1.0361923140010836e-06, "loss": 0.81288034, "num_input_tokens_seen": 119985125, "step": 5574, "time_per_iteration": 3.6010830402374268 }, { "auxiliary_loss_clip": 0.01339457, "auxiliary_loss_mlp": 0.01193756, "balance_loss_clip": 1.00858474, "balance_loss_mlp": 1.00025582, "epoch": 0.6703541153129321, "flos": 24243700672800.0, "grad_norm": 2.034592999447226, "language_loss": 0.63143361, "learning_rate": 1.0355098333865455e-06, "loss": 0.6567657, "num_input_tokens_seen": 120004355, "step": 5575, "time_per_iteration": 2.774942636489868 }, { "auxiliary_loss_clip": 0.0132699, "auxiliary_loss_mlp": 0.01193373, "balance_loss_clip": 1.00896168, "balance_loss_mlp": 1.00015903, "epoch": 0.6704743582035713, "flos": 26688535719840.0, "grad_norm": 1.5594870094738953, "language_loss": 0.69208288, "learning_rate": 1.0348274990844006e-06, "loss": 0.71728653, "num_input_tokens_seen": 120027115, "step": 5576, "time_per_iteration": 2.7461774349212646 }, { "auxiliary_loss_clip": 0.01325944, "auxiliary_loss_mlp": 0.01193377, "balance_loss_clip": 1.00738657, "balance_loss_mlp": 1.00016284, "epoch": 0.6705946010942103, "flos": 23514390838080.0, "grad_norm": 1.6480935596283126, "language_loss": 0.72688645, "learning_rate": 1.034145311198155e-06, "loss": 0.75207967, "num_input_tokens_seen": 120047130, "step": 5577, "time_per_iteration": 2.8291175365448 }, { "auxiliary_loss_clip": 0.01349552, "auxiliary_loss_mlp": 0.01193381, "balance_loss_clip": 1.00810504, "balance_loss_mlp": 1.00016689, "epoch": 0.6707148439848494, "flos": 24061022958240.0, "grad_norm": 1.6272087685668095, "language_loss": 0.64068985, "learning_rate": 1.0334632698312989e-06, "loss": 0.66611916, "num_input_tokens_seen": 120067925, "step": 5578, "time_per_iteration": 2.6811509132385254 }, { "auxiliary_loss_clip": 0.013154, "auxiliary_loss_mlp": 0.01193539, "balance_loss_clip": 1.0081166, "balance_loss_mlp": 1.0002296, "epoch": 0.6708350868754885, "flos": 22528678392000.0, "grad_norm": 4.225716055341261, "language_loss": 0.75314361, "learning_rate": 1.032781375087295e-06, "loss": 0.77823299, "num_input_tokens_seen": 120087825, "step": 5579, "time_per_iteration": 2.7436883449554443 }, { "auxiliary_loss_clip": 0.01311475, "auxiliary_loss_mlp": 0.01193511, "balance_loss_clip": 1.00755715, "balance_loss_mlp": 1.00020075, "epoch": 0.6709553297661276, "flos": 25227760629600.0, "grad_norm": 3.2105023987551915, "language_loss": 0.67245746, "learning_rate": 1.0320996270695891e-06, "loss": 0.69750732, "num_input_tokens_seen": 120108895, "step": 5580, "time_per_iteration": 2.850351333618164 }, { "auxiliary_loss_clip": 0.01309505, "auxiliary_loss_mlp": 0.01193513, "balance_loss_clip": 1.00805962, "balance_loss_mlp": 1.00020349, "epoch": 0.6710755726567667, "flos": 20448767689920.0, "grad_norm": 1.7232214115033475, "language_loss": 0.7326194, "learning_rate": 1.0314180258815998e-06, "loss": 0.7576496, "num_input_tokens_seen": 120127535, "step": 5581, "time_per_iteration": 2.802018642425537 }, { "auxiliary_loss_clip": 0.01301284, "auxiliary_loss_mlp": 0.01193321, "balance_loss_clip": 1.0078938, "balance_loss_mlp": 1.0002017, "epoch": 0.6711958155474057, "flos": 25995423745440.0, "grad_norm": 1.4802531356841615, "language_loss": 0.74381649, "learning_rate": 1.0307365716267247e-06, "loss": 0.76876247, "num_input_tokens_seen": 120147980, "step": 5582, "time_per_iteration": 2.857647180557251 }, { "auxiliary_loss_clip": 0.01326418, "auxiliary_loss_mlp": 0.01193481, "balance_loss_clip": 1.00778556, "balance_loss_mlp": 1.00017142, "epoch": 0.6713160584380449, "flos": 19937722727520.0, "grad_norm": 1.8831951274326615, "language_loss": 0.78163183, "learning_rate": 1.0300552644083423e-06, "loss": 0.80683082, "num_input_tokens_seen": 120166905, "step": 5583, "time_per_iteration": 2.7048776149749756 }, { "auxiliary_loss_clip": 0.01292914, "auxiliary_loss_mlp": 0.01193462, "balance_loss_clip": 1.00772667, "balance_loss_mlp": 1.00015259, "epoch": 0.6714363013286839, "flos": 18223382996640.0, "grad_norm": 2.282086364137198, "language_loss": 0.72015172, "learning_rate": 1.0293741043298036e-06, "loss": 0.7450155, "num_input_tokens_seen": 120185255, "step": 5584, "time_per_iteration": 2.9295036792755127 }, { "auxiliary_loss_clip": 0.01291561, "auxiliary_loss_mlp": 0.01193477, "balance_loss_clip": 1.00831962, "balance_loss_mlp": 1.00026298, "epoch": 0.671556544219323, "flos": 25812386794080.0, "grad_norm": 2.789378452707024, "language_loss": 0.71158004, "learning_rate": 1.0286930914944436e-06, "loss": 0.73643041, "num_input_tokens_seen": 120205070, "step": 5585, "time_per_iteration": 2.892164707183838 }, { "auxiliary_loss_clip": 0.01351344, "auxiliary_loss_mlp": 0.01193476, "balance_loss_clip": 1.00797343, "balance_loss_mlp": 1.00026166, "epoch": 0.6716767871099621, "flos": 15850440738720.0, "grad_norm": 2.5486942680535405, "language_loss": 0.76652074, "learning_rate": 1.0280122260055684e-06, "loss": 0.79196894, "num_input_tokens_seen": 120220780, "step": 5586, "time_per_iteration": 2.704545259475708 }, { "auxiliary_loss_clip": 0.01352015, "auxiliary_loss_mlp": 0.01193426, "balance_loss_clip": 1.00871837, "balance_loss_mlp": 1.0002116, "epoch": 0.6717970300006012, "flos": 19756122723360.0, "grad_norm": 4.114589944614067, "language_loss": 0.82637662, "learning_rate": 1.0273315079664652e-06, "loss": 0.85183102, "num_input_tokens_seen": 120238735, "step": 5587, "time_per_iteration": 2.64150071144104 }, { "auxiliary_loss_clip": 0.0133337, "auxiliary_loss_mlp": 0.01193401, "balance_loss_clip": 1.00783992, "balance_loss_mlp": 1.00018644, "epoch": 0.6719172728912403, "flos": 25485061332960.0, "grad_norm": 2.2009500072892654, "language_loss": 0.74026859, "learning_rate": 1.0266509374803992e-06, "loss": 0.76553631, "num_input_tokens_seen": 120259895, "step": 5588, "time_per_iteration": 2.755511999130249 }, { "auxiliary_loss_clip": 0.01350943, "auxiliary_loss_mlp": 0.00872435, "balance_loss_clip": 1.00847363, "balance_loss_mlp": 1.00045943, "epoch": 0.6720375157818794, "flos": 15880351955040.0, "grad_norm": 2.4287783283121382, "language_loss": 0.84241557, "learning_rate": 1.0259705146506123e-06, "loss": 0.86464942, "num_input_tokens_seen": 120274790, "step": 5589, "time_per_iteration": 2.6077284812927246 }, { "auxiliary_loss_clip": 0.01334332, "auxiliary_loss_mlp": 0.01193377, "balance_loss_clip": 1.00824749, "balance_loss_mlp": 1.00025761, "epoch": 0.6721577586725185, "flos": 32010855801120.0, "grad_norm": 2.002586340799095, "language_loss": 0.77736306, "learning_rate": 1.025290239580324e-06, "loss": 0.80264008, "num_input_tokens_seen": 120295460, "step": 5590, "time_per_iteration": 2.7913899421691895 }, { "auxiliary_loss_clip": 0.01287499, "auxiliary_loss_mlp": 0.01193453, "balance_loss_clip": 1.00741529, "balance_loss_mlp": 1.00014317, "epoch": 0.6722780015631575, "flos": 20737883564640.0, "grad_norm": 1.666314403199413, "language_loss": 0.75385666, "learning_rate": 1.0246101123727313e-06, "loss": 0.7786662, "num_input_tokens_seen": 120314440, "step": 5591, "time_per_iteration": 2.8472893238067627 }, { "auxiliary_loss_clip": 0.01337672, "auxiliary_loss_mlp": 0.01193211, "balance_loss_clip": 1.00812542, "balance_loss_mlp": 1.00018692, "epoch": 0.6723982444537967, "flos": 16909626615840.0, "grad_norm": 1.8761683557752589, "language_loss": 0.78742158, "learning_rate": 1.0239301331310085e-06, "loss": 0.81273043, "num_input_tokens_seen": 120332060, "step": 5592, "time_per_iteration": 2.688840866088867 }, { "auxiliary_loss_clip": 0.0132824, "auxiliary_loss_mlp": 0.01193198, "balance_loss_clip": 1.00751364, "balance_loss_mlp": 1.00017452, "epoch": 0.6725184873444358, "flos": 20667822883200.0, "grad_norm": 1.5372003271500485, "language_loss": 0.88344556, "learning_rate": 1.0232503019583088e-06, "loss": 0.90865993, "num_input_tokens_seen": 120351670, "step": 5593, "time_per_iteration": 2.7420430183410645 }, { "auxiliary_loss_clip": 0.01324542, "auxiliary_loss_mlp": 0.01193403, "balance_loss_clip": 1.00786781, "balance_loss_mlp": 1.00018835, "epoch": 0.6726387302350748, "flos": 23727626395200.0, "grad_norm": 1.6812234244584245, "language_loss": 0.69658059, "learning_rate": 1.0225706189577619e-06, "loss": 0.72176003, "num_input_tokens_seen": 120370195, "step": 5594, "time_per_iteration": 2.6975667476654053 }, { "auxiliary_loss_clip": 0.01329456, "auxiliary_loss_mlp": 0.01193445, "balance_loss_clip": 1.0080452, "balance_loss_mlp": 1.00023043, "epoch": 0.672758973125714, "flos": 15188281767360.0, "grad_norm": 2.4453299367711536, "language_loss": 0.74780357, "learning_rate": 1.021891084232475e-06, "loss": 0.77303255, "num_input_tokens_seen": 120388130, "step": 5595, "time_per_iteration": 2.7711129188537598 }, { "auxiliary_loss_clip": 0.01339218, "auxiliary_loss_mlp": 0.01193572, "balance_loss_clip": 1.00879478, "balance_loss_mlp": 1.00026226, "epoch": 0.672879216016353, "flos": 18077262379200.0, "grad_norm": 1.9435556641331575, "language_loss": 0.79856122, "learning_rate": 1.0212116978855325e-06, "loss": 0.82388914, "num_input_tokens_seen": 120406145, "step": 5596, "time_per_iteration": 3.6784000396728516 }, { "auxiliary_loss_clip": 0.01287214, "auxiliary_loss_mlp": 0.0119341, "balance_loss_clip": 1.00660968, "balance_loss_mlp": 1.00019538, "epoch": 0.6729994589069921, "flos": 23476360870080.0, "grad_norm": 1.7707828798402572, "language_loss": 0.7879293, "learning_rate": 1.020532460019997e-06, "loss": 0.81273556, "num_input_tokens_seen": 120425395, "step": 5597, "time_per_iteration": 4.670463562011719 }, { "auxiliary_loss_clip": 0.01235403, "auxiliary_loss_mlp": 0.01193619, "balance_loss_clip": 1.00640893, "balance_loss_mlp": 1.00021386, "epoch": 0.6731197017976313, "flos": 26322030732960.0, "grad_norm": 1.787488532450277, "language_loss": 0.71292555, "learning_rate": 1.0198533707389096e-06, "loss": 0.73721582, "num_input_tokens_seen": 120446270, "step": 5598, "time_per_iteration": 3.0577898025512695 }, { "auxiliary_loss_clip": 0.0132531, "auxiliary_loss_mlp": 0.00872479, "balance_loss_clip": 1.00809431, "balance_loss_mlp": 1.00028372, "epoch": 0.6732399446882703, "flos": 21616439376960.0, "grad_norm": 1.5683147761712737, "language_loss": 0.7285012, "learning_rate": 1.0191744301452853e-06, "loss": 0.7504791, "num_input_tokens_seen": 120465570, "step": 5599, "time_per_iteration": 3.1569669246673584 }, { "auxiliary_loss_clip": 0.01351174, "auxiliary_loss_mlp": 0.01193327, "balance_loss_clip": 1.00819552, "balance_loss_mlp": 1.0002079, "epoch": 0.6733601875789094, "flos": 25880184283680.0, "grad_norm": 1.5817542894117826, "language_loss": 0.70405918, "learning_rate": 1.0184956383421208e-06, "loss": 0.72950423, "num_input_tokens_seen": 120484220, "step": 5600, "time_per_iteration": 3.795565128326416 }, { "auxiliary_loss_clip": 0.01337124, "auxiliary_loss_mlp": 0.01193439, "balance_loss_clip": 1.00859499, "balance_loss_mlp": 1.00022519, "epoch": 0.6734804304695485, "flos": 22929585055200.0, "grad_norm": 2.041934602537492, "language_loss": 0.65910679, "learning_rate": 1.017816995432387e-06, "loss": 0.68441248, "num_input_tokens_seen": 120503320, "step": 5601, "time_per_iteration": 2.7713844776153564 }, { "auxiliary_loss_clip": 0.01312447, "auxiliary_loss_mlp": 0.01193544, "balance_loss_clip": 1.00789595, "balance_loss_mlp": 1.00023401, "epoch": 0.6736006733601876, "flos": 18697978632960.0, "grad_norm": 1.735638302759025, "language_loss": 0.74353313, "learning_rate": 1.0171385015190353e-06, "loss": 0.76859307, "num_input_tokens_seen": 120523180, "step": 5602, "time_per_iteration": 2.709223985671997 }, { "auxiliary_loss_clip": 0.01302589, "auxiliary_loss_mlp": 0.008724, "balance_loss_clip": 1.00776041, "balance_loss_mlp": 1.00043821, "epoch": 0.6737209162508266, "flos": 19427755475520.0, "grad_norm": 1.8555804727369927, "language_loss": 0.73165786, "learning_rate": 1.0164601567049908e-06, "loss": 0.75340772, "num_input_tokens_seen": 120541710, "step": 5603, "time_per_iteration": 2.7409586906433105 }, { "auxiliary_loss_clip": 0.013212, "auxiliary_loss_mlp": 0.01193556, "balance_loss_clip": 1.00799477, "balance_loss_mlp": 1.00024569, "epoch": 0.6738411591414658, "flos": 20158071173280.0, "grad_norm": 1.5893438350288078, "language_loss": 0.79993212, "learning_rate": 1.015781961093158e-06, "loss": 0.82507968, "num_input_tokens_seen": 120561030, "step": 5604, "time_per_iteration": 2.8132410049438477 }, { "auxiliary_loss_clip": 0.01326274, "auxiliary_loss_mlp": 0.01193412, "balance_loss_clip": 1.00793576, "balance_loss_mlp": 1.00019753, "epoch": 0.6739614020321049, "flos": 21653858642400.0, "grad_norm": 1.5057681131338105, "language_loss": 0.77140331, "learning_rate": 1.0151039147864197e-06, "loss": 0.7966001, "num_input_tokens_seen": 120581005, "step": 5605, "time_per_iteration": 2.835470199584961 }, { "auxiliary_loss_clip": 0.01211392, "auxiliary_loss_mlp": 0.01193551, "balance_loss_clip": 1.00609207, "balance_loss_mlp": 1.00024092, "epoch": 0.6740816449227439, "flos": 19171712100960.0, "grad_norm": 1.810297218018059, "language_loss": 0.65497786, "learning_rate": 1.0144260178876336e-06, "loss": 0.67902732, "num_input_tokens_seen": 120600350, "step": 5606, "time_per_iteration": 3.2055182456970215 }, { "auxiliary_loss_clip": 0.01324618, "auxiliary_loss_mlp": 0.01193505, "balance_loss_clip": 1.00802577, "balance_loss_mlp": 1.00019503, "epoch": 0.6742018878133831, "flos": 21097024197120.0, "grad_norm": 2.4258635719164467, "language_loss": 0.67700309, "learning_rate": 1.0137482704996388e-06, "loss": 0.70218432, "num_input_tokens_seen": 120614700, "step": 5607, "time_per_iteration": 2.992201805114746 }, { "auxiliary_loss_clip": 0.01296168, "auxiliary_loss_mlp": 0.01193571, "balance_loss_clip": 1.00766242, "balance_loss_mlp": 1.00026178, "epoch": 0.6743221307040221, "flos": 23549977995840.0, "grad_norm": 1.8501814167505428, "language_loss": 0.78778577, "learning_rate": 1.0130706727252461e-06, "loss": 0.81268311, "num_input_tokens_seen": 120631755, "step": 5608, "time_per_iteration": 2.8275647163391113 }, { "auxiliary_loss_clip": 0.01296684, "auxiliary_loss_mlp": 0.01193374, "balance_loss_clip": 1.00731921, "balance_loss_mlp": 1.00015938, "epoch": 0.6744423735946612, "flos": 16249551217920.0, "grad_norm": 2.181419615150388, "language_loss": 0.68021828, "learning_rate": 1.0123932246672468e-06, "loss": 0.70511889, "num_input_tokens_seen": 120645900, "step": 5609, "time_per_iteration": 2.762218475341797 }, { "auxiliary_loss_clip": 0.01250732, "auxiliary_loss_mlp": 0.00871815, "balance_loss_clip": 1.00437856, "balance_loss_mlp": 0.9999035, "epoch": 0.6745626164853004, "flos": 57843288008640.0, "grad_norm": 0.7486156911907015, "language_loss": 0.55832529, "learning_rate": 1.0117159264284114e-06, "loss": 0.57955074, "num_input_tokens_seen": 120709070, "step": 5610, "time_per_iteration": 3.3264007568359375 }, { "auxiliary_loss_clip": 0.01315178, "auxiliary_loss_mlp": 0.01193215, "balance_loss_clip": 1.00801897, "balance_loss_mlp": 1.00019145, "epoch": 0.6746828593759394, "flos": 20485037397600.0, "grad_norm": 1.5808363451675622, "language_loss": 0.77003652, "learning_rate": 1.0110387781114837e-06, "loss": 0.79512048, "num_input_tokens_seen": 120727685, "step": 5611, "time_per_iteration": 2.775893211364746 }, { "auxiliary_loss_clip": 0.01350199, "auxiliary_loss_mlp": 0.01193318, "balance_loss_clip": 1.00833547, "balance_loss_mlp": 1.0001986, "epoch": 0.6748031022665785, "flos": 19208233274400.0, "grad_norm": 1.8862514512582274, "language_loss": 0.77374852, "learning_rate": 1.0103617798191872e-06, "loss": 0.79918373, "num_input_tokens_seen": 120747160, "step": 5612, "time_per_iteration": 2.6811115741729736 }, { "auxiliary_loss_clip": 0.01303065, "auxiliary_loss_mlp": 0.01193427, "balance_loss_clip": 1.00762451, "balance_loss_mlp": 1.00021279, "epoch": 0.6749233451572175, "flos": 15195035419200.0, "grad_norm": 2.193048206146548, "language_loss": 0.82837319, "learning_rate": 1.0096849316542217e-06, "loss": 0.85333812, "num_input_tokens_seen": 120763710, "step": 5613, "time_per_iteration": 2.718059539794922 }, { "auxiliary_loss_clip": 0.01239132, "auxiliary_loss_mlp": 0.01193248, "balance_loss_clip": 1.00678217, "balance_loss_mlp": 1.00022471, "epoch": 0.6750435880478567, "flos": 26499499513920.0, "grad_norm": 1.9730311612517497, "language_loss": 0.74690199, "learning_rate": 1.0090082337192643e-06, "loss": 0.77122581, "num_input_tokens_seen": 120783355, "step": 5614, "time_per_iteration": 2.8469223976135254 }, { "auxiliary_loss_clip": 0.01265609, "auxiliary_loss_mlp": 0.01193219, "balance_loss_clip": 1.00688052, "balance_loss_mlp": 1.00019515, "epoch": 0.6751638309384957, "flos": 23404324386240.0, "grad_norm": 1.9624593448999377, "language_loss": 0.78429502, "learning_rate": 1.0083316861169705e-06, "loss": 0.80888331, "num_input_tokens_seen": 120802090, "step": 5615, "time_per_iteration": 2.858422040939331 }, { "auxiliary_loss_clip": 0.01301895, "auxiliary_loss_mlp": 0.01193528, "balance_loss_clip": 1.00773573, "balance_loss_mlp": 1.00021791, "epoch": 0.6752840738291348, "flos": 23441420338560.0, "grad_norm": 2.8079534633599885, "language_loss": 0.71336758, "learning_rate": 1.0076552889499713e-06, "loss": 0.73832178, "num_input_tokens_seen": 120822855, "step": 5616, "time_per_iteration": 2.847069501876831 }, { "auxiliary_loss_clip": 0.01326284, "auxiliary_loss_mlp": 0.01193197, "balance_loss_clip": 1.00805342, "balance_loss_mlp": 1.00017309, "epoch": 0.675404316719774, "flos": 30335839290720.0, "grad_norm": 1.8438633447128092, "language_loss": 0.7358433, "learning_rate": 1.006979042320876e-06, "loss": 0.76103818, "num_input_tokens_seen": 120843070, "step": 5617, "time_per_iteration": 2.7544596195220947 }, { "auxiliary_loss_clip": 0.01321008, "auxiliary_loss_mlp": 0.01193338, "balance_loss_clip": 1.00766635, "balance_loss_mlp": 1.00021911, "epoch": 0.675524559610413, "flos": 23622625182240.0, "grad_norm": 1.9378940404924843, "language_loss": 0.63124615, "learning_rate": 1.0063029463322702e-06, "loss": 0.65638959, "num_input_tokens_seen": 120863345, "step": 5618, "time_per_iteration": 2.7926478385925293 }, { "auxiliary_loss_clip": 0.01275786, "auxiliary_loss_mlp": 0.00872369, "balance_loss_clip": 1.00790989, "balance_loss_mlp": 1.00035214, "epoch": 0.6756448025010521, "flos": 21248641137600.0, "grad_norm": 2.4916774322550923, "language_loss": 0.75819218, "learning_rate": 1.0056270010867164e-06, "loss": 0.77967376, "num_input_tokens_seen": 120880915, "step": 5619, "time_per_iteration": 2.7814040184020996 }, { "auxiliary_loss_clip": 0.01327647, "auxiliary_loss_mlp": 0.01193671, "balance_loss_clip": 1.00860214, "balance_loss_mlp": 1.00026584, "epoch": 0.6757650453916912, "flos": 21646530211680.0, "grad_norm": 2.539151540657745, "language_loss": 0.78457272, "learning_rate": 1.004951206686758e-06, "loss": 0.80978596, "num_input_tokens_seen": 120899190, "step": 5620, "time_per_iteration": 2.8366541862487793 }, { "auxiliary_loss_clip": 0.01337548, "auxiliary_loss_mlp": 0.01193453, "balance_loss_clip": 1.00804484, "balance_loss_mlp": 1.00023901, "epoch": 0.6758852882823303, "flos": 21795668418240.0, "grad_norm": 1.961190422066799, "language_loss": 0.71502447, "learning_rate": 1.0042755632349087e-06, "loss": 0.74033445, "num_input_tokens_seen": 120916080, "step": 5621, "time_per_iteration": 2.827324151992798 }, { "auxiliary_loss_clip": 0.01291493, "auxiliary_loss_mlp": 0.01193471, "balance_loss_clip": 1.00749004, "balance_loss_mlp": 1.00025702, "epoch": 0.6760055311729694, "flos": 27088795756800.0, "grad_norm": 2.173740911019058, "language_loss": 0.62838233, "learning_rate": 1.0036000708336653e-06, "loss": 0.65323198, "num_input_tokens_seen": 120935210, "step": 5622, "time_per_iteration": 3.7498066425323486 }, { "auxiliary_loss_clip": 0.0131765, "auxiliary_loss_mlp": 0.01193317, "balance_loss_clip": 1.00840163, "balance_loss_mlp": 1.00019848, "epoch": 0.6761257740636085, "flos": 17999801419680.0, "grad_norm": 1.98840787829669, "language_loss": 0.79695976, "learning_rate": 1.0029247295854984e-06, "loss": 0.82206953, "num_input_tokens_seen": 120951830, "step": 5623, "time_per_iteration": 2.7892537117004395 }, { "auxiliary_loss_clip": 0.01302348, "auxiliary_loss_mlp": 0.01193327, "balance_loss_clip": 1.00758243, "balance_loss_mlp": 1.00020826, "epoch": 0.6762460169542476, "flos": 15121921224960.0, "grad_norm": 1.6880708741355084, "language_loss": 0.71604675, "learning_rate": 1.0022495395928588e-06, "loss": 0.74100351, "num_input_tokens_seen": 120970310, "step": 5624, "time_per_iteration": 3.7745718955993652 }, { "auxiliary_loss_clip": 0.01320669, "auxiliary_loss_mlp": 0.01192288, "balance_loss_clip": 1.00543404, "balance_loss_mlp": 1.00002718, "epoch": 0.6763662598448866, "flos": 67887005865120.0, "grad_norm": 0.7861936051163326, "language_loss": 0.62356949, "learning_rate": 1.0015745009581697e-06, "loss": 0.64869905, "num_input_tokens_seen": 121031915, "step": 5625, "time_per_iteration": 3.3224308490753174 }, { "auxiliary_loss_clip": 0.01326108, "auxiliary_loss_mlp": 0.01193413, "balance_loss_clip": 1.00833654, "balance_loss_mlp": 1.00019884, "epoch": 0.6764865027355258, "flos": 20631840564960.0, "grad_norm": 1.7566326723065542, "language_loss": 0.67150986, "learning_rate": 1.0008996137838343e-06, "loss": 0.6967051, "num_input_tokens_seen": 121050890, "step": 5626, "time_per_iteration": 3.690037727355957 }, { "auxiliary_loss_clip": 0.01351995, "auxiliary_loss_mlp": 0.01193625, "balance_loss_clip": 1.00873721, "balance_loss_mlp": 1.00021958, "epoch": 0.6766067456261649, "flos": 21215820103200.0, "grad_norm": 1.8856738612069708, "language_loss": 0.79745102, "learning_rate": 1.000224878172234e-06, "loss": 0.82290721, "num_input_tokens_seen": 121070015, "step": 5627, "time_per_iteration": 2.696004629135132 }, { "auxiliary_loss_clip": 0.01322485, "auxiliary_loss_mlp": 0.01193318, "balance_loss_clip": 1.00820684, "balance_loss_mlp": 1.00019896, "epoch": 0.6767269885168039, "flos": 19938261582720.0, "grad_norm": 2.4539028413166353, "language_loss": 0.72638965, "learning_rate": 9.99550294225724e-07, "loss": 0.75154769, "num_input_tokens_seen": 121089170, "step": 5628, "time_per_iteration": 2.7563772201538086 }, { "auxiliary_loss_clip": 0.01290791, "auxiliary_loss_mlp": 0.01193386, "balance_loss_clip": 1.00823092, "balance_loss_mlp": 1.00026679, "epoch": 0.6768472314074431, "flos": 20814087195360.0, "grad_norm": 2.1648466202547327, "language_loss": 0.72576416, "learning_rate": 9.988758620466402e-07, "loss": 0.75060594, "num_input_tokens_seen": 121108040, "step": 5629, "time_per_iteration": 2.803880453109741 }, { "auxiliary_loss_clip": 0.01255577, "auxiliary_loss_mlp": 0.01193207, "balance_loss_clip": 1.00717425, "balance_loss_mlp": 1.00018311, "epoch": 0.6769674742980821, "flos": 23186023590240.0, "grad_norm": 1.5135956437201972, "language_loss": 0.7657398, "learning_rate": 9.982015817372917e-07, "loss": 0.79022765, "num_input_tokens_seen": 121128480, "step": 5630, "time_per_iteration": 2.8715782165527344 }, { "auxiliary_loss_clip": 0.01291479, "auxiliary_loss_mlp": 0.01193612, "balance_loss_clip": 1.00798857, "balance_loss_mlp": 1.00030196, "epoch": 0.6770877171887212, "flos": 24242946275520.0, "grad_norm": 1.6649370828067611, "language_loss": 0.81803691, "learning_rate": 9.975274533999657e-07, "loss": 0.84288782, "num_input_tokens_seen": 121148010, "step": 5631, "time_per_iteration": 2.8638832569122314 }, { "auxiliary_loss_clip": 0.01350797, "auxiliary_loss_mlp": 0.01193431, "balance_loss_clip": 1.00840163, "balance_loss_mlp": 1.00021636, "epoch": 0.6772079600793603, "flos": 18141575271840.0, "grad_norm": 2.4528356483323934, "language_loss": 0.83963788, "learning_rate": 9.96853477136929e-07, "loss": 0.86508024, "num_input_tokens_seen": 121162755, "step": 5632, "time_per_iteration": 2.6538407802581787 }, { "auxiliary_loss_clip": 0.01300623, "auxiliary_loss_mlp": 0.01193277, "balance_loss_clip": 1.00799859, "balance_loss_mlp": 1.00025308, "epoch": 0.6773282029699994, "flos": 22452079600800.0, "grad_norm": 2.234534315976892, "language_loss": 0.75232482, "learning_rate": 9.96179653050422e-07, "loss": 0.77726376, "num_input_tokens_seen": 121182915, "step": 5633, "time_per_iteration": 2.8296141624450684 }, { "auxiliary_loss_clip": 0.01291012, "auxiliary_loss_mlp": 0.0119352, "balance_loss_clip": 1.00743461, "balance_loss_mlp": 1.00030565, "epoch": 0.6774484458606385, "flos": 18693739638720.0, "grad_norm": 1.9222671553170771, "language_loss": 0.74155569, "learning_rate": 9.955059812426635e-07, "loss": 0.76640093, "num_input_tokens_seen": 121200445, "step": 5634, "time_per_iteration": 2.673269033432007 }, { "auxiliary_loss_clip": 0.01351134, "auxiliary_loss_mlp": 0.01193696, "balance_loss_clip": 1.00899947, "balance_loss_mlp": 1.00029063, "epoch": 0.6775686887512776, "flos": 25994058645600.0, "grad_norm": 1.8129927132412214, "language_loss": 0.82723534, "learning_rate": 9.948324618158493e-07, "loss": 0.85268366, "num_input_tokens_seen": 121220785, "step": 5635, "time_per_iteration": 2.7105612754821777 }, { "auxiliary_loss_clip": 0.0133902, "auxiliary_loss_mlp": 0.01193569, "balance_loss_clip": 1.0082947, "balance_loss_mlp": 1.00025916, "epoch": 0.6776889316419167, "flos": 13587996016800.0, "grad_norm": 2.0820503697279156, "language_loss": 0.77609879, "learning_rate": 9.941590948721502e-07, "loss": 0.80142468, "num_input_tokens_seen": 121237985, "step": 5636, "time_per_iteration": 2.686816453933716 }, { "auxiliary_loss_clip": 0.01302189, "auxiliary_loss_mlp": 0.0119335, "balance_loss_clip": 1.00716782, "balance_loss_mlp": 1.00013566, "epoch": 0.6778091745325557, "flos": 27601133971680.0, "grad_norm": 1.608452893209215, "language_loss": 0.76051533, "learning_rate": 9.934858805137188e-07, "loss": 0.78547072, "num_input_tokens_seen": 121258635, "step": 5637, "time_per_iteration": 2.7951393127441406 }, { "auxiliary_loss_clip": 0.01325356, "auxiliary_loss_mlp": 0.01193212, "balance_loss_clip": 1.00779223, "balance_loss_mlp": 1.00018883, "epoch": 0.6779294174231949, "flos": 18734068722240.0, "grad_norm": 1.5524075164729914, "language_loss": 0.80677903, "learning_rate": 9.92812818842677e-07, "loss": 0.83196479, "num_input_tokens_seen": 121277810, "step": 5638, "time_per_iteration": 2.684751510620117 }, { "auxiliary_loss_clip": 0.01326976, "auxiliary_loss_mlp": 0.01193323, "balance_loss_clip": 1.00828683, "balance_loss_mlp": 1.00020361, "epoch": 0.678049660313834, "flos": 45873813762720.0, "grad_norm": 2.0998962570606983, "language_loss": 0.64033067, "learning_rate": 9.921399099611306e-07, "loss": 0.66553366, "num_input_tokens_seen": 121298975, "step": 5639, "time_per_iteration": 2.9177660942077637 }, { "auxiliary_loss_clip": 0.01318645, "auxiliary_loss_mlp": 0.01193183, "balance_loss_clip": 1.0077343, "balance_loss_mlp": 1.00015974, "epoch": 0.678169903204473, "flos": 19974567214080.0, "grad_norm": 1.5743983614494508, "language_loss": 0.68690902, "learning_rate": 9.914671539711588e-07, "loss": 0.71202725, "num_input_tokens_seen": 121318495, "step": 5640, "time_per_iteration": 2.760917901992798 }, { "auxiliary_loss_clip": 0.01205556, "auxiliary_loss_mlp": 0.00872466, "balance_loss_clip": 1.0053525, "balance_loss_mlp": 1.00036526, "epoch": 0.6782901460951122, "flos": 21395623923360.0, "grad_norm": 1.8547250598738076, "language_loss": 0.7812627, "learning_rate": 9.90794550974817e-07, "loss": 0.80204296, "num_input_tokens_seen": 121338890, "step": 5641, "time_per_iteration": 3.158522844314575 }, { "auxiliary_loss_clip": 0.01299517, "auxiliary_loss_mlp": 0.0119352, "balance_loss_clip": 1.00760198, "balance_loss_mlp": 1.0002104, "epoch": 0.6784103889857512, "flos": 21434013128160.0, "grad_norm": 1.8755421216966417, "language_loss": 0.81596512, "learning_rate": 9.901221010741407e-07, "loss": 0.84089553, "num_input_tokens_seen": 121358210, "step": 5642, "time_per_iteration": 2.9719247817993164 }, { "auxiliary_loss_clip": 0.01339085, "auxiliary_loss_mlp": 0.0119352, "balance_loss_clip": 1.00905216, "balance_loss_mlp": 1.00030553, "epoch": 0.6785306318763903, "flos": 32671937062080.0, "grad_norm": 2.0003084792425634, "language_loss": 0.7453981, "learning_rate": 9.894498043711375e-07, "loss": 0.77072418, "num_input_tokens_seen": 121379955, "step": 5643, "time_per_iteration": 2.7843985557556152 }, { "auxiliary_loss_clip": 0.01313664, "auxiliary_loss_mlp": 0.01193286, "balance_loss_clip": 1.00770926, "balance_loss_mlp": 1.00016677, "epoch": 0.6786508747670293, "flos": 25632151889760.0, "grad_norm": 1.7791441747236751, "language_loss": 0.69197583, "learning_rate": 9.887776609677962e-07, "loss": 0.71704531, "num_input_tokens_seen": 121401325, "step": 5644, "time_per_iteration": 2.777827739715576 }, { "auxiliary_loss_clip": 0.01308086, "auxiliary_loss_mlp": 0.01193547, "balance_loss_clip": 1.00784612, "balance_loss_mlp": 1.00023735, "epoch": 0.6787711176576685, "flos": 19171891719360.0, "grad_norm": 1.7835861124059866, "language_loss": 0.72364187, "learning_rate": 9.88105670966079e-07, "loss": 0.74865818, "num_input_tokens_seen": 121419785, "step": 5645, "time_per_iteration": 2.801598310470581 }, { "auxiliary_loss_clip": 0.01268541, "auxiliary_loss_mlp": 0.01193434, "balance_loss_clip": 1.00705135, "balance_loss_mlp": 1.00022006, "epoch": 0.6788913605483076, "flos": 13985166617280.0, "grad_norm": 2.2304687506849397, "language_loss": 0.78578162, "learning_rate": 9.874338344679283e-07, "loss": 0.81040144, "num_input_tokens_seen": 121435630, "step": 5646, "time_per_iteration": 2.8022825717926025 }, { "auxiliary_loss_clip": 0.01349129, "auxiliary_loss_mlp": 0.01193253, "balance_loss_clip": 1.00815475, "balance_loss_mlp": 1.00022912, "epoch": 0.6790116034389466, "flos": 22017597505920.0, "grad_norm": 1.750760617059793, "language_loss": 0.74048758, "learning_rate": 9.86762151575259e-07, "loss": 0.76591146, "num_input_tokens_seen": 121455625, "step": 5647, "time_per_iteration": 2.9330661296844482 }, { "auxiliary_loss_clip": 0.01260109, "auxiliary_loss_mlp": 0.00872291, "balance_loss_clip": 1.00713217, "balance_loss_mlp": 1.00034451, "epoch": 0.6791318463295858, "flos": 20922465234240.0, "grad_norm": 1.4397844001213456, "language_loss": 0.80214608, "learning_rate": 9.860906223899651e-07, "loss": 0.82347012, "num_input_tokens_seen": 121475020, "step": 5648, "time_per_iteration": 3.9998981952667236 }, { "auxiliary_loss_clip": 0.01325981, "auxiliary_loss_mlp": 0.01193447, "balance_loss_clip": 1.00835514, "balance_loss_mlp": 1.00023258, "epoch": 0.6792520892202248, "flos": 28512762284160.0, "grad_norm": 1.5628083144840015, "language_loss": 0.75439566, "learning_rate": 9.854192470139184e-07, "loss": 0.77959001, "num_input_tokens_seen": 121496500, "step": 5649, "time_per_iteration": 4.7648444175720215 }, { "auxiliary_loss_clip": 0.01305995, "auxiliary_loss_mlp": 0.01193395, "balance_loss_clip": 1.00774121, "balance_loss_mlp": 1.00027645, "epoch": 0.6793723321108639, "flos": 20011914632160.0, "grad_norm": 1.779101883312023, "language_loss": 0.71544862, "learning_rate": 9.847480255489645e-07, "loss": 0.74044257, "num_input_tokens_seen": 121515525, "step": 5650, "time_per_iteration": 2.833585500717163 }, { "auxiliary_loss_clip": 0.01319735, "auxiliary_loss_mlp": 0.01193314, "balance_loss_clip": 1.00755167, "balance_loss_mlp": 1.00019503, "epoch": 0.6794925750015031, "flos": 26649499888800.0, "grad_norm": 2.04228181790362, "language_loss": 0.69242996, "learning_rate": 9.840769580969295e-07, "loss": 0.71756047, "num_input_tokens_seen": 121535965, "step": 5651, "time_per_iteration": 2.8392128944396973 }, { "auxiliary_loss_clip": 0.01338736, "auxiliary_loss_mlp": 0.01193328, "balance_loss_clip": 1.00831687, "balance_loss_mlp": 1.00020862, "epoch": 0.6796128178921421, "flos": 21580385211360.0, "grad_norm": 1.9250482333404304, "language_loss": 0.79662031, "learning_rate": 9.834060447596114e-07, "loss": 0.82194096, "num_input_tokens_seen": 121555235, "step": 5652, "time_per_iteration": 3.754941701889038 }, { "auxiliary_loss_clip": 0.01337445, "auxiliary_loss_mlp": 0.01193465, "balance_loss_clip": 1.00790572, "balance_loss_mlp": 1.00025058, "epoch": 0.6797330607827812, "flos": 22492013523840.0, "grad_norm": 16.142419384857945, "language_loss": 0.77981901, "learning_rate": 9.827352856387868e-07, "loss": 0.8051281, "num_input_tokens_seen": 121574945, "step": 5653, "time_per_iteration": 2.738271474838257 }, { "auxiliary_loss_clip": 0.01243613, "auxiliary_loss_mlp": 0.01192299, "balance_loss_clip": 1.00467348, "balance_loss_mlp": 1.00003815, "epoch": 0.6798533036734203, "flos": 66306680471520.0, "grad_norm": 0.7760933516527885, "language_loss": 0.64270401, "learning_rate": 9.820646808362118e-07, "loss": 0.66706324, "num_input_tokens_seen": 121641200, "step": 5654, "time_per_iteration": 3.454392910003662 }, { "auxiliary_loss_clip": 0.01302539, "auxiliary_loss_mlp": 0.01193356, "balance_loss_clip": 1.00767541, "balance_loss_mlp": 1.00023675, "epoch": 0.6799735465640594, "flos": 16180173086400.0, "grad_norm": 1.9712524265387574, "language_loss": 0.72576743, "learning_rate": 9.813942304536154e-07, "loss": 0.75072634, "num_input_tokens_seen": 121659170, "step": 5655, "time_per_iteration": 2.7574124336242676 }, { "auxiliary_loss_clip": 0.01313903, "auxiliary_loss_mlp": 0.01193326, "balance_loss_clip": 1.00797963, "balance_loss_mlp": 1.00020647, "epoch": 0.6800937894546984, "flos": 22125760002720.0, "grad_norm": 1.7012637867949696, "language_loss": 0.63600528, "learning_rate": 9.807239345927043e-07, "loss": 0.66107756, "num_input_tokens_seen": 121679180, "step": 5656, "time_per_iteration": 2.7915456295013428 }, { "auxiliary_loss_clip": 0.01311156, "auxiliary_loss_mlp": 0.01193325, "balance_loss_clip": 1.00825572, "balance_loss_mlp": 1.00020599, "epoch": 0.6802140323453376, "flos": 31612966727040.0, "grad_norm": 1.9320431032065066, "language_loss": 0.71595597, "learning_rate": 9.80053793355162e-07, "loss": 0.74100083, "num_input_tokens_seen": 121697875, "step": 5657, "time_per_iteration": 2.7888598442077637 }, { "auxiliary_loss_clip": 0.01277487, "auxiliary_loss_mlp": 0.01193475, "balance_loss_clip": 1.00814927, "balance_loss_mlp": 1.00026035, "epoch": 0.6803342752359767, "flos": 17712948736800.0, "grad_norm": 1.8753648707591246, "language_loss": 0.74711829, "learning_rate": 9.793838068426472e-07, "loss": 0.77182794, "num_input_tokens_seen": 121715570, "step": 5658, "time_per_iteration": 2.7945327758789062 }, { "auxiliary_loss_clip": 0.01349976, "auxiliary_loss_mlp": 0.01193405, "balance_loss_clip": 1.00827479, "balance_loss_mlp": 1.00019097, "epoch": 0.6804545181266157, "flos": 11326808623680.0, "grad_norm": 1.9718277135979616, "language_loss": 0.60950047, "learning_rate": 9.78713975156799e-07, "loss": 0.63493431, "num_input_tokens_seen": 121731435, "step": 5659, "time_per_iteration": 2.6270110607147217 }, { "auxiliary_loss_clip": 0.01294495, "auxiliary_loss_mlp": 0.01193522, "balance_loss_clip": 1.00877142, "balance_loss_mlp": 1.00030804, "epoch": 0.6805747610172549, "flos": 29350989012960.0, "grad_norm": 1.727539247907783, "language_loss": 0.71956527, "learning_rate": 9.780442983992273e-07, "loss": 0.74444544, "num_input_tokens_seen": 121749950, "step": 5660, "time_per_iteration": 2.9308011531829834 }, { "auxiliary_loss_clip": 0.01317818, "auxiliary_loss_mlp": 0.01193544, "balance_loss_clip": 1.00826573, "balance_loss_mlp": 1.00023472, "epoch": 0.680695003907894, "flos": 37631883379680.0, "grad_norm": 1.568666224541779, "language_loss": 0.71799815, "learning_rate": 9.773747766715238e-07, "loss": 0.74311179, "num_input_tokens_seen": 121770770, "step": 5661, "time_per_iteration": 2.8718276023864746 }, { "auxiliary_loss_clip": 0.01317287, "auxiliary_loss_mlp": 0.011935, "balance_loss_clip": 1.00750184, "balance_loss_mlp": 1.00018978, "epoch": 0.680815246798533, "flos": 22127376568320.0, "grad_norm": 1.791607888470756, "language_loss": 0.8021965, "learning_rate": 9.767054100752536e-07, "loss": 0.82730436, "num_input_tokens_seen": 121790720, "step": 5662, "time_per_iteration": 2.856472969055176 }, { "auxiliary_loss_clip": 0.01278993, "auxiliary_loss_mlp": 0.01193325, "balance_loss_clip": 1.00766933, "balance_loss_mlp": 1.00020576, "epoch": 0.6809354896891722, "flos": 17201831927040.0, "grad_norm": 1.9763138019584263, "language_loss": 0.81569505, "learning_rate": 9.760361987119584e-07, "loss": 0.84041822, "num_input_tokens_seen": 121808455, "step": 5663, "time_per_iteration": 2.7572171688079834 }, { "auxiliary_loss_clip": 0.01303439, "auxiliary_loss_mlp": 0.01193387, "balance_loss_clip": 1.00743866, "balance_loss_mlp": 1.00017273, "epoch": 0.6810557325798112, "flos": 12458174679360.0, "grad_norm": 1.8305934817925016, "language_loss": 0.67794752, "learning_rate": 9.753671426831592e-07, "loss": 0.70291579, "num_input_tokens_seen": 121824470, "step": 5664, "time_per_iteration": 2.694347381591797 }, { "auxiliary_loss_clip": 0.01337842, "auxiliary_loss_mlp": 0.01193354, "balance_loss_clip": 1.00811362, "balance_loss_mlp": 1.00023448, "epoch": 0.6811759754704503, "flos": 22156174150560.0, "grad_norm": 1.8623712859976302, "language_loss": 0.79526716, "learning_rate": 9.746982420903483e-07, "loss": 0.82057911, "num_input_tokens_seen": 121842665, "step": 5665, "time_per_iteration": 2.722748041152954 }, { "auxiliary_loss_clip": 0.01327083, "auxiliary_loss_mlp": 0.01193209, "balance_loss_clip": 1.00833738, "balance_loss_mlp": 1.00018561, "epoch": 0.6812962183610894, "flos": 17525385401760.0, "grad_norm": 1.5466241754341408, "language_loss": 0.74970007, "learning_rate": 9.740294970349993e-07, "loss": 0.77490306, "num_input_tokens_seen": 121859080, "step": 5666, "time_per_iteration": 2.653956651687622 }, { "auxiliary_loss_clip": 0.01288311, "auxiliary_loss_mlp": 0.01193067, "balance_loss_clip": 1.00508332, "balance_loss_mlp": 1.00004327, "epoch": 0.6814164612517285, "flos": 60274507980960.0, "grad_norm": 0.8853409159206757, "language_loss": 0.60952055, "learning_rate": 9.733609076185594e-07, "loss": 0.63433433, "num_input_tokens_seen": 121915485, "step": 5667, "time_per_iteration": 3.194695234298706 }, { "auxiliary_loss_clip": 0.013269, "auxiliary_loss_mlp": 0.01193444, "balance_loss_clip": 1.00806046, "balance_loss_mlp": 1.00022948, "epoch": 0.6815367041423676, "flos": 19317760871040.0, "grad_norm": 1.7853493025724207, "language_loss": 0.83750951, "learning_rate": 9.72692473942455e-07, "loss": 0.86271286, "num_input_tokens_seen": 121932710, "step": 5668, "time_per_iteration": 2.654038429260254 }, { "auxiliary_loss_clip": 0.01269047, "auxiliary_loss_mlp": 0.01193464, "balance_loss_clip": 1.00715744, "balance_loss_mlp": 1.00024962, "epoch": 0.6816569470330067, "flos": 22161706397280.0, "grad_norm": 1.591694622635593, "language_loss": 0.7771427, "learning_rate": 9.720241961080849e-07, "loss": 0.80176783, "num_input_tokens_seen": 121952025, "step": 5669, "time_per_iteration": 2.8303327560424805 }, { "auxiliary_loss_clip": 0.01350595, "auxiliary_loss_mlp": 0.01193267, "balance_loss_clip": 1.0081538, "balance_loss_mlp": 1.00014853, "epoch": 0.6817771899236458, "flos": 41463517154400.0, "grad_norm": 1.946266596714488, "language_loss": 0.73180115, "learning_rate": 9.713560742168259e-07, "loss": 0.75723976, "num_input_tokens_seen": 121974650, "step": 5670, "time_per_iteration": 2.831144332885742 }, { "auxiliary_loss_clip": 0.01287214, "auxiliary_loss_mlp": 0.01193347, "balance_loss_clip": 1.00819468, "balance_loss_mlp": 1.00022781, "epoch": 0.6818974328142848, "flos": 21106148811840.0, "grad_norm": 1.8120449382289792, "language_loss": 0.71354425, "learning_rate": 9.706881083700333e-07, "loss": 0.7383498, "num_input_tokens_seen": 121994335, "step": 5671, "time_per_iteration": 2.939143657684326 }, { "auxiliary_loss_clip": 0.01243639, "auxiliary_loss_mlp": 0.01193459, "balance_loss_clip": 1.0070461, "balance_loss_mlp": 1.00024462, "epoch": 0.682017675704924, "flos": 20441906267040.0, "grad_norm": 1.8654954164798472, "language_loss": 0.82206285, "learning_rate": 9.700202986690357e-07, "loss": 0.84643376, "num_input_tokens_seen": 122012635, "step": 5672, "time_per_iteration": 3.004560947418213 }, { "auxiliary_loss_clip": 0.0132652, "auxiliary_loss_mlp": 0.00872452, "balance_loss_clip": 1.00802445, "balance_loss_mlp": 1.00032139, "epoch": 0.682137918595563, "flos": 20044448277120.0, "grad_norm": 1.8022681133557623, "language_loss": 0.66670191, "learning_rate": 9.693526452151413e-07, "loss": 0.68869162, "num_input_tokens_seen": 122031685, "step": 5673, "time_per_iteration": 3.3295862674713135 }, { "auxiliary_loss_clip": 0.01309931, "auxiliary_loss_mlp": 0.01193485, "balance_loss_clip": 1.00842881, "balance_loss_mlp": 1.00027096, "epoch": 0.6822581614862021, "flos": 31684572126720.0, "grad_norm": 1.6018250536478336, "language_loss": 0.75308454, "learning_rate": 9.686851481096305e-07, "loss": 0.77811867, "num_input_tokens_seen": 122052995, "step": 5674, "time_per_iteration": 2.966965675354004 }, { "auxiliary_loss_clip": 0.01246778, "auxiliary_loss_mlp": 0.01193229, "balance_loss_clip": 1.00778222, "balance_loss_mlp": 1.00020576, "epoch": 0.6823784043768413, "flos": 23477582275200.0, "grad_norm": 1.731387134898864, "language_loss": 0.71431684, "learning_rate": 9.68017807453762e-07, "loss": 0.73871696, "num_input_tokens_seen": 122071740, "step": 5675, "time_per_iteration": 4.789058446884155 }, { "auxiliary_loss_clip": 0.01312337, "auxiliary_loss_mlp": 0.00872329, "balance_loss_clip": 1.00802946, "balance_loss_mlp": 1.00036168, "epoch": 0.6824986472674803, "flos": 14137142794560.0, "grad_norm": 2.0183261152290193, "language_loss": 0.73238957, "learning_rate": 9.673506233487721e-07, "loss": 0.75423622, "num_input_tokens_seen": 122089705, "step": 5676, "time_per_iteration": 2.760467290878296 }, { "auxiliary_loss_clip": 0.01315558, "auxiliary_loss_mlp": 0.00872279, "balance_loss_clip": 1.00745678, "balance_loss_mlp": 1.00040424, "epoch": 0.6826188901581194, "flos": 21505007825280.0, "grad_norm": 1.7519512910731045, "language_loss": 0.86365545, "learning_rate": 9.666835958958717e-07, "loss": 0.88553381, "num_input_tokens_seen": 122109025, "step": 5677, "time_per_iteration": 2.764431953430176 }, { "auxiliary_loss_clip": 0.01350342, "auxiliary_loss_mlp": 0.01193397, "balance_loss_clip": 1.00845492, "balance_loss_mlp": 1.00018215, "epoch": 0.6827391330487584, "flos": 20810135590560.0, "grad_norm": 1.9367254135007645, "language_loss": 0.80714607, "learning_rate": 9.660167251962484e-07, "loss": 0.83258343, "num_input_tokens_seen": 122127385, "step": 5678, "time_per_iteration": 3.669218063354492 }, { "auxiliary_loss_clip": 0.0130269, "auxiliary_loss_mlp": 0.01193192, "balance_loss_clip": 1.00789523, "balance_loss_mlp": 1.00016832, "epoch": 0.6828593759393976, "flos": 21688799173920.0, "grad_norm": 1.6377893197279925, "language_loss": 0.77484345, "learning_rate": 9.653500113510654e-07, "loss": 0.79980224, "num_input_tokens_seen": 122146500, "step": 5679, "time_per_iteration": 2.781533718109131 }, { "auxiliary_loss_clip": 0.01316738, "auxiliary_loss_mlp": 0.01193467, "balance_loss_clip": 1.00783849, "balance_loss_mlp": 1.00025237, "epoch": 0.6829796188300367, "flos": 25337719310400.0, "grad_norm": 2.2773629199410252, "language_loss": 0.66916656, "learning_rate": 9.646834544614627e-07, "loss": 0.69426858, "num_input_tokens_seen": 122167000, "step": 5680, "time_per_iteration": 2.7781968116760254 }, { "auxiliary_loss_clip": 0.0131329, "auxiliary_loss_mlp": 0.01193335, "balance_loss_clip": 1.00853848, "balance_loss_mlp": 1.00021601, "epoch": 0.6830998617206757, "flos": 20704811064480.0, "grad_norm": 1.7871441837661808, "language_loss": 0.76482141, "learning_rate": 9.64017054628558e-07, "loss": 0.78988773, "num_input_tokens_seen": 122185825, "step": 5681, "time_per_iteration": 2.745668888092041 }, { "auxiliary_loss_clip": 0.0127485, "auxiliary_loss_mlp": 0.01193331, "balance_loss_clip": 1.00672936, "balance_loss_mlp": 1.00021172, "epoch": 0.6832201046113149, "flos": 21726649523520.0, "grad_norm": 1.598882675019518, "language_loss": 0.7900176, "learning_rate": 9.63350811953441e-07, "loss": 0.81469941, "num_input_tokens_seen": 122206200, "step": 5682, "time_per_iteration": 2.782313823699951 }, { "auxiliary_loss_clip": 0.01292568, "auxiliary_loss_mlp": 0.0119322, "balance_loss_clip": 1.00657809, "balance_loss_mlp": 1.00019598, "epoch": 0.6833403475019539, "flos": 19536564598560.0, "grad_norm": 1.872859991218627, "language_loss": 0.70579684, "learning_rate": 9.626847265371826e-07, "loss": 0.73065472, "num_input_tokens_seen": 122225520, "step": 5683, "time_per_iteration": 2.77982234954834 }, { "auxiliary_loss_clip": 0.0131727, "auxiliary_loss_mlp": 0.01193215, "balance_loss_clip": 1.00806069, "balance_loss_mlp": 1.00019097, "epoch": 0.683460590392593, "flos": 19352162547360.0, "grad_norm": 1.9072457414124209, "language_loss": 0.78815877, "learning_rate": 9.620187984808262e-07, "loss": 0.81326365, "num_input_tokens_seen": 122244320, "step": 5684, "time_per_iteration": 2.7831928730010986 }, { "auxiliary_loss_clip": 0.01306332, "auxiliary_loss_mlp": 0.00872469, "balance_loss_clip": 1.00711441, "balance_loss_mlp": 1.00034142, "epoch": 0.6835808332832322, "flos": 23288510145600.0, "grad_norm": 1.6518781221013834, "language_loss": 0.85937804, "learning_rate": 9.613530278853919e-07, "loss": 0.88116604, "num_input_tokens_seen": 122264295, "step": 5685, "time_per_iteration": 2.8630099296569824 }, { "auxiliary_loss_clip": 0.01327364, "auxiliary_loss_mlp": 0.01193465, "balance_loss_clip": 1.00809312, "balance_loss_mlp": 1.00025034, "epoch": 0.6837010761738712, "flos": 21653427558240.0, "grad_norm": 1.7224059045496467, "language_loss": 0.74531209, "learning_rate": 9.60687414851879e-07, "loss": 0.77052039, "num_input_tokens_seen": 122285300, "step": 5686, "time_per_iteration": 2.702833652496338 }, { "auxiliary_loss_clip": 0.01297352, "auxiliary_loss_mlp": 0.01193438, "balance_loss_clip": 1.00841939, "balance_loss_mlp": 1.00022328, "epoch": 0.6838213190645103, "flos": 17566397035200.0, "grad_norm": 1.9636241752857415, "language_loss": 0.7719413, "learning_rate": 9.600219594812575e-07, "loss": 0.79684913, "num_input_tokens_seen": 122303240, "step": 5687, "time_per_iteration": 2.7599377632141113 }, { "auxiliary_loss_clip": 0.01350306, "auxiliary_loss_mlp": 0.01193323, "balance_loss_clip": 1.00847709, "balance_loss_mlp": 1.00020361, "epoch": 0.6839415619551494, "flos": 23112550159200.0, "grad_norm": 1.5329437109632207, "language_loss": 0.72740817, "learning_rate": 9.593566618744786e-07, "loss": 0.75284439, "num_input_tokens_seen": 122323390, "step": 5688, "time_per_iteration": 2.646681785583496 }, { "auxiliary_loss_clip": 0.01350225, "auxiliary_loss_mlp": 0.01193318, "balance_loss_clip": 1.00818419, "balance_loss_mlp": 1.0001986, "epoch": 0.6840618048457885, "flos": 22127879499840.0, "grad_norm": 1.733571759898477, "language_loss": 0.73895407, "learning_rate": 9.58691522132466e-07, "loss": 0.76438951, "num_input_tokens_seen": 122342200, "step": 5689, "time_per_iteration": 2.8258278369903564 }, { "auxiliary_loss_clip": 0.01309126, "auxiliary_loss_mlp": 0.01193363, "balance_loss_clip": 1.00690711, "balance_loss_mlp": 1.00024414, "epoch": 0.6841820477364275, "flos": 22015909092960.0, "grad_norm": 1.8638139606008428, "language_loss": 0.85079753, "learning_rate": 9.58026540356123e-07, "loss": 0.87582242, "num_input_tokens_seen": 122360465, "step": 5690, "time_per_iteration": 2.7246859073638916 }, { "auxiliary_loss_clip": 0.01337786, "auxiliary_loss_mlp": 0.01193357, "balance_loss_clip": 1.00855374, "balance_loss_mlp": 1.00023806, "epoch": 0.6843022906270667, "flos": 24900542939520.0, "grad_norm": 1.6018440443831432, "language_loss": 0.86355841, "learning_rate": 9.573617166463246e-07, "loss": 0.88886982, "num_input_tokens_seen": 122381680, "step": 5691, "time_per_iteration": 2.7938828468322754 }, { "auxiliary_loss_clip": 0.01314903, "auxiliary_loss_mlp": 0.01193308, "balance_loss_clip": 1.00749409, "balance_loss_mlp": 1.00018859, "epoch": 0.6844225335177058, "flos": 19969933059360.0, "grad_norm": 1.8703939225398216, "language_loss": 0.59905267, "learning_rate": 9.56697051103924e-07, "loss": 0.62413478, "num_input_tokens_seen": 122399120, "step": 5692, "time_per_iteration": 2.706789970397949 }, { "auxiliary_loss_clip": 0.01312116, "auxiliary_loss_mlp": 0.01193208, "balance_loss_clip": 1.00766397, "balance_loss_mlp": 1.00018382, "epoch": 0.6845427764083448, "flos": 25883345567520.0, "grad_norm": 1.8487170432927529, "language_loss": 0.80905348, "learning_rate": 9.560325438297522e-07, "loss": 0.83410668, "num_input_tokens_seen": 122417430, "step": 5693, "time_per_iteration": 2.7869789600372314 }, { "auxiliary_loss_clip": 0.01309415, "auxiliary_loss_mlp": 0.01193367, "balance_loss_clip": 1.00828922, "balance_loss_mlp": 1.00015306, "epoch": 0.684663019298984, "flos": 18880153416000.0, "grad_norm": 1.9504085599555523, "language_loss": 0.8691994, "learning_rate": 9.553681949246127e-07, "loss": 0.89422721, "num_input_tokens_seen": 122435055, "step": 5694, "time_per_iteration": 2.7450788021087646 }, { "auxiliary_loss_clip": 0.01295355, "auxiliary_loss_mlp": 0.01193611, "balance_loss_clip": 1.00765169, "balance_loss_mlp": 1.00020611, "epoch": 0.684783262189623, "flos": 54193743960480.0, "grad_norm": 1.8056772597122264, "language_loss": 0.7549386, "learning_rate": 9.547040044892886e-07, "loss": 0.77982825, "num_input_tokens_seen": 122462570, "step": 5695, "time_per_iteration": 3.061098098754883 }, { "auxiliary_loss_clip": 0.01307979, "auxiliary_loss_mlp": 0.01193055, "balance_loss_clip": 1.00503504, "balance_loss_mlp": 1.00003099, "epoch": 0.6849035050802621, "flos": 63970295310720.0, "grad_norm": 0.8744332083984379, "language_loss": 0.60109639, "learning_rate": 9.540399726245354e-07, "loss": 0.62610674, "num_input_tokens_seen": 122519275, "step": 5696, "time_per_iteration": 3.199129343032837 }, { "auxiliary_loss_clip": 0.01320037, "auxiliary_loss_mlp": 0.01193349, "balance_loss_clip": 1.00806999, "balance_loss_mlp": 1.00023031, "epoch": 0.6850237479709013, "flos": 25224132337920.0, "grad_norm": 1.8708347028258776, "language_loss": 0.68681222, "learning_rate": 9.533760994310859e-07, "loss": 0.71194607, "num_input_tokens_seen": 122539675, "step": 5697, "time_per_iteration": 2.753901720046997 }, { "auxiliary_loss_clip": 0.01350899, "auxiliary_loss_mlp": 0.01193258, "balance_loss_clip": 1.00848949, "balance_loss_mlp": 1.00013876, "epoch": 0.6851439908615403, "flos": 19354138349760.0, "grad_norm": 1.9558669061138403, "language_loss": 0.75323445, "learning_rate": 9.527123850096508e-07, "loss": 0.77867603, "num_input_tokens_seen": 122558035, "step": 5698, "time_per_iteration": 2.680323839187622 }, { "auxiliary_loss_clip": 0.01322772, "auxiliary_loss_mlp": 0.01193199, "balance_loss_clip": 1.00813556, "balance_loss_mlp": 1.00017488, "epoch": 0.6852642337521794, "flos": 23182143832800.0, "grad_norm": 1.6484726818674773, "language_loss": 0.71319938, "learning_rate": 9.520488294609142e-07, "loss": 0.73835915, "num_input_tokens_seen": 122576815, "step": 5699, "time_per_iteration": 2.7304937839508057 }, { "auxiliary_loss_clip": 0.01250394, "auxiliary_loss_mlp": 0.01192279, "balance_loss_clip": 1.00426209, "balance_loss_mlp": 1.00001848, "epoch": 0.6853844766428185, "flos": 62647234696800.0, "grad_norm": 0.7393779925228857, "language_loss": 0.53848505, "learning_rate": 9.513854328855368e-07, "loss": 0.56291175, "num_input_tokens_seen": 122634690, "step": 5700, "time_per_iteration": 3.31331205368042 }, { "auxiliary_loss_clip": 0.01349607, "auxiliary_loss_mlp": 0.01193325, "balance_loss_clip": 1.00789642, "balance_loss_mlp": 1.00020623, "epoch": 0.6855047195334576, "flos": 23437253191680.0, "grad_norm": 1.8098016440287568, "language_loss": 0.81050897, "learning_rate": 9.507221953841558e-07, "loss": 0.83593827, "num_input_tokens_seen": 122652320, "step": 5701, "time_per_iteration": 5.5272088050842285 }, { "auxiliary_loss_clip": 0.01326419, "auxiliary_loss_mlp": 0.01193602, "balance_loss_clip": 1.00789297, "balance_loss_mlp": 1.00019693, "epoch": 0.6856249624240967, "flos": 20664841217760.0, "grad_norm": 1.5020126550126098, "language_loss": 0.77845311, "learning_rate": 9.500591170573824e-07, "loss": 0.80365336, "num_input_tokens_seen": 122672340, "step": 5702, "time_per_iteration": 2.7491416931152344 }, { "auxiliary_loss_clip": 0.01264933, "auxiliary_loss_mlp": 0.01193312, "balance_loss_clip": 1.00646865, "balance_loss_mlp": 1.00019336, "epoch": 0.6857452053147358, "flos": 17087310938880.0, "grad_norm": 1.9665150848957231, "language_loss": 0.74178082, "learning_rate": 9.493961980058078e-07, "loss": 0.76636326, "num_input_tokens_seen": 122689935, "step": 5703, "time_per_iteration": 2.792858600616455 }, { "auxiliary_loss_clip": 0.01258448, "auxiliary_loss_mlp": 0.01193383, "balance_loss_clip": 1.0065558, "balance_loss_mlp": 1.00026369, "epoch": 0.6858654482053749, "flos": 30847279413600.0, "grad_norm": 1.8694367448252596, "language_loss": 0.67781121, "learning_rate": 9.48733438329993e-07, "loss": 0.70232952, "num_input_tokens_seen": 122710200, "step": 5704, "time_per_iteration": 3.9938642978668213 }, { "auxiliary_loss_clip": 0.01349676, "auxiliary_loss_mlp": 0.00872419, "balance_loss_clip": 1.00838089, "balance_loss_mlp": 1.00036335, "epoch": 0.6859856910960139, "flos": 28877327392320.0, "grad_norm": 1.6594966980422972, "language_loss": 0.74465299, "learning_rate": 9.480708381304807e-07, "loss": 0.76687396, "num_input_tokens_seen": 122731495, "step": 5705, "time_per_iteration": 2.7576260566711426 }, { "auxiliary_loss_clip": 0.01258324, "auxiliary_loss_mlp": 0.0119317, "balance_loss_clip": 1.00605702, "balance_loss_mlp": 1.00014591, "epoch": 0.6861059339866531, "flos": 19354533510240.0, "grad_norm": 2.650114994605963, "language_loss": 0.83469248, "learning_rate": 9.474083975077858e-07, "loss": 0.85920739, "num_input_tokens_seen": 122748620, "step": 5706, "time_per_iteration": 2.7637083530426025 }, { "auxiliary_loss_clip": 0.01338684, "auxiliary_loss_mlp": 0.01193302, "balance_loss_clip": 1.00844252, "balance_loss_mlp": 1.00018311, "epoch": 0.6862261768772921, "flos": 22199987831040.0, "grad_norm": 2.024175012975832, "language_loss": 0.8003161, "learning_rate": 9.467461165623994e-07, "loss": 0.82563591, "num_input_tokens_seen": 122767670, "step": 5707, "time_per_iteration": 2.738234043121338 }, { "auxiliary_loss_clip": 0.01338634, "auxiliary_loss_mlp": 0.01193242, "balance_loss_clip": 1.00843287, "balance_loss_mlp": 1.00021875, "epoch": 0.6863464197679312, "flos": 26285689177920.0, "grad_norm": 1.843991197185414, "language_loss": 0.7948252, "learning_rate": 9.46083995394791e-07, "loss": 0.82014394, "num_input_tokens_seen": 122785480, "step": 5708, "time_per_iteration": 2.724726438522339 }, { "auxiliary_loss_clip": 0.01331807, "auxiliary_loss_mlp": 0.00872333, "balance_loss_clip": 1.00793982, "balance_loss_mlp": 1.00032735, "epoch": 0.6864666626585703, "flos": 37815243644160.0, "grad_norm": 1.9589182055609908, "language_loss": 0.63314998, "learning_rate": 9.454220341054012e-07, "loss": 0.6551913, "num_input_tokens_seen": 122810265, "step": 5709, "time_per_iteration": 2.8477306365966797 }, { "auxiliary_loss_clip": 0.01292586, "auxiliary_loss_mlp": 0.01193273, "balance_loss_clip": 1.00732791, "balance_loss_mlp": 1.00015366, "epoch": 0.6865869055492094, "flos": 19391162454720.0, "grad_norm": 1.9657148375591151, "language_loss": 0.80283332, "learning_rate": 9.447602327946512e-07, "loss": 0.82769191, "num_input_tokens_seen": 122828905, "step": 5710, "time_per_iteration": 2.728008508682251 }, { "auxiliary_loss_clip": 0.01328165, "auxiliary_loss_mlp": 0.01193552, "balance_loss_clip": 1.00862908, "balance_loss_mlp": 1.00024259, "epoch": 0.6867071484398485, "flos": 20375976808800.0, "grad_norm": 1.9314471142413852, "language_loss": 0.76419967, "learning_rate": 9.440985915629338e-07, "loss": 0.78941679, "num_input_tokens_seen": 122846235, "step": 5711, "time_per_iteration": 2.758073568344116 }, { "auxiliary_loss_clip": 0.01349082, "auxiliary_loss_mlp": 0.01193208, "balance_loss_clip": 1.00831676, "balance_loss_mlp": 1.0001843, "epoch": 0.6868273913304875, "flos": 15889153256640.0, "grad_norm": 1.8997957364030644, "language_loss": 0.73417485, "learning_rate": 9.434371105106223e-07, "loss": 0.75959772, "num_input_tokens_seen": 122863835, "step": 5712, "time_per_iteration": 2.730416774749756 }, { "auxiliary_loss_clip": 0.01294768, "auxiliary_loss_mlp": 0.01193317, "balance_loss_clip": 1.00689292, "balance_loss_mlp": 1.00019801, "epoch": 0.6869476342211267, "flos": 24462504400320.0, "grad_norm": 1.675381528480642, "language_loss": 0.70694888, "learning_rate": 9.427757897380602e-07, "loss": 0.7318297, "num_input_tokens_seen": 122883235, "step": 5713, "time_per_iteration": 2.763519287109375 }, { "auxiliary_loss_clip": 0.01278415, "auxiliary_loss_mlp": 0.01193401, "balance_loss_clip": 1.00649714, "balance_loss_mlp": 1.00018704, "epoch": 0.6870678771117658, "flos": 18442581884640.0, "grad_norm": 1.938577045819062, "language_loss": 0.84405816, "learning_rate": 9.421146293455695e-07, "loss": 0.86877632, "num_input_tokens_seen": 122898975, "step": 5714, "time_per_iteration": 2.847010612487793 }, { "auxiliary_loss_clip": 0.01326717, "auxiliary_loss_mlp": 0.01193403, "balance_loss_clip": 1.00917733, "balance_loss_mlp": 1.00018883, "epoch": 0.6871881200024048, "flos": 22200382991520.0, "grad_norm": 2.280005634778479, "language_loss": 0.68510938, "learning_rate": 9.414536294334489e-07, "loss": 0.71031052, "num_input_tokens_seen": 122918995, "step": 5715, "time_per_iteration": 2.772165298461914 }, { "auxiliary_loss_clip": 0.01326501, "auxiliary_loss_mlp": 0.01193376, "balance_loss_clip": 1.00793803, "balance_loss_mlp": 1.00016189, "epoch": 0.687308362893044, "flos": 22127735805120.0, "grad_norm": 1.679171564331574, "language_loss": 0.69665891, "learning_rate": 9.407927901019708e-07, "loss": 0.72185767, "num_input_tokens_seen": 122938125, "step": 5716, "time_per_iteration": 2.8823163509368896 }, { "auxiliary_loss_clip": 0.01332704, "auxiliary_loss_mlp": 0.01193219, "balance_loss_clip": 1.00783956, "balance_loss_mlp": 1.00019574, "epoch": 0.687428605783683, "flos": 25040556531360.0, "grad_norm": 2.6578603409210984, "language_loss": 0.7686795, "learning_rate": 9.401321114513854e-07, "loss": 0.79393876, "num_input_tokens_seen": 122957020, "step": 5717, "time_per_iteration": 2.706195831298828 }, { "auxiliary_loss_clip": 0.0135024, "auxiliary_loss_mlp": 0.0119345, "balance_loss_clip": 1.00851333, "balance_loss_mlp": 1.00023603, "epoch": 0.6875488486743221, "flos": 23770074975840.0, "grad_norm": 2.1626671513806777, "language_loss": 0.75268042, "learning_rate": 9.394715935819155e-07, "loss": 0.7781173, "num_input_tokens_seen": 122977410, "step": 5718, "time_per_iteration": 2.8136606216430664 }, { "auxiliary_loss_clip": 0.01338913, "auxiliary_loss_mlp": 0.01193337, "balance_loss_clip": 1.00858378, "balance_loss_mlp": 1.00021791, "epoch": 0.6876690915649613, "flos": 25516948351680.0, "grad_norm": 2.2186068421393146, "language_loss": 0.62362301, "learning_rate": 9.388112365937608e-07, "loss": 0.64894545, "num_input_tokens_seen": 122996875, "step": 5719, "time_per_iteration": 2.680290460586548 }, { "auxiliary_loss_clip": 0.01298517, "auxiliary_loss_mlp": 0.01193386, "balance_loss_clip": 1.00744033, "balance_loss_mlp": 1.00026679, "epoch": 0.6877893344556003, "flos": 19428006941280.0, "grad_norm": 2.255638456146077, "language_loss": 0.82565683, "learning_rate": 9.381510405870985e-07, "loss": 0.8505758, "num_input_tokens_seen": 123015890, "step": 5720, "time_per_iteration": 2.798929452896118 }, { "auxiliary_loss_clip": 0.01339716, "auxiliary_loss_mlp": 0.01193571, "balance_loss_clip": 1.00854051, "balance_loss_mlp": 1.00026155, "epoch": 0.6879095773462394, "flos": 18661313764800.0, "grad_norm": 2.0619599627150396, "language_loss": 0.77351832, "learning_rate": 9.374910056620791e-07, "loss": 0.79885113, "num_input_tokens_seen": 123034955, "step": 5721, "time_per_iteration": 2.6928114891052246 }, { "auxiliary_loss_clip": 0.01328451, "auxiliary_loss_mlp": 0.01193403, "balance_loss_clip": 1.00810814, "balance_loss_mlp": 1.00018883, "epoch": 0.6880298202368785, "flos": 20883142013760.0, "grad_norm": 1.7953878566169126, "language_loss": 0.80926567, "learning_rate": 9.368311319188293e-07, "loss": 0.83448422, "num_input_tokens_seen": 123052770, "step": 5722, "time_per_iteration": 2.6499435901641846 }, { "auxiliary_loss_clip": 0.01290682, "auxiliary_loss_mlp": 0.01193326, "balance_loss_clip": 1.00723326, "balance_loss_mlp": 1.00020659, "epoch": 0.6881500631275176, "flos": 30153305270880.0, "grad_norm": 1.6310575303151282, "language_loss": 0.79286909, "learning_rate": 9.361714194574515e-07, "loss": 0.81770921, "num_input_tokens_seen": 123075105, "step": 5723, "time_per_iteration": 2.849773406982422 }, { "auxiliary_loss_clip": 0.01319559, "auxiliary_loss_mlp": 0.0119231, "balance_loss_clip": 1.00500154, "balance_loss_mlp": 1.00004888, "epoch": 0.6882703060181566, "flos": 66181575206880.0, "grad_norm": 0.7402642029279115, "language_loss": 0.58310354, "learning_rate": 9.355118683780228e-07, "loss": 0.60822225, "num_input_tokens_seen": 123145175, "step": 5724, "time_per_iteration": 3.3406941890716553 }, { "auxiliary_loss_clip": 0.01350139, "auxiliary_loss_mlp": 0.01193259, "balance_loss_clip": 1.00807166, "balance_loss_mlp": 1.0002358, "epoch": 0.6883905489087958, "flos": 18214653542400.0, "grad_norm": 2.2834753378583197, "language_loss": 0.79039955, "learning_rate": 9.348524787805987e-07, "loss": 0.81583351, "num_input_tokens_seen": 123160365, "step": 5725, "time_per_iteration": 2.6632163524627686 }, { "auxiliary_loss_clip": 0.01308365, "auxiliary_loss_mlp": 0.01193304, "balance_loss_clip": 1.00776422, "balance_loss_mlp": 1.00018454, "epoch": 0.6885107917994349, "flos": 14056269085440.0, "grad_norm": 3.099975109904069, "language_loss": 0.85003877, "learning_rate": 9.341932507652053e-07, "loss": 0.87505543, "num_input_tokens_seen": 123174855, "step": 5726, "time_per_iteration": 3.760735034942627 }, { "auxiliary_loss_clip": 0.01326338, "auxiliary_loss_mlp": 0.01193682, "balance_loss_clip": 1.00807202, "balance_loss_mlp": 1.00027716, "epoch": 0.6886310346900739, "flos": 28690733996640.0, "grad_norm": 1.797411694325817, "language_loss": 0.78685522, "learning_rate": 9.335341844318489e-07, "loss": 0.81205547, "num_input_tokens_seen": 123194995, "step": 5727, "time_per_iteration": 3.7475333213806152 }, { "auxiliary_loss_clip": 0.01304973, "auxiliary_loss_mlp": 0.01193332, "balance_loss_clip": 1.00724959, "balance_loss_mlp": 1.00021255, "epoch": 0.6887512775807131, "flos": 24535331205120.0, "grad_norm": 1.7498524425184923, "language_loss": 0.73069108, "learning_rate": 9.328752798805091e-07, "loss": 0.75567412, "num_input_tokens_seen": 123213465, "step": 5728, "time_per_iteration": 2.80676531791687 }, { "auxiliary_loss_clip": 0.01326817, "auxiliary_loss_mlp": 0.01193462, "balance_loss_clip": 1.00778985, "balance_loss_mlp": 1.00024819, "epoch": 0.6888715204713521, "flos": 22414372945920.0, "grad_norm": 2.1246693963648706, "language_loss": 0.76289415, "learning_rate": 9.322165372111399e-07, "loss": 0.78809696, "num_input_tokens_seen": 123231610, "step": 5729, "time_per_iteration": 2.7031407356262207 }, { "auxiliary_loss_clip": 0.01279805, "auxiliary_loss_mlp": 0.01193259, "balance_loss_clip": 1.00663209, "balance_loss_mlp": 1.00023556, "epoch": 0.6889917633619912, "flos": 22054334221440.0, "grad_norm": 2.0613131557907867, "language_loss": 0.7536999, "learning_rate": 9.315579565236747e-07, "loss": 0.77843052, "num_input_tokens_seen": 123250715, "step": 5730, "time_per_iteration": 2.807718276977539 }, { "auxiliary_loss_clip": 0.01301347, "auxiliary_loss_mlp": 0.0119346, "balance_loss_clip": 1.00759721, "balance_loss_mlp": 1.00024617, "epoch": 0.6891120062526304, "flos": 23949735101280.0, "grad_norm": 1.817321335747524, "language_loss": 0.7402457, "learning_rate": 9.308995379180162e-07, "loss": 0.76519382, "num_input_tokens_seen": 123270270, "step": 5731, "time_per_iteration": 3.6566803455352783 }, { "auxiliary_loss_clip": 0.01307467, "auxiliary_loss_mlp": 0.01192295, "balance_loss_clip": 1.00502813, "balance_loss_mlp": 1.00003433, "epoch": 0.6892322491432694, "flos": 64117385867520.0, "grad_norm": 0.8656271002629315, "language_loss": 0.5956341, "learning_rate": 9.302412814940488e-07, "loss": 0.62063175, "num_input_tokens_seen": 123333045, "step": 5732, "time_per_iteration": 3.271763563156128 }, { "auxiliary_loss_clip": 0.01326556, "auxiliary_loss_mlp": 0.0119351, "balance_loss_clip": 1.00860834, "balance_loss_mlp": 1.00020027, "epoch": 0.6893524920339085, "flos": 23002447783680.0, "grad_norm": 1.9371889672340883, "language_loss": 0.70992601, "learning_rate": 9.295831873516276e-07, "loss": 0.73512673, "num_input_tokens_seen": 123352320, "step": 5733, "time_per_iteration": 2.8235092163085938 }, { "auxiliary_loss_clip": 0.01349824, "auxiliary_loss_mlp": 0.01193349, "balance_loss_clip": 1.00834703, "balance_loss_mlp": 1.00023031, "epoch": 0.6894727349245476, "flos": 21396270549600.0, "grad_norm": 1.533176753049321, "language_loss": 0.76214272, "learning_rate": 9.289252555905873e-07, "loss": 0.78757441, "num_input_tokens_seen": 123372400, "step": 5734, "time_per_iteration": 2.6329665184020996 }, { "auxiliary_loss_clip": 0.01327372, "auxiliary_loss_mlp": 0.01193547, "balance_loss_clip": 1.00817823, "balance_loss_mlp": 1.00023723, "epoch": 0.6895929778151867, "flos": 19865327006880.0, "grad_norm": 2.0247682661500472, "language_loss": 0.75889683, "learning_rate": 9.282674863107334e-07, "loss": 0.78410608, "num_input_tokens_seen": 123390215, "step": 5735, "time_per_iteration": 2.746272087097168 }, { "auxiliary_loss_clip": 0.01323753, "auxiliary_loss_mlp": 0.01193383, "balance_loss_clip": 1.00773871, "balance_loss_mlp": 1.00016832, "epoch": 0.6897132207058257, "flos": 18179174155680.0, "grad_norm": 2.0622863418391737, "language_loss": 0.7636984, "learning_rate": 9.276098796118488e-07, "loss": 0.78886974, "num_input_tokens_seen": 123406700, "step": 5736, "time_per_iteration": 2.7198193073272705 }, { "auxiliary_loss_clip": 0.01308708, "auxiliary_loss_mlp": 0.0119355, "balance_loss_clip": 1.00778365, "balance_loss_mlp": 1.00024056, "epoch": 0.6898334635964649, "flos": 32561655068160.0, "grad_norm": 1.7013988319199391, "language_loss": 0.66216278, "learning_rate": 9.269524355936938e-07, "loss": 0.68718541, "num_input_tokens_seen": 123429880, "step": 5737, "time_per_iteration": 2.83525013923645 }, { "auxiliary_loss_clip": 0.01319513, "auxiliary_loss_mlp": 0.01193227, "balance_loss_clip": 1.00762916, "balance_loss_mlp": 1.00020361, "epoch": 0.689953706487104, "flos": 22819015671840.0, "grad_norm": 1.6649584154363712, "language_loss": 0.84638959, "learning_rate": 9.262951543560002e-07, "loss": 0.871517, "num_input_tokens_seen": 123449105, "step": 5738, "time_per_iteration": 2.778029680252075 }, { "auxiliary_loss_clip": 0.01303017, "auxiliary_loss_mlp": 0.01193326, "balance_loss_clip": 1.00809979, "balance_loss_mlp": 1.00020647, "epoch": 0.690073949377743, "flos": 18515372765760.0, "grad_norm": 2.303186153564035, "language_loss": 0.86300641, "learning_rate": 9.256380359984795e-07, "loss": 0.88796991, "num_input_tokens_seen": 123466215, "step": 5739, "time_per_iteration": 2.762810707092285 }, { "auxiliary_loss_clip": 0.01297145, "auxiliary_loss_mlp": 0.01193382, "balance_loss_clip": 1.00792265, "balance_loss_mlp": 1.00016773, "epoch": 0.6901941922683821, "flos": 34857208213920.0, "grad_norm": 1.7105740789869186, "language_loss": 0.74449408, "learning_rate": 9.249810806208139e-07, "loss": 0.7693994, "num_input_tokens_seen": 123485480, "step": 5740, "time_per_iteration": 3.0927722454071045 }, { "auxiliary_loss_clip": 0.01290388, "auxiliary_loss_mlp": 0.00872433, "balance_loss_clip": 1.00722885, "balance_loss_mlp": 1.00032878, "epoch": 0.6903144351590212, "flos": 16253682441120.0, "grad_norm": 1.9184766870503847, "language_loss": 0.80404162, "learning_rate": 9.243242883226627e-07, "loss": 0.82566988, "num_input_tokens_seen": 123504575, "step": 5741, "time_per_iteration": 2.7758474349975586 }, { "auxiliary_loss_clip": 0.01338471, "auxiliary_loss_mlp": 0.01193357, "balance_loss_clip": 1.00788033, "balance_loss_mlp": 1.00023842, "epoch": 0.6904346780496603, "flos": 28035149058720.0, "grad_norm": 1.9444604171704, "language_loss": 0.69400871, "learning_rate": 9.236676592036628e-07, "loss": 0.71932697, "num_input_tokens_seen": 123524250, "step": 5742, "time_per_iteration": 2.7640020847320557 }, { "auxiliary_loss_clip": 0.01302415, "auxiliary_loss_mlp": 0.011933, "balance_loss_clip": 1.00754976, "balance_loss_mlp": 1.00018072, "epoch": 0.6905549209402994, "flos": 23624277671520.0, "grad_norm": 1.5832030607374956, "language_loss": 0.73595721, "learning_rate": 9.230111933634228e-07, "loss": 0.76091433, "num_input_tokens_seen": 123545845, "step": 5743, "time_per_iteration": 2.776296854019165 }, { "auxiliary_loss_clip": 0.01329544, "auxiliary_loss_mlp": 0.0119338, "balance_loss_clip": 1.00798285, "balance_loss_mlp": 1.00026143, "epoch": 0.6906751638309385, "flos": 23114957045760.0, "grad_norm": 1.708771571173642, "language_loss": 0.80808687, "learning_rate": 9.223548909015288e-07, "loss": 0.83331615, "num_input_tokens_seen": 123567535, "step": 5744, "time_per_iteration": 2.7510766983032227 }, { "auxiliary_loss_clip": 0.01271916, "auxiliary_loss_mlp": 0.01193262, "balance_loss_clip": 1.00700641, "balance_loss_mlp": 1.00023866, "epoch": 0.6907954067215776, "flos": 27305479987200.0, "grad_norm": 1.7543638736544749, "language_loss": 0.72004247, "learning_rate": 9.216987519175407e-07, "loss": 0.74469423, "num_input_tokens_seen": 123587710, "step": 5745, "time_per_iteration": 2.8816564083099365 }, { "auxiliary_loss_clip": 0.01323629, "auxiliary_loss_mlp": 0.01193248, "balance_loss_clip": 1.0079298, "balance_loss_mlp": 1.00022483, "epoch": 0.6909156496122166, "flos": 21689409876480.0, "grad_norm": 1.7699735985595453, "language_loss": 0.68962288, "learning_rate": 9.210427765109942e-07, "loss": 0.7147916, "num_input_tokens_seen": 123607385, "step": 5746, "time_per_iteration": 2.6924986839294434 }, { "auxiliary_loss_clip": 0.01313836, "auxiliary_loss_mlp": 0.0119373, "balance_loss_clip": 1.00819397, "balance_loss_mlp": 1.00022912, "epoch": 0.6910358925028558, "flos": 22561463502720.0, "grad_norm": 1.8353038667906212, "language_loss": 0.81337583, "learning_rate": 9.20386964781402e-07, "loss": 0.83845145, "num_input_tokens_seen": 123625405, "step": 5747, "time_per_iteration": 2.786512851715088 }, { "auxiliary_loss_clip": 0.01320734, "auxiliary_loss_mlp": 0.01193376, "balance_loss_clip": 1.00804949, "balance_loss_mlp": 1.00025654, "epoch": 0.6911561353934949, "flos": 22054118679360.0, "grad_norm": 2.290443161136155, "language_loss": 0.84444088, "learning_rate": 9.197313168282472e-07, "loss": 0.86958194, "num_input_tokens_seen": 123642850, "step": 5748, "time_per_iteration": 2.810654401779175 }, { "auxiliary_loss_clip": 0.01338298, "auxiliary_loss_mlp": 0.01193235, "balance_loss_clip": 1.00820136, "balance_loss_mlp": 1.0002116, "epoch": 0.6912763782841339, "flos": 24206568796800.0, "grad_norm": 2.053378634844732, "language_loss": 0.72134852, "learning_rate": 9.190758327509935e-07, "loss": 0.74666381, "num_input_tokens_seen": 123661595, "step": 5749, "time_per_iteration": 2.7507755756378174 }, { "auxiliary_loss_clip": 0.01248407, "auxiliary_loss_mlp": 0.00871888, "balance_loss_clip": 1.00494027, "balance_loss_mlp": 0.99991161, "epoch": 0.6913966211747731, "flos": 52329668529600.0, "grad_norm": 0.9219468977795571, "language_loss": 0.64478993, "learning_rate": 9.184205126490767e-07, "loss": 0.66599292, "num_input_tokens_seen": 123710490, "step": 5750, "time_per_iteration": 3.1517693996429443 }, { "auxiliary_loss_clip": 0.01272298, "auxiliary_loss_mlp": 0.00871949, "balance_loss_clip": 1.00446653, "balance_loss_mlp": 0.99993771, "epoch": 0.6915168640654121, "flos": 66741306261120.0, "grad_norm": 1.0956007587347374, "language_loss": 0.59571791, "learning_rate": 9.177653566219075e-07, "loss": 0.61716044, "num_input_tokens_seen": 123765215, "step": 5751, "time_per_iteration": 3.221247673034668 }, { "auxiliary_loss_clip": 0.01307954, "auxiliary_loss_mlp": 0.01193284, "balance_loss_clip": 1.00770557, "balance_loss_mlp": 1.00016475, "epoch": 0.6916371069560512, "flos": 18296533114560.0, "grad_norm": 2.0916203583094597, "language_loss": 0.76281893, "learning_rate": 9.171103647688744e-07, "loss": 0.78783137, "num_input_tokens_seen": 123783955, "step": 5752, "time_per_iteration": 2.7934329509735107 }, { "auxiliary_loss_clip": 0.01241842, "auxiliary_loss_mlp": 0.01193309, "balance_loss_clip": 1.00698447, "balance_loss_mlp": 1.00028563, "epoch": 0.6917573498466904, "flos": 19645804805760.0, "grad_norm": 1.676832240087921, "language_loss": 0.69299817, "learning_rate": 9.164555371893367e-07, "loss": 0.71734971, "num_input_tokens_seen": 123803885, "step": 5753, "time_per_iteration": 4.975799083709717 }, { "auxiliary_loss_clip": 0.01328518, "auxiliary_loss_mlp": 0.00872399, "balance_loss_clip": 1.00795484, "balance_loss_mlp": 1.00034559, "epoch": 0.6918775927373294, "flos": 14210328836160.0, "grad_norm": 1.9321403968407376, "language_loss": 0.75140381, "learning_rate": 9.158008739826333e-07, "loss": 0.77341306, "num_input_tokens_seen": 123821485, "step": 5754, "time_per_iteration": 2.893805503845215 }, { "auxiliary_loss_clip": 0.01301176, "auxiliary_loss_mlp": 0.01193525, "balance_loss_clip": 1.00744486, "balance_loss_mlp": 1.00031018, "epoch": 0.6919978356279685, "flos": 23985465953760.0, "grad_norm": 1.5633320636246726, "language_loss": 0.86565185, "learning_rate": 9.151463752480744e-07, "loss": 0.89059883, "num_input_tokens_seen": 123840215, "step": 5755, "time_per_iteration": 2.8179633617401123 }, { "auxiliary_loss_clip": 0.01290035, "auxiliary_loss_mlp": 0.01193238, "balance_loss_clip": 1.00717282, "balance_loss_mlp": 1.00021458, "epoch": 0.6921180785186076, "flos": 23622948495360.0, "grad_norm": 1.3301601295648027, "language_loss": 0.80305028, "learning_rate": 9.144920410849493e-07, "loss": 0.82788301, "num_input_tokens_seen": 123861450, "step": 5756, "time_per_iteration": 2.8093929290771484 }, { "auxiliary_loss_clip": 0.01319565, "auxiliary_loss_mlp": 0.01193246, "balance_loss_clip": 1.00840402, "balance_loss_mlp": 1.00022197, "epoch": 0.6922383214092467, "flos": 21142634061600.0, "grad_norm": 1.6690193831234525, "language_loss": 0.8002286, "learning_rate": 9.138378715925176e-07, "loss": 0.82535672, "num_input_tokens_seen": 123880545, "step": 5757, "time_per_iteration": 4.12559962272644 }, { "auxiliary_loss_clip": 0.01315297, "auxiliary_loss_mlp": 0.0119319, "balance_loss_clip": 1.0074079, "balance_loss_mlp": 1.00016618, "epoch": 0.6923585642998857, "flos": 21470677996320.0, "grad_norm": 1.6213370496188642, "language_loss": 0.80973732, "learning_rate": 9.131838668700167e-07, "loss": 0.83482218, "num_input_tokens_seen": 123900615, "step": 5758, "time_per_iteration": 2.8909707069396973 }, { "auxiliary_loss_clip": 0.01314375, "auxiliary_loss_mlp": 0.01193381, "balance_loss_clip": 1.00880611, "balance_loss_mlp": 1.00016618, "epoch": 0.6924788071905249, "flos": 21105214796160.0, "grad_norm": 1.6616607923024056, "language_loss": 0.86302334, "learning_rate": 9.125300270166598e-07, "loss": 0.88810086, "num_input_tokens_seen": 123921220, "step": 5759, "time_per_iteration": 2.8713269233703613 }, { "auxiliary_loss_clip": 0.01292552, "auxiliary_loss_mlp": 0.01193195, "balance_loss_clip": 1.00750768, "balance_loss_mlp": 1.00017178, "epoch": 0.692599050081164, "flos": 26250030172800.0, "grad_norm": 1.6417106459387112, "language_loss": 0.85922468, "learning_rate": 9.118763521316324e-07, "loss": 0.88408214, "num_input_tokens_seen": 123941795, "step": 5760, "time_per_iteration": 2.877253293991089 }, { "auxiliary_loss_clip": 0.01350347, "auxiliary_loss_mlp": 0.00872487, "balance_loss_clip": 1.00786662, "balance_loss_mlp": 1.00032485, "epoch": 0.692719292971803, "flos": 20885225587200.0, "grad_norm": 1.582854040230288, "language_loss": 0.76296639, "learning_rate": 9.112228423140987e-07, "loss": 0.78519475, "num_input_tokens_seen": 123960715, "step": 5761, "time_per_iteration": 2.718536138534546 }, { "auxiliary_loss_clip": 0.01315174, "auxiliary_loss_mlp": 0.01193427, "balance_loss_clip": 1.00746703, "balance_loss_mlp": 1.00021243, "epoch": 0.6928395358624422, "flos": 25921950314400.0, "grad_norm": 2.1967270235133984, "language_loss": 0.85958284, "learning_rate": 9.105694976631932e-07, "loss": 0.88466883, "num_input_tokens_seen": 123978625, "step": 5762, "time_per_iteration": 2.799988031387329 }, { "auxiliary_loss_clip": 0.01329481, "auxiliary_loss_mlp": 0.0119332, "balance_loss_clip": 1.0081377, "balance_loss_mlp": 1.00020134, "epoch": 0.6929597787530812, "flos": 23586571016640.0, "grad_norm": 1.9413675631808212, "language_loss": 0.72654152, "learning_rate": 9.099163182780283e-07, "loss": 0.75176954, "num_input_tokens_seen": 123996780, "step": 5763, "time_per_iteration": 2.7828006744384766 }, { "auxiliary_loss_clip": 0.01303399, "auxiliary_loss_mlp": 0.01193225, "balance_loss_clip": 1.00747025, "balance_loss_mlp": 1.00020146, "epoch": 0.6930800216437203, "flos": 18255665175840.0, "grad_norm": 2.5018097983012413, "language_loss": 0.48889315, "learning_rate": 9.092633042576916e-07, "loss": 0.51385939, "num_input_tokens_seen": 124014045, "step": 5764, "time_per_iteration": 2.7202486991882324 }, { "auxiliary_loss_clip": 0.01303815, "auxiliary_loss_mlp": 0.01193203, "balance_loss_clip": 1.00712657, "balance_loss_mlp": 1.00017905, "epoch": 0.6932002645343595, "flos": 29168634611520.0, "grad_norm": 1.7050309024507693, "language_loss": 0.56337631, "learning_rate": 9.086104557012446e-07, "loss": 0.58834648, "num_input_tokens_seen": 124034615, "step": 5765, "time_per_iteration": 2.833312749862671 }, { "auxiliary_loss_clip": 0.0133748, "auxiliary_loss_mlp": 0.01193313, "balance_loss_clip": 1.00798166, "balance_loss_mlp": 1.0001936, "epoch": 0.6933205074249985, "flos": 23842758085920.0, "grad_norm": 1.863480218277834, "language_loss": 0.65445936, "learning_rate": 9.079577727077239e-07, "loss": 0.67976737, "num_input_tokens_seen": 124053445, "step": 5766, "time_per_iteration": 2.7228102684020996 }, { "auxiliary_loss_clip": 0.01326523, "auxiliary_loss_mlp": 0.01193453, "balance_loss_clip": 1.00788236, "balance_loss_mlp": 1.00023818, "epoch": 0.6934407503156376, "flos": 24166706721120.0, "grad_norm": 2.093595258722556, "language_loss": 0.71800256, "learning_rate": 9.073052553761404e-07, "loss": 0.74320227, "num_input_tokens_seen": 124072810, "step": 5767, "time_per_iteration": 2.7473883628845215 }, { "auxiliary_loss_clip": 0.01285016, "auxiliary_loss_mlp": 0.01193531, "balance_loss_clip": 1.0079577, "balance_loss_mlp": 1.00022078, "epoch": 0.6935609932062767, "flos": 20631337633440.0, "grad_norm": 1.6596040797443847, "language_loss": 0.78038448, "learning_rate": 9.066529038054805e-07, "loss": 0.80516994, "num_input_tokens_seen": 124092875, "step": 5768, "time_per_iteration": 2.9836690425872803 }, { "auxiliary_loss_clip": 0.01316118, "auxiliary_loss_mlp": 0.01193264, "balance_loss_clip": 1.00729358, "balance_loss_mlp": 1.0002408, "epoch": 0.6936812360969158, "flos": 18254192304960.0, "grad_norm": 1.9009807969843906, "language_loss": 0.7383284, "learning_rate": 9.060007180947071e-07, "loss": 0.76342213, "num_input_tokens_seen": 124110930, "step": 5769, "time_per_iteration": 2.8018271923065186 }, { "auxiliary_loss_clip": 0.01277516, "auxiliary_loss_mlp": 0.0119349, "balance_loss_clip": 1.00798333, "balance_loss_mlp": 1.00018072, "epoch": 0.6938014789875548, "flos": 31317348666240.0, "grad_norm": 1.967211419490774, "language_loss": 0.73129392, "learning_rate": 9.053486983427534e-07, "loss": 0.75600398, "num_input_tokens_seen": 124132180, "step": 5770, "time_per_iteration": 2.90212345123291 }, { "auxiliary_loss_clip": 0.01320313, "auxiliary_loss_mlp": 0.01193281, "balance_loss_clip": 1.00751781, "balance_loss_mlp": 1.00016165, "epoch": 0.6939217218781939, "flos": 17528438914560.0, "grad_norm": 1.8141786717361128, "language_loss": 0.7076056, "learning_rate": 9.046968446485326e-07, "loss": 0.73274148, "num_input_tokens_seen": 124150585, "step": 5771, "time_per_iteration": 2.7802581787109375 }, { "auxiliary_loss_clip": 0.01330696, "auxiliary_loss_mlp": 0.01193375, "balance_loss_clip": 1.0083375, "balance_loss_mlp": 1.00025558, "epoch": 0.6940419647688331, "flos": 18551786168160.0, "grad_norm": 2.1059918259976467, "language_loss": 0.70687652, "learning_rate": 9.040451571109295e-07, "loss": 0.73211724, "num_input_tokens_seen": 124166205, "step": 5772, "time_per_iteration": 2.6992673873901367 }, { "auxiliary_loss_clip": 0.01269861, "auxiliary_loss_mlp": 0.0119232, "balance_loss_clip": 1.00787485, "balance_loss_mlp": 1.0000596, "epoch": 0.6941622076594721, "flos": 66926319014880.0, "grad_norm": 0.8339343495692636, "language_loss": 0.60465866, "learning_rate": 9.033936358288042e-07, "loss": 0.62928051, "num_input_tokens_seen": 124219940, "step": 5773, "time_per_iteration": 3.2459146976470947 }, { "auxiliary_loss_clip": 0.01350237, "auxiliary_loss_mlp": 0.01193421, "balance_loss_clip": 1.00813985, "balance_loss_mlp": 1.00020695, "epoch": 0.6942824505501112, "flos": 26578074107520.0, "grad_norm": 1.6444708811660174, "language_loss": 0.82380176, "learning_rate": 9.027422809009937e-07, "loss": 0.84923834, "num_input_tokens_seen": 124239885, "step": 5774, "time_per_iteration": 2.8047871589660645 }, { "auxiliary_loss_clip": 0.01338217, "auxiliary_loss_mlp": 0.01193235, "balance_loss_clip": 1.00789559, "balance_loss_mlp": 1.00021148, "epoch": 0.6944026934407503, "flos": 21248317824480.0, "grad_norm": 1.5045738703428044, "language_loss": 0.8280977, "learning_rate": 9.020910924263054e-07, "loss": 0.85341227, "num_input_tokens_seen": 124258410, "step": 5775, "time_per_iteration": 2.7513620853424072 }, { "auxiliary_loss_clip": 0.01269421, "auxiliary_loss_mlp": 0.0119234, "balance_loss_clip": 1.00764358, "balance_loss_mlp": 1.0000788, "epoch": 0.6945229363313894, "flos": 70677222775200.0, "grad_norm": 0.8164221567184302, "language_loss": 0.58177328, "learning_rate": 9.014400705035261e-07, "loss": 0.60639095, "num_input_tokens_seen": 124315315, "step": 5776, "time_per_iteration": 3.3422329425811768 }, { "auxiliary_loss_clip": 0.01350008, "auxiliary_loss_mlp": 0.01193315, "balance_loss_clip": 1.00887322, "balance_loss_mlp": 1.00019574, "epoch": 0.6946431792220285, "flos": 18952944297120.0, "grad_norm": 2.0432350011586204, "language_loss": 0.76942062, "learning_rate": 9.00789215231414e-07, "loss": 0.79485381, "num_input_tokens_seen": 124333710, "step": 5777, "time_per_iteration": 2.7204997539520264 }, { "auxiliary_loss_clip": 0.01314997, "auxiliary_loss_mlp": 0.00872498, "balance_loss_clip": 1.00822926, "balance_loss_mlp": 1.00057638, "epoch": 0.6947634221126676, "flos": 20338844932800.0, "grad_norm": 1.679315540947963, "language_loss": 0.81711221, "learning_rate": 9.001385267087056e-07, "loss": 0.83898717, "num_input_tokens_seen": 124352855, "step": 5778, "time_per_iteration": 2.8304975032806396 }, { "auxiliary_loss_clip": 0.01332972, "auxiliary_loss_mlp": 0.01193351, "balance_loss_clip": 1.00777435, "balance_loss_mlp": 1.00023222, "epoch": 0.6948836650033067, "flos": 21833734309920.0, "grad_norm": 1.7272376655964998, "language_loss": 0.70357889, "learning_rate": 8.994880050341072e-07, "loss": 0.72884214, "num_input_tokens_seen": 124372960, "step": 5779, "time_per_iteration": 4.726563453674316 }, { "auxiliary_loss_clip": 0.01327244, "auxiliary_loss_mlp": 0.01193467, "balance_loss_clip": 1.00895023, "balance_loss_mlp": 1.00025272, "epoch": 0.6950039078939457, "flos": 23657529790080.0, "grad_norm": 1.6378944569302305, "language_loss": 0.77644074, "learning_rate": 8.988376503063026e-07, "loss": 0.80164778, "num_input_tokens_seen": 124394220, "step": 5780, "time_per_iteration": 2.8812096118927 }, { "auxiliary_loss_clip": 0.01277718, "auxiliary_loss_mlp": 0.01193627, "balance_loss_clip": 1.00651407, "balance_loss_mlp": 1.00031745, "epoch": 0.6951241507845849, "flos": 21792471210720.0, "grad_norm": 1.8703053697399674, "language_loss": 0.82144952, "learning_rate": 8.981874626239521e-07, "loss": 0.84616297, "num_input_tokens_seen": 124412795, "step": 5781, "time_per_iteration": 2.7885918617248535 }, { "auxiliary_loss_clip": 0.01326566, "auxiliary_loss_mlp": 0.01193349, "balance_loss_clip": 1.00802338, "balance_loss_mlp": 1.00022995, "epoch": 0.695244393675224, "flos": 14647577054400.0, "grad_norm": 1.8616566042215257, "language_loss": 0.88480204, "learning_rate": 8.975374420856872e-07, "loss": 0.91000128, "num_input_tokens_seen": 124429690, "step": 5782, "time_per_iteration": 2.7312779426574707 }, { "auxiliary_loss_clip": 0.01301903, "auxiliary_loss_mlp": 0.01193315, "balance_loss_clip": 1.00764489, "balance_loss_mlp": 1.00019598, "epoch": 0.695364636565863, "flos": 16873213213440.0, "grad_norm": 1.907780386365884, "language_loss": 0.72862768, "learning_rate": 8.968875887901157e-07, "loss": 0.75357985, "num_input_tokens_seen": 124447070, "step": 5783, "time_per_iteration": 3.811697006225586 }, { "auxiliary_loss_clip": 0.01325946, "auxiliary_loss_mlp": 0.01193329, "balance_loss_clip": 1.0086844, "balance_loss_mlp": 1.00021005, "epoch": 0.6954848794565022, "flos": 19354533510240.0, "grad_norm": 2.351677867649018, "language_loss": 0.62624836, "learning_rate": 8.9623790283582e-07, "loss": 0.6514411, "num_input_tokens_seen": 124464950, "step": 5784, "time_per_iteration": 2.729240655899048 }, { "auxiliary_loss_clip": 0.01297138, "auxiliary_loss_mlp": 0.01193223, "balance_loss_clip": 1.00727141, "balance_loss_mlp": 1.0001992, "epoch": 0.6956051223471412, "flos": 18990219867840.0, "grad_norm": 1.9557350179870774, "language_loss": 0.76565003, "learning_rate": 8.955883843213561e-07, "loss": 0.79055369, "num_input_tokens_seen": 124483965, "step": 5785, "time_per_iteration": 2.8388001918792725 }, { "auxiliary_loss_clip": 0.01337504, "auxiliary_loss_mlp": 0.01193621, "balance_loss_clip": 1.00824142, "balance_loss_mlp": 1.00031161, "epoch": 0.6957253652377803, "flos": 16107238510560.0, "grad_norm": 2.5358126837958097, "language_loss": 0.87033439, "learning_rate": 8.949390333452569e-07, "loss": 0.8956455, "num_input_tokens_seen": 124501910, "step": 5786, "time_per_iteration": 2.716521978378296 }, { "auxiliary_loss_clip": 0.01350055, "auxiliary_loss_mlp": 0.01193237, "balance_loss_clip": 1.00871325, "balance_loss_mlp": 1.00021338, "epoch": 0.6958456081284194, "flos": 29388659744160.0, "grad_norm": 1.7280128283012735, "language_loss": 0.67588615, "learning_rate": 8.942898500060279e-07, "loss": 0.7013191, "num_input_tokens_seen": 124521625, "step": 5787, "time_per_iteration": 2.793126106262207 }, { "auxiliary_loss_clip": 0.01265858, "auxiliary_loss_mlp": 0.0119337, "balance_loss_clip": 1.00776219, "balance_loss_mlp": 1.00025058, "epoch": 0.6959658510190585, "flos": 25154861977440.0, "grad_norm": 2.2222619013537046, "language_loss": 0.71963811, "learning_rate": 8.936408344021493e-07, "loss": 0.74423033, "num_input_tokens_seen": 124538540, "step": 5788, "time_per_iteration": 2.8485991954803467 }, { "auxiliary_loss_clip": 0.01302442, "auxiliary_loss_mlp": 0.01193667, "balance_loss_clip": 1.00827193, "balance_loss_mlp": 1.00026202, "epoch": 0.6960860939096976, "flos": 42814405411200.0, "grad_norm": 2.0389794344379712, "language_loss": 0.71172839, "learning_rate": 8.929919866320765e-07, "loss": 0.73668951, "num_input_tokens_seen": 124559355, "step": 5789, "time_per_iteration": 2.929234504699707 }, { "auxiliary_loss_clip": 0.01302985, "auxiliary_loss_mlp": 0.00872546, "balance_loss_clip": 1.00765276, "balance_loss_mlp": 1.00036979, "epoch": 0.6962063368003367, "flos": 17566576653600.0, "grad_norm": 2.392800289262962, "language_loss": 0.81938899, "learning_rate": 8.923433067942385e-07, "loss": 0.84114432, "num_input_tokens_seen": 124577920, "step": 5790, "time_per_iteration": 2.7990472316741943 }, { "auxiliary_loss_clip": 0.01291695, "auxiliary_loss_mlp": 0.01193325, "balance_loss_clip": 1.00813973, "balance_loss_mlp": 1.00020576, "epoch": 0.6963265796909758, "flos": 21251658726720.0, "grad_norm": 1.790280742627677, "language_loss": 0.68338442, "learning_rate": 8.916947949870417e-07, "loss": 0.70823467, "num_input_tokens_seen": 124597585, "step": 5791, "time_per_iteration": 2.8209357261657715 }, { "auxiliary_loss_clip": 0.01307249, "auxiliary_loss_mlp": 0.01192299, "balance_loss_clip": 1.00480437, "balance_loss_mlp": 1.00003803, "epoch": 0.6964468225816148, "flos": 68828329851840.0, "grad_norm": 0.7429465381193963, "language_loss": 0.58168727, "learning_rate": 8.910464513088615e-07, "loss": 0.60668278, "num_input_tokens_seen": 124661625, "step": 5792, "time_per_iteration": 3.3352715969085693 }, { "auxiliary_loss_clip": 0.01313706, "auxiliary_loss_mlp": 0.01193348, "balance_loss_clip": 1.00757921, "balance_loss_mlp": 1.00022864, "epoch": 0.696567065472254, "flos": 18950896647360.0, "grad_norm": 1.7731093924385135, "language_loss": 0.78795218, "learning_rate": 8.903982758580542e-07, "loss": 0.81302273, "num_input_tokens_seen": 124680565, "step": 5793, "time_per_iteration": 2.7190239429473877 }, { "auxiliary_loss_clip": 0.01315801, "auxiliary_loss_mlp": 0.01193311, "balance_loss_clip": 1.00884163, "balance_loss_mlp": 1.00019217, "epoch": 0.696687308362893, "flos": 22856686403040.0, "grad_norm": 1.9742002469766349, "language_loss": 0.80297029, "learning_rate": 8.897502687329457e-07, "loss": 0.82806146, "num_input_tokens_seen": 124700365, "step": 5794, "time_per_iteration": 2.8127710819244385 }, { "auxiliary_loss_clip": 0.01300408, "auxiliary_loss_mlp": 0.01193347, "balance_loss_clip": 1.00780094, "balance_loss_mlp": 1.00022793, "epoch": 0.6968075512535321, "flos": 24972938660160.0, "grad_norm": 1.853802944133896, "language_loss": 0.80353355, "learning_rate": 8.891024300318382e-07, "loss": 0.82847106, "num_input_tokens_seen": 124718935, "step": 5795, "time_per_iteration": 2.830447196960449 }, { "auxiliary_loss_clip": 0.01301539, "auxiliary_loss_mlp": 0.01193363, "balance_loss_clip": 1.0075984, "balance_loss_mlp": 1.00024378, "epoch": 0.6969277941441713, "flos": 21030448112640.0, "grad_norm": 1.4737883204487963, "language_loss": 0.7585119, "learning_rate": 8.884547598530103e-07, "loss": 0.78346086, "num_input_tokens_seen": 124739505, "step": 5796, "time_per_iteration": 2.7758193016052246 }, { "auxiliary_loss_clip": 0.01239069, "auxiliary_loss_mlp": 0.01193289, "balance_loss_clip": 1.00633645, "balance_loss_mlp": 1.00017011, "epoch": 0.6970480370348103, "flos": 21579415272000.0, "grad_norm": 1.763704386733413, "language_loss": 0.75275731, "learning_rate": 8.8780725829471e-07, "loss": 0.77708095, "num_input_tokens_seen": 124757410, "step": 5797, "time_per_iteration": 2.9990341663360596 }, { "auxiliary_loss_clip": 0.0134981, "auxiliary_loss_mlp": 0.01193432, "balance_loss_clip": 1.00777483, "balance_loss_mlp": 1.0002172, "epoch": 0.6971682799254494, "flos": 22419186719040.0, "grad_norm": 2.9344976094875843, "language_loss": 0.77967, "learning_rate": 8.87159925455165e-07, "loss": 0.80510235, "num_input_tokens_seen": 124777240, "step": 5798, "time_per_iteration": 2.9345974922180176 }, { "auxiliary_loss_clip": 0.01298653, "auxiliary_loss_mlp": 0.0119339, "balance_loss_clip": 1.00725317, "balance_loss_mlp": 1.00027144, "epoch": 0.6972885228160886, "flos": 20005843530240.0, "grad_norm": 2.0691063867465447, "language_loss": 0.73243999, "learning_rate": 8.865127614325738e-07, "loss": 0.75736046, "num_input_tokens_seen": 124795670, "step": 5799, "time_per_iteration": 2.830383777618408 }, { "auxiliary_loss_clip": 0.01326297, "auxiliary_loss_mlp": 0.01193625, "balance_loss_clip": 1.00861561, "balance_loss_mlp": 1.00021994, "epoch": 0.6974087657067276, "flos": 37853453230560.0, "grad_norm": 1.7510494488224893, "language_loss": 0.66262341, "learning_rate": 8.85865766325113e-07, "loss": 0.6878227, "num_input_tokens_seen": 124819600, "step": 5800, "time_per_iteration": 2.87555193901062 }, { "auxiliary_loss_clip": 0.0131976, "auxiliary_loss_mlp": 0.01193333, "balance_loss_clip": 1.00816035, "balance_loss_mlp": 1.0002141, "epoch": 0.6975290085973667, "flos": 29489278268160.0, "grad_norm": 2.012537687995402, "language_loss": 0.72044039, "learning_rate": 8.852189402309287e-07, "loss": 0.74557137, "num_input_tokens_seen": 124838785, "step": 5801, "time_per_iteration": 2.8892016410827637 }, { "auxiliary_loss_clip": 0.01325265, "auxiliary_loss_mlp": 0.01193297, "balance_loss_clip": 1.00821948, "balance_loss_mlp": 1.00017798, "epoch": 0.6976492514880057, "flos": 12895638439680.0, "grad_norm": 2.097366093756693, "language_loss": 0.73639011, "learning_rate": 8.845722832481441e-07, "loss": 0.7615757, "num_input_tokens_seen": 124854215, "step": 5802, "time_per_iteration": 2.6887741088867188 }, { "auxiliary_loss_clip": 0.01326567, "auxiliary_loss_mlp": 0.01193419, "balance_loss_clip": 1.00776267, "balance_loss_mlp": 1.00020456, "epoch": 0.6977694943786449, "flos": 24352940880000.0, "grad_norm": 1.6644026096913738, "language_loss": 0.77020133, "learning_rate": 8.83925795474858e-07, "loss": 0.79540122, "num_input_tokens_seen": 124874340, "step": 5803, "time_per_iteration": 2.734720468521118 }, { "auxiliary_loss_clip": 0.01278085, "auxiliary_loss_mlp": 0.01193349, "balance_loss_clip": 1.00622869, "balance_loss_mlp": 1.00022948, "epoch": 0.6978897372692839, "flos": 29898483301440.0, "grad_norm": 2.355780884060836, "language_loss": 0.5932979, "learning_rate": 8.832794770091414e-07, "loss": 0.61801225, "num_input_tokens_seen": 124895175, "step": 5804, "time_per_iteration": 2.8181557655334473 }, { "auxiliary_loss_clip": 0.01317924, "auxiliary_loss_mlp": 0.01193415, "balance_loss_clip": 1.00722432, "balance_loss_mlp": 1.00020099, "epoch": 0.698009980159923, "flos": 21761590055040.0, "grad_norm": 2.153233005162187, "language_loss": 0.82764745, "learning_rate": 8.826333279490401e-07, "loss": 0.85276085, "num_input_tokens_seen": 124915810, "step": 5805, "time_per_iteration": 3.717555046081543 }, { "auxiliary_loss_clip": 0.01314854, "auxiliary_loss_mlp": 0.01193376, "balance_loss_clip": 1.00833094, "balance_loss_mlp": 1.00016117, "epoch": 0.6981302230505622, "flos": 19857172331520.0, "grad_norm": 2.0568637440175404, "language_loss": 0.68488419, "learning_rate": 8.819873483925748e-07, "loss": 0.70996648, "num_input_tokens_seen": 124932930, "step": 5806, "time_per_iteration": 3.688603639602661 }, { "auxiliary_loss_clip": 0.0129417, "auxiliary_loss_mlp": 0.00872449, "balance_loss_clip": 1.00716519, "balance_loss_mlp": 1.00032568, "epoch": 0.6982504659412012, "flos": 22198658654880.0, "grad_norm": 2.0462791993925107, "language_loss": 0.74556309, "learning_rate": 8.81341538437739e-07, "loss": 0.7672292, "num_input_tokens_seen": 124951220, "step": 5807, "time_per_iteration": 2.794529676437378 }, { "auxiliary_loss_clip": 0.01317213, "auxiliary_loss_mlp": 0.01193391, "balance_loss_clip": 1.00764084, "balance_loss_mlp": 1.00017631, "epoch": 0.6983707088318403, "flos": 35588493851040.0, "grad_norm": 1.510868880442695, "language_loss": 0.67780787, "learning_rate": 8.80695898182503e-07, "loss": 0.70291388, "num_input_tokens_seen": 124972200, "step": 5808, "time_per_iteration": 2.8740718364715576 }, { "auxiliary_loss_clip": 0.01298842, "auxiliary_loss_mlp": 0.01192303, "balance_loss_clip": 1.00766051, "balance_loss_mlp": 1.00004256, "epoch": 0.6984909517224794, "flos": 65440087244640.0, "grad_norm": 0.8356568456756179, "language_loss": 0.65074885, "learning_rate": 8.800504277248093e-07, "loss": 0.67566037, "num_input_tokens_seen": 125036950, "step": 5809, "time_per_iteration": 4.572046995162964 }, { "auxiliary_loss_clip": 0.01278701, "auxiliary_loss_mlp": 0.00872376, "balance_loss_clip": 1.00798845, "balance_loss_mlp": 1.00048482, "epoch": 0.6986111946131185, "flos": 18546936471360.0, "grad_norm": 2.632949413553594, "language_loss": 0.75186312, "learning_rate": 8.794051271625753e-07, "loss": 0.7733739, "num_input_tokens_seen": 125054585, "step": 5810, "time_per_iteration": 2.8404288291931152 }, { "auxiliary_loss_clip": 0.01307742, "auxiliary_loss_mlp": 0.01193226, "balance_loss_clip": 1.00679755, "balance_loss_mlp": 1.00020194, "epoch": 0.6987314375037575, "flos": 23039184499200.0, "grad_norm": 1.568327705962509, "language_loss": 0.83172059, "learning_rate": 8.787599965936925e-07, "loss": 0.85673034, "num_input_tokens_seen": 125075515, "step": 5811, "time_per_iteration": 2.7912821769714355 }, { "auxiliary_loss_clip": 0.01298015, "auxiliary_loss_mlp": 0.01193239, "balance_loss_clip": 1.00746429, "balance_loss_mlp": 1.00021553, "epoch": 0.6988516803943967, "flos": 38400408663840.0, "grad_norm": 1.53522333886808, "language_loss": 0.71704966, "learning_rate": 8.781150361160261e-07, "loss": 0.74196219, "num_input_tokens_seen": 125097425, "step": 5812, "time_per_iteration": 2.9968738555908203 }, { "auxiliary_loss_clip": 0.01284267, "auxiliary_loss_mlp": 0.01193292, "balance_loss_clip": 1.00815487, "balance_loss_mlp": 1.00017321, "epoch": 0.6989719232850358, "flos": 24097005276480.0, "grad_norm": 1.800826685509383, "language_loss": 0.73703921, "learning_rate": 8.774702458274181e-07, "loss": 0.76181483, "num_input_tokens_seen": 125117830, "step": 5813, "time_per_iteration": 2.8016061782836914 }, { "auxiliary_loss_clip": 0.013272, "auxiliary_loss_mlp": 0.01193389, "balance_loss_clip": 1.00798583, "balance_loss_mlp": 1.00017488, "epoch": 0.6990921661756748, "flos": 14866847789760.0, "grad_norm": 3.5688639427256175, "language_loss": 0.70557052, "learning_rate": 8.768256258256799e-07, "loss": 0.73077643, "num_input_tokens_seen": 125134455, "step": 5814, "time_per_iteration": 2.737905740737915 }, { "auxiliary_loss_clip": 0.01338918, "auxiliary_loss_mlp": 0.01193293, "balance_loss_clip": 1.0084269, "balance_loss_mlp": 1.00017369, "epoch": 0.699212409066314, "flos": 20193730178400.0, "grad_norm": 1.6374581728070319, "language_loss": 0.73597324, "learning_rate": 8.76181176208602e-07, "loss": 0.76129538, "num_input_tokens_seen": 125152555, "step": 5815, "time_per_iteration": 2.7154576778411865 }, { "auxiliary_loss_clip": 0.01290356, "auxiliary_loss_mlp": 0.01193502, "balance_loss_clip": 1.00855231, "balance_loss_mlp": 1.00019228, "epoch": 0.699332651956953, "flos": 19427899170240.0, "grad_norm": 1.6606077823891858, "language_loss": 0.73490679, "learning_rate": 8.755368970739461e-07, "loss": 0.75974536, "num_input_tokens_seen": 125171915, "step": 5816, "time_per_iteration": 2.8432657718658447 }, { "auxiliary_loss_clip": 0.01305724, "auxiliary_loss_mlp": 0.01193642, "balance_loss_clip": 1.00754082, "balance_loss_mlp": 1.00023699, "epoch": 0.6994528948475921, "flos": 16143723760320.0, "grad_norm": 2.1147949303698095, "language_loss": 0.61519271, "learning_rate": 8.748927885194479e-07, "loss": 0.64018643, "num_input_tokens_seen": 125190220, "step": 5817, "time_per_iteration": 2.7785227298736572 }, { "auxiliary_loss_clip": 0.01260841, "auxiliary_loss_mlp": 0.01192271, "balance_loss_clip": 1.00580859, "balance_loss_mlp": 1.00001073, "epoch": 0.6995731377382313, "flos": 64952451312480.0, "grad_norm": 0.8034591266371476, "language_loss": 0.57468843, "learning_rate": 8.742488506428209e-07, "loss": 0.59921956, "num_input_tokens_seen": 125249310, "step": 5818, "time_per_iteration": 3.2982475757598877 }, { "auxiliary_loss_clip": 0.01319126, "auxiliary_loss_mlp": 0.00872438, "balance_loss_clip": 1.00770164, "balance_loss_mlp": 1.00032568, "epoch": 0.6996933806288703, "flos": 24900147779040.0, "grad_norm": 1.6546545774752384, "language_loss": 0.78478521, "learning_rate": 8.736050835417466e-07, "loss": 0.80670089, "num_input_tokens_seen": 125269350, "step": 5819, "time_per_iteration": 2.814051389694214 }, { "auxiliary_loss_clip": 0.01336979, "auxiliary_loss_mlp": 0.01193316, "balance_loss_clip": 1.00801647, "balance_loss_mlp": 1.00019646, "epoch": 0.6998136235195094, "flos": 20777817487680.0, "grad_norm": 1.857674980806883, "language_loss": 0.61049783, "learning_rate": 8.729614873138862e-07, "loss": 0.63580084, "num_input_tokens_seen": 125286985, "step": 5820, "time_per_iteration": 2.7590320110321045 }, { "auxiliary_loss_clip": 0.01272123, "auxiliary_loss_mlp": 0.01193439, "balance_loss_clip": 1.00683784, "balance_loss_mlp": 1.00022435, "epoch": 0.6999338664101485, "flos": 23733481955040.0, "grad_norm": 2.0553526585326973, "language_loss": 0.77846789, "learning_rate": 8.723180620568716e-07, "loss": 0.80312347, "num_input_tokens_seen": 125306240, "step": 5821, "time_per_iteration": 2.9137611389160156 }, { "auxiliary_loss_clip": 0.01309322, "auxiliary_loss_mlp": 0.01193234, "balance_loss_clip": 1.00839424, "balance_loss_mlp": 1.0002104, "epoch": 0.7000541093007876, "flos": 19864608533280.0, "grad_norm": 1.7237208719229014, "language_loss": 0.84995639, "learning_rate": 8.716748078683116e-07, "loss": 0.874982, "num_input_tokens_seen": 125323015, "step": 5822, "time_per_iteration": 2.7555081844329834 }, { "auxiliary_loss_clip": 0.0123301, "auxiliary_loss_mlp": 0.01193334, "balance_loss_clip": 1.00673747, "balance_loss_mlp": 1.00021541, "epoch": 0.7001743521914267, "flos": 29679068871360.0, "grad_norm": 2.0245390145109607, "language_loss": 0.68425119, "learning_rate": 8.710317248457855e-07, "loss": 0.70851463, "num_input_tokens_seen": 125342630, "step": 5823, "time_per_iteration": 2.932945489883423 }, { "auxiliary_loss_clip": 0.01302549, "auxiliary_loss_mlp": 0.01193408, "balance_loss_clip": 1.00766063, "balance_loss_mlp": 1.00019312, "epoch": 0.7002945950820658, "flos": 27489774267360.0, "grad_norm": 1.7536318320127764, "language_loss": 0.72278965, "learning_rate": 8.703888130868482e-07, "loss": 0.74774921, "num_input_tokens_seen": 125364480, "step": 5824, "time_per_iteration": 2.7832863330841064 }, { "auxiliary_loss_clip": 0.01290123, "auxiliary_loss_mlp": 0.01193294, "balance_loss_clip": 1.00711548, "balance_loss_mlp": 1.00017452, "epoch": 0.7004148379727049, "flos": 22158473266080.0, "grad_norm": 2.6531792648435175, "language_loss": 0.81944692, "learning_rate": 8.697460726890307e-07, "loss": 0.84428108, "num_input_tokens_seen": 125381625, "step": 5825, "time_per_iteration": 2.7509453296661377 }, { "auxiliary_loss_clip": 0.01313449, "auxiliary_loss_mlp": 0.0087249, "balance_loss_clip": 1.00858438, "balance_loss_mlp": 1.0004133, "epoch": 0.7005350808633439, "flos": 19423767947040.0, "grad_norm": 2.099885295985371, "language_loss": 0.90896839, "learning_rate": 8.691035037498354e-07, "loss": 0.93082786, "num_input_tokens_seen": 125397615, "step": 5826, "time_per_iteration": 2.8068442344665527 }, { "auxiliary_loss_clip": 0.0132615, "auxiliary_loss_mlp": 0.01193178, "balance_loss_clip": 1.00835228, "balance_loss_mlp": 1.00015426, "epoch": 0.7006553237539831, "flos": 23476720106880.0, "grad_norm": 1.6015328631033607, "language_loss": 0.72393316, "learning_rate": 8.684611063667391e-07, "loss": 0.74912643, "num_input_tokens_seen": 125418080, "step": 5827, "time_per_iteration": 2.7302818298339844 }, { "auxiliary_loss_clip": 0.01338169, "auxiliary_loss_mlp": 0.01193293, "balance_loss_clip": 1.00840902, "balance_loss_mlp": 1.00017405, "epoch": 0.7007755666446221, "flos": 31212886308480.0, "grad_norm": 2.480145501403755, "language_loss": 0.76655412, "learning_rate": 8.678188806371935e-07, "loss": 0.79186881, "num_input_tokens_seen": 125440115, "step": 5828, "time_per_iteration": 2.8669028282165527 }, { "auxiliary_loss_clip": 0.01336795, "auxiliary_loss_mlp": 0.01193162, "balance_loss_clip": 1.00793421, "balance_loss_mlp": 1.00013804, "epoch": 0.7008958095352612, "flos": 18149909565600.0, "grad_norm": 1.645997586808744, "language_loss": 0.85211062, "learning_rate": 8.671768266586228e-07, "loss": 0.87741017, "num_input_tokens_seen": 125458240, "step": 5829, "time_per_iteration": 2.6656241416931152 }, { "auxiliary_loss_clip": 0.01299996, "auxiliary_loss_mlp": 0.01193209, "balance_loss_clip": 1.00792456, "balance_loss_mlp": 1.00018537, "epoch": 0.7010160524259004, "flos": 27452319078240.0, "grad_norm": 1.816989342749048, "language_loss": 0.78134727, "learning_rate": 8.665349445284275e-07, "loss": 0.8062793, "num_input_tokens_seen": 125477980, "step": 5830, "time_per_iteration": 2.859652519226074 }, { "auxiliary_loss_clip": 0.01292306, "auxiliary_loss_mlp": 0.01193247, "balance_loss_clip": 1.00821209, "balance_loss_mlp": 1.00022376, "epoch": 0.7011362953165394, "flos": 23842075536000.0, "grad_norm": 1.399718388516563, "language_loss": 0.80860132, "learning_rate": 8.658932343439799e-07, "loss": 0.83345687, "num_input_tokens_seen": 125497765, "step": 5831, "time_per_iteration": 4.660770416259766 }, { "auxiliary_loss_clip": 0.01349717, "auxiliary_loss_mlp": 0.01193561, "balance_loss_clip": 1.00840974, "balance_loss_mlp": 1.00025153, "epoch": 0.7012565382071785, "flos": 24823441216800.0, "grad_norm": 1.8160229614545476, "language_loss": 0.77829468, "learning_rate": 8.65251696202627e-07, "loss": 0.80372745, "num_input_tokens_seen": 125514145, "step": 5832, "time_per_iteration": 3.6920032501220703 }, { "auxiliary_loss_clip": 0.01273956, "auxiliary_loss_mlp": 0.01193211, "balance_loss_clip": 1.00733435, "balance_loss_mlp": 1.00018764, "epoch": 0.7013767810978175, "flos": 21397456031040.0, "grad_norm": 25.737970819430245, "language_loss": 0.87401962, "learning_rate": 8.646103302016896e-07, "loss": 0.8986913, "num_input_tokens_seen": 125533115, "step": 5833, "time_per_iteration": 2.7431845664978027 }, { "auxiliary_loss_clip": 0.01287355, "auxiliary_loss_mlp": 0.01193401, "balance_loss_clip": 1.00692785, "balance_loss_mlp": 1.00018704, "epoch": 0.7014970239884567, "flos": 16687158672960.0, "grad_norm": 1.7134431958350462, "language_loss": 0.88584661, "learning_rate": 8.639691364384614e-07, "loss": 0.91065419, "num_input_tokens_seen": 125550740, "step": 5834, "time_per_iteration": 2.782606363296509 }, { "auxiliary_loss_clip": 0.01313172, "auxiliary_loss_mlp": 0.01193437, "balance_loss_clip": 1.00810206, "balance_loss_mlp": 1.00022209, "epoch": 0.7016172668790958, "flos": 12568277054880.0, "grad_norm": 1.8310237295753495, "language_loss": 0.72437471, "learning_rate": 8.633281150102136e-07, "loss": 0.74944079, "num_input_tokens_seen": 125567590, "step": 5835, "time_per_iteration": 3.7003836631774902 }, { "auxiliary_loss_clip": 0.01303553, "auxiliary_loss_mlp": 0.01193339, "balance_loss_clip": 1.00684881, "balance_loss_mlp": 1.00012493, "epoch": 0.7017375097697348, "flos": 17452738215360.0, "grad_norm": 2.0690255417335583, "language_loss": 0.67907125, "learning_rate": 8.626872660141855e-07, "loss": 0.70404017, "num_input_tokens_seen": 125585500, "step": 5836, "time_per_iteration": 2.7763803005218506 }, { "auxiliary_loss_clip": 0.012642, "auxiliary_loss_mlp": 0.01193246, "balance_loss_clip": 1.00616467, "balance_loss_mlp": 1.00022233, "epoch": 0.701857752660374, "flos": 18513037726560.0, "grad_norm": 1.661327474732449, "language_loss": 0.74844527, "learning_rate": 8.620465895475957e-07, "loss": 0.77301979, "num_input_tokens_seen": 125603720, "step": 5837, "time_per_iteration": 2.8445918560028076 }, { "auxiliary_loss_clip": 0.01273643, "auxiliary_loss_mlp": 0.01193251, "balance_loss_clip": 1.0067724, "balance_loss_mlp": 1.00022721, "epoch": 0.701977995551013, "flos": 24425983226880.0, "grad_norm": 1.3681716543664613, "language_loss": 0.75196528, "learning_rate": 8.614060857076333e-07, "loss": 0.77663422, "num_input_tokens_seen": 125624390, "step": 5838, "time_per_iteration": 2.8156208992004395 }, { "auxiliary_loss_clip": 0.01314616, "auxiliary_loss_mlp": 0.01193325, "balance_loss_clip": 1.00811136, "balance_loss_mlp": 1.00020599, "epoch": 0.7020982384416521, "flos": 23002771096800.0, "grad_norm": 1.6828919376847369, "language_loss": 0.74861842, "learning_rate": 8.60765754591462e-07, "loss": 0.77369785, "num_input_tokens_seen": 125644085, "step": 5839, "time_per_iteration": 2.8234996795654297 }, { "auxiliary_loss_clip": 0.01349267, "auxiliary_loss_mlp": 0.01193203, "balance_loss_clip": 1.00815368, "balance_loss_mlp": 1.00017977, "epoch": 0.7022184813322913, "flos": 20449091003040.0, "grad_norm": 1.8829318825771861, "language_loss": 0.72912383, "learning_rate": 8.601255962962211e-07, "loss": 0.75454855, "num_input_tokens_seen": 125663095, "step": 5840, "time_per_iteration": 2.6854302883148193 }, { "auxiliary_loss_clip": 0.01333958, "auxiliary_loss_mlp": 0.0119364, "balance_loss_clip": 1.00818563, "balance_loss_mlp": 1.0002346, "epoch": 0.7023387242229303, "flos": 19790524399680.0, "grad_norm": 2.9087475673254946, "language_loss": 0.71946132, "learning_rate": 8.594856109190194e-07, "loss": 0.74473739, "num_input_tokens_seen": 125680125, "step": 5841, "time_per_iteration": 2.795504331588745 }, { "auxiliary_loss_clip": 0.01350292, "auxiliary_loss_mlp": 0.0119318, "balance_loss_clip": 1.00838339, "balance_loss_mlp": 1.00015664, "epoch": 0.7024589671135694, "flos": 33259293426240.0, "grad_norm": 1.6410545431626384, "language_loss": 0.69195592, "learning_rate": 8.588457985569446e-07, "loss": 0.71739066, "num_input_tokens_seen": 125703035, "step": 5842, "time_per_iteration": 2.832615613937378 }, { "auxiliary_loss_clip": 0.01350294, "auxiliary_loss_mlp": 0.01193335, "balance_loss_clip": 1.00830793, "balance_loss_mlp": 1.00021577, "epoch": 0.7025792100042085, "flos": 19098993067200.0, "grad_norm": 2.136655115516637, "language_loss": 0.71847224, "learning_rate": 8.582061593070542e-07, "loss": 0.74390852, "num_input_tokens_seen": 125723765, "step": 5843, "time_per_iteration": 2.7077839374542236 }, { "auxiliary_loss_clip": 0.01350109, "auxiliary_loss_mlp": 0.0087252, "balance_loss_clip": 1.00809717, "balance_loss_mlp": 1.00040126, "epoch": 0.7026994528948476, "flos": 18952620984000.0, "grad_norm": 2.281237747291281, "language_loss": 0.7678116, "learning_rate": 8.57566693266383e-07, "loss": 0.79003799, "num_input_tokens_seen": 125741455, "step": 5844, "time_per_iteration": 2.6733949184417725 }, { "auxiliary_loss_clip": 0.0131887, "auxiliary_loss_mlp": 0.00872472, "balance_loss_clip": 1.00760388, "balance_loss_mlp": 1.00033414, "epoch": 0.7028196957854866, "flos": 19536672369600.0, "grad_norm": 1.8956799985058468, "language_loss": 0.69286299, "learning_rate": 8.569274005319354e-07, "loss": 0.7147764, "num_input_tokens_seen": 125759855, "step": 5845, "time_per_iteration": 2.6981492042541504 }, { "auxiliary_loss_clip": 0.01339751, "auxiliary_loss_mlp": 0.01193352, "balance_loss_clip": 1.00884807, "balance_loss_mlp": 1.00023317, "epoch": 0.7029399386761258, "flos": 20845327587840.0, "grad_norm": 1.6295263919739942, "language_loss": 0.7973187, "learning_rate": 8.562882812006913e-07, "loss": 0.82264978, "num_input_tokens_seen": 125777345, "step": 5846, "time_per_iteration": 2.5893285274505615 }, { "auxiliary_loss_clip": 0.01349237, "auxiliary_loss_mlp": 0.0119343, "balance_loss_clip": 1.00782299, "balance_loss_mlp": 1.00021505, "epoch": 0.7030601815667649, "flos": 22055016771360.0, "grad_norm": 1.5890018270050181, "language_loss": 0.77275193, "learning_rate": 8.556493353696066e-07, "loss": 0.79817855, "num_input_tokens_seen": 125796345, "step": 5847, "time_per_iteration": 2.5858068466186523 }, { "auxiliary_loss_clip": 0.01330732, "auxiliary_loss_mlp": 0.00872621, "balance_loss_clip": 1.00800037, "balance_loss_mlp": 1.00042176, "epoch": 0.7031804244574039, "flos": 27198754437600.0, "grad_norm": 2.172904015115407, "language_loss": 0.67844564, "learning_rate": 8.550105631356077e-07, "loss": 0.70047915, "num_input_tokens_seen": 125816070, "step": 5848, "time_per_iteration": 2.65212345123291 }, { "auxiliary_loss_clip": 0.01299767, "auxiliary_loss_mlp": 0.01193306, "balance_loss_clip": 1.00804496, "balance_loss_mlp": 1.00018668, "epoch": 0.7033006673480431, "flos": 22379863498560.0, "grad_norm": 1.9779007549267078, "language_loss": 0.77306759, "learning_rate": 8.543719645955961e-07, "loss": 0.79799831, "num_input_tokens_seen": 125834400, "step": 5849, "time_per_iteration": 2.7514593601226807 }, { "auxiliary_loss_clip": 0.01309904, "auxiliary_loss_mlp": 0.01193183, "balance_loss_clip": 1.00694597, "balance_loss_mlp": 1.00015974, "epoch": 0.7034209102386821, "flos": 24715997193600.0, "grad_norm": 1.5578129463010257, "language_loss": 0.74520242, "learning_rate": 8.537335398464467e-07, "loss": 0.77023327, "num_input_tokens_seen": 125854720, "step": 5850, "time_per_iteration": 2.803668737411499 }, { "auxiliary_loss_clip": 0.01326927, "auxiliary_loss_mlp": 0.01193365, "balance_loss_clip": 1.00858986, "balance_loss_mlp": 1.00024557, "epoch": 0.7035411531293212, "flos": 22556182721760.0, "grad_norm": 3.032642871710726, "language_loss": 0.85194665, "learning_rate": 8.53095288985007e-07, "loss": 0.87714958, "num_input_tokens_seen": 125868455, "step": 5851, "time_per_iteration": 2.6970319747924805 }, { "auxiliary_loss_clip": 0.01348935, "auxiliary_loss_mlp": 0.01193172, "balance_loss_clip": 1.00788903, "balance_loss_mlp": 1.00014782, "epoch": 0.7036613960199604, "flos": 22674978627840.0, "grad_norm": 1.5361178886028586, "language_loss": 0.82512629, "learning_rate": 8.524572121081009e-07, "loss": 0.85054737, "num_input_tokens_seen": 125888555, "step": 5852, "time_per_iteration": 2.7039291858673096 }, { "auxiliary_loss_clip": 0.01338155, "auxiliary_loss_mlp": 0.01193288, "balance_loss_clip": 1.00854731, "balance_loss_mlp": 1.00016892, "epoch": 0.7037816389105994, "flos": 22492157218560.0, "grad_norm": 2.144602179951253, "language_loss": 0.62036657, "learning_rate": 8.518193093125232e-07, "loss": 0.64568096, "num_input_tokens_seen": 125907610, "step": 5853, "time_per_iteration": 2.7042555809020996 }, { "auxiliary_loss_clip": 0.0131925, "auxiliary_loss_mlp": 0.01193197, "balance_loss_clip": 1.00752401, "balance_loss_mlp": 1.00017357, "epoch": 0.7039018818012385, "flos": 27087502504320.0, "grad_norm": 1.5803214435170598, "language_loss": 0.80796087, "learning_rate": 8.511815806950436e-07, "loss": 0.83308536, "num_input_tokens_seen": 125928640, "step": 5854, "time_per_iteration": 2.820855140686035 }, { "auxiliary_loss_clip": 0.01327751, "auxiliary_loss_mlp": 0.01193248, "balance_loss_clip": 1.00698912, "balance_loss_mlp": 1.00012875, "epoch": 0.7040221246918776, "flos": 17749829147040.0, "grad_norm": 1.9214681892010606, "language_loss": 0.77923179, "learning_rate": 8.505440263524044e-07, "loss": 0.80444175, "num_input_tokens_seen": 125947485, "step": 5855, "time_per_iteration": 2.73551869392395 }, { "auxiliary_loss_clip": 0.0133931, "auxiliary_loss_mlp": 0.01193454, "balance_loss_clip": 1.00851846, "balance_loss_mlp": 1.00023973, "epoch": 0.7041423675825167, "flos": 16279857594720.0, "grad_norm": 2.847417041144616, "language_loss": 0.87795413, "learning_rate": 8.49906646381322e-07, "loss": 0.90328175, "num_input_tokens_seen": 125960320, "step": 5856, "time_per_iteration": 2.7378273010253906 }, { "auxiliary_loss_clip": 0.01279314, "auxiliary_loss_mlp": 0.01193304, "balance_loss_clip": 1.00784099, "balance_loss_mlp": 1.00018466, "epoch": 0.7042626104731557, "flos": 25483193301600.0, "grad_norm": 1.7782788523615514, "language_loss": 0.72204733, "learning_rate": 8.492694408784884e-07, "loss": 0.74677348, "num_input_tokens_seen": 125980575, "step": 5857, "time_per_iteration": 3.693193197250366 }, { "auxiliary_loss_clip": 0.01338444, "auxiliary_loss_mlp": 0.01193243, "balance_loss_clip": 1.00858676, "balance_loss_mlp": 1.00021911, "epoch": 0.7043828533637949, "flos": 17857632407040.0, "grad_norm": 3.86328376442219, "language_loss": 0.62403798, "learning_rate": 8.486324099405642e-07, "loss": 0.64935482, "num_input_tokens_seen": 125997420, "step": 5858, "time_per_iteration": 4.631892442703247 }, { "auxiliary_loss_clip": 0.01328434, "auxiliary_loss_mlp": 0.01193183, "balance_loss_clip": 1.00773549, "balance_loss_mlp": 1.00015891, "epoch": 0.704503096254434, "flos": 29494271659680.0, "grad_norm": 1.6617893317181458, "language_loss": 0.74805903, "learning_rate": 8.479955536641887e-07, "loss": 0.77327514, "num_input_tokens_seen": 126018915, "step": 5859, "time_per_iteration": 2.7511651515960693 }, { "auxiliary_loss_clip": 0.01326031, "auxiliary_loss_mlp": 0.01193306, "balance_loss_clip": 1.00818062, "balance_loss_mlp": 1.0001874, "epoch": 0.704623339145073, "flos": 30920752844640.0, "grad_norm": 2.095582735894161, "language_loss": 0.663903, "learning_rate": 8.473588721459716e-07, "loss": 0.68909639, "num_input_tokens_seen": 126038825, "step": 5860, "time_per_iteration": 2.884495496749878 }, { "auxiliary_loss_clip": 0.01326793, "auxiliary_loss_mlp": 0.01193471, "balance_loss_clip": 1.00842345, "balance_loss_mlp": 1.00025702, "epoch": 0.7047435820357122, "flos": 23914758646080.0, "grad_norm": 1.8277583057824223, "language_loss": 0.70283645, "learning_rate": 8.467223654824967e-07, "loss": 0.72803915, "num_input_tokens_seen": 126058280, "step": 5861, "time_per_iteration": 3.7581167221069336 }, { "auxiliary_loss_clip": 0.01338772, "auxiliary_loss_mlp": 0.01193277, "balance_loss_clip": 1.0084914, "balance_loss_mlp": 1.00015759, "epoch": 0.7048638249263512, "flos": 46494027084960.0, "grad_norm": 1.7566708187043738, "language_loss": 0.6236192, "learning_rate": 8.460860337703233e-07, "loss": 0.64893973, "num_input_tokens_seen": 126078885, "step": 5862, "time_per_iteration": 2.96781325340271 }, { "auxiliary_loss_clip": 0.0130412, "auxiliary_loss_mlp": 0.0119342, "balance_loss_clip": 1.00832593, "balance_loss_mlp": 1.00020552, "epoch": 0.7049840678169903, "flos": 21689230258080.0, "grad_norm": 1.8632954726310784, "language_loss": 0.70341825, "learning_rate": 8.454498771059797e-07, "loss": 0.72839355, "num_input_tokens_seen": 126098260, "step": 5863, "time_per_iteration": 2.8620188236236572 }, { "auxiliary_loss_clip": 0.01270755, "auxiliary_loss_mlp": 0.01193307, "balance_loss_clip": 1.00684667, "balance_loss_mlp": 1.00018764, "epoch": 0.7051043107076294, "flos": 18405090771840.0, "grad_norm": 2.0257256453113275, "language_loss": 0.83573174, "learning_rate": 8.448138955859725e-07, "loss": 0.86037236, "num_input_tokens_seen": 126114845, "step": 5864, "time_per_iteration": 2.851212739944458 }, { "auxiliary_loss_clip": 0.01313715, "auxiliary_loss_mlp": 0.01193291, "balance_loss_clip": 1.00811124, "balance_loss_mlp": 1.00017178, "epoch": 0.7052245535982685, "flos": 19319054123520.0, "grad_norm": 1.8673741249917009, "language_loss": 0.89845037, "learning_rate": 8.44178089306778e-07, "loss": 0.92352045, "num_input_tokens_seen": 126132780, "step": 5865, "time_per_iteration": 2.77093505859375 }, { "auxiliary_loss_clip": 0.01349306, "auxiliary_loss_mlp": 0.01193501, "balance_loss_clip": 1.00810468, "balance_loss_mlp": 1.0002867, "epoch": 0.7053447964889076, "flos": 19062148580640.0, "grad_norm": 1.8567649216328355, "language_loss": 0.77104199, "learning_rate": 8.4354245836485e-07, "loss": 0.79647011, "num_input_tokens_seen": 126151225, "step": 5866, "time_per_iteration": 2.6147751808166504 }, { "auxiliary_loss_clip": 0.01290549, "auxiliary_loss_mlp": 0.01193516, "balance_loss_clip": 1.00756788, "balance_loss_mlp": 1.00020671, "epoch": 0.7054650393795466, "flos": 27379240807680.0, "grad_norm": 1.556209029251133, "language_loss": 0.72984076, "learning_rate": 8.429070028566108e-07, "loss": 0.75468141, "num_input_tokens_seen": 126172535, "step": 5867, "time_per_iteration": 3.016772747039795 }, { "auxiliary_loss_clip": 0.01329938, "auxiliary_loss_mlp": 0.01193203, "balance_loss_clip": 1.00834632, "balance_loss_mlp": 1.00017917, "epoch": 0.7055852822701858, "flos": 16102209195360.0, "grad_norm": 2.885268480956475, "language_loss": 0.74920738, "learning_rate": 8.422717228784586e-07, "loss": 0.77443874, "num_input_tokens_seen": 126189410, "step": 5868, "time_per_iteration": 2.6891837120056152 }, { "auxiliary_loss_clip": 0.0125974, "auxiliary_loss_mlp": 0.01193584, "balance_loss_clip": 1.00630093, "balance_loss_mlp": 1.00036907, "epoch": 0.7057055251608249, "flos": 11692307747520.0, "grad_norm": 1.797576096331185, "language_loss": 0.6936866, "learning_rate": 8.416366185267663e-07, "loss": 0.71821988, "num_input_tokens_seen": 126206910, "step": 5869, "time_per_iteration": 2.849630832672119 }, { "auxiliary_loss_clip": 0.01336644, "auxiliary_loss_mlp": 0.01193237, "balance_loss_clip": 1.007725, "balance_loss_mlp": 1.00021338, "epoch": 0.7058257680514639, "flos": 22711571648640.0, "grad_norm": 1.5868853385658603, "language_loss": 0.77958822, "learning_rate": 8.410016898978778e-07, "loss": 0.804887, "num_input_tokens_seen": 126224385, "step": 5870, "time_per_iteration": 2.7290306091308594 }, { "auxiliary_loss_clip": 0.01253896, "auxiliary_loss_mlp": 0.01193175, "balance_loss_clip": 1.00690949, "balance_loss_mlp": 1.0001508, "epoch": 0.7059460109421031, "flos": 17529552548640.0, "grad_norm": 1.5683602869932702, "language_loss": 0.78683347, "learning_rate": 8.403669370881115e-07, "loss": 0.81130415, "num_input_tokens_seen": 126243120, "step": 5871, "time_per_iteration": 2.766141891479492 }, { "auxiliary_loss_clip": 0.01350391, "auxiliary_loss_mlp": 0.01193259, "balance_loss_clip": 1.00810134, "balance_loss_mlp": 1.00023508, "epoch": 0.7060662538327421, "flos": 23544697214880.0, "grad_norm": 1.5269591761947667, "language_loss": 0.78516316, "learning_rate": 8.397323601937587e-07, "loss": 0.81059968, "num_input_tokens_seen": 126263020, "step": 5872, "time_per_iteration": 2.7384021282196045 }, { "auxiliary_loss_clip": 0.01289031, "auxiliary_loss_mlp": 0.01193326, "balance_loss_clip": 1.00707769, "balance_loss_mlp": 1.00020647, "epoch": 0.7061864967233812, "flos": 30260749294080.0, "grad_norm": 1.7709688464749251, "language_loss": 0.76989329, "learning_rate": 8.390979593110838e-07, "loss": 0.79471684, "num_input_tokens_seen": 126285150, "step": 5873, "time_per_iteration": 2.868724822998047 }, { "auxiliary_loss_clip": 0.0131304, "auxiliary_loss_mlp": 0.01193322, "balance_loss_clip": 1.00764477, "balance_loss_mlp": 1.00020266, "epoch": 0.7063067396140204, "flos": 20701470162240.0, "grad_norm": 1.5214293623978525, "language_loss": 0.81529826, "learning_rate": 8.384637345363262e-07, "loss": 0.84036189, "num_input_tokens_seen": 126304340, "step": 5874, "time_per_iteration": 2.7707247734069824 }, { "auxiliary_loss_clip": 0.01325545, "auxiliary_loss_mlp": 0.01193263, "balance_loss_clip": 1.00805295, "balance_loss_mlp": 1.000144, "epoch": 0.7064269825046594, "flos": 32266180702080.0, "grad_norm": 1.6144974990147296, "language_loss": 0.7679168, "learning_rate": 8.378296859656964e-07, "loss": 0.79310489, "num_input_tokens_seen": 126325495, "step": 5875, "time_per_iteration": 2.8309128284454346 }, { "auxiliary_loss_clip": 0.0131671, "auxiliary_loss_mlp": 0.01193291, "balance_loss_clip": 1.00820208, "balance_loss_mlp": 1.00017178, "epoch": 0.7065472253952985, "flos": 30227137938720.0, "grad_norm": 2.2799330198421672, "language_loss": 0.68217951, "learning_rate": 8.371958136953792e-07, "loss": 0.70727956, "num_input_tokens_seen": 126345525, "step": 5876, "time_per_iteration": 2.8847105503082275 }, { "auxiliary_loss_clip": 0.01297881, "auxiliary_loss_mlp": 0.01193316, "balance_loss_clip": 1.00728929, "balance_loss_mlp": 1.0001967, "epoch": 0.7066674682859376, "flos": 16216730183520.0, "grad_norm": 2.239428282651338, "language_loss": 0.66399282, "learning_rate": 8.365621178215326e-07, "loss": 0.68890476, "num_input_tokens_seen": 126361995, "step": 5877, "time_per_iteration": 2.7409117221832275 }, { "auxiliary_loss_clip": 0.01337786, "auxiliary_loss_mlp": 0.01193206, "balance_loss_clip": 1.00855768, "balance_loss_mlp": 1.00018239, "epoch": 0.7067877111765767, "flos": 14830470311040.0, "grad_norm": 1.8871614665445708, "language_loss": 0.75210249, "learning_rate": 8.359285984402871e-07, "loss": 0.77741241, "num_input_tokens_seen": 126379260, "step": 5878, "time_per_iteration": 2.8582279682159424 }, { "auxiliary_loss_clip": 0.01303481, "auxiliary_loss_mlp": 0.0119324, "balance_loss_clip": 1.00743604, "balance_loss_mlp": 1.00021613, "epoch": 0.7069079540672157, "flos": 25440205865760.0, "grad_norm": 1.7966039467185164, "language_loss": 0.74159634, "learning_rate": 8.352952556477489e-07, "loss": 0.76656359, "num_input_tokens_seen": 126397170, "step": 5879, "time_per_iteration": 2.8195228576660156 }, { "auxiliary_loss_clip": 0.01328669, "auxiliary_loss_mlp": 0.01193271, "balance_loss_clip": 1.00774312, "balance_loss_mlp": 1.00015199, "epoch": 0.7070281969578549, "flos": 24607762849440.0, "grad_norm": 1.8098817048489058, "language_loss": 0.76488686, "learning_rate": 8.34662089539993e-07, "loss": 0.7901063, "num_input_tokens_seen": 126416680, "step": 5880, "time_per_iteration": 2.728893756866455 }, { "auxiliary_loss_clip": 0.0134879, "auxiliary_loss_mlp": 0.01193208, "balance_loss_clip": 1.00827909, "balance_loss_mlp": 1.00018477, "epoch": 0.707148439848494, "flos": 26724482114400.0, "grad_norm": 1.8483102091452595, "language_loss": 0.78964841, "learning_rate": 8.340291002130722e-07, "loss": 0.81506836, "num_input_tokens_seen": 126435870, "step": 5881, "time_per_iteration": 2.775865077972412 }, { "auxiliary_loss_clip": 0.01349952, "auxiliary_loss_mlp": 0.01193332, "balance_loss_clip": 1.00831032, "balance_loss_mlp": 1.00021291, "epoch": 0.707268682739133, "flos": 15085759288320.0, "grad_norm": 9.637876926146417, "language_loss": 0.79274297, "learning_rate": 8.3339628776301e-07, "loss": 0.81817579, "num_input_tokens_seen": 126454010, "step": 5882, "time_per_iteration": 2.6535511016845703 }, { "auxiliary_loss_clip": 0.01349899, "auxiliary_loss_mlp": 0.01193407, "balance_loss_clip": 1.00809121, "balance_loss_mlp": 1.00019217, "epoch": 0.7073889256297722, "flos": 34313162598720.0, "grad_norm": 1.7503193631676344, "language_loss": 0.5679239, "learning_rate": 8.327636522858033e-07, "loss": 0.59335697, "num_input_tokens_seen": 126473615, "step": 5883, "time_per_iteration": 3.670546293258667 }, { "auxiliary_loss_clip": 0.01252227, "auxiliary_loss_mlp": 0.01193311, "balance_loss_clip": 1.00697136, "balance_loss_mlp": 1.00019181, "epoch": 0.7075091685204112, "flos": 20083951116000.0, "grad_norm": 1.8034032773710615, "language_loss": 0.76901388, "learning_rate": 8.321311938774225e-07, "loss": 0.79346931, "num_input_tokens_seen": 126492705, "step": 5884, "time_per_iteration": 4.77703332901001 }, { "auxiliary_loss_clip": 0.01350965, "auxiliary_loss_mlp": 0.01193329, "balance_loss_clip": 1.00849724, "balance_loss_mlp": 1.00021029, "epoch": 0.7076294114110503, "flos": 20777134937760.0, "grad_norm": 1.7613294592911517, "language_loss": 0.79034096, "learning_rate": 8.314989126338104e-07, "loss": 0.81578392, "num_input_tokens_seen": 126512715, "step": 5885, "time_per_iteration": 2.6624557971954346 }, { "auxiliary_loss_clip": 0.01337665, "auxiliary_loss_mlp": 0.01193218, "balance_loss_clip": 1.00834513, "balance_loss_mlp": 1.00019479, "epoch": 0.7077496543016895, "flos": 17967698858880.0, "grad_norm": 1.73127800281428, "language_loss": 0.84459054, "learning_rate": 8.308668086508847e-07, "loss": 0.86989939, "num_input_tokens_seen": 126530795, "step": 5886, "time_per_iteration": 2.7411344051361084 }, { "auxiliary_loss_clip": 0.01308373, "auxiliary_loss_mlp": 0.01193262, "balance_loss_clip": 1.00782633, "balance_loss_mlp": 1.00014281, "epoch": 0.7078698971923285, "flos": 45478116033120.0, "grad_norm": 1.6525433745832099, "language_loss": 0.73997676, "learning_rate": 8.302348820245342e-07, "loss": 0.76499319, "num_input_tokens_seen": 126553360, "step": 5887, "time_per_iteration": 3.938337564468384 }, { "auxiliary_loss_clip": 0.0128827, "auxiliary_loss_mlp": 0.01193529, "balance_loss_clip": 1.00834119, "balance_loss_mlp": 1.00021958, "epoch": 0.7079901400829676, "flos": 26943716926080.0, "grad_norm": 4.879461778162835, "language_loss": 0.70011699, "learning_rate": 8.296031328506232e-07, "loss": 0.72493494, "num_input_tokens_seen": 126573110, "step": 5888, "time_per_iteration": 2.82206392288208 }, { "auxiliary_loss_clip": 0.01317017, "auxiliary_loss_mlp": 0.01193305, "balance_loss_clip": 1.00809312, "balance_loss_mlp": 1.00018632, "epoch": 0.7081103829736067, "flos": 24423217103520.0, "grad_norm": 1.7703455100956278, "language_loss": 0.75463343, "learning_rate": 8.289715612249857e-07, "loss": 0.77973664, "num_input_tokens_seen": 126593725, "step": 5889, "time_per_iteration": 2.83056640625 }, { "auxiliary_loss_clip": 0.0131363, "auxiliary_loss_mlp": 0.01193289, "balance_loss_clip": 1.00771356, "balance_loss_mlp": 1.00016975, "epoch": 0.7082306258642458, "flos": 18543308179680.0, "grad_norm": 2.376673494202546, "language_loss": 0.77807403, "learning_rate": 8.283401672434305e-07, "loss": 0.80314326, "num_input_tokens_seen": 126608950, "step": 5890, "time_per_iteration": 2.6899919509887695 }, { "auxiliary_loss_clip": 0.01302178, "auxiliary_loss_mlp": 0.01193312, "balance_loss_clip": 1.00721717, "balance_loss_mlp": 1.00019348, "epoch": 0.7083508687548848, "flos": 23477546351520.0, "grad_norm": 2.227240488729479, "language_loss": 0.70474315, "learning_rate": 8.277089510017412e-07, "loss": 0.72969806, "num_input_tokens_seen": 126629755, "step": 5891, "time_per_iteration": 2.8038861751556396 }, { "auxiliary_loss_clip": 0.01302293, "auxiliary_loss_mlp": 0.01193243, "balance_loss_clip": 1.00691366, "balance_loss_mlp": 1.00021911, "epoch": 0.708471111645524, "flos": 22419474108480.0, "grad_norm": 1.753902849390187, "language_loss": 0.8215645, "learning_rate": 8.270779125956719e-07, "loss": 0.84651983, "num_input_tokens_seen": 126650135, "step": 5892, "time_per_iteration": 2.76254940032959 }, { "auxiliary_loss_clip": 0.01271897, "auxiliary_loss_mlp": 0.01193283, "balance_loss_clip": 1.00638962, "balance_loss_mlp": 1.00016391, "epoch": 0.7085913545361631, "flos": 20922896318400.0, "grad_norm": 1.9571948332347442, "language_loss": 0.80197471, "learning_rate": 8.264470521209505e-07, "loss": 0.82662654, "num_input_tokens_seen": 126668500, "step": 5893, "time_per_iteration": 2.7857093811035156 }, { "auxiliary_loss_clip": 0.01337749, "auxiliary_loss_mlp": 0.01193328, "balance_loss_clip": 1.00806379, "balance_loss_mlp": 1.00020862, "epoch": 0.7087115974268021, "flos": 15012393628320.0, "grad_norm": 2.158483057825355, "language_loss": 0.76354122, "learning_rate": 8.258163696732785e-07, "loss": 0.78885198, "num_input_tokens_seen": 126686090, "step": 5894, "time_per_iteration": 2.7863528728485107 }, { "auxiliary_loss_clip": 0.01337734, "auxiliary_loss_mlp": 0.01193164, "balance_loss_clip": 1.00814009, "balance_loss_mlp": 1.00014019, "epoch": 0.7088318403174413, "flos": 21539050264800.0, "grad_norm": 1.8469905255958696, "language_loss": 0.76973403, "learning_rate": 8.251858653483288e-07, "loss": 0.79504299, "num_input_tokens_seen": 126704255, "step": 5895, "time_per_iteration": 2.7500216960906982 }, { "auxiliary_loss_clip": 0.01329668, "auxiliary_loss_mlp": 0.01193311, "balance_loss_clip": 1.00847816, "balance_loss_mlp": 1.00019193, "epoch": 0.7089520832080803, "flos": 15516792709920.0, "grad_norm": 2.0466438948006647, "language_loss": 0.86138302, "learning_rate": 8.245555392417501e-07, "loss": 0.88661277, "num_input_tokens_seen": 126718910, "step": 5896, "time_per_iteration": 2.7185018062591553 }, { "auxiliary_loss_clip": 0.01281502, "auxiliary_loss_mlp": 0.01193179, "balance_loss_clip": 1.00705171, "balance_loss_mlp": 1.00015545, "epoch": 0.7090723260987194, "flos": 20412677600640.0, "grad_norm": 1.8203487485860708, "language_loss": 0.78830326, "learning_rate": 8.239253914491613e-07, "loss": 0.81305009, "num_input_tokens_seen": 126737235, "step": 5897, "time_per_iteration": 2.832070827484131 }, { "auxiliary_loss_clip": 0.01296104, "auxiliary_loss_mlp": 0.01193196, "balance_loss_clip": 1.00770295, "balance_loss_mlp": 1.00017238, "epoch": 0.7091925689893585, "flos": 25668349750080.0, "grad_norm": 1.7516327333600548, "language_loss": 0.75269091, "learning_rate": 8.232954220661556e-07, "loss": 0.77758396, "num_input_tokens_seen": 126759970, "step": 5898, "time_per_iteration": 2.8660669326782227 }, { "auxiliary_loss_clip": 0.01349314, "auxiliary_loss_mlp": 0.01193321, "balance_loss_clip": 1.00873542, "balance_loss_mlp": 1.00020182, "epoch": 0.7093128118799976, "flos": 24206640644160.0, "grad_norm": 2.139256646440725, "language_loss": 0.70079207, "learning_rate": 8.226656311882989e-07, "loss": 0.72621846, "num_input_tokens_seen": 126779280, "step": 5899, "time_per_iteration": 2.714151620864868 }, { "auxiliary_loss_clip": 0.01326475, "auxiliary_loss_mlp": 0.01193196, "balance_loss_clip": 1.00811553, "balance_loss_mlp": 1.00017238, "epoch": 0.7094330547706367, "flos": 16646793665760.0, "grad_norm": 2.328105359262401, "language_loss": 0.77270967, "learning_rate": 8.22036018911129e-07, "loss": 0.7979064, "num_input_tokens_seen": 126797310, "step": 5900, "time_per_iteration": 2.688591718673706 }, { "auxiliary_loss_clip": 0.01350874, "auxiliary_loss_mlp": 0.01193547, "balance_loss_clip": 1.00820446, "balance_loss_mlp": 1.00023723, "epoch": 0.7095532976612757, "flos": 16283378115360.0, "grad_norm": 2.4766690874006185, "language_loss": 0.80870724, "learning_rate": 8.214065853301599e-07, "loss": 0.83415145, "num_input_tokens_seen": 126812840, "step": 5901, "time_per_iteration": 2.660383701324463 }, { "auxiliary_loss_clip": 0.0130773, "auxiliary_loss_mlp": 0.01192259, "balance_loss_clip": 1.00517941, "balance_loss_mlp": 0.99999791, "epoch": 0.7096735405519149, "flos": 70722114166080.0, "grad_norm": 0.811674099898377, "language_loss": 0.58237982, "learning_rate": 8.207773305408734e-07, "loss": 0.60737973, "num_input_tokens_seen": 126880060, "step": 5902, "time_per_iteration": 3.4166975021362305 }, { "auxiliary_loss_clip": 0.01289088, "auxiliary_loss_mlp": 0.01193484, "balance_loss_clip": 1.00739813, "balance_loss_mlp": 1.00017452, "epoch": 0.709793783442554, "flos": 23621511548160.0, "grad_norm": 1.9392067339751315, "language_loss": 0.79825616, "learning_rate": 8.201482546387288e-07, "loss": 0.82308185, "num_input_tokens_seen": 126899535, "step": 5903, "time_per_iteration": 2.821603775024414 }, { "auxiliary_loss_clip": 0.01327539, "auxiliary_loss_mlp": 0.01193371, "balance_loss_clip": 1.00754952, "balance_loss_mlp": 1.00025165, "epoch": 0.709914026333193, "flos": 25993483866720.0, "grad_norm": 1.600717036253726, "language_loss": 0.91770929, "learning_rate": 8.195193577191553e-07, "loss": 0.94291842, "num_input_tokens_seen": 126921365, "step": 5904, "time_per_iteration": 2.7672781944274902 }, { "auxiliary_loss_clip": 0.0130888, "auxiliary_loss_mlp": 0.00872497, "balance_loss_clip": 1.00804472, "balance_loss_mlp": 1.00041056, "epoch": 0.7100342692238322, "flos": 24861543032160.0, "grad_norm": 1.5659402998750596, "language_loss": 0.84362602, "learning_rate": 8.188906398775579e-07, "loss": 0.86543977, "num_input_tokens_seen": 126941910, "step": 5905, "time_per_iteration": 2.817883014678955 }, { "auxiliary_loss_clip": 0.01350963, "auxiliary_loss_mlp": 0.00872476, "balance_loss_clip": 1.00811541, "balance_loss_mlp": 1.00036263, "epoch": 0.7101545121144712, "flos": 24932214416160.0, "grad_norm": 1.7098955223792482, "language_loss": 0.68711627, "learning_rate": 8.18262101209311e-07, "loss": 0.70935059, "num_input_tokens_seen": 126961120, "step": 5906, "time_per_iteration": 2.662611722946167 }, { "auxiliary_loss_clip": 0.01322326, "auxiliary_loss_mlp": 0.01193309, "balance_loss_clip": 1.00810373, "balance_loss_mlp": 1.00018954, "epoch": 0.7102747550051103, "flos": 23768853570720.0, "grad_norm": 1.8780441520120126, "language_loss": 0.69943607, "learning_rate": 8.176337418097626e-07, "loss": 0.72459233, "num_input_tokens_seen": 126981590, "step": 5907, "time_per_iteration": 2.732813596725464 }, { "auxiliary_loss_clip": 0.01325428, "auxiliary_loss_mlp": 0.00872453, "balance_loss_clip": 1.00713253, "balance_loss_mlp": 1.00039017, "epoch": 0.7103949978957494, "flos": 15303916389600.0, "grad_norm": 1.9336743091253294, "language_loss": 0.79534197, "learning_rate": 8.170055617742364e-07, "loss": 0.8173207, "num_input_tokens_seen": 126998870, "step": 5908, "time_per_iteration": 2.6903879642486572 }, { "auxiliary_loss_clip": 0.01316188, "auxiliary_loss_mlp": 0.01193212, "balance_loss_clip": 1.00769401, "balance_loss_mlp": 1.00018835, "epoch": 0.7105152407863885, "flos": 22638816691200.0, "grad_norm": 1.742627514529518, "language_loss": 0.7082119, "learning_rate": 8.163775611980252e-07, "loss": 0.73330593, "num_input_tokens_seen": 127017980, "step": 5909, "time_per_iteration": 2.8192427158355713 }, { "auxiliary_loss_clip": 0.01308136, "auxiliary_loss_mlp": 0.01193309, "balance_loss_clip": 1.00724173, "balance_loss_mlp": 1.00019002, "epoch": 0.7106354836770276, "flos": 17238604566240.0, "grad_norm": 1.6343735870841856, "language_loss": 0.78571463, "learning_rate": 8.157497401763982e-07, "loss": 0.81072909, "num_input_tokens_seen": 127035645, "step": 5910, "time_per_iteration": 4.795414924621582 }, { "auxiliary_loss_clip": 0.01326804, "auxiliary_loss_mlp": 0.01193197, "balance_loss_clip": 1.00740206, "balance_loss_mlp": 1.00017357, "epoch": 0.7107557265676667, "flos": 20193658331040.0, "grad_norm": 1.5923152306733173, "language_loss": 0.77768832, "learning_rate": 8.151220988045935e-07, "loss": 0.80288833, "num_input_tokens_seen": 127054900, "step": 5911, "time_per_iteration": 3.703916072845459 }, { "auxiliary_loss_clip": 0.01329666, "auxiliary_loss_mlp": 0.01193248, "balance_loss_clip": 1.0078789, "balance_loss_mlp": 1.00012887, "epoch": 0.7108759694583058, "flos": 21507091398720.0, "grad_norm": 1.654316426448744, "language_loss": 0.82677424, "learning_rate": 8.144946371778234e-07, "loss": 0.8520034, "num_input_tokens_seen": 127075010, "step": 5912, "time_per_iteration": 2.696816921234131 }, { "auxiliary_loss_clip": 0.01302418, "auxiliary_loss_mlp": 0.00872493, "balance_loss_clip": 1.00695324, "balance_loss_mlp": 1.00035524, "epoch": 0.7109962123489448, "flos": 24061921050240.0, "grad_norm": 1.5661008082608248, "language_loss": 0.78213966, "learning_rate": 8.138673553912751e-07, "loss": 0.8038888, "num_input_tokens_seen": 127095570, "step": 5913, "time_per_iteration": 3.7323219776153564 }, { "auxiliary_loss_clip": 0.01270595, "auxiliary_loss_mlp": 0.0119331, "balance_loss_clip": 1.00752544, "balance_loss_mlp": 1.00019097, "epoch": 0.711116455239584, "flos": 30480487037280.0, "grad_norm": 2.2765800740092517, "language_loss": 0.57064867, "learning_rate": 8.132402535401059e-07, "loss": 0.59528768, "num_input_tokens_seen": 127116825, "step": 5914, "time_per_iteration": 2.8936100006103516 }, { "auxiliary_loss_clip": 0.01325993, "auxiliary_loss_mlp": 0.01193383, "balance_loss_clip": 1.00819576, "balance_loss_mlp": 1.00016856, "epoch": 0.711236698130223, "flos": 25045621770240.0, "grad_norm": 1.6727114500026106, "language_loss": 0.74247944, "learning_rate": 8.126133317194465e-07, "loss": 0.76767319, "num_input_tokens_seen": 127137015, "step": 5915, "time_per_iteration": 2.7357869148254395 }, { "auxiliary_loss_clip": 0.01264423, "auxiliary_loss_mlp": 0.01193509, "balance_loss_clip": 1.00886822, "balance_loss_mlp": 1.00019932, "epoch": 0.7113569410208621, "flos": 24206712491520.0, "grad_norm": 1.7555230639278878, "language_loss": 0.7405296, "learning_rate": 8.11986590024401e-07, "loss": 0.76510888, "num_input_tokens_seen": 127156755, "step": 5916, "time_per_iteration": 2.9009296894073486 }, { "auxiliary_loss_clip": 0.01314037, "auxiliary_loss_mlp": 0.01193473, "balance_loss_clip": 1.00786448, "balance_loss_mlp": 1.00025845, "epoch": 0.7114771839115013, "flos": 35439319720800.0, "grad_norm": 1.5766909412749983, "language_loss": 0.68814087, "learning_rate": 8.113600285500442e-07, "loss": 0.71321595, "num_input_tokens_seen": 127176965, "step": 5917, "time_per_iteration": 2.8592629432678223 }, { "auxiliary_loss_clip": 0.01350197, "auxiliary_loss_mlp": 0.01193297, "balance_loss_clip": 1.00788236, "balance_loss_mlp": 1.0001781, "epoch": 0.7115974268021403, "flos": 21099467007360.0, "grad_norm": 1.7431751601820804, "language_loss": 0.74399972, "learning_rate": 8.107336473914268e-07, "loss": 0.76943469, "num_input_tokens_seen": 127195595, "step": 5918, "time_per_iteration": 2.79880690574646 }, { "auxiliary_loss_clip": 0.01284188, "auxiliary_loss_mlp": 0.01192309, "balance_loss_clip": 1.00370228, "balance_loss_mlp": 1.00004792, "epoch": 0.7117176696927794, "flos": 56752897662720.0, "grad_norm": 0.7709498558995836, "language_loss": 0.55766952, "learning_rate": 8.101074466435694e-07, "loss": 0.58243442, "num_input_tokens_seen": 127255070, "step": 5919, "time_per_iteration": 3.283975839614868 }, { "auxiliary_loss_clip": 0.01338558, "auxiliary_loss_mlp": 0.01193218, "balance_loss_clip": 1.00858843, "balance_loss_mlp": 1.00019431, "epoch": 0.7118379125834186, "flos": 15925279269600.0, "grad_norm": 1.8314532520804756, "language_loss": 0.67942846, "learning_rate": 8.094814264014662e-07, "loss": 0.70474625, "num_input_tokens_seen": 127273825, "step": 5920, "time_per_iteration": 2.7108938694000244 }, { "auxiliary_loss_clip": 0.01350394, "auxiliary_loss_mlp": 0.01193434, "balance_loss_clip": 1.00801063, "balance_loss_mlp": 1.00021994, "epoch": 0.7119581554740576, "flos": 20193370941600.0, "grad_norm": 1.967276680045416, "language_loss": 0.81810737, "learning_rate": 8.088555867600844e-07, "loss": 0.84354568, "num_input_tokens_seen": 127289990, "step": 5921, "time_per_iteration": 2.752411365509033 }, { "auxiliary_loss_clip": 0.0130127, "auxiliary_loss_mlp": 0.01193175, "balance_loss_clip": 1.00778389, "balance_loss_mlp": 1.00015116, "epoch": 0.7120783983646967, "flos": 34715398438080.0, "grad_norm": 2.1999824784367035, "language_loss": 0.60633922, "learning_rate": 8.08229927814362e-07, "loss": 0.63128364, "num_input_tokens_seen": 127312880, "step": 5922, "time_per_iteration": 2.8702635765075684 }, { "auxiliary_loss_clip": 0.0129468, "auxiliary_loss_mlp": 0.01193223, "balance_loss_clip": 1.00705481, "balance_loss_mlp": 1.00019896, "epoch": 0.7121986412553358, "flos": 26359126685280.0, "grad_norm": 1.6125559022611564, "language_loss": 0.65128297, "learning_rate": 8.076044496592134e-07, "loss": 0.676162, "num_input_tokens_seen": 127334730, "step": 5923, "time_per_iteration": 2.903125762939453 }, { "auxiliary_loss_clip": 0.01303737, "auxiliary_loss_mlp": 0.01193308, "balance_loss_clip": 1.00701046, "balance_loss_mlp": 1.00018859, "epoch": 0.7123188841459749, "flos": 11145352314240.0, "grad_norm": 4.419926076959776, "language_loss": 0.77701068, "learning_rate": 8.069791523895204e-07, "loss": 0.80198115, "num_input_tokens_seen": 127351180, "step": 5924, "time_per_iteration": 2.7000997066497803 }, { "auxiliary_loss_clip": 0.01302234, "auxiliary_loss_mlp": 0.0119319, "balance_loss_clip": 1.00766206, "balance_loss_mlp": 1.00016677, "epoch": 0.7124391270366139, "flos": 20811680308800.0, "grad_norm": 1.7227000073485599, "language_loss": 0.76920342, "learning_rate": 8.063540361001422e-07, "loss": 0.79415762, "num_input_tokens_seen": 127369750, "step": 5925, "time_per_iteration": 2.822328805923462 }, { "auxiliary_loss_clip": 0.012909, "auxiliary_loss_mlp": 0.01193239, "balance_loss_clip": 1.00695181, "balance_loss_mlp": 1.00021577, "epoch": 0.7125593699272531, "flos": 17603744453280.0, "grad_norm": 1.9333002378348094, "language_loss": 0.79505229, "learning_rate": 8.057291008859069e-07, "loss": 0.81989366, "num_input_tokens_seen": 127387910, "step": 5926, "time_per_iteration": 2.7543060779571533 }, { "auxiliary_loss_clip": 0.01336816, "auxiliary_loss_mlp": 0.01193191, "balance_loss_clip": 1.00794613, "balance_loss_mlp": 1.00016785, "epoch": 0.7126796128178922, "flos": 28654069128480.0, "grad_norm": 1.8930242437073677, "language_loss": 0.68253589, "learning_rate": 8.051043468416187e-07, "loss": 0.70783603, "num_input_tokens_seen": 127409160, "step": 5927, "time_per_iteration": 2.7791831493377686 }, { "auxiliary_loss_clip": 0.01349739, "auxiliary_loss_mlp": 0.01193193, "balance_loss_clip": 1.00829363, "balance_loss_mlp": 1.00016952, "epoch": 0.7127998557085312, "flos": 16034447629440.0, "grad_norm": 1.925344045996432, "language_loss": 0.81787479, "learning_rate": 8.044797740620506e-07, "loss": 0.8433041, "num_input_tokens_seen": 127427765, "step": 5928, "time_per_iteration": 2.6745238304138184 }, { "auxiliary_loss_clip": 0.01270639, "auxiliary_loss_mlp": 0.01193352, "balance_loss_clip": 1.0072397, "balance_loss_mlp": 1.00023341, "epoch": 0.7129200985991703, "flos": 23403282599520.0, "grad_norm": 1.9953569450550148, "language_loss": 0.78851163, "learning_rate": 8.038553826419494e-07, "loss": 0.81315154, "num_input_tokens_seen": 127446475, "step": 5929, "time_per_iteration": 2.781811475753784 }, { "auxiliary_loss_clip": 0.01349496, "auxiliary_loss_mlp": 0.01193227, "balance_loss_clip": 1.00784254, "balance_loss_mlp": 1.00020313, "epoch": 0.7130403414898094, "flos": 21397456031040.0, "grad_norm": 1.4864249610070475, "language_loss": 0.8102926, "learning_rate": 8.032311726760364e-07, "loss": 0.83571988, "num_input_tokens_seen": 127467695, "step": 5930, "time_per_iteration": 2.7831058502197266 }, { "auxiliary_loss_clip": 0.01292124, "auxiliary_loss_mlp": 0.01193386, "balance_loss_clip": 1.00708461, "balance_loss_mlp": 1.00017154, "epoch": 0.7131605843804485, "flos": 74739070798560.0, "grad_norm": 1.673329147246764, "language_loss": 0.68831623, "learning_rate": 8.026071442590022e-07, "loss": 0.71317124, "num_input_tokens_seen": 127494590, "step": 5931, "time_per_iteration": 3.1628365516662598 }, { "auxiliary_loss_clip": 0.0132627, "auxiliary_loss_mlp": 0.01193245, "balance_loss_clip": 1.00808728, "balance_loss_mlp": 1.00022089, "epoch": 0.7132808272710875, "flos": 18368749216800.0, "grad_norm": 1.9194412520539441, "language_loss": 0.80537254, "learning_rate": 8.019832974855134e-07, "loss": 0.83056772, "num_input_tokens_seen": 127512550, "step": 5932, "time_per_iteration": 2.6910829544067383 }, { "auxiliary_loss_clip": 0.01292991, "auxiliary_loss_mlp": 0.01193327, "balance_loss_clip": 1.00745535, "balance_loss_mlp": 1.00020742, "epoch": 0.7134010701617267, "flos": 23253389995680.0, "grad_norm": 2.224099555916729, "language_loss": 0.82780194, "learning_rate": 8.013596324502052e-07, "loss": 0.85266507, "num_input_tokens_seen": 127531015, "step": 5933, "time_per_iteration": 2.7469828128814697 }, { "auxiliary_loss_clip": 0.01324106, "auxiliary_loss_mlp": 0.01193214, "balance_loss_clip": 1.00768614, "balance_loss_mlp": 1.0001905, "epoch": 0.7135213130523658, "flos": 23653147101120.0, "grad_norm": 1.6289470591159858, "language_loss": 0.78877938, "learning_rate": 8.007361492476872e-07, "loss": 0.81395257, "num_input_tokens_seen": 127550340, "step": 5934, "time_per_iteration": 2.7577335834503174 }, { "auxiliary_loss_clip": 0.01303657, "auxiliary_loss_mlp": 0.01193381, "balance_loss_clip": 1.00719893, "balance_loss_mlp": 1.00016654, "epoch": 0.7136415559430048, "flos": 24790656106080.0, "grad_norm": 1.44937654378456, "language_loss": 0.78941941, "learning_rate": 8.001128479725426e-07, "loss": 0.81438982, "num_input_tokens_seen": 127572245, "step": 5935, "time_per_iteration": 2.8276896476745605 }, { "auxiliary_loss_clip": 0.0128112, "auxiliary_loss_mlp": 0.01193295, "balance_loss_clip": 1.00727248, "balance_loss_mlp": 1.00017571, "epoch": 0.713761798833644, "flos": 18296964198720.0, "grad_norm": 1.597356417681011, "language_loss": 0.81089997, "learning_rate": 7.994897287193248e-07, "loss": 0.83564413, "num_input_tokens_seen": 127591625, "step": 5936, "time_per_iteration": 4.893023490905762 }, { "auxiliary_loss_clip": 0.01336658, "auxiliary_loss_mlp": 0.01193404, "balance_loss_clip": 1.00796175, "balance_loss_mlp": 1.00028539, "epoch": 0.713882041724283, "flos": 15558271351200.0, "grad_norm": 2.303256388696851, "language_loss": 0.83315289, "learning_rate": 7.988667915825605e-07, "loss": 0.85845339, "num_input_tokens_seen": 127608690, "step": 5937, "time_per_iteration": 3.6209635734558105 }, { "auxiliary_loss_clip": 0.01314543, "auxiliary_loss_mlp": 0.01193487, "balance_loss_clip": 1.00792265, "balance_loss_mlp": 1.00017762, "epoch": 0.7140022846149221, "flos": 24061022958240.0, "grad_norm": 1.9479278881847002, "language_loss": 0.75484794, "learning_rate": 7.982440366567491e-07, "loss": 0.77992821, "num_input_tokens_seen": 127627180, "step": 5938, "time_per_iteration": 2.7476840019226074 }, { "auxiliary_loss_clip": 0.0133791, "auxiliary_loss_mlp": 0.01193255, "balance_loss_clip": 1.00808811, "balance_loss_mlp": 1.00023103, "epoch": 0.7141225275055613, "flos": 27891722717280.0, "grad_norm": 1.5520003399515114, "language_loss": 0.75000131, "learning_rate": 7.97621464036361e-07, "loss": 0.77531302, "num_input_tokens_seen": 127648940, "step": 5939, "time_per_iteration": 3.7529098987579346 }, { "auxiliary_loss_clip": 0.01332244, "auxiliary_loss_mlp": 0.01193216, "balance_loss_clip": 1.00734079, "balance_loss_mlp": 1.00019217, "epoch": 0.7142427703962003, "flos": 19682613368640.0, "grad_norm": 1.4925050912824007, "language_loss": 0.68141508, "learning_rate": 7.969990738158417e-07, "loss": 0.70666969, "num_input_tokens_seen": 127667350, "step": 5940, "time_per_iteration": 2.6881251335144043 }, { "auxiliary_loss_clip": 0.01327136, "auxiliary_loss_mlp": 0.01193187, "balance_loss_clip": 1.00759792, "balance_loss_mlp": 1.00016308, "epoch": 0.7143630132868394, "flos": 21032387991360.0, "grad_norm": 2.0253853103298516, "language_loss": 0.85091633, "learning_rate": 7.963768660896062e-07, "loss": 0.87611961, "num_input_tokens_seen": 127685760, "step": 5941, "time_per_iteration": 2.7199275493621826 }, { "auxiliary_loss_clip": 0.01337285, "auxiliary_loss_mlp": 0.01193324, "balance_loss_clip": 1.00829911, "balance_loss_mlp": 1.0002048, "epoch": 0.7144832561774785, "flos": 24129934081920.0, "grad_norm": 1.9186175394147584, "language_loss": 0.82341033, "learning_rate": 7.957548409520432e-07, "loss": 0.84871638, "num_input_tokens_seen": 127704985, "step": 5942, "time_per_iteration": 2.7260982990264893 }, { "auxiliary_loss_clip": 0.01294689, "auxiliary_loss_mlp": 0.0119328, "balance_loss_clip": 1.00701928, "balance_loss_mlp": 1.00016069, "epoch": 0.7146034990681176, "flos": 16325826696000.0, "grad_norm": 1.802297154021018, "language_loss": 0.84195268, "learning_rate": 7.951329984975135e-07, "loss": 0.86683238, "num_input_tokens_seen": 127721925, "step": 5943, "time_per_iteration": 2.769667625427246 }, { "auxiliary_loss_clip": 0.01266537, "auxiliary_loss_mlp": 0.01192295, "balance_loss_clip": 1.00507879, "balance_loss_mlp": 1.00003397, "epoch": 0.7147237419587567, "flos": 69627197436480.0, "grad_norm": 0.7136895690910567, "language_loss": 0.54318583, "learning_rate": 7.94511338820349e-07, "loss": 0.56777418, "num_input_tokens_seen": 127784230, "step": 5944, "time_per_iteration": 3.326786518096924 }, { "auxiliary_loss_clip": 0.01302866, "auxiliary_loss_mlp": 0.00872585, "balance_loss_clip": 1.00690579, "balance_loss_mlp": 1.00030208, "epoch": 0.7148439848493958, "flos": 22266815381280.0, "grad_norm": 1.8299768566929615, "language_loss": 0.78003001, "learning_rate": 7.938898620148575e-07, "loss": 0.80178452, "num_input_tokens_seen": 127801990, "step": 5945, "time_per_iteration": 2.795381546020508 }, { "auxiliary_loss_clip": 0.01303793, "auxiliary_loss_mlp": 0.01193239, "balance_loss_clip": 1.00670886, "balance_loss_mlp": 1.00021482, "epoch": 0.7149642277400349, "flos": 17931393227520.0, "grad_norm": 1.7700806518546657, "language_loss": 0.70971453, "learning_rate": 7.932685681753135e-07, "loss": 0.73468494, "num_input_tokens_seen": 127819270, "step": 5946, "time_per_iteration": 2.6860761642456055 }, { "auxiliary_loss_clip": 0.01349313, "auxiliary_loss_mlp": 0.01193213, "balance_loss_clip": 1.00839937, "balance_loss_mlp": 1.00018978, "epoch": 0.7150844706306739, "flos": 31681949698080.0, "grad_norm": 1.77966051004971, "language_loss": 0.62845576, "learning_rate": 7.92647457395969e-07, "loss": 0.65388095, "num_input_tokens_seen": 127841095, "step": 5947, "time_per_iteration": 2.7993476390838623 }, { "auxiliary_loss_clip": 0.01287855, "auxiliary_loss_mlp": 0.01193216, "balance_loss_clip": 1.00788212, "balance_loss_mlp": 1.00019288, "epoch": 0.7152047135213131, "flos": 10926225273600.0, "grad_norm": 2.3723803263220327, "language_loss": 0.73987257, "learning_rate": 7.920265297710444e-07, "loss": 0.76468331, "num_input_tokens_seen": 127858485, "step": 5948, "time_per_iteration": 2.8210136890411377 }, { "auxiliary_loss_clip": 0.01331772, "auxiliary_loss_mlp": 0.01193201, "balance_loss_clip": 1.00754929, "balance_loss_mlp": 1.0001775, "epoch": 0.7153249564119522, "flos": 20995651275840.0, "grad_norm": 1.7543790521033558, "language_loss": 0.73220378, "learning_rate": 7.914057853947363e-07, "loss": 0.7574535, "num_input_tokens_seen": 127877665, "step": 5949, "time_per_iteration": 2.7309210300445557 }, { "auxiliary_loss_clip": 0.01302918, "auxiliary_loss_mlp": 0.01193288, "balance_loss_clip": 1.00832736, "balance_loss_mlp": 1.0001688, "epoch": 0.7154451993025912, "flos": 24243125893920.0, "grad_norm": 1.6338906943525189, "language_loss": 0.62797856, "learning_rate": 7.907852243612089e-07, "loss": 0.65294063, "num_input_tokens_seen": 127898070, "step": 5950, "time_per_iteration": 2.8731725215911865 }, { "auxiliary_loss_clip": 0.01314823, "auxiliary_loss_mlp": 0.01193196, "balance_loss_clip": 1.00771499, "balance_loss_mlp": 1.00017262, "epoch": 0.7155654421932304, "flos": 23330958726240.0, "grad_norm": 1.7349940331844633, "language_loss": 0.72344601, "learning_rate": 7.901648467646009e-07, "loss": 0.74852616, "num_input_tokens_seen": 127917010, "step": 5951, "time_per_iteration": 2.78918194770813 }, { "auxiliary_loss_clip": 0.01350268, "auxiliary_loss_mlp": 0.01193312, "balance_loss_clip": 1.00830674, "balance_loss_mlp": 1.00019336, "epoch": 0.7156856850838694, "flos": 22711894961760.0, "grad_norm": 1.606588024067959, "language_loss": 0.72533786, "learning_rate": 7.895446526990244e-07, "loss": 0.75077361, "num_input_tokens_seen": 127937025, "step": 5952, "time_per_iteration": 2.71016263961792 }, { "auxiliary_loss_clip": 0.01287563, "auxiliary_loss_mlp": 0.01193431, "balance_loss_clip": 1.00734568, "balance_loss_mlp": 1.00031185, "epoch": 0.7158059279745085, "flos": 19865434777920.0, "grad_norm": 2.0985626984263344, "language_loss": 0.75728863, "learning_rate": 7.889246422585609e-07, "loss": 0.78209859, "num_input_tokens_seen": 127956410, "step": 5953, "time_per_iteration": 2.773754835128784 }, { "auxiliary_loss_clip": 0.01349968, "auxiliary_loss_mlp": 0.01193617, "balance_loss_clip": 1.00834227, "balance_loss_mlp": 1.0002116, "epoch": 0.7159261708651476, "flos": 24134783778720.0, "grad_norm": 1.9235218896085238, "language_loss": 0.73372597, "learning_rate": 7.883048155372675e-07, "loss": 0.75916183, "num_input_tokens_seen": 127974925, "step": 5954, "time_per_iteration": 2.752547264099121 }, { "auxiliary_loss_clip": 0.01317089, "auxiliary_loss_mlp": 0.01193187, "balance_loss_clip": 1.00794148, "balance_loss_mlp": 1.00016308, "epoch": 0.7160464137557867, "flos": 16983207817920.0, "grad_norm": 2.4396291954016327, "language_loss": 0.71563202, "learning_rate": 7.876851726291698e-07, "loss": 0.74073482, "num_input_tokens_seen": 127993225, "step": 5955, "time_per_iteration": 2.680819272994995 }, { "auxiliary_loss_clip": 0.01306133, "auxiliary_loss_mlp": 0.01193171, "balance_loss_clip": 1.00747299, "balance_loss_mlp": 1.00014758, "epoch": 0.7161666566464258, "flos": 25228263561120.0, "grad_norm": 1.7250269101660978, "language_loss": 0.782184, "learning_rate": 7.870657136282666e-07, "loss": 0.80717707, "num_input_tokens_seen": 128012085, "step": 5956, "time_per_iteration": 2.823500156402588 }, { "auxiliary_loss_clip": 0.01337657, "auxiliary_loss_mlp": 0.01193237, "balance_loss_clip": 1.00833702, "balance_loss_mlp": 1.00021315, "epoch": 0.7162868995370649, "flos": 26468402816160.0, "grad_norm": 1.6177485009792532, "language_loss": 0.81771421, "learning_rate": 7.86446438628531e-07, "loss": 0.84302312, "num_input_tokens_seen": 128033155, "step": 5957, "time_per_iteration": 2.7586185932159424 }, { "auxiliary_loss_clip": 0.01319163, "auxiliary_loss_mlp": 0.01192272, "balance_loss_clip": 1.00500178, "balance_loss_mlp": 1.00001132, "epoch": 0.716407142427704, "flos": 69998947280640.0, "grad_norm": 0.7663422234876908, "language_loss": 0.56931305, "learning_rate": 7.858273477239059e-07, "loss": 0.59442735, "num_input_tokens_seen": 128101575, "step": 5958, "time_per_iteration": 3.215607166290283 }, { "auxiliary_loss_clip": 0.01277409, "auxiliary_loss_mlp": 0.01193371, "balance_loss_clip": 1.00733018, "balance_loss_mlp": 1.00025213, "epoch": 0.716527385318343, "flos": 20740470069600.0, "grad_norm": 2.5750300349683557, "language_loss": 0.70931524, "learning_rate": 7.852084410083067e-07, "loss": 0.73402297, "num_input_tokens_seen": 128120395, "step": 5959, "time_per_iteration": 2.7913496494293213 }, { "auxiliary_loss_clip": 0.01302148, "auxiliary_loss_mlp": 0.01193226, "balance_loss_clip": 1.00717938, "balance_loss_mlp": 1.00020218, "epoch": 0.7166476282089821, "flos": 25371977292000.0, "grad_norm": 1.6391914980472282, "language_loss": 0.63453871, "learning_rate": 7.84589718575621e-07, "loss": 0.65949243, "num_input_tokens_seen": 128140840, "step": 5960, "time_per_iteration": 2.741934299468994 }, { "auxiliary_loss_clip": 0.01326324, "auxiliary_loss_mlp": 0.01193162, "balance_loss_clip": 1.00792336, "balance_loss_mlp": 1.00013876, "epoch": 0.7167678710996213, "flos": 24133741992000.0, "grad_norm": 1.8367600753907827, "language_loss": 0.69001794, "learning_rate": 7.83971180519708e-07, "loss": 0.71521282, "num_input_tokens_seen": 128159695, "step": 5961, "time_per_iteration": 3.7949464321136475 }, { "auxiliary_loss_clip": 0.0135052, "auxiliary_loss_mlp": 0.0119345, "balance_loss_clip": 1.00834358, "balance_loss_mlp": 1.00023615, "epoch": 0.7168881139902603, "flos": 30226599083520.0, "grad_norm": 1.8948105914273048, "language_loss": 0.75608909, "learning_rate": 7.833528269344008e-07, "loss": 0.78152883, "num_input_tokens_seen": 128179600, "step": 5962, "time_per_iteration": 3.6670215129852295 }, { "auxiliary_loss_clip": 0.01301764, "auxiliary_loss_mlp": 0.01193489, "balance_loss_clip": 1.00877357, "balance_loss_mlp": 1.0002749, "epoch": 0.7170083568808994, "flos": 14606421726240.0, "grad_norm": 2.182424170565089, "language_loss": 0.77808475, "learning_rate": 7.827346579135023e-07, "loss": 0.80303735, "num_input_tokens_seen": 128196940, "step": 5963, "time_per_iteration": 2.704866886138916 }, { "auxiliary_loss_clip": 0.01313187, "auxiliary_loss_mlp": 0.01193365, "balance_loss_clip": 1.0075531, "balance_loss_mlp": 1.00024617, "epoch": 0.7171285997715385, "flos": 23331102420960.0, "grad_norm": 1.8480506736422466, "language_loss": 0.8301872, "learning_rate": 7.821166735507885e-07, "loss": 0.85525268, "num_input_tokens_seen": 128215970, "step": 5964, "time_per_iteration": 3.683281660079956 }, { "auxiliary_loss_clip": 0.01349523, "auxiliary_loss_mlp": 0.01193228, "balance_loss_clip": 1.00820088, "balance_loss_mlp": 1.00020468, "epoch": 0.7172488426621776, "flos": 16543552713120.0, "grad_norm": 1.5243287580149552, "language_loss": 0.68458712, "learning_rate": 7.81498873940007e-07, "loss": 0.71001458, "num_input_tokens_seen": 128233185, "step": 5965, "time_per_iteration": 3.6449992656707764 }, { "auxiliary_loss_clip": 0.0133861, "auxiliary_loss_mlp": 0.01193327, "balance_loss_clip": 1.00789857, "balance_loss_mlp": 1.00020754, "epoch": 0.7173690855528166, "flos": 26541624781440.0, "grad_norm": 1.971692477095041, "language_loss": 0.77056897, "learning_rate": 7.808812591748768e-07, "loss": 0.79588836, "num_input_tokens_seen": 128253565, "step": 5966, "time_per_iteration": 2.729357957839966 }, { "auxiliary_loss_clip": 0.01290125, "auxiliary_loss_mlp": 0.01193424, "balance_loss_clip": 1.00710201, "balance_loss_mlp": 1.00020981, "epoch": 0.7174893284434558, "flos": 22784111064000.0, "grad_norm": 1.8126763089495024, "language_loss": 0.64870876, "learning_rate": 7.802638293490915e-07, "loss": 0.67354429, "num_input_tokens_seen": 128273210, "step": 5967, "time_per_iteration": 2.7896037101745605 }, { "auxiliary_loss_clip": 0.01325067, "auxiliary_loss_mlp": 0.01193204, "balance_loss_clip": 1.00852275, "balance_loss_mlp": 1.00018072, "epoch": 0.7176095713340949, "flos": 23293575384480.0, "grad_norm": 1.4878357347119908, "language_loss": 0.76593524, "learning_rate": 7.796465845563123e-07, "loss": 0.79111791, "num_input_tokens_seen": 128292085, "step": 5968, "time_per_iteration": 2.729034423828125 }, { "auxiliary_loss_clip": 0.01311369, "auxiliary_loss_mlp": 0.00872428, "balance_loss_clip": 1.00765157, "balance_loss_mlp": 1.00035751, "epoch": 0.7177298142247339, "flos": 25591643187840.0, "grad_norm": 2.567966874416152, "language_loss": 0.79224491, "learning_rate": 7.790295248901766e-07, "loss": 0.81408286, "num_input_tokens_seen": 128313215, "step": 5969, "time_per_iteration": 2.807927370071411 }, { "auxiliary_loss_clip": 0.01337292, "auxiliary_loss_mlp": 0.01193217, "balance_loss_clip": 1.00826275, "balance_loss_mlp": 1.00019372, "epoch": 0.7178500571153731, "flos": 31652792879040.0, "grad_norm": 1.6809366535970232, "language_loss": 0.62437606, "learning_rate": 7.784126504442902e-07, "loss": 0.64968121, "num_input_tokens_seen": 128336445, "step": 5970, "time_per_iteration": 2.7941744327545166 }, { "auxiliary_loss_clip": 0.01290148, "auxiliary_loss_mlp": 0.01193219, "balance_loss_clip": 1.0069468, "balance_loss_mlp": 1.00019574, "epoch": 0.7179703000060121, "flos": 19427252544000.0, "grad_norm": 1.2954890356324897, "language_loss": 0.67838472, "learning_rate": 7.777959613122351e-07, "loss": 0.70321834, "num_input_tokens_seen": 128356270, "step": 5971, "time_per_iteration": 2.7454447746276855 }, { "auxiliary_loss_clip": 0.01298247, "auxiliary_loss_mlp": 0.01193231, "balance_loss_clip": 1.00703073, "balance_loss_mlp": 1.00020719, "epoch": 0.7180905428966512, "flos": 28839261500640.0, "grad_norm": 1.5538865772582815, "language_loss": 0.77945238, "learning_rate": 7.771794575875604e-07, "loss": 0.80436713, "num_input_tokens_seen": 128378140, "step": 5972, "time_per_iteration": 2.8947887420654297 }, { "auxiliary_loss_clip": 0.01327012, "auxiliary_loss_mlp": 0.01193482, "balance_loss_clip": 1.0081836, "balance_loss_mlp": 1.00026798, "epoch": 0.7182107857872904, "flos": 20047573637280.0, "grad_norm": 2.104480180650091, "language_loss": 0.77783346, "learning_rate": 7.765631393637888e-07, "loss": 0.80303836, "num_input_tokens_seen": 128396335, "step": 5973, "time_per_iteration": 2.7332234382629395 }, { "auxiliary_loss_clip": 0.01338, "auxiliary_loss_mlp": 0.01193207, "balance_loss_clip": 1.00814569, "balance_loss_mlp": 1.00018311, "epoch": 0.7183310286779294, "flos": 22747697661600.0, "grad_norm": 3.051375754163776, "language_loss": 0.49234495, "learning_rate": 7.75947006734417e-07, "loss": 0.51765704, "num_input_tokens_seen": 128414115, "step": 5974, "time_per_iteration": 2.756969690322876 }, { "auxiliary_loss_clip": 0.01350878, "auxiliary_loss_mlp": 0.01193301, "balance_loss_clip": 1.0082401, "balance_loss_mlp": 1.00018191, "epoch": 0.7184512715685685, "flos": 17158269712320.0, "grad_norm": 2.1279135185247493, "language_loss": 0.83012944, "learning_rate": 7.753310597929101e-07, "loss": 0.85557121, "num_input_tokens_seen": 128430755, "step": 5975, "time_per_iteration": 2.645233631134033 }, { "auxiliary_loss_clip": 0.01319329, "auxiliary_loss_mlp": 0.01192282, "balance_loss_clip": 1.00494194, "balance_loss_mlp": 1.00002122, "epoch": 0.7185715144592076, "flos": 65509644994560.0, "grad_norm": 0.7558718610498488, "language_loss": 0.55233473, "learning_rate": 7.747152986327095e-07, "loss": 0.57745087, "num_input_tokens_seen": 128491300, "step": 5976, "time_per_iteration": 3.1714844703674316 }, { "auxiliary_loss_clip": 0.01281761, "auxiliary_loss_mlp": 0.01193218, "balance_loss_clip": 1.00736713, "balance_loss_mlp": 1.00019431, "epoch": 0.7186917573498467, "flos": 16180532323200.0, "grad_norm": 1.7656135572868872, "language_loss": 0.67427415, "learning_rate": 7.740997233472228e-07, "loss": 0.69902396, "num_input_tokens_seen": 128508920, "step": 5977, "time_per_iteration": 2.8109500408172607 }, { "auxiliary_loss_clip": 0.01313598, "auxiliary_loss_mlp": 0.01193206, "balance_loss_clip": 1.00733805, "balance_loss_mlp": 1.00018239, "epoch": 0.7188120002404857, "flos": 29242287660960.0, "grad_norm": 1.9356747723735146, "language_loss": 0.70617753, "learning_rate": 7.734843340298329e-07, "loss": 0.73124564, "num_input_tokens_seen": 128528745, "step": 5978, "time_per_iteration": 2.8066656589508057 }, { "auxiliary_loss_clip": 0.01317983, "auxiliary_loss_mlp": 0.01193304, "balance_loss_clip": 1.0077728, "balance_loss_mlp": 1.00018513, "epoch": 0.7189322431311249, "flos": 33401175049440.0, "grad_norm": 1.8709413124107845, "language_loss": 0.74985909, "learning_rate": 7.72869130773895e-07, "loss": 0.77497196, "num_input_tokens_seen": 128549345, "step": 5979, "time_per_iteration": 2.900500535964966 }, { "auxiliary_loss_clip": 0.0130713, "auxiliary_loss_mlp": 0.01192293, "balance_loss_clip": 1.00500453, "balance_loss_mlp": 1.00003195, "epoch": 0.719052486021764, "flos": 61351296461280.0, "grad_norm": 0.7865177987125476, "language_loss": 0.59391379, "learning_rate": 7.722541136727343e-07, "loss": 0.61890805, "num_input_tokens_seen": 128605360, "step": 5980, "time_per_iteration": 3.139307737350464 }, { "auxiliary_loss_clip": 0.01326802, "auxiliary_loss_mlp": 0.01193344, "balance_loss_clip": 1.00722694, "balance_loss_mlp": 1.00022542, "epoch": 0.719172728912403, "flos": 15596804250720.0, "grad_norm": 1.8440260876178418, "language_loss": 0.80345136, "learning_rate": 7.716392828196483e-07, "loss": 0.8286528, "num_input_tokens_seen": 128623160, "step": 5981, "time_per_iteration": 2.7106008529663086 }, { "auxiliary_loss_clip": 0.01326463, "auxiliary_loss_mlp": 0.01193438, "balance_loss_clip": 1.00733352, "balance_loss_mlp": 1.0002234, "epoch": 0.7192929718030422, "flos": 15553170188640.0, "grad_norm": 3.257801110126449, "language_loss": 0.7734189, "learning_rate": 7.710246383079064e-07, "loss": 0.79861796, "num_input_tokens_seen": 128638545, "step": 5982, "time_per_iteration": 2.6477606296539307 }, { "auxiliary_loss_clip": 0.01326094, "auxiliary_loss_mlp": 0.01193252, "balance_loss_clip": 1.008286, "balance_loss_mlp": 1.0002284, "epoch": 0.7194132146936812, "flos": 21862495968480.0, "grad_norm": 2.478095267039711, "language_loss": 0.91771388, "learning_rate": 7.704101802307492e-07, "loss": 0.94290733, "num_input_tokens_seen": 128650845, "step": 5983, "time_per_iteration": 2.8133561611175537 }, { "auxiliary_loss_clip": 0.01301117, "auxiliary_loss_mlp": 0.01193371, "balance_loss_clip": 1.00797212, "balance_loss_mlp": 1.00025189, "epoch": 0.7195334575843203, "flos": 27338911724160.0, "grad_norm": 1.9492483371673863, "language_loss": 0.87335175, "learning_rate": 7.697959086813912e-07, "loss": 0.89829665, "num_input_tokens_seen": 128667010, "step": 5984, "time_per_iteration": 2.781947374343872 }, { "auxiliary_loss_clip": 0.01302329, "auxiliary_loss_mlp": 0.01193197, "balance_loss_clip": 1.00767589, "balance_loss_mlp": 1.00017285, "epoch": 0.7196537004749595, "flos": 18770625819360.0, "grad_norm": 1.537381813869888, "language_loss": 0.80149245, "learning_rate": 7.691818237530145e-07, "loss": 0.82644773, "num_input_tokens_seen": 128685870, "step": 5985, "time_per_iteration": 2.7923121452331543 }, { "auxiliary_loss_clip": 0.01295563, "auxiliary_loss_mlp": 0.01193364, "balance_loss_clip": 1.00845063, "balance_loss_mlp": 1.00024533, "epoch": 0.7197739433655985, "flos": 24531020363520.0, "grad_norm": 1.7286916651494717, "language_loss": 0.77574271, "learning_rate": 7.685679255387774e-07, "loss": 0.800632, "num_input_tokens_seen": 128704185, "step": 5986, "time_per_iteration": 2.952754020690918 }, { "auxiliary_loss_clip": 0.01301953, "auxiliary_loss_mlp": 0.01193244, "balance_loss_clip": 1.00678921, "balance_loss_mlp": 1.00022054, "epoch": 0.7198941862562376, "flos": 18040597511040.0, "grad_norm": 1.9254457643428, "language_loss": 0.76502216, "learning_rate": 7.679542141318065e-07, "loss": 0.78997409, "num_input_tokens_seen": 128721290, "step": 5987, "time_per_iteration": 3.8646116256713867 }, { "auxiliary_loss_clip": 0.01326312, "auxiliary_loss_mlp": 0.01193203, "balance_loss_clip": 1.00808132, "balance_loss_mlp": 1.00017929, "epoch": 0.7200144291468767, "flos": 29022406223040.0, "grad_norm": 1.6597008065852041, "language_loss": 0.75678289, "learning_rate": 7.673406896252013e-07, "loss": 0.78197801, "num_input_tokens_seen": 128742665, "step": 5988, "time_per_iteration": 3.7296302318573 }, { "auxiliary_loss_clip": 0.01305195, "auxiliary_loss_mlp": 0.01193264, "balance_loss_clip": 1.00771761, "balance_loss_mlp": 1.00024068, "epoch": 0.7201346720375158, "flos": 25374276407520.0, "grad_norm": 1.483787694680649, "language_loss": 0.78341842, "learning_rate": 7.667273521120347e-07, "loss": 0.80840302, "num_input_tokens_seen": 128762225, "step": 5989, "time_per_iteration": 3.7750189304351807 }, { "auxiliary_loss_clip": 0.01294109, "auxiliary_loss_mlp": 0.01193151, "balance_loss_clip": 1.00695324, "balance_loss_mlp": 1.00012708, "epoch": 0.7202549149281549, "flos": 14355623208960.0, "grad_norm": 2.3552252100991145, "language_loss": 0.79915977, "learning_rate": 7.661142016853468e-07, "loss": 0.82403237, "num_input_tokens_seen": 128779585, "step": 5990, "time_per_iteration": 2.720400333404541 }, { "auxiliary_loss_clip": 0.01279549, "auxiliary_loss_mlp": 0.01193282, "balance_loss_clip": 1.00692177, "balance_loss_mlp": 1.00025797, "epoch": 0.7203751578187939, "flos": 23001693386400.0, "grad_norm": 1.9863457163142924, "language_loss": 0.74962825, "learning_rate": 7.655012384381543e-07, "loss": 0.77435654, "num_input_tokens_seen": 128799070, "step": 5991, "time_per_iteration": 3.77948260307312 }, { "auxiliary_loss_clip": 0.01300507, "auxiliary_loss_mlp": 0.01193208, "balance_loss_clip": 1.00751519, "balance_loss_mlp": 1.00018406, "epoch": 0.7204954007094331, "flos": 23692434397920.0, "grad_norm": 2.670039373104597, "language_loss": 0.81776559, "learning_rate": 7.648884624634415e-07, "loss": 0.84270269, "num_input_tokens_seen": 128817620, "step": 5992, "time_per_iteration": 2.755837917327881 }, { "auxiliary_loss_clip": 0.01326014, "auxiliary_loss_mlp": 0.01193174, "balance_loss_clip": 1.00778747, "balance_loss_mlp": 1.0001502, "epoch": 0.7206156436000721, "flos": 16253035814880.0, "grad_norm": 2.2167430345610306, "language_loss": 0.88754451, "learning_rate": 7.642758738541683e-07, "loss": 0.91273636, "num_input_tokens_seen": 128834200, "step": 5993, "time_per_iteration": 2.676687240600586 }, { "auxiliary_loss_clip": 0.01302046, "auxiliary_loss_mlp": 0.0119305, "balance_loss_clip": 1.0050292, "balance_loss_mlp": 1.00002623, "epoch": 0.7207358864907112, "flos": 54377835907680.0, "grad_norm": 0.7590061121955803, "language_loss": 0.60763615, "learning_rate": 7.636634727032621e-07, "loss": 0.63258713, "num_input_tokens_seen": 128891305, "step": 5994, "time_per_iteration": 3.1228365898132324 }, { "auxiliary_loss_clip": 0.01314303, "auxiliary_loss_mlp": 0.01193264, "balance_loss_clip": 1.00828886, "balance_loss_mlp": 1.0002408, "epoch": 0.7208561293813504, "flos": 19135550164320.0, "grad_norm": 1.857315460211605, "language_loss": 0.7838999, "learning_rate": 7.630512591036231e-07, "loss": 0.80897558, "num_input_tokens_seen": 128910615, "step": 5995, "time_per_iteration": 2.7093048095703125 }, { "auxiliary_loss_clip": 0.01329943, "auxiliary_loss_mlp": 0.01193286, "balance_loss_clip": 1.00808263, "balance_loss_mlp": 1.00016677, "epoch": 0.7209763722719894, "flos": 17748535894560.0, "grad_norm": 2.063993896946303, "language_loss": 0.64286447, "learning_rate": 7.624392331481255e-07, "loss": 0.66809678, "num_input_tokens_seen": 128928270, "step": 5996, "time_per_iteration": 2.743497610092163 }, { "auxiliary_loss_clip": 0.01300889, "auxiliary_loss_mlp": 0.01192271, "balance_loss_clip": 1.00504875, "balance_loss_mlp": 1.00001073, "epoch": 0.7210966151626285, "flos": 66819521617920.0, "grad_norm": 0.7465945883534449, "language_loss": 0.51906693, "learning_rate": 7.618273949296115e-07, "loss": 0.54399848, "num_input_tokens_seen": 128987780, "step": 5997, "time_per_iteration": 3.1590194702148438 }, { "auxiliary_loss_clip": 0.01316923, "auxiliary_loss_mlp": 0.0119324, "balance_loss_clip": 1.0079546, "balance_loss_mlp": 1.00021672, "epoch": 0.7212168580532676, "flos": 21141879664320.0, "grad_norm": 1.8472910855979738, "language_loss": 0.68743122, "learning_rate": 7.612157445408987e-07, "loss": 0.71253288, "num_input_tokens_seen": 129005590, "step": 5998, "time_per_iteration": 2.752431631088257 }, { "auxiliary_loss_clip": 0.01313602, "auxiliary_loss_mlp": 0.01193424, "balance_loss_clip": 1.00833941, "balance_loss_mlp": 1.00020945, "epoch": 0.7213371009439067, "flos": 22345749211680.0, "grad_norm": 1.989795960112999, "language_loss": 0.74240661, "learning_rate": 7.606042820747716e-07, "loss": 0.7674768, "num_input_tokens_seen": 129021995, "step": 5999, "time_per_iteration": 2.7735443115234375 }, { "auxiliary_loss_clip": 0.01313258, "auxiliary_loss_mlp": 0.01193361, "balance_loss_clip": 1.00729167, "balance_loss_mlp": 1.00024152, "epoch": 0.7214573438345457, "flos": 18515911620960.0, "grad_norm": 1.7368005569401466, "language_loss": 0.85303575, "learning_rate": 7.599930076239889e-07, "loss": 0.87810194, "num_input_tokens_seen": 129039280, "step": 6000, "time_per_iteration": 2.780299425125122 }, { "auxiliary_loss_clip": 0.0127547, "auxiliary_loss_mlp": 0.00872541, "balance_loss_clip": 1.0074991, "balance_loss_mlp": 1.00034904, "epoch": 0.7215775867251849, "flos": 35736123263040.0, "grad_norm": 1.919110036022092, "language_loss": 0.70741284, "learning_rate": 7.593819212812818e-07, "loss": 0.72889298, "num_input_tokens_seen": 129060860, "step": 6001, "time_per_iteration": 2.980241060256958 }, { "auxiliary_loss_clip": 0.01327942, "auxiliary_loss_mlp": 0.01193192, "balance_loss_clip": 1.00744665, "balance_loss_mlp": 1.00016832, "epoch": 0.721697829615824, "flos": 20372420364480.0, "grad_norm": 2.0430921934214696, "language_loss": 0.71752965, "learning_rate": 7.587710231393508e-07, "loss": 0.74274099, "num_input_tokens_seen": 129079215, "step": 6002, "time_per_iteration": 2.7136447429656982 }, { "auxiliary_loss_clip": 0.01236184, "auxiliary_loss_mlp": 0.01193321, "balance_loss_clip": 1.00598514, "balance_loss_mlp": 1.00020218, "epoch": 0.721818072506463, "flos": 20229820267680.0, "grad_norm": 2.299259522171416, "language_loss": 0.83405244, "learning_rate": 7.581603132908685e-07, "loss": 0.85834754, "num_input_tokens_seen": 129097185, "step": 6003, "time_per_iteration": 2.9142305850982666 }, { "auxiliary_loss_clip": 0.01295624, "auxiliary_loss_mlp": 0.01193247, "balance_loss_clip": 1.00730288, "balance_loss_mlp": 1.0002234, "epoch": 0.7219383153971022, "flos": 18186897746880.0, "grad_norm": 2.0047702065038164, "language_loss": 0.7850734, "learning_rate": 7.575497918284795e-07, "loss": 0.80996215, "num_input_tokens_seen": 129114730, "step": 6004, "time_per_iteration": 2.8435404300689697 }, { "auxiliary_loss_clip": 0.01350647, "auxiliary_loss_mlp": 0.0119346, "balance_loss_clip": 1.00770926, "balance_loss_mlp": 1.00024533, "epoch": 0.7220585582877412, "flos": 17342132908320.0, "grad_norm": 5.020239027487583, "language_loss": 0.74573588, "learning_rate": 7.569394588447984e-07, "loss": 0.77117693, "num_input_tokens_seen": 129131745, "step": 6005, "time_per_iteration": 2.6794607639312744 }, { "auxiliary_loss_clip": 0.01337765, "auxiliary_loss_mlp": 0.01193233, "balance_loss_clip": 1.00800788, "balance_loss_mlp": 1.00020933, "epoch": 0.7221788011783803, "flos": 16976346395040.0, "grad_norm": 2.3442747462811426, "language_loss": 0.77926338, "learning_rate": 7.563293144324146e-07, "loss": 0.8045733, "num_input_tokens_seen": 129147295, "step": 6006, "time_per_iteration": 2.771399974822998 }, { "auxiliary_loss_clip": 0.01349854, "auxiliary_loss_mlp": 0.0119317, "balance_loss_clip": 1.00870407, "balance_loss_mlp": 1.00014663, "epoch": 0.7222990440690195, "flos": 26286371727840.0, "grad_norm": 1.8450820102215004, "language_loss": 0.80010676, "learning_rate": 7.557193586838834e-07, "loss": 0.82553697, "num_input_tokens_seen": 129162660, "step": 6007, "time_per_iteration": 2.6844229698181152 }, { "auxiliary_loss_clip": 0.01300299, "auxiliary_loss_mlp": 0.01193238, "balance_loss_clip": 1.00722051, "balance_loss_mlp": 1.0002141, "epoch": 0.7224192869596585, "flos": 17601696803520.0, "grad_norm": 2.165794132467626, "language_loss": 0.70501649, "learning_rate": 7.551095916917371e-07, "loss": 0.72995186, "num_input_tokens_seen": 129179990, "step": 6008, "time_per_iteration": 2.7832698822021484 }, { "auxiliary_loss_clip": 0.01293185, "auxiliary_loss_mlp": 0.01193624, "balance_loss_clip": 1.00804019, "balance_loss_mlp": 1.00031424, "epoch": 0.7225395298502976, "flos": 12932339231520.0, "grad_norm": 2.6966441919231, "language_loss": 0.66486275, "learning_rate": 7.545000135484758e-07, "loss": 0.68973076, "num_input_tokens_seen": 129197425, "step": 6009, "time_per_iteration": 2.7318496704101562 }, { "auxiliary_loss_clip": 0.01349654, "auxiliary_loss_mlp": 0.00872591, "balance_loss_clip": 1.0080775, "balance_loss_mlp": 1.00033307, "epoch": 0.7226597727409367, "flos": 29643912797760.0, "grad_norm": 2.003705411434258, "language_loss": 0.62796283, "learning_rate": 7.538906243465714e-07, "loss": 0.65018529, "num_input_tokens_seen": 129217560, "step": 6010, "time_per_iteration": 2.794658660888672 }, { "auxiliary_loss_clip": 0.01350778, "auxiliary_loss_mlp": 0.01193362, "balance_loss_clip": 1.00842881, "balance_loss_mlp": 1.00024343, "epoch": 0.7227800156315758, "flos": 13771643670720.0, "grad_norm": 1.9708059198751902, "language_loss": 0.78919703, "learning_rate": 7.5328142417847e-07, "loss": 0.8146385, "num_input_tokens_seen": 129234325, "step": 6011, "time_per_iteration": 2.648341417312622 }, { "auxiliary_loss_clip": 0.01336392, "auxiliary_loss_mlp": 0.01193192, "balance_loss_clip": 1.0076679, "balance_loss_mlp": 1.00016785, "epoch": 0.7229002585222148, "flos": 20301892675200.0, "grad_norm": 1.8366260297601058, "language_loss": 0.691733, "learning_rate": 7.526724131365838e-07, "loss": 0.71702886, "num_input_tokens_seen": 129255280, "step": 6012, "time_per_iteration": 2.8758788108825684 }, { "auxiliary_loss_clip": 0.01301174, "auxiliary_loss_mlp": 0.01193377, "balance_loss_clip": 1.00765514, "balance_loss_mlp": 1.00025821, "epoch": 0.723020501412854, "flos": 16581259368000.0, "grad_norm": 1.6854957495924598, "language_loss": 0.70321321, "learning_rate": 7.520635913133017e-07, "loss": 0.72815871, "num_input_tokens_seen": 129273910, "step": 6013, "time_per_iteration": 3.708458662033081 }, { "auxiliary_loss_clip": 0.0133122, "auxiliary_loss_mlp": 0.01193265, "balance_loss_clip": 1.00751972, "balance_loss_mlp": 1.00024092, "epoch": 0.7231407443034931, "flos": 28548313518240.0, "grad_norm": 1.7461639163059453, "language_loss": 0.82099491, "learning_rate": 7.514549588009798e-07, "loss": 0.84623969, "num_input_tokens_seen": 129294785, "step": 6014, "time_per_iteration": 2.7519009113311768 }, { "auxiliary_loss_clip": 0.0131283, "auxiliary_loss_mlp": 0.01193356, "balance_loss_clip": 1.00759244, "balance_loss_mlp": 1.00023651, "epoch": 0.7232609871941321, "flos": 30008549753280.0, "grad_norm": 1.8418676640137621, "language_loss": 0.70467514, "learning_rate": 7.508465156919492e-07, "loss": 0.72973704, "num_input_tokens_seen": 129318295, "step": 6015, "time_per_iteration": 3.7345187664031982 }, { "auxiliary_loss_clip": 0.01314479, "auxiliary_loss_mlp": 0.01193291, "balance_loss_clip": 1.00789928, "balance_loss_mlp": 1.00026691, "epoch": 0.7233812300847713, "flos": 16654014325440.0, "grad_norm": 2.454333805105203, "language_loss": 0.61175919, "learning_rate": 7.502382620785083e-07, "loss": 0.63683689, "num_input_tokens_seen": 129334845, "step": 6016, "time_per_iteration": 3.637164831161499 }, { "auxiliary_loss_clip": 0.01258931, "auxiliary_loss_mlp": 0.01192273, "balance_loss_clip": 1.00473309, "balance_loss_mlp": 1.00001264, "epoch": 0.7235014729754103, "flos": 67258817485920.0, "grad_norm": 0.8154570117774167, "language_loss": 0.62537456, "learning_rate": 7.496301980529289e-07, "loss": 0.64988661, "num_input_tokens_seen": 129398055, "step": 6017, "time_per_iteration": 4.360846042633057 }, { "auxiliary_loss_clip": 0.01350886, "auxiliary_loss_mlp": 0.01193306, "balance_loss_clip": 1.00830364, "balance_loss_mlp": 1.00018656, "epoch": 0.7236217158660494, "flos": 26943249918240.0, "grad_norm": 31.00774660116728, "language_loss": 0.74584627, "learning_rate": 7.490223237074547e-07, "loss": 0.77128822, "num_input_tokens_seen": 129417765, "step": 6018, "time_per_iteration": 2.688260316848755 }, { "auxiliary_loss_clip": 0.01302839, "auxiliary_loss_mlp": 0.01193255, "balance_loss_clip": 1.00765741, "balance_loss_mlp": 1.00023127, "epoch": 0.7237419587566886, "flos": 29423384733600.0, "grad_norm": 2.4293842785241626, "language_loss": 0.66280293, "learning_rate": 7.484146391342989e-07, "loss": 0.68776387, "num_input_tokens_seen": 129437560, "step": 6019, "time_per_iteration": 2.8401758670806885 }, { "auxiliary_loss_clip": 0.01317048, "auxiliary_loss_mlp": 0.01193214, "balance_loss_clip": 1.00761938, "balance_loss_mlp": 1.00019062, "epoch": 0.7238622016473276, "flos": 17821506394080.0, "grad_norm": 2.0483818754153464, "language_loss": 0.56581116, "learning_rate": 7.478071444256484e-07, "loss": 0.59091377, "num_input_tokens_seen": 129455320, "step": 6020, "time_per_iteration": 2.698315382003784 }, { "auxiliary_loss_clip": 0.01293815, "auxiliary_loss_mlp": 0.01193483, "balance_loss_clip": 1.00792289, "balance_loss_mlp": 1.0002687, "epoch": 0.7239824445379667, "flos": 25739128905120.0, "grad_norm": 1.623362907816659, "language_loss": 0.78904504, "learning_rate": 7.471998396736579e-07, "loss": 0.81391805, "num_input_tokens_seen": 129475700, "step": 6021, "time_per_iteration": 2.8507943153381348 }, { "auxiliary_loss_clip": 0.01293127, "auxiliary_loss_mlp": 0.01193319, "balance_loss_clip": 1.00751281, "balance_loss_mlp": 1.00019956, "epoch": 0.7241026874286057, "flos": 23148927637920.0, "grad_norm": 1.6190874786074223, "language_loss": 0.76103568, "learning_rate": 7.465927249704549e-07, "loss": 0.78590018, "num_input_tokens_seen": 129493585, "step": 6022, "time_per_iteration": 2.7886152267456055 }, { "auxiliary_loss_clip": 0.01329309, "auxiliary_loss_mlp": 0.0119324, "balance_loss_clip": 1.00741673, "balance_loss_mlp": 1.00021601, "epoch": 0.7242229303192449, "flos": 20266916220000.0, "grad_norm": 1.7019262801859596, "language_loss": 0.77331936, "learning_rate": 7.459858004081398e-07, "loss": 0.79854488, "num_input_tokens_seen": 129511555, "step": 6023, "time_per_iteration": 2.8014729022979736 }, { "auxiliary_loss_clip": 0.01256459, "auxiliary_loss_mlp": 0.01192266, "balance_loss_clip": 1.00518024, "balance_loss_mlp": 1.00000525, "epoch": 0.724343173209884, "flos": 62311695922080.0, "grad_norm": 0.6579753840372754, "language_loss": 0.5802393, "learning_rate": 7.453790660787815e-07, "loss": 0.60472655, "num_input_tokens_seen": 129579650, "step": 6024, "time_per_iteration": 3.4815757274627686 }, { "auxiliary_loss_clip": 0.01305926, "auxiliary_loss_mlp": 0.0119341, "balance_loss_clip": 1.00721407, "balance_loss_mlp": 1.00019538, "epoch": 0.724463416100523, "flos": 35006418267840.0, "grad_norm": 2.1261146797250117, "language_loss": 0.63462961, "learning_rate": 7.447725220744214e-07, "loss": 0.65962303, "num_input_tokens_seen": 129601895, "step": 6025, "time_per_iteration": 2.8306119441986084 }, { "auxiliary_loss_clip": 0.01350456, "auxiliary_loss_mlp": 0.01193324, "balance_loss_clip": 1.00832403, "balance_loss_mlp": 1.0002054, "epoch": 0.7245836589911622, "flos": 21871979820000.0, "grad_norm": 1.8447650230532273, "language_loss": 0.77130914, "learning_rate": 7.441661684870717e-07, "loss": 0.79674685, "num_input_tokens_seen": 129622150, "step": 6026, "time_per_iteration": 2.7462613582611084 }, { "auxiliary_loss_clip": 0.01349823, "auxiliary_loss_mlp": 0.01193294, "balance_loss_clip": 1.00823081, "balance_loss_mlp": 1.00017464, "epoch": 0.7247039018818012, "flos": 23006507159520.0, "grad_norm": 2.4955749517844823, "language_loss": 0.8162818, "learning_rate": 7.435600054087152e-07, "loss": 0.84171295, "num_input_tokens_seen": 129644315, "step": 6027, "time_per_iteration": 2.699505090713501 }, { "auxiliary_loss_clip": 0.01349792, "auxiliary_loss_mlp": 0.01193306, "balance_loss_clip": 1.00808668, "balance_loss_mlp": 1.00018668, "epoch": 0.7248241447724403, "flos": 31722602094720.0, "grad_norm": 2.00780971765015, "language_loss": 0.74389845, "learning_rate": 7.42954032931308e-07, "loss": 0.76932943, "num_input_tokens_seen": 129665355, "step": 6028, "time_per_iteration": 2.8578262329101562 }, { "auxiliary_loss_clip": 0.01309582, "auxiliary_loss_mlp": 0.01193407, "balance_loss_clip": 1.00670862, "balance_loss_mlp": 1.00028777, "epoch": 0.7249443876630794, "flos": 34896998442240.0, "grad_norm": 1.8376970464640372, "language_loss": 0.74708188, "learning_rate": 7.423482511467733e-07, "loss": 0.77211171, "num_input_tokens_seen": 129686125, "step": 6029, "time_per_iteration": 2.878371238708496 }, { "auxiliary_loss_clip": 0.01256171, "auxiliary_loss_mlp": 0.0119343, "balance_loss_clip": 1.00721645, "balance_loss_mlp": 1.00031137, "epoch": 0.7250646305537185, "flos": 26359306303680.0, "grad_norm": 2.348083044455895, "language_loss": 0.64756906, "learning_rate": 7.417426601470099e-07, "loss": 0.67206508, "num_input_tokens_seen": 129706485, "step": 6030, "time_per_iteration": 2.9393398761749268 }, { "auxiliary_loss_clip": 0.01326891, "auxiliary_loss_mlp": 0.01193369, "balance_loss_clip": 1.00786746, "balance_loss_mlp": 1.00024962, "epoch": 0.7251848734443576, "flos": 30081628023840.0, "grad_norm": 4.2229818496121165, "language_loss": 0.78444827, "learning_rate": 7.411372600238841e-07, "loss": 0.80965084, "num_input_tokens_seen": 129727100, "step": 6031, "time_per_iteration": 3.0974414348602295 }, { "auxiliary_loss_clip": 0.01350962, "auxiliary_loss_mlp": 0.01193319, "balance_loss_clip": 1.00844336, "balance_loss_mlp": 1.00019944, "epoch": 0.7253051163349967, "flos": 17785272610080.0, "grad_norm": 1.8709475953831196, "language_loss": 0.74293101, "learning_rate": 7.405320508692346e-07, "loss": 0.76837379, "num_input_tokens_seen": 129745840, "step": 6032, "time_per_iteration": 2.6669585704803467 }, { "auxiliary_loss_clip": 0.01348031, "auxiliary_loss_mlp": 0.0119329, "balance_loss_clip": 1.00788522, "balance_loss_mlp": 1.00017083, "epoch": 0.7254253592256358, "flos": 12641355325440.0, "grad_norm": 1.879194169954887, "language_loss": 0.75533754, "learning_rate": 7.399270327748727e-07, "loss": 0.78075075, "num_input_tokens_seen": 129763500, "step": 6033, "time_per_iteration": 2.6788086891174316 }, { "auxiliary_loss_clip": 0.01301928, "auxiliary_loss_mlp": 0.00872426, "balance_loss_clip": 1.00719106, "balance_loss_mlp": 1.00036585, "epoch": 0.7255456021162748, "flos": 27199221445440.0, "grad_norm": 1.60286084159554, "language_loss": 0.73847318, "learning_rate": 7.39322205832577e-07, "loss": 0.76021671, "num_input_tokens_seen": 129784390, "step": 6034, "time_per_iteration": 2.837493896484375 }, { "auxiliary_loss_clip": 0.01317129, "auxiliary_loss_mlp": 0.01193196, "balance_loss_clip": 1.0079298, "balance_loss_mlp": 1.00017214, "epoch": 0.725665845006914, "flos": 21288215823840.0, "grad_norm": 1.7589630557938862, "language_loss": 0.81043994, "learning_rate": 7.387175701341009e-07, "loss": 0.83554327, "num_input_tokens_seen": 129803060, "step": 6035, "time_per_iteration": 2.803327798843384 }, { "auxiliary_loss_clip": 0.01329017, "auxiliary_loss_mlp": 0.01193187, "balance_loss_clip": 1.0076654, "balance_loss_mlp": 1.00016344, "epoch": 0.7257860878975531, "flos": 16033693232160.0, "grad_norm": 2.206913253755146, "language_loss": 0.72068369, "learning_rate": 7.381131257711659e-07, "loss": 0.7459057, "num_input_tokens_seen": 129820165, "step": 6036, "time_per_iteration": 2.7685322761535645 }, { "auxiliary_loss_clip": 0.01303695, "auxiliary_loss_mlp": 0.01193258, "balance_loss_clip": 1.00731814, "balance_loss_mlp": 1.0002346, "epoch": 0.7259063307881921, "flos": 12129951126240.0, "grad_norm": 1.8635927181792082, "language_loss": 0.83525735, "learning_rate": 7.375088728354677e-07, "loss": 0.86022687, "num_input_tokens_seen": 129835195, "step": 6037, "time_per_iteration": 2.7530739307403564 }, { "auxiliary_loss_clip": 0.01296913, "auxiliary_loss_mlp": 0.01193264, "balance_loss_clip": 1.00688696, "balance_loss_mlp": 1.00024033, "epoch": 0.7260265736788313, "flos": 30443858092800.0, "grad_norm": 1.409217111744938, "language_loss": 0.67695028, "learning_rate": 7.369048114186691e-07, "loss": 0.70185208, "num_input_tokens_seen": 129856240, "step": 6038, "time_per_iteration": 2.848130226135254 }, { "auxiliary_loss_clip": 0.0129688, "auxiliary_loss_mlp": 0.00872345, "balance_loss_clip": 1.00830829, "balance_loss_mlp": 1.00033021, "epoch": 0.7261468165694703, "flos": 21142274824800.0, "grad_norm": 1.7091738563926304, "language_loss": 0.83083665, "learning_rate": 7.363009416124055e-07, "loss": 0.85252893, "num_input_tokens_seen": 129875565, "step": 6039, "time_per_iteration": 3.8407821655273438 }, { "auxiliary_loss_clip": 0.01292682, "auxiliary_loss_mlp": 0.01193227, "balance_loss_clip": 1.00766492, "balance_loss_mlp": 1.00020361, "epoch": 0.7262670594601094, "flos": 22306318220160.0, "grad_norm": 2.16678458614196, "language_loss": 0.62599653, "learning_rate": 7.356972635082852e-07, "loss": 0.6508556, "num_input_tokens_seen": 129894420, "step": 6040, "time_per_iteration": 3.7261219024658203 }, { "auxiliary_loss_clip": 0.01256956, "auxiliary_loss_mlp": 0.01193229, "balance_loss_clip": 1.00664461, "balance_loss_mlp": 1.00020552, "epoch": 0.7263873023507486, "flos": 25335060958080.0, "grad_norm": 1.7006252803532202, "language_loss": 0.75325125, "learning_rate": 7.35093777197884e-07, "loss": 0.77775311, "num_input_tokens_seen": 129914490, "step": 6041, "time_per_iteration": 2.8651647567749023 }, { "auxiliary_loss_clip": 0.01305226, "auxiliary_loss_mlp": 0.01193223, "balance_loss_clip": 1.00762379, "balance_loss_mlp": 1.00019884, "epoch": 0.7265075452413876, "flos": 23878632633120.0, "grad_norm": 2.1458167218907014, "language_loss": 0.85920674, "learning_rate": 7.344904827727525e-07, "loss": 0.88419127, "num_input_tokens_seen": 129931670, "step": 6042, "time_per_iteration": 4.710334777832031 }, { "auxiliary_loss_clip": 0.01308025, "auxiliary_loss_mlp": 0.01193271, "balance_loss_clip": 1.00787044, "balance_loss_mlp": 1.00015152, "epoch": 0.7266277881320267, "flos": 28724560894080.0, "grad_norm": 2.2554669369593308, "language_loss": 0.73292321, "learning_rate": 7.338873803244076e-07, "loss": 0.75793618, "num_input_tokens_seen": 129946905, "step": 6043, "time_per_iteration": 2.832263231277466 }, { "auxiliary_loss_clip": 0.01301426, "auxiliary_loss_mlp": 0.01193411, "balance_loss_clip": 1.00661004, "balance_loss_mlp": 1.0001967, "epoch": 0.7267480310226658, "flos": 24863518834560.0, "grad_norm": 1.6448907555168921, "language_loss": 0.80721492, "learning_rate": 7.332844699443401e-07, "loss": 0.83216321, "num_input_tokens_seen": 129965505, "step": 6044, "time_per_iteration": 2.8288252353668213 }, { "auxiliary_loss_clip": 0.01279407, "auxiliary_loss_mlp": 0.01193248, "balance_loss_clip": 1.00685811, "balance_loss_mlp": 1.00022447, "epoch": 0.7268682739133049, "flos": 27198502971840.0, "grad_norm": 1.7762303607685688, "language_loss": 0.75095689, "learning_rate": 7.326817517240121e-07, "loss": 0.7756834, "num_input_tokens_seen": 129987210, "step": 6045, "time_per_iteration": 2.826650857925415 }, { "auxiliary_loss_clip": 0.01336407, "auxiliary_loss_mlp": 0.00872412, "balance_loss_clip": 1.00770748, "balance_loss_mlp": 1.00026178, "epoch": 0.7269885168039439, "flos": 33508152064800.0, "grad_norm": 1.6714112832572652, "language_loss": 0.83158016, "learning_rate": 7.320792257548545e-07, "loss": 0.85366833, "num_input_tokens_seen": 130008385, "step": 6046, "time_per_iteration": 2.901136636734009 }, { "auxiliary_loss_clip": 0.01317243, "auxiliary_loss_mlp": 0.01193401, "balance_loss_clip": 1.00769091, "balance_loss_mlp": 1.00018692, "epoch": 0.7271087596945831, "flos": 24313761354240.0, "grad_norm": 2.114511849314073, "language_loss": 0.76562375, "learning_rate": 7.314768921282704e-07, "loss": 0.79073018, "num_input_tokens_seen": 130029040, "step": 6047, "time_per_iteration": 2.7279951572418213 }, { "auxiliary_loss_clip": 0.01337562, "auxiliary_loss_mlp": 0.01193213, "balance_loss_clip": 1.00839245, "balance_loss_mlp": 1.00018954, "epoch": 0.7272290025852222, "flos": 23805159202080.0, "grad_norm": 3.1486757756047252, "language_loss": 0.7255857, "learning_rate": 7.30874750935633e-07, "loss": 0.75089347, "num_input_tokens_seen": 130048725, "step": 6048, "time_per_iteration": 2.783381938934326 }, { "auxiliary_loss_clip": 0.01285386, "auxiliary_loss_mlp": 0.01193229, "balance_loss_clip": 1.00690126, "balance_loss_mlp": 1.00020587, "epoch": 0.7273492454758612, "flos": 16720374867840.0, "grad_norm": 1.8912289848156545, "language_loss": 0.79158807, "learning_rate": 7.30272802268286e-07, "loss": 0.8163743, "num_input_tokens_seen": 130065720, "step": 6049, "time_per_iteration": 2.720015287399292 }, { "auxiliary_loss_clip": 0.01249503, "auxiliary_loss_mlp": 0.01193167, "balance_loss_clip": 1.00692439, "balance_loss_mlp": 1.00014377, "epoch": 0.7274694883665004, "flos": 28031341148640.0, "grad_norm": 1.696796200011497, "language_loss": 0.76112092, "learning_rate": 7.29671046217547e-07, "loss": 0.78554767, "num_input_tokens_seen": 130084830, "step": 6050, "time_per_iteration": 2.926737070083618 }, { "auxiliary_loss_clip": 0.0129483, "auxiliary_loss_mlp": 0.0119324, "balance_loss_clip": 1.00733626, "balance_loss_mlp": 1.00021684, "epoch": 0.7275897312571394, "flos": 30372719700960.0, "grad_norm": 1.7837783615609042, "language_loss": 0.81450176, "learning_rate": 7.290694828746988e-07, "loss": 0.83938253, "num_input_tokens_seen": 130104495, "step": 6051, "time_per_iteration": 2.8930201530456543 }, { "auxiliary_loss_clip": 0.01305475, "auxiliary_loss_mlp": 0.01193207, "balance_loss_clip": 1.00767648, "balance_loss_mlp": 1.00018358, "epoch": 0.7277099741477785, "flos": 19204784601120.0, "grad_norm": 1.8099884208671078, "language_loss": 0.86073923, "learning_rate": 7.284681123310004e-07, "loss": 0.88572609, "num_input_tokens_seen": 130123210, "step": 6052, "time_per_iteration": 2.7965831756591797 }, { "auxiliary_loss_clip": 0.01324681, "auxiliary_loss_mlp": 0.01193308, "balance_loss_clip": 1.00781369, "balance_loss_mlp": 1.0001893, "epoch": 0.7278302170384175, "flos": 20667894730560.0, "grad_norm": 1.7173151695283984, "language_loss": 0.7963624, "learning_rate": 7.27866934677678e-07, "loss": 0.82154232, "num_input_tokens_seen": 130142880, "step": 6053, "time_per_iteration": 2.7726542949676514 }, { "auxiliary_loss_clip": 0.01265207, "auxiliary_loss_mlp": 0.01193331, "balance_loss_clip": 1.00669527, "balance_loss_mlp": 1.00021195, "epoch": 0.7279504599290567, "flos": 19093209354720.0, "grad_norm": 1.5790916253068616, "language_loss": 0.78287369, "learning_rate": 7.272659500059297e-07, "loss": 0.80745906, "num_input_tokens_seen": 130160220, "step": 6054, "time_per_iteration": 2.796696662902832 }, { "auxiliary_loss_clip": 0.01338621, "auxiliary_loss_mlp": 0.01193273, "balance_loss_clip": 1.00820553, "balance_loss_mlp": 1.00024915, "epoch": 0.7280707028196958, "flos": 19062184504320.0, "grad_norm": 1.939979288763052, "language_loss": 0.80272812, "learning_rate": 7.266651584069264e-07, "loss": 0.8280471, "num_input_tokens_seen": 130177885, "step": 6055, "time_per_iteration": 2.837484359741211 }, { "auxiliary_loss_clip": 0.01330761, "auxiliary_loss_mlp": 0.01193267, "balance_loss_clip": 1.00779748, "balance_loss_mlp": 1.00024343, "epoch": 0.7281909457103348, "flos": 37196323574400.0, "grad_norm": 1.644095848271937, "language_loss": 0.57018256, "learning_rate": 7.260645599718045e-07, "loss": 0.59542274, "num_input_tokens_seen": 130204240, "step": 6056, "time_per_iteration": 2.8758318424224854 }, { "auxiliary_loss_clip": 0.01314473, "auxiliary_loss_mlp": 0.01193468, "balance_loss_clip": 1.00778556, "balance_loss_mlp": 1.00025344, "epoch": 0.728311188600974, "flos": 20667104409600.0, "grad_norm": 1.9579880476189202, "language_loss": 0.67177302, "learning_rate": 7.254641547916767e-07, "loss": 0.69685245, "num_input_tokens_seen": 130221735, "step": 6057, "time_per_iteration": 2.8592658042907715 }, { "auxiliary_loss_clip": 0.01349815, "auxiliary_loss_mlp": 0.01193376, "balance_loss_clip": 1.00889266, "balance_loss_mlp": 1.00016141, "epoch": 0.728431431491613, "flos": 28840698447840.0, "grad_norm": 1.761134816536471, "language_loss": 0.69230592, "learning_rate": 7.248639429576226e-07, "loss": 0.71773785, "num_input_tokens_seen": 130241190, "step": 6058, "time_per_iteration": 2.777322769165039 }, { "auxiliary_loss_clip": 0.01331001, "auxiliary_loss_mlp": 0.01193209, "balance_loss_clip": 1.00774384, "balance_loss_mlp": 1.00018549, "epoch": 0.7285516743822521, "flos": 25991867301120.0, "grad_norm": 1.578067586391994, "language_loss": 0.71874344, "learning_rate": 7.242639245606959e-07, "loss": 0.74398559, "num_input_tokens_seen": 130260980, "step": 6059, "time_per_iteration": 2.751718759536743 }, { "auxiliary_loss_clip": 0.01325137, "auxiliary_loss_mlp": 0.01193266, "balance_loss_clip": 1.00819182, "balance_loss_mlp": 1.00024247, "epoch": 0.7286719172728913, "flos": 16399731211200.0, "grad_norm": 2.109094320156685, "language_loss": 0.82179409, "learning_rate": 7.236640996919168e-07, "loss": 0.84697807, "num_input_tokens_seen": 130280025, "step": 6060, "time_per_iteration": 2.7579731941223145 }, { "auxiliary_loss_clip": 0.01335923, "auxiliary_loss_mlp": 0.01193228, "balance_loss_clip": 1.00778198, "balance_loss_mlp": 1.00020421, "epoch": 0.7287921601635303, "flos": 22018172284800.0, "grad_norm": 1.5106541515063598, "language_loss": 0.70328569, "learning_rate": 7.230644684422782e-07, "loss": 0.7285772, "num_input_tokens_seen": 130300255, "step": 6061, "time_per_iteration": 2.7242801189422607 }, { "auxiliary_loss_clip": 0.01294227, "auxiliary_loss_mlp": 0.01193387, "balance_loss_clip": 1.00686324, "balance_loss_mlp": 1.00017262, "epoch": 0.7289124030541694, "flos": 24600937350240.0, "grad_norm": 5.376759142688208, "language_loss": 0.81316221, "learning_rate": 7.224650309027451e-07, "loss": 0.83803833, "num_input_tokens_seen": 130320005, "step": 6062, "time_per_iteration": 2.833580493927002 }, { "auxiliary_loss_clip": 0.01332918, "auxiliary_loss_mlp": 0.01193239, "balance_loss_clip": 1.00772905, "balance_loss_mlp": 1.00021529, "epoch": 0.7290326459448085, "flos": 21393648120960.0, "grad_norm": 1.6342245947976255, "language_loss": 0.68489635, "learning_rate": 7.218657871642506e-07, "loss": 0.71015793, "num_input_tokens_seen": 130338810, "step": 6063, "time_per_iteration": 2.7358946800231934 }, { "auxiliary_loss_clip": 0.01350822, "auxiliary_loss_mlp": 0.01193325, "balance_loss_clip": 1.0084691, "balance_loss_mlp": 1.00020599, "epoch": 0.7291528888354476, "flos": 18587696639040.0, "grad_norm": 1.865334020367118, "language_loss": 0.62306023, "learning_rate": 7.212667373177012e-07, "loss": 0.64850163, "num_input_tokens_seen": 130353805, "step": 6064, "time_per_iteration": 2.6849210262298584 }, { "auxiliary_loss_clip": 0.0130499, "auxiliary_loss_mlp": 0.01193218, "balance_loss_clip": 1.00754595, "balance_loss_mlp": 1.00019455, "epoch": 0.7292731317260867, "flos": 18951076265760.0, "grad_norm": 1.8069730368024568, "language_loss": 0.75153953, "learning_rate": 7.206678814539704e-07, "loss": 0.77652162, "num_input_tokens_seen": 130372105, "step": 6065, "time_per_iteration": 3.763866424560547 }, { "auxiliary_loss_clip": 0.01288786, "auxiliary_loss_mlp": 0.01193218, "balance_loss_clip": 1.00702024, "balance_loss_mlp": 1.00019443, "epoch": 0.7293933746167258, "flos": 21067579988640.0, "grad_norm": 1.6048250732714484, "language_loss": 0.72821462, "learning_rate": 7.20069219663904e-07, "loss": 0.75303465, "num_input_tokens_seen": 130391990, "step": 6066, "time_per_iteration": 2.870147466659546 }, { "auxiliary_loss_clip": 0.01337689, "auxiliary_loss_mlp": 0.01193324, "balance_loss_clip": 1.00808811, "balance_loss_mlp": 1.0002048, "epoch": 0.7295136175073649, "flos": 22453336929600.0, "grad_norm": 1.51828810233512, "language_loss": 0.79517972, "learning_rate": 7.1947075203832e-07, "loss": 0.82048988, "num_input_tokens_seen": 130411970, "step": 6067, "time_per_iteration": 3.6524500846862793 }, { "auxiliary_loss_clip": 0.0131936, "auxiliary_loss_mlp": 0.01192279, "balance_loss_clip": 1.00514197, "balance_loss_mlp": 1.000018, "epoch": 0.7296338603980039, "flos": 56125535528160.0, "grad_norm": 0.8650879267422167, "language_loss": 0.60198426, "learning_rate": 7.188724786680049e-07, "loss": 0.62710071, "num_input_tokens_seen": 130472440, "step": 6068, "time_per_iteration": 4.242600679397583 }, { "auxiliary_loss_clip": 0.01313333, "auxiliary_loss_mlp": 0.01193217, "balance_loss_clip": 1.00733519, "balance_loss_mlp": 1.00019372, "epoch": 0.7297541032886431, "flos": 25228299484800.0, "grad_norm": 1.5889031222864787, "language_loss": 0.75694597, "learning_rate": 7.182743996437162e-07, "loss": 0.78201151, "num_input_tokens_seen": 130491975, "step": 6069, "time_per_iteration": 2.774292469024658 }, { "auxiliary_loss_clip": 0.01304018, "auxiliary_loss_mlp": 0.01193248, "balance_loss_clip": 1.00746298, "balance_loss_mlp": 1.00022459, "epoch": 0.7298743461792822, "flos": 26467612495200.0, "grad_norm": 1.79499121455397, "language_loss": 0.68851864, "learning_rate": 7.176765150561819e-07, "loss": 0.71349132, "num_input_tokens_seen": 130510580, "step": 6070, "time_per_iteration": 2.8965847492218018 }, { "auxiliary_loss_clip": 0.0135034, "auxiliary_loss_mlp": 0.01193468, "balance_loss_clip": 1.00809431, "balance_loss_mlp": 1.0002532, "epoch": 0.7299945890699212, "flos": 19569062319840.0, "grad_norm": 1.825601566793094, "language_loss": 0.80270249, "learning_rate": 7.170788249961002e-07, "loss": 0.82814056, "num_input_tokens_seen": 130529090, "step": 6071, "time_per_iteration": 2.674915075302124 }, { "auxiliary_loss_clip": 0.01349219, "auxiliary_loss_mlp": 0.01193179, "balance_loss_clip": 1.00794792, "balance_loss_mlp": 1.00015545, "epoch": 0.7301148319605604, "flos": 22928974352640.0, "grad_norm": 3.0440650770427298, "language_loss": 0.88281417, "learning_rate": 7.164813295541418e-07, "loss": 0.90823817, "num_input_tokens_seen": 130548655, "step": 6072, "time_per_iteration": 2.6771674156188965 }, { "auxiliary_loss_clip": 0.01314101, "auxiliary_loss_mlp": 0.01193308, "balance_loss_clip": 1.00734925, "balance_loss_mlp": 1.00018942, "epoch": 0.7302350748511994, "flos": 25369714100160.0, "grad_norm": 1.646205440898425, "language_loss": 0.70203114, "learning_rate": 7.15884028820944e-07, "loss": 0.72710526, "num_input_tokens_seen": 130567710, "step": 6073, "time_per_iteration": 2.7819628715515137 }, { "auxiliary_loss_clip": 0.01301794, "auxiliary_loss_mlp": 0.01193329, "balance_loss_clip": 1.00755441, "balance_loss_mlp": 1.00020981, "epoch": 0.7303553177418385, "flos": 27819183301920.0, "grad_norm": 1.9930794337807132, "language_loss": 0.60438567, "learning_rate": 7.152869228871185e-07, "loss": 0.62933695, "num_input_tokens_seen": 130590195, "step": 6074, "time_per_iteration": 2.8208653926849365 }, { "auxiliary_loss_clip": 0.01321532, "auxiliary_loss_mlp": 0.01193222, "balance_loss_clip": 1.00801873, "balance_loss_mlp": 1.00019813, "epoch": 0.7304755606324776, "flos": 24426522082080.0, "grad_norm": 1.5920630965178264, "language_loss": 0.72066075, "learning_rate": 7.146900118432457e-07, "loss": 0.74580824, "num_input_tokens_seen": 130609940, "step": 6075, "time_per_iteration": 2.7617874145507812 }, { "auxiliary_loss_clip": 0.01258838, "auxiliary_loss_mlp": 0.01193196, "balance_loss_clip": 1.00712085, "balance_loss_mlp": 1.00017202, "epoch": 0.7305958035231167, "flos": 23840494894080.0, "grad_norm": 1.5910710242124608, "language_loss": 0.85880029, "learning_rate": 7.140932957798753e-07, "loss": 0.88332057, "num_input_tokens_seen": 130628380, "step": 6076, "time_per_iteration": 2.9408583641052246 }, { "auxiliary_loss_clip": 0.0132617, "auxiliary_loss_mlp": 0.01193343, "balance_loss_clip": 1.00823009, "balance_loss_mlp": 1.00022411, "epoch": 0.7307160464137558, "flos": 16726948901280.0, "grad_norm": 1.7229310548033647, "language_loss": 0.71357262, "learning_rate": 7.134967747875309e-07, "loss": 0.73876774, "num_input_tokens_seen": 130646590, "step": 6077, "time_per_iteration": 2.9128568172454834 }, { "auxiliary_loss_clip": 0.01338122, "auxiliary_loss_mlp": 0.0119325, "balance_loss_clip": 1.00809705, "balance_loss_mlp": 1.0002265, "epoch": 0.7308362893043949, "flos": 21798290846880.0, "grad_norm": 1.6826592336955604, "language_loss": 0.8156029, "learning_rate": 7.129004489567014e-07, "loss": 0.84091663, "num_input_tokens_seen": 130664070, "step": 6078, "time_per_iteration": 2.767564535140991 }, { "auxiliary_loss_clip": 0.01294952, "auxiliary_loss_mlp": 0.0119323, "balance_loss_clip": 1.0070585, "balance_loss_mlp": 1.00020647, "epoch": 0.730956532195034, "flos": 10707385622400.0, "grad_norm": 1.954400583259561, "language_loss": 0.77830023, "learning_rate": 7.123043183778512e-07, "loss": 0.80318207, "num_input_tokens_seen": 130681400, "step": 6079, "time_per_iteration": 2.8857860565185547 }, { "auxiliary_loss_clip": 0.01293094, "auxiliary_loss_mlp": 0.01193251, "balance_loss_clip": 1.00706053, "balance_loss_mlp": 1.00022697, "epoch": 0.731076775085673, "flos": 19791997270560.0, "grad_norm": 1.6770147535016462, "language_loss": 0.65088791, "learning_rate": 7.117083831414114e-07, "loss": 0.67575133, "num_input_tokens_seen": 130700675, "step": 6080, "time_per_iteration": 2.764124631881714 }, { "auxiliary_loss_clip": 0.01348757, "auxiliary_loss_mlp": 0.01193322, "balance_loss_clip": 1.00764, "balance_loss_mlp": 1.00020266, "epoch": 0.7311970179763122, "flos": 20447043353280.0, "grad_norm": 2.219432890612911, "language_loss": 0.69520026, "learning_rate": 7.11112643337787e-07, "loss": 0.72062099, "num_input_tokens_seen": 130719720, "step": 6081, "time_per_iteration": 2.683180570602417 }, { "auxiliary_loss_clip": 0.01307381, "auxiliary_loss_mlp": 0.01193318, "balance_loss_clip": 1.00800276, "balance_loss_mlp": 1.00019932, "epoch": 0.7313172608669513, "flos": 18513828047520.0, "grad_norm": 2.508665129614871, "language_loss": 0.76691329, "learning_rate": 7.10517099057349e-07, "loss": 0.7919203, "num_input_tokens_seen": 130736670, "step": 6082, "time_per_iteration": 2.7551121711730957 }, { "auxiliary_loss_clip": 0.01312307, "auxiliary_loss_mlp": 0.01193399, "balance_loss_clip": 1.00803268, "balance_loss_mlp": 1.00018489, "epoch": 0.7314375037575903, "flos": 16180747865280.0, "grad_norm": 2.1303511299133384, "language_loss": 0.6092546, "learning_rate": 7.099217503904411e-07, "loss": 0.63431168, "num_input_tokens_seen": 130754525, "step": 6083, "time_per_iteration": 2.7474353313446045 }, { "auxiliary_loss_clip": 0.01314428, "auxiliary_loss_mlp": 0.01193272, "balance_loss_clip": 1.00695014, "balance_loss_mlp": 1.00015283, "epoch": 0.7315577466482295, "flos": 17967950324640.0, "grad_norm": 1.7840662289961335, "language_loss": 0.89749748, "learning_rate": 7.093265974273788e-07, "loss": 0.92257452, "num_input_tokens_seen": 130772420, "step": 6084, "time_per_iteration": 2.773421049118042 }, { "auxiliary_loss_clip": 0.01337681, "auxiliary_loss_mlp": 0.01193271, "balance_loss_clip": 1.00770545, "balance_loss_mlp": 1.00024736, "epoch": 0.7316779895388685, "flos": 18405450008640.0, "grad_norm": 1.6630284504335529, "language_loss": 0.71646237, "learning_rate": 7.087316402584447e-07, "loss": 0.74177188, "num_input_tokens_seen": 130791245, "step": 6085, "time_per_iteration": 2.711932897567749 }, { "auxiliary_loss_clip": 0.01349898, "auxiliary_loss_mlp": 0.01193203, "balance_loss_clip": 1.00805318, "balance_loss_mlp": 1.00017905, "epoch": 0.7317982324295076, "flos": 17928303791040.0, "grad_norm": 2.049012588742192, "language_loss": 0.86477208, "learning_rate": 7.081368789738953e-07, "loss": 0.89020306, "num_input_tokens_seen": 130808445, "step": 6086, "time_per_iteration": 2.764981269836426 }, { "auxiliary_loss_clip": 0.01325062, "auxiliary_loss_mlp": 0.011933, "balance_loss_clip": 1.00795746, "balance_loss_mlp": 1.00018108, "epoch": 0.7319184753201466, "flos": 27229851135360.0, "grad_norm": 1.8404094156343505, "language_loss": 0.77494901, "learning_rate": 7.075423136639537e-07, "loss": 0.80013263, "num_input_tokens_seen": 130827700, "step": 6087, "time_per_iteration": 2.8256068229675293 }, { "auxiliary_loss_clip": 0.01297815, "auxiliary_loss_mlp": 0.01193434, "balance_loss_clip": 1.00724542, "balance_loss_mlp": 1.00021935, "epoch": 0.7320387182107858, "flos": 37448559038880.0, "grad_norm": 1.718616054285741, "language_loss": 0.74556172, "learning_rate": 7.069479444188149e-07, "loss": 0.77047414, "num_input_tokens_seen": 130848290, "step": 6088, "time_per_iteration": 2.9145705699920654 }, { "auxiliary_loss_clip": 0.01314916, "auxiliary_loss_mlp": 0.01193136, "balance_loss_clip": 1.00770903, "balance_loss_mlp": 1.0001123, "epoch": 0.7321589611014249, "flos": 17859033430560.0, "grad_norm": 1.828352270259812, "language_loss": 0.8204838, "learning_rate": 7.063537713286453e-07, "loss": 0.84556437, "num_input_tokens_seen": 130865970, "step": 6089, "time_per_iteration": 2.856415271759033 }, { "auxiliary_loss_clip": 0.01318054, "auxiliary_loss_mlp": 0.01193398, "balance_loss_clip": 1.00769663, "balance_loss_mlp": 1.00027871, "epoch": 0.7322792039920639, "flos": 26100604576800.0, "grad_norm": 1.8078071723323224, "language_loss": 0.80630851, "learning_rate": 7.057597944835803e-07, "loss": 0.83142304, "num_input_tokens_seen": 130885245, "step": 6090, "time_per_iteration": 2.8577544689178467 }, { "auxiliary_loss_clip": 0.01307339, "auxiliary_loss_mlp": 0.01193233, "balance_loss_clip": 1.00728703, "balance_loss_mlp": 1.00020933, "epoch": 0.7323994468827031, "flos": 25369103397600.0, "grad_norm": 1.7311973337811621, "language_loss": 0.74676728, "learning_rate": 7.051660139737253e-07, "loss": 0.77177298, "num_input_tokens_seen": 130903465, "step": 6091, "time_per_iteration": 2.849263906478882 }, { "auxiliary_loss_clip": 0.01324179, "auxiliary_loss_mlp": 0.00872535, "balance_loss_clip": 1.00786769, "balance_loss_mlp": 1.00028777, "epoch": 0.7325196897733421, "flos": 26907087981600.0, "grad_norm": 2.296709207337074, "language_loss": 0.76449698, "learning_rate": 7.045724298891565e-07, "loss": 0.78646415, "num_input_tokens_seen": 130922935, "step": 6092, "time_per_iteration": 3.8370893001556396 }, { "auxiliary_loss_clip": 0.01325914, "auxiliary_loss_mlp": 0.01193361, "balance_loss_clip": 1.00743079, "balance_loss_mlp": 1.00024152, "epoch": 0.7326399326639812, "flos": 25775793773280.0, "grad_norm": 1.835765229486542, "language_loss": 0.69474393, "learning_rate": 7.039790423199192e-07, "loss": 0.71993673, "num_input_tokens_seen": 130942575, "step": 6093, "time_per_iteration": 3.667588710784912 }, { "auxiliary_loss_clip": 0.01314371, "auxiliary_loss_mlp": 0.01193547, "balance_loss_clip": 1.00796461, "balance_loss_mlp": 1.00023723, "epoch": 0.7327601755546204, "flos": 21032280220320.0, "grad_norm": 2.0530814924535967, "language_loss": 0.77648479, "learning_rate": 7.033858513560322e-07, "loss": 0.80156398, "num_input_tokens_seen": 130958870, "step": 6094, "time_per_iteration": 3.8084254264831543 }, { "auxiliary_loss_clip": 0.01331389, "auxiliary_loss_mlp": 0.01193173, "balance_loss_clip": 1.00807488, "balance_loss_mlp": 1.00014949, "epoch": 0.7328804184452594, "flos": 16289233675200.0, "grad_norm": 2.0235799614278505, "language_loss": 0.76257879, "learning_rate": 7.027928570874794e-07, "loss": 0.78782439, "num_input_tokens_seen": 130977060, "step": 6095, "time_per_iteration": 3.759916305541992 }, { "auxiliary_loss_clip": 0.01349141, "auxiliary_loss_mlp": 0.01193346, "balance_loss_clip": 1.00768113, "balance_loss_mlp": 1.00022686, "epoch": 0.7330006613358985, "flos": 17858243109600.0, "grad_norm": 2.156406871856175, "language_loss": 0.85476005, "learning_rate": 7.022000596042194e-07, "loss": 0.88018489, "num_input_tokens_seen": 130994160, "step": 6096, "time_per_iteration": 2.653644561767578 }, { "auxiliary_loss_clip": 0.01303825, "auxiliary_loss_mlp": 0.01193239, "balance_loss_clip": 1.00722456, "balance_loss_mlp": 1.00021529, "epoch": 0.7331209042265376, "flos": 22492085371200.0, "grad_norm": 1.9927053655238773, "language_loss": 0.81443048, "learning_rate": 7.016074589961784e-07, "loss": 0.83940107, "num_input_tokens_seen": 131012725, "step": 6097, "time_per_iteration": 2.86995530128479 }, { "auxiliary_loss_clip": 0.01312846, "auxiliary_loss_mlp": 0.0119324, "balance_loss_clip": 1.00836337, "balance_loss_mlp": 1.00021672, "epoch": 0.7332411471171767, "flos": 33072771877920.0, "grad_norm": 1.6863027721115775, "language_loss": 0.67343211, "learning_rate": 7.01015055353253e-07, "loss": 0.69849294, "num_input_tokens_seen": 131035150, "step": 6098, "time_per_iteration": 2.855273962020874 }, { "auxiliary_loss_clip": 0.01279425, "auxiliary_loss_mlp": 0.0119334, "balance_loss_clip": 1.00813282, "balance_loss_mlp": 1.00022066, "epoch": 0.7333613900078157, "flos": 22743027583200.0, "grad_norm": 1.8477618774168014, "language_loss": 0.77769488, "learning_rate": 7.004228487653123e-07, "loss": 0.80242252, "num_input_tokens_seen": 131055955, "step": 6099, "time_per_iteration": 2.9267215728759766 }, { "auxiliary_loss_clip": 0.01313623, "auxiliary_loss_mlp": 0.01193192, "balance_loss_clip": 1.00777721, "balance_loss_mlp": 1.00016844, "epoch": 0.7334816328984549, "flos": 22346144372160.0, "grad_norm": 1.6970230276906781, "language_loss": 0.78389019, "learning_rate": 6.998308393221906e-07, "loss": 0.80895835, "num_input_tokens_seen": 131074360, "step": 6100, "time_per_iteration": 2.810997724533081 }, { "auxiliary_loss_clip": 0.01295388, "auxiliary_loss_mlp": 0.01193307, "balance_loss_clip": 1.00726473, "balance_loss_mlp": 1.00018799, "epoch": 0.733601875789094, "flos": 20736159228000.0, "grad_norm": 2.2632267965718875, "language_loss": 0.71199954, "learning_rate": 6.992390271136977e-07, "loss": 0.73688644, "num_input_tokens_seen": 131090070, "step": 6101, "time_per_iteration": 2.782087802886963 }, { "auxiliary_loss_clip": 0.01336743, "auxiliary_loss_mlp": 0.01193269, "balance_loss_clip": 1.00791907, "balance_loss_mlp": 1.00024486, "epoch": 0.733722118679733, "flos": 22564373320800.0, "grad_norm": 1.6695825722640638, "language_loss": 0.85522449, "learning_rate": 6.986474122296094e-07, "loss": 0.88052464, "num_input_tokens_seen": 131109185, "step": 6102, "time_per_iteration": 2.795443058013916 }, { "auxiliary_loss_clip": 0.01350747, "auxiliary_loss_mlp": 0.01193436, "balance_loss_clip": 1.00853896, "balance_loss_mlp": 1.00022173, "epoch": 0.7338423615703722, "flos": 20084202581760.0, "grad_norm": 1.7667291333840316, "language_loss": 0.72545964, "learning_rate": 6.980559947596751e-07, "loss": 0.75090146, "num_input_tokens_seen": 131127725, "step": 6103, "time_per_iteration": 2.678783893585205 }, { "auxiliary_loss_clip": 0.01279927, "auxiliary_loss_mlp": 0.01193374, "balance_loss_clip": 1.00716877, "balance_loss_mlp": 1.00025523, "epoch": 0.7339626044610112, "flos": 21687685539840.0, "grad_norm": 1.9106787642210628, "language_loss": 0.75360239, "learning_rate": 6.974647747936109e-07, "loss": 0.77833539, "num_input_tokens_seen": 131146110, "step": 6104, "time_per_iteration": 2.9032366275787354 }, { "auxiliary_loss_clip": 0.01350014, "auxiliary_loss_mlp": 0.00872447, "balance_loss_clip": 1.00827599, "balance_loss_mlp": 1.00022101, "epoch": 0.7340828473516503, "flos": 15268257384480.0, "grad_norm": 1.7821629567055584, "language_loss": 0.82291341, "learning_rate": 6.968737524211039e-07, "loss": 0.84513801, "num_input_tokens_seen": 131162920, "step": 6105, "time_per_iteration": 2.6579699516296387 }, { "auxiliary_loss_clip": 0.0132504, "auxiliary_loss_mlp": 0.01193402, "balance_loss_clip": 1.00811815, "balance_loss_mlp": 1.00018787, "epoch": 0.7342030902422895, "flos": 22930123910400.0, "grad_norm": 3.492800472053129, "language_loss": 0.80390316, "learning_rate": 6.962829277318132e-07, "loss": 0.82908762, "num_input_tokens_seen": 131182515, "step": 6106, "time_per_iteration": 2.778010606765747 }, { "auxiliary_loss_clip": 0.01330118, "auxiliary_loss_mlp": 0.01193298, "balance_loss_clip": 1.00782001, "balance_loss_mlp": 1.00017905, "epoch": 0.7343233331329285, "flos": 25847902104480.0, "grad_norm": 1.7709403050269719, "language_loss": 0.83261001, "learning_rate": 6.956923008153652e-07, "loss": 0.85784423, "num_input_tokens_seen": 131202280, "step": 6107, "time_per_iteration": 2.806537389755249 }, { "auxiliary_loss_clip": 0.01337564, "auxiliary_loss_mlp": 0.01193217, "balance_loss_clip": 1.00775886, "balance_loss_mlp": 1.00019312, "epoch": 0.7344435760235676, "flos": 18478995287040.0, "grad_norm": 1.9421037374889891, "language_loss": 0.84257823, "learning_rate": 6.951018717613593e-07, "loss": 0.86788607, "num_input_tokens_seen": 131221295, "step": 6108, "time_per_iteration": 2.7769880294799805 }, { "auxiliary_loss_clip": 0.01326093, "auxiliary_loss_mlp": 0.01193288, "balance_loss_clip": 1.00735998, "balance_loss_mlp": 1.00016928, "epoch": 0.7345638189142067, "flos": 17640050084640.0, "grad_norm": 1.759820089926988, "language_loss": 0.78711426, "learning_rate": 6.945116406593614e-07, "loss": 0.81230807, "num_input_tokens_seen": 131240150, "step": 6109, "time_per_iteration": 2.7697598934173584 }, { "auxiliary_loss_clip": 0.01269991, "auxiliary_loss_mlp": 0.01193249, "balance_loss_clip": 1.00673556, "balance_loss_mlp": 1.00022554, "epoch": 0.7346840618048458, "flos": 20260234415520.0, "grad_norm": 2.4747196300021885, "language_loss": 0.74546289, "learning_rate": 6.939216075989089e-07, "loss": 0.77009523, "num_input_tokens_seen": 131258080, "step": 6110, "time_per_iteration": 2.9813473224639893 }, { "auxiliary_loss_clip": 0.01312509, "auxiliary_loss_mlp": 0.01193214, "balance_loss_clip": 1.00731814, "balance_loss_mlp": 1.00019026, "epoch": 0.7348043046954849, "flos": 29023196544000.0, "grad_norm": 1.628673273457111, "language_loss": 0.66046375, "learning_rate": 6.933317726695109e-07, "loss": 0.68552101, "num_input_tokens_seen": 131279310, "step": 6111, "time_per_iteration": 2.896519184112549 }, { "auxiliary_loss_clip": 0.01282421, "auxiliary_loss_mlp": 0.0119335, "balance_loss_clip": 1.00741601, "balance_loss_mlp": 1.0002315, "epoch": 0.734924547586124, "flos": 17931213609120.0, "grad_norm": 2.4644484085270255, "language_loss": 0.79670167, "learning_rate": 6.92742135960644e-07, "loss": 0.82145941, "num_input_tokens_seen": 131297010, "step": 6112, "time_per_iteration": 2.880469799041748 }, { "auxiliary_loss_clip": 0.01304114, "auxiliary_loss_mlp": 0.01192311, "balance_loss_clip": 1.0049907, "balance_loss_mlp": 1.00005054, "epoch": 0.7350447904767631, "flos": 63588356350560.0, "grad_norm": 0.8175461386205886, "language_loss": 0.55776942, "learning_rate": 6.921526975617556e-07, "loss": 0.58273369, "num_input_tokens_seen": 131356470, "step": 6113, "time_per_iteration": 3.407219171524048 }, { "auxiliary_loss_clip": 0.01311131, "auxiliary_loss_mlp": 0.01193224, "balance_loss_clip": 1.00837088, "balance_loss_mlp": 1.00020063, "epoch": 0.7351650333674021, "flos": 21580025974560.0, "grad_norm": 1.6627858018628405, "language_loss": 0.75452882, "learning_rate": 6.915634575622631e-07, "loss": 0.77957237, "num_input_tokens_seen": 131374985, "step": 6114, "time_per_iteration": 2.8657021522521973 }, { "auxiliary_loss_clip": 0.01349341, "auxiliary_loss_mlp": 0.0119335, "balance_loss_clip": 1.0078299, "balance_loss_mlp": 1.00023079, "epoch": 0.7352852762580413, "flos": 18186358891680.0, "grad_norm": 1.8901525893561097, "language_loss": 0.70638937, "learning_rate": 6.909744160515532e-07, "loss": 0.73181629, "num_input_tokens_seen": 131393125, "step": 6115, "time_per_iteration": 2.8354568481445312 }, { "auxiliary_loss_clip": 0.01305103, "auxiliary_loss_mlp": 0.01193203, "balance_loss_clip": 1.00714386, "balance_loss_mlp": 1.00017905, "epoch": 0.7354055191486804, "flos": 38910088526400.0, "grad_norm": 1.7132270065003183, "language_loss": 0.69418818, "learning_rate": 6.903855731189849e-07, "loss": 0.71917123, "num_input_tokens_seen": 131415760, "step": 6116, "time_per_iteration": 3.0352392196655273 }, { "auxiliary_loss_clip": 0.01315431, "auxiliary_loss_mlp": 0.0119343, "balance_loss_clip": 1.00719512, "balance_loss_mlp": 1.00031114, "epoch": 0.7355257620393194, "flos": 16289988072480.0, "grad_norm": 2.2776384597687755, "language_loss": 0.81915832, "learning_rate": 6.897969288538825e-07, "loss": 0.84424698, "num_input_tokens_seen": 131433705, "step": 6117, "time_per_iteration": 2.8293471336364746 }, { "auxiliary_loss_clip": 0.01302985, "auxiliary_loss_mlp": 0.01193194, "balance_loss_clip": 1.00649762, "balance_loss_mlp": 1.00017071, "epoch": 0.7356460049299585, "flos": 18114250560480.0, "grad_norm": 1.5845407744987696, "language_loss": 0.81238061, "learning_rate": 6.892084833455452e-07, "loss": 0.83734244, "num_input_tokens_seen": 131453275, "step": 6118, "time_per_iteration": 3.7101848125457764 }, { "auxiliary_loss_clip": 0.01325199, "auxiliary_loss_mlp": 0.01193245, "balance_loss_clip": 1.00731778, "balance_loss_mlp": 1.00022161, "epoch": 0.7357662478205976, "flos": 21325204005120.0, "grad_norm": 1.478087434073827, "language_loss": 0.83916897, "learning_rate": 6.886202366832384e-07, "loss": 0.86435342, "num_input_tokens_seen": 131474960, "step": 6119, "time_per_iteration": 2.784241199493408 }, { "auxiliary_loss_clip": 0.01266534, "auxiliary_loss_mlp": 0.01193244, "balance_loss_clip": 1.00661302, "balance_loss_mlp": 1.00022054, "epoch": 0.7358864907112367, "flos": 14246850009600.0, "grad_norm": 1.6899246876565184, "language_loss": 0.73596585, "learning_rate": 6.880321889561987e-07, "loss": 0.76056361, "num_input_tokens_seen": 131492935, "step": 6120, "time_per_iteration": 6.235054016113281 }, { "auxiliary_loss_clip": 0.01289871, "auxiliary_loss_mlp": 0.01193441, "balance_loss_clip": 1.00708747, "balance_loss_mlp": 1.00022602, "epoch": 0.7360067336018757, "flos": 22309695046080.0, "grad_norm": 3.6813463733459035, "language_loss": 0.65259659, "learning_rate": 6.874443402536338e-07, "loss": 0.67742974, "num_input_tokens_seen": 131512025, "step": 6121, "time_per_iteration": 2.974228858947754 }, { "auxiliary_loss_clip": 0.01314962, "auxiliary_loss_mlp": 0.01193197, "balance_loss_clip": 1.00756311, "balance_loss_mlp": 1.00017297, "epoch": 0.7361269764925149, "flos": 25554619082880.0, "grad_norm": 1.6442647703933477, "language_loss": 0.80247563, "learning_rate": 6.868566906647177e-07, "loss": 0.82755721, "num_input_tokens_seen": 131532975, "step": 6122, "time_per_iteration": 2.8380661010742188 }, { "auxiliary_loss_clip": 0.01339002, "auxiliary_loss_mlp": 0.01193456, "balance_loss_clip": 1.00855565, "balance_loss_mlp": 1.00024211, "epoch": 0.736247219383154, "flos": 20376515664000.0, "grad_norm": 1.6434730642979654, "language_loss": 0.83452737, "learning_rate": 6.862692402785984e-07, "loss": 0.85985196, "num_input_tokens_seen": 131553225, "step": 6123, "time_per_iteration": 2.821237564086914 }, { "auxiliary_loss_clip": 0.01267728, "auxiliary_loss_mlp": 0.01192297, "balance_loss_clip": 1.00734973, "balance_loss_mlp": 1.00003672, "epoch": 0.736367462273793, "flos": 70339564503360.0, "grad_norm": 0.6802651685258772, "language_loss": 0.49639434, "learning_rate": 6.856819891843899e-07, "loss": 0.5209946, "num_input_tokens_seen": 131617930, "step": 6124, "time_per_iteration": 3.4775161743164062 }, { "auxiliary_loss_clip": 0.01248827, "auxiliary_loss_mlp": 0.01193206, "balance_loss_clip": 1.00603986, "balance_loss_mlp": 1.00018251, "epoch": 0.7364877051644322, "flos": 22412720456640.0, "grad_norm": 2.1618867709432297, "language_loss": 0.72072268, "learning_rate": 6.8509493747118e-07, "loss": 0.74514306, "num_input_tokens_seen": 131636740, "step": 6125, "time_per_iteration": 2.8975670337677 }, { "auxiliary_loss_clip": 0.0134968, "auxiliary_loss_mlp": 0.01193271, "balance_loss_clip": 1.00837994, "balance_loss_mlp": 1.00024748, "epoch": 0.7366079480550712, "flos": 12130274439360.0, "grad_norm": 2.131979260124195, "language_loss": 0.87920463, "learning_rate": 6.845080852280221e-07, "loss": 0.90463412, "num_input_tokens_seen": 131653810, "step": 6126, "time_per_iteration": 2.7217321395874023 }, { "auxiliary_loss_clip": 0.01295407, "auxiliary_loss_mlp": 0.01193158, "balance_loss_clip": 1.00721693, "balance_loss_mlp": 1.00013423, "epoch": 0.7367281909457103, "flos": 15049345885920.0, "grad_norm": 1.6954415253935633, "language_loss": 0.74622142, "learning_rate": 6.839214325439409e-07, "loss": 0.77110708, "num_input_tokens_seen": 131671505, "step": 6127, "time_per_iteration": 2.9102859497070312 }, { "auxiliary_loss_clip": 0.0130108, "auxiliary_loss_mlp": 0.01193314, "balance_loss_clip": 1.0072403, "balance_loss_mlp": 1.00019503, "epoch": 0.7368484338363495, "flos": 23510762546400.0, "grad_norm": 1.699464909840352, "language_loss": 0.71424598, "learning_rate": 6.833349795079327e-07, "loss": 0.73918992, "num_input_tokens_seen": 131690615, "step": 6128, "time_per_iteration": 2.812499523162842 }, { "auxiliary_loss_clip": 0.01281938, "auxiliary_loss_mlp": 0.01193222, "balance_loss_clip": 1.00689936, "balance_loss_mlp": 1.00019872, "epoch": 0.7369686767269885, "flos": 27417845554560.0, "grad_norm": 1.5497575718772971, "language_loss": 0.68591535, "learning_rate": 6.827487262089613e-07, "loss": 0.71066695, "num_input_tokens_seen": 131711120, "step": 6129, "time_per_iteration": 2.9259328842163086 }, { "auxiliary_loss_clip": 0.01280203, "auxiliary_loss_mlp": 0.01192268, "balance_loss_clip": 1.00557709, "balance_loss_mlp": 1.00000751, "epoch": 0.7370889196176276, "flos": 70293379860000.0, "grad_norm": 0.9076642945535529, "language_loss": 0.56821162, "learning_rate": 6.821626727359606e-07, "loss": 0.59293628, "num_input_tokens_seen": 131776680, "step": 6130, "time_per_iteration": 3.428119421005249 }, { "auxiliary_loss_clip": 0.0130372, "auxiliary_loss_mlp": 0.01193251, "balance_loss_clip": 1.00833893, "balance_loss_mlp": 1.00022721, "epoch": 0.7372091625082667, "flos": 18040848976800.0, "grad_norm": 2.368007477253874, "language_loss": 0.76596624, "learning_rate": 6.815768191778348e-07, "loss": 0.79093593, "num_input_tokens_seen": 131794760, "step": 6131, "time_per_iteration": 2.865710973739624 }, { "auxiliary_loss_clip": 0.01337555, "auxiliary_loss_mlp": 0.01193237, "balance_loss_clip": 1.00809932, "balance_loss_mlp": 1.00021362, "epoch": 0.7373294053989058, "flos": 33726345089760.0, "grad_norm": 1.6238937175504689, "language_loss": 0.7309379, "learning_rate": 6.809911656234569e-07, "loss": 0.75624579, "num_input_tokens_seen": 131816735, "step": 6132, "time_per_iteration": 2.921211004257202 }, { "auxiliary_loss_clip": 0.01306934, "auxiliary_loss_mlp": 0.01193188, "balance_loss_clip": 1.00724638, "balance_loss_mlp": 1.00016439, "epoch": 0.7374496482895448, "flos": 21506337001440.0, "grad_norm": 2.157160316664235, "language_loss": 0.78199846, "learning_rate": 6.804057121616707e-07, "loss": 0.80699968, "num_input_tokens_seen": 131834940, "step": 6133, "time_per_iteration": 2.8282313346862793 }, { "auxiliary_loss_clip": 0.01330271, "auxiliary_loss_mlp": 0.01193335, "balance_loss_clip": 1.00772977, "balance_loss_mlp": 1.00021625, "epoch": 0.737569891180184, "flos": 24936920418240.0, "grad_norm": 2.2129351900179794, "language_loss": 0.72302353, "learning_rate": 6.798204588812888e-07, "loss": 0.74825954, "num_input_tokens_seen": 131854355, "step": 6134, "time_per_iteration": 2.7727861404418945 }, { "auxiliary_loss_clip": 0.01256624, "auxiliary_loss_mlp": 0.00872437, "balance_loss_clip": 1.00648665, "balance_loss_mlp": 1.00021577, "epoch": 0.7376901340708231, "flos": 20664553828320.0, "grad_norm": 1.7621229970589016, "language_loss": 0.75226909, "learning_rate": 6.792354058710937e-07, "loss": 0.77355969, "num_input_tokens_seen": 131871825, "step": 6135, "time_per_iteration": 2.9610540866851807 }, { "auxiliary_loss_clip": 0.01347177, "auxiliary_loss_mlp": 0.01193212, "balance_loss_clip": 1.00786507, "balance_loss_mlp": 1.00018859, "epoch": 0.7378103769614621, "flos": 23805805828320.0, "grad_norm": 1.8713035510242826, "language_loss": 0.65260071, "learning_rate": 6.786505532198374e-07, "loss": 0.67800462, "num_input_tokens_seen": 131890770, "step": 6136, "time_per_iteration": 2.848841428756714 }, { "auxiliary_loss_clip": 0.01349824, "auxiliary_loss_mlp": 0.01193328, "balance_loss_clip": 1.00775182, "balance_loss_mlp": 1.00020874, "epoch": 0.7379306198521013, "flos": 22237227478080.0, "grad_norm": 1.7498218682863753, "language_loss": 0.85003096, "learning_rate": 6.780659010162411e-07, "loss": 0.87546253, "num_input_tokens_seen": 131909720, "step": 6137, "time_per_iteration": 2.907586097717285 }, { "auxiliary_loss_clip": 0.01297214, "auxiliary_loss_mlp": 0.01193212, "balance_loss_clip": 1.00713754, "balance_loss_mlp": 1.00018859, "epoch": 0.7380508627427403, "flos": 14903117497440.0, "grad_norm": 1.5884498878808455, "language_loss": 0.83201289, "learning_rate": 6.774814493489975e-07, "loss": 0.85691714, "num_input_tokens_seen": 131927395, "step": 6138, "time_per_iteration": 2.97683048248291 }, { "auxiliary_loss_clip": 0.01324925, "auxiliary_loss_mlp": 0.01193207, "balance_loss_clip": 1.00721931, "balance_loss_mlp": 1.0001837, "epoch": 0.7381711056333794, "flos": 21685853432160.0, "grad_norm": 1.8573514854992021, "language_loss": 0.66024017, "learning_rate": 6.768971983067655e-07, "loss": 0.68542147, "num_input_tokens_seen": 131947725, "step": 6139, "time_per_iteration": 2.87638783454895 }, { "auxiliary_loss_clip": 0.01319461, "auxiliary_loss_mlp": 0.01192285, "balance_loss_clip": 1.0051862, "balance_loss_mlp": 1.0000248, "epoch": 0.7382913485240186, "flos": 52404291518400.0, "grad_norm": 1.006022199276851, "language_loss": 0.67893678, "learning_rate": 6.763131479781772e-07, "loss": 0.70405418, "num_input_tokens_seen": 131997485, "step": 6140, "time_per_iteration": 3.1337461471557617 }, { "auxiliary_loss_clip": 0.01296688, "auxiliary_loss_mlp": 0.01193218, "balance_loss_clip": 1.00706649, "balance_loss_mlp": 1.00019455, "epoch": 0.7384115914146576, "flos": 21798829702080.0, "grad_norm": 1.749661898483741, "language_loss": 0.76012892, "learning_rate": 6.757292984518316e-07, "loss": 0.78502798, "num_input_tokens_seen": 132016885, "step": 6141, "time_per_iteration": 2.8904552459716797 }, { "auxiliary_loss_clip": 0.01303705, "auxiliary_loss_mlp": 0.01192271, "balance_loss_clip": 1.00509357, "balance_loss_mlp": 1.00001049, "epoch": 0.7385318343052967, "flos": 61494363565920.0, "grad_norm": 0.7449170567268558, "language_loss": 0.56428719, "learning_rate": 6.751456498162981e-07, "loss": 0.58924699, "num_input_tokens_seen": 132075920, "step": 6142, "time_per_iteration": 3.2741944789886475 }, { "auxiliary_loss_clip": 0.01336968, "auxiliary_loss_mlp": 0.01193277, "balance_loss_clip": 1.0078907, "balance_loss_mlp": 1.00015771, "epoch": 0.7386520771959358, "flos": 17013765660480.0, "grad_norm": 2.250449773662379, "language_loss": 0.85478717, "learning_rate": 6.745622021601174e-07, "loss": 0.88008964, "num_input_tokens_seen": 132092945, "step": 6143, "time_per_iteration": 2.818969964981079 }, { "auxiliary_loss_clip": 0.01293834, "auxiliary_loss_mlp": 0.01193225, "balance_loss_clip": 1.00674605, "balance_loss_mlp": 1.00020099, "epoch": 0.7387723200865749, "flos": 18770769514080.0, "grad_norm": 1.8569410079954511, "language_loss": 0.69554222, "learning_rate": 6.739789555717954e-07, "loss": 0.72041285, "num_input_tokens_seen": 132109920, "step": 6144, "time_per_iteration": 3.83573579788208 }, { "auxiliary_loss_clip": 0.01348972, "auxiliary_loss_mlp": 0.01193193, "balance_loss_clip": 1.00802636, "balance_loss_mlp": 1.00016975, "epoch": 0.738892562977214, "flos": 22525553031840.0, "grad_norm": 2.060033713958515, "language_loss": 0.76966619, "learning_rate": 6.733959101398124e-07, "loss": 0.79508781, "num_input_tokens_seen": 132128050, "step": 6145, "time_per_iteration": 3.699765682220459 }, { "auxiliary_loss_clip": 0.01324046, "auxiliary_loss_mlp": 0.01193197, "balance_loss_clip": 1.00782204, "balance_loss_mlp": 1.00017285, "epoch": 0.7390128058678531, "flos": 21501487304640.0, "grad_norm": 1.6080350504513568, "language_loss": 0.81703603, "learning_rate": 6.728130659526143e-07, "loss": 0.8422085, "num_input_tokens_seen": 132145860, "step": 6146, "time_per_iteration": 4.311053276062012 }, { "auxiliary_loss_clip": 0.01311545, "auxiliary_loss_mlp": 0.01193275, "balance_loss_clip": 1.00742602, "balance_loss_mlp": 1.00025129, "epoch": 0.7391330487584922, "flos": 25776188933760.0, "grad_norm": 2.8458607845575483, "language_loss": 0.70669341, "learning_rate": 6.7223042309862e-07, "loss": 0.73174155, "num_input_tokens_seen": 132166060, "step": 6147, "time_per_iteration": 2.886096954345703 }, { "auxiliary_loss_clip": 0.01337459, "auxiliary_loss_mlp": 0.01193196, "balance_loss_clip": 1.00809646, "balance_loss_mlp": 1.00017202, "epoch": 0.7392532916491312, "flos": 28366749437760.0, "grad_norm": 2.1310205695151843, "language_loss": 0.7396121, "learning_rate": 6.716479816662144e-07, "loss": 0.76491863, "num_input_tokens_seen": 132187790, "step": 6148, "time_per_iteration": 2.854830503463745 }, { "auxiliary_loss_clip": 0.01320583, "auxiliary_loss_mlp": 0.01193189, "balance_loss_clip": 1.00762975, "balance_loss_mlp": 1.00016534, "epoch": 0.7393735345397703, "flos": 23585888466720.0, "grad_norm": 1.7971251838538467, "language_loss": 0.73014092, "learning_rate": 6.710657417437531e-07, "loss": 0.75527865, "num_input_tokens_seen": 132207495, "step": 6149, "time_per_iteration": 2.863603115081787 }, { "auxiliary_loss_clip": 0.01311702, "auxiliary_loss_mlp": 0.01193194, "balance_loss_clip": 1.00784397, "balance_loss_mlp": 1.00016999, "epoch": 0.7394937774304094, "flos": 19974782756160.0, "grad_norm": 2.2637703963474416, "language_loss": 0.8009541, "learning_rate": 6.704837034195628e-07, "loss": 0.82600302, "num_input_tokens_seen": 132225960, "step": 6150, "time_per_iteration": 2.824392557144165 }, { "auxiliary_loss_clip": 0.01337507, "auxiliary_loss_mlp": 0.01193236, "balance_loss_clip": 1.00807977, "balance_loss_mlp": 1.00021279, "epoch": 0.7396140203210485, "flos": 23478049283040.0, "grad_norm": 1.716942227694207, "language_loss": 0.84859025, "learning_rate": 6.699018667819376e-07, "loss": 0.87389779, "num_input_tokens_seen": 132245360, "step": 6151, "time_per_iteration": 2.8039021492004395 }, { "auxiliary_loss_clip": 0.01338331, "auxiliary_loss_mlp": 0.01193221, "balance_loss_clip": 1.0082829, "balance_loss_mlp": 1.00019729, "epoch": 0.7397342632116876, "flos": 25555445327520.0, "grad_norm": 1.4846656119020092, "language_loss": 0.72790861, "learning_rate": 6.693202319191415e-07, "loss": 0.75322413, "num_input_tokens_seen": 132267095, "step": 6152, "time_per_iteration": 2.7467777729034424 }, { "auxiliary_loss_clip": 0.01348897, "auxiliary_loss_mlp": 0.01193245, "balance_loss_clip": 1.0085355, "balance_loss_mlp": 1.00022101, "epoch": 0.7398545061023267, "flos": 24755033024640.0, "grad_norm": 2.3725095844987893, "language_loss": 0.74554998, "learning_rate": 6.687387989194084e-07, "loss": 0.77097142, "num_input_tokens_seen": 132286610, "step": 6153, "time_per_iteration": 2.7458090782165527 }, { "auxiliary_loss_clip": 0.01301635, "auxiliary_loss_mlp": 0.01193219, "balance_loss_clip": 1.00715041, "balance_loss_mlp": 1.0001955, "epoch": 0.7399747489929658, "flos": 16508612181600.0, "grad_norm": 1.7876583726402278, "language_loss": 0.79205751, "learning_rate": 6.681575678709404e-07, "loss": 0.81700611, "num_input_tokens_seen": 132305300, "step": 6154, "time_per_iteration": 2.7107841968536377 }, { "auxiliary_loss_clip": 0.01325498, "auxiliary_loss_mlp": 0.01193248, "balance_loss_clip": 1.00731182, "balance_loss_mlp": 1.00022471, "epoch": 0.7400949918836048, "flos": 24097077123840.0, "grad_norm": 1.8669315954752468, "language_loss": 0.70734012, "learning_rate": 6.67576538861911e-07, "loss": 0.73252755, "num_input_tokens_seen": 132323875, "step": 6155, "time_per_iteration": 2.776472568511963 }, { "auxiliary_loss_clip": 0.01310735, "auxiliary_loss_mlp": 0.01193104, "balance_loss_clip": 1.00776029, "balance_loss_mlp": 1.00017619, "epoch": 0.740215234774244, "flos": 21802529841120.0, "grad_norm": 1.4456015196423688, "language_loss": 0.82111573, "learning_rate": 6.669957119804612e-07, "loss": 0.84615409, "num_input_tokens_seen": 132345510, "step": 6156, "time_per_iteration": 2.796966314315796 }, { "auxiliary_loss_clip": 0.01320132, "auxiliary_loss_mlp": 0.01193351, "balance_loss_clip": 1.00742972, "balance_loss_mlp": 1.00023174, "epoch": 0.7403354776648831, "flos": 18733206553920.0, "grad_norm": 2.721157328602768, "language_loss": 0.72475874, "learning_rate": 6.66415087314702e-07, "loss": 0.74989355, "num_input_tokens_seen": 132360465, "step": 6157, "time_per_iteration": 2.8235504627227783 }, { "auxiliary_loss_clip": 0.01315465, "auxiliary_loss_mlp": 0.01193197, "balance_loss_clip": 1.00755334, "balance_loss_mlp": 1.00017333, "epoch": 0.7404557205555221, "flos": 16909590692160.0, "grad_norm": 2.1778702303434154, "language_loss": 0.73353136, "learning_rate": 6.65834664952714e-07, "loss": 0.758618, "num_input_tokens_seen": 132377915, "step": 6158, "time_per_iteration": 2.737611770629883 }, { "auxiliary_loss_clip": 0.01301885, "auxiliary_loss_mlp": 0.01193204, "balance_loss_clip": 1.00726223, "balance_loss_mlp": 1.00018013, "epoch": 0.7405759634461613, "flos": 21214419079680.0, "grad_norm": 1.7393642317814566, "language_loss": 0.76172721, "learning_rate": 6.652544449825457e-07, "loss": 0.78667814, "num_input_tokens_seen": 132398170, "step": 6159, "time_per_iteration": 2.79604434967041 }, { "auxiliary_loss_clip": 0.01309975, "auxiliary_loss_mlp": 0.01193264, "balance_loss_clip": 1.00840151, "balance_loss_mlp": 1.00024056, "epoch": 0.7406962063368003, "flos": 20480115853440.0, "grad_norm": 1.8000114039306254, "language_loss": 0.76469958, "learning_rate": 6.646744274922182e-07, "loss": 0.78973198, "num_input_tokens_seen": 132416615, "step": 6160, "time_per_iteration": 2.7534427642822266 }, { "auxiliary_loss_clip": 0.01312837, "auxiliary_loss_mlp": 0.01193179, "balance_loss_clip": 1.00727904, "balance_loss_mlp": 1.00015521, "epoch": 0.7408164492274394, "flos": 19791925423200.0, "grad_norm": 10.960685530657116, "language_loss": 0.75316918, "learning_rate": 6.640946125697171e-07, "loss": 0.77822936, "num_input_tokens_seen": 132434145, "step": 6161, "time_per_iteration": 2.778597354888916 }, { "auxiliary_loss_clip": 0.01326234, "auxiliary_loss_mlp": 0.01193296, "balance_loss_clip": 1.0071547, "balance_loss_mlp": 1.0001775, "epoch": 0.7409366921180786, "flos": 29204868395520.0, "grad_norm": 1.770591384436909, "language_loss": 0.75715923, "learning_rate": 6.635150003030017e-07, "loss": 0.78235459, "num_input_tokens_seen": 132452670, "step": 6162, "time_per_iteration": 2.8603594303131104 }, { "auxiliary_loss_clip": 0.01272496, "auxiliary_loss_mlp": 0.01193317, "balance_loss_clip": 1.00745368, "balance_loss_mlp": 1.00019789, "epoch": 0.7410569350087176, "flos": 22930016139360.0, "grad_norm": 2.3712038893738487, "language_loss": 0.85987139, "learning_rate": 6.629355907799981e-07, "loss": 0.88452947, "num_input_tokens_seen": 132472475, "step": 6163, "time_per_iteration": 2.8812456130981445 }, { "auxiliary_loss_clip": 0.01328721, "auxiliary_loss_mlp": 0.01193199, "balance_loss_clip": 1.00741136, "balance_loss_mlp": 1.00017583, "epoch": 0.7411771778993567, "flos": 30440409419520.0, "grad_norm": 1.6986072302573358, "language_loss": 0.69354308, "learning_rate": 6.623563840886015e-07, "loss": 0.71876228, "num_input_tokens_seen": 132493400, "step": 6164, "time_per_iteration": 2.845182418823242 }, { "auxiliary_loss_clip": 0.01336688, "auxiliary_loss_mlp": 0.01193172, "balance_loss_clip": 1.00822318, "balance_loss_mlp": 1.00014877, "epoch": 0.7412974207899958, "flos": 20522061502560.0, "grad_norm": 1.6624562846795343, "language_loss": 0.69596386, "learning_rate": 6.617773803166795e-07, "loss": 0.72126251, "num_input_tokens_seen": 132511725, "step": 6165, "time_per_iteration": 2.732426404953003 }, { "auxiliary_loss_clip": 0.01309917, "auxiliary_loss_mlp": 0.00872535, "balance_loss_clip": 1.00711775, "balance_loss_mlp": 1.0002625, "epoch": 0.7414176636806349, "flos": 22090711700160.0, "grad_norm": 2.004579106313131, "language_loss": 0.81930804, "learning_rate": 6.611985795520634e-07, "loss": 0.84113258, "num_input_tokens_seen": 132530270, "step": 6166, "time_per_iteration": 2.92401385307312 }, { "auxiliary_loss_clip": 0.01299063, "auxiliary_loss_mlp": 0.0119334, "balance_loss_clip": 1.00767827, "balance_loss_mlp": 1.00022066, "epoch": 0.7415379065712739, "flos": 25155257137920.0, "grad_norm": 1.871232177600521, "language_loss": 0.77163661, "learning_rate": 6.606199818825588e-07, "loss": 0.79656065, "num_input_tokens_seen": 132550725, "step": 6167, "time_per_iteration": 2.901960611343384 }, { "auxiliary_loss_clip": 0.01325299, "auxiliary_loss_mlp": 0.01193177, "balance_loss_clip": 1.00811434, "balance_loss_mlp": 1.00015306, "epoch": 0.7416581494619131, "flos": 16871740342560.0, "grad_norm": 2.0982087280671946, "language_loss": 0.81958485, "learning_rate": 6.600415873959377e-07, "loss": 0.8447696, "num_input_tokens_seen": 132568600, "step": 6168, "time_per_iteration": 2.7878549098968506 }, { "auxiliary_loss_clip": 0.01271354, "auxiliary_loss_mlp": 0.00872313, "balance_loss_clip": 1.00771928, "balance_loss_mlp": 1.00025558, "epoch": 0.7417783923525522, "flos": 28438893692640.0, "grad_norm": 2.0166394414492825, "language_loss": 0.64805967, "learning_rate": 6.594633961799437e-07, "loss": 0.6694963, "num_input_tokens_seen": 132587640, "step": 6169, "time_per_iteration": 3.134956121444702 }, { "auxiliary_loss_clip": 0.01302585, "auxiliary_loss_mlp": 0.01193315, "balance_loss_clip": 1.00727248, "balance_loss_mlp": 1.00019622, "epoch": 0.7418986352431912, "flos": 20084310352800.0, "grad_norm": 1.5479801670157984, "language_loss": 0.81277716, "learning_rate": 6.588854083222857e-07, "loss": 0.83773619, "num_input_tokens_seen": 132607075, "step": 6170, "time_per_iteration": 3.8071980476379395 }, { "auxiliary_loss_clip": 0.0131125, "auxiliary_loss_mlp": 0.0119358, "balance_loss_clip": 1.0074966, "balance_loss_mlp": 1.00027072, "epoch": 0.7420188781338304, "flos": 18259580856960.0, "grad_norm": 3.304314283757947, "language_loss": 0.80692512, "learning_rate": 6.583076239106444e-07, "loss": 0.83197343, "num_input_tokens_seen": 132625580, "step": 6171, "time_per_iteration": 3.6588778495788574 }, { "auxiliary_loss_clip": 0.01316682, "auxiliary_loss_mlp": 0.01193283, "balance_loss_clip": 1.00735641, "balance_loss_mlp": 1.00016403, "epoch": 0.7421391210244694, "flos": 13772002907520.0, "grad_norm": 2.2174316635586813, "language_loss": 0.74959385, "learning_rate": 6.577300430326707e-07, "loss": 0.77469349, "num_input_tokens_seen": 132640525, "step": 6172, "time_per_iteration": 5.3255615234375 }, { "auxiliary_loss_clip": 0.01283335, "auxiliary_loss_mlp": 0.01193193, "balance_loss_clip": 1.00684762, "balance_loss_mlp": 1.0001694, "epoch": 0.7422593639151085, "flos": 15961692672000.0, "grad_norm": 2.0168291567115806, "language_loss": 0.71944141, "learning_rate": 6.571526657759821e-07, "loss": 0.74420667, "num_input_tokens_seen": 132656265, "step": 6173, "time_per_iteration": 3.087273359298706 }, { "auxiliary_loss_clip": 0.01337664, "auxiliary_loss_mlp": 0.01193387, "balance_loss_clip": 1.00833917, "balance_loss_mlp": 1.00026751, "epoch": 0.7423796068057477, "flos": 30114413134560.0, "grad_norm": 1.7079707346689854, "language_loss": 0.71082473, "learning_rate": 6.565754922281663e-07, "loss": 0.73613518, "num_input_tokens_seen": 132678510, "step": 6174, "time_per_iteration": 2.790358066558838 }, { "auxiliary_loss_clip": 0.0132424, "auxiliary_loss_mlp": 0.01193268, "balance_loss_clip": 1.0079248, "balance_loss_mlp": 1.00014925, "epoch": 0.7424998496963867, "flos": 20521917807840.0, "grad_norm": 2.447021718504806, "language_loss": 0.78420782, "learning_rate": 6.559985224767801e-07, "loss": 0.80938292, "num_input_tokens_seen": 132696385, "step": 6175, "time_per_iteration": 2.9474241733551025 }, { "auxiliary_loss_clip": 0.01285272, "auxiliary_loss_mlp": 0.01193213, "balance_loss_clip": 1.00781107, "balance_loss_mlp": 1.00018907, "epoch": 0.7426200925870258, "flos": 21871584659520.0, "grad_norm": 2.2283626031155723, "language_loss": 0.75301456, "learning_rate": 6.55421756609349e-07, "loss": 0.77779937, "num_input_tokens_seen": 132714640, "step": 6176, "time_per_iteration": 2.920032024383545 }, { "auxiliary_loss_clip": 0.0132511, "auxiliary_loss_mlp": 0.01193286, "balance_loss_clip": 1.00804567, "balance_loss_mlp": 1.00026226, "epoch": 0.7427403354776649, "flos": 26432061261120.0, "grad_norm": 1.7812459013646535, "language_loss": 0.78962076, "learning_rate": 6.54845194713369e-07, "loss": 0.81480467, "num_input_tokens_seen": 132735590, "step": 6177, "time_per_iteration": 2.8040974140167236 }, { "auxiliary_loss_clip": 0.01337683, "auxiliary_loss_mlp": 0.01193194, "balance_loss_clip": 1.00825894, "balance_loss_mlp": 1.00017095, "epoch": 0.742860578368304, "flos": 19898399507040.0, "grad_norm": 1.7804305825957745, "language_loss": 0.79805958, "learning_rate": 6.542688368763034e-07, "loss": 0.82336831, "num_input_tokens_seen": 132753995, "step": 6178, "time_per_iteration": 2.780086040496826 }, { "auxiliary_loss_clip": 0.01325409, "auxiliary_loss_mlp": 0.01193224, "balance_loss_clip": 1.00757933, "balance_loss_mlp": 1.00020063, "epoch": 0.742980821258943, "flos": 24827213203200.0, "grad_norm": 1.6609343043674927, "language_loss": 0.77190125, "learning_rate": 6.536926831855854e-07, "loss": 0.79708761, "num_input_tokens_seen": 132773160, "step": 6179, "time_per_iteration": 2.812840700149536 }, { "auxiliary_loss_clip": 0.0130241, "auxiliary_loss_mlp": 0.01193251, "balance_loss_clip": 1.00662231, "balance_loss_mlp": 1.00022721, "epoch": 0.7431010641495821, "flos": 25228658721600.0, "grad_norm": 2.142783213120927, "language_loss": 0.73499739, "learning_rate": 6.531167337286165e-07, "loss": 0.75995398, "num_input_tokens_seen": 132793180, "step": 6180, "time_per_iteration": 2.855921745300293 }, { "auxiliary_loss_clip": 0.01302973, "auxiliary_loss_mlp": 0.01193235, "balance_loss_clip": 1.00727892, "balance_loss_mlp": 1.000211, "epoch": 0.7432213070402213, "flos": 21762380376000.0, "grad_norm": 1.3782888870988388, "language_loss": 0.79591882, "learning_rate": 6.52540988592768e-07, "loss": 0.82088095, "num_input_tokens_seen": 132814200, "step": 6181, "time_per_iteration": 2.8167684078216553 }, { "auxiliary_loss_clip": 0.01316332, "auxiliary_loss_mlp": 0.0119325, "balance_loss_clip": 1.00799513, "balance_loss_mlp": 1.00022602, "epoch": 0.7433415499308603, "flos": 14793841366560.0, "grad_norm": 2.0237044494905403, "language_loss": 0.83420801, "learning_rate": 6.519654478653814e-07, "loss": 0.85930383, "num_input_tokens_seen": 132832565, "step": 6182, "time_per_iteration": 2.8435959815979004 }, { "auxiliary_loss_clip": 0.01295227, "auxiliary_loss_mlp": 0.01192281, "balance_loss_clip": 1.00511026, "balance_loss_mlp": 1.00001991, "epoch": 0.7434617928214994, "flos": 67155612456960.0, "grad_norm": 0.7533042574696031, "language_loss": 0.56116247, "learning_rate": 6.51390111633763e-07, "loss": 0.58603752, "num_input_tokens_seen": 132897840, "step": 6183, "time_per_iteration": 3.466554641723633 }, { "auxiliary_loss_clip": 0.01273739, "auxiliary_loss_mlp": 0.01193218, "balance_loss_clip": 1.00766921, "balance_loss_mlp": 1.00019419, "epoch": 0.7435820357121385, "flos": 27377588318400.0, "grad_norm": 1.536546099846646, "language_loss": 0.76369733, "learning_rate": 6.508149799851932e-07, "loss": 0.78836691, "num_input_tokens_seen": 132919505, "step": 6184, "time_per_iteration": 2.9793484210968018 }, { "auxiliary_loss_clip": 0.01301437, "auxiliary_loss_mlp": 0.01193206, "balance_loss_clip": 1.00715959, "balance_loss_mlp": 1.00018227, "epoch": 0.7437022786027776, "flos": 23987657298240.0, "grad_norm": 1.9143428096245994, "language_loss": 0.61262155, "learning_rate": 6.502400530069183e-07, "loss": 0.637568, "num_input_tokens_seen": 132939390, "step": 6185, "time_per_iteration": 2.85964035987854 }, { "auxiliary_loss_clip": 0.01292587, "auxiliary_loss_mlp": 0.01193515, "balance_loss_clip": 1.0074122, "balance_loss_mlp": 1.0002048, "epoch": 0.7438225214934167, "flos": 21866770886400.0, "grad_norm": 1.5057079555351165, "language_loss": 0.68598521, "learning_rate": 6.496653307861535e-07, "loss": 0.71084619, "num_input_tokens_seen": 132960060, "step": 6186, "time_per_iteration": 2.907089948654175 }, { "auxiliary_loss_clip": 0.01338535, "auxiliary_loss_mlp": 0.01193205, "balance_loss_clip": 1.00820899, "balance_loss_mlp": 1.00018144, "epoch": 0.7439427643840558, "flos": 20230107657120.0, "grad_norm": 1.9774913818512938, "language_loss": 0.65824795, "learning_rate": 6.490908134100857e-07, "loss": 0.68356538, "num_input_tokens_seen": 132978525, "step": 6187, "time_per_iteration": 2.804330348968506 }, { "auxiliary_loss_clip": 0.01337509, "auxiliary_loss_mlp": 0.01193307, "balance_loss_clip": 1.0081408, "balance_loss_mlp": 1.0001874, "epoch": 0.7440630072746949, "flos": 20849925818880.0, "grad_norm": 2.0402448208190456, "language_loss": 0.69286531, "learning_rate": 6.48516500965866e-07, "loss": 0.7181735, "num_input_tokens_seen": 132998460, "step": 6188, "time_per_iteration": 2.8424854278564453 }, { "auxiliary_loss_clip": 0.01337646, "auxiliary_loss_mlp": 0.01193212, "balance_loss_clip": 1.00764263, "balance_loss_mlp": 1.00018871, "epoch": 0.7441832501653339, "flos": 26503774431840.0, "grad_norm": 1.6451707819563481, "language_loss": 0.81500185, "learning_rate": 6.479423935406192e-07, "loss": 0.84031045, "num_input_tokens_seen": 133018445, "step": 6189, "time_per_iteration": 2.856435537338257 }, { "auxiliary_loss_clip": 0.01274269, "auxiliary_loss_mlp": 0.01192301, "balance_loss_clip": 1.0052855, "balance_loss_mlp": 1.00004041, "epoch": 0.7443034930559731, "flos": 68602880243520.0, "grad_norm": 0.805941466907744, "language_loss": 0.62036526, "learning_rate": 6.473684912214357e-07, "loss": 0.64503098, "num_input_tokens_seen": 133082005, "step": 6190, "time_per_iteration": 3.4933888912200928 }, { "auxiliary_loss_clip": 0.01325734, "auxiliary_loss_mlp": 0.01193201, "balance_loss_clip": 1.00757957, "balance_loss_mlp": 1.00017691, "epoch": 0.7444237359466122, "flos": 18654991197120.0, "grad_norm": 1.882161530898176, "language_loss": 0.69611782, "learning_rate": 6.467947940953778e-07, "loss": 0.72130716, "num_input_tokens_seen": 133100530, "step": 6191, "time_per_iteration": 2.8324971199035645 }, { "auxiliary_loss_clip": 0.01315255, "auxiliary_loss_mlp": 0.01193184, "balance_loss_clip": 1.00767171, "balance_loss_mlp": 1.00016081, "epoch": 0.7445439788372512, "flos": 22817614648320.0, "grad_norm": 1.740463667888001, "language_loss": 0.72532713, "learning_rate": 6.462213022494732e-07, "loss": 0.75041151, "num_input_tokens_seen": 133119775, "step": 6192, "time_per_iteration": 2.9807119369506836 }, { "auxiliary_loss_clip": 0.01307417, "auxiliary_loss_mlp": 0.01192283, "balance_loss_clip": 1.00512028, "balance_loss_mlp": 1.00002217, "epoch": 0.7446642217278904, "flos": 67045725623520.0, "grad_norm": 0.7691364473357407, "language_loss": 0.61071455, "learning_rate": 6.456480157707201e-07, "loss": 0.63571155, "num_input_tokens_seen": 133184550, "step": 6193, "time_per_iteration": 3.3564705848693848 }, { "auxiliary_loss_clip": 0.0130153, "auxiliary_loss_mlp": 0.01193201, "balance_loss_clip": 1.00761962, "balance_loss_mlp": 1.00017691, "epoch": 0.7447844646185294, "flos": 17417474370720.0, "grad_norm": 1.822394865434637, "language_loss": 0.85195041, "learning_rate": 6.450749347460866e-07, "loss": 0.87689781, "num_input_tokens_seen": 133201525, "step": 6194, "time_per_iteration": 3.0246005058288574 }, { "auxiliary_loss_clip": 0.01349172, "auxiliary_loss_mlp": 0.01193186, "balance_loss_clip": 1.00800872, "balance_loss_mlp": 1.00016201, "epoch": 0.7449047075091685, "flos": 26615888533440.0, "grad_norm": 1.8984224850173943, "language_loss": 0.78734267, "learning_rate": 6.445020592625083e-07, "loss": 0.81276619, "num_input_tokens_seen": 133222175, "step": 6195, "time_per_iteration": 2.8730616569519043 }, { "auxiliary_loss_clip": 0.01349059, "auxiliary_loss_mlp": 0.0119324, "balance_loss_clip": 1.00770152, "balance_loss_mlp": 1.00021672, "epoch": 0.7450249503998077, "flos": 14170466760480.0, "grad_norm": 2.0016313888163415, "language_loss": 0.79943073, "learning_rate": 6.4392938940689e-07, "loss": 0.82485372, "num_input_tokens_seen": 133237590, "step": 6196, "time_per_iteration": 3.751429319381714 }, { "auxiliary_loss_clip": 0.01276449, "auxiliary_loss_mlp": 0.00872453, "balance_loss_clip": 1.00719714, "balance_loss_mlp": 1.0002085, "epoch": 0.7451451932904467, "flos": 19606697127360.0, "grad_norm": 2.159213392992975, "language_loss": 0.7096476, "learning_rate": 6.433569252661049e-07, "loss": 0.73113656, "num_input_tokens_seen": 133255590, "step": 6197, "time_per_iteration": 3.866403579711914 }, { "auxiliary_loss_clip": 0.01297021, "auxiliary_loss_mlp": 0.01193208, "balance_loss_clip": 1.00695109, "balance_loss_mlp": 1.00018442, "epoch": 0.7452654361810858, "flos": 12495414326400.0, "grad_norm": 1.7340499338581687, "language_loss": 0.71190977, "learning_rate": 6.427846669269952e-07, "loss": 0.73681211, "num_input_tokens_seen": 133273210, "step": 6198, "time_per_iteration": 4.172312259674072 }, { "auxiliary_loss_clip": 0.01350437, "auxiliary_loss_mlp": 0.01193215, "balance_loss_clip": 1.00844049, "balance_loss_mlp": 1.00019169, "epoch": 0.7453856790717249, "flos": 22127340644640.0, "grad_norm": 1.8957282493459924, "language_loss": 0.8236419, "learning_rate": 6.422126144763729e-07, "loss": 0.84907836, "num_input_tokens_seen": 133292600, "step": 6199, "time_per_iteration": 2.8451907634735107 }, { "auxiliary_loss_clip": 0.01302486, "auxiliary_loss_mlp": 0.00872554, "balance_loss_clip": 1.00732148, "balance_loss_mlp": 1.00034738, "epoch": 0.745505921962364, "flos": 20010693227040.0, "grad_norm": 2.1992774774362287, "language_loss": 0.76708531, "learning_rate": 6.416407680010174e-07, "loss": 0.7888357, "num_input_tokens_seen": 133306960, "step": 6200, "time_per_iteration": 2.9937069416046143 }, { "auxiliary_loss_clip": 0.0127425, "auxiliary_loss_mlp": 0.01193277, "balance_loss_clip": 1.00742888, "balance_loss_mlp": 1.00025296, "epoch": 0.745626164853003, "flos": 24677895378240.0, "grad_norm": 2.0371448071146245, "language_loss": 0.81055075, "learning_rate": 6.410691275876774e-07, "loss": 0.83522606, "num_input_tokens_seen": 133326380, "step": 6201, "time_per_iteration": 3.179377794265747 }, { "auxiliary_loss_clip": 0.01309173, "auxiliary_loss_mlp": 0.01193213, "balance_loss_clip": 1.00827885, "balance_loss_mlp": 1.00018907, "epoch": 0.7457464077436422, "flos": 14538839778720.0, "grad_norm": 2.2653050721272936, "language_loss": 0.76569957, "learning_rate": 6.404976933230704e-07, "loss": 0.79072344, "num_input_tokens_seen": 133342900, "step": 6202, "time_per_iteration": 2.9058477878570557 }, { "auxiliary_loss_clip": 0.01325109, "auxiliary_loss_mlp": 0.01193242, "balance_loss_clip": 1.00822616, "balance_loss_mlp": 1.00021863, "epoch": 0.7458666506342813, "flos": 34021208753280.0, "grad_norm": 1.7403327493804166, "language_loss": 0.7238971, "learning_rate": 6.399264652938813e-07, "loss": 0.7490806, "num_input_tokens_seen": 133363805, "step": 6203, "time_per_iteration": 3.0848515033721924 }, { "auxiliary_loss_clip": 0.01312168, "auxiliary_loss_mlp": 0.01193207, "balance_loss_clip": 1.00727129, "balance_loss_mlp": 1.00018311, "epoch": 0.7459868935249203, "flos": 24279036364800.0, "grad_norm": 1.8021610951438352, "language_loss": 0.74752587, "learning_rate": 6.393554435867679e-07, "loss": 0.77257961, "num_input_tokens_seen": 133384655, "step": 6204, "time_per_iteration": 2.943171977996826 }, { "auxiliary_loss_clip": 0.0130458, "auxiliary_loss_mlp": 0.01193192, "balance_loss_clip": 1.00806594, "balance_loss_mlp": 1.00016785, "epoch": 0.7461071364155595, "flos": 21908788382880.0, "grad_norm": 1.8795098364009506, "language_loss": 0.83272409, "learning_rate": 6.387846282883502e-07, "loss": 0.85770178, "num_input_tokens_seen": 133401185, "step": 6205, "time_per_iteration": 2.830019235610962 }, { "auxiliary_loss_clip": 0.01349126, "auxiliary_loss_mlp": 0.01193215, "balance_loss_clip": 1.00810599, "balance_loss_mlp": 1.00019109, "epoch": 0.7462273793061985, "flos": 22889722979520.0, "grad_norm": 1.8997487162605395, "language_loss": 0.76868677, "learning_rate": 6.38214019485223e-07, "loss": 0.79411018, "num_input_tokens_seen": 133420010, "step": 6206, "time_per_iteration": 2.8284666538238525 }, { "auxiliary_loss_clip": 0.01256998, "auxiliary_loss_mlp": 0.01193209, "balance_loss_clip": 1.0065465, "balance_loss_mlp": 1.00018525, "epoch": 0.7463476221968376, "flos": 19968460188480.0, "grad_norm": 1.61350450687455, "language_loss": 0.71678537, "learning_rate": 6.376436172639461e-07, "loss": 0.74128741, "num_input_tokens_seen": 133437855, "step": 6207, "time_per_iteration": 2.873351573944092 }, { "auxiliary_loss_clip": 0.01232865, "auxiliary_loss_mlp": 0.01193234, "balance_loss_clip": 1.00701451, "balance_loss_mlp": 1.00021029, "epoch": 0.7464678650874768, "flos": 16836620192640.0, "grad_norm": 2.9950296178705584, "language_loss": 0.6481576, "learning_rate": 6.370734217110487e-07, "loss": 0.67241859, "num_input_tokens_seen": 133456600, "step": 6208, "time_per_iteration": 2.9823837280273438 }, { "auxiliary_loss_clip": 0.01309502, "auxiliary_loss_mlp": 0.01193418, "balance_loss_clip": 1.00794411, "balance_loss_mlp": 1.00020409, "epoch": 0.7465881079781158, "flos": 48100886868960.0, "grad_norm": 1.370217080122748, "language_loss": 0.63974673, "learning_rate": 6.36503432913031e-07, "loss": 0.66477591, "num_input_tokens_seen": 133479745, "step": 6209, "time_per_iteration": 3.18865704536438 }, { "auxiliary_loss_clip": 0.0132421, "auxiliary_loss_mlp": 0.0119318, "balance_loss_clip": 1.00769365, "balance_loss_mlp": 1.00015616, "epoch": 0.7467083508687549, "flos": 19677368511360.0, "grad_norm": 1.7851074740708965, "language_loss": 0.68913877, "learning_rate": 6.359336509563569e-07, "loss": 0.71431267, "num_input_tokens_seen": 133495765, "step": 6210, "time_per_iteration": 2.718594789505005 }, { "auxiliary_loss_clip": 0.01287846, "auxiliary_loss_mlp": 0.01193217, "balance_loss_clip": 1.00747693, "balance_loss_mlp": 1.00019336, "epoch": 0.7468285937593939, "flos": 17895446832960.0, "grad_norm": 2.3114554520735284, "language_loss": 0.80762255, "learning_rate": 6.353640759274641e-07, "loss": 0.83243322, "num_input_tokens_seen": 133514655, "step": 6211, "time_per_iteration": 2.8758704662323 }, { "auxiliary_loss_clip": 0.01337177, "auxiliary_loss_mlp": 0.01193201, "balance_loss_clip": 1.0078969, "balance_loss_mlp": 1.0001775, "epoch": 0.7469488366500331, "flos": 23141455512480.0, "grad_norm": 2.4712065801900023, "language_loss": 0.75298315, "learning_rate": 6.347947079127556e-07, "loss": 0.77828693, "num_input_tokens_seen": 133532555, "step": 6212, "time_per_iteration": 2.7464282512664795 }, { "auxiliary_loss_clip": 0.01314304, "auxiliary_loss_mlp": 0.01193224, "balance_loss_clip": 1.00771379, "balance_loss_mlp": 1.00020027, "epoch": 0.7470690795406721, "flos": 16690858812000.0, "grad_norm": 1.8939732600307155, "language_loss": 0.76609498, "learning_rate": 6.342255469986053e-07, "loss": 0.79117024, "num_input_tokens_seen": 133551300, "step": 6213, "time_per_iteration": 2.8561060428619385 }, { "auxiliary_loss_clip": 0.01349524, "auxiliary_loss_mlp": 0.01193247, "balance_loss_clip": 1.0081929, "balance_loss_mlp": 1.00022316, "epoch": 0.7471893224313112, "flos": 25192712327040.0, "grad_norm": 1.7908418156147825, "language_loss": 0.76221287, "learning_rate": 6.336565932713533e-07, "loss": 0.78764057, "num_input_tokens_seen": 133570725, "step": 6214, "time_per_iteration": 2.743830919265747 }, { "auxiliary_loss_clip": 0.01301192, "auxiliary_loss_mlp": 0.01193203, "balance_loss_clip": 1.00717449, "balance_loss_mlp": 1.00017965, "epoch": 0.7473095653219504, "flos": 22526235581760.0, "grad_norm": 1.6860337027558774, "language_loss": 0.77451003, "learning_rate": 6.330878468173088e-07, "loss": 0.79945397, "num_input_tokens_seen": 133590790, "step": 6215, "time_per_iteration": 2.84428334236145 }, { "auxiliary_loss_clip": 0.01337183, "auxiliary_loss_mlp": 0.01193173, "balance_loss_clip": 1.00791287, "balance_loss_mlp": 1.00014901, "epoch": 0.7474298082125894, "flos": 18113999094720.0, "grad_norm": 1.714808427592634, "language_loss": 0.72513533, "learning_rate": 6.32519307722752e-07, "loss": 0.75043887, "num_input_tokens_seen": 133608685, "step": 6216, "time_per_iteration": 2.7864532470703125 }, { "auxiliary_loss_clip": 0.01268425, "auxiliary_loss_mlp": 0.01192282, "balance_loss_clip": 1.00832009, "balance_loss_mlp": 1.00002158, "epoch": 0.7475500511032285, "flos": 62086569626880.0, "grad_norm": 0.8480117600710682, "language_loss": 0.54986405, "learning_rate": 6.31950976073929e-07, "loss": 0.57447112, "num_input_tokens_seen": 133662775, "step": 6217, "time_per_iteration": 3.3156633377075195 }, { "auxiliary_loss_clip": 0.01270926, "auxiliary_loss_mlp": 0.0119334, "balance_loss_clip": 1.00774479, "balance_loss_mlp": 1.00022113, "epoch": 0.7476702939938676, "flos": 17785595923200.0, "grad_norm": 2.123201890940312, "language_loss": 0.81025362, "learning_rate": 6.31382851957055e-07, "loss": 0.83489627, "num_input_tokens_seen": 133679595, "step": 6218, "time_per_iteration": 2.811525344848633 }, { "auxiliary_loss_clip": 0.01292483, "auxiliary_loss_mlp": 0.00872425, "balance_loss_clip": 1.00760496, "balance_loss_mlp": 1.0002687, "epoch": 0.7477905368845067, "flos": 27927956501280.0, "grad_norm": 1.832422864122524, "language_loss": 0.71368319, "learning_rate": 6.308149354583143e-07, "loss": 0.73533225, "num_input_tokens_seen": 133699000, "step": 6219, "time_per_iteration": 2.894939422607422 }, { "auxiliary_loss_clip": 0.0133247, "auxiliary_loss_mlp": 0.01193239, "balance_loss_clip": 1.00781941, "balance_loss_mlp": 1.00021553, "epoch": 0.7479107797751458, "flos": 26870387189760.0, "grad_norm": 1.6929693533288668, "language_loss": 0.81484365, "learning_rate": 6.302472266638586e-07, "loss": 0.84010077, "num_input_tokens_seen": 133719540, "step": 6220, "time_per_iteration": 2.7907187938690186 }, { "auxiliary_loss_clip": 0.0135192, "auxiliary_loss_mlp": 0.01193311, "balance_loss_clip": 1.00895035, "balance_loss_mlp": 1.0001924, "epoch": 0.7480310226657849, "flos": 33943388556960.0, "grad_norm": 1.8961171654900852, "language_loss": 0.69638377, "learning_rate": 6.296797256598101e-07, "loss": 0.72183603, "num_input_tokens_seen": 133741020, "step": 6221, "time_per_iteration": 2.8226966857910156 }, { "auxiliary_loss_clip": 0.01294779, "auxiliary_loss_mlp": 0.0119316, "balance_loss_clip": 1.00689375, "balance_loss_mlp": 1.00013673, "epoch": 0.748151265556424, "flos": 24826566576960.0, "grad_norm": 1.9207022253339563, "language_loss": 0.81525224, "learning_rate": 6.291124325322576e-07, "loss": 0.84013164, "num_input_tokens_seen": 133761145, "step": 6222, "time_per_iteration": 3.795985698699951 }, { "auxiliary_loss_clip": 0.01324309, "auxiliary_loss_mlp": 0.01193205, "balance_loss_clip": 1.00832939, "balance_loss_mlp": 1.0001812, "epoch": 0.748271508447063, "flos": 38399366877120.0, "grad_norm": 1.4866499580327215, "language_loss": 0.62530357, "learning_rate": 6.285453473672595e-07, "loss": 0.65047872, "num_input_tokens_seen": 133783715, "step": 6223, "time_per_iteration": 3.8619377613067627 }, { "auxiliary_loss_clip": 0.01349234, "auxiliary_loss_mlp": 0.01193208, "balance_loss_clip": 1.00766027, "balance_loss_mlp": 1.0001843, "epoch": 0.7483917513377022, "flos": 21541852311840.0, "grad_norm": 2.3811811971360446, "language_loss": 0.75715601, "learning_rate": 6.279784702508415e-07, "loss": 0.78258049, "num_input_tokens_seen": 133804465, "step": 6224, "time_per_iteration": 3.7666239738464355 }, { "auxiliary_loss_clip": 0.0126739, "auxiliary_loss_mlp": 0.01192322, "balance_loss_clip": 1.00526774, "balance_loss_mlp": 1.00006127, "epoch": 0.7485119942283412, "flos": 62314569816480.0, "grad_norm": 0.779668089143848, "language_loss": 0.58628023, "learning_rate": 6.274118012689979e-07, "loss": 0.61087734, "num_input_tokens_seen": 133866365, "step": 6225, "time_per_iteration": 3.519904136657715 }, { "auxiliary_loss_clip": 0.01312348, "auxiliary_loss_mlp": 0.01193163, "balance_loss_clip": 1.00780773, "balance_loss_mlp": 1.00013947, "epoch": 0.7486322371189803, "flos": 29937626903520.0, "grad_norm": 1.4830031252265246, "language_loss": 0.67825234, "learning_rate": 6.268453405076943e-07, "loss": 0.70330751, "num_input_tokens_seen": 133888760, "step": 6226, "time_per_iteration": 2.820808172225952 }, { "auxiliary_loss_clip": 0.01314244, "auxiliary_loss_mlp": 0.01193297, "balance_loss_clip": 1.00729358, "balance_loss_mlp": 1.00017846, "epoch": 0.7487524800096195, "flos": 18949423776480.0, "grad_norm": 1.7841243778520437, "language_loss": 0.8263104, "learning_rate": 6.262790880528592e-07, "loss": 0.85138583, "num_input_tokens_seen": 133906380, "step": 6227, "time_per_iteration": 2.7525339126586914 }, { "auxiliary_loss_clip": 0.0131329, "auxiliary_loss_mlp": 0.01193451, "balance_loss_clip": 1.00788927, "balance_loss_mlp": 1.00023651, "epoch": 0.7488727229002585, "flos": 18697403854080.0, "grad_norm": 2.3689145673043193, "language_loss": 0.79819232, "learning_rate": 6.257130439903951e-07, "loss": 0.82325971, "num_input_tokens_seen": 133922875, "step": 6228, "time_per_iteration": 2.7902727127075195 }, { "auxiliary_loss_clip": 0.01350894, "auxiliary_loss_mlp": 0.01193261, "balance_loss_clip": 1.00851107, "balance_loss_mlp": 1.00023723, "epoch": 0.7489929657908976, "flos": 23623379579520.0, "grad_norm": 1.8178864348729507, "language_loss": 0.81214046, "learning_rate": 6.251472084061695e-07, "loss": 0.83758205, "num_input_tokens_seen": 133941795, "step": 6229, "time_per_iteration": 2.736898422241211 }, { "auxiliary_loss_clip": 0.0132467, "auxiliary_loss_mlp": 0.01193183, "balance_loss_clip": 1.00727439, "balance_loss_mlp": 1.00015903, "epoch": 0.7491132086815367, "flos": 20551541634720.0, "grad_norm": 1.8808628492432309, "language_loss": 0.88834298, "learning_rate": 6.245815813860191e-07, "loss": 0.91352147, "num_input_tokens_seen": 133957305, "step": 6230, "time_per_iteration": 2.7211735248565674 }, { "auxiliary_loss_clip": 0.0135013, "auxiliary_loss_mlp": 0.01193354, "balance_loss_clip": 1.00797665, "balance_loss_mlp": 1.00023508, "epoch": 0.7492334515721758, "flos": 23003022562560.0, "grad_norm": 2.0094180393440126, "language_loss": 0.70293355, "learning_rate": 6.240161630157495e-07, "loss": 0.7283684, "num_input_tokens_seen": 133976660, "step": 6231, "time_per_iteration": 2.8338570594787598 }, { "auxiliary_loss_clip": 0.01349502, "auxiliary_loss_mlp": 0.01193251, "balance_loss_clip": 1.00802994, "balance_loss_mlp": 1.00022781, "epoch": 0.7493536944628149, "flos": 16398833119200.0, "grad_norm": 2.0907412276270243, "language_loss": 0.70083237, "learning_rate": 6.23450953381133e-07, "loss": 0.72625995, "num_input_tokens_seen": 133994750, "step": 6232, "time_per_iteration": 2.6385111808776855 }, { "auxiliary_loss_clip": 0.01318008, "auxiliary_loss_mlp": 0.01193271, "balance_loss_clip": 1.00739658, "balance_loss_mlp": 1.00015187, "epoch": 0.749473937353454, "flos": 15338569531680.0, "grad_norm": 1.8127448175458953, "language_loss": 0.68095016, "learning_rate": 6.228859525679131e-07, "loss": 0.70606291, "num_input_tokens_seen": 134009165, "step": 6233, "time_per_iteration": 2.771075487136841 }, { "auxiliary_loss_clip": 0.01328742, "auxiliary_loss_mlp": 0.01193189, "balance_loss_clip": 1.00746322, "balance_loss_mlp": 1.00016582, "epoch": 0.7495941802440931, "flos": 18951148113120.0, "grad_norm": 1.9971311862580987, "language_loss": 0.79730535, "learning_rate": 6.223211606617986e-07, "loss": 0.82252461, "num_input_tokens_seen": 134027585, "step": 6234, "time_per_iteration": 2.7389678955078125 }, { "auxiliary_loss_clip": 0.01329182, "auxiliary_loss_mlp": 0.01193181, "balance_loss_clip": 1.00757456, "balance_loss_mlp": 1.00015771, "epoch": 0.7497144231347321, "flos": 22492480531680.0, "grad_norm": 1.7202484222876593, "language_loss": 0.84156239, "learning_rate": 6.217565777484701e-07, "loss": 0.86678606, "num_input_tokens_seen": 134046680, "step": 6235, "time_per_iteration": 2.773585081100464 }, { "auxiliary_loss_clip": 0.01303557, "auxiliary_loss_mlp": 0.00872417, "balance_loss_clip": 1.00702524, "balance_loss_mlp": 1.00022888, "epoch": 0.7498346660253713, "flos": 24243521054400.0, "grad_norm": 1.7502186043204973, "language_loss": 0.80128682, "learning_rate": 6.211922039135722e-07, "loss": 0.82304651, "num_input_tokens_seen": 134066825, "step": 6236, "time_per_iteration": 2.838740587234497 }, { "auxiliary_loss_clip": 0.01349672, "auxiliary_loss_mlp": 0.01193323, "balance_loss_clip": 1.00818372, "balance_loss_mlp": 1.00020397, "epoch": 0.7499549089160104, "flos": 24387091090560.0, "grad_norm": 2.0145420936224347, "language_loss": 0.8095485, "learning_rate": 6.206280392427201e-07, "loss": 0.83497846, "num_input_tokens_seen": 134086410, "step": 6237, "time_per_iteration": 2.7285478115081787 }, { "auxiliary_loss_clip": 0.01336911, "auxiliary_loss_mlp": 0.01193217, "balance_loss_clip": 1.00826466, "balance_loss_mlp": 1.0001936, "epoch": 0.7500751518066494, "flos": 34057334766240.0, "grad_norm": 1.466591837788999, "language_loss": 0.73568964, "learning_rate": 6.200640838214983e-07, "loss": 0.76099098, "num_input_tokens_seen": 134109185, "step": 6238, "time_per_iteration": 2.884139060974121 }, { "auxiliary_loss_clip": 0.01348922, "auxiliary_loss_mlp": 0.01193165, "balance_loss_clip": 1.00767303, "balance_loss_mlp": 1.00014114, "epoch": 0.7501953946972886, "flos": 18843596318880.0, "grad_norm": 1.8362301276136717, "language_loss": 0.66867304, "learning_rate": 6.195003377354578e-07, "loss": 0.69409394, "num_input_tokens_seen": 134128455, "step": 6239, "time_per_iteration": 2.6602554321289062 }, { "auxiliary_loss_clip": 0.01337771, "auxiliary_loss_mlp": 0.01193312, "balance_loss_clip": 1.00793362, "balance_loss_mlp": 1.00019312, "epoch": 0.7503156375879276, "flos": 20257683834240.0, "grad_norm": 3.0387167090070157, "language_loss": 0.73354483, "learning_rate": 6.189368010701183e-07, "loss": 0.75885564, "num_input_tokens_seen": 134145515, "step": 6240, "time_per_iteration": 2.8930814266204834 }, { "auxiliary_loss_clip": 0.01338494, "auxiliary_loss_mlp": 0.01193298, "balance_loss_clip": 1.00785017, "balance_loss_mlp": 1.00017858, "epoch": 0.7504358804785667, "flos": 13480049062080.0, "grad_norm": 1.8789562202978265, "language_loss": 0.76519668, "learning_rate": 6.183734739109683e-07, "loss": 0.79051459, "num_input_tokens_seen": 134163335, "step": 6241, "time_per_iteration": 2.6697864532470703 }, { "auxiliary_loss_clip": 0.01321599, "auxiliary_loss_mlp": 0.01193425, "balance_loss_clip": 1.00827909, "balance_loss_mlp": 1.00021029, "epoch": 0.7505561233692057, "flos": 29461055464800.0, "grad_norm": 2.888624624957415, "language_loss": 0.68775487, "learning_rate": 6.178103563434629e-07, "loss": 0.71290511, "num_input_tokens_seen": 134182335, "step": 6242, "time_per_iteration": 2.775184392929077 }, { "auxiliary_loss_clip": 0.0134976, "auxiliary_loss_mlp": 0.01193243, "balance_loss_clip": 1.00797212, "balance_loss_mlp": 1.00021958, "epoch": 0.7506763662598449, "flos": 20302467454080.0, "grad_norm": 1.5777892921989203, "language_loss": 0.83918941, "learning_rate": 6.172474484530283e-07, "loss": 0.86461949, "num_input_tokens_seen": 134201070, "step": 6243, "time_per_iteration": 2.7198216915130615 }, { "auxiliary_loss_clip": 0.01325177, "auxiliary_loss_mlp": 0.01193247, "balance_loss_clip": 1.00825071, "balance_loss_mlp": 1.0002234, "epoch": 0.750796609150484, "flos": 37230976716480.0, "grad_norm": 1.846360689899767, "language_loss": 0.76025575, "learning_rate": 6.166847503250563e-07, "loss": 0.78543997, "num_input_tokens_seen": 134223310, "step": 6244, "time_per_iteration": 2.919646739959717 }, { "auxiliary_loss_clip": 0.01314947, "auxiliary_loss_mlp": 0.01193217, "balance_loss_clip": 1.00734031, "balance_loss_mlp": 1.0001936, "epoch": 0.750916852041123, "flos": 19609427327040.0, "grad_norm": 2.1932135676440274, "language_loss": 0.7882846, "learning_rate": 6.161222620449078e-07, "loss": 0.81336623, "num_input_tokens_seen": 134242085, "step": 6245, "time_per_iteration": 2.8905742168426514 }, { "auxiliary_loss_clip": 0.01293482, "auxiliary_loss_mlp": 0.0119327, "balance_loss_clip": 1.00707722, "balance_loss_mlp": 1.00015056, "epoch": 0.7510370949317622, "flos": 25112700786240.0, "grad_norm": 1.9794340903562462, "language_loss": 0.80215186, "learning_rate": 6.155599836979117e-07, "loss": 0.82701933, "num_input_tokens_seen": 134260770, "step": 6246, "time_per_iteration": 2.896374225616455 }, { "auxiliary_loss_clip": 0.01276028, "auxiliary_loss_mlp": 0.01193218, "balance_loss_clip": 1.00643039, "balance_loss_mlp": 1.00019479, "epoch": 0.7511573378224012, "flos": 19062292275360.0, "grad_norm": 2.241616721080677, "language_loss": 0.81372166, "learning_rate": 6.149979153693649e-07, "loss": 0.83841413, "num_input_tokens_seen": 134278025, "step": 6247, "time_per_iteration": 2.870823621749878 }, { "auxiliary_loss_clip": 0.01337695, "auxiliary_loss_mlp": 0.01193217, "balance_loss_clip": 1.00808966, "balance_loss_mlp": 1.00019383, "epoch": 0.7512775807130403, "flos": 19937686803840.0, "grad_norm": 2.5555023538318506, "language_loss": 0.76628721, "learning_rate": 6.144360571445343e-07, "loss": 0.79159629, "num_input_tokens_seen": 134297170, "step": 6248, "time_per_iteration": 3.6591145992279053 }, { "auxiliary_loss_clip": 0.01323997, "auxiliary_loss_mlp": 0.01193252, "balance_loss_clip": 1.00761914, "balance_loss_mlp": 1.0002284, "epoch": 0.7513978236036795, "flos": 20739931214400.0, "grad_norm": 1.6277816686493403, "language_loss": 0.80075264, "learning_rate": 6.138744091086509e-07, "loss": 0.82592517, "num_input_tokens_seen": 134316755, "step": 6249, "time_per_iteration": 3.755063772201538 }, { "auxiliary_loss_clip": 0.01292112, "auxiliary_loss_mlp": 0.01193191, "balance_loss_clip": 1.00704205, "balance_loss_mlp": 1.00016713, "epoch": 0.7515180664943185, "flos": 27563175851040.0, "grad_norm": 2.3542477224090557, "language_loss": 0.72720295, "learning_rate": 6.133129713469183e-07, "loss": 0.752056, "num_input_tokens_seen": 134335960, "step": 6250, "time_per_iteration": 4.836194276809692 }, { "auxiliary_loss_clip": 0.01308201, "auxiliary_loss_mlp": 0.01193214, "balance_loss_clip": 1.00749683, "balance_loss_mlp": 1.00019038, "epoch": 0.7516383093849576, "flos": 33803195346720.0, "grad_norm": 1.6078658139962747, "language_loss": 0.63838363, "learning_rate": 6.127517439445053e-07, "loss": 0.66339779, "num_input_tokens_seen": 134356805, "step": 6251, "time_per_iteration": 2.937668800354004 }, { "auxiliary_loss_clip": 0.01266587, "auxiliary_loss_mlp": 0.01193219, "balance_loss_clip": 1.00745368, "balance_loss_mlp": 1.0001955, "epoch": 0.7517585522755967, "flos": 29746183811040.0, "grad_norm": 1.8219842650583542, "language_loss": 0.81294143, "learning_rate": 6.121907269865498e-07, "loss": 0.83753955, "num_input_tokens_seen": 134376295, "step": 6252, "time_per_iteration": 2.851106882095337 }, { "auxiliary_loss_clip": 0.01260267, "auxiliary_loss_mlp": 0.01192302, "balance_loss_clip": 1.00437284, "balance_loss_mlp": 1.00004089, "epoch": 0.7518787951662358, "flos": 69808007119680.0, "grad_norm": 0.9220807452378882, "language_loss": 0.67241395, "learning_rate": 6.116299205581577e-07, "loss": 0.69693959, "num_input_tokens_seen": 134431125, "step": 6253, "time_per_iteration": 3.28652286529541 }, { "auxiliary_loss_clip": 0.01351278, "auxiliary_loss_mlp": 0.01193448, "balance_loss_clip": 1.00860012, "balance_loss_mlp": 1.00023389, "epoch": 0.7519990380568748, "flos": 34203239841600.0, "grad_norm": 1.929708264572127, "language_loss": 0.68374646, "learning_rate": 6.110693247444018e-07, "loss": 0.70919371, "num_input_tokens_seen": 134452960, "step": 6254, "time_per_iteration": 2.828108072280884 }, { "auxiliary_loss_clip": 0.01293318, "auxiliary_loss_mlp": 0.01193238, "balance_loss_clip": 1.00716734, "balance_loss_mlp": 1.0002141, "epoch": 0.752119280947514, "flos": 21725715507840.0, "grad_norm": 1.70567397786612, "language_loss": 0.8271209, "learning_rate": 6.105089396303258e-07, "loss": 0.85198647, "num_input_tokens_seen": 134471350, "step": 6255, "time_per_iteration": 2.7302143573760986 }, { "auxiliary_loss_clip": 0.01313967, "auxiliary_loss_mlp": 0.01193338, "balance_loss_clip": 1.00772929, "balance_loss_mlp": 1.00021887, "epoch": 0.7522395238381531, "flos": 32742788064480.0, "grad_norm": 1.9209995213282696, "language_loss": 0.75988233, "learning_rate": 6.099487653009383e-07, "loss": 0.78495538, "num_input_tokens_seen": 134490695, "step": 6256, "time_per_iteration": 2.896448850631714 }, { "auxiliary_loss_clip": 0.0133638, "auxiliary_loss_mlp": 0.01193167, "balance_loss_clip": 1.00789726, "balance_loss_mlp": 1.00014329, "epoch": 0.7523597667287921, "flos": 23476037556960.0, "grad_norm": 1.8852683581504877, "language_loss": 0.83141929, "learning_rate": 6.093888018412192e-07, "loss": 0.85671473, "num_input_tokens_seen": 134506885, "step": 6257, "time_per_iteration": 2.7875490188598633 }, { "auxiliary_loss_clip": 0.01307536, "auxiliary_loss_mlp": 0.0119231, "balance_loss_clip": 1.00502002, "balance_loss_mlp": 1.00004888, "epoch": 0.7524800096194313, "flos": 67346768160000.0, "grad_norm": 0.7153444998919877, "language_loss": 0.54746759, "learning_rate": 6.088290493361125e-07, "loss": 0.57246602, "num_input_tokens_seen": 134571770, "step": 6258, "time_per_iteration": 3.440315008163452 }, { "auxiliary_loss_clip": 0.01278834, "auxiliary_loss_mlp": 0.0119326, "balance_loss_clip": 1.00780249, "balance_loss_mlp": 1.00023627, "epoch": 0.7526002525100703, "flos": 13006064128320.0, "grad_norm": 2.2836692689968308, "language_loss": 0.71085078, "learning_rate": 6.082695078705322e-07, "loss": 0.73557174, "num_input_tokens_seen": 134589250, "step": 6259, "time_per_iteration": 2.802056074142456 }, { "auxiliary_loss_clip": 0.0133754, "auxiliary_loss_mlp": 0.01193186, "balance_loss_clip": 1.00822365, "balance_loss_mlp": 1.00025809, "epoch": 0.7527204954007094, "flos": 21397240488960.0, "grad_norm": 2.2408976422248754, "language_loss": 0.68603772, "learning_rate": 6.077101775293618e-07, "loss": 0.71134496, "num_input_tokens_seen": 134608075, "step": 6260, "time_per_iteration": 2.674525737762451 }, { "auxiliary_loss_clip": 0.0132879, "auxiliary_loss_mlp": 0.01193241, "balance_loss_clip": 1.00759244, "balance_loss_mlp": 1.00021768, "epoch": 0.7528407382913486, "flos": 18947196508320.0, "grad_norm": 2.1616796497391926, "language_loss": 0.82651305, "learning_rate": 6.071510583974504e-07, "loss": 0.85173339, "num_input_tokens_seen": 134623260, "step": 6261, "time_per_iteration": 2.7585389614105225 }, { "auxiliary_loss_clip": 0.0135008, "auxiliary_loss_mlp": 0.01193226, "balance_loss_clip": 1.00820565, "balance_loss_mlp": 1.00020218, "epoch": 0.7529609811819876, "flos": 15231808058400.0, "grad_norm": 2.0993823411418773, "language_loss": 0.71623057, "learning_rate": 6.065921505596161e-07, "loss": 0.74166363, "num_input_tokens_seen": 134641540, "step": 6262, "time_per_iteration": 2.692758560180664 }, { "auxiliary_loss_clip": 0.01287897, "auxiliary_loss_mlp": 0.0119322, "balance_loss_clip": 1.0068357, "balance_loss_mlp": 1.00019658, "epoch": 0.7530812240726267, "flos": 19354497586560.0, "grad_norm": 1.4902260882270422, "language_loss": 0.76839715, "learning_rate": 6.060334541006445e-07, "loss": 0.79320836, "num_input_tokens_seen": 134660035, "step": 6263, "time_per_iteration": 2.922652006149292 }, { "auxiliary_loss_clip": 0.01310882, "auxiliary_loss_mlp": 0.01193226, "balance_loss_clip": 1.00742221, "balance_loss_mlp": 1.0002023, "epoch": 0.7532014669632658, "flos": 27748260452160.0, "grad_norm": 1.5275315191887335, "language_loss": 0.68876475, "learning_rate": 6.05474969105289e-07, "loss": 0.71380579, "num_input_tokens_seen": 134683025, "step": 6264, "time_per_iteration": 2.825145959854126 }, { "auxiliary_loss_clip": 0.01327195, "auxiliary_loss_mlp": 0.01193193, "balance_loss_clip": 1.0076642, "balance_loss_mlp": 1.0001688, "epoch": 0.7533217098539049, "flos": 14137430184000.0, "grad_norm": 1.8401623672940637, "language_loss": 0.73708797, "learning_rate": 6.049166956582725e-07, "loss": 0.76229185, "num_input_tokens_seen": 134701290, "step": 6265, "time_per_iteration": 2.7819883823394775 }, { "auxiliary_loss_clip": 0.01336317, "auxiliary_loss_mlp": 0.01193209, "balance_loss_clip": 1.00834656, "balance_loss_mlp": 1.00018573, "epoch": 0.753441952744544, "flos": 26429079595680.0, "grad_norm": 1.954611485111115, "language_loss": 0.87277317, "learning_rate": 6.043586338442841e-07, "loss": 0.89806849, "num_input_tokens_seen": 134720345, "step": 6266, "time_per_iteration": 2.7515084743499756 }, { "auxiliary_loss_clip": 0.01347613, "auxiliary_loss_mlp": 0.01193159, "balance_loss_clip": 1.00771666, "balance_loss_mlp": 1.00013566, "epoch": 0.7535621956351831, "flos": 23878632633120.0, "grad_norm": 1.4809620929661647, "language_loss": 0.73024565, "learning_rate": 6.038007837479815e-07, "loss": 0.75565338, "num_input_tokens_seen": 134741450, "step": 6267, "time_per_iteration": 2.8146862983703613 }, { "auxiliary_loss_clip": 0.01325925, "auxiliary_loss_mlp": 0.01193167, "balance_loss_clip": 1.00741947, "balance_loss_mlp": 1.00014365, "epoch": 0.7536824385258222, "flos": 21795883960320.0, "grad_norm": 1.8771731257347797, "language_loss": 0.64161617, "learning_rate": 6.032431454539897e-07, "loss": 0.66680706, "num_input_tokens_seen": 134760295, "step": 6268, "time_per_iteration": 2.878171682357788 }, { "auxiliary_loss_clip": 0.01294954, "auxiliary_loss_mlp": 0.01193196, "balance_loss_clip": 1.00768816, "balance_loss_mlp": 1.00017262, "epoch": 0.7538026814164612, "flos": 28911657221280.0, "grad_norm": 1.5704582998748466, "language_loss": 0.81308782, "learning_rate": 6.026857190469014e-07, "loss": 0.8379693, "num_input_tokens_seen": 134782050, "step": 6269, "time_per_iteration": 2.94632625579834 }, { "auxiliary_loss_clip": 0.01313191, "auxiliary_loss_mlp": 0.01193207, "balance_loss_clip": 1.00764287, "balance_loss_mlp": 1.00018287, "epoch": 0.7539229243071004, "flos": 21104711864640.0, "grad_norm": 2.448072766405346, "language_loss": 0.74121344, "learning_rate": 6.0212850461128e-07, "loss": 0.76627743, "num_input_tokens_seen": 134801170, "step": 6270, "time_per_iteration": 2.7522201538085938 }, { "auxiliary_loss_clip": 0.01314627, "auxiliary_loss_mlp": 0.01193302, "balance_loss_clip": 1.00741684, "balance_loss_mlp": 1.00018346, "epoch": 0.7540431671977395, "flos": 15158478322080.0, "grad_norm": 1.8580935441654627, "language_loss": 0.74515605, "learning_rate": 6.015715022316516e-07, "loss": 0.7702353, "num_input_tokens_seen": 134819150, "step": 6271, "time_per_iteration": 2.718604803085327 }, { "auxiliary_loss_clip": 0.01287255, "auxiliary_loss_mlp": 0.01193451, "balance_loss_clip": 1.00757408, "balance_loss_mlp": 1.00023663, "epoch": 0.7541634100883785, "flos": 18770589895680.0, "grad_norm": 2.905620566589562, "language_loss": 0.77853537, "learning_rate": 6.010147119925154e-07, "loss": 0.80334246, "num_input_tokens_seen": 134836905, "step": 6272, "time_per_iteration": 2.879364252090454 }, { "auxiliary_loss_clip": 0.01292567, "auxiliary_loss_mlp": 0.01193233, "balance_loss_clip": 1.00710487, "balance_loss_mlp": 1.00020909, "epoch": 0.7542836529790176, "flos": 20594780536320.0, "grad_norm": 1.9993569514782101, "language_loss": 0.66837513, "learning_rate": 6.004581339783348e-07, "loss": 0.69323313, "num_input_tokens_seen": 134855225, "step": 6273, "time_per_iteration": 2.7202322483062744 }, { "auxiliary_loss_clip": 0.01331747, "auxiliary_loss_mlp": 0.01193303, "balance_loss_clip": 1.00789011, "balance_loss_mlp": 1.00018358, "epoch": 0.7544038958696567, "flos": 19095113309760.0, "grad_norm": 11.925844613115942, "language_loss": 0.68692613, "learning_rate": 5.999017682735425e-07, "loss": 0.71217668, "num_input_tokens_seen": 134871615, "step": 6274, "time_per_iteration": 3.656251907348633 }, { "auxiliary_loss_clip": 0.01260144, "auxiliary_loss_mlp": 0.0119324, "balance_loss_clip": 1.00644803, "balance_loss_mlp": 1.00021625, "epoch": 0.7545241387602958, "flos": 31723320568320.0, "grad_norm": 1.9558994062594695, "language_loss": 0.6624552, "learning_rate": 5.993456149625387e-07, "loss": 0.68698907, "num_input_tokens_seen": 134892765, "step": 6275, "time_per_iteration": 3.8942859172821045 }, { "auxiliary_loss_clip": 0.01292235, "auxiliary_loss_mlp": 0.01193183, "balance_loss_clip": 1.00663805, "balance_loss_mlp": 1.00015903, "epoch": 0.7546443816509348, "flos": 20296504123200.0, "grad_norm": 1.6778251155946524, "language_loss": 0.82371289, "learning_rate": 5.987896741296909e-07, "loss": 0.84856701, "num_input_tokens_seen": 134910505, "step": 6276, "time_per_iteration": 3.8498876094818115 }, { "auxiliary_loss_clip": 0.01307102, "auxiliary_loss_mlp": 0.01193196, "balance_loss_clip": 1.00786829, "balance_loss_mlp": 1.00017273, "epoch": 0.754764624541574, "flos": 23696170460640.0, "grad_norm": 1.9575819954224454, "language_loss": 0.78377324, "learning_rate": 5.982339458593361e-07, "loss": 0.8087762, "num_input_tokens_seen": 134930445, "step": 6277, "time_per_iteration": 3.7890162467956543 }, { "auxiliary_loss_clip": 0.01323456, "auxiliary_loss_mlp": 0.00872434, "balance_loss_clip": 1.00761843, "balance_loss_mlp": 1.00029373, "epoch": 0.7548848674322131, "flos": 25337216378880.0, "grad_norm": 1.6046802982423725, "language_loss": 0.84015918, "learning_rate": 5.976784302357767e-07, "loss": 0.86211812, "num_input_tokens_seen": 134951010, "step": 6278, "time_per_iteration": 2.861966371536255 }, { "auxiliary_loss_clip": 0.01336784, "auxiliary_loss_mlp": 0.01193218, "balance_loss_clip": 1.00832152, "balance_loss_mlp": 1.00019455, "epoch": 0.7550051103228521, "flos": 19573157619360.0, "grad_norm": 1.7488622126184004, "language_loss": 0.73153758, "learning_rate": 5.971231273432855e-07, "loss": 0.75683755, "num_input_tokens_seen": 134970495, "step": 6279, "time_per_iteration": 2.761941432952881 }, { "auxiliary_loss_clip": 0.01306366, "auxiliary_loss_mlp": 0.01192317, "balance_loss_clip": 1.00477529, "balance_loss_mlp": 1.00005674, "epoch": 0.7551253532134913, "flos": 64150099130880.0, "grad_norm": 0.806746509313867, "language_loss": 0.54562646, "learning_rate": 5.965680372661e-07, "loss": 0.57061327, "num_input_tokens_seen": 135028060, "step": 6280, "time_per_iteration": 3.2392501831054688 }, { "auxiliary_loss_clip": 0.01314337, "auxiliary_loss_mlp": 0.01193208, "balance_loss_clip": 1.00802064, "balance_loss_mlp": 1.00018418, "epoch": 0.7552455961041303, "flos": 26067999084480.0, "grad_norm": 1.6309441712420512, "language_loss": 0.55984694, "learning_rate": 5.960131600884266e-07, "loss": 0.58492237, "num_input_tokens_seen": 135047330, "step": 6281, "time_per_iteration": 3.028338670730591 }, { "auxiliary_loss_clip": 0.01297628, "auxiliary_loss_mlp": 0.011932, "balance_loss_clip": 1.00680184, "balance_loss_mlp": 1.00017607, "epoch": 0.7553658389947694, "flos": 24498235252800.0, "grad_norm": 1.6173190111460576, "language_loss": 0.76156247, "learning_rate": 5.954584958944413e-07, "loss": 0.78647077, "num_input_tokens_seen": 135065995, "step": 6282, "time_per_iteration": 2.791668176651001 }, { "auxiliary_loss_clip": 0.01303752, "auxiliary_loss_mlp": 0.0087245, "balance_loss_clip": 1.00752735, "balance_loss_mlp": 1.00031245, "epoch": 0.7554860818854086, "flos": 21799476328320.0, "grad_norm": 1.6661207542577297, "language_loss": 0.81324637, "learning_rate": 5.949040447682854e-07, "loss": 0.83500838, "num_input_tokens_seen": 135085820, "step": 6283, "time_per_iteration": 2.957944869995117 }, { "auxiliary_loss_clip": 0.01315751, "auxiliary_loss_mlp": 0.01193318, "balance_loss_clip": 1.00744843, "balance_loss_mlp": 1.0001986, "epoch": 0.7556063247760476, "flos": 16362132327360.0, "grad_norm": 2.0134021020134165, "language_loss": 0.68422139, "learning_rate": 5.943498067940686e-07, "loss": 0.70931214, "num_input_tokens_seen": 135102845, "step": 6284, "time_per_iteration": 2.703514575958252 }, { "auxiliary_loss_clip": 0.01299843, "auxiliary_loss_mlp": 0.01193212, "balance_loss_clip": 1.00811172, "balance_loss_mlp": 1.00018835, "epoch": 0.7557265676666867, "flos": 27235167840000.0, "grad_norm": 1.6866011965961198, "language_loss": 0.81639034, "learning_rate": 5.937957820558686e-07, "loss": 0.84132087, "num_input_tokens_seen": 135122190, "step": 6285, "time_per_iteration": 2.778015613555908 }, { "auxiliary_loss_clip": 0.01278675, "auxiliary_loss_mlp": 0.01192298, "balance_loss_clip": 1.00489855, "balance_loss_mlp": 1.00003743, "epoch": 0.7558468105573258, "flos": 62189163953280.0, "grad_norm": 0.8526541035328191, "language_loss": 0.65437537, "learning_rate": 5.932419706377296e-07, "loss": 0.67908508, "num_input_tokens_seen": 135180495, "step": 6286, "time_per_iteration": 3.2817554473876953 }, { "auxiliary_loss_clip": 0.01277743, "auxiliary_loss_mlp": 0.01193218, "balance_loss_clip": 1.00695276, "balance_loss_mlp": 1.00019395, "epoch": 0.7559670534479649, "flos": 33249091101120.0, "grad_norm": 1.782657243122281, "language_loss": 0.74216723, "learning_rate": 5.92688372623666e-07, "loss": 0.76687688, "num_input_tokens_seen": 135199200, "step": 6287, "time_per_iteration": 2.915567636489868 }, { "auxiliary_loss_clip": 0.01327885, "auxiliary_loss_mlp": 0.01193209, "balance_loss_clip": 1.00721192, "balance_loss_mlp": 1.00018549, "epoch": 0.7560872963386039, "flos": 14064387837120.0, "grad_norm": 3.3882110726894004, "language_loss": 0.74014062, "learning_rate": 5.921349880976574e-07, "loss": 0.76535165, "num_input_tokens_seen": 135217035, "step": 6288, "time_per_iteration": 2.7688467502593994 }, { "auxiliary_loss_clip": 0.01323919, "auxiliary_loss_mlp": 0.00872497, "balance_loss_clip": 1.00780785, "balance_loss_mlp": 1.00029159, "epoch": 0.7562075392292431, "flos": 20412318363840.0, "grad_norm": 1.7786372951107958, "language_loss": 0.81969535, "learning_rate": 5.915818171436515e-07, "loss": 0.84165955, "num_input_tokens_seen": 135236370, "step": 6289, "time_per_iteration": 2.7564094066619873 }, { "auxiliary_loss_clip": 0.01325682, "auxiliary_loss_mlp": 0.01193228, "balance_loss_clip": 1.00816441, "balance_loss_mlp": 1.00020409, "epoch": 0.7563277821198822, "flos": 20376803053440.0, "grad_norm": 1.4912934285460446, "language_loss": 0.74521971, "learning_rate": 5.910288598455642e-07, "loss": 0.77040881, "num_input_tokens_seen": 135255720, "step": 6290, "time_per_iteration": 2.8350932598114014 }, { "auxiliary_loss_clip": 0.01338662, "auxiliary_loss_mlp": 0.01193272, "balance_loss_clip": 1.00856769, "balance_loss_mlp": 1.00024796, "epoch": 0.7564480250105212, "flos": 18588271417920.0, "grad_norm": 2.76537642079307, "language_loss": 0.7487244, "learning_rate": 5.90476116287278e-07, "loss": 0.77404374, "num_input_tokens_seen": 135273320, "step": 6291, "time_per_iteration": 2.6987621784210205 }, { "auxiliary_loss_clip": 0.01304348, "auxiliary_loss_mlp": 0.01193219, "balance_loss_clip": 1.00715065, "balance_loss_mlp": 1.00019503, "epoch": 0.7565682679011604, "flos": 21215532713760.0, "grad_norm": 1.7082673805289244, "language_loss": 0.67707014, "learning_rate": 5.899235865526456e-07, "loss": 0.7020458, "num_input_tokens_seen": 135292615, "step": 6292, "time_per_iteration": 2.7072134017944336 }, { "auxiliary_loss_clip": 0.01300965, "auxiliary_loss_mlp": 0.01193207, "balance_loss_clip": 1.00792098, "balance_loss_mlp": 1.00018346, "epoch": 0.7566885107917994, "flos": 20449019155680.0, "grad_norm": 1.9149263877345024, "language_loss": 0.81981897, "learning_rate": 5.893712707254825e-07, "loss": 0.84476072, "num_input_tokens_seen": 135310075, "step": 6293, "time_per_iteration": 2.7692854404449463 }, { "auxiliary_loss_clip": 0.01279073, "auxiliary_loss_mlp": 0.01193186, "balance_loss_clip": 1.00713158, "balance_loss_mlp": 1.00016224, "epoch": 0.7568087536824385, "flos": 19025842949280.0, "grad_norm": 2.258465393761976, "language_loss": 0.6601724, "learning_rate": 5.888191688895769e-07, "loss": 0.68489498, "num_input_tokens_seen": 135327335, "step": 6294, "time_per_iteration": 2.7673425674438477 }, { "auxiliary_loss_clip": 0.01349519, "auxiliary_loss_mlp": 0.01193557, "balance_loss_clip": 1.008008, "balance_loss_mlp": 1.000247, "epoch": 0.7569289965730777, "flos": 15225449567040.0, "grad_norm": 2.1304331669836003, "language_loss": 0.61760199, "learning_rate": 5.882672811286813e-07, "loss": 0.64303273, "num_input_tokens_seen": 135343615, "step": 6295, "time_per_iteration": 2.710355043411255 }, { "auxiliary_loss_clip": 0.0134971, "auxiliary_loss_mlp": 0.01193286, "balance_loss_clip": 1.00819016, "balance_loss_mlp": 1.00016725, "epoch": 0.7570492394637167, "flos": 20769375422880.0, "grad_norm": 2.0288662714284906, "language_loss": 0.69275081, "learning_rate": 5.877156075265166e-07, "loss": 0.71818078, "num_input_tokens_seen": 135359880, "step": 6296, "time_per_iteration": 2.6703836917877197 }, { "auxiliary_loss_clip": 0.01324916, "auxiliary_loss_mlp": 0.01193172, "balance_loss_clip": 1.0081594, "balance_loss_mlp": 1.00014877, "epoch": 0.7571694823543558, "flos": 15664098808800.0, "grad_norm": 2.2755495448778285, "language_loss": 0.70006353, "learning_rate": 5.871641481667715e-07, "loss": 0.72524434, "num_input_tokens_seen": 135374325, "step": 6297, "time_per_iteration": 2.8082239627838135 }, { "auxiliary_loss_clip": 0.01272618, "auxiliary_loss_mlp": 0.01193233, "balance_loss_clip": 1.00643289, "balance_loss_mlp": 1.00020909, "epoch": 0.7572897252449949, "flos": 25409252862720.0, "grad_norm": 1.625838438482695, "language_loss": 0.84598744, "learning_rate": 5.866129031331011e-07, "loss": 0.87064594, "num_input_tokens_seen": 135393980, "step": 6298, "time_per_iteration": 2.9143455028533936 }, { "auxiliary_loss_clip": 0.01314124, "auxiliary_loss_mlp": 0.01193236, "balance_loss_clip": 1.00713611, "balance_loss_mlp": 1.00021243, "epoch": 0.757409968135634, "flos": 24279359677920.0, "grad_norm": 2.050666018612342, "language_loss": 0.8313387, "learning_rate": 5.8606187250913e-07, "loss": 0.85641229, "num_input_tokens_seen": 135412030, "step": 6299, "time_per_iteration": 2.829397678375244 }, { "auxiliary_loss_clip": 0.01324288, "auxiliary_loss_mlp": 0.0087242, "balance_loss_clip": 1.00801826, "balance_loss_mlp": 1.00034809, "epoch": 0.757530211026273, "flos": 24133777915680.0, "grad_norm": 2.094135615845118, "language_loss": 0.84237003, "learning_rate": 5.855110563784482e-07, "loss": 0.86433709, "num_input_tokens_seen": 135430565, "step": 6300, "time_per_iteration": 3.588728904724121 }, { "auxiliary_loss_clip": 0.01336724, "auxiliary_loss_mlp": 0.0087244, "balance_loss_clip": 1.00793147, "balance_loss_mlp": 1.00028062, "epoch": 0.7576504539169122, "flos": 23951818674720.0, "grad_norm": 1.5036720651370006, "language_loss": 0.64069343, "learning_rate": 5.849604548246156e-07, "loss": 0.66278505, "num_input_tokens_seen": 135451675, "step": 6301, "time_per_iteration": 2.754474639892578 }, { "auxiliary_loss_clip": 0.01315244, "auxiliary_loss_mlp": 0.00872516, "balance_loss_clip": 1.00751948, "balance_loss_mlp": 1.00040936, "epoch": 0.7577706968075513, "flos": 21251371337280.0, "grad_norm": 1.9855057513535783, "language_loss": 0.80415761, "learning_rate": 5.844100679311565e-07, "loss": 0.8260352, "num_input_tokens_seen": 135470635, "step": 6302, "time_per_iteration": 4.685885190963745 }, { "auxiliary_loss_clip": 0.01304616, "auxiliary_loss_mlp": 0.01193204, "balance_loss_clip": 1.00697565, "balance_loss_mlp": 1.00018036, "epoch": 0.7578909396981903, "flos": 18296604961920.0, "grad_norm": 2.0477728101955925, "language_loss": 0.76158452, "learning_rate": 5.838598957815637e-07, "loss": 0.78656268, "num_input_tokens_seen": 135487865, "step": 6303, "time_per_iteration": 2.8282623291015625 }, { "auxiliary_loss_clip": 0.01300986, "auxiliary_loss_mlp": 0.01193199, "balance_loss_clip": 1.00701416, "balance_loss_mlp": 1.000175, "epoch": 0.7580111825888295, "flos": 25373881247040.0, "grad_norm": 1.555573185247911, "language_loss": 0.85600299, "learning_rate": 5.833099384592996e-07, "loss": 0.88094485, "num_input_tokens_seen": 135508440, "step": 6304, "time_per_iteration": 2.777482032775879 }, { "auxiliary_loss_clip": 0.01300623, "auxiliary_loss_mlp": 0.01193349, "balance_loss_clip": 1.00674689, "balance_loss_mlp": 1.00013471, "epoch": 0.7581314254794685, "flos": 23768673952320.0, "grad_norm": 2.1798286428167866, "language_loss": 0.71520376, "learning_rate": 5.827601960477913e-07, "loss": 0.74014342, "num_input_tokens_seen": 135526365, "step": 6305, "time_per_iteration": 2.8092570304870605 }, { "auxiliary_loss_clip": 0.01336655, "auxiliary_loss_mlp": 0.01193281, "balance_loss_clip": 1.00807047, "balance_loss_mlp": 1.00025785, "epoch": 0.7582516683701076, "flos": 22054621610880.0, "grad_norm": 1.8440780962759755, "language_loss": 0.70311165, "learning_rate": 5.822106686304344e-07, "loss": 0.72841102, "num_input_tokens_seen": 135545655, "step": 6306, "time_per_iteration": 2.75313401222229 }, { "auxiliary_loss_clip": 0.01297619, "auxiliary_loss_mlp": 0.011932, "balance_loss_clip": 1.00677323, "balance_loss_mlp": 1.00017643, "epoch": 0.7583719112607467, "flos": 31649739366240.0, "grad_norm": 1.7766488723633087, "language_loss": 0.57734233, "learning_rate": 5.816613562905919e-07, "loss": 0.60225052, "num_input_tokens_seen": 135566840, "step": 6307, "time_per_iteration": 2.9350321292877197 }, { "auxiliary_loss_clip": 0.01279026, "auxiliary_loss_mlp": 0.01193247, "balance_loss_clip": 1.00671792, "balance_loss_mlp": 1.00022316, "epoch": 0.7584921541513858, "flos": 33068389188960.0, "grad_norm": 1.4664439488628573, "language_loss": 0.69863963, "learning_rate": 5.811122591115933e-07, "loss": 0.72336233, "num_input_tokens_seen": 135587825, "step": 6308, "time_per_iteration": 3.022141218185425 }, { "auxiliary_loss_clip": 0.01279819, "auxiliary_loss_mlp": 0.01193209, "balance_loss_clip": 1.00702536, "balance_loss_mlp": 1.00018501, "epoch": 0.7586123970420249, "flos": 23326360495200.0, "grad_norm": 2.33451443141141, "language_loss": 0.71414196, "learning_rate": 5.805633771767376e-07, "loss": 0.73887229, "num_input_tokens_seen": 135605220, "step": 6309, "time_per_iteration": 2.9442203044891357 }, { "auxiliary_loss_clip": 0.01309559, "auxiliary_loss_mlp": 0.01193247, "balance_loss_clip": 1.00847256, "balance_loss_mlp": 1.00022316, "epoch": 0.7587326399326639, "flos": 18334239769440.0, "grad_norm": 1.5331814155964703, "language_loss": 0.77679849, "learning_rate": 5.800147105692888e-07, "loss": 0.8018266, "num_input_tokens_seen": 135624795, "step": 6310, "time_per_iteration": 2.816028118133545 }, { "auxiliary_loss_clip": 0.01337167, "auxiliary_loss_mlp": 0.01193204, "balance_loss_clip": 1.00777018, "balance_loss_mlp": 1.00018084, "epoch": 0.7588528828233031, "flos": 17275089816000.0, "grad_norm": 1.6982433276573474, "language_loss": 0.79458511, "learning_rate": 5.794662593724795e-07, "loss": 0.81988883, "num_input_tokens_seen": 135643800, "step": 6311, "time_per_iteration": 2.7965352535247803 }, { "auxiliary_loss_clip": 0.01348883, "auxiliary_loss_mlp": 0.01193334, "balance_loss_clip": 1.00869131, "balance_loss_mlp": 1.00021541, "epoch": 0.7589731257139422, "flos": 17713631286720.0, "grad_norm": 1.9492159753299827, "language_loss": 0.74866325, "learning_rate": 5.789180236695091e-07, "loss": 0.7740854, "num_input_tokens_seen": 135660655, "step": 6312, "time_per_iteration": 2.751784324645996 }, { "auxiliary_loss_clip": 0.01324943, "auxiliary_loss_mlp": 0.01193159, "balance_loss_clip": 1.00744152, "balance_loss_mlp": 1.00013566, "epoch": 0.7590933686045812, "flos": 15961081969440.0, "grad_norm": 1.7824937891457953, "language_loss": 0.84723914, "learning_rate": 5.78370003543544e-07, "loss": 0.87242019, "num_input_tokens_seen": 135679410, "step": 6313, "time_per_iteration": 2.7986197471618652 }, { "auxiliary_loss_clip": 0.01328958, "auxiliary_loss_mlp": 0.00872454, "balance_loss_clip": 1.00784838, "balance_loss_mlp": 1.00035977, "epoch": 0.7592136114952204, "flos": 21068082920160.0, "grad_norm": 1.750763680670936, "language_loss": 0.83642983, "learning_rate": 5.778221990777203e-07, "loss": 0.85844398, "num_input_tokens_seen": 135697150, "step": 6314, "time_per_iteration": 2.8151426315307617 }, { "auxiliary_loss_clip": 0.01306599, "auxiliary_loss_mlp": 0.01193198, "balance_loss_clip": 1.00721383, "balance_loss_mlp": 1.00017416, "epoch": 0.7593338543858594, "flos": 25297677616320.0, "grad_norm": 1.915839473926144, "language_loss": 0.82648754, "learning_rate": 5.772746103551372e-07, "loss": 0.85148561, "num_input_tokens_seen": 135712545, "step": 6315, "time_per_iteration": 2.8519303798675537 }, { "auxiliary_loss_clip": 0.01305049, "auxiliary_loss_mlp": 0.01193206, "balance_loss_clip": 1.00749624, "balance_loss_mlp": 1.00018203, "epoch": 0.7594540972764985, "flos": 31832381157120.0, "grad_norm": 1.768703547275769, "language_loss": 0.72039139, "learning_rate": 5.767272374588648e-07, "loss": 0.74537396, "num_input_tokens_seen": 135733950, "step": 6316, "time_per_iteration": 2.8535854816436768 }, { "auxiliary_loss_clip": 0.01325918, "auxiliary_loss_mlp": 0.01193325, "balance_loss_clip": 1.00773907, "balance_loss_mlp": 1.00020599, "epoch": 0.7595743401671377, "flos": 37597266161280.0, "grad_norm": 2.4294160293960787, "language_loss": 0.77881932, "learning_rate": 5.76180080471939e-07, "loss": 0.8040117, "num_input_tokens_seen": 135757120, "step": 6317, "time_per_iteration": 2.9413485527038574 }, { "auxiliary_loss_clip": 0.01351015, "auxiliary_loss_mlp": 0.01193268, "balance_loss_clip": 1.00853288, "balance_loss_mlp": 1.00024462, "epoch": 0.7596945830577767, "flos": 18287731812960.0, "grad_norm": 1.9060058680357623, "language_loss": 0.72324067, "learning_rate": 5.756331394773631e-07, "loss": 0.74868345, "num_input_tokens_seen": 135773335, "step": 6318, "time_per_iteration": 2.68487811088562 }, { "auxiliary_loss_clip": 0.0125227, "auxiliary_loss_mlp": 0.008725, "balance_loss_clip": 1.00672793, "balance_loss_mlp": 1.0002749, "epoch": 0.7598148259484158, "flos": 22233132178560.0, "grad_norm": 3.084734339720527, "language_loss": 0.75952441, "learning_rate": 5.750864145581071e-07, "loss": 0.78077209, "num_input_tokens_seen": 135792555, "step": 6319, "time_per_iteration": 3.0148210525512695 }, { "auxiliary_loss_clip": 0.01349089, "auxiliary_loss_mlp": 0.01193216, "balance_loss_clip": 1.00803244, "balance_loss_mlp": 1.00019205, "epoch": 0.7599350688390549, "flos": 27161730332640.0, "grad_norm": 1.8889470070781467, "language_loss": 0.86331558, "learning_rate": 5.745399057971085e-07, "loss": 0.88873857, "num_input_tokens_seen": 135813690, "step": 6320, "time_per_iteration": 2.739877223968506 }, { "auxiliary_loss_clip": 0.01337834, "auxiliary_loss_mlp": 0.01193249, "balance_loss_clip": 1.00807762, "balance_loss_mlp": 1.00022507, "epoch": 0.760055311729694, "flos": 15560714161440.0, "grad_norm": 1.9154297706425973, "language_loss": 0.75104147, "learning_rate": 5.739936132772738e-07, "loss": 0.77635229, "num_input_tokens_seen": 135832255, "step": 6321, "time_per_iteration": 2.7771008014678955 }, { "auxiliary_loss_clip": 0.01348206, "auxiliary_loss_mlp": 0.01193294, "balance_loss_clip": 1.00758696, "balance_loss_mlp": 1.00017548, "epoch": 0.760175554620333, "flos": 25155508603680.0, "grad_norm": 3.8320689788498496, "language_loss": 0.74593687, "learning_rate": 5.734475370814733e-07, "loss": 0.77135193, "num_input_tokens_seen": 135851935, "step": 6322, "time_per_iteration": 2.741105318069458 }, { "auxiliary_loss_clip": 0.01336405, "auxiliary_loss_mlp": 0.0119321, "balance_loss_clip": 1.00766098, "balance_loss_mlp": 1.0001868, "epoch": 0.7602957975109722, "flos": 24353803048320.0, "grad_norm": 1.5840662439344464, "language_loss": 0.78258014, "learning_rate": 5.729016772925483e-07, "loss": 0.80787623, "num_input_tokens_seen": 135873510, "step": 6323, "time_per_iteration": 2.8820912837982178 }, { "auxiliary_loss_clip": 0.01265855, "auxiliary_loss_mlp": 0.01193256, "balance_loss_clip": 1.00729632, "balance_loss_mlp": 1.00023198, "epoch": 0.7604160404016113, "flos": 25192676403360.0, "grad_norm": 1.8194894001505233, "language_loss": 0.70393175, "learning_rate": 5.723560339933038e-07, "loss": 0.7285229, "num_input_tokens_seen": 135893845, "step": 6324, "time_per_iteration": 2.9946017265319824 }, { "auxiliary_loss_clip": 0.01338298, "auxiliary_loss_mlp": 0.00872375, "balance_loss_clip": 1.00830364, "balance_loss_mlp": 1.00033665, "epoch": 0.7605362832922503, "flos": 29861854356960.0, "grad_norm": 1.9703847797889202, "language_loss": 0.65312254, "learning_rate": 5.71810607266513e-07, "loss": 0.67522931, "num_input_tokens_seen": 135912430, "step": 6325, "time_per_iteration": 2.82558274269104 }, { "auxiliary_loss_clip": 0.01334902, "auxiliary_loss_mlp": 0.0119321, "balance_loss_clip": 1.00762057, "balance_loss_mlp": 1.00018692, "epoch": 0.7606565261828895, "flos": 13917943906560.0, "grad_norm": 1.6453040979627322, "language_loss": 0.60490543, "learning_rate": 5.712653971949184e-07, "loss": 0.63018656, "num_input_tokens_seen": 135930550, "step": 6326, "time_per_iteration": 3.667973041534424 }, { "auxiliary_loss_clip": 0.01337423, "auxiliary_loss_mlp": 0.01193196, "balance_loss_clip": 1.0082655, "balance_loss_mlp": 1.00017214, "epoch": 0.7607767690735285, "flos": 18551283236640.0, "grad_norm": 2.6050089262380167, "language_loss": 0.75340676, "learning_rate": 5.707204038612268e-07, "loss": 0.77871287, "num_input_tokens_seen": 135947980, "step": 6327, "time_per_iteration": 3.6117138862609863 }, { "auxiliary_loss_clip": 0.01306801, "auxiliary_loss_mlp": 0.01193344, "balance_loss_clip": 1.00767386, "balance_loss_mlp": 1.00022459, "epoch": 0.7608970119641676, "flos": 20923004089440.0, "grad_norm": 3.8802700075469283, "language_loss": 0.73814005, "learning_rate": 5.701756273481138e-07, "loss": 0.76314151, "num_input_tokens_seen": 135965400, "step": 6328, "time_per_iteration": 4.787558078765869 }, { "auxiliary_loss_clip": 0.01310283, "auxiliary_loss_mlp": 0.01193213, "balance_loss_clip": 1.00823629, "balance_loss_mlp": 1.00018883, "epoch": 0.7610172548548068, "flos": 23807314622880.0, "grad_norm": 1.4347312197073938, "language_loss": 0.73781812, "learning_rate": 5.696310677382212e-07, "loss": 0.76285315, "num_input_tokens_seen": 135986795, "step": 6329, "time_per_iteration": 2.7725989818573 }, { "auxiliary_loss_clip": 0.01267625, "auxiliary_loss_mlp": 0.01192291, "balance_loss_clip": 1.00504196, "balance_loss_mlp": 1.0000304, "epoch": 0.7611374977454458, "flos": 66496614769440.0, "grad_norm": 0.8895263781459506, "language_loss": 0.61782491, "learning_rate": 5.690867251141576e-07, "loss": 0.64242399, "num_input_tokens_seen": 136053450, "step": 6330, "time_per_iteration": 3.5365052223205566 }, { "auxiliary_loss_clip": 0.01322427, "auxiliary_loss_mlp": 0.01193299, "balance_loss_clip": 1.00786591, "balance_loss_mlp": 1.00018024, "epoch": 0.7612577406360849, "flos": 15633145805760.0, "grad_norm": 2.2794067377591136, "language_loss": 0.91488349, "learning_rate": 5.685425995585013e-07, "loss": 0.94004077, "num_input_tokens_seen": 136071375, "step": 6331, "time_per_iteration": 2.776433229446411 }, { "auxiliary_loss_clip": 0.01291209, "auxiliary_loss_mlp": 0.01192299, "balance_loss_clip": 1.00466943, "balance_loss_mlp": 1.00003874, "epoch": 0.761377983526724, "flos": 60526289646720.0, "grad_norm": 0.96681941976725, "language_loss": 0.59027529, "learning_rate": 5.679986911537935e-07, "loss": 0.6151104, "num_input_tokens_seen": 136138905, "step": 6332, "time_per_iteration": 3.51576566696167 }, { "auxiliary_loss_clip": 0.01263729, "auxiliary_loss_mlp": 0.01193208, "balance_loss_clip": 1.00711012, "balance_loss_mlp": 1.0001843, "epoch": 0.7614982264173631, "flos": 35772536665440.0, "grad_norm": 2.0999565863233634, "language_loss": 0.67184687, "learning_rate": 5.674549999825462e-07, "loss": 0.69641626, "num_input_tokens_seen": 136161720, "step": 6333, "time_per_iteration": 3.067690372467041 }, { "auxiliary_loss_clip": 0.01306737, "auxiliary_loss_mlp": 0.01192329, "balance_loss_clip": 1.00473821, "balance_loss_mlp": 1.00006866, "epoch": 0.7616184693080021, "flos": 67925538764640.0, "grad_norm": 0.9162530945473666, "language_loss": 0.71496892, "learning_rate": 5.669115261272363e-07, "loss": 0.7399596, "num_input_tokens_seen": 136222040, "step": 6334, "time_per_iteration": 3.300217866897583 }, { "auxiliary_loss_clip": 0.01326086, "auxiliary_loss_mlp": 0.01193169, "balance_loss_clip": 1.00762999, "balance_loss_mlp": 1.0001452, "epoch": 0.7617387121986413, "flos": 20521989655200.0, "grad_norm": 2.5740780610124485, "language_loss": 0.72795212, "learning_rate": 5.663682696703081e-07, "loss": 0.75314468, "num_input_tokens_seen": 136240305, "step": 6335, "time_per_iteration": 2.7349026203155518 }, { "auxiliary_loss_clip": 0.01349445, "auxiliary_loss_mlp": 0.01193167, "balance_loss_clip": 1.00827527, "balance_loss_mlp": 1.00014281, "epoch": 0.7618589550892804, "flos": 18624505201920.0, "grad_norm": 1.7970421802120515, "language_loss": 0.82078987, "learning_rate": 5.658252306941746e-07, "loss": 0.84621596, "num_input_tokens_seen": 136259625, "step": 6336, "time_per_iteration": 2.6467463970184326 }, { "auxiliary_loss_clip": 0.01263186, "auxiliary_loss_mlp": 0.01193294, "balance_loss_clip": 1.00596261, "balance_loss_mlp": 1.00017548, "epoch": 0.7619791979799194, "flos": 17453744078400.0, "grad_norm": 2.7996006032580105, "language_loss": 0.75211298, "learning_rate": 5.65282409281212e-07, "loss": 0.77667785, "num_input_tokens_seen": 136277090, "step": 6337, "time_per_iteration": 2.880579948425293 }, { "auxiliary_loss_clip": 0.01316663, "auxiliary_loss_mlp": 0.01193227, "balance_loss_clip": 1.00774431, "balance_loss_mlp": 1.00020289, "epoch": 0.7620994408705585, "flos": 14137430184000.0, "grad_norm": 2.5721474466738665, "language_loss": 0.6968894, "learning_rate": 5.64739805513768e-07, "loss": 0.72198832, "num_input_tokens_seen": 136294635, "step": 6338, "time_per_iteration": 2.8677289485931396 }, { "auxiliary_loss_clip": 0.01296647, "auxiliary_loss_mlp": 0.00871952, "balance_loss_clip": 1.00431824, "balance_loss_mlp": 1.00005817, "epoch": 0.7622196837611976, "flos": 70708822404480.0, "grad_norm": 0.7855118508684754, "language_loss": 0.55749661, "learning_rate": 5.641974194741541e-07, "loss": 0.57918257, "num_input_tokens_seen": 136350320, "step": 6339, "time_per_iteration": 3.169687509536743 }, { "auxiliary_loss_clip": 0.01273776, "auxiliary_loss_mlp": 0.01192299, "balance_loss_clip": 1.00954556, "balance_loss_mlp": 1.00003803, "epoch": 0.7623399266518367, "flos": 60684180022080.0, "grad_norm": 0.7766520841445752, "language_loss": 0.63826442, "learning_rate": 5.636552512446502e-07, "loss": 0.66292512, "num_input_tokens_seen": 136411375, "step": 6340, "time_per_iteration": 3.278287887573242 }, { "auxiliary_loss_clip": 0.01323366, "auxiliary_loss_mlp": 0.01193209, "balance_loss_clip": 1.00786436, "balance_loss_mlp": 1.00018537, "epoch": 0.7624601695424758, "flos": 26468905747680.0, "grad_norm": 1.7708269745747807, "language_loss": 0.7795015, "learning_rate": 5.631133009075027e-07, "loss": 0.80466723, "num_input_tokens_seen": 136430560, "step": 6341, "time_per_iteration": 2.8472683429718018 }, { "auxiliary_loss_clip": 0.01331023, "auxiliary_loss_mlp": 0.00872378, "balance_loss_clip": 1.00759518, "balance_loss_mlp": 1.00030267, "epoch": 0.7625804124331149, "flos": 19135765706400.0, "grad_norm": 1.7916295368895958, "language_loss": 0.68374801, "learning_rate": 5.625715685449242e-07, "loss": 0.70578206, "num_input_tokens_seen": 136448665, "step": 6342, "time_per_iteration": 2.8742194175720215 }, { "auxiliary_loss_clip": 0.0127358, "auxiliary_loss_mlp": 0.01193199, "balance_loss_clip": 1.00835693, "balance_loss_mlp": 1.000175, "epoch": 0.762700655323754, "flos": 26213113838880.0, "grad_norm": 2.0334772520604663, "language_loss": 0.71332246, "learning_rate": 5.620300542390966e-07, "loss": 0.73799026, "num_input_tokens_seen": 136469710, "step": 6343, "time_per_iteration": 2.9626245498657227 }, { "auxiliary_loss_clip": 0.01308505, "auxiliary_loss_mlp": 0.01193188, "balance_loss_clip": 1.00781155, "balance_loss_mlp": 1.00016451, "epoch": 0.762820898214393, "flos": 22382593698240.0, "grad_norm": 1.7231210190907038, "language_loss": 0.84804749, "learning_rate": 5.614887580721659e-07, "loss": 0.8730644, "num_input_tokens_seen": 136489855, "step": 6344, "time_per_iteration": 2.780078649520874 }, { "auxiliary_loss_clip": 0.01277125, "auxiliary_loss_mlp": 0.01193199, "balance_loss_clip": 1.00722325, "balance_loss_mlp": 1.00017476, "epoch": 0.7629411411050322, "flos": 15700512211200.0, "grad_norm": 1.8551821337201806, "language_loss": 0.74066699, "learning_rate": 5.609476801262481e-07, "loss": 0.76537019, "num_input_tokens_seen": 136504715, "step": 6345, "time_per_iteration": 2.845261335372925 }, { "auxiliary_loss_clip": 0.01290446, "auxiliary_loss_mlp": 0.01193378, "balance_loss_clip": 1.0079596, "balance_loss_mlp": 1.00025892, "epoch": 0.7630613839956712, "flos": 13770350418240.0, "grad_norm": 2.228616880666351, "language_loss": 0.64385599, "learning_rate": 5.604068204834223e-07, "loss": 0.6686942, "num_input_tokens_seen": 136521610, "step": 6346, "time_per_iteration": 2.807042360305786 }, { "auxiliary_loss_clip": 0.01264845, "auxiliary_loss_mlp": 0.00872505, "balance_loss_clip": 1.00674164, "balance_loss_mlp": 1.00032222, "epoch": 0.7631816268863103, "flos": 14569577239680.0, "grad_norm": 2.4819768029595037, "language_loss": 0.76664853, "learning_rate": 5.598661792257367e-07, "loss": 0.78802204, "num_input_tokens_seen": 136538655, "step": 6347, "time_per_iteration": 2.8575615882873535 }, { "auxiliary_loss_clip": 0.01337134, "auxiliary_loss_mlp": 0.01193233, "balance_loss_clip": 1.0077635, "balance_loss_mlp": 1.00020945, "epoch": 0.7633018697769495, "flos": 19062220428000.0, "grad_norm": 1.7767381816716568, "language_loss": 0.7571969, "learning_rate": 5.593257564352071e-07, "loss": 0.78250062, "num_input_tokens_seen": 136557095, "step": 6348, "time_per_iteration": 2.7580981254577637 }, { "auxiliary_loss_clip": 0.01327025, "auxiliary_loss_mlp": 0.01193176, "balance_loss_clip": 1.00712717, "balance_loss_mlp": 1.00015235, "epoch": 0.7634221126675885, "flos": 22052969121600.0, "grad_norm": 1.4437446471423285, "language_loss": 0.75311327, "learning_rate": 5.58785552193815e-07, "loss": 0.77831519, "num_input_tokens_seen": 136577340, "step": 6349, "time_per_iteration": 2.9253923892974854 }, { "auxiliary_loss_clip": 0.01349113, "auxiliary_loss_mlp": 0.01193193, "balance_loss_clip": 1.00782287, "balance_loss_mlp": 1.00016916, "epoch": 0.7635423555582276, "flos": 29382732336960.0, "grad_norm": 1.6138646815906708, "language_loss": 0.75362468, "learning_rate": 5.582455665835086e-07, "loss": 0.77904773, "num_input_tokens_seen": 136597635, "step": 6350, "time_per_iteration": 2.9410059452056885 }, { "auxiliary_loss_clip": 0.01325129, "auxiliary_loss_mlp": 0.0119333, "balance_loss_clip": 1.0077486, "balance_loss_mlp": 1.00021112, "epoch": 0.7636625984488667, "flos": 17784913373280.0, "grad_norm": 4.225985290651348, "language_loss": 0.72349036, "learning_rate": 5.577057996862036e-07, "loss": 0.74867487, "num_input_tokens_seen": 136615260, "step": 6351, "time_per_iteration": 2.9471471309661865 }, { "auxiliary_loss_clip": 0.0134817, "auxiliary_loss_mlp": 0.01193288, "balance_loss_clip": 1.00801718, "balance_loss_mlp": 1.00016904, "epoch": 0.7637828413395058, "flos": 23734595589120.0, "grad_norm": 1.580277019114969, "language_loss": 0.76331633, "learning_rate": 5.571662515837814e-07, "loss": 0.78873086, "num_input_tokens_seen": 136637220, "step": 6352, "time_per_iteration": 3.796651601791382 }, { "auxiliary_loss_clip": 0.01308946, "auxiliary_loss_mlp": 0.01193167, "balance_loss_clip": 1.00770521, "balance_loss_mlp": 1.00014353, "epoch": 0.7639030842301449, "flos": 36283294238400.0, "grad_norm": 1.869069979974755, "language_loss": 0.83734667, "learning_rate": 5.566269223580926e-07, "loss": 0.86236787, "num_input_tokens_seen": 136658930, "step": 6353, "time_per_iteration": 3.897864818572998 }, { "auxiliary_loss_clip": 0.01336532, "auxiliary_loss_mlp": 0.01193228, "balance_loss_clip": 1.0081265, "balance_loss_mlp": 1.00020468, "epoch": 0.764023327120784, "flos": 28878117713280.0, "grad_norm": 1.5873171304256526, "language_loss": 0.7555446, "learning_rate": 5.560878120909511e-07, "loss": 0.78084219, "num_input_tokens_seen": 136681530, "step": 6354, "time_per_iteration": 4.859827756881714 }, { "auxiliary_loss_clip": 0.01306812, "auxiliary_loss_mlp": 0.0119232, "balance_loss_clip": 1.00485349, "balance_loss_mlp": 1.00005901, "epoch": 0.7641435700114231, "flos": 64789747164000.0, "grad_norm": 0.8462414553973412, "language_loss": 0.58622247, "learning_rate": 5.55548920864141e-07, "loss": 0.6112138, "num_input_tokens_seen": 136742185, "step": 6355, "time_per_iteration": 3.371981382369995 }, { "auxiliary_loss_clip": 0.01326552, "auxiliary_loss_mlp": 0.01193187, "balance_loss_clip": 1.00789928, "balance_loss_mlp": 1.00016332, "epoch": 0.7642638129020621, "flos": 16835793948000.0, "grad_norm": 1.5448665525142944, "language_loss": 0.77889407, "learning_rate": 5.550102487594113e-07, "loss": 0.80409145, "num_input_tokens_seen": 136760855, "step": 6356, "time_per_iteration": 2.820572853088379 }, { "auxiliary_loss_clip": 0.01283335, "auxiliary_loss_mlp": 0.00872267, "balance_loss_clip": 1.00701594, "balance_loss_mlp": 1.0002923, "epoch": 0.7643840557927013, "flos": 30408953484960.0, "grad_norm": 1.452783429016233, "language_loss": 0.71484607, "learning_rate": 5.54471795858477e-07, "loss": 0.73640209, "num_input_tokens_seen": 136780925, "step": 6357, "time_per_iteration": 2.9346556663513184 }, { "auxiliary_loss_clip": 0.01312173, "auxiliary_loss_mlp": 0.01193161, "balance_loss_clip": 1.00767851, "balance_loss_mlp": 1.00013709, "epoch": 0.7645042986833404, "flos": 16983243741600.0, "grad_norm": 1.8705349563145548, "language_loss": 0.83012998, "learning_rate": 5.539335622430235e-07, "loss": 0.85518336, "num_input_tokens_seen": 136799545, "step": 6358, "time_per_iteration": 2.7886245250701904 }, { "auxiliary_loss_clip": 0.01336473, "auxiliary_loss_mlp": 0.01193213, "balance_loss_clip": 1.00780225, "balance_loss_mlp": 1.00018919, "epoch": 0.7646245415739794, "flos": 17311503218400.0, "grad_norm": 1.7465184393977953, "language_loss": 0.74363637, "learning_rate": 5.533955479946975e-07, "loss": 0.76893324, "num_input_tokens_seen": 136818325, "step": 6359, "time_per_iteration": 2.819417953491211 }, { "auxiliary_loss_clip": 0.01245199, "auxiliary_loss_mlp": 0.00871678, "balance_loss_clip": 1.00954151, "balance_loss_mlp": 0.99984139, "epoch": 0.7647447844646186, "flos": 70402368601440.0, "grad_norm": 0.8633880567547281, "language_loss": 0.65885317, "learning_rate": 5.528577531951173e-07, "loss": 0.680022, "num_input_tokens_seen": 136878730, "step": 6360, "time_per_iteration": 3.329676389694214 }, { "auxiliary_loss_clip": 0.01319472, "auxiliary_loss_mlp": 0.01193117, "balance_loss_clip": 1.00787497, "balance_loss_mlp": 1.00018859, "epoch": 0.7648650273552576, "flos": 17675924631840.0, "grad_norm": 1.993179412861157, "language_loss": 0.73973399, "learning_rate": 5.523201779258653e-07, "loss": 0.76485991, "num_input_tokens_seen": 136897705, "step": 6361, "time_per_iteration": 2.8414103984832764 }, { "auxiliary_loss_clip": 0.01348758, "auxiliary_loss_mlp": 0.01193195, "balance_loss_clip": 1.0076412, "balance_loss_mlp": 1.00017142, "epoch": 0.7649852702458967, "flos": 22162029710400.0, "grad_norm": 1.6766318891294032, "language_loss": 0.84211028, "learning_rate": 5.517828222684912e-07, "loss": 0.86752987, "num_input_tokens_seen": 136918360, "step": 6362, "time_per_iteration": 2.827815055847168 }, { "auxiliary_loss_clip": 0.01283803, "auxiliary_loss_mlp": 0.01192318, "balance_loss_clip": 1.00431371, "balance_loss_mlp": 1.0000577, "epoch": 0.7651055131365359, "flos": 69848372126880.0, "grad_norm": 0.7679895717637625, "language_loss": 0.59028465, "learning_rate": 5.512456863045117e-07, "loss": 0.61504585, "num_input_tokens_seen": 136979050, "step": 6363, "time_per_iteration": 3.4174587726593018 }, { "auxiliary_loss_clip": 0.01349372, "auxiliary_loss_mlp": 0.0119316, "balance_loss_clip": 1.00793934, "balance_loss_mlp": 1.00013685, "epoch": 0.7652257560271749, "flos": 19464025183200.0, "grad_norm": 1.6942939275201192, "language_loss": 0.73766667, "learning_rate": 5.507087701154089e-07, "loss": 0.76309198, "num_input_tokens_seen": 136998970, "step": 6364, "time_per_iteration": 2.7868502140045166 }, { "auxiliary_loss_clip": 0.01285611, "auxiliary_loss_mlp": 0.0119331, "balance_loss_clip": 1.00732315, "balance_loss_mlp": 1.00019097, "epoch": 0.765345998917814, "flos": 15961117893120.0, "grad_norm": 1.8841378133678996, "language_loss": 0.75545847, "learning_rate": 5.50172073782634e-07, "loss": 0.78024769, "num_input_tokens_seen": 137016950, "step": 6365, "time_per_iteration": 2.8113901615142822 }, { "auxiliary_loss_clip": 0.0128641, "auxiliary_loss_mlp": 0.01193353, "balance_loss_clip": 1.00729513, "balance_loss_mlp": 1.00023425, "epoch": 0.7654662418084531, "flos": 23659864829280.0, "grad_norm": 1.9680435201060067, "language_loss": 0.87755758, "learning_rate": 5.496355973876023e-07, "loss": 0.90235519, "num_input_tokens_seen": 137036205, "step": 6366, "time_per_iteration": 2.8526406288146973 }, { "auxiliary_loss_clip": 0.01301912, "auxiliary_loss_mlp": 0.00872554, "balance_loss_clip": 1.00766754, "balance_loss_mlp": 1.00033903, "epoch": 0.7655864846990922, "flos": 41463624925440.0, "grad_norm": 1.6301918655823662, "language_loss": 0.7081461, "learning_rate": 5.490993410116984e-07, "loss": 0.72989082, "num_input_tokens_seen": 137059195, "step": 6367, "time_per_iteration": 2.892526865005493 }, { "auxiliary_loss_clip": 0.01294627, "auxiliary_loss_mlp": 0.01193195, "balance_loss_clip": 1.00779247, "balance_loss_mlp": 1.00017107, "epoch": 0.7657067275897312, "flos": 43142700811680.0, "grad_norm": 1.597281486453406, "language_loss": 0.69395524, "learning_rate": 5.485633047362704e-07, "loss": 0.71883345, "num_input_tokens_seen": 137081200, "step": 6368, "time_per_iteration": 2.9569883346557617 }, { "auxiliary_loss_clip": 0.01350161, "auxiliary_loss_mlp": 0.01193283, "balance_loss_clip": 1.00886881, "balance_loss_mlp": 1.00025988, "epoch": 0.7658269704803703, "flos": 17311790607840.0, "grad_norm": 1.972875137660386, "language_loss": 0.78660184, "learning_rate": 5.480274886426341e-07, "loss": 0.81203628, "num_input_tokens_seen": 137097840, "step": 6369, "time_per_iteration": 2.6745831966400146 }, { "auxiliary_loss_clip": 0.01325122, "auxiliary_loss_mlp": 0.01193178, "balance_loss_clip": 1.0081917, "balance_loss_mlp": 1.00015402, "epoch": 0.7659472133710095, "flos": 12568169283840.0, "grad_norm": 1.8366428371501586, "language_loss": 0.77823067, "learning_rate": 5.474918928120744e-07, "loss": 0.80341363, "num_input_tokens_seen": 137114335, "step": 6370, "time_per_iteration": 2.657409191131592 }, { "auxiliary_loss_clip": 0.01327707, "auxiliary_loss_mlp": 0.01193126, "balance_loss_clip": 1.00749636, "balance_loss_mlp": 1.00019813, "epoch": 0.7660674562616485, "flos": 22707440425440.0, "grad_norm": 1.563794460441272, "language_loss": 0.87275594, "learning_rate": 5.469565173258392e-07, "loss": 0.8979643, "num_input_tokens_seen": 137132850, "step": 6371, "time_per_iteration": 2.7420654296875 }, { "auxiliary_loss_clip": 0.01350711, "auxiliary_loss_mlp": 0.0119335, "balance_loss_clip": 1.00821972, "balance_loss_mlp": 1.00023103, "epoch": 0.7661876991522876, "flos": 17056465706880.0, "grad_norm": 1.6217945005452201, "language_loss": 0.63696051, "learning_rate": 5.464213622651454e-07, "loss": 0.66240108, "num_input_tokens_seen": 137150665, "step": 6372, "time_per_iteration": 2.637655258178711 }, { "auxiliary_loss_clip": 0.0129547, "auxiliary_loss_mlp": 0.0119321, "balance_loss_clip": 1.00704646, "balance_loss_mlp": 1.00018668, "epoch": 0.7663079420429267, "flos": 20084238505440.0, "grad_norm": 1.6238991530299098, "language_loss": 0.84077442, "learning_rate": 5.458864277111753e-07, "loss": 0.8656612, "num_input_tokens_seen": 137168500, "step": 6373, "time_per_iteration": 2.758378267288208 }, { "auxiliary_loss_clip": 0.01311749, "auxiliary_loss_mlp": 0.00872408, "balance_loss_clip": 1.00746298, "balance_loss_mlp": 1.00026, "epoch": 0.7664281849335658, "flos": 12677481338400.0, "grad_norm": 2.183792645367516, "language_loss": 0.69828165, "learning_rate": 5.453517137450769e-07, "loss": 0.72012323, "num_input_tokens_seen": 137185075, "step": 6374, "time_per_iteration": 2.7500674724578857 }, { "auxiliary_loss_clip": 0.01326197, "auxiliary_loss_mlp": 0.01193253, "balance_loss_clip": 1.00809789, "balance_loss_mlp": 1.000229, "epoch": 0.7665484278242048, "flos": 22345282203840.0, "grad_norm": 1.4895073280565143, "language_loss": 0.76042712, "learning_rate": 5.448172204479684e-07, "loss": 0.78562164, "num_input_tokens_seen": 137204355, "step": 6375, "time_per_iteration": 2.7696304321289062 }, { "auxiliary_loss_clip": 0.01348278, "auxiliary_loss_mlp": 0.01193214, "balance_loss_clip": 1.00767291, "balance_loss_mlp": 1.00019002, "epoch": 0.766668670714844, "flos": 23617919180160.0, "grad_norm": 1.7915806816994837, "language_loss": 0.74455351, "learning_rate": 5.442829479009294e-07, "loss": 0.76996839, "num_input_tokens_seen": 137223135, "step": 6376, "time_per_iteration": 2.7015531063079834 }, { "auxiliary_loss_clip": 0.0132282, "auxiliary_loss_mlp": 0.01193206, "balance_loss_clip": 1.00835204, "balance_loss_mlp": 1.00018263, "epoch": 0.7667889136054831, "flos": 19427144772960.0, "grad_norm": 1.7063453486210711, "language_loss": 0.71554989, "learning_rate": 5.437488961850103e-07, "loss": 0.74071014, "num_input_tokens_seen": 137242935, "step": 6377, "time_per_iteration": 2.7528862953186035 }, { "auxiliary_loss_clip": 0.01266211, "auxiliary_loss_mlp": 0.01193134, "balance_loss_clip": 1.00600302, "balance_loss_mlp": 1.00011051, "epoch": 0.7669091564961221, "flos": 26866363737600.0, "grad_norm": 1.6831060750358142, "language_loss": 0.75386119, "learning_rate": 5.432150653812258e-07, "loss": 0.77845466, "num_input_tokens_seen": 137262970, "step": 6378, "time_per_iteration": 3.7613484859466553 }, { "auxiliary_loss_clip": 0.01323585, "auxiliary_loss_mlp": 0.0119316, "balance_loss_clip": 1.00789237, "balance_loss_mlp": 1.00013661, "epoch": 0.7670293993867613, "flos": 12385311950880.0, "grad_norm": 2.165199665000854, "language_loss": 0.82357562, "learning_rate": 5.42681455570557e-07, "loss": 0.84874308, "num_input_tokens_seen": 137279500, "step": 6379, "time_per_iteration": 3.724748134613037 }, { "auxiliary_loss_clip": 0.01348373, "auxiliary_loss_mlp": 0.01193208, "balance_loss_clip": 1.00761282, "balance_loss_mlp": 1.00018394, "epoch": 0.7671496422774003, "flos": 21762955154880.0, "grad_norm": 1.736637639650757, "language_loss": 0.64946771, "learning_rate": 5.42148066833954e-07, "loss": 0.67488348, "num_input_tokens_seen": 137298745, "step": 6380, "time_per_iteration": 4.835853576660156 }, { "auxiliary_loss_clip": 0.01348468, "auxiliary_loss_mlp": 0.01193184, "balance_loss_clip": 1.00774872, "balance_loss_mlp": 1.00016046, "epoch": 0.7672698851680394, "flos": 21069232477920.0, "grad_norm": 1.951161304660911, "language_loss": 0.74931943, "learning_rate": 5.416148992523289e-07, "loss": 0.77473593, "num_input_tokens_seen": 137317320, "step": 6381, "time_per_iteration": 2.777956008911133 }, { "auxiliary_loss_clip": 0.01239686, "auxiliary_loss_mlp": 0.0119323, "balance_loss_clip": 1.00631559, "balance_loss_mlp": 1.00020599, "epoch": 0.7673901280586786, "flos": 16976705631840.0, "grad_norm": 1.8204624560351437, "language_loss": 0.78607488, "learning_rate": 5.410819529065644e-07, "loss": 0.81040406, "num_input_tokens_seen": 137335275, "step": 6382, "time_per_iteration": 2.9667458534240723 }, { "auxiliary_loss_clip": 0.01276712, "auxiliary_loss_mlp": 0.01193232, "balance_loss_clip": 1.00645161, "balance_loss_mlp": 1.00020814, "epoch": 0.7675103709493176, "flos": 29242682821440.0, "grad_norm": 2.030912499909921, "language_loss": 0.65158772, "learning_rate": 5.405492278775079e-07, "loss": 0.67628717, "num_input_tokens_seen": 137355055, "step": 6383, "time_per_iteration": 2.9197418689727783 }, { "auxiliary_loss_clip": 0.01327089, "auxiliary_loss_mlp": 0.01193319, "balance_loss_clip": 1.00852084, "balance_loss_mlp": 1.00020015, "epoch": 0.7676306138399567, "flos": 29023016925600.0, "grad_norm": 1.9756905017494344, "language_loss": 0.79673028, "learning_rate": 5.400167242459732e-07, "loss": 0.82193434, "num_input_tokens_seen": 137374015, "step": 6384, "time_per_iteration": 2.976550579071045 }, { "auxiliary_loss_clip": 0.01324399, "auxiliary_loss_mlp": 0.01193193, "balance_loss_clip": 1.00725055, "balance_loss_mlp": 1.00016952, "epoch": 0.7677508567305958, "flos": 22565127718080.0, "grad_norm": 1.6324759711231343, "language_loss": 0.80563331, "learning_rate": 5.394844420927405e-07, "loss": 0.83080918, "num_input_tokens_seen": 137393625, "step": 6385, "time_per_iteration": 2.73610258102417 }, { "auxiliary_loss_clip": 0.01348227, "auxiliary_loss_mlp": 0.0119319, "balance_loss_clip": 1.00798774, "balance_loss_mlp": 1.00016689, "epoch": 0.7678710996212349, "flos": 25411444207200.0, "grad_norm": 3.6485321678293063, "language_loss": 0.73480731, "learning_rate": 5.389523814985562e-07, "loss": 0.76022148, "num_input_tokens_seen": 137413045, "step": 6386, "time_per_iteration": 2.8905484676361084 }, { "auxiliary_loss_clip": 0.01288208, "auxiliary_loss_mlp": 0.01193248, "balance_loss_clip": 1.00810218, "balance_loss_mlp": 1.00022459, "epoch": 0.767991342511874, "flos": 26756836140960.0, "grad_norm": 1.702016387873536, "language_loss": 0.75977862, "learning_rate": 5.384205425441344e-07, "loss": 0.78459316, "num_input_tokens_seen": 137433955, "step": 6387, "time_per_iteration": 2.9150850772857666 }, { "auxiliary_loss_clip": 0.01315143, "auxiliary_loss_mlp": 0.01193177, "balance_loss_clip": 1.00736582, "balance_loss_mlp": 1.00015283, "epoch": 0.7681115854025131, "flos": 26359521845760.0, "grad_norm": 1.6043735148993947, "language_loss": 0.84319234, "learning_rate": 5.378889253101537e-07, "loss": 0.86827552, "num_input_tokens_seen": 137454510, "step": 6388, "time_per_iteration": 2.963026523590088 }, { "auxiliary_loss_clip": 0.01336316, "auxiliary_loss_mlp": 0.01193163, "balance_loss_clip": 1.00756645, "balance_loss_mlp": 1.00013971, "epoch": 0.7682318282931522, "flos": 23257054211040.0, "grad_norm": 1.677777487695228, "language_loss": 0.80778021, "learning_rate": 5.373575298772617e-07, "loss": 0.83307499, "num_input_tokens_seen": 137473630, "step": 6389, "time_per_iteration": 2.8363795280456543 }, { "auxiliary_loss_clip": 0.01306822, "auxiliary_loss_mlp": 0.0119232, "balance_loss_clip": 1.0049634, "balance_loss_mlp": 1.0000596, "epoch": 0.7683520711837912, "flos": 70072492559040.0, "grad_norm": 0.7475574705515016, "language_loss": 0.61328846, "learning_rate": 5.368263563260689e-07, "loss": 0.63827986, "num_input_tokens_seen": 137538765, "step": 6390, "time_per_iteration": 3.425213575363159 }, { "auxiliary_loss_clip": 0.01337988, "auxiliary_loss_mlp": 0.01193248, "balance_loss_clip": 1.00798464, "balance_loss_mlp": 1.00022411, "epoch": 0.7684723140744304, "flos": 18624900362400.0, "grad_norm": 1.6660592354341774, "language_loss": 0.64106268, "learning_rate": 5.362954047371537e-07, "loss": 0.66637504, "num_input_tokens_seen": 137557875, "step": 6391, "time_per_iteration": 2.866811752319336 }, { "auxiliary_loss_clip": 0.01280728, "auxiliary_loss_mlp": 0.01193207, "balance_loss_clip": 1.00724113, "balance_loss_mlp": 1.0001837, "epoch": 0.7685925569650695, "flos": 27453001628160.0, "grad_norm": 1.9712404723240884, "language_loss": 0.72447062, "learning_rate": 5.357646751910627e-07, "loss": 0.74920994, "num_input_tokens_seen": 137579055, "step": 6392, "time_per_iteration": 2.9322118759155273 }, { "auxiliary_loss_clip": 0.01314529, "auxiliary_loss_mlp": 0.01193267, "balance_loss_clip": 1.00779033, "balance_loss_mlp": 1.00024295, "epoch": 0.7687127998557085, "flos": 24535726365600.0, "grad_norm": 1.9361613872905608, "language_loss": 0.79764718, "learning_rate": 5.352341677683061e-07, "loss": 0.82272512, "num_input_tokens_seen": 137600355, "step": 6393, "time_per_iteration": 2.8955013751983643 }, { "auxiliary_loss_clip": 0.01300362, "auxiliary_loss_mlp": 0.01193216, "balance_loss_clip": 1.00702715, "balance_loss_mlp": 1.00019264, "epoch": 0.7688330427463477, "flos": 25155975611520.0, "grad_norm": 1.7409294779056617, "language_loss": 0.78947222, "learning_rate": 5.347038825493617e-07, "loss": 0.81440806, "num_input_tokens_seen": 137621885, "step": 6394, "time_per_iteration": 2.9302358627319336 }, { "auxiliary_loss_clip": 0.01303376, "auxiliary_loss_mlp": 0.0119319, "balance_loss_clip": 1.00698876, "balance_loss_mlp": 1.00016654, "epoch": 0.7689532856369867, "flos": 21211293719520.0, "grad_norm": 2.0412283078323266, "language_loss": 0.68460262, "learning_rate": 5.341738196146732e-07, "loss": 0.70956838, "num_input_tokens_seen": 137640230, "step": 6395, "time_per_iteration": 2.8210442066192627 }, { "auxiliary_loss_clip": 0.01337987, "auxiliary_loss_mlp": 0.01193213, "balance_loss_clip": 1.00783801, "balance_loss_mlp": 1.00018978, "epoch": 0.7690735285276258, "flos": 25119095201280.0, "grad_norm": 2.592632074346327, "language_loss": 0.73385262, "learning_rate": 5.336439790446503e-07, "loss": 0.75916463, "num_input_tokens_seen": 137659330, "step": 6396, "time_per_iteration": 2.8407766819000244 }, { "auxiliary_loss_clip": 0.0131179, "auxiliary_loss_mlp": 0.01193214, "balance_loss_clip": 1.00797582, "balance_loss_mlp": 1.0001905, "epoch": 0.769193771418265, "flos": 54744040296000.0, "grad_norm": 1.5514704641316188, "language_loss": 0.62209976, "learning_rate": 5.331143609196711e-07, "loss": 0.64714986, "num_input_tokens_seen": 137683145, "step": 6397, "time_per_iteration": 3.110032558441162 }, { "auxiliary_loss_clip": 0.01323611, "auxiliary_loss_mlp": 0.01193267, "balance_loss_clip": 1.00754809, "balance_loss_mlp": 1.00024354, "epoch": 0.769314014308904, "flos": 37341905336640.0, "grad_norm": 1.7769139507039964, "language_loss": 0.77046371, "learning_rate": 5.325849653200758e-07, "loss": 0.79563248, "num_input_tokens_seen": 137707095, "step": 6398, "time_per_iteration": 2.89224910736084 }, { "auxiliary_loss_clip": 0.013485, "auxiliary_loss_mlp": 0.01193218, "balance_loss_clip": 1.00792563, "balance_loss_mlp": 1.00019395, "epoch": 0.7694342571995431, "flos": 20631696870240.0, "grad_norm": 1.6613929024623344, "language_loss": 0.76406753, "learning_rate": 5.32055792326175e-07, "loss": 0.78948474, "num_input_tokens_seen": 137725520, "step": 6399, "time_per_iteration": 2.7242555618286133 }, { "auxiliary_loss_clip": 0.01308335, "auxiliary_loss_mlp": 0.01193273, "balance_loss_clip": 1.00746155, "balance_loss_mlp": 1.00024962, "epoch": 0.7695545000901821, "flos": 24207718354560.0, "grad_norm": 1.9620721878981287, "language_loss": 0.72649026, "learning_rate": 5.315268420182437e-07, "loss": 0.75150633, "num_input_tokens_seen": 137744195, "step": 6400, "time_per_iteration": 2.874810218811035 }, { "auxiliary_loss_clip": 0.01298757, "auxiliary_loss_mlp": 0.00872517, "balance_loss_clip": 1.00725842, "balance_loss_mlp": 1.00036788, "epoch": 0.7696747429808213, "flos": 28001286237600.0, "grad_norm": 1.6323385905653172, "language_loss": 0.76454341, "learning_rate": 5.309981144765221e-07, "loss": 0.78625613, "num_input_tokens_seen": 137764340, "step": 6401, "time_per_iteration": 2.956993579864502 }, { "auxiliary_loss_clip": 0.01286176, "auxiliary_loss_mlp": 0.01193173, "balance_loss_clip": 1.00749445, "balance_loss_mlp": 1.00014901, "epoch": 0.7697949858714603, "flos": 11509558185600.0, "grad_norm": 3.0386546330685493, "language_loss": 0.75847906, "learning_rate": 5.304696097812196e-07, "loss": 0.7832725, "num_input_tokens_seen": 137780940, "step": 6402, "time_per_iteration": 2.830054521560669 }, { "auxiliary_loss_clip": 0.01317131, "auxiliary_loss_mlp": 0.01193359, "balance_loss_clip": 1.0075165, "balance_loss_mlp": 1.00023961, "epoch": 0.7699152287620994, "flos": 26688284254080.0, "grad_norm": 6.796579798981497, "language_loss": 0.60113639, "learning_rate": 5.299413280125078e-07, "loss": 0.62624121, "num_input_tokens_seen": 137799250, "step": 6403, "time_per_iteration": 2.907721519470215 }, { "auxiliary_loss_clip": 0.01325539, "auxiliary_loss_mlp": 0.01193235, "balance_loss_clip": 1.00838673, "balance_loss_mlp": 1.0002116, "epoch": 0.7700354716527386, "flos": 16544953736640.0, "grad_norm": 1.9122793052390044, "language_loss": 0.72996491, "learning_rate": 5.294132692505284e-07, "loss": 0.75515264, "num_input_tokens_seen": 137817660, "step": 6404, "time_per_iteration": 3.726248264312744 }, { "auxiliary_loss_clip": 0.01276513, "auxiliary_loss_mlp": 0.01193268, "balance_loss_clip": 1.00708675, "balance_loss_mlp": 1.00024414, "epoch": 0.7701557145433776, "flos": 19242742721760.0, "grad_norm": 2.1489426678709274, "language_loss": 0.79346573, "learning_rate": 5.288854335753861e-07, "loss": 0.81816351, "num_input_tokens_seen": 137835920, "step": 6405, "time_per_iteration": 3.8901772499084473 }, { "auxiliary_loss_clip": 0.01334934, "auxiliary_loss_mlp": 0.01193204, "balance_loss_clip": 1.00765729, "balance_loss_mlp": 1.00018036, "epoch": 0.7702759574340167, "flos": 31685757608160.0, "grad_norm": 1.6296113691640983, "language_loss": 0.75600857, "learning_rate": 5.283578210671551e-07, "loss": 0.78128994, "num_input_tokens_seen": 137858160, "step": 6406, "time_per_iteration": 4.808419466018677 }, { "auxiliary_loss_clip": 0.01315413, "auxiliary_loss_mlp": 0.01193205, "balance_loss_clip": 1.00744987, "balance_loss_mlp": 1.00018096, "epoch": 0.7703962003246558, "flos": 16800099019200.0, "grad_norm": 1.969050255167873, "language_loss": 0.76507217, "learning_rate": 5.278304318058719e-07, "loss": 0.79015833, "num_input_tokens_seen": 137876015, "step": 6407, "time_per_iteration": 2.8883354663848877 }, { "auxiliary_loss_clip": 0.01259832, "auxiliary_loss_mlp": 0.01193224, "balance_loss_clip": 1.00708532, "balance_loss_mlp": 1.00020051, "epoch": 0.7705164432152949, "flos": 35736087339360.0, "grad_norm": 1.712839387572248, "language_loss": 0.78782952, "learning_rate": 5.273032658715411e-07, "loss": 0.81236005, "num_input_tokens_seen": 137898825, "step": 6408, "time_per_iteration": 3.003100633621216 }, { "auxiliary_loss_clip": 0.01277052, "auxiliary_loss_mlp": 0.01193279, "balance_loss_clip": 1.00704861, "balance_loss_mlp": 1.00025547, "epoch": 0.7706366861059339, "flos": 23365971105120.0, "grad_norm": 1.780915263944984, "language_loss": 0.76701206, "learning_rate": 5.267763233441347e-07, "loss": 0.79171538, "num_input_tokens_seen": 137919455, "step": 6409, "time_per_iteration": 2.9757089614868164 }, { "auxiliary_loss_clip": 0.01327167, "auxiliary_loss_mlp": 0.01193224, "balance_loss_clip": 1.00798607, "balance_loss_mlp": 1.00020003, "epoch": 0.7707569289965731, "flos": 22929908368320.0, "grad_norm": 11.10772942846809, "language_loss": 0.6989482, "learning_rate": 5.26249604303588e-07, "loss": 0.72415209, "num_input_tokens_seen": 137937960, "step": 6410, "time_per_iteration": 2.998425006866455 }, { "auxiliary_loss_clip": 0.01349704, "auxiliary_loss_mlp": 0.01193168, "balance_loss_clip": 1.00828695, "balance_loss_mlp": 1.0001446, "epoch": 0.7708771718872122, "flos": 17420671578240.0, "grad_norm": 2.2179163599467415, "language_loss": 0.78053725, "learning_rate": 5.257231088298057e-07, "loss": 0.80596602, "num_input_tokens_seen": 137956370, "step": 6411, "time_per_iteration": 2.735973596572876 }, { "auxiliary_loss_clip": 0.01260197, "auxiliary_loss_mlp": 0.01192358, "balance_loss_clip": 1.00374115, "balance_loss_mlp": 1.00009751, "epoch": 0.7709974147778512, "flos": 72241346589120.0, "grad_norm": 0.7986122464382267, "language_loss": 0.53979158, "learning_rate": 5.25196837002655e-07, "loss": 0.56431711, "num_input_tokens_seen": 138016080, "step": 6412, "time_per_iteration": 3.420443296432495 }, { "auxiliary_loss_clip": 0.01316546, "auxiliary_loss_mlp": 0.01193257, "balance_loss_clip": 1.00816488, "balance_loss_mlp": 1.00023317, "epoch": 0.7711176576684904, "flos": 39859710883200.0, "grad_norm": 1.8479005740291643, "language_loss": 0.67925918, "learning_rate": 5.24670788901971e-07, "loss": 0.70435715, "num_input_tokens_seen": 138039170, "step": 6413, "time_per_iteration": 2.9123775959014893 }, { "auxiliary_loss_clip": 0.01320607, "auxiliary_loss_mlp": 0.01193452, "balance_loss_clip": 1.00834823, "balance_loss_mlp": 1.00023782, "epoch": 0.7712379005591294, "flos": 36976406212800.0, "grad_norm": 2.050408169043687, "language_loss": 0.67995453, "learning_rate": 5.241449646075557e-07, "loss": 0.70509517, "num_input_tokens_seen": 138062395, "step": 6414, "time_per_iteration": 2.983358144760132 }, { "auxiliary_loss_clip": 0.01338415, "auxiliary_loss_mlp": 0.01193232, "balance_loss_clip": 1.00793839, "balance_loss_mlp": 1.00020885, "epoch": 0.7713581434497685, "flos": 22776782633280.0, "grad_norm": 2.1404968997434133, "language_loss": 0.72501016, "learning_rate": 5.236193641991762e-07, "loss": 0.75032663, "num_input_tokens_seen": 138080325, "step": 6415, "time_per_iteration": 2.8157541751861572 }, { "auxiliary_loss_clip": 0.01302999, "auxiliary_loss_mlp": 0.01193213, "balance_loss_clip": 1.0067234, "balance_loss_mlp": 1.00018907, "epoch": 0.7714783863404077, "flos": 24097472284320.0, "grad_norm": 2.0412951217876754, "language_loss": 0.69809806, "learning_rate": 5.23093987756565e-07, "loss": 0.72306013, "num_input_tokens_seen": 138099020, "step": 6416, "time_per_iteration": 2.833656072616577 }, { "auxiliary_loss_clip": 0.01306265, "auxiliary_loss_mlp": 0.01193329, "balance_loss_clip": 1.00774801, "balance_loss_mlp": 1.00021029, "epoch": 0.7715986292310467, "flos": 21063664307520.0, "grad_norm": 1.8004721809239461, "language_loss": 0.75170022, "learning_rate": 5.225688353594217e-07, "loss": 0.77669615, "num_input_tokens_seen": 138118650, "step": 6417, "time_per_iteration": 2.9394702911376953 }, { "auxiliary_loss_clip": 0.01314499, "auxiliary_loss_mlp": 0.00872509, "balance_loss_clip": 1.00707901, "balance_loss_mlp": 1.00041544, "epoch": 0.7717188721216858, "flos": 20594888307360.0, "grad_norm": 2.010317575684321, "language_loss": 0.77893758, "learning_rate": 5.220439070874108e-07, "loss": 0.8008076, "num_input_tokens_seen": 138137890, "step": 6418, "time_per_iteration": 2.846600294113159 }, { "auxiliary_loss_clip": 0.01327502, "auxiliary_loss_mlp": 0.01193228, "balance_loss_clip": 1.00796854, "balance_loss_mlp": 1.00020444, "epoch": 0.7718391150123249, "flos": 26250964188480.0, "grad_norm": 2.6080184521214287, "language_loss": 0.71005988, "learning_rate": 5.215192030201652e-07, "loss": 0.73526716, "num_input_tokens_seen": 138158880, "step": 6419, "time_per_iteration": 2.780243158340454 }, { "auxiliary_loss_clip": 0.01298529, "auxiliary_loss_mlp": 0.01193191, "balance_loss_clip": 1.00753641, "balance_loss_mlp": 1.00016689, "epoch": 0.771959357902964, "flos": 22049771914080.0, "grad_norm": 1.7766259031876932, "language_loss": 0.86131716, "learning_rate": 5.209947232372798e-07, "loss": 0.88623434, "num_input_tokens_seen": 138176370, "step": 6420, "time_per_iteration": 2.8593814373016357 }, { "auxiliary_loss_clip": 0.01336824, "auxiliary_loss_mlp": 0.00872519, "balance_loss_clip": 1.00799441, "balance_loss_mlp": 1.00036645, "epoch": 0.772079600793603, "flos": 30446013513600.0, "grad_norm": 2.1418559681133438, "language_loss": 0.81115597, "learning_rate": 5.204704678183196e-07, "loss": 0.83324945, "num_input_tokens_seen": 138195105, "step": 6421, "time_per_iteration": 2.7702577114105225 }, { "auxiliary_loss_clip": 0.01349592, "auxiliary_loss_mlp": 0.01193191, "balance_loss_clip": 1.00864494, "balance_loss_mlp": 1.00016785, "epoch": 0.7721998436842422, "flos": 12969866268000.0, "grad_norm": 2.27656424304967, "language_loss": 0.85029435, "learning_rate": 5.19946436842813e-07, "loss": 0.87572229, "num_input_tokens_seen": 138212235, "step": 6422, "time_per_iteration": 3.0540614128112793 }, { "auxiliary_loss_clip": 0.01293587, "auxiliary_loss_mlp": 0.01193095, "balance_loss_clip": 1.00747013, "balance_loss_mlp": 1.00016689, "epoch": 0.7723200865748813, "flos": 32635523659680.0, "grad_norm": 1.516676280000888, "language_loss": 0.6829704, "learning_rate": 5.194226303902546e-07, "loss": 0.70783722, "num_input_tokens_seen": 138231970, "step": 6423, "time_per_iteration": 3.279674768447876 }, { "auxiliary_loss_clip": 0.01315139, "auxiliary_loss_mlp": 0.01193195, "balance_loss_clip": 1.00746524, "balance_loss_mlp": 1.00017095, "epoch": 0.7724403294655203, "flos": 21105717727680.0, "grad_norm": 2.8246865117561315, "language_loss": 0.70710063, "learning_rate": 5.188990485401072e-07, "loss": 0.73218399, "num_input_tokens_seen": 138251175, "step": 6424, "time_per_iteration": 2.8570187091827393 }, { "auxiliary_loss_clip": 0.01325997, "auxiliary_loss_mlp": 0.01193264, "balance_loss_clip": 1.0077095, "balance_loss_mlp": 1.00024068, "epoch": 0.7725605723561595, "flos": 22090747623840.0, "grad_norm": 1.7116547222558274, "language_loss": 0.86426693, "learning_rate": 5.183756913717954e-07, "loss": 0.88945955, "num_input_tokens_seen": 138270950, "step": 6425, "time_per_iteration": 2.7776196002960205 }, { "auxiliary_loss_clip": 0.01299906, "auxiliary_loss_mlp": 0.01193208, "balance_loss_clip": 1.00725853, "balance_loss_mlp": 1.00018394, "epoch": 0.7726808152467985, "flos": 34495624771200.0, "grad_norm": 1.636235433920376, "language_loss": 0.72701144, "learning_rate": 5.178525589647136e-07, "loss": 0.75194263, "num_input_tokens_seen": 138292590, "step": 6426, "time_per_iteration": 2.8721282482147217 }, { "auxiliary_loss_clip": 0.01323703, "auxiliary_loss_mlp": 0.01193304, "balance_loss_clip": 1.00790679, "balance_loss_mlp": 1.00018477, "epoch": 0.7728010581374376, "flos": 22306354143840.0, "grad_norm": 1.7826540892011555, "language_loss": 0.78520018, "learning_rate": 5.173296513982197e-07, "loss": 0.81037021, "num_input_tokens_seen": 138311115, "step": 6427, "time_per_iteration": 2.810391664505005 }, { "auxiliary_loss_clip": 0.01304041, "auxiliary_loss_mlp": 0.01193449, "balance_loss_clip": 1.00814581, "balance_loss_mlp": 1.00023484, "epoch": 0.7729213010280768, "flos": 27126466488000.0, "grad_norm": 1.9760580377647883, "language_loss": 0.65127897, "learning_rate": 5.168069687516398e-07, "loss": 0.67625386, "num_input_tokens_seen": 138330885, "step": 6428, "time_per_iteration": 2.891294240951538 }, { "auxiliary_loss_clip": 0.01305827, "auxiliary_loss_mlp": 0.01193307, "balance_loss_clip": 1.00750959, "balance_loss_mlp": 1.00028348, "epoch": 0.7730415439187158, "flos": 18150232878720.0, "grad_norm": 1.6447413038557381, "language_loss": 0.71657348, "learning_rate": 5.16284511104263e-07, "loss": 0.74156481, "num_input_tokens_seen": 138350020, "step": 6429, "time_per_iteration": 2.8134875297546387 }, { "auxiliary_loss_clip": 0.0130296, "auxiliary_loss_mlp": 0.01193329, "balance_loss_clip": 1.00750029, "balance_loss_mlp": 1.00021005, "epoch": 0.7731617868093549, "flos": 11947488953760.0, "grad_norm": 2.614629051872309, "language_loss": 0.80664122, "learning_rate": 5.157622785353457e-07, "loss": 0.83160412, "num_input_tokens_seen": 138368135, "step": 6430, "time_per_iteration": 2.7912726402282715 }, { "auxiliary_loss_clip": 0.01306442, "auxiliary_loss_mlp": 0.01192312, "balance_loss_clip": 1.0048281, "balance_loss_mlp": 1.00005138, "epoch": 0.7732820296999939, "flos": 64201061623680.0, "grad_norm": 0.643033636096047, "language_loss": 0.60384893, "learning_rate": 5.152402711241113e-07, "loss": 0.62883651, "num_input_tokens_seen": 138436040, "step": 6431, "time_per_iteration": 5.473481893539429 }, { "auxiliary_loss_clip": 0.01300984, "auxiliary_loss_mlp": 0.01193136, "balance_loss_clip": 1.00744152, "balance_loss_mlp": 1.00020814, "epoch": 0.7734022725906331, "flos": 25302204000000.0, "grad_norm": 1.7174096170513948, "language_loss": 0.83006215, "learning_rate": 5.147184889497465e-07, "loss": 0.8550033, "num_input_tokens_seen": 138455510, "step": 6432, "time_per_iteration": 4.876668453216553 }, { "auxiliary_loss_clip": 0.01288981, "auxiliary_loss_mlp": 0.01193326, "balance_loss_clip": 1.0071907, "balance_loss_mlp": 1.0002073, "epoch": 0.7735225154812722, "flos": 17347449612960.0, "grad_norm": 2.0816027540059214, "language_loss": 0.79913491, "learning_rate": 5.141969320914072e-07, "loss": 0.82395804, "num_input_tokens_seen": 138473015, "step": 6433, "time_per_iteration": 2.798794984817505 }, { "auxiliary_loss_clip": 0.01349541, "auxiliary_loss_mlp": 0.01193304, "balance_loss_clip": 1.0081327, "balance_loss_mlp": 1.00018454, "epoch": 0.7736427583719112, "flos": 32630099184000.0, "grad_norm": 2.543353967290329, "language_loss": 0.62761164, "learning_rate": 5.136756006282113e-07, "loss": 0.65304005, "num_input_tokens_seen": 138491680, "step": 6434, "time_per_iteration": 2.774043560028076 }, { "auxiliary_loss_clip": 0.01350428, "auxiliary_loss_mlp": 0.01193234, "balance_loss_clip": 1.00811148, "balance_loss_mlp": 1.00021052, "epoch": 0.7737630012625504, "flos": 19860082149600.0, "grad_norm": 2.5708238184378476, "language_loss": 0.85273975, "learning_rate": 5.131544946392446e-07, "loss": 0.87817639, "num_input_tokens_seen": 138506960, "step": 6435, "time_per_iteration": 2.779872179031372 }, { "auxiliary_loss_clip": 0.01305576, "auxiliary_loss_mlp": 0.01193391, "balance_loss_clip": 1.00811338, "balance_loss_mlp": 1.00027204, "epoch": 0.7738832441531894, "flos": 36022652632800.0, "grad_norm": 2.390908220785007, "language_loss": 0.63538671, "learning_rate": 5.126336142035592e-07, "loss": 0.66037637, "num_input_tokens_seen": 138526995, "step": 6436, "time_per_iteration": 3.022634506225586 }, { "auxiliary_loss_clip": 0.01314961, "auxiliary_loss_mlp": 0.01193183, "balance_loss_clip": 1.00739372, "balance_loss_mlp": 1.00015926, "epoch": 0.7740034870438285, "flos": 13405282378560.0, "grad_norm": 2.385623556620585, "language_loss": 0.7190702, "learning_rate": 5.121129594001721e-07, "loss": 0.74415159, "num_input_tokens_seen": 138541260, "step": 6437, "time_per_iteration": 2.7954893112182617 }, { "auxiliary_loss_clip": 0.01326618, "auxiliary_loss_mlp": 0.01193195, "balance_loss_clip": 1.00748849, "balance_loss_mlp": 1.00017118, "epoch": 0.7741237299344677, "flos": 22086724171680.0, "grad_norm": 1.4474908021692308, "language_loss": 0.81300426, "learning_rate": 5.115925303080661e-07, "loss": 0.83820236, "num_input_tokens_seen": 138560970, "step": 6438, "time_per_iteration": 2.846224784851074 }, { "auxiliary_loss_clip": 0.01295809, "auxiliary_loss_mlp": 0.01193175, "balance_loss_clip": 1.00775623, "balance_loss_mlp": 1.00015116, "epoch": 0.7742439728251067, "flos": 19864788151680.0, "grad_norm": 3.481040317370091, "language_loss": 0.7936874, "learning_rate": 5.110723270061899e-07, "loss": 0.81857729, "num_input_tokens_seen": 138577460, "step": 6439, "time_per_iteration": 2.8724803924560547 }, { "auxiliary_loss_clip": 0.01348106, "auxiliary_loss_mlp": 0.01193255, "balance_loss_clip": 1.0077126, "balance_loss_mlp": 1.00023127, "epoch": 0.7743642157157458, "flos": 16690176262080.0, "grad_norm": 1.7309098433137122, "language_loss": 0.79667163, "learning_rate": 5.105523495734572e-07, "loss": 0.82208526, "num_input_tokens_seen": 138594860, "step": 6440, "time_per_iteration": 2.790278911590576 }, { "auxiliary_loss_clip": 0.01349233, "auxiliary_loss_mlp": 0.01193216, "balance_loss_clip": 1.00770116, "balance_loss_mlp": 1.00019252, "epoch": 0.7744844586063849, "flos": 20304371409120.0, "grad_norm": 1.5214659688189924, "language_loss": 0.74989104, "learning_rate": 5.100325980887499e-07, "loss": 0.77531552, "num_input_tokens_seen": 138614785, "step": 6441, "time_per_iteration": 2.7991421222686768 }, { "auxiliary_loss_clip": 0.01308619, "auxiliary_loss_mlp": 0.01193226, "balance_loss_clip": 1.00813961, "balance_loss_mlp": 1.00020206, "epoch": 0.774604701497024, "flos": 22966716931200.0, "grad_norm": 1.711097262559055, "language_loss": 0.83255827, "learning_rate": 5.095130726309116e-07, "loss": 0.85757673, "num_input_tokens_seen": 138634960, "step": 6442, "time_per_iteration": 2.861168384552002 }, { "auxiliary_loss_clip": 0.01318738, "auxiliary_loss_mlp": 0.01192296, "balance_loss_clip": 1.0048871, "balance_loss_mlp": 1.00003517, "epoch": 0.774724944387663, "flos": 60288554139840.0, "grad_norm": 0.7900189359069815, "language_loss": 0.58995849, "learning_rate": 5.089937732787559e-07, "loss": 0.61506879, "num_input_tokens_seen": 138699520, "step": 6443, "time_per_iteration": 3.287811517715454 }, { "auxiliary_loss_clip": 0.01300176, "auxiliary_loss_mlp": 0.01193245, "balance_loss_clip": 1.00822258, "balance_loss_mlp": 1.00022125, "epoch": 0.7748451872783022, "flos": 26761039211520.0, "grad_norm": 2.60887016607916, "language_loss": 0.66534972, "learning_rate": 5.084747001110592e-07, "loss": 0.69028395, "num_input_tokens_seen": 138719145, "step": 6444, "time_per_iteration": 2.8569345474243164 }, { "auxiliary_loss_clip": 0.01324948, "auxiliary_loss_mlp": 0.00872393, "balance_loss_clip": 1.0082314, "balance_loss_mlp": 1.00029397, "epoch": 0.7749654301689413, "flos": 30338641337760.0, "grad_norm": 1.711728655639824, "language_loss": 0.70382452, "learning_rate": 5.07955853206564e-07, "loss": 0.72579789, "num_input_tokens_seen": 138743850, "step": 6445, "time_per_iteration": 2.96863055229187 }, { "auxiliary_loss_clip": 0.01335681, "auxiliary_loss_mlp": 0.01193273, "balance_loss_clip": 1.00779438, "balance_loss_mlp": 1.00024939, "epoch": 0.7750856730595803, "flos": 43179868611360.0, "grad_norm": 2.0528619996536746, "language_loss": 0.71058249, "learning_rate": 5.074372326439807e-07, "loss": 0.73587203, "num_input_tokens_seen": 138766860, "step": 6446, "time_per_iteration": 3.020486354827881 }, { "auxiliary_loss_clip": 0.01289608, "auxiliary_loss_mlp": 0.01193277, "balance_loss_clip": 1.00671244, "balance_loss_mlp": 1.00025368, "epoch": 0.7752059159502195, "flos": 17640050084640.0, "grad_norm": 2.080862786352404, "language_loss": 0.73712087, "learning_rate": 5.069188385019814e-07, "loss": 0.76194972, "num_input_tokens_seen": 138784560, "step": 6447, "time_per_iteration": 2.788471221923828 }, { "auxiliary_loss_clip": 0.0128581, "auxiliary_loss_mlp": 0.01193216, "balance_loss_clip": 1.00699723, "balance_loss_mlp": 1.00019264, "epoch": 0.7753261588408585, "flos": 12677696880480.0, "grad_norm": 2.3514335357669887, "language_loss": 0.6086086, "learning_rate": 5.064006708592077e-07, "loss": 0.63339889, "num_input_tokens_seen": 138800805, "step": 6448, "time_per_iteration": 2.8591983318328857 }, { "auxiliary_loss_clip": 0.01300275, "auxiliary_loss_mlp": 0.01193159, "balance_loss_clip": 1.00701833, "balance_loss_mlp": 1.00013578, "epoch": 0.7754464017314976, "flos": 16690751040960.0, "grad_norm": 2.14079426806313, "language_loss": 0.75719303, "learning_rate": 5.058827297942641e-07, "loss": 0.78212738, "num_input_tokens_seen": 138815910, "step": 6449, "time_per_iteration": 2.7530770301818848 }, { "auxiliary_loss_clip": 0.01308408, "auxiliary_loss_mlp": 0.01193193, "balance_loss_clip": 1.0083766, "balance_loss_mlp": 1.00016892, "epoch": 0.7755666446221368, "flos": 19718954923680.0, "grad_norm": 2.614693892884032, "language_loss": 0.75121093, "learning_rate": 5.053650153857237e-07, "loss": 0.77622694, "num_input_tokens_seen": 138834920, "step": 6450, "time_per_iteration": 2.812612533569336 }, { "auxiliary_loss_clip": 0.01327183, "auxiliary_loss_mlp": 0.011932, "balance_loss_clip": 1.00794959, "balance_loss_mlp": 1.00017679, "epoch": 0.7756868875127758, "flos": 18693631867680.0, "grad_norm": 1.5611433151047085, "language_loss": 0.69851613, "learning_rate": 5.048475277121214e-07, "loss": 0.72371995, "num_input_tokens_seen": 138852135, "step": 6451, "time_per_iteration": 2.7838473320007324 }, { "auxiliary_loss_clip": 0.01327738, "auxiliary_loss_mlp": 0.01193214, "balance_loss_clip": 1.0073154, "balance_loss_mlp": 1.00019073, "epoch": 0.7758071304034149, "flos": 28404204626880.0, "grad_norm": 1.5232884349394888, "language_loss": 0.77092624, "learning_rate": 5.043302668519598e-07, "loss": 0.79613578, "num_input_tokens_seen": 138871470, "step": 6452, "time_per_iteration": 2.8698487281799316 }, { "auxiliary_loss_clip": 0.01336695, "auxiliary_loss_mlp": 0.01193183, "balance_loss_clip": 1.0080111, "balance_loss_mlp": 1.00015926, "epoch": 0.775927373294054, "flos": 20595355315200.0, "grad_norm": 1.7132330951922579, "language_loss": 0.71630961, "learning_rate": 5.038132328837079e-07, "loss": 0.74160844, "num_input_tokens_seen": 138889860, "step": 6453, "time_per_iteration": 2.687307119369507 }, { "auxiliary_loss_clip": 0.0133666, "auxiliary_loss_mlp": 0.0119321, "balance_loss_clip": 1.00856233, "balance_loss_mlp": 1.00018597, "epoch": 0.7760476161846931, "flos": 22526379276480.0, "grad_norm": 1.8488397520569868, "language_loss": 0.73825175, "learning_rate": 5.032964258857993e-07, "loss": 0.7635504, "num_input_tokens_seen": 138909955, "step": 6454, "time_per_iteration": 2.7993240356445312 }, { "auxiliary_loss_clip": 0.01337356, "auxiliary_loss_mlp": 0.01193193, "balance_loss_clip": 1.00786042, "balance_loss_mlp": 1.00016975, "epoch": 0.7761678590753321, "flos": 48651506517600.0, "grad_norm": 1.476886721326797, "language_loss": 0.68344903, "learning_rate": 5.027798459366329e-07, "loss": 0.70875448, "num_input_tokens_seen": 138935320, "step": 6455, "time_per_iteration": 2.926948070526123 }, { "auxiliary_loss_clip": 0.01338204, "auxiliary_loss_mlp": 0.01193211, "balance_loss_clip": 1.00845361, "balance_loss_mlp": 1.00018716, "epoch": 0.7762881019659713, "flos": 26177059673280.0, "grad_norm": 1.797070451097939, "language_loss": 0.63812435, "learning_rate": 5.02263493114573e-07, "loss": 0.6634385, "num_input_tokens_seen": 138957115, "step": 6456, "time_per_iteration": 3.7051846981048584 }, { "auxiliary_loss_clip": 0.01348794, "auxiliary_loss_mlp": 0.01193209, "balance_loss_clip": 1.00786638, "balance_loss_mlp": 1.00018477, "epoch": 0.7764083448566104, "flos": 20588350197600.0, "grad_norm": 2.3132038209807377, "language_loss": 0.76777476, "learning_rate": 5.017473674979502e-07, "loss": 0.79319477, "num_input_tokens_seen": 138973140, "step": 6457, "time_per_iteration": 3.662768840789795 }, { "auxiliary_loss_clip": 0.01255132, "auxiliary_loss_mlp": 0.01192291, "balance_loss_clip": 1.00487614, "balance_loss_mlp": 1.00003004, "epoch": 0.7765285877472494, "flos": 67293111391200.0, "grad_norm": 0.7425764757419967, "language_loss": 0.58378637, "learning_rate": 5.01231469165061e-07, "loss": 0.60826063, "num_input_tokens_seen": 139028965, "step": 6458, "time_per_iteration": 3.254978895187378 }, { "auxiliary_loss_clip": 0.01302712, "auxiliary_loss_mlp": 0.01192286, "balance_loss_clip": 1.0046494, "balance_loss_mlp": 1.00002515, "epoch": 0.7766488306378886, "flos": 61344506885760.0, "grad_norm": 0.843142441977201, "language_loss": 0.56993759, "learning_rate": 5.007157981941663e-07, "loss": 0.59488761, "num_input_tokens_seen": 139094325, "step": 6459, "time_per_iteration": 4.691847562789917 }, { "auxiliary_loss_clip": 0.01288638, "auxiliary_loss_mlp": 0.01192302, "balance_loss_clip": 1.00490975, "balance_loss_mlp": 1.00004101, "epoch": 0.7767690735285276, "flos": 62946229583520.0, "grad_norm": 0.8805121577173891, "language_loss": 0.67489326, "learning_rate": 5.002003546634928e-07, "loss": 0.69970268, "num_input_tokens_seen": 139150425, "step": 6460, "time_per_iteration": 3.2678706645965576 }, { "auxiliary_loss_clip": 0.0126807, "auxiliary_loss_mlp": 0.01193193, "balance_loss_clip": 1.00711691, "balance_loss_mlp": 1.00016928, "epoch": 0.7768893164191667, "flos": 20886411068640.0, "grad_norm": 1.596813045149939, "language_loss": 0.75909066, "learning_rate": 4.996851386512331e-07, "loss": 0.78370333, "num_input_tokens_seen": 139169130, "step": 6461, "time_per_iteration": 2.9338107109069824 }, { "auxiliary_loss_clip": 0.01303603, "auxiliary_loss_mlp": 0.01193293, "balance_loss_clip": 1.0069226, "balance_loss_mlp": 1.0001744, "epoch": 0.7770095593098058, "flos": 20704595522400.0, "grad_norm": 1.622573184469991, "language_loss": 0.83037186, "learning_rate": 4.991701502355444e-07, "loss": 0.85534084, "num_input_tokens_seen": 139189595, "step": 6462, "time_per_iteration": 2.793870449066162 }, { "auxiliary_loss_clip": 0.0133621, "auxiliary_loss_mlp": 0.01193212, "balance_loss_clip": 1.00810027, "balance_loss_mlp": 1.00018883, "epoch": 0.7771298022004449, "flos": 24717721530240.0, "grad_norm": 1.5253811709082274, "language_loss": 0.7563293, "learning_rate": 4.986553894945518e-07, "loss": 0.78162354, "num_input_tokens_seen": 139210805, "step": 6463, "time_per_iteration": 2.7858309745788574 }, { "auxiliary_loss_clip": 0.01265523, "auxiliary_loss_mlp": 0.01193165, "balance_loss_clip": 1.0069232, "balance_loss_mlp": 1.00014114, "epoch": 0.777250045091084, "flos": 25009244291520.0, "grad_norm": 2.1559480207834727, "language_loss": 0.86322057, "learning_rate": 4.981408565063416e-07, "loss": 0.88780743, "num_input_tokens_seen": 139230750, "step": 6464, "time_per_iteration": 2.841526746749878 }, { "auxiliary_loss_clip": 0.01349401, "auxiliary_loss_mlp": 0.01193194, "balance_loss_clip": 1.00822949, "balance_loss_mlp": 1.00016999, "epoch": 0.777370287981723, "flos": 20119897510560.0, "grad_norm": 1.7415810126423121, "language_loss": 0.75864387, "learning_rate": 4.976265513489701e-07, "loss": 0.78406978, "num_input_tokens_seen": 139250720, "step": 6465, "time_per_iteration": 2.758901834487915 }, { "auxiliary_loss_clip": 0.01337035, "auxiliary_loss_mlp": 0.01193206, "balance_loss_clip": 1.00809181, "balance_loss_mlp": 1.00018239, "epoch": 0.7774905308723622, "flos": 21718818161280.0, "grad_norm": 1.7544804780017604, "language_loss": 0.80393368, "learning_rate": 4.971124741004562e-07, "loss": 0.82923603, "num_input_tokens_seen": 139269720, "step": 6466, "time_per_iteration": 2.816648006439209 }, { "auxiliary_loss_clip": 0.01327869, "auxiliary_loss_mlp": 0.01193228, "balance_loss_clip": 1.00738263, "balance_loss_mlp": 1.00020456, "epoch": 0.7776107737630013, "flos": 16034124316320.0, "grad_norm": 1.6651711631547237, "language_loss": 0.76411581, "learning_rate": 4.965986248387846e-07, "loss": 0.78932679, "num_input_tokens_seen": 139288035, "step": 6467, "time_per_iteration": 2.747892379760742 }, { "auxiliary_loss_clip": 0.01314591, "auxiliary_loss_mlp": 0.01193297, "balance_loss_clip": 1.00748277, "balance_loss_mlp": 1.0001781, "epoch": 0.7777310166536403, "flos": 24790907571840.0, "grad_norm": 1.4928056689530094, "language_loss": 0.77183032, "learning_rate": 4.960850036419073e-07, "loss": 0.79690927, "num_input_tokens_seen": 139307135, "step": 6468, "time_per_iteration": 2.8920907974243164 }, { "auxiliary_loss_clip": 0.01300656, "auxiliary_loss_mlp": 0.01193145, "balance_loss_clip": 1.00677943, "balance_loss_mlp": 1.00012159, "epoch": 0.7778512595442795, "flos": 17272539234720.0, "grad_norm": 1.7445775747034729, "language_loss": 0.78565115, "learning_rate": 4.955716105877378e-07, "loss": 0.81058919, "num_input_tokens_seen": 139325905, "step": 6469, "time_per_iteration": 2.798475980758667 }, { "auxiliary_loss_clip": 0.01336197, "auxiliary_loss_mlp": 0.00872456, "balance_loss_clip": 1.00809073, "balance_loss_mlp": 1.00028574, "epoch": 0.7779715024349185, "flos": 17748428123520.0, "grad_norm": 1.5479544803374652, "language_loss": 0.82868505, "learning_rate": 4.950584457541598e-07, "loss": 0.85077155, "num_input_tokens_seen": 139344370, "step": 6470, "time_per_iteration": 2.8710005283355713 }, { "auxiliary_loss_clip": 0.01336663, "auxiliary_loss_mlp": 0.01193326, "balance_loss_clip": 1.0078913, "balance_loss_mlp": 1.00020671, "epoch": 0.7780917453255576, "flos": 24316886714400.0, "grad_norm": 1.3469501397020827, "language_loss": 0.821486, "learning_rate": 4.945455092190183e-07, "loss": 0.8467859, "num_input_tokens_seen": 139365625, "step": 6471, "time_per_iteration": 2.910954713821411 }, { "auxiliary_loss_clip": 0.01318571, "auxiliary_loss_mlp": 0.01192298, "balance_loss_clip": 1.00490355, "balance_loss_mlp": 1.00003707, "epoch": 0.7782119882161967, "flos": 56364636134880.0, "grad_norm": 0.7130495479047158, "language_loss": 0.56014591, "learning_rate": 4.940328010601271e-07, "loss": 0.58525461, "num_input_tokens_seen": 139430540, "step": 6472, "time_per_iteration": 3.3469624519348145 }, { "auxiliary_loss_clip": 0.01300381, "auxiliary_loss_mlp": 0.01193358, "balance_loss_clip": 1.00830388, "balance_loss_mlp": 1.00023937, "epoch": 0.7783322311068358, "flos": 46790004382560.0, "grad_norm": 1.8146926236112917, "language_loss": 0.76869035, "learning_rate": 4.935203213552621e-07, "loss": 0.79362774, "num_input_tokens_seen": 139454280, "step": 6473, "time_per_iteration": 3.0176641941070557 }, { "auxiliary_loss_clip": 0.01313232, "auxiliary_loss_mlp": 0.01193215, "balance_loss_clip": 1.00824785, "balance_loss_mlp": 1.00019169, "epoch": 0.7784524739974749, "flos": 19057873662720.0, "grad_norm": 3.11988137688107, "language_loss": 0.66659069, "learning_rate": 4.930080701821662e-07, "loss": 0.69165516, "num_input_tokens_seen": 139471745, "step": 6474, "time_per_iteration": 2.7522385120391846 }, { "auxiliary_loss_clip": 0.01311706, "auxiliary_loss_mlp": 0.01193199, "balance_loss_clip": 1.00736952, "balance_loss_mlp": 1.00017488, "epoch": 0.778572716888114, "flos": 24791123113920.0, "grad_norm": 1.7887922775300806, "language_loss": 0.77018595, "learning_rate": 4.92496047618548e-07, "loss": 0.79523498, "num_input_tokens_seen": 139491505, "step": 6475, "time_per_iteration": 2.865504026412964 }, { "auxiliary_loss_clip": 0.01327891, "auxiliary_loss_mlp": 0.01193201, "balance_loss_clip": 1.00788081, "balance_loss_mlp": 1.00017762, "epoch": 0.7786929597787531, "flos": 20078095556160.0, "grad_norm": 1.8058225150313005, "language_loss": 0.77813005, "learning_rate": 4.919842537420811e-07, "loss": 0.80334097, "num_input_tokens_seen": 139508620, "step": 6476, "time_per_iteration": 2.7623276710510254 }, { "auxiliary_loss_clip": 0.01312317, "auxiliary_loss_mlp": 0.01193195, "balance_loss_clip": 1.00792456, "balance_loss_mlp": 1.00017154, "epoch": 0.7788132026693921, "flos": 21872231285760.0, "grad_norm": 1.6189684140247493, "language_loss": 0.79195505, "learning_rate": 4.91472688630404e-07, "loss": 0.81701022, "num_input_tokens_seen": 139529360, "step": 6477, "time_per_iteration": 2.931445837020874 }, { "auxiliary_loss_clip": 0.01347926, "auxiliary_loss_mlp": 0.011932, "balance_loss_clip": 1.00782609, "balance_loss_mlp": 1.00017619, "epoch": 0.7789334455600313, "flos": 11181945335040.0, "grad_norm": 1.60338897224765, "language_loss": 0.74245572, "learning_rate": 4.909613523611202e-07, "loss": 0.76786697, "num_input_tokens_seen": 139546240, "step": 6478, "time_per_iteration": 2.7815659046173096 }, { "auxiliary_loss_clip": 0.01275065, "auxiliary_loss_mlp": 0.00872506, "balance_loss_clip": 1.00776219, "balance_loss_mlp": 1.00035524, "epoch": 0.7790536884506704, "flos": 28695439998720.0, "grad_norm": 1.8575431601473882, "language_loss": 0.74617797, "learning_rate": 4.904502450117991e-07, "loss": 0.7676537, "num_input_tokens_seen": 139567200, "step": 6479, "time_per_iteration": 3.0553689002990723 }, { "auxiliary_loss_clip": 0.01301641, "auxiliary_loss_mlp": 0.01193176, "balance_loss_clip": 1.00785661, "balance_loss_mlp": 1.00015211, "epoch": 0.7791739313413094, "flos": 11072309967360.0, "grad_norm": 2.255957587594945, "language_loss": 0.72344804, "learning_rate": 4.899393666599762e-07, "loss": 0.74839616, "num_input_tokens_seen": 139583775, "step": 6480, "time_per_iteration": 3.0629541873931885 }, { "auxiliary_loss_clip": 0.01348855, "auxiliary_loss_mlp": 0.01193189, "balance_loss_clip": 1.00762701, "balance_loss_mlp": 1.00016499, "epoch": 0.7792941742319486, "flos": 14679284454720.0, "grad_norm": 2.0974366813109677, "language_loss": 0.72557449, "learning_rate": 4.894287173831506e-07, "loss": 0.75099492, "num_input_tokens_seen": 139599735, "step": 6481, "time_per_iteration": 2.767090320587158 }, { "auxiliary_loss_clip": 0.01313033, "auxiliary_loss_mlp": 0.01193183, "balance_loss_clip": 1.00737751, "balance_loss_mlp": 1.00015879, "epoch": 0.7794144171225876, "flos": 23258886318720.0, "grad_norm": 3.5123647936199753, "language_loss": 0.84652364, "learning_rate": 4.889182972587877e-07, "loss": 0.87158585, "num_input_tokens_seen": 139619030, "step": 6482, "time_per_iteration": 2.983046293258667 }, { "auxiliary_loss_clip": 0.01311123, "auxiliary_loss_mlp": 0.01193194, "balance_loss_clip": 1.00858152, "balance_loss_mlp": 1.00017071, "epoch": 0.7795346600132267, "flos": 21507091398720.0, "grad_norm": 1.6973066368373082, "language_loss": 0.66256219, "learning_rate": 4.884081063643177e-07, "loss": 0.68760538, "num_input_tokens_seen": 139637690, "step": 6483, "time_per_iteration": 4.7464659214019775 }, { "auxiliary_loss_clip": 0.01272407, "auxiliary_loss_mlp": 0.01192285, "balance_loss_clip": 1.00508201, "balance_loss_mlp": 1.00002384, "epoch": 0.7796549029038659, "flos": 70052303450880.0, "grad_norm": 0.8514534852777909, "language_loss": 0.5258137, "learning_rate": 4.878981447771353e-07, "loss": 0.55046064, "num_input_tokens_seen": 139692070, "step": 6484, "time_per_iteration": 3.3643674850463867 }, { "auxiliary_loss_clip": 0.01290329, "auxiliary_loss_mlp": 0.01193186, "balance_loss_clip": 1.00742292, "balance_loss_mlp": 1.00016212, "epoch": 0.7797751457945049, "flos": 23989417558560.0, "grad_norm": 1.4450429644260097, "language_loss": 0.73022842, "learning_rate": 4.873884125746035e-07, "loss": 0.75506353, "num_input_tokens_seen": 139713745, "step": 6485, "time_per_iteration": 5.030500411987305 }, { "auxiliary_loss_clip": 0.01314592, "auxiliary_loss_mlp": 0.01193202, "balance_loss_clip": 1.00729895, "balance_loss_mlp": 1.00017798, "epoch": 0.779895388685144, "flos": 22674763085760.0, "grad_norm": 2.248347590572155, "language_loss": 0.71872354, "learning_rate": 4.868789098340456e-07, "loss": 0.74380147, "num_input_tokens_seen": 139731650, "step": 6486, "time_per_iteration": 2.819617509841919 }, { "auxiliary_loss_clip": 0.01290936, "auxiliary_loss_mlp": 0.01193289, "balance_loss_clip": 1.00646067, "balance_loss_mlp": 1.00016952, "epoch": 0.7800156315757831, "flos": 23768709876000.0, "grad_norm": 2.4688934096948256, "language_loss": 0.72808236, "learning_rate": 4.863696366327543e-07, "loss": 0.75292462, "num_input_tokens_seen": 139750820, "step": 6487, "time_per_iteration": 2.974867343902588 }, { "auxiliary_loss_clip": 0.01335911, "auxiliary_loss_mlp": 0.01193213, "balance_loss_clip": 1.00754094, "balance_loss_mlp": 1.0001893, "epoch": 0.7801358744664222, "flos": 26429726221920.0, "grad_norm": 1.6393530798207545, "language_loss": 0.77881402, "learning_rate": 4.85860593047986e-07, "loss": 0.80410528, "num_input_tokens_seen": 139770885, "step": 6488, "time_per_iteration": 2.7574357986450195 }, { "auxiliary_loss_clip": 0.01302966, "auxiliary_loss_mlp": 0.01193226, "balance_loss_clip": 1.00728965, "balance_loss_mlp": 1.00020218, "epoch": 0.7802561173570612, "flos": 26322174427680.0, "grad_norm": 1.5587886095749506, "language_loss": 0.7435813, "learning_rate": 4.853517791569613e-07, "loss": 0.76854312, "num_input_tokens_seen": 139793065, "step": 6489, "time_per_iteration": 2.962968349456787 }, { "auxiliary_loss_clip": 0.01314611, "auxiliary_loss_mlp": 0.00872515, "balance_loss_clip": 1.00719321, "balance_loss_mlp": 1.00033617, "epoch": 0.7803763602477004, "flos": 40333767664320.0, "grad_norm": 1.6901768792876481, "language_loss": 0.65988034, "learning_rate": 4.848431950368684e-07, "loss": 0.68175161, "num_input_tokens_seen": 139815625, "step": 6490, "time_per_iteration": 2.892335891723633 }, { "auxiliary_loss_clip": 0.01318137, "auxiliary_loss_mlp": 0.00871855, "balance_loss_clip": 1.00454044, "balance_loss_mlp": 1.00002277, "epoch": 0.7804966031383395, "flos": 67001480858880.0, "grad_norm": 0.7026004593634482, "language_loss": 0.55766839, "learning_rate": 4.843348407648569e-07, "loss": 0.57956833, "num_input_tokens_seen": 139876905, "step": 6491, "time_per_iteration": 3.206725835800171 }, { "auxiliary_loss_clip": 0.01337117, "auxiliary_loss_mlp": 0.01193188, "balance_loss_clip": 1.00731468, "balance_loss_mlp": 1.00016427, "epoch": 0.7806168460289785, "flos": 17740740456000.0, "grad_norm": 2.2697901185630616, "language_loss": 0.83289337, "learning_rate": 4.838267164180457e-07, "loss": 0.85819638, "num_input_tokens_seen": 139892575, "step": 6492, "time_per_iteration": 2.7804365158081055 }, { "auxiliary_loss_clip": 0.01350438, "auxiliary_loss_mlp": 0.01193234, "balance_loss_clip": 1.0083518, "balance_loss_mlp": 1.0002104, "epoch": 0.7807370889196176, "flos": 23946250504320.0, "grad_norm": 1.8281564888754789, "language_loss": 0.83592123, "learning_rate": 4.833188220735156e-07, "loss": 0.86135793, "num_input_tokens_seen": 139912245, "step": 6493, "time_per_iteration": 2.7620785236358643 }, { "auxiliary_loss_clip": 0.0132744, "auxiliary_loss_mlp": 0.01193197, "balance_loss_clip": 1.00715542, "balance_loss_mlp": 1.00017285, "epoch": 0.7808573318102567, "flos": 18989034386400.0, "grad_norm": 1.997099189901727, "language_loss": 0.74931812, "learning_rate": 4.828111578083152e-07, "loss": 0.77452445, "num_input_tokens_seen": 139929150, "step": 6494, "time_per_iteration": 2.7845985889434814 }, { "auxiliary_loss_clip": 0.01300503, "auxiliary_loss_mlp": 0.01193177, "balance_loss_clip": 1.0069263, "balance_loss_mlp": 1.00015306, "epoch": 0.7809775747008958, "flos": 23980759951680.0, "grad_norm": 1.924996245250423, "language_loss": 0.81161892, "learning_rate": 4.823037236994556e-07, "loss": 0.83655572, "num_input_tokens_seen": 139947315, "step": 6495, "time_per_iteration": 2.772243022918701 }, { "auxiliary_loss_clip": 0.01306378, "auxiliary_loss_mlp": 0.01192302, "balance_loss_clip": 1.00480461, "balance_loss_mlp": 1.0000416, "epoch": 0.7810978175915348, "flos": 68535908998560.0, "grad_norm": 0.7144576039193478, "language_loss": 0.56336129, "learning_rate": 4.817965198239136e-07, "loss": 0.58834809, "num_input_tokens_seen": 140013775, "step": 6496, "time_per_iteration": 3.298542022705078 }, { "auxiliary_loss_clip": 0.01303483, "auxiliary_loss_mlp": 0.01193286, "balance_loss_clip": 1.0075376, "balance_loss_mlp": 1.00016713, "epoch": 0.781218060482174, "flos": 19642140590400.0, "grad_norm": 2.0261862828547312, "language_loss": 0.74472219, "learning_rate": 4.812895462586331e-07, "loss": 0.76968992, "num_input_tokens_seen": 140031600, "step": 6497, "time_per_iteration": 2.806786298751831 }, { "auxiliary_loss_clip": 0.01291729, "auxiliary_loss_mlp": 0.01193192, "balance_loss_clip": 1.0070107, "balance_loss_mlp": 1.00016868, "epoch": 0.7813383033728131, "flos": 25627876971840.0, "grad_norm": 1.7122848990623227, "language_loss": 0.81792998, "learning_rate": 4.807828030805207e-07, "loss": 0.84277928, "num_input_tokens_seen": 140050590, "step": 6498, "time_per_iteration": 2.7909324169158936 }, { "auxiliary_loss_clip": 0.01325157, "auxiliary_loss_mlp": 0.01193201, "balance_loss_clip": 1.00783038, "balance_loss_mlp": 1.00017703, "epoch": 0.7814585462634521, "flos": 20485935489600.0, "grad_norm": 1.8141244910146803, "language_loss": 0.67638028, "learning_rate": 4.802762903664495e-07, "loss": 0.70156384, "num_input_tokens_seen": 140069770, "step": 6499, "time_per_iteration": 2.7221875190734863 }, { "auxiliary_loss_clip": 0.01307688, "auxiliary_loss_mlp": 0.01193212, "balance_loss_clip": 1.00739026, "balance_loss_mlp": 1.00018883, "epoch": 0.7815787891540913, "flos": 22304306494080.0, "grad_norm": 2.127363927717517, "language_loss": 0.73711026, "learning_rate": 4.797700081932565e-07, "loss": 0.76211929, "num_input_tokens_seen": 140087635, "step": 6500, "time_per_iteration": 2.6366686820983887 }, { "auxiliary_loss_clip": 0.01262902, "auxiliary_loss_mlp": 0.01193192, "balance_loss_clip": 1.00686884, "balance_loss_mlp": 1.00016844, "epoch": 0.7816990320447303, "flos": 22600678952160.0, "grad_norm": 2.2115651815263884, "language_loss": 0.81633312, "learning_rate": 4.792639566377442e-07, "loss": 0.84089404, "num_input_tokens_seen": 140105045, "step": 6501, "time_per_iteration": 2.8157477378845215 }, { "auxiliary_loss_clip": 0.01336871, "auxiliary_loss_mlp": 0.01193186, "balance_loss_clip": 1.00795031, "balance_loss_mlp": 1.00016212, "epoch": 0.7818192749353694, "flos": 24935986402560.0, "grad_norm": 1.6625512888618643, "language_loss": 0.77599525, "learning_rate": 4.78758135776681e-07, "loss": 0.80129582, "num_input_tokens_seen": 140124900, "step": 6502, "time_per_iteration": 2.9523186683654785 }, { "auxiliary_loss_clip": 0.01305328, "auxiliary_loss_mlp": 0.01193215, "balance_loss_clip": 1.00718141, "balance_loss_mlp": 1.00019085, "epoch": 0.7819395178260086, "flos": 23733050870880.0, "grad_norm": 2.7601722928794983, "language_loss": 0.78746879, "learning_rate": 4.782525456867989e-07, "loss": 0.81245422, "num_input_tokens_seen": 140143755, "step": 6503, "time_per_iteration": 2.665591239929199 }, { "auxiliary_loss_clip": 0.01274413, "auxiliary_loss_mlp": 0.01193243, "balance_loss_clip": 1.00726151, "balance_loss_mlp": 1.00021935, "epoch": 0.7820597607166476, "flos": 23221682595360.0, "grad_norm": 1.5198690573405964, "language_loss": 0.83296943, "learning_rate": 4.777471864447959e-07, "loss": 0.85764599, "num_input_tokens_seen": 140164495, "step": 6504, "time_per_iteration": 2.718425989151001 }, { "auxiliary_loss_clip": 0.01326508, "auxiliary_loss_mlp": 0.0119329, "balance_loss_clip": 1.00835419, "balance_loss_mlp": 1.00017047, "epoch": 0.7821800036072867, "flos": 22309551351360.0, "grad_norm": 1.8415290574332366, "language_loss": 0.80379933, "learning_rate": 4.772420581273344e-07, "loss": 0.82899731, "num_input_tokens_seen": 140181980, "step": 6505, "time_per_iteration": 2.8198695182800293 }, { "auxiliary_loss_clip": 0.01323377, "auxiliary_loss_mlp": 0.01193179, "balance_loss_clip": 1.00767183, "balance_loss_mlp": 1.00015569, "epoch": 0.7823002464979258, "flos": 21544187351040.0, "grad_norm": 2.1413376211260573, "language_loss": 0.7650128, "learning_rate": 4.7673716081104134e-07, "loss": 0.79017836, "num_input_tokens_seen": 140202155, "step": 6506, "time_per_iteration": 2.7020063400268555 }, { "auxiliary_loss_clip": 0.01324227, "auxiliary_loss_mlp": 0.0119319, "balance_loss_clip": 1.00790763, "balance_loss_mlp": 1.00016618, "epoch": 0.7824204893885649, "flos": 24535654518240.0, "grad_norm": 1.7695519944995353, "language_loss": 0.841398, "learning_rate": 4.762324945725109e-07, "loss": 0.8665722, "num_input_tokens_seen": 140221600, "step": 6507, "time_per_iteration": 2.900245428085327 }, { "auxiliary_loss_clip": 0.01302071, "auxiliary_loss_mlp": 0.01193197, "balance_loss_clip": 1.00768375, "balance_loss_mlp": 1.00017333, "epoch": 0.782540732279204, "flos": 27415223125920.0, "grad_norm": 1.6427744546676606, "language_loss": 0.76075578, "learning_rate": 4.7572805948829844e-07, "loss": 0.78570849, "num_input_tokens_seen": 140241860, "step": 6508, "time_per_iteration": 3.8225390911102295 }, { "auxiliary_loss_clip": 0.0127625, "auxiliary_loss_mlp": 0.01193182, "balance_loss_clip": 1.00628424, "balance_loss_mlp": 1.00015855, "epoch": 0.7826609751698431, "flos": 24353228269440.0, "grad_norm": 2.2920736949497393, "language_loss": 0.71128327, "learning_rate": 4.7522385563492795e-07, "loss": 0.73597753, "num_input_tokens_seen": 140262160, "step": 6509, "time_per_iteration": 2.855153799057007 }, { "auxiliary_loss_clip": 0.01293277, "auxiliary_loss_mlp": 0.01193248, "balance_loss_clip": 1.0075556, "balance_loss_mlp": 1.00022411, "epoch": 0.7827812180604822, "flos": 23988555390240.0, "grad_norm": 1.8546811485961374, "language_loss": 0.70111978, "learning_rate": 4.747198830888863e-07, "loss": 0.72598505, "num_input_tokens_seen": 140282030, "step": 6510, "time_per_iteration": 5.143319129943848 }, { "auxiliary_loss_clip": 0.01312347, "auxiliary_loss_mlp": 0.01193165, "balance_loss_clip": 1.00748658, "balance_loss_mlp": 1.00014126, "epoch": 0.7829014609511212, "flos": 27454330804320.0, "grad_norm": 1.796005469377548, "language_loss": 0.68388486, "learning_rate": 4.742161419266251e-07, "loss": 0.70894003, "num_input_tokens_seen": 140301190, "step": 6511, "time_per_iteration": 3.80056095123291 }, { "auxiliary_loss_clip": 0.0133653, "auxiliary_loss_mlp": 0.01193234, "balance_loss_clip": 1.00808001, "balance_loss_mlp": 1.00021017, "epoch": 0.7830217038417604, "flos": 29204545082400.0, "grad_norm": 2.662497938590638, "language_loss": 0.65227443, "learning_rate": 4.7371263222456304e-07, "loss": 0.67757207, "num_input_tokens_seen": 140318510, "step": 6512, "time_per_iteration": 2.8450944423675537 }, { "auxiliary_loss_clip": 0.01295726, "auxiliary_loss_mlp": 0.01192298, "balance_loss_clip": 1.00385475, "balance_loss_mlp": 1.00003719, "epoch": 0.7831419467323995, "flos": 60950928653280.0, "grad_norm": 0.7966385344163136, "language_loss": 0.61394715, "learning_rate": 4.7320935405908004e-07, "loss": 0.63882738, "num_input_tokens_seen": 140379380, "step": 6513, "time_per_iteration": 3.2816274166107178 }, { "auxiliary_loss_clip": 0.01349746, "auxiliary_loss_mlp": 0.01193234, "balance_loss_clip": 1.00800693, "balance_loss_mlp": 1.00020981, "epoch": 0.7832621896230385, "flos": 19682541521280.0, "grad_norm": 2.065973922607567, "language_loss": 0.83980596, "learning_rate": 4.7270630750652475e-07, "loss": 0.86523581, "num_input_tokens_seen": 140395335, "step": 6514, "time_per_iteration": 2.6586503982543945 }, { "auxiliary_loss_clip": 0.01326512, "auxiliary_loss_mlp": 0.01193067, "balance_loss_clip": 1.00698256, "balance_loss_mlp": 1.00013876, "epoch": 0.7833824325136777, "flos": 25009244291520.0, "grad_norm": 1.5998884522196226, "language_loss": 0.80400085, "learning_rate": 4.7220349264320746e-07, "loss": 0.82919669, "num_input_tokens_seen": 140414420, "step": 6515, "time_per_iteration": 2.7687013149261475 }, { "auxiliary_loss_clip": 0.01300382, "auxiliary_loss_mlp": 0.01192292, "balance_loss_clip": 1.00479031, "balance_loss_mlp": 1.00003099, "epoch": 0.7835026754043167, "flos": 68800178895840.0, "grad_norm": 0.7338734611137808, "language_loss": 0.54921532, "learning_rate": 4.71700909545407e-07, "loss": 0.57414204, "num_input_tokens_seen": 140477365, "step": 6516, "time_per_iteration": 3.32578182220459 }, { "auxiliary_loss_clip": 0.01336336, "auxiliary_loss_mlp": 0.01193149, "balance_loss_clip": 1.00794172, "balance_loss_mlp": 1.00012541, "epoch": 0.7836229182949558, "flos": 19864608533280.0, "grad_norm": 1.9997104263710899, "language_loss": 0.77062738, "learning_rate": 4.711985582893627e-07, "loss": 0.79592228, "num_input_tokens_seen": 140495885, "step": 6517, "time_per_iteration": 2.8490757942199707 }, { "auxiliary_loss_clip": 0.01277832, "auxiliary_loss_mlp": 0.01193204, "balance_loss_clip": 1.00712895, "balance_loss_mlp": 1.00018001, "epoch": 0.783743161185595, "flos": 22965854762880.0, "grad_norm": 1.5859170838738208, "language_loss": 0.71547717, "learning_rate": 4.706964389512811e-07, "loss": 0.74018747, "num_input_tokens_seen": 140515920, "step": 6518, "time_per_iteration": 2.995028018951416 }, { "auxiliary_loss_clip": 0.01347869, "auxiliary_loss_mlp": 0.01193187, "balance_loss_clip": 1.00813842, "balance_loss_mlp": 1.00016367, "epoch": 0.783863404076234, "flos": 12458497992480.0, "grad_norm": 1.7543295578693983, "language_loss": 0.87501806, "learning_rate": 4.701945516073345e-07, "loss": 0.90042859, "num_input_tokens_seen": 140533395, "step": 6519, "time_per_iteration": 2.92244815826416 }, { "auxiliary_loss_clip": 0.01280497, "auxiliary_loss_mlp": 0.01193195, "balance_loss_clip": 1.00626349, "balance_loss_mlp": 1.00017178, "epoch": 0.7839836469668731, "flos": 24243952138560.0, "grad_norm": 1.7409049179144633, "language_loss": 0.75387973, "learning_rate": 4.696928963336577e-07, "loss": 0.77861667, "num_input_tokens_seen": 140552825, "step": 6520, "time_per_iteration": 2.9869542121887207 }, { "auxiliary_loss_clip": 0.01295778, "auxiliary_loss_mlp": 0.01192317, "balance_loss_clip": 1.00376427, "balance_loss_mlp": 1.00005674, "epoch": 0.7841038898575122, "flos": 62122013089920.0, "grad_norm": 0.8496598518904398, "language_loss": 0.61007535, "learning_rate": 4.6919147320635224e-07, "loss": 0.63495624, "num_input_tokens_seen": 140615535, "step": 6521, "time_per_iteration": 3.3180992603302 }, { "auxiliary_loss_clip": 0.01336843, "auxiliary_loss_mlp": 0.01193219, "balance_loss_clip": 1.00812674, "balance_loss_mlp": 1.0001955, "epoch": 0.7842241327481513, "flos": 20193909796800.0, "grad_norm": 2.0689689883373976, "language_loss": 0.73143959, "learning_rate": 4.6869028230148286e-07, "loss": 0.75674021, "num_input_tokens_seen": 140633330, "step": 6522, "time_per_iteration": 2.851285934448242 }, { "auxiliary_loss_clip": 0.01301699, "auxiliary_loss_mlp": 0.01193219, "balance_loss_clip": 1.00706947, "balance_loss_mlp": 1.0001955, "epoch": 0.7843443756387903, "flos": 28074544126560.0, "grad_norm": 2.786740806133556, "language_loss": 0.60223269, "learning_rate": 4.6818932369507957e-07, "loss": 0.62718189, "num_input_tokens_seen": 140652830, "step": 6523, "time_per_iteration": 2.9055888652801514 }, { "auxiliary_loss_clip": 0.01327116, "auxiliary_loss_mlp": 0.01193217, "balance_loss_clip": 1.00769579, "balance_loss_mlp": 1.00019336, "epoch": 0.7844646185294295, "flos": 21323407821120.0, "grad_norm": 1.955071869961186, "language_loss": 0.88710403, "learning_rate": 4.676885974631386e-07, "loss": 0.91230738, "num_input_tokens_seen": 140671190, "step": 6524, "time_per_iteration": 2.7407360076904297 }, { "auxiliary_loss_clip": 0.01327417, "auxiliary_loss_mlp": 0.0119337, "balance_loss_clip": 1.00740325, "balance_loss_mlp": 1.00025117, "epoch": 0.7845848614200686, "flos": 23656595774400.0, "grad_norm": 1.7886915510823551, "language_loss": 0.81040204, "learning_rate": 4.67188103681619e-07, "loss": 0.83560997, "num_input_tokens_seen": 140690975, "step": 6525, "time_per_iteration": 2.7738990783691406 }, { "auxiliary_loss_clip": 0.01324808, "auxiliary_loss_mlp": 0.00872455, "balance_loss_clip": 1.00814617, "balance_loss_mlp": 1.00028753, "epoch": 0.7847051043107076, "flos": 23402204889120.0, "grad_norm": 2.4568485744237867, "language_loss": 0.69135416, "learning_rate": 4.666878424264453e-07, "loss": 0.71332681, "num_input_tokens_seen": 140710930, "step": 6526, "time_per_iteration": 2.7926175594329834 }, { "auxiliary_loss_clip": 0.01322553, "auxiliary_loss_mlp": 0.01193176, "balance_loss_clip": 1.00828791, "balance_loss_mlp": 1.00015223, "epoch": 0.7848253472013467, "flos": 19022286504960.0, "grad_norm": 1.5899846678688736, "language_loss": 0.73841661, "learning_rate": 4.661878137735069e-07, "loss": 0.76357388, "num_input_tokens_seen": 140729120, "step": 6527, "time_per_iteration": 2.722956418991089 }, { "auxiliary_loss_clip": 0.01305706, "auxiliary_loss_mlp": 0.01193186, "balance_loss_clip": 1.00677252, "balance_loss_mlp": 1.00016236, "epoch": 0.7849455900919858, "flos": 21179191158720.0, "grad_norm": 1.8583754281993314, "language_loss": 0.75000453, "learning_rate": 4.656880177986571e-07, "loss": 0.77499342, "num_input_tokens_seen": 140747665, "step": 6528, "time_per_iteration": 2.7519166469573975 }, { "auxiliary_loss_clip": 0.01320021, "auxiliary_loss_mlp": 0.01193444, "balance_loss_clip": 1.00779629, "balance_loss_mlp": 1.00022948, "epoch": 0.7850658329826249, "flos": 19536492751200.0, "grad_norm": 1.8575994213669782, "language_loss": 0.8145355, "learning_rate": 4.6518845457771607e-07, "loss": 0.83967006, "num_input_tokens_seen": 140766525, "step": 6529, "time_per_iteration": 2.8484253883361816 }, { "auxiliary_loss_clip": 0.01337054, "auxiliary_loss_mlp": 0.00872352, "balance_loss_clip": 1.00810289, "balance_loss_mlp": 1.00035739, "epoch": 0.7851860758732639, "flos": 12495342479040.0, "grad_norm": 1.778263794386599, "language_loss": 0.79264176, "learning_rate": 4.646891241864652e-07, "loss": 0.81473577, "num_input_tokens_seen": 140785090, "step": 6530, "time_per_iteration": 2.6099534034729004 }, { "auxiliary_loss_clip": 0.01337769, "auxiliary_loss_mlp": 0.01193219, "balance_loss_clip": 1.00778866, "balance_loss_mlp": 1.0001955, "epoch": 0.7853063187639031, "flos": 22960969142400.0, "grad_norm": 2.0626885399423562, "language_loss": 0.73161435, "learning_rate": 4.6419002670065397e-07, "loss": 0.75692427, "num_input_tokens_seen": 140804670, "step": 6531, "time_per_iteration": 2.6978278160095215 }, { "auxiliary_loss_clip": 0.01288753, "auxiliary_loss_mlp": 0.01193204, "balance_loss_clip": 1.00663567, "balance_loss_mlp": 1.00018072, "epoch": 0.7854265616545422, "flos": 17347269994560.0, "grad_norm": 1.9004811494001108, "language_loss": 0.86621112, "learning_rate": 4.6369116219599445e-07, "loss": 0.89103067, "num_input_tokens_seen": 140820655, "step": 6532, "time_per_iteration": 2.7983407974243164 }, { "auxiliary_loss_clip": 0.01291011, "auxiliary_loss_mlp": 0.01193084, "balance_loss_clip": 1.00725245, "balance_loss_mlp": 1.00015533, "epoch": 0.7855468045451812, "flos": 23838303549600.0, "grad_norm": 1.6910598505931014, "language_loss": 0.79225981, "learning_rate": 4.631925307481637e-07, "loss": 0.81710076, "num_input_tokens_seen": 140840470, "step": 6533, "time_per_iteration": 2.8472650051116943 }, { "auxiliary_loss_clip": 0.01306477, "auxiliary_loss_mlp": 0.01193194, "balance_loss_clip": 1.00706744, "balance_loss_mlp": 1.00017011, "epoch": 0.7856670474358204, "flos": 25666805031840.0, "grad_norm": 2.0184800976777493, "language_loss": 0.75152266, "learning_rate": 4.6269413243280533e-07, "loss": 0.77651936, "num_input_tokens_seen": 140859890, "step": 6534, "time_per_iteration": 3.827223777770996 }, { "auxiliary_loss_clip": 0.01313784, "auxiliary_loss_mlp": 0.01193222, "balance_loss_clip": 1.00840986, "balance_loss_mlp": 1.00019848, "epoch": 0.7857872903264594, "flos": 18144664708320.0, "grad_norm": 2.2553028438084954, "language_loss": 0.74263865, "learning_rate": 4.621959673255236e-07, "loss": 0.76770872, "num_input_tokens_seen": 140876190, "step": 6535, "time_per_iteration": 3.6610217094421387 }, { "auxiliary_loss_clip": 0.01275774, "auxiliary_loss_mlp": 0.01193223, "balance_loss_clip": 1.00723338, "balance_loss_mlp": 1.00019968, "epoch": 0.7859075332170985, "flos": 14386145127840.0, "grad_norm": 1.9670841836591293, "language_loss": 0.90414143, "learning_rate": 4.6169803550189135e-07, "loss": 0.9288314, "num_input_tokens_seen": 140891885, "step": 6536, "time_per_iteration": 3.8072774410247803 }, { "auxiliary_loss_clip": 0.01266106, "auxiliary_loss_mlp": 0.01193338, "balance_loss_clip": 1.00757694, "balance_loss_mlp": 1.00021935, "epoch": 0.7860277761077377, "flos": 19864069678080.0, "grad_norm": 2.075417126921998, "language_loss": 0.77315152, "learning_rate": 4.6120033703744355e-07, "loss": 0.79774594, "num_input_tokens_seen": 140910780, "step": 6537, "time_per_iteration": 2.956549882888794 }, { "auxiliary_loss_clip": 0.01314708, "auxiliary_loss_mlp": 0.01193205, "balance_loss_clip": 1.00756931, "balance_loss_mlp": 1.00018167, "epoch": 0.7861480189983767, "flos": 26396186713920.0, "grad_norm": 2.1181712490658726, "language_loss": 0.78287208, "learning_rate": 4.607028720076822e-07, "loss": 0.80795121, "num_input_tokens_seen": 140927460, "step": 6538, "time_per_iteration": 3.8323473930358887 }, { "auxiliary_loss_clip": 0.01327208, "auxiliary_loss_mlp": 0.01193149, "balance_loss_clip": 1.00764608, "balance_loss_mlp": 1.00012517, "epoch": 0.7862682618890158, "flos": 24236587784160.0, "grad_norm": 1.8343832546780359, "language_loss": 0.73469126, "learning_rate": 4.6020564048807074e-07, "loss": 0.75989485, "num_input_tokens_seen": 140945135, "step": 6539, "time_per_iteration": 2.7327053546905518 }, { "auxiliary_loss_clip": 0.01328344, "auxiliary_loss_mlp": 0.01193195, "balance_loss_clip": 1.00779986, "balance_loss_mlp": 1.00017095, "epoch": 0.7863885047796549, "flos": 47551524549120.0, "grad_norm": 1.9625305186984285, "language_loss": 0.71953094, "learning_rate": 4.5970864255403883e-07, "loss": 0.74474633, "num_input_tokens_seen": 140966660, "step": 6540, "time_per_iteration": 2.9595160484313965 }, { "auxiliary_loss_clip": 0.01335453, "auxiliary_loss_mlp": 0.01193172, "balance_loss_clip": 1.00804055, "balance_loss_mlp": 1.00014853, "epoch": 0.786508747670294, "flos": 24389246511360.0, "grad_norm": 1.986291697969166, "language_loss": 0.81750631, "learning_rate": 4.59211878280982e-07, "loss": 0.84279251, "num_input_tokens_seen": 140986175, "step": 6541, "time_per_iteration": 2.716829299926758 }, { "auxiliary_loss_clip": 0.0131206, "auxiliary_loss_mlp": 0.01193182, "balance_loss_clip": 1.00747204, "balance_loss_mlp": 1.00015855, "epoch": 0.786628990560933, "flos": 18041244137280.0, "grad_norm": 1.9623304960216859, "language_loss": 0.70030355, "learning_rate": 4.587153477442578e-07, "loss": 0.72535592, "num_input_tokens_seen": 141002490, "step": 6542, "time_per_iteration": 2.8025355339050293 }, { "auxiliary_loss_clip": 0.0134978, "auxiliary_loss_mlp": 0.01193316, "balance_loss_clip": 1.0082643, "balance_loss_mlp": 1.00019741, "epoch": 0.7867492334515722, "flos": 25848872043840.0, "grad_norm": 1.990614692982548, "language_loss": 0.81334639, "learning_rate": 4.582190510191899e-07, "loss": 0.83877736, "num_input_tokens_seen": 141021150, "step": 6543, "time_per_iteration": 2.7170464992523193 }, { "auxiliary_loss_clip": 0.01280496, "auxiliary_loss_mlp": 0.01193163, "balance_loss_clip": 1.00633061, "balance_loss_mlp": 1.00023508, "epoch": 0.7868694763422113, "flos": 16580828283840.0, "grad_norm": 2.0203809593579938, "language_loss": 0.8749088, "learning_rate": 4.5772298818106625e-07, "loss": 0.89964545, "num_input_tokens_seen": 141036940, "step": 6544, "time_per_iteration": 2.7403602600097656 }, { "auxiliary_loss_clip": 0.01286269, "auxiliary_loss_mlp": 0.0119316, "balance_loss_clip": 1.00886333, "balance_loss_mlp": 1.00013673, "epoch": 0.7869897192328503, "flos": 29386288781280.0, "grad_norm": 2.711653105463587, "language_loss": 0.72144198, "learning_rate": 4.572271593051384e-07, "loss": 0.74623626, "num_input_tokens_seen": 141054295, "step": 6545, "time_per_iteration": 2.789062261581421 }, { "auxiliary_loss_clip": 0.01268909, "auxiliary_loss_mlp": 0.01193201, "balance_loss_clip": 1.00714529, "balance_loss_mlp": 1.00017715, "epoch": 0.7871099621234895, "flos": 17128933274880.0, "grad_norm": 1.5880251724030459, "language_loss": 0.78456336, "learning_rate": 4.567315644666245e-07, "loss": 0.80918449, "num_input_tokens_seen": 141073090, "step": 6546, "time_per_iteration": 2.891892433166504 }, { "auxiliary_loss_clip": 0.01294095, "auxiliary_loss_mlp": 0.01193215, "balance_loss_clip": 1.00764048, "balance_loss_mlp": 1.00019169, "epoch": 0.7872302050141285, "flos": 23440198933440.0, "grad_norm": 1.893818738462747, "language_loss": 0.84475958, "learning_rate": 4.5623620374070507e-07, "loss": 0.86963266, "num_input_tokens_seen": 141092405, "step": 6547, "time_per_iteration": 2.8236567974090576 }, { "auxiliary_loss_clip": 0.0126962, "auxiliary_loss_mlp": 0.01192268, "balance_loss_clip": 1.00466061, "balance_loss_mlp": 1.00000715, "epoch": 0.7873504479047676, "flos": 65959789813920.0, "grad_norm": 0.7623447197199725, "language_loss": 0.58432668, "learning_rate": 4.557410772025263e-07, "loss": 0.60894555, "num_input_tokens_seen": 141154355, "step": 6548, "time_per_iteration": 3.4933950901031494 }, { "auxiliary_loss_clip": 0.01317874, "auxiliary_loss_mlp": 0.01193187, "balance_loss_clip": 1.00770068, "balance_loss_mlp": 1.0001632, "epoch": 0.7874706907954068, "flos": 23258347463520.0, "grad_norm": 1.7287760529121563, "language_loss": 0.66410065, "learning_rate": 4.5524618492719803e-07, "loss": 0.68921125, "num_input_tokens_seen": 141173575, "step": 6549, "time_per_iteration": 2.809736967086792 }, { "auxiliary_loss_clip": 0.01330374, "auxiliary_loss_mlp": 0.01193174, "balance_loss_clip": 1.00720453, "balance_loss_mlp": 1.00014997, "epoch": 0.7875909336860458, "flos": 28767799795680.0, "grad_norm": 1.4807533401927064, "language_loss": 0.78830886, "learning_rate": 4.54751526989795e-07, "loss": 0.81354433, "num_input_tokens_seen": 141195415, "step": 6550, "time_per_iteration": 2.7904744148254395 }, { "auxiliary_loss_clip": 0.01336345, "auxiliary_loss_mlp": 0.01193176, "balance_loss_clip": 1.00817776, "balance_loss_mlp": 1.00015283, "epoch": 0.7877111765766849, "flos": 18697296083040.0, "grad_norm": 1.9621638078242416, "language_loss": 0.79177451, "learning_rate": 4.5425710346535775e-07, "loss": 0.81706977, "num_input_tokens_seen": 141213360, "step": 6551, "time_per_iteration": 2.739758253097534 }, { "auxiliary_loss_clip": 0.0132957, "auxiliary_loss_mlp": 0.01193209, "balance_loss_clip": 1.00747466, "balance_loss_mlp": 1.00018501, "epoch": 0.787831419467324, "flos": 27592979296320.0, "grad_norm": 2.2512179743048844, "language_loss": 0.82082391, "learning_rate": 4.537629144288877e-07, "loss": 0.84605169, "num_input_tokens_seen": 141230815, "step": 6552, "time_per_iteration": 2.7143712043762207 }, { "auxiliary_loss_clip": 0.01276054, "auxiliary_loss_mlp": 0.01193182, "balance_loss_clip": 1.00753784, "balance_loss_mlp": 1.00015819, "epoch": 0.7879516623579631, "flos": 18150196955040.0, "grad_norm": 1.8939771068287847, "language_loss": 0.74981558, "learning_rate": 4.5326895995535477e-07, "loss": 0.774508, "num_input_tokens_seen": 141249715, "step": 6553, "time_per_iteration": 2.8666632175445557 }, { "auxiliary_loss_clip": 0.0132457, "auxiliary_loss_mlp": 0.01193226, "balance_loss_clip": 1.0072962, "balance_loss_mlp": 1.00020266, "epoch": 0.7880719052486022, "flos": 20339204169600.0, "grad_norm": 2.1159557704374845, "language_loss": 0.84353423, "learning_rate": 4.527752401196907e-07, "loss": 0.86871219, "num_input_tokens_seen": 141267730, "step": 6554, "time_per_iteration": 2.7095112800598145 }, { "auxiliary_loss_clip": 0.01312592, "auxiliary_loss_mlp": 0.01193311, "balance_loss_clip": 1.00801826, "balance_loss_mlp": 1.00019169, "epoch": 0.7881921481392413, "flos": 21653247939840.0, "grad_norm": 1.7360392795563904, "language_loss": 0.66545093, "learning_rate": 4.5228175499679254e-07, "loss": 0.69051003, "num_input_tokens_seen": 141287315, "step": 6555, "time_per_iteration": 2.7304575443267822 }, { "auxiliary_loss_clip": 0.01301276, "auxiliary_loss_mlp": 0.01192293, "balance_loss_clip": 1.00457382, "balance_loss_mlp": 1.00003242, "epoch": 0.7883123910298804, "flos": 68565892062240.0, "grad_norm": 0.8394168769602492, "language_loss": 0.54568791, "learning_rate": 4.5178850466152174e-07, "loss": 0.57062364, "num_input_tokens_seen": 141346145, "step": 6556, "time_per_iteration": 3.3859591484069824 }, { "auxiliary_loss_clip": 0.01323775, "auxiliary_loss_mlp": 0.01193199, "balance_loss_clip": 1.00811148, "balance_loss_mlp": 1.00017524, "epoch": 0.7884326339205194, "flos": 19318227878880.0, "grad_norm": 6.688220490645749, "language_loss": 0.81955183, "learning_rate": 4.512954891887031e-07, "loss": 0.84472156, "num_input_tokens_seen": 141364445, "step": 6557, "time_per_iteration": 2.7359166145324707 }, { "auxiliary_loss_clip": 0.01313751, "auxiliary_loss_mlp": 0.01193234, "balance_loss_clip": 1.00843596, "balance_loss_mlp": 1.00021076, "epoch": 0.7885528768111585, "flos": 17784913373280.0, "grad_norm": 2.3835830649658614, "language_loss": 0.83269584, "learning_rate": 4.5080270865312806e-07, "loss": 0.85776579, "num_input_tokens_seen": 141381640, "step": 6558, "time_per_iteration": 2.77238130569458 }, { "auxiliary_loss_clip": 0.0132836, "auxiliary_loss_mlp": 0.01193198, "balance_loss_clip": 1.00766814, "balance_loss_mlp": 1.00017452, "epoch": 0.7886731197017977, "flos": 18807649924320.0, "grad_norm": 2.541435556071743, "language_loss": 0.71299398, "learning_rate": 4.5031016312954985e-07, "loss": 0.73820961, "num_input_tokens_seen": 141399955, "step": 6559, "time_per_iteration": 2.6944994926452637 }, { "auxiliary_loss_clip": 0.01337233, "auxiliary_loss_mlp": 0.01193227, "balance_loss_clip": 1.00839496, "balance_loss_mlp": 1.00020301, "epoch": 0.7887933625924367, "flos": 33365372349600.0, "grad_norm": 3.5780933784465105, "language_loss": 0.74412, "learning_rate": 4.498178526926886e-07, "loss": 0.76942456, "num_input_tokens_seen": 141420820, "step": 6560, "time_per_iteration": 2.837930202484131 }, { "auxiliary_loss_clip": 0.0134824, "auxiliary_loss_mlp": 0.01193184, "balance_loss_clip": 1.00832415, "balance_loss_mlp": 1.00016034, "epoch": 0.7889136054830758, "flos": 17019369754560.0, "grad_norm": 2.197620328980895, "language_loss": 0.72590643, "learning_rate": 4.4932577741722635e-07, "loss": 0.75132066, "num_input_tokens_seen": 141439350, "step": 6561, "time_per_iteration": 3.7302889823913574 }, { "auxiliary_loss_clip": 0.01317162, "auxiliary_loss_mlp": 0.01193188, "balance_loss_clip": 1.00745726, "balance_loss_mlp": 1.00016415, "epoch": 0.7890338483737149, "flos": 29424642062400.0, "grad_norm": 1.6054897402866795, "language_loss": 0.74289393, "learning_rate": 4.4883393737780985e-07, "loss": 0.76799738, "num_input_tokens_seen": 141460300, "step": 6562, "time_per_iteration": 4.6797754764556885 }, { "auxiliary_loss_clip": 0.01335842, "auxiliary_loss_mlp": 0.01193121, "balance_loss_clip": 1.00779128, "balance_loss_mlp": 1.0001924, "epoch": 0.789154091264354, "flos": 19971585548640.0, "grad_norm": 1.7714132828388742, "language_loss": 0.7838347, "learning_rate": 4.4834233264905254e-07, "loss": 0.80912423, "num_input_tokens_seen": 141477315, "step": 6563, "time_per_iteration": 3.662583351135254 }, { "auxiliary_loss_clip": 0.01301315, "auxiliary_loss_mlp": 0.01193276, "balance_loss_clip": 1.00817299, "balance_loss_mlp": 1.00025272, "epoch": 0.789274334154993, "flos": 14537833915680.0, "grad_norm": 2.404323721093033, "language_loss": 0.71440244, "learning_rate": 4.478509633055294e-07, "loss": 0.73934829, "num_input_tokens_seen": 141495025, "step": 6564, "time_per_iteration": 2.864847421646118 }, { "auxiliary_loss_clip": 0.01315618, "auxiliary_loss_mlp": 0.01193248, "balance_loss_clip": 1.00761878, "balance_loss_mlp": 1.00022459, "epoch": 0.7893945770456322, "flos": 21827411742240.0, "grad_norm": 2.141008552624829, "language_loss": 0.79994559, "learning_rate": 4.473598294217813e-07, "loss": 0.82503426, "num_input_tokens_seen": 141510450, "step": 6565, "time_per_iteration": 2.813933849334717 }, { "auxiliary_loss_clip": 0.01325975, "auxiliary_loss_mlp": 0.01193183, "balance_loss_clip": 1.00716996, "balance_loss_mlp": 1.00015974, "epoch": 0.7895148199362713, "flos": 20740649688000.0, "grad_norm": 2.0285173564440804, "language_loss": 0.71841979, "learning_rate": 4.468689310723124e-07, "loss": 0.7436114, "num_input_tokens_seen": 141528265, "step": 6566, "time_per_iteration": 2.7364888191223145 }, { "auxiliary_loss_clip": 0.0130048, "auxiliary_loss_mlp": 0.01193233, "balance_loss_clip": 1.00681305, "balance_loss_mlp": 1.00020957, "epoch": 0.7896350628269103, "flos": 16690679193600.0, "grad_norm": 1.9936377848873998, "language_loss": 0.79056758, "learning_rate": 4.463782683315913e-07, "loss": 0.81550467, "num_input_tokens_seen": 141547270, "step": 6567, "time_per_iteration": 2.7805354595184326 }, { "auxiliary_loss_clip": 0.01347219, "auxiliary_loss_mlp": 0.01193182, "balance_loss_clip": 1.00750279, "balance_loss_mlp": 1.00015831, "epoch": 0.7897553057175495, "flos": 22638385607040.0, "grad_norm": 1.6465521513602417, "language_loss": 0.73198271, "learning_rate": 4.458878412740523e-07, "loss": 0.75738668, "num_input_tokens_seen": 141566050, "step": 6568, "time_per_iteration": 2.729480743408203 }, { "auxiliary_loss_clip": 0.01323662, "auxiliary_loss_mlp": 0.01193235, "balance_loss_clip": 1.00791848, "balance_loss_mlp": 1.00021183, "epoch": 0.7898755486081885, "flos": 14537582449920.0, "grad_norm": 2.4666438865753673, "language_loss": 0.78200603, "learning_rate": 4.453976499740919e-07, "loss": 0.80717504, "num_input_tokens_seen": 141583695, "step": 6569, "time_per_iteration": 2.73073410987854 }, { "auxiliary_loss_clip": 0.01323427, "auxiliary_loss_mlp": 0.01193146, "balance_loss_clip": 1.00811517, "balance_loss_mlp": 1.00012207, "epoch": 0.7899957914988276, "flos": 17238496795200.0, "grad_norm": 3.5768456774149793, "language_loss": 0.78064287, "learning_rate": 4.4490769450607215e-07, "loss": 0.8058086, "num_input_tokens_seen": 141601320, "step": 6570, "time_per_iteration": 2.6689844131469727 }, { "auxiliary_loss_clip": 0.01312904, "auxiliary_loss_mlp": 0.01193167, "balance_loss_clip": 1.00831795, "balance_loss_mlp": 1.00014377, "epoch": 0.7901160343894668, "flos": 41279366568960.0, "grad_norm": 1.707662458848919, "language_loss": 0.7265178, "learning_rate": 4.4441797494431845e-07, "loss": 0.75157857, "num_input_tokens_seen": 141623125, "step": 6571, "time_per_iteration": 3.1809871196746826 }, { "auxiliary_loss_clip": 0.01324127, "auxiliary_loss_mlp": 0.01193236, "balance_loss_clip": 1.00735557, "balance_loss_mlp": 1.00021267, "epoch": 0.7902362772801058, "flos": 16837015353120.0, "grad_norm": 1.87050461823944, "language_loss": 0.78345144, "learning_rate": 4.439284913631207e-07, "loss": 0.80862504, "num_input_tokens_seen": 141640335, "step": 6572, "time_per_iteration": 2.6805591583251953 }, { "auxiliary_loss_clip": 0.01273771, "auxiliary_loss_mlp": 0.01193219, "balance_loss_clip": 1.00755322, "balance_loss_mlp": 1.00019562, "epoch": 0.7903565201707449, "flos": 27125999480160.0, "grad_norm": 1.9891550283580708, "language_loss": 0.83872849, "learning_rate": 4.434392438367347e-07, "loss": 0.86339837, "num_input_tokens_seen": 141659760, "step": 6573, "time_per_iteration": 2.922196626663208 }, { "auxiliary_loss_clip": 0.01321575, "auxiliary_loss_mlp": 0.01193218, "balance_loss_clip": 1.00803089, "balance_loss_mlp": 1.00019491, "epoch": 0.790476763061384, "flos": 31025179278720.0, "grad_norm": 1.6565659212480273, "language_loss": 0.73854339, "learning_rate": 4.4295023243937677e-07, "loss": 0.76369137, "num_input_tokens_seen": 141679965, "step": 6574, "time_per_iteration": 2.8438851833343506 }, { "auxiliary_loss_clip": 0.01326014, "auxiliary_loss_mlp": 0.01193339, "balance_loss_clip": 1.00841916, "balance_loss_mlp": 1.0002203, "epoch": 0.7905970059520231, "flos": 22089095134560.0, "grad_norm": 1.653652656564819, "language_loss": 0.80145073, "learning_rate": 4.4246145724523123e-07, "loss": 0.8266443, "num_input_tokens_seen": 141697710, "step": 6575, "time_per_iteration": 2.7273757457733154 }, { "auxiliary_loss_clip": 0.01287488, "auxiliary_loss_mlp": 0.01193197, "balance_loss_clip": 1.00685978, "balance_loss_mlp": 1.00017345, "epoch": 0.7907172488426621, "flos": 20558151591840.0, "grad_norm": 2.1874080038028736, "language_loss": 0.77197444, "learning_rate": 4.41972918328444e-07, "loss": 0.7967813, "num_input_tokens_seen": 141715145, "step": 6576, "time_per_iteration": 2.8343329429626465 }, { "auxiliary_loss_clip": 0.01324194, "auxiliary_loss_mlp": 0.01193255, "balance_loss_clip": 1.00754309, "balance_loss_mlp": 1.00023174, "epoch": 0.7908374917333013, "flos": 30081556176480.0, "grad_norm": 2.035685119457451, "language_loss": 0.77619261, "learning_rate": 4.4148461576312646e-07, "loss": 0.8013671, "num_input_tokens_seen": 141734810, "step": 6577, "time_per_iteration": 2.756826400756836 }, { "auxiliary_loss_clip": 0.01331253, "auxiliary_loss_mlp": 0.01193202, "balance_loss_clip": 1.00772119, "balance_loss_mlp": 1.0001781, "epoch": 0.7909577346239404, "flos": 20996369749440.0, "grad_norm": 1.4720751285903446, "language_loss": 0.74696648, "learning_rate": 4.4099654962335343e-07, "loss": 0.77221107, "num_input_tokens_seen": 141755260, "step": 6578, "time_per_iteration": 2.744053840637207 }, { "auxiliary_loss_clip": 0.01318496, "auxiliary_loss_mlp": 0.01193255, "balance_loss_clip": 1.00767136, "balance_loss_mlp": 1.00023127, "epoch": 0.7910779775145794, "flos": 26247946599360.0, "grad_norm": 1.687094317146352, "language_loss": 0.74733925, "learning_rate": 4.405087199831636e-07, "loss": 0.77245677, "num_input_tokens_seen": 141775500, "step": 6579, "time_per_iteration": 2.7870826721191406 }, { "auxiliary_loss_clip": 0.01316238, "auxiliary_loss_mlp": 0.00872378, "balance_loss_clip": 1.007797, "balance_loss_mlp": 1.0002749, "epoch": 0.7911982204052186, "flos": 22564445168160.0, "grad_norm": 1.838008317893328, "language_loss": 0.67012078, "learning_rate": 4.400211269165619e-07, "loss": 0.69200695, "num_input_tokens_seen": 141791955, "step": 6580, "time_per_iteration": 2.827300786972046 }, { "auxiliary_loss_clip": 0.01349748, "auxiliary_loss_mlp": 0.01193216, "balance_loss_clip": 1.00872791, "balance_loss_mlp": 1.00019252, "epoch": 0.7913184632958576, "flos": 23112550159200.0, "grad_norm": 1.6054639221636398, "language_loss": 0.76594186, "learning_rate": 4.3953377049751416e-07, "loss": 0.79137152, "num_input_tokens_seen": 141812380, "step": 6581, "time_per_iteration": 2.673264980316162 }, { "auxiliary_loss_clip": 0.01315238, "auxiliary_loss_mlp": 0.01193251, "balance_loss_clip": 1.00804353, "balance_loss_mlp": 1.00022745, "epoch": 0.7914387061864967, "flos": 12311766672480.0, "grad_norm": 2.508112635342359, "language_loss": 0.7832104, "learning_rate": 4.390466507999537e-07, "loss": 0.80829531, "num_input_tokens_seen": 141828130, "step": 6582, "time_per_iteration": 2.8183510303497314 }, { "auxiliary_loss_clip": 0.01287716, "auxiliary_loss_mlp": 0.01193189, "balance_loss_clip": 1.00709176, "balance_loss_mlp": 1.00016534, "epoch": 0.7915589490771359, "flos": 17603277445440.0, "grad_norm": 2.0386384963155697, "language_loss": 0.76043224, "learning_rate": 4.385597678977748e-07, "loss": 0.78524137, "num_input_tokens_seen": 141846965, "step": 6583, "time_per_iteration": 2.7842154502868652 }, { "auxiliary_loss_clip": 0.01314081, "auxiliary_loss_mlp": 0.01193184, "balance_loss_clip": 1.00801134, "balance_loss_mlp": 1.00016069, "epoch": 0.7916791919677749, "flos": 25591283951040.0, "grad_norm": 1.552884779177857, "language_loss": 0.75318152, "learning_rate": 4.3807312186483726e-07, "loss": 0.77825415, "num_input_tokens_seen": 141867685, "step": 6584, "time_per_iteration": 2.842697858810425 }, { "auxiliary_loss_clip": 0.01323498, "auxiliary_loss_mlp": 0.01193189, "balance_loss_clip": 1.00787342, "balance_loss_mlp": 1.00016522, "epoch": 0.791799434858414, "flos": 18844350716160.0, "grad_norm": 1.7565961125223182, "language_loss": 0.78289115, "learning_rate": 4.375867127749655e-07, "loss": 0.80805802, "num_input_tokens_seen": 141885960, "step": 6585, "time_per_iteration": 2.8070263862609863 }, { "auxiliary_loss_clip": 0.01285639, "auxiliary_loss_mlp": 0.01193186, "balance_loss_clip": 1.00721216, "balance_loss_mlp": 1.00016189, "epoch": 0.7919196777490531, "flos": 25812027557280.0, "grad_norm": 1.845307713070154, "language_loss": 0.6699121, "learning_rate": 4.3710054070194744e-07, "loss": 0.6947003, "num_input_tokens_seen": 141905655, "step": 6586, "time_per_iteration": 3.832988739013672 }, { "auxiliary_loss_clip": 0.01349158, "auxiliary_loss_mlp": 0.00872478, "balance_loss_clip": 1.00764334, "balance_loss_mlp": 1.000278, "epoch": 0.7920399206396922, "flos": 11947632648480.0, "grad_norm": 2.7893905933187417, "language_loss": 0.66106677, "learning_rate": 4.3661460571953455e-07, "loss": 0.68328315, "num_input_tokens_seen": 141922390, "step": 6587, "time_per_iteration": 2.764120578765869 }, { "auxiliary_loss_clip": 0.0133621, "auxiliary_loss_mlp": 0.01193159, "balance_loss_clip": 1.00766289, "balance_loss_mlp": 1.0001359, "epoch": 0.7921601635303313, "flos": 21579918203520.0, "grad_norm": 1.8589371357845093, "language_loss": 0.68776721, "learning_rate": 4.36128907901443e-07, "loss": 0.71306086, "num_input_tokens_seen": 141941985, "step": 6588, "time_per_iteration": 3.667694568634033 }, { "auxiliary_loss_clip": 0.01302425, "auxiliary_loss_mlp": 0.01193217, "balance_loss_clip": 1.00806069, "balance_loss_mlp": 1.0001936, "epoch": 0.7922804064209703, "flos": 18113999094720.0, "grad_norm": 2.2220613235736937, "language_loss": 0.73138642, "learning_rate": 4.356434473213519e-07, "loss": 0.75634289, "num_input_tokens_seen": 141959435, "step": 6589, "time_per_iteration": 3.7574634552001953 }, { "auxiliary_loss_clip": 0.01301923, "auxiliary_loss_mlp": 0.0119323, "balance_loss_clip": 1.00725639, "balance_loss_mlp": 1.00020599, "epoch": 0.7924006493116095, "flos": 21652816855680.0, "grad_norm": 1.6728534828840234, "language_loss": 0.79645193, "learning_rate": 4.351582240529068e-07, "loss": 0.82140344, "num_input_tokens_seen": 141980265, "step": 6590, "time_per_iteration": 4.234701871871948 }, { "auxiliary_loss_clip": 0.01294095, "auxiliary_loss_mlp": 0.0119228, "balance_loss_clip": 1.00439, "balance_loss_mlp": 1.00001943, "epoch": 0.7925208922022485, "flos": 64242791730720.0, "grad_norm": 0.6745250265423091, "language_loss": 0.58214301, "learning_rate": 4.346732381697149e-07, "loss": 0.60700673, "num_input_tokens_seen": 142044395, "step": 6591, "time_per_iteration": 3.409419536590576 }, { "auxiliary_loss_clip": 0.01299439, "auxiliary_loss_mlp": 0.01193196, "balance_loss_clip": 1.00737381, "balance_loss_mlp": 1.00017214, "epoch": 0.7926411350928876, "flos": 16941549558240.0, "grad_norm": 1.9504379496869975, "language_loss": 0.8077848, "learning_rate": 4.3418848974534825e-07, "loss": 0.8327111, "num_input_tokens_seen": 142061335, "step": 6592, "time_per_iteration": 2.867074966430664 }, { "auxiliary_loss_clip": 0.01300404, "auxiliary_loss_mlp": 0.01193231, "balance_loss_clip": 1.00749755, "balance_loss_mlp": 1.00020766, "epoch": 0.7927613779835267, "flos": 34460001689760.0, "grad_norm": 1.533849572044041, "language_loss": 0.68991053, "learning_rate": 4.3370397885334276e-07, "loss": 0.71484691, "num_input_tokens_seen": 142081965, "step": 6593, "time_per_iteration": 2.938490629196167 }, { "auxiliary_loss_clip": 0.01337886, "auxiliary_loss_mlp": 0.01193182, "balance_loss_clip": 1.00864923, "balance_loss_mlp": 1.00015819, "epoch": 0.7928816208741658, "flos": 18951184036800.0, "grad_norm": 1.6799258481320527, "language_loss": 0.75253212, "learning_rate": 4.3321970556719777e-07, "loss": 0.77784288, "num_input_tokens_seen": 142100260, "step": 6594, "time_per_iteration": 2.8435840606689453 }, { "auxiliary_loss_clip": 0.01348466, "auxiliary_loss_mlp": 0.01193229, "balance_loss_clip": 1.00809455, "balance_loss_mlp": 1.00020516, "epoch": 0.7930018637648049, "flos": 18623032331040.0, "grad_norm": 2.486918713260896, "language_loss": 0.71704006, "learning_rate": 4.3273566996037856e-07, "loss": 0.74245703, "num_input_tokens_seen": 142116955, "step": 6595, "time_per_iteration": 2.742924928665161 }, { "auxiliary_loss_clip": 0.01309472, "auxiliary_loss_mlp": 0.01193073, "balance_loss_clip": 1.00689173, "balance_loss_mlp": 1.00014484, "epoch": 0.793122106655444, "flos": 24530661126720.0, "grad_norm": 1.910976809129656, "language_loss": 0.80063349, "learning_rate": 4.322518721063113e-07, "loss": 0.82565892, "num_input_tokens_seen": 142135505, "step": 6596, "time_per_iteration": 2.966801404953003 }, { "auxiliary_loss_clip": 0.01325136, "auxiliary_loss_mlp": 0.01193193, "balance_loss_clip": 1.00771987, "balance_loss_mlp": 1.00016928, "epoch": 0.7932423495460831, "flos": 34421217324480.0, "grad_norm": 1.7382158581072733, "language_loss": 0.70020437, "learning_rate": 4.3176831207838906e-07, "loss": 0.72538769, "num_input_tokens_seen": 142158915, "step": 6597, "time_per_iteration": 2.8597352504730225 }, { "auxiliary_loss_clip": 0.01324323, "auxiliary_loss_mlp": 0.01193203, "balance_loss_clip": 1.00818491, "balance_loss_mlp": 1.00017929, "epoch": 0.7933625924367221, "flos": 26980345870560.0, "grad_norm": 1.6322457637853913, "language_loss": 0.74260306, "learning_rate": 4.3128498994996685e-07, "loss": 0.76777834, "num_input_tokens_seen": 142178390, "step": 6598, "time_per_iteration": 2.840386152267456 }, { "auxiliary_loss_clip": 0.01331938, "auxiliary_loss_mlp": 0.01193195, "balance_loss_clip": 1.00746119, "balance_loss_mlp": 1.00017095, "epoch": 0.7934828353273613, "flos": 29568643182720.0, "grad_norm": 2.1374084336608288, "language_loss": 0.71122837, "learning_rate": 4.308019057943646e-07, "loss": 0.73647964, "num_input_tokens_seen": 142200115, "step": 6599, "time_per_iteration": 2.921799898147583 }, { "auxiliary_loss_clip": 0.01266535, "auxiliary_loss_mlp": 0.01193252, "balance_loss_clip": 1.00712526, "balance_loss_mlp": 1.00022829, "epoch": 0.7936030782180004, "flos": 28615392534240.0, "grad_norm": 1.593595187341114, "language_loss": 0.74669832, "learning_rate": 4.3031905968486535e-07, "loss": 0.77129614, "num_input_tokens_seen": 142220945, "step": 6600, "time_per_iteration": 2.9262425899505615 }, { "auxiliary_loss_clip": 0.01262294, "auxiliary_loss_mlp": 0.01193226, "balance_loss_clip": 1.00651181, "balance_loss_mlp": 1.00020194, "epoch": 0.7937233211086394, "flos": 16392582398880.0, "grad_norm": 2.0783379432791875, "language_loss": 0.68458951, "learning_rate": 4.298364516947162e-07, "loss": 0.70914471, "num_input_tokens_seen": 142238175, "step": 6601, "time_per_iteration": 2.9059712886810303 }, { "auxiliary_loss_clip": 0.01282611, "auxiliary_loss_mlp": 0.01193194, "balance_loss_clip": 1.00732446, "balance_loss_mlp": 1.00017071, "epoch": 0.7938435639992786, "flos": 22013430359040.0, "grad_norm": 1.819496641680932, "language_loss": 0.65524781, "learning_rate": 4.293540818971295e-07, "loss": 0.68000585, "num_input_tokens_seen": 142255980, "step": 6602, "time_per_iteration": 2.819340467453003 }, { "auxiliary_loss_clip": 0.01335858, "auxiliary_loss_mlp": 0.01193138, "balance_loss_clip": 1.00764441, "balance_loss_mlp": 1.00011432, "epoch": 0.7939638068899176, "flos": 22197042089280.0, "grad_norm": 2.252323778067685, "language_loss": 0.76547933, "learning_rate": 4.2887195036527934e-07, "loss": 0.79076934, "num_input_tokens_seen": 142274785, "step": 6603, "time_per_iteration": 2.833691120147705 }, { "auxiliary_loss_clip": 0.01336248, "auxiliary_loss_mlp": 0.01193165, "balance_loss_clip": 1.00747967, "balance_loss_mlp": 1.00014114, "epoch": 0.7940840497805567, "flos": 17745194992320.0, "grad_norm": 2.2654785927901666, "language_loss": 0.73098838, "learning_rate": 4.28390057172306e-07, "loss": 0.75628245, "num_input_tokens_seen": 142291290, "step": 6604, "time_per_iteration": 2.7812411785125732 }, { "auxiliary_loss_clip": 0.01312909, "auxiliary_loss_mlp": 0.01193239, "balance_loss_clip": 1.00839925, "balance_loss_mlp": 1.00021482, "epoch": 0.7942042926711959, "flos": 23805446591520.0, "grad_norm": 2.251408213033508, "language_loss": 0.71954751, "learning_rate": 4.279084023913111e-07, "loss": 0.744609, "num_input_tokens_seen": 142309165, "step": 6605, "time_per_iteration": 2.812377452850342 }, { "auxiliary_loss_clip": 0.01327481, "auxiliary_loss_mlp": 0.01193179, "balance_loss_clip": 1.00775599, "balance_loss_mlp": 1.00015485, "epoch": 0.7943245355618349, "flos": 19244969989920.0, "grad_norm": 1.7088805737107364, "language_loss": 0.6940726, "learning_rate": 4.2742698609536096e-07, "loss": 0.71927923, "num_input_tokens_seen": 142327475, "step": 6606, "time_per_iteration": 2.7224152088165283 }, { "auxiliary_loss_clip": 0.01310317, "auxiliary_loss_mlp": 0.01193225, "balance_loss_clip": 1.00695896, "balance_loss_mlp": 1.00020111, "epoch": 0.794444778452474, "flos": 25007627725920.0, "grad_norm": 1.8742427506883792, "language_loss": 0.78491616, "learning_rate": 4.2694580835748706e-07, "loss": 0.80995154, "num_input_tokens_seen": 142347335, "step": 6607, "time_per_iteration": 2.7997584342956543 }, { "auxiliary_loss_clip": 0.01309682, "auxiliary_loss_mlp": 0.01193193, "balance_loss_clip": 1.00838661, "balance_loss_mlp": 1.0001694, "epoch": 0.7945650213431131, "flos": 23221502976960.0, "grad_norm": 1.8028624316852322, "language_loss": 0.74456155, "learning_rate": 4.264648692506836e-07, "loss": 0.76959026, "num_input_tokens_seen": 142366125, "step": 6608, "time_per_iteration": 2.8398349285125732 }, { "auxiliary_loss_clip": 0.01314932, "auxiliary_loss_mlp": 0.0119332, "balance_loss_clip": 1.00777614, "balance_loss_mlp": 1.00020087, "epoch": 0.7946852642337522, "flos": 26062897921920.0, "grad_norm": 1.6960840399338408, "language_loss": 0.72401929, "learning_rate": 4.2598416884790824e-07, "loss": 0.74910182, "num_input_tokens_seen": 142385175, "step": 6609, "time_per_iteration": 2.7329139709472656 }, { "auxiliary_loss_clip": 0.01324417, "auxiliary_loss_mlp": 0.01193179, "balance_loss_clip": 1.00785756, "balance_loss_mlp": 1.00015485, "epoch": 0.7948055071243912, "flos": 23769715739040.0, "grad_norm": 2.1377630707712796, "language_loss": 0.80670249, "learning_rate": 4.255037072220828e-07, "loss": 0.83187842, "num_input_tokens_seen": 142406545, "step": 6610, "time_per_iteration": 2.758174180984497 }, { "auxiliary_loss_clip": 0.01347703, "auxiliary_loss_mlp": 0.01193175, "balance_loss_clip": 1.00755692, "balance_loss_mlp": 1.00015092, "epoch": 0.7949257500150304, "flos": 21980824866720.0, "grad_norm": 1.6036728590494318, "language_loss": 0.71497154, "learning_rate": 4.2502348444609293e-07, "loss": 0.74038029, "num_input_tokens_seen": 142426165, "step": 6611, "time_per_iteration": 2.7000632286071777 }, { "auxiliary_loss_clip": 0.01289991, "auxiliary_loss_mlp": 0.01193198, "balance_loss_clip": 1.00743175, "balance_loss_mlp": 1.00017428, "epoch": 0.7950459929056695, "flos": 25774141284000.0, "grad_norm": 2.033782178261759, "language_loss": 0.69195259, "learning_rate": 4.2454350059278844e-07, "loss": 0.71678448, "num_input_tokens_seen": 142447225, "step": 6612, "time_per_iteration": 2.9327239990234375 }, { "auxiliary_loss_clip": 0.01324474, "auxiliary_loss_mlp": 0.01193216, "balance_loss_clip": 1.00762534, "balance_loss_mlp": 1.00019205, "epoch": 0.7951662357963085, "flos": 22158078105600.0, "grad_norm": 1.7207001850711452, "language_loss": 0.84543592, "learning_rate": 4.240637557349824e-07, "loss": 0.87061286, "num_input_tokens_seen": 142464440, "step": 6613, "time_per_iteration": 3.6865527629852295 }, { "auxiliary_loss_clip": 0.01309964, "auxiliary_loss_mlp": 0.01193198, "balance_loss_clip": 1.00783944, "balance_loss_mlp": 1.00017405, "epoch": 0.7952864786869477, "flos": 24641948983680.0, "grad_norm": 1.8392116926002877, "language_loss": 0.6689651, "learning_rate": 4.235842499454516e-07, "loss": 0.69399667, "num_input_tokens_seen": 142484355, "step": 6614, "time_per_iteration": 3.723651885986328 }, { "auxiliary_loss_clip": 0.0131253, "auxiliary_loss_mlp": 0.01193136, "balance_loss_clip": 1.0072937, "balance_loss_mlp": 1.00020742, "epoch": 0.7954067215775867, "flos": 21830932262880.0, "grad_norm": 1.5958918285467443, "language_loss": 0.82730627, "learning_rate": 4.2310498329693687e-07, "loss": 0.85236293, "num_input_tokens_seen": 142505255, "step": 6615, "time_per_iteration": 3.7740955352783203 }, { "auxiliary_loss_clip": 0.01327757, "auxiliary_loss_mlp": 0.01193309, "balance_loss_clip": 1.00742793, "balance_loss_mlp": 1.0001899, "epoch": 0.7955269644682258, "flos": 24060663721440.0, "grad_norm": 1.65486677894631, "language_loss": 0.80675614, "learning_rate": 4.2262595586214164e-07, "loss": 0.83196682, "num_input_tokens_seen": 142526350, "step": 6616, "time_per_iteration": 3.934445381164551 }, { "auxiliary_loss_clip": 0.01332141, "auxiliary_loss_mlp": 0.01193209, "balance_loss_clip": 1.00742817, "balance_loss_mlp": 1.00018561, "epoch": 0.795647207358865, "flos": 25010753086080.0, "grad_norm": 1.6615074366765907, "language_loss": 0.76756132, "learning_rate": 4.221471677137358e-07, "loss": 0.79281485, "num_input_tokens_seen": 142547165, "step": 6617, "time_per_iteration": 2.76873517036438 }, { "auxiliary_loss_clip": 0.01311856, "auxiliary_loss_mlp": 0.01193271, "balance_loss_clip": 1.00720644, "balance_loss_mlp": 1.00024748, "epoch": 0.795767450249504, "flos": 14648367375360.0, "grad_norm": 1.4852497267481333, "language_loss": 0.70094955, "learning_rate": 4.216686189243492e-07, "loss": 0.72600079, "num_input_tokens_seen": 142565955, "step": 6618, "time_per_iteration": 2.770379066467285 }, { "auxiliary_loss_clip": 0.01277759, "auxiliary_loss_mlp": 0.01193074, "balance_loss_clip": 1.00652075, "balance_loss_mlp": 1.00014591, "epoch": 0.7958876931401431, "flos": 18547906410720.0, "grad_norm": 1.6615725914415698, "language_loss": 0.73061895, "learning_rate": 4.211903095665785e-07, "loss": 0.75532728, "num_input_tokens_seen": 142585340, "step": 6619, "time_per_iteration": 2.8091113567352295 }, { "auxiliary_loss_clip": 0.01322843, "auxiliary_loss_mlp": 0.01193182, "balance_loss_clip": 1.00727427, "balance_loss_mlp": 1.00015807, "epoch": 0.7960079360307821, "flos": 21543971808960.0, "grad_norm": 1.725636624763088, "language_loss": 0.7507394, "learning_rate": 4.2071223971298277e-07, "loss": 0.77589965, "num_input_tokens_seen": 142602525, "step": 6620, "time_per_iteration": 2.703396797180176 }, { "auxiliary_loss_clip": 0.01337392, "auxiliary_loss_mlp": 0.01193243, "balance_loss_clip": 1.00823736, "balance_loss_mlp": 1.0002197, "epoch": 0.7961281789214213, "flos": 25481756354400.0, "grad_norm": 1.7633461447566554, "language_loss": 0.61257172, "learning_rate": 4.2023440943608433e-07, "loss": 0.63787806, "num_input_tokens_seen": 142622490, "step": 6621, "time_per_iteration": 2.764653444290161 }, { "auxiliary_loss_clip": 0.01334956, "auxiliary_loss_mlp": 0.0119317, "balance_loss_clip": 1.00759006, "balance_loss_mlp": 1.00014639, "epoch": 0.7962484218120603, "flos": 21944447388000.0, "grad_norm": 1.726834371606022, "language_loss": 0.78195727, "learning_rate": 4.1975681880837023e-07, "loss": 0.80723846, "num_input_tokens_seen": 142642495, "step": 6622, "time_per_iteration": 2.7142562866210938 }, { "auxiliary_loss_clip": 0.0130118, "auxiliary_loss_mlp": 0.0119329, "balance_loss_clip": 1.007231, "balance_loss_mlp": 1.00017142, "epoch": 0.7963686647026994, "flos": 18876273658560.0, "grad_norm": 1.5778883482461294, "language_loss": 0.82227409, "learning_rate": 4.192794679022895e-07, "loss": 0.84721875, "num_input_tokens_seen": 142660820, "step": 6623, "time_per_iteration": 2.792619466781616 }, { "auxiliary_loss_clip": 0.01336428, "auxiliary_loss_mlp": 0.01193243, "balance_loss_clip": 1.00788295, "balance_loss_mlp": 1.00021899, "epoch": 0.7964889075933386, "flos": 29716595907840.0, "grad_norm": 2.279360038127701, "language_loss": 0.7215305, "learning_rate": 4.1880235679025743e-07, "loss": 0.74682724, "num_input_tokens_seen": 142680915, "step": 6624, "time_per_iteration": 2.766017436981201 }, { "auxiliary_loss_clip": 0.01270872, "auxiliary_loss_mlp": 0.01193176, "balance_loss_clip": 1.00736856, "balance_loss_mlp": 1.00015211, "epoch": 0.7966091504839776, "flos": 29491469612640.0, "grad_norm": 1.9676096480244047, "language_loss": 0.63586807, "learning_rate": 4.1832548554464986e-07, "loss": 0.66050851, "num_input_tokens_seen": 142699210, "step": 6625, "time_per_iteration": 3.1047863960266113 }, { "auxiliary_loss_clip": 0.01296599, "auxiliary_loss_mlp": 0.01192277, "balance_loss_clip": 1.00443864, "balance_loss_mlp": 1.00001669, "epoch": 0.7967293933746167, "flos": 67288728702240.0, "grad_norm": 0.7411797532750531, "language_loss": 0.58833921, "learning_rate": 4.178488542378098e-07, "loss": 0.61322796, "num_input_tokens_seen": 142756790, "step": 6626, "time_per_iteration": 3.3771772384643555 }, { "auxiliary_loss_clip": 0.01350254, "auxiliary_loss_mlp": 0.0119319, "balance_loss_clip": 1.00828421, "balance_loss_mlp": 1.00016677, "epoch": 0.7968496362652558, "flos": 25554690930240.0, "grad_norm": 1.8482685379115007, "language_loss": 0.89145482, "learning_rate": 4.173724629420401e-07, "loss": 0.91688931, "num_input_tokens_seen": 142778150, "step": 6627, "time_per_iteration": 2.7438104152679443 }, { "auxiliary_loss_clip": 0.013132, "auxiliary_loss_mlp": 0.01193178, "balance_loss_clip": 1.00768077, "balance_loss_mlp": 1.0001539, "epoch": 0.7969698791558949, "flos": 14501097200160.0, "grad_norm": 2.4648872116953586, "language_loss": 0.68492627, "learning_rate": 4.168963117296087e-07, "loss": 0.70999002, "num_input_tokens_seen": 142795485, "step": 6628, "time_per_iteration": 2.886420726776123 }, { "auxiliary_loss_clip": 0.01348392, "auxiliary_loss_mlp": 0.01193181, "balance_loss_clip": 1.00815117, "balance_loss_mlp": 1.00015724, "epoch": 0.797090122046534, "flos": 22127556186720.0, "grad_norm": 2.162898880042662, "language_loss": 0.75594306, "learning_rate": 4.1642040067274876e-07, "loss": 0.78135872, "num_input_tokens_seen": 142815155, "step": 6629, "time_per_iteration": 2.754498243331909 }, { "auxiliary_loss_clip": 0.01316881, "auxiliary_loss_mlp": 0.01193075, "balance_loss_clip": 1.00729537, "balance_loss_mlp": 1.00014675, "epoch": 0.7972103649371731, "flos": 19897681033440.0, "grad_norm": 1.9983231475389929, "language_loss": 0.72382188, "learning_rate": 4.1594472984365493e-07, "loss": 0.74892151, "num_input_tokens_seen": 142833840, "step": 6630, "time_per_iteration": 2.7762351036071777 }, { "auxiliary_loss_clip": 0.0132441, "auxiliary_loss_mlp": 0.01193094, "balance_loss_clip": 1.00756502, "balance_loss_mlp": 1.00016522, "epoch": 0.7973306078278122, "flos": 36058634951040.0, "grad_norm": 1.7626433514806172, "language_loss": 0.77491218, "learning_rate": 4.154692993144862e-07, "loss": 0.80008721, "num_input_tokens_seen": 142853610, "step": 6631, "time_per_iteration": 2.8942787647247314 }, { "auxiliary_loss_clip": 0.01349088, "auxiliary_loss_mlp": 0.00872435, "balance_loss_clip": 1.00811243, "balance_loss_mlp": 1.00036156, "epoch": 0.7974508507184512, "flos": 21360611544480.0, "grad_norm": 1.8536075935098248, "language_loss": 0.71350563, "learning_rate": 4.1499410915736476e-07, "loss": 0.73572087, "num_input_tokens_seen": 142872540, "step": 6632, "time_per_iteration": 2.707076072692871 }, { "auxiliary_loss_clip": 0.01305055, "auxiliary_loss_mlp": 0.01192283, "balance_loss_clip": 1.00428414, "balance_loss_mlp": 1.00002193, "epoch": 0.7975710936090904, "flos": 68253151615200.0, "grad_norm": 0.7658710908453314, "language_loss": 0.6430282, "learning_rate": 4.145191594443762e-07, "loss": 0.66800159, "num_input_tokens_seen": 142936895, "step": 6633, "time_per_iteration": 3.5011684894561768 }, { "auxiliary_loss_clip": 0.01293239, "auxiliary_loss_mlp": 0.01193337, "balance_loss_clip": 1.00684285, "balance_loss_mlp": 1.00021791, "epoch": 0.7976913364997295, "flos": 22492444608000.0, "grad_norm": 2.100613866717931, "language_loss": 0.70384419, "learning_rate": 4.140444502475713e-07, "loss": 0.72871, "num_input_tokens_seen": 142956445, "step": 6634, "time_per_iteration": 2.8401317596435547 }, { "auxiliary_loss_clip": 0.01337465, "auxiliary_loss_mlp": 0.01193196, "balance_loss_clip": 1.00780201, "balance_loss_mlp": 1.00017226, "epoch": 0.7978115793903685, "flos": 15263228069280.0, "grad_norm": 1.7373139672137858, "language_loss": 0.69778097, "learning_rate": 4.1356998163896216e-07, "loss": 0.72308755, "num_input_tokens_seen": 142973495, "step": 6635, "time_per_iteration": 2.8514201641082764 }, { "auxiliary_loss_clip": 0.01291867, "auxiliary_loss_mlp": 0.01193197, "balance_loss_clip": 1.00658762, "balance_loss_mlp": 1.00017285, "epoch": 0.7979318222810077, "flos": 19719242313120.0, "grad_norm": 2.2414978157185073, "language_loss": 0.74875194, "learning_rate": 4.130957536905255e-07, "loss": 0.77360255, "num_input_tokens_seen": 142991510, "step": 6636, "time_per_iteration": 2.8615875244140625 }, { "auxiliary_loss_clip": 0.01307701, "auxiliary_loss_mlp": 0.01193256, "balance_loss_clip": 1.00793779, "balance_loss_mlp": 1.00023246, "epoch": 0.7980520651716467, "flos": 15560283077280.0, "grad_norm": 2.2627803150491523, "language_loss": 0.71495354, "learning_rate": 4.1262176647420134e-07, "loss": 0.73996305, "num_input_tokens_seen": 143009675, "step": 6637, "time_per_iteration": 2.838749885559082 }, { "auxiliary_loss_clip": 0.01319263, "auxiliary_loss_mlp": 0.01193157, "balance_loss_clip": 1.00748229, "balance_loss_mlp": 1.00013316, "epoch": 0.7981723080622858, "flos": 22309443580320.0, "grad_norm": 1.6322447446049013, "language_loss": 0.79752541, "learning_rate": 4.121480200618923e-07, "loss": 0.82264966, "num_input_tokens_seen": 143029330, "step": 6638, "time_per_iteration": 3.829049587249756 }, { "auxiliary_loss_clip": 0.01313391, "auxiliary_loss_mlp": 0.01193207, "balance_loss_clip": 1.00752473, "balance_loss_mlp": 1.00018299, "epoch": 0.798292550952925, "flos": 22929585055200.0, "grad_norm": 1.8024318009298566, "language_loss": 0.8013823, "learning_rate": 4.116745145254674e-07, "loss": 0.82644832, "num_input_tokens_seen": 143048865, "step": 6639, "time_per_iteration": 2.8418667316436768 }, { "auxiliary_loss_clip": 0.01273882, "auxiliary_loss_mlp": 0.01192295, "balance_loss_clip": 1.00372696, "balance_loss_mlp": 1.00003421, "epoch": 0.798412793843564, "flos": 64497972936960.0, "grad_norm": 0.7625422131759696, "language_loss": 0.58019894, "learning_rate": 4.1120124993675476e-07, "loss": 0.60486072, "num_input_tokens_seen": 143113295, "step": 6640, "time_per_iteration": 4.305471181869507 }, { "auxiliary_loss_clip": 0.0132059, "auxiliary_loss_mlp": 0.01193305, "balance_loss_clip": 1.00808907, "balance_loss_mlp": 1.00018585, "epoch": 0.7985330367342031, "flos": 13586918306400.0, "grad_norm": 1.9000370959307464, "language_loss": 0.61792588, "learning_rate": 4.107282263675498e-07, "loss": 0.6430648, "num_input_tokens_seen": 143130965, "step": 6641, "time_per_iteration": 3.766608953475952 }, { "auxiliary_loss_clip": 0.01279291, "auxiliary_loss_mlp": 0.00871618, "balance_loss_clip": 1.00479412, "balance_loss_mlp": 0.99984789, "epoch": 0.7986532796248422, "flos": 67698831827520.0, "grad_norm": 0.7609200947285221, "language_loss": 0.52512097, "learning_rate": 4.1025544388960907e-07, "loss": 0.54663014, "num_input_tokens_seen": 143192005, "step": 6642, "time_per_iteration": 4.382227659225464 }, { "auxiliary_loss_clip": 0.01326362, "auxiliary_loss_mlp": 0.01193007, "balance_loss_clip": 1.00747895, "balance_loss_mlp": 1.00017428, "epoch": 0.7987735225154813, "flos": 22455384579360.0, "grad_norm": 2.3804818924542706, "language_loss": 0.71372545, "learning_rate": 4.097829025746538e-07, "loss": 0.73891914, "num_input_tokens_seen": 143213550, "step": 6643, "time_per_iteration": 2.765674352645874 }, { "auxiliary_loss_clip": 0.01299841, "auxiliary_loss_mlp": 0.01192275, "balance_loss_clip": 1.00405812, "balance_loss_mlp": 1.00001431, "epoch": 0.7988937654061203, "flos": 68864132551680.0, "grad_norm": 0.6578328650018772, "language_loss": 0.61090159, "learning_rate": 4.0931060249436757e-07, "loss": 0.63582277, "num_input_tokens_seen": 143277390, "step": 6644, "time_per_iteration": 3.363551139831543 }, { "auxiliary_loss_clip": 0.01325045, "auxiliary_loss_mlp": 0.01193151, "balance_loss_clip": 1.00767112, "balance_loss_mlp": 1.0001272, "epoch": 0.7990140082967595, "flos": 20806902459360.0, "grad_norm": 2.9331005759222326, "language_loss": 0.69599277, "learning_rate": 4.088385437203978e-07, "loss": 0.72117472, "num_input_tokens_seen": 143294400, "step": 6645, "time_per_iteration": 2.7450709342956543 }, { "auxiliary_loss_clip": 0.01348759, "auxiliary_loss_mlp": 0.01193203, "balance_loss_clip": 1.00781679, "balance_loss_mlp": 1.00017905, "epoch": 0.7991342511873986, "flos": 18985298323680.0, "grad_norm": 2.546341142850334, "language_loss": 0.77536559, "learning_rate": 4.083667263243564e-07, "loss": 0.80078518, "num_input_tokens_seen": 143312745, "step": 6646, "time_per_iteration": 2.71484112739563 }, { "auxiliary_loss_clip": 0.01324624, "auxiliary_loss_mlp": 0.01193199, "balance_loss_clip": 1.00792336, "balance_loss_mlp": 1.000175, "epoch": 0.7992544940780376, "flos": 20816817395040.0, "grad_norm": 1.5444732769144975, "language_loss": 0.71702367, "learning_rate": 4.0789515037781653e-07, "loss": 0.74220192, "num_input_tokens_seen": 143333470, "step": 6647, "time_per_iteration": 2.871180534362793 }, { "auxiliary_loss_clip": 0.0133183, "auxiliary_loss_mlp": 0.01193173, "balance_loss_clip": 1.00761271, "balance_loss_mlp": 1.00014901, "epoch": 0.7993747369686768, "flos": 12640780546560.0, "grad_norm": 1.7443379198645546, "language_loss": 0.82827526, "learning_rate": 4.0742381595231755e-07, "loss": 0.85352528, "num_input_tokens_seen": 143350195, "step": 6648, "time_per_iteration": 2.786421775817871 }, { "auxiliary_loss_clip": 0.01278795, "auxiliary_loss_mlp": 0.01193155, "balance_loss_clip": 1.00708485, "balance_loss_mlp": 1.00013113, "epoch": 0.7994949798593158, "flos": 20078778106080.0, "grad_norm": 1.55602644798689, "language_loss": 0.78409815, "learning_rate": 4.06952723119359e-07, "loss": 0.80881763, "num_input_tokens_seen": 143370070, "step": 6649, "time_per_iteration": 2.975663185119629 }, { "auxiliary_loss_clip": 0.01311859, "auxiliary_loss_mlp": 0.01193257, "balance_loss_clip": 1.00767612, "balance_loss_mlp": 1.00023317, "epoch": 0.7996152227499549, "flos": 38654224770240.0, "grad_norm": 1.8338520075167257, "language_loss": 0.67206109, "learning_rate": 4.0648187195040504e-07, "loss": 0.6971122, "num_input_tokens_seen": 143392275, "step": 6650, "time_per_iteration": 2.8660197257995605 }, { "auxiliary_loss_clip": 0.01296595, "auxiliary_loss_mlp": 0.01192266, "balance_loss_clip": 1.00450587, "balance_loss_mlp": 1.00000501, "epoch": 0.799735465640594, "flos": 70243854314400.0, "grad_norm": 0.8142548039758775, "language_loss": 0.67617369, "learning_rate": 4.060112625168848e-07, "loss": 0.70106232, "num_input_tokens_seen": 143457385, "step": 6651, "time_per_iteration": 3.380533218383789 }, { "auxiliary_loss_clip": 0.01348407, "auxiliary_loss_mlp": 0.01193302, "balance_loss_clip": 1.00821579, "balance_loss_mlp": 1.00018275, "epoch": 0.7998557085312331, "flos": 24241006396800.0, "grad_norm": 2.299021917669132, "language_loss": 0.73686677, "learning_rate": 4.055408948901886e-07, "loss": 0.76228386, "num_input_tokens_seen": 143478785, "step": 6652, "time_per_iteration": 2.8588743209838867 }, { "auxiliary_loss_clip": 0.01331939, "auxiliary_loss_mlp": 0.01193249, "balance_loss_clip": 1.00800097, "balance_loss_mlp": 1.00022554, "epoch": 0.7999759514218722, "flos": 27564038019360.0, "grad_norm": 1.7078849220302135, "language_loss": 0.71878636, "learning_rate": 4.050707691416708e-07, "loss": 0.74403822, "num_input_tokens_seen": 143500095, "step": 6653, "time_per_iteration": 2.8645899295806885 }, { "auxiliary_loss_clip": 0.01296628, "auxiliary_loss_mlp": 0.01192271, "balance_loss_clip": 1.00443995, "balance_loss_mlp": 1.00001073, "epoch": 0.8000961943125112, "flos": 67337463926880.0, "grad_norm": 0.6726932436815302, "language_loss": 0.59769779, "learning_rate": 4.046008853426495e-07, "loss": 0.62258679, "num_input_tokens_seen": 143563410, "step": 6654, "time_per_iteration": 3.3438901901245117 }, { "auxiliary_loss_clip": 0.0129358, "auxiliary_loss_mlp": 0.01193173, "balance_loss_clip": 1.00746214, "balance_loss_mlp": 1.00014973, "epoch": 0.8002164372031504, "flos": 28733829203520.0, "grad_norm": 2.4326252063422498, "language_loss": 0.62584615, "learning_rate": 4.0413124356440464e-07, "loss": 0.65071368, "num_input_tokens_seen": 143587455, "step": 6655, "time_per_iteration": 2.8835277557373047 }, { "auxiliary_loss_clip": 0.01276196, "auxiliary_loss_mlp": 0.01193205, "balance_loss_clip": 1.00688779, "balance_loss_mlp": 1.00018096, "epoch": 0.8003366800937894, "flos": 17639439382080.0, "grad_norm": 3.214720601801505, "language_loss": 0.8200236, "learning_rate": 4.0366184387818223e-07, "loss": 0.84471756, "num_input_tokens_seen": 143605915, "step": 6656, "time_per_iteration": 2.835796356201172 }, { "auxiliary_loss_clip": 0.01351096, "auxiliary_loss_mlp": 0.01193236, "balance_loss_clip": 1.00834215, "balance_loss_mlp": 1.00021195, "epoch": 0.8004569229844285, "flos": 25995315974400.0, "grad_norm": 1.7173321231931664, "language_loss": 0.85054344, "learning_rate": 4.0319268635518797e-07, "loss": 0.87598675, "num_input_tokens_seen": 143626490, "step": 6657, "time_per_iteration": 281.8726398944855 }, { "auxiliary_loss_clip": 0.01335851, "auxiliary_loss_mlp": 0.01193213, "balance_loss_clip": 1.00785828, "balance_loss_mlp": 1.00018883, "epoch": 0.8005771658750677, "flos": 20812362858720.0, "grad_norm": 1.7390860573979097, "language_loss": 0.7509461, "learning_rate": 4.027237710665943e-07, "loss": 0.77623677, "num_input_tokens_seen": 143644955, "step": 6658, "time_per_iteration": 2.789480686187744 }, { "auxiliary_loss_clip": 0.01298407, "auxiliary_loss_mlp": 0.01193191, "balance_loss_clip": 1.0071454, "balance_loss_mlp": 1.00016737, "epoch": 0.8006974087657067, "flos": 25812638259840.0, "grad_norm": 1.7363652934316183, "language_loss": 0.6926595, "learning_rate": 4.022550980835344e-07, "loss": 0.71757549, "num_input_tokens_seen": 143667200, "step": 6659, "time_per_iteration": 2.9118881225585938 }, { "auxiliary_loss_clip": 0.01291793, "auxiliary_loss_mlp": 0.01193196, "balance_loss_clip": 1.00675619, "balance_loss_mlp": 1.00017262, "epoch": 0.8008176516563458, "flos": 17164700051040.0, "grad_norm": 1.9683079899150113, "language_loss": 0.79334426, "learning_rate": 4.017866674771051e-07, "loss": 0.81819415, "num_input_tokens_seen": 143684685, "step": 6660, "time_per_iteration": 2.8374898433685303 }, { "auxiliary_loss_clip": 0.01274775, "auxiliary_loss_mlp": 0.01193195, "balance_loss_clip": 1.00709414, "balance_loss_mlp": 1.00017107, "epoch": 0.8009378945469849, "flos": 24207323194080.0, "grad_norm": 2.101720352013415, "language_loss": 0.74519825, "learning_rate": 4.013184793183688e-07, "loss": 0.76987791, "num_input_tokens_seen": 143706780, "step": 6661, "time_per_iteration": 2.9379935264587402 }, { "auxiliary_loss_clip": 0.01335443, "auxiliary_loss_mlp": 0.01193104, "balance_loss_clip": 1.00768614, "balance_loss_mlp": 1.00017548, "epoch": 0.801058137437624, "flos": 19787327192160.0, "grad_norm": 1.6350202857532001, "language_loss": 0.72574687, "learning_rate": 4.008505336783472e-07, "loss": 0.75103223, "num_input_tokens_seen": 143724505, "step": 6662, "time_per_iteration": 2.761308431625366 }, { "auxiliary_loss_clip": 0.01334554, "auxiliary_loss_mlp": 0.01193155, "balance_loss_clip": 1.00802112, "balance_loss_mlp": 1.00013089, "epoch": 0.801178380328263, "flos": 18659409809760.0, "grad_norm": 1.745994522780404, "language_loss": 0.80714428, "learning_rate": 4.003828306280284e-07, "loss": 0.83242142, "num_input_tokens_seen": 143742180, "step": 6663, "time_per_iteration": 2.800527334213257 }, { "auxiliary_loss_clip": 0.01319984, "auxiliary_loss_mlp": 0.01193187, "balance_loss_clip": 1.00812042, "balance_loss_mlp": 1.00016356, "epoch": 0.8012986232189022, "flos": 15706583313120.0, "grad_norm": 1.6244781118559468, "language_loss": 0.78093696, "learning_rate": 3.999153702383626e-07, "loss": 0.80606866, "num_input_tokens_seen": 143760070, "step": 6664, "time_per_iteration": 3.696491003036499 }, { "auxiliary_loss_clip": 0.01330591, "auxiliary_loss_mlp": 0.01193173, "balance_loss_clip": 1.00735462, "balance_loss_mlp": 1.00014973, "epoch": 0.8014188661095413, "flos": 28584152141760.0, "grad_norm": 1.7420442983002542, "language_loss": 0.73762262, "learning_rate": 3.9944815258026263e-07, "loss": 0.7628603, "num_input_tokens_seen": 143781890, "step": 6665, "time_per_iteration": 2.798107624053955 }, { "auxiliary_loss_clip": 0.01331405, "auxiliary_loss_mlp": 0.01193261, "balance_loss_clip": 1.00763941, "balance_loss_mlp": 1.00014186, "epoch": 0.8015391090001803, "flos": 29310372540000.0, "grad_norm": 1.6247590159131322, "language_loss": 0.82676739, "learning_rate": 3.989811777246057e-07, "loss": 0.85201406, "num_input_tokens_seen": 143802060, "step": 6666, "time_per_iteration": 3.7705066204071045 }, { "auxiliary_loss_clip": 0.01317557, "auxiliary_loss_mlp": 0.01192276, "balance_loss_clip": 1.004215, "balance_loss_mlp": 1.00001478, "epoch": 0.8016593518908195, "flos": 70397375209920.0, "grad_norm": 0.8481167608162855, "language_loss": 0.66262853, "learning_rate": 3.985144457422305e-07, "loss": 0.68772686, "num_input_tokens_seen": 143856345, "step": 6667, "time_per_iteration": 5.01051664352417 }, { "auxiliary_loss_clip": 0.01349374, "auxiliary_loss_mlp": 0.01193188, "balance_loss_clip": 1.00795376, "balance_loss_mlp": 1.00016403, "epoch": 0.8017795947814585, "flos": 26026125282720.0, "grad_norm": 1.8595886933409251, "language_loss": 0.76612151, "learning_rate": 3.9804795670394096e-07, "loss": 0.79154712, "num_input_tokens_seen": 143876470, "step": 6668, "time_per_iteration": 2.7504444122314453 }, { "auxiliary_loss_clip": 0.01314703, "auxiliary_loss_mlp": 0.01193096, "balance_loss_clip": 1.00742006, "balance_loss_mlp": 1.00016809, "epoch": 0.8018998376720976, "flos": 22087191179520.0, "grad_norm": 1.446422679496165, "language_loss": 0.70581037, "learning_rate": 3.975817106805022e-07, "loss": 0.73088837, "num_input_tokens_seen": 143895170, "step": 6669, "time_per_iteration": 2.86439847946167 }, { "auxiliary_loss_clip": 0.01289413, "auxiliary_loss_mlp": 0.0119323, "balance_loss_clip": 1.00696802, "balance_loss_mlp": 1.00020623, "epoch": 0.8020200805627368, "flos": 34568559347040.0, "grad_norm": 2.617913368073799, "language_loss": 0.64594078, "learning_rate": 3.97115707742645e-07, "loss": 0.67076719, "num_input_tokens_seen": 143915845, "step": 6670, "time_per_iteration": 2.9573678970336914 }, { "auxiliary_loss_clip": 0.01311634, "auxiliary_loss_mlp": 0.01193205, "balance_loss_clip": 1.00768733, "balance_loss_mlp": 1.00018179, "epoch": 0.8021403234533758, "flos": 20120364518400.0, "grad_norm": 1.8361573730539744, "language_loss": 0.64753389, "learning_rate": 3.966499479610599e-07, "loss": 0.67258227, "num_input_tokens_seen": 143933940, "step": 6671, "time_per_iteration": 2.7713544368743896 }, { "auxiliary_loss_clip": 0.01283761, "auxiliary_loss_mlp": 0.01193205, "balance_loss_clip": 1.00691843, "balance_loss_mlp": 1.00018096, "epoch": 0.8022605663440149, "flos": 27746212802400.0, "grad_norm": 1.5935925632204462, "language_loss": 0.64547932, "learning_rate": 3.9618443140640225e-07, "loss": 0.67024899, "num_input_tokens_seen": 143952850, "step": 6672, "time_per_iteration": 2.8923628330230713 }, { "auxiliary_loss_clip": 0.01243912, "auxiliary_loss_mlp": 0.01192274, "balance_loss_clip": 1.00308156, "balance_loss_mlp": 1.00001335, "epoch": 0.802380809234654, "flos": 60245027848800.0, "grad_norm": 0.6878509244838943, "language_loss": 0.51395267, "learning_rate": 3.957191581492918e-07, "loss": 0.53831452, "num_input_tokens_seen": 144013610, "step": 6673, "time_per_iteration": 3.407155752182007 }, { "auxiliary_loss_clip": 0.013125, "auxiliary_loss_mlp": 0.0119326, "balance_loss_clip": 1.00764275, "balance_loss_mlp": 1.00023639, "epoch": 0.8025010521252931, "flos": 15080729973120.0, "grad_norm": 2.684235567948292, "language_loss": 0.70951581, "learning_rate": 3.952541282603097e-07, "loss": 0.73457336, "num_input_tokens_seen": 144028715, "step": 6674, "time_per_iteration": 2.7485575675964355 }, { "auxiliary_loss_clip": 0.01323167, "auxiliary_loss_mlp": 0.01193199, "balance_loss_clip": 1.00769448, "balance_loss_mlp": 1.00017524, "epoch": 0.8026212950159322, "flos": 22163538504960.0, "grad_norm": 1.66398088747601, "language_loss": 0.83273727, "learning_rate": 3.9478934181000013e-07, "loss": 0.85790092, "num_input_tokens_seen": 144048740, "step": 6675, "time_per_iteration": 2.7434024810791016 }, { "auxiliary_loss_clip": 0.01349424, "auxiliary_loss_mlp": 0.01193214, "balance_loss_clip": 1.00807428, "balance_loss_mlp": 1.00019073, "epoch": 0.8027415379065713, "flos": 17675996479200.0, "grad_norm": 1.991549788170689, "language_loss": 0.83858073, "learning_rate": 3.943247988688714e-07, "loss": 0.86400717, "num_input_tokens_seen": 144067435, "step": 6676, "time_per_iteration": 2.797318696975708 }, { "auxiliary_loss_clip": 0.01334677, "auxiliary_loss_mlp": 0.01193209, "balance_loss_clip": 1.00745809, "balance_loss_mlp": 1.00018537, "epoch": 0.8028617807972104, "flos": 21979603461600.0, "grad_norm": 1.618400182746592, "language_loss": 0.71894228, "learning_rate": 3.938604995073933e-07, "loss": 0.74422121, "num_input_tokens_seen": 144085905, "step": 6677, "time_per_iteration": 2.858009099960327 }, { "auxiliary_loss_clip": 0.01315944, "auxiliary_loss_mlp": 0.01193178, "balance_loss_clip": 1.00817037, "balance_loss_mlp": 1.00015402, "epoch": 0.8029820236878494, "flos": 26428468893120.0, "grad_norm": 1.7366161602513959, "language_loss": 0.65587389, "learning_rate": 3.9339644379600157e-07, "loss": 0.68096513, "num_input_tokens_seen": 144105735, "step": 6678, "time_per_iteration": 2.8432483673095703 }, { "auxiliary_loss_clip": 0.01320714, "auxiliary_loss_mlp": 0.01193122, "balance_loss_clip": 1.00871432, "balance_loss_mlp": 1.00019336, "epoch": 0.8031022665784886, "flos": 17676499410720.0, "grad_norm": 1.941803441546255, "language_loss": 0.71237624, "learning_rate": 3.929326318050907e-07, "loss": 0.73751456, "num_input_tokens_seen": 144123405, "step": 6679, "time_per_iteration": 2.7380247116088867 }, { "auxiliary_loss_clip": 0.01347954, "auxiliary_loss_mlp": 0.01193065, "balance_loss_clip": 1.00752449, "balance_loss_mlp": 1.00013709, "epoch": 0.8032225094691277, "flos": 15450288472800.0, "grad_norm": 1.8435032159359717, "language_loss": 0.78839207, "learning_rate": 3.924690636050225e-07, "loss": 0.8138023, "num_input_tokens_seen": 144140815, "step": 6680, "time_per_iteration": 2.6461079120635986 }, { "auxiliary_loss_clip": 0.01328092, "auxiliary_loss_mlp": 0.01193189, "balance_loss_clip": 1.00778592, "balance_loss_mlp": 1.00016499, "epoch": 0.8033427523597667, "flos": 26179215094080.0, "grad_norm": 2.4015629530873603, "language_loss": 0.72779489, "learning_rate": 3.9200573926611915e-07, "loss": 0.75300765, "num_input_tokens_seen": 144162230, "step": 6681, "time_per_iteration": 2.847630023956299 }, { "auxiliary_loss_clip": 0.01325463, "auxiliary_loss_mlp": 0.01193192, "balance_loss_clip": 1.00826836, "balance_loss_mlp": 1.00016832, "epoch": 0.8034629952504058, "flos": 21324916615680.0, "grad_norm": 1.8419644886910482, "language_loss": 0.72782505, "learning_rate": 3.9154265885866613e-07, "loss": 0.75301158, "num_input_tokens_seen": 144181540, "step": 6682, "time_per_iteration": 2.7025070190429688 }, { "auxiliary_loss_clip": 0.01323123, "auxiliary_loss_mlp": 0.01193312, "balance_loss_clip": 1.00774598, "balance_loss_mlp": 1.00019312, "epoch": 0.8035832381410449, "flos": 21651595450560.0, "grad_norm": 2.884027484477081, "language_loss": 0.74764651, "learning_rate": 3.9107982245291394e-07, "loss": 0.77281082, "num_input_tokens_seen": 144199665, "step": 6683, "time_per_iteration": 2.8045811653137207 }, { "auxiliary_loss_clip": 0.01282005, "auxiliary_loss_mlp": 0.01193173, "balance_loss_clip": 1.00701368, "balance_loss_mlp": 1.00014937, "epoch": 0.803703481031684, "flos": 20518828371360.0, "grad_norm": 1.9788077862225735, "language_loss": 0.77195418, "learning_rate": 3.9061723011907245e-07, "loss": 0.79670596, "num_input_tokens_seen": 144219020, "step": 6684, "time_per_iteration": 2.811887502670288 }, { "auxiliary_loss_clip": 0.01315316, "auxiliary_loss_mlp": 0.01193144, "balance_loss_clip": 1.00742066, "balance_loss_mlp": 1.00012064, "epoch": 0.803823723922323, "flos": 22854818371680.0, "grad_norm": 1.7380473961517926, "language_loss": 0.79223388, "learning_rate": 3.901548819273179e-07, "loss": 0.8173185, "num_input_tokens_seen": 144239035, "step": 6685, "time_per_iteration": 2.834625720977783 }, { "auxiliary_loss_clip": 0.01327525, "auxiliary_loss_mlp": 0.01193213, "balance_loss_clip": 1.00800455, "balance_loss_mlp": 1.00018954, "epoch": 0.8039439668129622, "flos": 21362156262720.0, "grad_norm": 1.9040587187587927, "language_loss": 0.69467044, "learning_rate": 3.896927779477881e-07, "loss": 0.71987784, "num_input_tokens_seen": 144258295, "step": 6686, "time_per_iteration": 2.7306981086730957 }, { "auxiliary_loss_clip": 0.0129605, "auxiliary_loss_mlp": 0.01192975, "balance_loss_clip": 1.00694823, "balance_loss_mlp": 1.00014162, "epoch": 0.8040642097036013, "flos": 23802393078720.0, "grad_norm": 2.176053487468814, "language_loss": 0.66989595, "learning_rate": 3.892309182505833e-07, "loss": 0.69478619, "num_input_tokens_seen": 144276110, "step": 6687, "time_per_iteration": 2.7921030521392822 }, { "auxiliary_loss_clip": 0.01349153, "auxiliary_loss_mlp": 0.0119322, "balance_loss_clip": 1.00792849, "balance_loss_mlp": 1.00019598, "epoch": 0.8041844525942403, "flos": 25922058085440.0, "grad_norm": 2.2292397998952542, "language_loss": 0.85923177, "learning_rate": 3.887693029057675e-07, "loss": 0.88465548, "num_input_tokens_seen": 144295620, "step": 6688, "time_per_iteration": 2.766742706298828 }, { "auxiliary_loss_clip": 0.01313923, "auxiliary_loss_mlp": 0.01193152, "balance_loss_clip": 1.00740385, "balance_loss_mlp": 1.00012803, "epoch": 0.8043046954848795, "flos": 25191131685120.0, "grad_norm": 1.5817051163715057, "language_loss": 0.80933177, "learning_rate": 3.8830793198336684e-07, "loss": 0.83440256, "num_input_tokens_seen": 144315210, "step": 6689, "time_per_iteration": 2.782136917114258 }, { "auxiliary_loss_clip": 0.0132155, "auxiliary_loss_mlp": 0.01193321, "balance_loss_clip": 1.00792658, "balance_loss_mlp": 1.00020218, "epoch": 0.8044249383755185, "flos": 41719201292160.0, "grad_norm": 1.9580282875521473, "language_loss": 0.69946921, "learning_rate": 3.878468055533721e-07, "loss": 0.7246179, "num_input_tokens_seen": 144337750, "step": 6690, "time_per_iteration": 2.954235553741455 }, { "auxiliary_loss_clip": 0.01291123, "auxiliary_loss_mlp": 0.01193212, "balance_loss_clip": 1.00658953, "balance_loss_mlp": 1.00018787, "epoch": 0.8045451812661576, "flos": 20631445404480.0, "grad_norm": 2.3016795590997434, "language_loss": 0.84754825, "learning_rate": 3.8738592368573464e-07, "loss": 0.87239158, "num_input_tokens_seen": 144355305, "step": 6691, "time_per_iteration": 3.7895283699035645 }, { "auxiliary_loss_clip": 0.01288755, "auxiliary_loss_mlp": 0.01193214, "balance_loss_clip": 1.00730324, "balance_loss_mlp": 1.00019062, "epoch": 0.8046654241567968, "flos": 29711818058400.0, "grad_norm": 1.9633601156420242, "language_loss": 0.8803975, "learning_rate": 3.8692528645037137e-07, "loss": 0.90521723, "num_input_tokens_seen": 144374485, "step": 6692, "time_per_iteration": 3.831451177597046 }, { "auxiliary_loss_clip": 0.01348484, "auxiliary_loss_mlp": 0.01193203, "balance_loss_clip": 1.00771153, "balance_loss_mlp": 1.00017893, "epoch": 0.8047856670474358, "flos": 17671398248160.0, "grad_norm": 4.821438719374943, "language_loss": 0.77724743, "learning_rate": 3.8646489391715907e-07, "loss": 0.80266434, "num_input_tokens_seen": 144388780, "step": 6693, "time_per_iteration": 4.543667554855347 }, { "auxiliary_loss_clip": 0.01310067, "auxiliary_loss_mlp": 0.0119324, "balance_loss_clip": 1.00777888, "balance_loss_mlp": 1.00021672, "epoch": 0.8049059099380749, "flos": 17120706752160.0, "grad_norm": 2.2765757303911904, "language_loss": 0.87998337, "learning_rate": 3.8600474615593903e-07, "loss": 0.90501642, "num_input_tokens_seen": 144403395, "step": 6694, "time_per_iteration": 2.741328716278076 }, { "auxiliary_loss_clip": 0.01272562, "auxiliary_loss_mlp": 0.01192267, "balance_loss_clip": 1.0040822, "balance_loss_mlp": 1.00000596, "epoch": 0.805026152828714, "flos": 62212932220320.0, "grad_norm": 0.7837363889468398, "language_loss": 0.59695524, "learning_rate": 3.8554484323651605e-07, "loss": 0.62160349, "num_input_tokens_seen": 144465265, "step": 6695, "time_per_iteration": 3.4015560150146484 }, { "auxiliary_loss_clip": 0.01325343, "auxiliary_loss_mlp": 0.00872538, "balance_loss_clip": 1.00741899, "balance_loss_mlp": 1.00041223, "epoch": 0.8051463957193531, "flos": 21688619555520.0, "grad_norm": 1.4415047180079479, "language_loss": 0.79315507, "learning_rate": 3.85085185228657e-07, "loss": 0.81513393, "num_input_tokens_seen": 144484235, "step": 6696, "time_per_iteration": 2.724489688873291 }, { "auxiliary_loss_clip": 0.01312936, "auxiliary_loss_mlp": 0.01193218, "balance_loss_clip": 1.00769877, "balance_loss_mlp": 1.00019431, "epoch": 0.8052666386099921, "flos": 32051472274080.0, "grad_norm": 1.6583212335815953, "language_loss": 0.73105061, "learning_rate": 3.8462577220209114e-07, "loss": 0.7561121, "num_input_tokens_seen": 144504610, "step": 6697, "time_per_iteration": 3.0046937465667725 }, { "auxiliary_loss_clip": 0.01317321, "auxiliary_loss_mlp": 0.01192275, "balance_loss_clip": 1.00406396, "balance_loss_mlp": 1.00001431, "epoch": 0.8053868815006313, "flos": 67157911572480.0, "grad_norm": 0.7071687875916194, "language_loss": 0.590087, "learning_rate": 3.8416660422651127e-07, "loss": 0.615183, "num_input_tokens_seen": 144574260, "step": 6698, "time_per_iteration": 3.298621892929077 }, { "auxiliary_loss_clip": 0.01293445, "auxiliary_loss_mlp": 0.0119321, "balance_loss_clip": 1.00684798, "balance_loss_mlp": 1.00018656, "epoch": 0.8055071243912704, "flos": 23837010297120.0, "grad_norm": 1.7435564068732834, "language_loss": 0.67972541, "learning_rate": 3.837076813715723e-07, "loss": 0.70459199, "num_input_tokens_seen": 144594145, "step": 6699, "time_per_iteration": 2.857640027999878 }, { "auxiliary_loss_clip": 0.01314697, "auxiliary_loss_mlp": 0.01193336, "balance_loss_clip": 1.00914621, "balance_loss_mlp": 1.00021648, "epoch": 0.8056273672819094, "flos": 21324521455200.0, "grad_norm": 1.6732052470687797, "language_loss": 0.75105059, "learning_rate": 3.832490037068941e-07, "loss": 0.77613086, "num_input_tokens_seen": 144612935, "step": 6700, "time_per_iteration": 2.799647808074951 }, { "auxiliary_loss_clip": 0.01245693, "auxiliary_loss_mlp": 0.0119323, "balance_loss_clip": 1.00607753, "balance_loss_mlp": 1.00020671, "epoch": 0.8057476101725486, "flos": 25768393495200.0, "grad_norm": 1.8991713087627642, "language_loss": 0.76001203, "learning_rate": 3.827905713020554e-07, "loss": 0.7844013, "num_input_tokens_seen": 144630580, "step": 6701, "time_per_iteration": 2.918341636657715 }, { "auxiliary_loss_clip": 0.01314567, "auxiliary_loss_mlp": 0.01193464, "balance_loss_clip": 1.00863051, "balance_loss_mlp": 1.00024951, "epoch": 0.8058678530631876, "flos": 24535295281440.0, "grad_norm": 1.9735978207003497, "language_loss": 0.6868872, "learning_rate": 3.823323842266017e-07, "loss": 0.71196753, "num_input_tokens_seen": 144649975, "step": 6702, "time_per_iteration": 2.8400819301605225 }, { "auxiliary_loss_clip": 0.01336353, "auxiliary_loss_mlp": 0.011933, "balance_loss_clip": 1.00765014, "balance_loss_mlp": 1.00018144, "epoch": 0.8059880959538267, "flos": 24753739772160.0, "grad_norm": 4.173596169935737, "language_loss": 0.73171127, "learning_rate": 3.818744425500393e-07, "loss": 0.75700784, "num_input_tokens_seen": 144667990, "step": 6703, "time_per_iteration": 2.715764284133911 }, { "auxiliary_loss_clip": 0.01300282, "auxiliary_loss_mlp": 0.01193185, "balance_loss_clip": 1.00721872, "balance_loss_mlp": 1.00016141, "epoch": 0.8061083388444659, "flos": 22196359539360.0, "grad_norm": 1.7151555595582582, "language_loss": 0.80644292, "learning_rate": 3.8141674634183675e-07, "loss": 0.83137757, "num_input_tokens_seen": 144687020, "step": 6704, "time_per_iteration": 2.866502046585083 }, { "auxiliary_loss_clip": 0.0126306, "auxiliary_loss_mlp": 0.01193218, "balance_loss_clip": 1.00626898, "balance_loss_mlp": 1.00019431, "epoch": 0.8062285817351049, "flos": 30044208758400.0, "grad_norm": 1.751798222833291, "language_loss": 0.66518843, "learning_rate": 3.809592956714278e-07, "loss": 0.68975115, "num_input_tokens_seen": 144710255, "step": 6705, "time_per_iteration": 2.86806321144104 }, { "auxiliary_loss_clip": 0.01337584, "auxiliary_loss_mlp": 0.01193342, "balance_loss_clip": 1.0087862, "balance_loss_mlp": 1.00022292, "epoch": 0.806348824625744, "flos": 22782602269440.0, "grad_norm": 2.2133338695605858, "language_loss": 0.74468982, "learning_rate": 3.805020906082057e-07, "loss": 0.76999903, "num_input_tokens_seen": 144728830, "step": 6706, "time_per_iteration": 2.778662919998169 }, { "auxiliary_loss_clip": 0.01308869, "auxiliary_loss_mlp": 0.01193291, "balance_loss_clip": 1.00720787, "balance_loss_mlp": 1.00026751, "epoch": 0.8064690675163831, "flos": 23404611775680.0, "grad_norm": 2.153686194856596, "language_loss": 0.81014156, "learning_rate": 3.8004513122152917e-07, "loss": 0.83516324, "num_input_tokens_seen": 144747140, "step": 6707, "time_per_iteration": 2.745272159576416 }, { "auxiliary_loss_clip": 0.01312642, "auxiliary_loss_mlp": 0.01193185, "balance_loss_clip": 1.00857079, "balance_loss_mlp": 1.00016177, "epoch": 0.8065893104070222, "flos": 24060915187200.0, "grad_norm": 1.8050790073642886, "language_loss": 0.67020822, "learning_rate": 3.79588417580718e-07, "loss": 0.69526649, "num_input_tokens_seen": 144765250, "step": 6708, "time_per_iteration": 2.8395583629608154 }, { "auxiliary_loss_clip": 0.01330925, "auxiliary_loss_mlp": 0.01193176, "balance_loss_clip": 1.00775766, "balance_loss_mlp": 1.00015259, "epoch": 0.8067095532976613, "flos": 22305420128160.0, "grad_norm": 9.387583698603724, "language_loss": 0.76684493, "learning_rate": 3.791319497550558e-07, "loss": 0.79208595, "num_input_tokens_seen": 144783080, "step": 6709, "time_per_iteration": 2.7367775440216064 }, { "auxiliary_loss_clip": 0.01281241, "auxiliary_loss_mlp": 0.0087242, "balance_loss_clip": 1.00731516, "balance_loss_mlp": 1.000337, "epoch": 0.8068297961883004, "flos": 17129508053760.0, "grad_norm": 2.0835404496355343, "language_loss": 0.70789909, "learning_rate": 3.78675727813788e-07, "loss": 0.72943568, "num_input_tokens_seen": 144800645, "step": 6710, "time_per_iteration": 2.7374422550201416 }, { "auxiliary_loss_clip": 0.01302428, "auxiliary_loss_mlp": 0.01193207, "balance_loss_clip": 1.00668383, "balance_loss_mlp": 1.00018287, "epoch": 0.8069500390789395, "flos": 22018854834720.0, "grad_norm": 1.5963868754645603, "language_loss": 0.73476064, "learning_rate": 3.782197518261225e-07, "loss": 0.75971699, "num_input_tokens_seen": 144820085, "step": 6711, "time_per_iteration": 2.781787633895874 }, { "auxiliary_loss_clip": 0.01317883, "auxiliary_loss_mlp": 0.01193214, "balance_loss_clip": 1.00788331, "balance_loss_mlp": 1.0001905, "epoch": 0.8070702819695785, "flos": 19244251516320.0, "grad_norm": 1.9475324562498721, "language_loss": 0.95399034, "learning_rate": 3.777640218612319e-07, "loss": 0.97910136, "num_input_tokens_seen": 144838070, "step": 6712, "time_per_iteration": 2.7352757453918457 }, { "auxiliary_loss_clip": 0.01321671, "auxiliary_loss_mlp": 0.01193167, "balance_loss_clip": 1.00751579, "balance_loss_mlp": 1.00014329, "epoch": 0.8071905248602176, "flos": 21544331045760.0, "grad_norm": 1.9031398415389214, "language_loss": 0.72293293, "learning_rate": 3.773085379882488e-07, "loss": 0.74808127, "num_input_tokens_seen": 144857125, "step": 6713, "time_per_iteration": 2.7140820026397705 }, { "auxiliary_loss_clip": 0.0133694, "auxiliary_loss_mlp": 0.00872549, "balance_loss_clip": 1.00810933, "balance_loss_mlp": 1.00041592, "epoch": 0.8073107677508568, "flos": 37268324134560.0, "grad_norm": 2.222814118811307, "language_loss": 0.75855953, "learning_rate": 3.768533002762715e-07, "loss": 0.78065443, "num_input_tokens_seen": 144880660, "step": 6714, "time_per_iteration": 2.824284791946411 }, { "auxiliary_loss_clip": 0.01322573, "auxiliary_loss_mlp": 0.01193194, "balance_loss_clip": 1.00763941, "balance_loss_mlp": 1.00017011, "epoch": 0.8074310106414958, "flos": 28366282429920.0, "grad_norm": 1.6680831780279894, "language_loss": 0.7672739, "learning_rate": 3.763983087943572e-07, "loss": 0.79243159, "num_input_tokens_seen": 144900050, "step": 6715, "time_per_iteration": 2.8526039123535156 }, { "auxiliary_loss_clip": 0.01336559, "auxiliary_loss_mlp": 0.00872481, "balance_loss_clip": 1.0078814, "balance_loss_mlp": 1.00039363, "epoch": 0.8075512535321349, "flos": 24281658793440.0, "grad_norm": 1.6759957517671205, "language_loss": 0.80981576, "learning_rate": 3.759435636115282e-07, "loss": 0.8319062, "num_input_tokens_seen": 144920835, "step": 6716, "time_per_iteration": 2.684108018875122 }, { "auxiliary_loss_clip": 0.01251965, "auxiliary_loss_mlp": 0.00872461, "balance_loss_clip": 1.00772345, "balance_loss_mlp": 1.00040388, "epoch": 0.807671496422774, "flos": 26030867208480.0, "grad_norm": 1.7175565213571082, "language_loss": 0.73124254, "learning_rate": 3.7548906479676967e-07, "loss": 0.75248682, "num_input_tokens_seen": 144940430, "step": 6717, "time_per_iteration": 3.86669921875 }, { "auxiliary_loss_clip": 0.01336158, "auxiliary_loss_mlp": 0.01193194, "balance_loss_clip": 1.00773144, "balance_loss_mlp": 1.00017023, "epoch": 0.8077917393134131, "flos": 23730751755360.0, "grad_norm": 2.181828961326291, "language_loss": 0.71548074, "learning_rate": 3.7503481241902855e-07, "loss": 0.74077427, "num_input_tokens_seen": 144960405, "step": 6718, "time_per_iteration": 2.714203119277954 }, { "auxiliary_loss_clip": 0.01311964, "auxiliary_loss_mlp": 0.00872463, "balance_loss_clip": 1.00741398, "balance_loss_mlp": 1.00047076, "epoch": 0.8079119822040521, "flos": 18402037259040.0, "grad_norm": 1.5962560863856516, "language_loss": 0.80538315, "learning_rate": 3.745808065472145e-07, "loss": 0.82722735, "num_input_tokens_seen": 144977700, "step": 6719, "time_per_iteration": 4.646473407745361 }, { "auxiliary_loss_clip": 0.01324498, "auxiliary_loss_mlp": 0.01193168, "balance_loss_clip": 1.00881219, "balance_loss_mlp": 1.00014484, "epoch": 0.8080322250946913, "flos": 23621798937600.0, "grad_norm": 1.927299823164865, "language_loss": 0.75767404, "learning_rate": 3.741270472501994e-07, "loss": 0.78285068, "num_input_tokens_seen": 144998340, "step": 6720, "time_per_iteration": 2.705385208129883 }, { "auxiliary_loss_clip": 0.01302628, "auxiliary_loss_mlp": 0.01193175, "balance_loss_clip": 1.00684714, "balance_loss_mlp": 1.00015163, "epoch": 0.8081524679853304, "flos": 22820704084800.0, "grad_norm": 1.5646794090491143, "language_loss": 0.7271238, "learning_rate": 3.736735345968183e-07, "loss": 0.75208187, "num_input_tokens_seen": 145017950, "step": 6721, "time_per_iteration": 2.801032543182373 }, { "auxiliary_loss_clip": 0.01334632, "auxiliary_loss_mlp": 0.01193203, "balance_loss_clip": 1.00778961, "balance_loss_mlp": 1.00017953, "epoch": 0.8082727108759694, "flos": 17640014160960.0, "grad_norm": 1.675073494859794, "language_loss": 0.79013222, "learning_rate": 3.7322026865586986e-07, "loss": 0.81541055, "num_input_tokens_seen": 145036985, "step": 6722, "time_per_iteration": 2.733677625656128 }, { "auxiliary_loss_clip": 0.0133816, "auxiliary_loss_mlp": 0.0119316, "balance_loss_clip": 1.00883806, "balance_loss_mlp": 1.00013649, "epoch": 0.8083929537666086, "flos": 25958184098400.0, "grad_norm": 1.8641011922434683, "language_loss": 0.73335922, "learning_rate": 3.7276724949611206e-07, "loss": 0.75867242, "num_input_tokens_seen": 145057095, "step": 6723, "time_per_iteration": 2.9239561557769775 }, { "auxiliary_loss_clip": 0.01308719, "auxiliary_loss_mlp": 0.0119321, "balance_loss_clip": 1.00749576, "balance_loss_mlp": 1.00018656, "epoch": 0.8085131966572476, "flos": 27089190917280.0, "grad_norm": 2.501937412222123, "language_loss": 0.74785864, "learning_rate": 3.723144771862694e-07, "loss": 0.77287793, "num_input_tokens_seen": 145077735, "step": 6724, "time_per_iteration": 2.824190855026245 }, { "auxiliary_loss_clip": 0.01294046, "auxiliary_loss_mlp": 0.01193232, "balance_loss_clip": 1.00647938, "balance_loss_mlp": 1.0002079, "epoch": 0.8086334395478867, "flos": 23988555390240.0, "grad_norm": 1.541084689756221, "language_loss": 0.76713061, "learning_rate": 3.718619517950263e-07, "loss": 0.79200339, "num_input_tokens_seen": 145098330, "step": 6725, "time_per_iteration": 2.9453907012939453 }, { "auxiliary_loss_clip": 0.01348996, "auxiliary_loss_mlp": 0.01193184, "balance_loss_clip": 1.0084939, "balance_loss_mlp": 1.00016081, "epoch": 0.8087536824385259, "flos": 20405888025120.0, "grad_norm": 1.8230928489598839, "language_loss": 0.76986575, "learning_rate": 3.714096733910301e-07, "loss": 0.79528761, "num_input_tokens_seen": 145115855, "step": 6726, "time_per_iteration": 2.761967658996582 }, { "auxiliary_loss_clip": 0.01322736, "auxiliary_loss_mlp": 0.01193217, "balance_loss_clip": 1.0085299, "balance_loss_mlp": 1.00019324, "epoch": 0.8088739253291649, "flos": 25919651198880.0, "grad_norm": 1.9578085296814225, "language_loss": 0.7007103, "learning_rate": 3.709576420428926e-07, "loss": 0.72586989, "num_input_tokens_seen": 145136655, "step": 6727, "time_per_iteration": 2.6989452838897705 }, { "auxiliary_loss_clip": 0.01319578, "auxiliary_loss_mlp": 0.01193167, "balance_loss_clip": 1.00716138, "balance_loss_mlp": 1.00014305, "epoch": 0.808994168219804, "flos": 28402085129760.0, "grad_norm": 3.2634601509457926, "language_loss": 0.73488659, "learning_rate": 3.7050585781918463e-07, "loss": 0.76001406, "num_input_tokens_seen": 145156955, "step": 6728, "time_per_iteration": 2.796954393386841 }, { "auxiliary_loss_clip": 0.01331528, "auxiliary_loss_mlp": 0.01193191, "balance_loss_clip": 1.0075779, "balance_loss_mlp": 1.00016725, "epoch": 0.8091144111104431, "flos": 17421066738720.0, "grad_norm": 2.3657726115644135, "language_loss": 0.69272584, "learning_rate": 3.700543207884428e-07, "loss": 0.71797299, "num_input_tokens_seen": 145173865, "step": 6729, "time_per_iteration": 2.652329206466675 }, { "auxiliary_loss_clip": 0.01326401, "auxiliary_loss_mlp": 0.01193059, "balance_loss_clip": 1.00700629, "balance_loss_mlp": 1.00013041, "epoch": 0.8092346540010822, "flos": 32153815134720.0, "grad_norm": 1.8663258035763657, "language_loss": 0.71006477, "learning_rate": 3.6960303101916466e-07, "loss": 0.73525935, "num_input_tokens_seen": 145193780, "step": 6730, "time_per_iteration": 2.8199269771575928 }, { "auxiliary_loss_clip": 0.0131717, "auxiliary_loss_mlp": 0.00871818, "balance_loss_clip": 1.00399792, "balance_loss_mlp": 1.0000428, "epoch": 0.8093548968917212, "flos": 58035126261600.0, "grad_norm": 0.7416849665692606, "language_loss": 0.55589139, "learning_rate": 3.6915198857981047e-07, "loss": 0.57778132, "num_input_tokens_seen": 145258980, "step": 6731, "time_per_iteration": 3.3130252361297607 }, { "auxiliary_loss_clip": 0.01300605, "auxiliary_loss_mlp": 0.01193232, "balance_loss_clip": 1.00813425, "balance_loss_mlp": 1.00020814, "epoch": 0.8094751397823604, "flos": 27381611770560.0, "grad_norm": 1.6761106474982126, "language_loss": 0.68164587, "learning_rate": 3.687011935388027e-07, "loss": 0.70658427, "num_input_tokens_seen": 145281875, "step": 6732, "time_per_iteration": 2.87465763092041 }, { "auxiliary_loss_clip": 0.01327755, "auxiliary_loss_mlp": 0.0119317, "balance_loss_clip": 1.00762105, "balance_loss_mlp": 1.00014615, "epoch": 0.8095953826729995, "flos": 24061094805600.0, "grad_norm": 1.7386044891557944, "language_loss": 0.73059624, "learning_rate": 3.6825064596452646e-07, "loss": 0.75580549, "num_input_tokens_seen": 145302220, "step": 6733, "time_per_iteration": 2.904653787612915 }, { "auxiliary_loss_clip": 0.01335567, "auxiliary_loss_mlp": 0.01193227, "balance_loss_clip": 1.00784802, "balance_loss_mlp": 1.00020289, "epoch": 0.8097156255636385, "flos": 23951423514240.0, "grad_norm": 1.6458004963732937, "language_loss": 0.70536458, "learning_rate": 3.678003459253305e-07, "loss": 0.73065257, "num_input_tokens_seen": 145323070, "step": 6734, "time_per_iteration": 2.742506742477417 }, { "auxiliary_loss_clip": 0.01289442, "auxiliary_loss_mlp": 0.01193207, "balance_loss_clip": 1.0073148, "balance_loss_mlp": 1.00018334, "epoch": 0.8098358684542777, "flos": 21799153015200.0, "grad_norm": 1.9742490979320606, "language_loss": 0.73940796, "learning_rate": 3.673502934895236e-07, "loss": 0.76423442, "num_input_tokens_seen": 145342575, "step": 6735, "time_per_iteration": 2.7879433631896973 }, { "auxiliary_loss_clip": 0.01317222, "auxiliary_loss_mlp": 0.01192276, "balance_loss_clip": 1.00402153, "balance_loss_mlp": 1.00001526, "epoch": 0.8099561113449167, "flos": 68809554976320.0, "grad_norm": 0.6926925097620066, "language_loss": 0.57989061, "learning_rate": 3.669004887253802e-07, "loss": 0.60498559, "num_input_tokens_seen": 145408865, "step": 6736, "time_per_iteration": 3.3790767192840576 }, { "auxiliary_loss_clip": 0.01318395, "auxiliary_loss_mlp": 0.01193193, "balance_loss_clip": 1.00795865, "balance_loss_mlp": 1.0001694, "epoch": 0.8100763542355558, "flos": 23586068085120.0, "grad_norm": 1.5741186381465422, "language_loss": 0.79007185, "learning_rate": 3.664509317011335e-07, "loss": 0.81518769, "num_input_tokens_seen": 145429200, "step": 6737, "time_per_iteration": 2.7570927143096924 }, { "auxiliary_loss_clip": 0.01323428, "auxiliary_loss_mlp": 0.01193296, "balance_loss_clip": 1.00805712, "balance_loss_mlp": 1.00017715, "epoch": 0.810196597126195, "flos": 31650421916160.0, "grad_norm": 2.723477347437298, "language_loss": 0.73499125, "learning_rate": 3.6600162248498134e-07, "loss": 0.76015854, "num_input_tokens_seen": 145452830, "step": 6738, "time_per_iteration": 2.7767419815063477 }, { "auxiliary_loss_clip": 0.01255024, "auxiliary_loss_mlp": 0.01193184, "balance_loss_clip": 1.00668526, "balance_loss_mlp": 1.00016057, "epoch": 0.810316840016834, "flos": 24900471092160.0, "grad_norm": 1.6573413267941464, "language_loss": 0.75951529, "learning_rate": 3.6555256114508426e-07, "loss": 0.78399742, "num_input_tokens_seen": 145472625, "step": 6739, "time_per_iteration": 2.8986618518829346 }, { "auxiliary_loss_clip": 0.01324575, "auxiliary_loss_mlp": 0.01193202, "balance_loss_clip": 1.00772357, "balance_loss_mlp": 1.00017858, "epoch": 0.8104370829074731, "flos": 27965016529920.0, "grad_norm": 1.7126610933559794, "language_loss": 0.72922146, "learning_rate": 3.651037477495642e-07, "loss": 0.75439918, "num_input_tokens_seen": 145494075, "step": 6740, "time_per_iteration": 2.8572397232055664 }, { "auxiliary_loss_clip": 0.01348543, "auxiliary_loss_mlp": 0.01193202, "balance_loss_clip": 1.00786185, "balance_loss_mlp": 1.00017834, "epoch": 0.8105573257981122, "flos": 24640763502240.0, "grad_norm": 2.2142398630154534, "language_loss": 0.67799312, "learning_rate": 3.6465518236650584e-07, "loss": 0.70341051, "num_input_tokens_seen": 145514220, "step": 6741, "time_per_iteration": 2.814345121383667 }, { "auxiliary_loss_clip": 0.01293069, "auxiliary_loss_mlp": 0.01193162, "balance_loss_clip": 1.00661993, "balance_loss_mlp": 1.00013804, "epoch": 0.8106775686887513, "flos": 26358947066880.0, "grad_norm": 1.5819502388503348, "language_loss": 0.78280455, "learning_rate": 3.642068650639558e-07, "loss": 0.80766684, "num_input_tokens_seen": 145533965, "step": 6742, "time_per_iteration": 2.8833870887756348 }, { "auxiliary_loss_clip": 0.01324089, "auxiliary_loss_mlp": 0.01193194, "balance_loss_clip": 1.00769627, "balance_loss_mlp": 1.00016999, "epoch": 0.8107978115793903, "flos": 27271904555520.0, "grad_norm": 3.3093990041197525, "language_loss": 0.64425874, "learning_rate": 3.6375879590992334e-07, "loss": 0.66943157, "num_input_tokens_seen": 145554310, "step": 6743, "time_per_iteration": 3.8990981578826904 }, { "auxiliary_loss_clip": 0.01314826, "auxiliary_loss_mlp": 0.01193152, "balance_loss_clip": 1.00769222, "balance_loss_mlp": 1.00012827, "epoch": 0.8109180544700295, "flos": 24934333913280.0, "grad_norm": 3.0009325797521127, "language_loss": 0.81101978, "learning_rate": 3.6331097497238173e-07, "loss": 0.83609957, "num_input_tokens_seen": 145573755, "step": 6744, "time_per_iteration": 2.7782654762268066 }, { "auxiliary_loss_clip": 0.01284276, "auxiliary_loss_mlp": 0.01193077, "balance_loss_clip": 1.00597143, "balance_loss_mlp": 1.00014842, "epoch": 0.8110382973606686, "flos": 21105394414560.0, "grad_norm": 2.0225441749634907, "language_loss": 0.8009795, "learning_rate": 3.628634023192627e-07, "loss": 0.82575303, "num_input_tokens_seen": 145594000, "step": 6745, "time_per_iteration": 3.780209541320801 }, { "auxiliary_loss_clip": 0.01330964, "auxiliary_loss_mlp": 0.01193214, "balance_loss_clip": 1.00765455, "balance_loss_mlp": 1.00019062, "epoch": 0.8111585402513076, "flos": 15414090612480.0, "grad_norm": 1.8790434076413671, "language_loss": 0.75061464, "learning_rate": 3.624160780184644e-07, "loss": 0.77585638, "num_input_tokens_seen": 145611215, "step": 6746, "time_per_iteration": 2.676426649093628 }, { "auxiliary_loss_clip": 0.01314078, "auxiliary_loss_mlp": 0.01193179, "balance_loss_clip": 1.00769603, "balance_loss_mlp": 1.00015497, "epoch": 0.8112787831419467, "flos": 24095747947680.0, "grad_norm": 1.6408360454915982, "language_loss": 0.74473006, "learning_rate": 3.6196900213784496e-07, "loss": 0.76980263, "num_input_tokens_seen": 145630530, "step": 6747, "time_per_iteration": 2.7755887508392334 }, { "auxiliary_loss_clip": 0.01328912, "auxiliary_loss_mlp": 0.01193229, "balance_loss_clip": 1.00726461, "balance_loss_mlp": 1.00020564, "epoch": 0.8113990260325858, "flos": 20483384908320.0, "grad_norm": 1.8316397585703408, "language_loss": 0.86649263, "learning_rate": 3.6152217474522527e-07, "loss": 0.8917141, "num_input_tokens_seen": 145647345, "step": 6748, "time_per_iteration": 2.8256301879882812 }, { "auxiliary_loss_clip": 0.01326773, "auxiliary_loss_mlp": 0.01193202, "balance_loss_clip": 1.00789928, "balance_loss_mlp": 1.00017846, "epoch": 0.8115192689232249, "flos": 24901153642080.0, "grad_norm": 1.5250718233184175, "language_loss": 0.72564048, "learning_rate": 3.6107559590838975e-07, "loss": 0.75084019, "num_input_tokens_seen": 145666330, "step": 6749, "time_per_iteration": 2.7426843643188477 }, { "auxiliary_loss_clip": 0.01263495, "auxiliary_loss_mlp": 0.01193277, "balance_loss_clip": 1.00725865, "balance_loss_mlp": 1.00025332, "epoch": 0.811639511813864, "flos": 24057215048160.0, "grad_norm": 2.475896574785037, "language_loss": 0.66143698, "learning_rate": 3.606292656950822e-07, "loss": 0.68600464, "num_input_tokens_seen": 145684740, "step": 6750, "time_per_iteration": 2.956920862197876 }, { "auxiliary_loss_clip": 0.01315396, "auxiliary_loss_mlp": 0.01193231, "balance_loss_clip": 1.00721622, "balance_loss_mlp": 1.00020695, "epoch": 0.8117597547045031, "flos": 23185161421920.0, "grad_norm": 1.80546502500987, "language_loss": 0.86880821, "learning_rate": 3.601831841730121e-07, "loss": 0.89389443, "num_input_tokens_seen": 145702660, "step": 6751, "time_per_iteration": 2.8107259273529053 }, { "auxiliary_loss_clip": 0.01326713, "auxiliary_loss_mlp": 0.01193216, "balance_loss_clip": 1.00737143, "balance_loss_mlp": 1.00019181, "epoch": 0.8118799975951422, "flos": 23040262209600.0, "grad_norm": 1.709056177272051, "language_loss": 0.72552967, "learning_rate": 3.5973735140984916e-07, "loss": 0.75072896, "num_input_tokens_seen": 145722830, "step": 6752, "time_per_iteration": 2.805429220199585 }, { "auxiliary_loss_clip": 0.01262207, "auxiliary_loss_mlp": 0.00872456, "balance_loss_clip": 1.00647712, "balance_loss_mlp": 1.00031352, "epoch": 0.8120002404857812, "flos": 24639973181280.0, "grad_norm": 2.0072237697110933, "language_loss": 0.80266476, "learning_rate": 3.5929176747322607e-07, "loss": 0.82401139, "num_input_tokens_seen": 145741935, "step": 6753, "time_per_iteration": 2.9279165267944336 }, { "auxiliary_loss_clip": 0.0129304, "auxiliary_loss_mlp": 0.01192266, "balance_loss_clip": 1.00392556, "balance_loss_mlp": 1.00000513, "epoch": 0.8121204833764204, "flos": 57415775107680.0, "grad_norm": 0.8488098152825126, "language_loss": 0.56252491, "learning_rate": 3.588464324307372e-07, "loss": 0.58737791, "num_input_tokens_seen": 145805560, "step": 6754, "time_per_iteration": 3.357089042663574 }, { "auxiliary_loss_clip": 0.0133541, "auxiliary_loss_mlp": 0.0119319, "balance_loss_clip": 1.00741339, "balance_loss_mlp": 1.00016618, "epoch": 0.8122407262670595, "flos": 19464599962080.0, "grad_norm": 1.765559742251615, "language_loss": 0.75328571, "learning_rate": 3.584013463499391e-07, "loss": 0.77857172, "num_input_tokens_seen": 145824180, "step": 6755, "time_per_iteration": 2.782994270324707 }, { "auxiliary_loss_clip": 0.01286684, "auxiliary_loss_mlp": 0.0119227, "balance_loss_clip": 1.00378203, "balance_loss_mlp": 1.00000906, "epoch": 0.8123609691576985, "flos": 56425356659520.0, "grad_norm": 0.7368196226307081, "language_loss": 0.6443702, "learning_rate": 3.579565092983521e-07, "loss": 0.66915977, "num_input_tokens_seen": 145885300, "step": 6756, "time_per_iteration": 3.1488378047943115 }, { "auxiliary_loss_clip": 0.01348376, "auxiliary_loss_mlp": 0.01193181, "balance_loss_clip": 1.00796914, "balance_loss_mlp": 1.00015712, "epoch": 0.8124812120483377, "flos": 20631984259680.0, "grad_norm": 1.89094595566448, "language_loss": 0.83771038, "learning_rate": 3.575119213434565e-07, "loss": 0.86312592, "num_input_tokens_seen": 145903815, "step": 6757, "time_per_iteration": 2.727363109588623 }, { "auxiliary_loss_clip": 0.01324433, "auxiliary_loss_mlp": 0.01193184, "balance_loss_clip": 1.00734818, "balance_loss_mlp": 1.00015998, "epoch": 0.8126014549389767, "flos": 22492408684320.0, "grad_norm": 1.6567502138479215, "language_loss": 0.8155567, "learning_rate": 3.5706758255269765e-07, "loss": 0.84073281, "num_input_tokens_seen": 145922270, "step": 6758, "time_per_iteration": 2.7083358764648438 }, { "auxiliary_loss_clip": 0.01314503, "auxiliary_loss_mlp": 0.01193336, "balance_loss_clip": 1.00736141, "balance_loss_mlp": 1.00021696, "epoch": 0.8127216978296158, "flos": 23287971290400.0, "grad_norm": 1.5133341596572016, "language_loss": 0.69617802, "learning_rate": 3.566234929934795e-07, "loss": 0.72125638, "num_input_tokens_seen": 145941470, "step": 6759, "time_per_iteration": 2.8475937843322754 }, { "auxiliary_loss_clip": 0.01325496, "auxiliary_loss_mlp": 0.01193137, "balance_loss_clip": 1.00820303, "balance_loss_mlp": 1.00011361, "epoch": 0.812841940720255, "flos": 25155005672160.0, "grad_norm": 1.450235450770961, "language_loss": 0.71827745, "learning_rate": 3.561796527331706e-07, "loss": 0.74346375, "num_input_tokens_seen": 145963145, "step": 6760, "time_per_iteration": 2.7827584743499756 }, { "auxiliary_loss_clip": 0.01291054, "auxiliary_loss_mlp": 0.01193186, "balance_loss_clip": 1.00704622, "balance_loss_mlp": 1.00016189, "epoch": 0.812962183610894, "flos": 26648458102080.0, "grad_norm": 1.714487616218466, "language_loss": 0.77706164, "learning_rate": 3.5573606183910163e-07, "loss": 0.80190402, "num_input_tokens_seen": 145983150, "step": 6761, "time_per_iteration": 2.8536124229431152 }, { "auxiliary_loss_clip": 0.01337917, "auxiliary_loss_mlp": 0.01193241, "balance_loss_clip": 1.00788558, "balance_loss_mlp": 1.00021768, "epoch": 0.8130824265015331, "flos": 24966975329280.0, "grad_norm": 1.6543249984931163, "language_loss": 0.78753161, "learning_rate": 3.5529272037856493e-07, "loss": 0.8128432, "num_input_tokens_seen": 146001365, "step": 6762, "time_per_iteration": 2.75520658493042 }, { "auxiliary_loss_clip": 0.01251291, "auxiliary_loss_mlp": 0.01192299, "balance_loss_clip": 1.00375104, "balance_loss_mlp": 1.00003862, "epoch": 0.8132026693921722, "flos": 67622951509920.0, "grad_norm": 0.7083254249209284, "language_loss": 0.53867453, "learning_rate": 3.548496284188149e-07, "loss": 0.56311053, "num_input_tokens_seen": 146061570, "step": 6763, "time_per_iteration": 3.511807680130005 }, { "auxiliary_loss_clip": 0.01255623, "auxiliary_loss_mlp": 0.01193044, "balance_loss_clip": 1.00610995, "balance_loss_mlp": 1.00011587, "epoch": 0.8133229122828113, "flos": 19495157804640.0, "grad_norm": 2.5224436419777625, "language_loss": 0.79249454, "learning_rate": 3.544067860270681e-07, "loss": 0.81698132, "num_input_tokens_seen": 146079145, "step": 6764, "time_per_iteration": 3.089243173599243 }, { "auxiliary_loss_clip": 0.01292006, "auxiliary_loss_mlp": 0.01193236, "balance_loss_clip": 1.00701761, "balance_loss_mlp": 1.00021207, "epoch": 0.8134431551734503, "flos": 20668146196320.0, "grad_norm": 2.0448084043766923, "language_loss": 0.70652425, "learning_rate": 3.539641932705029e-07, "loss": 0.73137671, "num_input_tokens_seen": 146097625, "step": 6765, "time_per_iteration": 2.782485008239746 }, { "auxiliary_loss_clip": 0.01349692, "auxiliary_loss_mlp": 0.011932, "balance_loss_clip": 1.00838578, "balance_loss_mlp": 1.00017595, "epoch": 0.8135633980640895, "flos": 21507342864480.0, "grad_norm": 2.289367691413251, "language_loss": 0.77422953, "learning_rate": 3.53521850216262e-07, "loss": 0.79965848, "num_input_tokens_seen": 146117195, "step": 6766, "time_per_iteration": 2.733574390411377 }, { "auxiliary_loss_clip": 0.01349192, "auxiliary_loss_mlp": 0.01193186, "balance_loss_clip": 1.00834501, "balance_loss_mlp": 1.00016236, "epoch": 0.8136836409547286, "flos": 20554451452800.0, "grad_norm": 1.6356162686315678, "language_loss": 0.7684747, "learning_rate": 3.530797569314461e-07, "loss": 0.79389846, "num_input_tokens_seen": 146136220, "step": 6767, "time_per_iteration": 2.693067789077759 }, { "auxiliary_loss_clip": 0.01348386, "auxiliary_loss_mlp": 0.01193182, "balance_loss_clip": 1.00812554, "balance_loss_mlp": 1.00015831, "epoch": 0.8138038838453676, "flos": 20299054704480.0, "grad_norm": 2.0722013441431884, "language_loss": 0.7787686, "learning_rate": 3.5263791348312235e-07, "loss": 0.80418432, "num_input_tokens_seen": 146155415, "step": 6768, "time_per_iteration": 2.675145149230957 }, { "auxiliary_loss_clip": 0.01309207, "auxiliary_loss_mlp": 0.01193213, "balance_loss_clip": 1.00723624, "balance_loss_mlp": 1.00018954, "epoch": 0.8139241267360068, "flos": 29789853796800.0, "grad_norm": 1.7394035100666554, "language_loss": 0.7042411, "learning_rate": 3.521963199383171e-07, "loss": 0.72926533, "num_input_tokens_seen": 146178370, "step": 6769, "time_per_iteration": 4.007923126220703 }, { "auxiliary_loss_clip": 0.01277309, "auxiliary_loss_mlp": 0.01193285, "balance_loss_clip": 1.00683928, "balance_loss_mlp": 1.00016642, "epoch": 0.8140443696266458, "flos": 19713278982240.0, "grad_norm": 2.927023520595343, "language_loss": 0.7699064, "learning_rate": 3.517549763640197e-07, "loss": 0.79461235, "num_input_tokens_seen": 146196010, "step": 6770, "time_per_iteration": 2.8752553462982178 }, { "auxiliary_loss_clip": 0.01323649, "auxiliary_loss_mlp": 0.00872388, "balance_loss_clip": 1.00747311, "balance_loss_mlp": 1.00042093, "epoch": 0.8141646125172849, "flos": 27160580774880.0, "grad_norm": 1.7298904364722736, "language_loss": 0.71022302, "learning_rate": 3.513138828271829e-07, "loss": 0.7321834, "num_input_tokens_seen": 146215880, "step": 6771, "time_per_iteration": 5.187971115112305 }, { "auxiliary_loss_clip": 0.01288656, "auxiliary_loss_mlp": 0.01193211, "balance_loss_clip": 1.00726914, "balance_loss_mlp": 1.0001874, "epoch": 0.8142848554079241, "flos": 39673117487520.0, "grad_norm": 1.7821178515768803, "language_loss": 0.69819391, "learning_rate": 3.508730393947179e-07, "loss": 0.72301257, "num_input_tokens_seen": 146239135, "step": 6772, "time_per_iteration": 2.9793639183044434 }, { "auxiliary_loss_clip": 0.01290741, "auxiliary_loss_mlp": 0.01193154, "balance_loss_clip": 1.00705385, "balance_loss_mlp": 1.00012994, "epoch": 0.8144050982985631, "flos": 22237299325440.0, "grad_norm": 1.653192284007216, "language_loss": 0.7192114, "learning_rate": 3.504324461335024e-07, "loss": 0.74405032, "num_input_tokens_seen": 146259245, "step": 6773, "time_per_iteration": 3.0152299404144287 }, { "auxiliary_loss_clip": 0.0128924, "auxiliary_loss_mlp": 0.0119319, "balance_loss_clip": 1.00829399, "balance_loss_mlp": 1.00016618, "epoch": 0.8145253411892022, "flos": 23038250483520.0, "grad_norm": 1.6738631873739986, "language_loss": 0.88494855, "learning_rate": 3.499921031103732e-07, "loss": 0.90977281, "num_input_tokens_seen": 146280015, "step": 6774, "time_per_iteration": 2.8120579719543457 }, { "auxiliary_loss_clip": 0.01297954, "auxiliary_loss_mlp": 0.01193226, "balance_loss_clip": 1.00806665, "balance_loss_mlp": 1.0002023, "epoch": 0.8146455840798413, "flos": 24827680211040.0, "grad_norm": 1.570947727279221, "language_loss": 0.78243196, "learning_rate": 3.4955201039212987e-07, "loss": 0.80734372, "num_input_tokens_seen": 146300935, "step": 6775, "time_per_iteration": 2.849120616912842 }, { "auxiliary_loss_clip": 0.01321615, "auxiliary_loss_mlp": 0.01193225, "balance_loss_clip": 1.00866675, "balance_loss_mlp": 1.00020146, "epoch": 0.8147658269704804, "flos": 19974531290400.0, "grad_norm": 2.318172239036308, "language_loss": 0.65386307, "learning_rate": 3.4911216804553465e-07, "loss": 0.67901146, "num_input_tokens_seen": 146319835, "step": 6776, "time_per_iteration": 2.6829984188079834 }, { "auxiliary_loss_clip": 0.01314663, "auxiliary_loss_mlp": 0.01193213, "balance_loss_clip": 1.0078373, "balance_loss_mlp": 1.00018883, "epoch": 0.8148860698611194, "flos": 21178041600960.0, "grad_norm": 1.8181932534878993, "language_loss": 0.70400089, "learning_rate": 3.4867257613731017e-07, "loss": 0.72907966, "num_input_tokens_seen": 146339030, "step": 6777, "time_per_iteration": 2.79213547706604 }, { "auxiliary_loss_clip": 0.01315678, "auxiliary_loss_mlp": 0.01193159, "balance_loss_clip": 1.00760949, "balance_loss_mlp": 1.00013494, "epoch": 0.8150063127517585, "flos": 19606912669440.0, "grad_norm": 1.6002637315516894, "language_loss": 0.85731089, "learning_rate": 3.4823323473414343e-07, "loss": 0.88239932, "num_input_tokens_seen": 146358550, "step": 6778, "time_per_iteration": 2.763638973236084 }, { "auxiliary_loss_clip": 0.01283668, "auxiliary_loss_mlp": 0.011932, "balance_loss_clip": 1.0079, "balance_loss_mlp": 1.00017643, "epoch": 0.8151265556423977, "flos": 22638385607040.0, "grad_norm": 2.4067849308141556, "language_loss": 0.75799954, "learning_rate": 3.477941439026812e-07, "loss": 0.78276825, "num_input_tokens_seen": 146376770, "step": 6779, "time_per_iteration": 2.8230679035186768 }, { "auxiliary_loss_clip": 0.01296429, "auxiliary_loss_mlp": 0.01193093, "balance_loss_clip": 1.00779116, "balance_loss_mlp": 1.00016487, "epoch": 0.8152467985330367, "flos": 17968058095680.0, "grad_norm": 1.7078136038320526, "language_loss": 0.72815526, "learning_rate": 3.473553037095349e-07, "loss": 0.75305045, "num_input_tokens_seen": 146395795, "step": 6780, "time_per_iteration": 2.8630759716033936 }, { "auxiliary_loss_clip": 0.01323425, "auxiliary_loss_mlp": 0.01193192, "balance_loss_clip": 1.0079999, "balance_loss_mlp": 1.00016797, "epoch": 0.8153670414236758, "flos": 24969022979040.0, "grad_norm": 1.7304206061241285, "language_loss": 0.83431667, "learning_rate": 3.469167142212743e-07, "loss": 0.85948282, "num_input_tokens_seen": 146417640, "step": 6781, "time_per_iteration": 2.749802350997925 }, { "auxiliary_loss_clip": 0.0132641, "auxiliary_loss_mlp": 0.01193339, "balance_loss_clip": 1.00726795, "balance_loss_mlp": 1.00022006, "epoch": 0.8154872843143149, "flos": 31066083141120.0, "grad_norm": 2.4635479713621247, "language_loss": 0.62885737, "learning_rate": 3.4647837550443337e-07, "loss": 0.65405482, "num_input_tokens_seen": 146436205, "step": 6782, "time_per_iteration": 2.7762136459350586 }, { "auxiliary_loss_clip": 0.01285349, "auxiliary_loss_mlp": 0.01193197, "balance_loss_clip": 1.00653398, "balance_loss_mlp": 1.00017381, "epoch": 0.815607527204954, "flos": 19391665386240.0, "grad_norm": 1.9321903082824818, "language_loss": 0.74377346, "learning_rate": 3.460402876255086e-07, "loss": 0.76855892, "num_input_tokens_seen": 146453595, "step": 6783, "time_per_iteration": 2.734184741973877 }, { "auxiliary_loss_clip": 0.01334925, "auxiliary_loss_mlp": 0.01193215, "balance_loss_clip": 1.00792968, "balance_loss_mlp": 1.00019133, "epoch": 0.815727770095593, "flos": 26140430728800.0, "grad_norm": 2.0637704382014785, "language_loss": 0.71650648, "learning_rate": 3.456024506509574e-07, "loss": 0.74178785, "num_input_tokens_seen": 146474515, "step": 6784, "time_per_iteration": 2.8057117462158203 }, { "auxiliary_loss_clip": 0.0132815, "auxiliary_loss_mlp": 0.0087248, "balance_loss_clip": 1.00746489, "balance_loss_mlp": 1.00031233, "epoch": 0.8158480129862322, "flos": 25337539692000.0, "grad_norm": 1.5204686694151353, "language_loss": 0.7391184, "learning_rate": 3.4516486464719873e-07, "loss": 0.76112467, "num_input_tokens_seen": 146493905, "step": 6785, "time_per_iteration": 2.8955366611480713 }, { "auxiliary_loss_clip": 0.01278367, "auxiliary_loss_mlp": 0.01193091, "balance_loss_clip": 1.00757098, "balance_loss_mlp": 1.0001626, "epoch": 0.8159682558768713, "flos": 34423660134720.0, "grad_norm": 1.7242043804688034, "language_loss": 0.62179893, "learning_rate": 3.4472752968061445e-07, "loss": 0.64651346, "num_input_tokens_seen": 146518335, "step": 6786, "time_per_iteration": 2.9957239627838135 }, { "auxiliary_loss_clip": 0.01330796, "auxiliary_loss_mlp": 0.01193171, "balance_loss_clip": 1.0072819, "balance_loss_mlp": 1.00014782, "epoch": 0.8160884987675103, "flos": 18653230936800.0, "grad_norm": 2.0519758128296584, "language_loss": 0.73719609, "learning_rate": 3.442904458175475e-07, "loss": 0.76243585, "num_input_tokens_seen": 146535655, "step": 6787, "time_per_iteration": 2.696789503097534 }, { "auxiliary_loss_clip": 0.01336944, "auxiliary_loss_mlp": 0.01193178, "balance_loss_clip": 1.00800848, "balance_loss_mlp": 1.0001545, "epoch": 0.8162087416581495, "flos": 31430540478240.0, "grad_norm": 1.7620490970116274, "language_loss": 0.75841558, "learning_rate": 3.438536131243044e-07, "loss": 0.7837168, "num_input_tokens_seen": 146556815, "step": 6788, "time_per_iteration": 2.7752766609191895 }, { "auxiliary_loss_clip": 0.01308304, "auxiliary_loss_mlp": 0.01193199, "balance_loss_clip": 1.0071311, "balance_loss_mlp": 1.00017536, "epoch": 0.8163289845487885, "flos": 37593925259040.0, "grad_norm": 2.1408176547317113, "language_loss": 0.61531246, "learning_rate": 3.434170316671503e-07, "loss": 0.64032745, "num_input_tokens_seen": 146581845, "step": 6789, "time_per_iteration": 3.0063867568969727 }, { "auxiliary_loss_clip": 0.01277063, "auxiliary_loss_mlp": 0.01193242, "balance_loss_clip": 1.00682247, "balance_loss_mlp": 1.00021887, "epoch": 0.8164492274394276, "flos": 13953998072160.0, "grad_norm": 2.265725632567398, "language_loss": 0.89444441, "learning_rate": 3.4298070151231583e-07, "loss": 0.91914749, "num_input_tokens_seen": 146597245, "step": 6790, "time_per_iteration": 2.7264952659606934 }, { "auxiliary_loss_clip": 0.01315654, "auxiliary_loss_mlp": 0.01193205, "balance_loss_clip": 1.00720727, "balance_loss_mlp": 1.00018167, "epoch": 0.8165694703300668, "flos": 28986567599520.0, "grad_norm": 1.9573522393862823, "language_loss": 0.59988987, "learning_rate": 3.425446227259916e-07, "loss": 0.62497848, "num_input_tokens_seen": 146618210, "step": 6791, "time_per_iteration": 2.850562810897827 }, { "auxiliary_loss_clip": 0.0131369, "auxiliary_loss_mlp": 0.01193147, "balance_loss_clip": 1.00708485, "balance_loss_mlp": 1.00012326, "epoch": 0.8166897132207058, "flos": 25118376727680.0, "grad_norm": 1.9812199744072474, "language_loss": 0.82422018, "learning_rate": 3.421087953743296e-07, "loss": 0.84928858, "num_input_tokens_seen": 146637975, "step": 6792, "time_per_iteration": 2.8095855712890625 }, { "auxiliary_loss_clip": 0.0133652, "auxiliary_loss_mlp": 0.01193189, "balance_loss_clip": 1.00780749, "balance_loss_mlp": 1.00016499, "epoch": 0.8168099561113449, "flos": 23148604324800.0, "grad_norm": 1.9289793091057283, "language_loss": 0.80124938, "learning_rate": 3.416732195234464e-07, "loss": 0.82654643, "num_input_tokens_seen": 146658030, "step": 6793, "time_per_iteration": 2.7078633308410645 }, { "auxiliary_loss_clip": 0.01335719, "auxiliary_loss_mlp": 0.01193186, "balance_loss_clip": 1.00770926, "balance_loss_mlp": 1.00016189, "epoch": 0.816930199001984, "flos": 18407677276800.0, "grad_norm": 1.4478818997765017, "language_loss": 0.79337806, "learning_rate": 3.4123789523941613e-07, "loss": 0.81866705, "num_input_tokens_seen": 146677855, "step": 6794, "time_per_iteration": 2.7026591300964355 }, { "auxiliary_loss_clip": 0.01336657, "auxiliary_loss_mlp": 0.01193162, "balance_loss_clip": 1.00814712, "balance_loss_mlp": 1.00013804, "epoch": 0.8170504418926231, "flos": 21251335413600.0, "grad_norm": 1.5106576120052522, "language_loss": 0.63241851, "learning_rate": 3.4080282258827884e-07, "loss": 0.65771669, "num_input_tokens_seen": 146696230, "step": 6795, "time_per_iteration": 3.723209857940674 }, { "auxiliary_loss_clip": 0.01337004, "auxiliary_loss_mlp": 0.01193142, "balance_loss_clip": 1.00834191, "balance_loss_mlp": 1.00011885, "epoch": 0.8171706847832622, "flos": 19099244532960.0, "grad_norm": 2.189754294174776, "language_loss": 0.72737157, "learning_rate": 3.403680016360342e-07, "loss": 0.75267303, "num_input_tokens_seen": 146714835, "step": 6796, "time_per_iteration": 2.742284059524536 }, { "auxiliary_loss_clip": 0.01337184, "auxiliary_loss_mlp": 0.01193201, "balance_loss_clip": 1.00860822, "balance_loss_mlp": 1.00017762, "epoch": 0.8172909276739013, "flos": 21470139141120.0, "grad_norm": 1.3839204431114438, "language_loss": 0.67678475, "learning_rate": 3.3993343244864403e-07, "loss": 0.70208865, "num_input_tokens_seen": 146734425, "step": 6797, "time_per_iteration": 4.977493524551392 }, { "auxiliary_loss_clip": 0.01325391, "auxiliary_loss_mlp": 0.01193202, "balance_loss_clip": 1.00719404, "balance_loss_mlp": 1.00017846, "epoch": 0.8174111705645404, "flos": 27599804795520.0, "grad_norm": 1.72654092057896, "language_loss": 0.72885299, "learning_rate": 3.394991150920323e-07, "loss": 0.75403893, "num_input_tokens_seen": 146757545, "step": 6798, "time_per_iteration": 2.9275124073028564 }, { "auxiliary_loss_clip": 0.01281879, "auxiliary_loss_mlp": 0.00872614, "balance_loss_clip": 1.00720048, "balance_loss_mlp": 1.00043499, "epoch": 0.8175314134551794, "flos": 14064603379200.0, "grad_norm": 2.134564958131754, "language_loss": 0.7424885, "learning_rate": 3.3906504963208396e-07, "loss": 0.76403344, "num_input_tokens_seen": 146774240, "step": 6799, "time_per_iteration": 2.8165435791015625 }, { "auxiliary_loss_clip": 0.01254143, "auxiliary_loss_mlp": 0.01193187, "balance_loss_clip": 1.00593185, "balance_loss_mlp": 1.00016356, "epoch": 0.8176516563458186, "flos": 22708087051680.0, "grad_norm": 1.721513301749315, "language_loss": 0.66709065, "learning_rate": 3.3863123613464774e-07, "loss": 0.69156396, "num_input_tokens_seen": 146793140, "step": 6800, "time_per_iteration": 2.796720266342163 }, { "auxiliary_loss_clip": 0.01324665, "auxiliary_loss_mlp": 0.01193069, "balance_loss_clip": 1.00754797, "balance_loss_mlp": 1.00014031, "epoch": 0.8177718992364577, "flos": 21945417327360.0, "grad_norm": 1.6302346519305466, "language_loss": 0.7478435, "learning_rate": 3.381976746655317e-07, "loss": 0.7730208, "num_input_tokens_seen": 146812895, "step": 6801, "time_per_iteration": 2.928635358810425 }, { "auxiliary_loss_clip": 0.01254397, "auxiliary_loss_mlp": 0.0119303, "balance_loss_clip": 1.0063076, "balance_loss_mlp": 1.0001018, "epoch": 0.8178921421270967, "flos": 22017453811200.0, "grad_norm": 2.149438770251106, "language_loss": 0.66869605, "learning_rate": 3.3776436529050756e-07, "loss": 0.69317031, "num_input_tokens_seen": 146832445, "step": 6802, "time_per_iteration": 2.851104974746704 }, { "auxiliary_loss_clip": 0.01347721, "auxiliary_loss_mlp": 0.01193177, "balance_loss_clip": 1.0076952, "balance_loss_mlp": 1.00015306, "epoch": 0.8180123850177359, "flos": 33183125719200.0, "grad_norm": 1.5792552072714787, "language_loss": 0.72487569, "learning_rate": 3.373313080753073e-07, "loss": 0.75028467, "num_input_tokens_seen": 146856505, "step": 6803, "time_per_iteration": 2.8551766872406006 }, { "auxiliary_loss_clip": 0.01336406, "auxiliary_loss_mlp": 0.01193212, "balance_loss_clip": 1.00793648, "balance_loss_mlp": 1.00018835, "epoch": 0.8181326279083749, "flos": 22091178708000.0, "grad_norm": 1.4823080980755632, "language_loss": 0.77602667, "learning_rate": 3.3689850308562527e-07, "loss": 0.80132282, "num_input_tokens_seen": 146876950, "step": 6804, "time_per_iteration": 2.7844414710998535 }, { "auxiliary_loss_clip": 0.01270812, "auxiliary_loss_mlp": 0.01193186, "balance_loss_clip": 1.00738263, "balance_loss_mlp": 1.0001626, "epoch": 0.818252870799014, "flos": 15705757068480.0, "grad_norm": 1.728874427356298, "language_loss": 0.77745777, "learning_rate": 3.364659503871183e-07, "loss": 0.80209774, "num_input_tokens_seen": 146894885, "step": 6805, "time_per_iteration": 2.852811098098755 }, { "auxiliary_loss_clip": 0.01300646, "auxiliary_loss_mlp": 0.0119312, "balance_loss_clip": 1.00646973, "balance_loss_mlp": 1.00009608, "epoch": 0.8183731136896532, "flos": 18770697666720.0, "grad_norm": 1.9751009438734868, "language_loss": 0.83630687, "learning_rate": 3.3603365004540417e-07, "loss": 0.86124456, "num_input_tokens_seen": 146913180, "step": 6806, "time_per_iteration": 2.8447787761688232 }, { "auxiliary_loss_clip": 0.01348357, "auxiliary_loss_mlp": 0.01193166, "balance_loss_clip": 1.00829577, "balance_loss_mlp": 1.00014198, "epoch": 0.8184933565802922, "flos": 26541804399840.0, "grad_norm": 1.7965268771840126, "language_loss": 0.7704379, "learning_rate": 3.356016021260624e-07, "loss": 0.79585308, "num_input_tokens_seen": 146933510, "step": 6807, "time_per_iteration": 2.6863505840301514 }, { "auxiliary_loss_clip": 0.01326534, "auxiliary_loss_mlp": 0.01193198, "balance_loss_clip": 1.00736392, "balance_loss_mlp": 1.00017476, "epoch": 0.8186135994709313, "flos": 17530127327520.0, "grad_norm": 2.433810922636649, "language_loss": 0.65952456, "learning_rate": 3.35169806694634e-07, "loss": 0.68472195, "num_input_tokens_seen": 146951760, "step": 6808, "time_per_iteration": 2.7357516288757324 }, { "auxiliary_loss_clip": 0.01266165, "auxiliary_loss_mlp": 0.01192266, "balance_loss_clip": 1.00431991, "balance_loss_mlp": 1.00000501, "epoch": 0.8187338423615703, "flos": 63480337548480.0, "grad_norm": 0.7140398822022669, "language_loss": 0.60699993, "learning_rate": 3.3473826381662186e-07, "loss": 0.63158423, "num_input_tokens_seen": 147022900, "step": 6809, "time_per_iteration": 3.4757769107818604 }, { "auxiliary_loss_clip": 0.01322044, "auxiliary_loss_mlp": 0.01193201, "balance_loss_clip": 1.00751567, "balance_loss_mlp": 1.00017715, "epoch": 0.8188540852522095, "flos": 17529983632800.0, "grad_norm": 1.897034283048288, "language_loss": 0.81626451, "learning_rate": 3.3430697355749216e-07, "loss": 0.84141695, "num_input_tokens_seen": 147040590, "step": 6810, "time_per_iteration": 2.705636501312256 }, { "auxiliary_loss_clip": 0.01277587, "auxiliary_loss_mlp": 0.0119319, "balance_loss_clip": 1.00695872, "balance_loss_mlp": 1.00016665, "epoch": 0.8189743281428485, "flos": 14392539542880.0, "grad_norm": 1.9352260193029964, "language_loss": 0.75504023, "learning_rate": 3.3387593598266907e-07, "loss": 0.77974796, "num_input_tokens_seen": 147057200, "step": 6811, "time_per_iteration": 2.8431174755096436 }, { "auxiliary_loss_clip": 0.01305803, "auxiliary_loss_mlp": 0.01193183, "balance_loss_clip": 1.00757492, "balance_loss_mlp": 1.00015891, "epoch": 0.8190945710334876, "flos": 25080490454400.0, "grad_norm": 1.5902718938951514, "language_loss": 0.78489834, "learning_rate": 3.3344515115754225e-07, "loss": 0.80988818, "num_input_tokens_seen": 147076180, "step": 6812, "time_per_iteration": 2.8118810653686523 }, { "auxiliary_loss_clip": 0.01287519, "auxiliary_loss_mlp": 0.01193135, "balance_loss_clip": 1.00725615, "balance_loss_mlp": 1.00011146, "epoch": 0.8192148139241268, "flos": 21507163246080.0, "grad_norm": 2.566180876866784, "language_loss": 0.80124915, "learning_rate": 3.33014619147461e-07, "loss": 0.82605565, "num_input_tokens_seen": 147094205, "step": 6813, "time_per_iteration": 2.8241751194000244 }, { "auxiliary_loss_clip": 0.01300986, "auxiliary_loss_mlp": 0.0119324, "balance_loss_clip": 1.00716949, "balance_loss_mlp": 1.00021625, "epoch": 0.8193350568147658, "flos": 23952177911520.0, "grad_norm": 2.289194654919006, "language_loss": 0.71463901, "learning_rate": 3.325843400177362e-07, "loss": 0.73958129, "num_input_tokens_seen": 147115545, "step": 6814, "time_per_iteration": 2.8084914684295654 }, { "auxiliary_loss_clip": 0.0133769, "auxiliary_loss_mlp": 0.00872554, "balance_loss_clip": 1.00828969, "balance_loss_mlp": 1.00044215, "epoch": 0.8194552997054049, "flos": 20559480768000.0, "grad_norm": 1.6810097447110122, "language_loss": 0.7338748, "learning_rate": 3.32154313833642e-07, "loss": 0.75597727, "num_input_tokens_seen": 147135700, "step": 6815, "time_per_iteration": 2.7197887897491455 }, { "auxiliary_loss_clip": 0.01348503, "auxiliary_loss_mlp": 0.01193136, "balance_loss_clip": 1.00764751, "balance_loss_mlp": 1.0002073, "epoch": 0.819575542596044, "flos": 26031765300480.0, "grad_norm": 2.3671117512621214, "language_loss": 0.59212226, "learning_rate": 3.3172454066041164e-07, "loss": 0.61753863, "num_input_tokens_seen": 147155205, "step": 6816, "time_per_iteration": 2.740903615951538 }, { "auxiliary_loss_clip": 0.0124842, "auxiliary_loss_mlp": 0.00872364, "balance_loss_clip": 1.00656402, "balance_loss_mlp": 1.00035167, "epoch": 0.8196957854866831, "flos": 29096957364480.0, "grad_norm": 2.000742771630457, "language_loss": 0.76121998, "learning_rate": 3.3129502056324234e-07, "loss": 0.78242785, "num_input_tokens_seen": 147176570, "step": 6817, "time_per_iteration": 3.033672332763672 }, { "auxiliary_loss_clip": 0.01225496, "auxiliary_loss_mlp": 0.01192285, "balance_loss_clip": 1.00445318, "balance_loss_mlp": 1.0000248, "epoch": 0.8198160283773221, "flos": 69033662199360.0, "grad_norm": 1.0702947404652068, "language_loss": 0.59789836, "learning_rate": 3.3086575360729165e-07, "loss": 0.62207615, "num_input_tokens_seen": 147234105, "step": 6818, "time_per_iteration": 3.584467887878418 }, { "auxiliary_loss_clip": 0.01311111, "auxiliary_loss_mlp": 0.01193243, "balance_loss_clip": 1.00786209, "balance_loss_mlp": 1.00021911, "epoch": 0.8199362712679613, "flos": 16618067930880.0, "grad_norm": 1.6103151862315013, "language_loss": 0.7128002, "learning_rate": 3.3043673985767906e-07, "loss": 0.73784375, "num_input_tokens_seen": 147253170, "step": 6819, "time_per_iteration": 3.0046474933624268 }, { "auxiliary_loss_clip": 0.01302234, "auxiliary_loss_mlp": 0.01193201, "balance_loss_clip": 1.00731158, "balance_loss_mlp": 1.00017762, "epoch": 0.8200565141586004, "flos": 21757674373920.0, "grad_norm": 1.6952537576341487, "language_loss": 0.77667034, "learning_rate": 3.3000797937948564e-07, "loss": 0.80162466, "num_input_tokens_seen": 147271465, "step": 6820, "time_per_iteration": 3.5552823543548584 }, { "auxiliary_loss_clip": 0.01256045, "auxiliary_loss_mlp": 0.01192265, "balance_loss_clip": 1.00381422, "balance_loss_mlp": 1.00000465, "epoch": 0.8201767570492394, "flos": 69807144951360.0, "grad_norm": 0.9371469005219649, "language_loss": 0.65104878, "learning_rate": 3.295794722377534e-07, "loss": 0.67553186, "num_input_tokens_seen": 147335070, "step": 6821, "time_per_iteration": 4.273104667663574 }, { "auxiliary_loss_clip": 0.0134778, "auxiliary_loss_mlp": 0.01193158, "balance_loss_clip": 1.00769413, "balance_loss_mlp": 1.00013387, "epoch": 0.8202969999398786, "flos": 23111903532960.0, "grad_norm": 1.4329943877441125, "language_loss": 0.79803634, "learning_rate": 3.291512184974876e-07, "loss": 0.82344568, "num_input_tokens_seen": 147355460, "step": 6822, "time_per_iteration": 2.7168033123016357 }, { "auxiliary_loss_clip": 0.01324175, "auxiliary_loss_mlp": 0.01193086, "balance_loss_clip": 1.00782418, "balance_loss_mlp": 1.00015783, "epoch": 0.8204172428305176, "flos": 28220628820320.0, "grad_norm": 1.9433138526836446, "language_loss": 0.66465652, "learning_rate": 3.2872321822365346e-07, "loss": 0.68982923, "num_input_tokens_seen": 147375675, "step": 6823, "time_per_iteration": 3.756751537322998 }, { "auxiliary_loss_clip": 0.01325013, "auxiliary_loss_mlp": 0.01193181, "balance_loss_clip": 1.00713277, "balance_loss_mlp": 1.000157, "epoch": 0.8205374857211567, "flos": 20887021771200.0, "grad_norm": 3.6322091955869853, "language_loss": 0.73132145, "learning_rate": 3.282954714811783e-07, "loss": 0.7565034, "num_input_tokens_seen": 147394580, "step": 6824, "time_per_iteration": 3.94464111328125 }, { "auxiliary_loss_clip": 0.01325034, "auxiliary_loss_mlp": 0.01193187, "balance_loss_clip": 1.00780928, "balance_loss_mlp": 1.00016356, "epoch": 0.8206577286117959, "flos": 13152148822080.0, "grad_norm": 2.0981242434966534, "language_loss": 0.70766354, "learning_rate": 3.2786797833495093e-07, "loss": 0.73284566, "num_input_tokens_seen": 147409935, "step": 6825, "time_per_iteration": 2.723543643951416 }, { "auxiliary_loss_clip": 0.01348144, "auxiliary_loss_mlp": 0.01193079, "balance_loss_clip": 1.00754094, "balance_loss_mlp": 1.00015032, "epoch": 0.8207779715024349, "flos": 25265646902880.0, "grad_norm": 1.7958686671933748, "language_loss": 0.72620136, "learning_rate": 3.274407388498213e-07, "loss": 0.7516135, "num_input_tokens_seen": 147428065, "step": 6826, "time_per_iteration": 2.797244071960449 }, { "auxiliary_loss_clip": 0.01301413, "auxiliary_loss_mlp": 0.01193114, "balance_loss_clip": 1.00725508, "balance_loss_mlp": 1.00018597, "epoch": 0.820898214393074, "flos": 19610253571680.0, "grad_norm": 1.7372793232111727, "language_loss": 0.74546272, "learning_rate": 3.270137530906021e-07, "loss": 0.77040803, "num_input_tokens_seen": 147447300, "step": 6827, "time_per_iteration": 2.8250362873077393 }, { "auxiliary_loss_clip": 0.01253526, "auxiliary_loss_mlp": 0.0119315, "balance_loss_clip": 1.00613642, "balance_loss_mlp": 1.00012612, "epoch": 0.8210184572837131, "flos": 15596624632320.0, "grad_norm": 1.826004173267344, "language_loss": 0.83613229, "learning_rate": 3.265870211220665e-07, "loss": 0.8605991, "num_input_tokens_seen": 147465135, "step": 6828, "time_per_iteration": 2.832094192504883 }, { "auxiliary_loss_clip": 0.0129093, "auxiliary_loss_mlp": 0.01193204, "balance_loss_clip": 1.00763941, "balance_loss_mlp": 1.00018072, "epoch": 0.8211387001743522, "flos": 20813943500640.0, "grad_norm": 1.9661920936936679, "language_loss": 0.81640619, "learning_rate": 3.2616054300894934e-07, "loss": 0.84124756, "num_input_tokens_seen": 147484585, "step": 6829, "time_per_iteration": 2.988837957382202 }, { "auxiliary_loss_clip": 0.01312208, "auxiliary_loss_mlp": 0.01193301, "balance_loss_clip": 1.00786722, "balance_loss_mlp": 1.00018239, "epoch": 0.8212589430649913, "flos": 27704590466400.0, "grad_norm": 2.139512848204142, "language_loss": 0.84380698, "learning_rate": 3.2573431881594693e-07, "loss": 0.86886203, "num_input_tokens_seen": 147504130, "step": 6830, "time_per_iteration": 2.8538219928741455 }, { "auxiliary_loss_clip": 0.01261269, "auxiliary_loss_mlp": 0.01193183, "balance_loss_clip": 1.00586867, "balance_loss_mlp": 1.00015926, "epoch": 0.8213791859556304, "flos": 22455636045120.0, "grad_norm": 2.0476356825704936, "language_loss": 0.65875214, "learning_rate": 3.2530834860771663e-07, "loss": 0.68329668, "num_input_tokens_seen": 147523510, "step": 6831, "time_per_iteration": 2.8851816654205322 }, { "auxiliary_loss_clip": 0.01336482, "auxiliary_loss_mlp": 0.01193206, "balance_loss_clip": 1.00809598, "balance_loss_mlp": 1.00018227, "epoch": 0.8214994288462695, "flos": 16654481333280.0, "grad_norm": 2.0910980674518886, "language_loss": 0.74151385, "learning_rate": 3.248826324488794e-07, "loss": 0.76681077, "num_input_tokens_seen": 147540805, "step": 6832, "time_per_iteration": 2.7424614429473877 }, { "auxiliary_loss_clip": 0.01348403, "auxiliary_loss_mlp": 0.01193205, "balance_loss_clip": 1.00813913, "balance_loss_mlp": 1.00018156, "epoch": 0.8216196717369085, "flos": 25221797298720.0, "grad_norm": 1.7220622341111431, "language_loss": 0.87667537, "learning_rate": 3.244571704040138e-07, "loss": 0.90209138, "num_input_tokens_seen": 147560965, "step": 6833, "time_per_iteration": 2.828904151916504 }, { "auxiliary_loss_clip": 0.01337446, "auxiliary_loss_mlp": 0.01193208, "balance_loss_clip": 1.00804257, "balance_loss_mlp": 1.00018382, "epoch": 0.8217399146275477, "flos": 25371941368320.0, "grad_norm": 1.9854882126027102, "language_loss": 0.73257649, "learning_rate": 3.2403196253766374e-07, "loss": 0.75788301, "num_input_tokens_seen": 147580045, "step": 6834, "time_per_iteration": 2.7683963775634766 }, { "auxiliary_loss_clip": 0.01339486, "auxiliary_loss_mlp": 0.01193121, "balance_loss_clip": 1.00852644, "balance_loss_mlp": 1.00019264, "epoch": 0.8218601575181868, "flos": 25629637232160.0, "grad_norm": 2.1131810462933833, "language_loss": 0.79451096, "learning_rate": 3.2360700891433254e-07, "loss": 0.81983697, "num_input_tokens_seen": 147599070, "step": 6835, "time_per_iteration": 2.7977802753448486 }, { "auxiliary_loss_clip": 0.0124929, "auxiliary_loss_mlp": 0.01192308, "balance_loss_clip": 1.00319028, "balance_loss_mlp": 1.00004756, "epoch": 0.8219804004088258, "flos": 67660263004320.0, "grad_norm": 0.7902077354248256, "language_loss": 0.57321751, "learning_rate": 3.231823095984847e-07, "loss": 0.59763348, "num_input_tokens_seen": 147653710, "step": 6836, "time_per_iteration": 3.274789810180664 }, { "auxiliary_loss_clip": 0.01305396, "auxiliary_loss_mlp": 0.01193196, "balance_loss_clip": 1.00705481, "balance_loss_mlp": 1.00017273, "epoch": 0.822100643299465, "flos": 19464276648960.0, "grad_norm": 2.268812535573755, "language_loss": 0.76276112, "learning_rate": 3.2275786465454814e-07, "loss": 0.78774703, "num_input_tokens_seen": 147670360, "step": 6837, "time_per_iteration": 2.7886171340942383 }, { "auxiliary_loss_clip": 0.0130219, "auxiliary_loss_mlp": 0.01193214, "balance_loss_clip": 1.00742316, "balance_loss_mlp": 1.00019073, "epoch": 0.822220886190104, "flos": 24681379975200.0, "grad_norm": 1.6644805976094572, "language_loss": 0.75560629, "learning_rate": 3.2233367414690917e-07, "loss": 0.78056037, "num_input_tokens_seen": 147692550, "step": 6838, "time_per_iteration": 2.8267018795013428 }, { "auxiliary_loss_clip": 0.0130161, "auxiliary_loss_mlp": 0.0119321, "balance_loss_clip": 1.00719774, "balance_loss_mlp": 1.00018597, "epoch": 0.8223411290807431, "flos": 27819075530880.0, "grad_norm": 2.249314424435353, "language_loss": 0.84624147, "learning_rate": 3.219097381399183e-07, "loss": 0.87118959, "num_input_tokens_seen": 147709725, "step": 6839, "time_per_iteration": 2.8932340145111084 }, { "auxiliary_loss_clip": 0.0131816, "auxiliary_loss_mlp": 0.01193222, "balance_loss_clip": 1.00742483, "balance_loss_mlp": 1.00019801, "epoch": 0.8224613719713821, "flos": 23218557235200.0, "grad_norm": 2.4804175479617774, "language_loss": 0.80852079, "learning_rate": 3.2148605669788584e-07, "loss": 0.83363467, "num_input_tokens_seen": 147729615, "step": 6840, "time_per_iteration": 2.768871545791626 }, { "auxiliary_loss_clip": 0.01307282, "auxiliary_loss_mlp": 0.01193191, "balance_loss_clip": 1.00703907, "balance_loss_mlp": 1.00016737, "epoch": 0.8225816148620213, "flos": 15706260000000.0, "grad_norm": 2.3451780406992913, "language_loss": 0.77541125, "learning_rate": 3.2106262988508405e-07, "loss": 0.80041599, "num_input_tokens_seen": 147747665, "step": 6841, "time_per_iteration": 2.7733771800994873 }, { "auxiliary_loss_clip": 0.013043, "auxiliary_loss_mlp": 0.01193186, "balance_loss_clip": 1.00660229, "balance_loss_mlp": 1.00016201, "epoch": 0.8227018577526604, "flos": 18515121300000.0, "grad_norm": 1.7200576388676594, "language_loss": 0.74282187, "learning_rate": 3.206394577657465e-07, "loss": 0.76779675, "num_input_tokens_seen": 147765445, "step": 6842, "time_per_iteration": 2.7264838218688965 }, { "auxiliary_loss_clip": 0.01328572, "auxiliary_loss_mlp": 0.01193202, "balance_loss_clip": 1.00780296, "balance_loss_mlp": 1.00017786, "epoch": 0.8228221006432994, "flos": 22236796393920.0, "grad_norm": 2.7220188556930847, "language_loss": 0.7301861, "learning_rate": 3.202165404040675e-07, "loss": 0.75540376, "num_input_tokens_seen": 147783365, "step": 6843, "time_per_iteration": 2.7727935314178467 }, { "auxiliary_loss_clip": 0.01262381, "auxiliary_loss_mlp": 0.01193244, "balance_loss_clip": 1.00738192, "balance_loss_mlp": 1.00022006, "epoch": 0.8229423435339386, "flos": 24097544131680.0, "grad_norm": 1.8331848902790777, "language_loss": 0.74789774, "learning_rate": 3.1979387786420396e-07, "loss": 0.77245402, "num_input_tokens_seen": 147803605, "step": 6844, "time_per_iteration": 2.862189769744873 }, { "auxiliary_loss_clip": 0.01323971, "auxiliary_loss_mlp": 0.0119318, "balance_loss_clip": 1.00782609, "balance_loss_mlp": 1.00015664, "epoch": 0.8230625864245776, "flos": 23878560785760.0, "grad_norm": 1.707388136097626, "language_loss": 0.82140803, "learning_rate": 3.1937147021027346e-07, "loss": 0.84657955, "num_input_tokens_seen": 147822060, "step": 6845, "time_per_iteration": 2.7732863426208496 }, { "auxiliary_loss_clip": 0.01326395, "auxiliary_loss_mlp": 0.01193177, "balance_loss_clip": 1.00735903, "balance_loss_mlp": 1.00015318, "epoch": 0.8231828293152167, "flos": 16581115673280.0, "grad_norm": 2.735718283735525, "language_loss": 0.75975043, "learning_rate": 3.189493175063547e-07, "loss": 0.78494614, "num_input_tokens_seen": 147839295, "step": 6846, "time_per_iteration": 2.737354278564453 }, { "auxiliary_loss_clip": 0.01306701, "auxiliary_loss_mlp": 0.011932, "balance_loss_clip": 1.00726199, "balance_loss_mlp": 1.00017643, "epoch": 0.8233030722058559, "flos": 18880081568640.0, "grad_norm": 1.7344772062080012, "language_loss": 0.67360991, "learning_rate": 3.1852741981648776e-07, "loss": 0.69860899, "num_input_tokens_seen": 147857945, "step": 6847, "time_per_iteration": 3.7956061363220215 }, { "auxiliary_loss_clip": 0.01290027, "auxiliary_loss_mlp": 0.01193197, "balance_loss_clip": 1.00777853, "balance_loss_mlp": 1.00017285, "epoch": 0.8234233150964949, "flos": 28439037387360.0, "grad_norm": 2.0388632941470437, "language_loss": 0.70108008, "learning_rate": 3.1810577720467404e-07, "loss": 0.72591233, "num_input_tokens_seen": 147879675, "step": 6848, "time_per_iteration": 2.8829843997955322 }, { "auxiliary_loss_clip": 0.01311328, "auxiliary_loss_mlp": 0.01193178, "balance_loss_clip": 1.00700843, "balance_loss_mlp": 1.0001539, "epoch": 0.823543557987134, "flos": 33765955699680.0, "grad_norm": 1.5717597032860025, "language_loss": 0.56560302, "learning_rate": 3.176843897348769e-07, "loss": 0.59064806, "num_input_tokens_seen": 147902870, "step": 6849, "time_per_iteration": 4.804013013839722 }, { "auxiliary_loss_clip": 0.01312891, "auxiliary_loss_mlp": 0.01193215, "balance_loss_clip": 1.00764179, "balance_loss_mlp": 1.00019157, "epoch": 0.8236638008777731, "flos": 17092376177760.0, "grad_norm": 2.3564352946978886, "language_loss": 0.7509259, "learning_rate": 3.1726325747102034e-07, "loss": 0.77598703, "num_input_tokens_seen": 147921245, "step": 6850, "time_per_iteration": 2.6900298595428467 }, { "auxiliary_loss_clip": 0.01288635, "auxiliary_loss_mlp": 0.01193232, "balance_loss_clip": 1.00759673, "balance_loss_mlp": 1.00020874, "epoch": 0.8237840437684122, "flos": 61640003966400.0, "grad_norm": 1.4601483470085523, "language_loss": 0.63953811, "learning_rate": 3.1684238047698974e-07, "loss": 0.66435677, "num_input_tokens_seen": 147949515, "step": 6851, "time_per_iteration": 3.1924922466278076 }, { "auxiliary_loss_clip": 0.01305594, "auxiliary_loss_mlp": 0.01193229, "balance_loss_clip": 1.0066514, "balance_loss_mlp": 1.0002054, "epoch": 0.8239042866590512, "flos": 27309036431520.0, "grad_norm": 2.1549691912938886, "language_loss": 0.52973735, "learning_rate": 3.1642175881663155e-07, "loss": 0.55472553, "num_input_tokens_seen": 147969245, "step": 6852, "time_per_iteration": 2.7855143547058105 }, { "auxiliary_loss_clip": 0.01347767, "auxiliary_loss_mlp": 0.01193169, "balance_loss_clip": 1.00727558, "balance_loss_mlp": 1.00014544, "epoch": 0.8240245295496904, "flos": 21726362134080.0, "grad_norm": 1.9083509150162472, "language_loss": 0.8381846, "learning_rate": 3.160013925537537e-07, "loss": 0.86359394, "num_input_tokens_seen": 147990080, "step": 6853, "time_per_iteration": 2.717360734939575 }, { "auxiliary_loss_clip": 0.01283395, "auxiliary_loss_mlp": 0.01193222, "balance_loss_clip": 1.00736141, "balance_loss_mlp": 1.00019884, "epoch": 0.8241447724403295, "flos": 20009328127200.0, "grad_norm": 1.7940334648794123, "language_loss": 0.75516385, "learning_rate": 3.155812817521266e-07, "loss": 0.77993, "num_input_tokens_seen": 148010455, "step": 6854, "time_per_iteration": 2.7619547843933105 }, { "auxiliary_loss_clip": 0.01308092, "auxiliary_loss_mlp": 0.01193203, "balance_loss_clip": 1.0071466, "balance_loss_mlp": 1.00017941, "epoch": 0.8242650153309685, "flos": 22272994254240.0, "grad_norm": 1.8198610284765078, "language_loss": 0.78139001, "learning_rate": 3.151614264754787e-07, "loss": 0.80640298, "num_input_tokens_seen": 148028400, "step": 6855, "time_per_iteration": 2.7434542179107666 }, { "auxiliary_loss_clip": 0.01348849, "auxiliary_loss_mlp": 0.01193184, "balance_loss_clip": 1.00748098, "balance_loss_mlp": 1.00016057, "epoch": 0.8243852582216077, "flos": 22309982435520.0, "grad_norm": 2.488390139854043, "language_loss": 0.79431713, "learning_rate": 3.147418267875035e-07, "loss": 0.81973755, "num_input_tokens_seen": 148046530, "step": 6856, "time_per_iteration": 2.7317140102386475 }, { "auxiliary_loss_clip": 0.01264752, "auxiliary_loss_mlp": 0.00872414, "balance_loss_clip": 1.0062027, "balance_loss_mlp": 1.00038576, "epoch": 0.8245055011122467, "flos": 24645433580640.0, "grad_norm": 2.1639257724864347, "language_loss": 0.6568864, "learning_rate": 3.1432248275185315e-07, "loss": 0.67825806, "num_input_tokens_seen": 148067040, "step": 6857, "time_per_iteration": 2.915736198425293 }, { "auxiliary_loss_clip": 0.0132357, "auxiliary_loss_mlp": 0.01193211, "balance_loss_clip": 1.00748658, "balance_loss_mlp": 1.00018692, "epoch": 0.8246257440028858, "flos": 17487283586400.0, "grad_norm": 2.1613941700784944, "language_loss": 0.77079362, "learning_rate": 3.139033944321412e-07, "loss": 0.79596144, "num_input_tokens_seen": 148084400, "step": 6858, "time_per_iteration": 2.7978503704071045 }, { "auxiliary_loss_clip": 0.01330216, "auxiliary_loss_mlp": 0.01193205, "balance_loss_clip": 1.00763607, "balance_loss_mlp": 1.00018108, "epoch": 0.824745986893525, "flos": 25010142383520.0, "grad_norm": 1.8547017383714637, "language_loss": 0.78994727, "learning_rate": 3.1348456189194507e-07, "loss": 0.81518149, "num_input_tokens_seen": 148104860, "step": 6859, "time_per_iteration": 2.7348272800445557 }, { "auxiliary_loss_clip": 0.01307434, "auxiliary_loss_mlp": 0.01193231, "balance_loss_clip": 1.00764477, "balance_loss_mlp": 1.00020719, "epoch": 0.824866229784164, "flos": 18772709392800.0, "grad_norm": 1.5447755562453231, "language_loss": 0.82831687, "learning_rate": 3.1306598519479876e-07, "loss": 0.85332352, "num_input_tokens_seen": 148124680, "step": 6860, "time_per_iteration": 2.8005738258361816 }, { "auxiliary_loss_clip": 0.01309504, "auxiliary_loss_mlp": 0.01193209, "balance_loss_clip": 1.0075531, "balance_loss_mlp": 1.00018573, "epoch": 0.8249864726748031, "flos": 23842183307040.0, "grad_norm": 1.526375707886911, "language_loss": 0.78163636, "learning_rate": 3.1264766440420177e-07, "loss": 0.80666351, "num_input_tokens_seen": 148147150, "step": 6861, "time_per_iteration": 2.8482251167297363 }, { "auxiliary_loss_clip": 0.01324151, "auxiliary_loss_mlp": 0.01193064, "balance_loss_clip": 1.00717139, "balance_loss_mlp": 1.00013578, "epoch": 0.8251067155654422, "flos": 20303114080320.0, "grad_norm": 1.8116169265588038, "language_loss": 0.6921584, "learning_rate": 3.122295995836124e-07, "loss": 0.71733052, "num_input_tokens_seen": 148167020, "step": 6862, "time_per_iteration": 2.683753252029419 }, { "auxiliary_loss_clip": 0.01336504, "auxiliary_loss_mlp": 0.01193215, "balance_loss_clip": 1.00765669, "balance_loss_mlp": 1.00019121, "epoch": 0.8252269584560813, "flos": 25009711299360.0, "grad_norm": 1.8638274565901836, "language_loss": 0.77406192, "learning_rate": 3.118117907964508e-07, "loss": 0.79935914, "num_input_tokens_seen": 148188965, "step": 6863, "time_per_iteration": 2.7996668815612793 }, { "auxiliary_loss_clip": 0.01305798, "auxiliary_loss_mlp": 0.01193279, "balance_loss_clip": 1.00717008, "balance_loss_mlp": 1.00025547, "epoch": 0.8253472013467203, "flos": 17128574038080.0, "grad_norm": 1.7762703606356214, "language_loss": 0.80332422, "learning_rate": 3.1139423810609856e-07, "loss": 0.82831496, "num_input_tokens_seen": 148205660, "step": 6864, "time_per_iteration": 2.7820181846618652 }, { "auxiliary_loss_clip": 0.01348233, "auxiliary_loss_mlp": 0.0119314, "balance_loss_clip": 1.00741792, "balance_loss_mlp": 1.00011611, "epoch": 0.8254674442373595, "flos": 22414804030080.0, "grad_norm": 1.9409160921898747, "language_loss": 0.75135821, "learning_rate": 3.1097694157589714e-07, "loss": 0.77677196, "num_input_tokens_seen": 148225545, "step": 6865, "time_per_iteration": 2.778696060180664 }, { "auxiliary_loss_clip": 0.01324142, "auxiliary_loss_mlp": 0.01193054, "balance_loss_clip": 1.00756788, "balance_loss_mlp": 1.00012541, "epoch": 0.8255876871279986, "flos": 24786776348640.0, "grad_norm": 3.0858526685584375, "language_loss": 0.7581985, "learning_rate": 3.105599012691511e-07, "loss": 0.78337049, "num_input_tokens_seen": 148243975, "step": 6866, "time_per_iteration": 2.7363293170928955 }, { "auxiliary_loss_clip": 0.01323585, "auxiliary_loss_mlp": 0.01193162, "balance_loss_clip": 1.00729775, "balance_loss_mlp": 1.0001384, "epoch": 0.8257079300186376, "flos": 27455444438400.0, "grad_norm": 1.683347439997975, "language_loss": 0.820508, "learning_rate": 3.101431172491249e-07, "loss": 0.84567547, "num_input_tokens_seen": 148265520, "step": 6867, "time_per_iteration": 2.7841813564300537 }, { "auxiliary_loss_clip": 0.01301879, "auxiliary_loss_mlp": 0.00872508, "balance_loss_clip": 1.00739443, "balance_loss_mlp": 1.00035286, "epoch": 0.8258281729092768, "flos": 16471875466080.0, "grad_norm": 2.049710860309185, "language_loss": 0.7199502, "learning_rate": 3.097265895790444e-07, "loss": 0.74169409, "num_input_tokens_seen": 148283730, "step": 6868, "time_per_iteration": 2.857266426086426 }, { "auxiliary_loss_clip": 0.01303124, "auxiliary_loss_mlp": 0.0119318, "balance_loss_clip": 1.00801492, "balance_loss_mlp": 1.00015616, "epoch": 0.8259484157999158, "flos": 21433833509760.0, "grad_norm": 1.8916976538666384, "language_loss": 0.8342573, "learning_rate": 3.093103183220962e-07, "loss": 0.85922039, "num_input_tokens_seen": 148303775, "step": 6869, "time_per_iteration": 2.846088409423828 }, { "auxiliary_loss_clip": 0.01299752, "auxiliary_loss_mlp": 0.01192274, "balance_loss_clip": 1.00385785, "balance_loss_mlp": 1.00001287, "epoch": 0.8260686586905549, "flos": 58322374104960.0, "grad_norm": 0.8156177539883263, "language_loss": 0.59392262, "learning_rate": 3.0889430354142796e-07, "loss": 0.61884284, "num_input_tokens_seen": 148365285, "step": 6870, "time_per_iteration": 3.3151886463165283 }, { "auxiliary_loss_clip": 0.01296656, "auxiliary_loss_mlp": 0.01193172, "balance_loss_clip": 1.00653124, "balance_loss_mlp": 1.00014877, "epoch": 0.826188901581194, "flos": 27527301303840.0, "grad_norm": 1.737927277102702, "language_loss": 0.70092165, "learning_rate": 3.084785453001497e-07, "loss": 0.72581989, "num_input_tokens_seen": 148386200, "step": 6871, "time_per_iteration": 2.8011701107025146 }, { "auxiliary_loss_clip": 0.01304056, "auxiliary_loss_mlp": 0.00872488, "balance_loss_clip": 1.00717783, "balance_loss_mlp": 1.00051045, "epoch": 0.8263091444718331, "flos": 23696062689600.0, "grad_norm": 2.75279491542921, "language_loss": 0.81616122, "learning_rate": 3.080630436613314e-07, "loss": 0.83792663, "num_input_tokens_seen": 148403970, "step": 6872, "time_per_iteration": 2.821678400039673 }, { "auxiliary_loss_clip": 0.01336216, "auxiliary_loss_mlp": 0.01193195, "balance_loss_clip": 1.00782263, "balance_loss_mlp": 1.00017118, "epoch": 0.8264293873624722, "flos": 17165167058880.0, "grad_norm": 1.9931851916235719, "language_loss": 0.86128294, "learning_rate": 3.076477986880039e-07, "loss": 0.88657707, "num_input_tokens_seen": 148421765, "step": 6873, "time_per_iteration": 3.688856601715088 }, { "auxiliary_loss_clip": 0.01305832, "auxiliary_loss_mlp": 0.01193105, "balance_loss_clip": 1.00696552, "balance_loss_mlp": 1.00017691, "epoch": 0.8265496302531112, "flos": 24098657765760.0, "grad_norm": 2.9983537276710552, "language_loss": 0.69308937, "learning_rate": 3.0723281044315986e-07, "loss": 0.71807879, "num_input_tokens_seen": 148443720, "step": 6874, "time_per_iteration": 2.721977472305298 }, { "auxiliary_loss_clip": 0.01347373, "auxiliary_loss_mlp": 0.01193077, "balance_loss_clip": 1.0071795, "balance_loss_mlp": 1.00014913, "epoch": 0.8266698731437504, "flos": 14099903147520.0, "grad_norm": 2.0089219415173276, "language_loss": 0.76347297, "learning_rate": 3.068180789897521e-07, "loss": 0.78887755, "num_input_tokens_seen": 148462130, "step": 6875, "time_per_iteration": 4.720479726791382 }, { "auxiliary_loss_clip": 0.01331897, "auxiliary_loss_mlp": 0.011933, "balance_loss_clip": 1.00731742, "balance_loss_mlp": 1.00018096, "epoch": 0.8267901160343895, "flos": 30777577968960.0, "grad_norm": 1.4092134080962808, "language_loss": 0.815377, "learning_rate": 3.064036043906966e-07, "loss": 0.84062892, "num_input_tokens_seen": 148485570, "step": 6876, "time_per_iteration": 2.8914151191711426 }, { "auxiliary_loss_clip": 0.01300276, "auxiliary_loss_mlp": 0.01193239, "balance_loss_clip": 1.00702655, "balance_loss_mlp": 1.00021577, "epoch": 0.8269103589250285, "flos": 40624931188800.0, "grad_norm": 1.8717946371987495, "language_loss": 0.68166184, "learning_rate": 3.059893867088668e-07, "loss": 0.70659697, "num_input_tokens_seen": 148509715, "step": 6877, "time_per_iteration": 2.9398720264434814 }, { "auxiliary_loss_clip": 0.01323834, "auxiliary_loss_mlp": 0.01193092, "balance_loss_clip": 1.00715041, "balance_loss_mlp": 1.00016356, "epoch": 0.8270306018156677, "flos": 30263659112160.0, "grad_norm": 1.7695956051020814, "language_loss": 0.66770673, "learning_rate": 3.055754260071004e-07, "loss": 0.69287598, "num_input_tokens_seen": 148532010, "step": 6878, "time_per_iteration": 2.7921624183654785 }, { "auxiliary_loss_clip": 0.01329188, "auxiliary_loss_mlp": 0.0119318, "balance_loss_clip": 1.00762069, "balance_loss_mlp": 1.0001564, "epoch": 0.8271508447063067, "flos": 25226611071840.0, "grad_norm": 1.9825697580446588, "language_loss": 0.73292524, "learning_rate": 3.051617223481948e-07, "loss": 0.75814891, "num_input_tokens_seen": 148553330, "step": 6879, "time_per_iteration": 2.7473349571228027 }, { "auxiliary_loss_clip": 0.0129755, "auxiliary_loss_mlp": 0.01193106, "balance_loss_clip": 1.00844717, "balance_loss_mlp": 1.00017774, "epoch": 0.8272710875969458, "flos": 17566612577280.0, "grad_norm": 1.9473917925701232, "language_loss": 0.75069159, "learning_rate": 3.047482757949078e-07, "loss": 0.77559817, "num_input_tokens_seen": 148570960, "step": 6880, "time_per_iteration": 2.8855273723602295 }, { "auxiliary_loss_clip": 0.01291386, "auxiliary_loss_mlp": 0.00872407, "balance_loss_clip": 1.00659394, "balance_loss_mlp": 1.00031853, "epoch": 0.827391330487585, "flos": 19755476097120.0, "grad_norm": 1.879370478913653, "language_loss": 0.86001688, "learning_rate": 3.043350864099605e-07, "loss": 0.88165474, "num_input_tokens_seen": 148589520, "step": 6881, "time_per_iteration": 2.7783584594726562 }, { "auxiliary_loss_clip": 0.01336206, "auxiliary_loss_mlp": 0.01193139, "balance_loss_clip": 1.00789189, "balance_loss_mlp": 1.00011539, "epoch": 0.827511573378224, "flos": 16835183245440.0, "grad_norm": 1.95947572302226, "language_loss": 0.80709493, "learning_rate": 3.039221542560315e-07, "loss": 0.8323884, "num_input_tokens_seen": 148606085, "step": 6882, "time_per_iteration": 2.679358720779419 }, { "auxiliary_loss_clip": 0.01325392, "auxiliary_loss_mlp": 0.0119316, "balance_loss_clip": 1.00766826, "balance_loss_mlp": 1.00013602, "epoch": 0.8276318162688631, "flos": 18369252148320.0, "grad_norm": 1.7814714261190912, "language_loss": 0.73547351, "learning_rate": 3.0350947939576356e-07, "loss": 0.76065904, "num_input_tokens_seen": 148625240, "step": 6883, "time_per_iteration": 2.757560968399048 }, { "auxiliary_loss_clip": 0.01337817, "auxiliary_loss_mlp": 0.01193297, "balance_loss_clip": 1.00849414, "balance_loss_mlp": 1.00017834, "epoch": 0.8277520591595022, "flos": 19352701402560.0, "grad_norm": 2.7290054044054726, "language_loss": 0.72048414, "learning_rate": 3.0309706189175876e-07, "loss": 0.74579525, "num_input_tokens_seen": 148645075, "step": 6884, "time_per_iteration": 2.725935220718384 }, { "auxiliary_loss_clip": 0.01287156, "auxiliary_loss_mlp": 0.01192274, "balance_loss_clip": 1.00408316, "balance_loss_mlp": 1.00001323, "epoch": 0.8278723020501413, "flos": 67918892883840.0, "grad_norm": 0.8037154632173545, "language_loss": 0.5738728, "learning_rate": 3.0268490180658045e-07, "loss": 0.59866709, "num_input_tokens_seen": 148707855, "step": 6885, "time_per_iteration": 3.294687271118164 }, { "auxiliary_loss_clip": 0.01349359, "auxiliary_loss_mlp": 0.01193225, "balance_loss_clip": 1.00797904, "balance_loss_mlp": 1.00020134, "epoch": 0.8279925449407803, "flos": 18185748189120.0, "grad_norm": 2.153121743282077, "language_loss": 0.79171896, "learning_rate": 3.0227299920275305e-07, "loss": 0.81714475, "num_input_tokens_seen": 148724170, "step": 6886, "time_per_iteration": 2.7378499507904053 }, { "auxiliary_loss_clip": 0.01282068, "auxiliary_loss_mlp": 0.01193294, "balance_loss_clip": 1.00656319, "balance_loss_mlp": 1.00017524, "epoch": 0.8281127878314195, "flos": 20631445404480.0, "grad_norm": 1.7945421234207934, "language_loss": 0.85958213, "learning_rate": 3.018613541427613e-07, "loss": 0.8843357, "num_input_tokens_seen": 148743690, "step": 6887, "time_per_iteration": 2.8576669692993164 }, { "auxiliary_loss_clip": 0.01348069, "auxiliary_loss_mlp": 0.01193109, "balance_loss_clip": 1.0075618, "balance_loss_mlp": 1.00018048, "epoch": 0.8282330307220586, "flos": 18004291879680.0, "grad_norm": 1.6428542381695186, "language_loss": 0.73875469, "learning_rate": 3.0144996668905243e-07, "loss": 0.76416647, "num_input_tokens_seen": 148761070, "step": 6888, "time_per_iteration": 2.6263575553894043 }, { "auxiliary_loss_clip": 0.01276171, "auxiliary_loss_mlp": 0.00872436, "balance_loss_clip": 1.00755525, "balance_loss_mlp": 1.00036669, "epoch": 0.8283532736126976, "flos": 20084130734400.0, "grad_norm": 1.8802637326850051, "language_loss": 0.82086599, "learning_rate": 3.010388369040331e-07, "loss": 0.84235203, "num_input_tokens_seen": 148779730, "step": 6889, "time_per_iteration": 2.9936106204986572 }, { "auxiliary_loss_clip": 0.01326314, "auxiliary_loss_mlp": 0.01193238, "balance_loss_clip": 1.00716388, "balance_loss_mlp": 1.00021398, "epoch": 0.8284735165033368, "flos": 31868435322720.0, "grad_norm": 1.5410263304849536, "language_loss": 0.82618201, "learning_rate": 3.0062796485007156e-07, "loss": 0.85137749, "num_input_tokens_seen": 148800670, "step": 6890, "time_per_iteration": 2.8296961784362793 }, { "auxiliary_loss_clip": 0.01349015, "auxiliary_loss_mlp": 0.00872522, "balance_loss_clip": 1.00767517, "balance_loss_mlp": 1.00050509, "epoch": 0.8285937593939758, "flos": 26651331996480.0, "grad_norm": 2.2473220066860367, "language_loss": 0.65973759, "learning_rate": 3.002173505894965e-07, "loss": 0.68195289, "num_input_tokens_seen": 148819820, "step": 6891, "time_per_iteration": 2.7595975399017334 }, { "auxiliary_loss_clip": 0.0133836, "auxiliary_loss_mlp": 0.01193333, "balance_loss_clip": 1.00838041, "balance_loss_mlp": 1.00021434, "epoch": 0.8287140022846149, "flos": 20193694254720.0, "grad_norm": 3.3730600750244037, "language_loss": 0.61968327, "learning_rate": 2.998069941845973e-07, "loss": 0.64500022, "num_input_tokens_seen": 148838890, "step": 6892, "time_per_iteration": 2.801311731338501 }, { "auxiliary_loss_clip": 0.01316771, "auxiliary_loss_mlp": 0.01192266, "balance_loss_clip": 1.00382459, "balance_loss_mlp": 1.00000572, "epoch": 0.8288342451752541, "flos": 70756012910880.0, "grad_norm": 0.7136669544942044, "language_loss": 0.57485831, "learning_rate": 2.993968956976258e-07, "loss": 0.59994864, "num_input_tokens_seen": 148906635, "step": 6893, "time_per_iteration": 3.3350565433502197 }, { "auxiliary_loss_clip": 0.01350446, "auxiliary_loss_mlp": 0.01193194, "balance_loss_clip": 1.00820756, "balance_loss_mlp": 1.00017059, "epoch": 0.8289544880658931, "flos": 24572247539040.0, "grad_norm": 2.0078977237279245, "language_loss": 0.70497209, "learning_rate": 2.9898705519079313e-07, "loss": 0.73040843, "num_input_tokens_seen": 148925740, "step": 6894, "time_per_iteration": 2.731715440750122 }, { "auxiliary_loss_clip": 0.01312755, "auxiliary_loss_mlp": 0.01193159, "balance_loss_clip": 1.00739253, "balance_loss_mlp": 1.00013494, "epoch": 0.8290747309565322, "flos": 22273389414720.0, "grad_norm": 1.6294700960254087, "language_loss": 0.74771821, "learning_rate": 2.985774727262715e-07, "loss": 0.77277744, "num_input_tokens_seen": 148944585, "step": 6895, "time_per_iteration": 2.7780189514160156 }, { "auxiliary_loss_clip": 0.01348537, "auxiliary_loss_mlp": 0.01193167, "balance_loss_clip": 1.00783885, "balance_loss_mlp": 1.00014353, "epoch": 0.8291949738471713, "flos": 23255581340160.0, "grad_norm": 1.7332297212588401, "language_loss": 0.81178403, "learning_rate": 2.981681483661949e-07, "loss": 0.837201, "num_input_tokens_seen": 148964170, "step": 6896, "time_per_iteration": 2.7151176929473877 }, { "auxiliary_loss_clip": 0.01327483, "auxiliary_loss_mlp": 0.01193227, "balance_loss_clip": 1.00765991, "balance_loss_mlp": 1.00020325, "epoch": 0.8293152167378104, "flos": 52555787478720.0, "grad_norm": 1.6208332755586499, "language_loss": 0.7128033, "learning_rate": 2.9775908217265633e-07, "loss": 0.73801041, "num_input_tokens_seen": 148989405, "step": 6897, "time_per_iteration": 3.1523635387420654 }, { "auxiliary_loss_clip": 0.01234091, "auxiliary_loss_mlp": 0.01192269, "balance_loss_clip": 1.00378561, "balance_loss_mlp": 1.00000846, "epoch": 0.8294354596284494, "flos": 63356189014080.0, "grad_norm": 0.8288115699936853, "language_loss": 0.50415057, "learning_rate": 2.9735027420771253e-07, "loss": 0.52841419, "num_input_tokens_seen": 149049740, "step": 6898, "time_per_iteration": 3.385157346725464 }, { "auxiliary_loss_clip": 0.01301424, "auxiliary_loss_mlp": 0.01193192, "balance_loss_clip": 1.00726259, "balance_loss_mlp": 1.00016809, "epoch": 0.8295557025190886, "flos": 24827033584800.0, "grad_norm": 1.6680719581433603, "language_loss": 0.71613193, "learning_rate": 2.969417245333774e-07, "loss": 0.74107802, "num_input_tokens_seen": 149069120, "step": 6899, "time_per_iteration": 3.970348596572876 }, { "auxiliary_loss_clip": 0.01278234, "auxiliary_loss_mlp": 0.01193106, "balance_loss_clip": 1.00707018, "balance_loss_mlp": 1.00017798, "epoch": 0.8296759454097277, "flos": 25118592269760.0, "grad_norm": 1.8898793161041856, "language_loss": 0.7807979, "learning_rate": 2.9653343321162915e-07, "loss": 0.80551136, "num_input_tokens_seen": 149088630, "step": 6900, "time_per_iteration": 2.8680968284606934 }, { "auxiliary_loss_clip": 0.01281723, "auxiliary_loss_mlp": 0.01193192, "balance_loss_clip": 1.00718021, "balance_loss_mlp": 1.00016809, "epoch": 0.8297961883003667, "flos": 24132592434240.0, "grad_norm": 1.9769774116243837, "language_loss": 0.6503554, "learning_rate": 2.9612540030440446e-07, "loss": 0.6751045, "num_input_tokens_seen": 149109175, "step": 6901, "time_per_iteration": 4.7224814891815186 }, { "auxiliary_loss_clip": 0.01283391, "auxiliary_loss_mlp": 0.01192266, "balance_loss_clip": 1.00370431, "balance_loss_mlp": 1.0000056, "epoch": 0.8299164311910058, "flos": 67446596363040.0, "grad_norm": 0.8890952950586848, "language_loss": 0.64114809, "learning_rate": 2.9571762587360206e-07, "loss": 0.66590464, "num_input_tokens_seen": 149165560, "step": 6902, "time_per_iteration": 3.2690818309783936 }, { "auxiliary_loss_clip": 0.01297653, "auxiliary_loss_mlp": 0.011932, "balance_loss_clip": 1.00708616, "balance_loss_mlp": 1.00017595, "epoch": 0.8300366740816449, "flos": 25228694645280.0, "grad_norm": 1.4633538453728154, "language_loss": 0.73749375, "learning_rate": 2.953101099810806e-07, "loss": 0.7624023, "num_input_tokens_seen": 149185165, "step": 6903, "time_per_iteration": 2.919065475463867 }, { "auxiliary_loss_clip": 0.01321646, "auxiliary_loss_mlp": 0.01193186, "balance_loss_clip": 1.00740719, "balance_loss_mlp": 1.00016212, "epoch": 0.830156916972284, "flos": 18041028595200.0, "grad_norm": 1.9497969386471095, "language_loss": 0.82581723, "learning_rate": 2.9490285268865965e-07, "loss": 0.85096562, "num_input_tokens_seen": 149202655, "step": 6904, "time_per_iteration": 2.741974353790283 }, { "auxiliary_loss_clip": 0.01329452, "auxiliary_loss_mlp": 0.01193195, "balance_loss_clip": 1.00788736, "balance_loss_mlp": 1.0001719, "epoch": 0.830277159862923, "flos": 26322497740800.0, "grad_norm": 1.9115115759862193, "language_loss": 0.79412568, "learning_rate": 2.9449585405812085e-07, "loss": 0.81935215, "num_input_tokens_seen": 149220035, "step": 6905, "time_per_iteration": 2.754328966140747 }, { "auxiliary_loss_clip": 0.01276423, "auxiliary_loss_mlp": 0.01193173, "balance_loss_clip": 1.00699091, "balance_loss_mlp": 1.00014937, "epoch": 0.8303974027535622, "flos": 19938872285280.0, "grad_norm": 1.7613094418025041, "language_loss": 0.73760778, "learning_rate": 2.940891141512043e-07, "loss": 0.76230371, "num_input_tokens_seen": 149238055, "step": 6906, "time_per_iteration": 2.7870664596557617 }, { "auxiliary_loss_clip": 0.01323907, "auxiliary_loss_mlp": 0.01193228, "balance_loss_clip": 1.00808227, "balance_loss_mlp": 1.00020456, "epoch": 0.8305176456442013, "flos": 17165562219360.0, "grad_norm": 2.50336723242551, "language_loss": 0.71787459, "learning_rate": 2.9368263302961385e-07, "loss": 0.74304593, "num_input_tokens_seen": 149256755, "step": 6907, "time_per_iteration": 2.78914213180542 }, { "auxiliary_loss_clip": 0.01258697, "auxiliary_loss_mlp": 0.01193288, "balance_loss_clip": 1.0066092, "balance_loss_mlp": 1.00016916, "epoch": 0.8306378885348403, "flos": 25627625506080.0, "grad_norm": 1.735159807169372, "language_loss": 0.79555279, "learning_rate": 2.9327641075501075e-07, "loss": 0.82007265, "num_input_tokens_seen": 149275745, "step": 6908, "time_per_iteration": 2.908926248550415 }, { "auxiliary_loss_clip": 0.01324284, "auxiliary_loss_mlp": 0.01193204, "balance_loss_clip": 1.00804317, "balance_loss_mlp": 1.00018001, "epoch": 0.8307581314254795, "flos": 33947879016960.0, "grad_norm": 2.2083563494587914, "language_loss": 0.66861993, "learning_rate": 2.9287044738901866e-07, "loss": 0.69379479, "num_input_tokens_seen": 149293730, "step": 6909, "time_per_iteration": 2.866759777069092 }, { "auxiliary_loss_clip": 0.01335185, "auxiliary_loss_mlp": 0.00872425, "balance_loss_clip": 1.00760102, "balance_loss_mlp": 1.00039136, "epoch": 0.8308783743161186, "flos": 17562732819840.0, "grad_norm": 2.142302491990843, "language_loss": 0.90600413, "learning_rate": 2.9246474299322274e-07, "loss": 0.9280802, "num_input_tokens_seen": 149309290, "step": 6910, "time_per_iteration": 2.733793020248413 }, { "auxiliary_loss_clip": 0.01249951, "auxiliary_loss_mlp": 0.01192262, "balance_loss_clip": 1.00401044, "balance_loss_mlp": 1.00000119, "epoch": 0.8309986172067576, "flos": 69412920092640.0, "grad_norm": 0.883143756770043, "language_loss": 0.63186914, "learning_rate": 2.920592976291678e-07, "loss": 0.65629125, "num_input_tokens_seen": 149366620, "step": 6911, "time_per_iteration": 3.3067476749420166 }, { "auxiliary_loss_clip": 0.0133668, "auxiliary_loss_mlp": 0.01193188, "balance_loss_clip": 1.00804925, "balance_loss_mlp": 1.00016475, "epoch": 0.8311188600973968, "flos": 22309766893440.0, "grad_norm": 1.9237952609136162, "language_loss": 0.8076632, "learning_rate": 2.916541113583595e-07, "loss": 0.83296192, "num_input_tokens_seen": 149385120, "step": 6912, "time_per_iteration": 2.834312915802002 }, { "auxiliary_loss_clip": 0.01277289, "auxiliary_loss_mlp": 0.01193163, "balance_loss_clip": 1.00798845, "balance_loss_mlp": 1.00023437, "epoch": 0.8312391029880358, "flos": 18770086964160.0, "grad_norm": 2.321346528099721, "language_loss": 0.66163743, "learning_rate": 2.912491842422642e-07, "loss": 0.68634188, "num_input_tokens_seen": 149402825, "step": 6913, "time_per_iteration": 2.78466796875 }, { "auxiliary_loss_clip": 0.01336552, "auxiliary_loss_mlp": 0.01193208, "balance_loss_clip": 1.0080744, "balance_loss_mlp": 1.0001843, "epoch": 0.8313593458786749, "flos": 20376659358720.0, "grad_norm": 1.6290858014757001, "language_loss": 0.7090683, "learning_rate": 2.9084451634230857e-07, "loss": 0.73436582, "num_input_tokens_seen": 149422125, "step": 6914, "time_per_iteration": 2.7752788066864014 }, { "auxiliary_loss_clip": 0.01289837, "auxiliary_loss_mlp": 0.01193084, "balance_loss_clip": 1.00655603, "balance_loss_mlp": 1.00015569, "epoch": 0.831479588769314, "flos": 32124083536800.0, "grad_norm": 2.1567242095284067, "language_loss": 0.71307886, "learning_rate": 2.9044010771988125e-07, "loss": 0.73790801, "num_input_tokens_seen": 149441940, "step": 6915, "time_per_iteration": 2.9094929695129395 }, { "auxiliary_loss_clip": 0.01315686, "auxiliary_loss_mlp": 0.01193176, "balance_loss_clip": 1.00774169, "balance_loss_mlp": 1.00015187, "epoch": 0.8315998316599531, "flos": 45185946645600.0, "grad_norm": 1.7535446841600986, "language_loss": 0.72230506, "learning_rate": 2.900359584363303e-07, "loss": 0.74739367, "num_input_tokens_seen": 149465045, "step": 6916, "time_per_iteration": 2.9536190032958984 }, { "auxiliary_loss_clip": 0.01273468, "auxiliary_loss_mlp": 0.01193237, "balance_loss_clip": 1.00782347, "balance_loss_mlp": 1.00021279, "epoch": 0.8317200745505922, "flos": 18363755825280.0, "grad_norm": 2.205580447158077, "language_loss": 0.84650278, "learning_rate": 2.8963206855296494e-07, "loss": 0.87116981, "num_input_tokens_seen": 149481285, "step": 6917, "time_per_iteration": 2.8334271907806396 }, { "auxiliary_loss_clip": 0.0133396, "auxiliary_loss_mlp": 0.01193161, "balance_loss_clip": 1.00756621, "balance_loss_mlp": 1.00013757, "epoch": 0.8318403174412313, "flos": 24206568796800.0, "grad_norm": 1.6400948933018271, "language_loss": 0.77031338, "learning_rate": 2.892284381310548e-07, "loss": 0.79558456, "num_input_tokens_seen": 149502700, "step": 6918, "time_per_iteration": 2.761776924133301 }, { "auxiliary_loss_clip": 0.01300204, "auxiliary_loss_mlp": 0.01193167, "balance_loss_clip": 1.00673234, "balance_loss_mlp": 1.00014305, "epoch": 0.8319605603318704, "flos": 22418791558560.0, "grad_norm": 2.85018075134019, "language_loss": 0.72148883, "learning_rate": 2.888250672318302e-07, "loss": 0.74642253, "num_input_tokens_seen": 149520100, "step": 6919, "time_per_iteration": 2.8461246490478516 }, { "auxiliary_loss_clip": 0.01349322, "auxiliary_loss_mlp": 0.01193345, "balance_loss_clip": 1.0083071, "balance_loss_mlp": 1.00022554, "epoch": 0.8320808032225094, "flos": 37414516599360.0, "grad_norm": 1.5709621955314295, "language_loss": 0.68484581, "learning_rate": 2.884219559164831e-07, "loss": 0.71027243, "num_input_tokens_seen": 149543245, "step": 6920, "time_per_iteration": 2.85813045501709 }, { "auxiliary_loss_clip": 0.01324692, "auxiliary_loss_mlp": 0.01193196, "balance_loss_clip": 1.00729704, "balance_loss_mlp": 1.0001725, "epoch": 0.8322010461131486, "flos": 12787404095520.0, "grad_norm": 1.7598658340082929, "language_loss": 0.81010985, "learning_rate": 2.880191042461635e-07, "loss": 0.83528876, "num_input_tokens_seen": 149559185, "step": 6921, "time_per_iteration": 2.72422194480896 }, { "auxiliary_loss_clip": 0.01286298, "auxiliary_loss_mlp": 0.01193182, "balance_loss_clip": 1.00736809, "balance_loss_mlp": 1.00015879, "epoch": 0.8323212890037877, "flos": 15815464283520.0, "grad_norm": 1.6743513522056934, "language_loss": 0.80337107, "learning_rate": 2.876165122819849e-07, "loss": 0.82816589, "num_input_tokens_seen": 149577165, "step": 6922, "time_per_iteration": 2.939074993133545 }, { "auxiliary_loss_clip": 0.01347093, "auxiliary_loss_mlp": 0.01193173, "balance_loss_clip": 1.00765252, "balance_loss_mlp": 1.00014973, "epoch": 0.8324415318944267, "flos": 21719285169120.0, "grad_norm": 1.5335628501450318, "language_loss": 0.79432142, "learning_rate": 2.872141800850201e-07, "loss": 0.81972414, "num_input_tokens_seen": 149594340, "step": 6923, "time_per_iteration": 2.673689842224121 }, { "auxiliary_loss_clip": 0.01348083, "auxiliary_loss_mlp": 0.01193084, "balance_loss_clip": 1.00775027, "balance_loss_mlp": 1.00015593, "epoch": 0.8325617747850659, "flos": 34198713457920.0, "grad_norm": 1.6639603650597223, "language_loss": 0.7318598, "learning_rate": 2.868121077163024e-07, "loss": 0.75727147, "num_input_tokens_seen": 149613895, "step": 6924, "time_per_iteration": 2.8413984775543213 }, { "auxiliary_loss_clip": 0.01336087, "auxiliary_loss_mlp": 0.0119324, "balance_loss_clip": 1.00770712, "balance_loss_mlp": 1.0002166, "epoch": 0.8326820176757049, "flos": 18369467690400.0, "grad_norm": 1.6072083378525057, "language_loss": 0.72241008, "learning_rate": 2.864102952368257e-07, "loss": 0.74770343, "num_input_tokens_seen": 149631820, "step": 6925, "time_per_iteration": 3.65053129196167 }, { "auxiliary_loss_clip": 0.01286928, "auxiliary_loss_mlp": 0.01193078, "balance_loss_clip": 1.0071857, "balance_loss_mlp": 1.00014949, "epoch": 0.832802260566344, "flos": 35991340392960.0, "grad_norm": 1.4003323149685198, "language_loss": 0.59428483, "learning_rate": 2.860087427075444e-07, "loss": 0.61908489, "num_input_tokens_seen": 149656070, "step": 6926, "time_per_iteration": 2.9869513511657715 }, { "auxiliary_loss_clip": 0.01318611, "auxiliary_loss_mlp": 0.01193084, "balance_loss_clip": 1.00747609, "balance_loss_mlp": 1.00015616, "epoch": 0.8329225034569832, "flos": 14244443123040.0, "grad_norm": 2.4246270229021656, "language_loss": 0.86361659, "learning_rate": 2.856074501893744e-07, "loss": 0.88873357, "num_input_tokens_seen": 149671270, "step": 6927, "time_per_iteration": 3.690953254699707 }, { "auxiliary_loss_clip": 0.01327943, "auxiliary_loss_mlp": 0.01193147, "balance_loss_clip": 1.0078913, "balance_loss_mlp": 1.00012374, "epoch": 0.8330427463476222, "flos": 18077477921280.0, "grad_norm": 1.6098520756849448, "language_loss": 0.81423092, "learning_rate": 2.8520641774319054e-07, "loss": 0.83944178, "num_input_tokens_seen": 149689360, "step": 6928, "time_per_iteration": 3.658642530441284 }, { "auxiliary_loss_clip": 0.01323775, "auxiliary_loss_mlp": 0.01193169, "balance_loss_clip": 1.00775743, "balance_loss_mlp": 1.00014544, "epoch": 0.8331629892382613, "flos": 18040848976800.0, "grad_norm": 2.1470622664960413, "language_loss": 0.75869399, "learning_rate": 2.848056454298309e-07, "loss": 0.78386348, "num_input_tokens_seen": 149706685, "step": 6929, "time_per_iteration": 2.741018772125244 }, { "auxiliary_loss_clip": 0.01302674, "auxiliary_loss_mlp": 0.01193323, "balance_loss_clip": 1.00691116, "balance_loss_mlp": 1.00020409, "epoch": 0.8332832321289004, "flos": 17457408293760.0, "grad_norm": 1.7829066360050956, "language_loss": 0.65077025, "learning_rate": 2.844051333100905e-07, "loss": 0.67573023, "num_input_tokens_seen": 149724230, "step": 6930, "time_per_iteration": 2.750730276107788 }, { "auxiliary_loss_clip": 0.01311051, "auxiliary_loss_mlp": 0.01193139, "balance_loss_clip": 1.00757885, "balance_loss_mlp": 1.00011492, "epoch": 0.8334034750195395, "flos": 15084861196320.0, "grad_norm": 1.6870387755565863, "language_loss": 0.83507818, "learning_rate": 2.840048814447269e-07, "loss": 0.86012006, "num_input_tokens_seen": 149742395, "step": 6931, "time_per_iteration": 2.7469067573547363 }, { "auxiliary_loss_clip": 0.01325314, "auxiliary_loss_mlp": 0.01193109, "balance_loss_clip": 1.00801206, "balance_loss_mlp": 1.00018036, "epoch": 0.8335237179101785, "flos": 19427180696640.0, "grad_norm": 2.2495696438353896, "language_loss": 0.74050254, "learning_rate": 2.836048898944587e-07, "loss": 0.76568675, "num_input_tokens_seen": 149760820, "step": 6932, "time_per_iteration": 2.809748411178589 }, { "auxiliary_loss_clip": 0.01312855, "auxiliary_loss_mlp": 0.01193066, "balance_loss_clip": 1.00712562, "balance_loss_mlp": 1.00013781, "epoch": 0.8336439608008177, "flos": 21762057062880.0, "grad_norm": 2.3732017808331514, "language_loss": 0.72800422, "learning_rate": 2.832051587199642e-07, "loss": 0.75306344, "num_input_tokens_seen": 149778075, "step": 6933, "time_per_iteration": 2.797192335128784 }, { "auxiliary_loss_clip": 0.01304754, "auxiliary_loss_mlp": 0.01192281, "balance_loss_clip": 1.00395536, "balance_loss_mlp": 1.00002062, "epoch": 0.8337642036914568, "flos": 59702814341280.0, "grad_norm": 0.8093483407295264, "language_loss": 0.57756948, "learning_rate": 2.828056879818821e-07, "loss": 0.60253984, "num_input_tokens_seen": 149837150, "step": 6934, "time_per_iteration": 3.291259765625 }, { "auxiliary_loss_clip": 0.01306459, "auxiliary_loss_mlp": 0.011932, "balance_loss_clip": 1.00716639, "balance_loss_mlp": 1.00017643, "epoch": 0.8338844465820958, "flos": 27162197340480.0, "grad_norm": 1.8336036521992056, "language_loss": 0.83797908, "learning_rate": 2.824064777408117e-07, "loss": 0.86297566, "num_input_tokens_seen": 149856940, "step": 6935, "time_per_iteration": 2.951007604598999 }, { "auxiliary_loss_clip": 0.01323002, "auxiliary_loss_mlp": 0.01193203, "balance_loss_clip": 1.00741172, "balance_loss_mlp": 1.00017929, "epoch": 0.8340046894727349, "flos": 30481277358240.0, "grad_norm": 1.7261913188400513, "language_loss": 0.7566514, "learning_rate": 2.8200752805731263e-07, "loss": 0.78181344, "num_input_tokens_seen": 149879930, "step": 6936, "time_per_iteration": 2.793445110321045 }, { "auxiliary_loss_clip": 0.01324529, "auxiliary_loss_mlp": 0.01193204, "balance_loss_clip": 1.00755095, "balance_loss_mlp": 1.00018024, "epoch": 0.834124932363374, "flos": 27126179098560.0, "grad_norm": 1.4277573923759133, "language_loss": 0.81099558, "learning_rate": 2.8160883899190625e-07, "loss": 0.83617288, "num_input_tokens_seen": 149903200, "step": 6937, "time_per_iteration": 2.82551908493042 }, { "auxiliary_loss_clip": 0.01288583, "auxiliary_loss_mlp": 0.01193184, "balance_loss_clip": 1.00736511, "balance_loss_mlp": 1.00016046, "epoch": 0.8342451752540131, "flos": 24569876576160.0, "grad_norm": 2.83068349503859, "language_loss": 0.73504996, "learning_rate": 2.8121041060507234e-07, "loss": 0.75986761, "num_input_tokens_seen": 149922230, "step": 6938, "time_per_iteration": 2.780118465423584 }, { "auxiliary_loss_clip": 0.01336322, "auxiliary_loss_mlp": 0.01193168, "balance_loss_clip": 1.00763905, "balance_loss_mlp": 1.00014472, "epoch": 0.8343654181446521, "flos": 26615098212480.0, "grad_norm": 1.7651781160002646, "language_loss": 0.71150082, "learning_rate": 2.808122429572528e-07, "loss": 0.73679578, "num_input_tokens_seen": 149942435, "step": 6939, "time_per_iteration": 2.8197100162506104 }, { "auxiliary_loss_clip": 0.01298458, "auxiliary_loss_mlp": 0.01193198, "balance_loss_clip": 1.00691199, "balance_loss_mlp": 1.00017452, "epoch": 0.8344856610352913, "flos": 20777278632480.0, "grad_norm": 2.775829570909789, "language_loss": 0.76019394, "learning_rate": 2.804143361088489e-07, "loss": 0.78511047, "num_input_tokens_seen": 149961615, "step": 6940, "time_per_iteration": 2.8007795810699463 }, { "auxiliary_loss_clip": 0.01312149, "auxiliary_loss_mlp": 0.01193267, "balance_loss_clip": 1.0072633, "balance_loss_mlp": 1.00024343, "epoch": 0.8346059039259304, "flos": 26095970422080.0, "grad_norm": 2.5808340099291036, "language_loss": 0.77704227, "learning_rate": 2.8001669012022277e-07, "loss": 0.80209649, "num_input_tokens_seen": 149979585, "step": 6941, "time_per_iteration": 2.790808916091919 }, { "auxiliary_loss_clip": 0.0132395, "auxiliary_loss_mlp": 0.01193217, "balance_loss_clip": 1.00780571, "balance_loss_mlp": 1.00019288, "epoch": 0.8347261468165694, "flos": 29027722927680.0, "grad_norm": 1.6015768441124387, "language_loss": 0.69038194, "learning_rate": 2.7961930505169795e-07, "loss": 0.71555358, "num_input_tokens_seen": 150003830, "step": 6942, "time_per_iteration": 2.793367862701416 }, { "auxiliary_loss_clip": 0.0132715, "auxiliary_loss_mlp": 0.00872513, "balance_loss_clip": 1.00710702, "balance_loss_mlp": 1.00043261, "epoch": 0.8348463897072086, "flos": 26396474103360.0, "grad_norm": 1.8929401591570203, "language_loss": 0.76141286, "learning_rate": 2.792221809635558e-07, "loss": 0.78340954, "num_input_tokens_seen": 150024460, "step": 6943, "time_per_iteration": 2.7878756523132324 }, { "auxiliary_loss_clip": 0.01226746, "auxiliary_loss_mlp": 0.01193191, "balance_loss_clip": 1.0057627, "balance_loss_mlp": 1.00016725, "epoch": 0.8349666325978476, "flos": 23367731365440.0, "grad_norm": 2.110563739353777, "language_loss": 0.7512157, "learning_rate": 2.788253179160411e-07, "loss": 0.77541512, "num_input_tokens_seen": 150045620, "step": 6944, "time_per_iteration": 2.989584445953369 }, { "auxiliary_loss_clip": 0.01312169, "auxiliary_loss_mlp": 0.01193112, "balance_loss_clip": 1.00737321, "balance_loss_mlp": 1.0001837, "epoch": 0.8350868754884867, "flos": 12896536531680.0, "grad_norm": 1.7809351024616564, "language_loss": 0.64670825, "learning_rate": 2.7842871596935725e-07, "loss": 0.67176098, "num_input_tokens_seen": 150064135, "step": 6945, "time_per_iteration": 2.9104976654052734 }, { "auxiliary_loss_clip": 0.01322024, "auxiliary_loss_mlp": 0.01193192, "balance_loss_clip": 1.00785708, "balance_loss_mlp": 1.00016844, "epoch": 0.8352071183791259, "flos": 26505534692160.0, "grad_norm": 1.5832515870207664, "language_loss": 0.68944484, "learning_rate": 2.780323751836682e-07, "loss": 0.71459699, "num_input_tokens_seen": 150085350, "step": 6946, "time_per_iteration": 2.767953872680664 }, { "auxiliary_loss_clip": 0.01323436, "auxiliary_loss_mlp": 0.00872407, "balance_loss_clip": 1.00779712, "balance_loss_mlp": 1.00041676, "epoch": 0.8353273612697649, "flos": 20668074348960.0, "grad_norm": 1.362091127675077, "language_loss": 0.78597581, "learning_rate": 2.7763629561909876e-07, "loss": 0.80793422, "num_input_tokens_seen": 150106180, "step": 6947, "time_per_iteration": 2.83916974067688 }, { "auxiliary_loss_clip": 0.01347683, "auxiliary_loss_mlp": 0.01193082, "balance_loss_clip": 1.00765002, "balance_loss_mlp": 1.00015354, "epoch": 0.835447604160404, "flos": 19754146920960.0, "grad_norm": 1.9530232638721952, "language_loss": 0.76442528, "learning_rate": 2.772404773357335e-07, "loss": 0.78983295, "num_input_tokens_seen": 150125585, "step": 6948, "time_per_iteration": 2.7017316818237305 }, { "auxiliary_loss_clip": 0.01291578, "auxiliary_loss_mlp": 0.01193189, "balance_loss_clip": 1.00676775, "balance_loss_mlp": 1.00016499, "epoch": 0.8355678470510431, "flos": 23435852168160.0, "grad_norm": 2.0651003558181498, "language_loss": 0.78499788, "learning_rate": 2.7684492039361853e-07, "loss": 0.80984557, "num_input_tokens_seen": 150144810, "step": 6949, "time_per_iteration": 2.8850009441375732 }, { "auxiliary_loss_clip": 0.01348653, "auxiliary_loss_mlp": 0.01193219, "balance_loss_clip": 1.00803232, "balance_loss_mlp": 1.00019515, "epoch": 0.8356880899416822, "flos": 21214598698080.0, "grad_norm": 1.7561828274475257, "language_loss": 0.83797926, "learning_rate": 2.764496248527586e-07, "loss": 0.86339796, "num_input_tokens_seen": 150163785, "step": 6950, "time_per_iteration": 2.791774272918701 }, { "auxiliary_loss_clip": 0.01290851, "auxiliary_loss_mlp": 0.01193174, "balance_loss_clip": 1.00730824, "balance_loss_mlp": 1.00015032, "epoch": 0.8358083328323213, "flos": 28037555945280.0, "grad_norm": 1.7658539153912345, "language_loss": 0.78300089, "learning_rate": 2.760545907731211e-07, "loss": 0.80784112, "num_input_tokens_seen": 150184360, "step": 6951, "time_per_iteration": 3.824817419052124 }, { "auxiliary_loss_clip": 0.01337028, "auxiliary_loss_mlp": 0.01193236, "balance_loss_clip": 1.00795102, "balance_loss_mlp": 1.00021255, "epoch": 0.8359285757229604, "flos": 27783667991520.0, "grad_norm": 2.445901479182457, "language_loss": 0.67564964, "learning_rate": 2.75659818214631e-07, "loss": 0.70095223, "num_input_tokens_seen": 150205465, "step": 6952, "time_per_iteration": 2.7510013580322266 }, { "auxiliary_loss_clip": 0.01315025, "auxiliary_loss_mlp": 0.01193247, "balance_loss_clip": 1.00689042, "balance_loss_mlp": 1.00022292, "epoch": 0.8360488186135995, "flos": 21435126762240.0, "grad_norm": 1.7396469063600102, "language_loss": 0.78272629, "learning_rate": 2.752653072371749e-07, "loss": 0.807809, "num_input_tokens_seen": 150224900, "step": 6953, "time_per_iteration": 4.798600196838379 }, { "auxiliary_loss_clip": 0.01279564, "auxiliary_loss_mlp": 0.01193068, "balance_loss_clip": 1.00590277, "balance_loss_mlp": 1.00014007, "epoch": 0.8361690615042385, "flos": 27632338440480.0, "grad_norm": 2.299302454103749, "language_loss": 0.74683905, "learning_rate": 2.7487105790060105e-07, "loss": 0.77156538, "num_input_tokens_seen": 150244310, "step": 6954, "time_per_iteration": 3.9339425563812256 }, { "auxiliary_loss_clip": 0.01334894, "auxiliary_loss_mlp": 0.01193085, "balance_loss_clip": 1.00746179, "balance_loss_mlp": 1.00015724, "epoch": 0.8362893043948777, "flos": 39202545303360.0, "grad_norm": 2.1583519032687066, "language_loss": 0.69066274, "learning_rate": 2.7447707026471587e-07, "loss": 0.7159425, "num_input_tokens_seen": 150267285, "step": 6955, "time_per_iteration": 2.8849496841430664 }, { "auxiliary_loss_clip": 0.01296103, "auxiliary_loss_mlp": 0.01193067, "balance_loss_clip": 1.00695765, "balance_loss_mlp": 1.00013852, "epoch": 0.8364095472855168, "flos": 24785339401440.0, "grad_norm": 1.8983574478981924, "language_loss": 0.7983889, "learning_rate": 2.740833443892874e-07, "loss": 0.82328057, "num_input_tokens_seen": 150285455, "step": 6956, "time_per_iteration": 2.837615728378296 }, { "auxiliary_loss_clip": 0.01311367, "auxiliary_loss_mlp": 0.01193183, "balance_loss_clip": 1.00716591, "balance_loss_mlp": 1.00015974, "epoch": 0.8365297901761558, "flos": 22743422743680.0, "grad_norm": 1.6376675039253867, "language_loss": 0.79726762, "learning_rate": 2.7368988033404327e-07, "loss": 0.82231313, "num_input_tokens_seen": 150302970, "step": 6957, "time_per_iteration": 2.819408655166626 }, { "auxiliary_loss_clip": 0.01294757, "auxiliary_loss_mlp": 0.01193236, "balance_loss_clip": 1.0066694, "balance_loss_mlp": 1.00021267, "epoch": 0.836650033066795, "flos": 28396014027840.0, "grad_norm": 1.44577686407701, "language_loss": 0.84271431, "learning_rate": 2.732966781586712e-07, "loss": 0.86759418, "num_input_tokens_seen": 150322715, "step": 6958, "time_per_iteration": 2.8341052532196045 }, { "auxiliary_loss_clip": 0.01335224, "auxiliary_loss_mlp": 0.01193102, "balance_loss_clip": 1.00781989, "balance_loss_mlp": 1.00017393, "epoch": 0.836770275957434, "flos": 22236868241280.0, "grad_norm": 1.6337353644248367, "language_loss": 0.66704738, "learning_rate": 2.729037379228205e-07, "loss": 0.69233072, "num_input_tokens_seen": 150342900, "step": 6959, "time_per_iteration": 2.760906457901001 }, { "auxiliary_loss_clip": 0.01302026, "auxiliary_loss_mlp": 0.01193229, "balance_loss_clip": 1.00696886, "balance_loss_mlp": 1.00020516, "epoch": 0.8368905188480731, "flos": 22491941676480.0, "grad_norm": 1.7313346379375143, "language_loss": 0.80551583, "learning_rate": 2.725110596860998e-07, "loss": 0.83046842, "num_input_tokens_seen": 150363580, "step": 6960, "time_per_iteration": 2.8031163215637207 }, { "auxiliary_loss_clip": 0.01276136, "auxiliary_loss_mlp": 0.01193205, "balance_loss_clip": 1.00682247, "balance_loss_mlp": 1.00018096, "epoch": 0.8370107617387123, "flos": 13370413694400.0, "grad_norm": 1.7601317237733118, "language_loss": 0.70327771, "learning_rate": 2.7211864350807776e-07, "loss": 0.72797108, "num_input_tokens_seen": 150381780, "step": 6961, "time_per_iteration": 2.8471670150756836 }, { "auxiliary_loss_clip": 0.01348512, "auxiliary_loss_mlp": 0.01193187, "balance_loss_clip": 1.00781178, "balance_loss_mlp": 1.00016308, "epoch": 0.8371310046293513, "flos": 25261300137600.0, "grad_norm": 1.5301929304447757, "language_loss": 0.73459196, "learning_rate": 2.717264894482836e-07, "loss": 0.76000893, "num_input_tokens_seen": 150402120, "step": 6962, "time_per_iteration": 2.8106682300567627 }, { "auxiliary_loss_clip": 0.01328337, "auxiliary_loss_mlp": 0.01193274, "balance_loss_clip": 1.00793195, "balance_loss_mlp": 1.00015545, "epoch": 0.8372512475199904, "flos": 19792715744160.0, "grad_norm": 1.8856363879855496, "language_loss": 0.80990505, "learning_rate": 2.7133459756620646e-07, "loss": 0.83512115, "num_input_tokens_seen": 150419315, "step": 6963, "time_per_iteration": 2.715588331222534 }, { "auxiliary_loss_clip": 0.01335562, "auxiliary_loss_mlp": 0.01193238, "balance_loss_clip": 1.00803733, "balance_loss_mlp": 1.0002141, "epoch": 0.8373714904106295, "flos": 19391234302080.0, "grad_norm": 1.8576558569029673, "language_loss": 0.7361455, "learning_rate": 2.7094296792129733e-07, "loss": 0.76143348, "num_input_tokens_seen": 150438915, "step": 6964, "time_per_iteration": 2.87773060798645 }, { "auxiliary_loss_clip": 0.01330369, "auxiliary_loss_mlp": 0.01193163, "balance_loss_clip": 1.00732255, "balance_loss_mlp": 1.00013971, "epoch": 0.8374917333012686, "flos": 14975944302240.0, "grad_norm": 1.6699112711369062, "language_loss": 0.75219953, "learning_rate": 2.7055160057296424e-07, "loss": 0.77743495, "num_input_tokens_seen": 150456155, "step": 6965, "time_per_iteration": 2.6896305084228516 }, { "auxiliary_loss_clip": 0.01294282, "auxiliary_loss_mlp": 0.0119318, "balance_loss_clip": 1.00683725, "balance_loss_mlp": 1.00015593, "epoch": 0.8376119761919076, "flos": 30331851762240.0, "grad_norm": 1.6787909513474208, "language_loss": 0.72319782, "learning_rate": 2.7016049558057896e-07, "loss": 0.7480725, "num_input_tokens_seen": 150478115, "step": 6966, "time_per_iteration": 2.888524293899536 }, { "auxiliary_loss_clip": 0.01325038, "auxiliary_loss_mlp": 0.01193193, "balance_loss_clip": 1.00744629, "balance_loss_mlp": 1.0001694, "epoch": 0.8377322190825467, "flos": 29423348809920.0, "grad_norm": 2.599117868754528, "language_loss": 0.70511568, "learning_rate": 2.6976965300347074e-07, "loss": 0.73029804, "num_input_tokens_seen": 150500725, "step": 6967, "time_per_iteration": 2.8044660091400146 }, { "auxiliary_loss_clip": 0.01312319, "auxiliary_loss_mlp": 0.01193211, "balance_loss_clip": 1.00682604, "balance_loss_mlp": 1.00018716, "epoch": 0.8378524619731859, "flos": 26687098772640.0, "grad_norm": 4.648287462079966, "language_loss": 0.6919018, "learning_rate": 2.693790729009309e-07, "loss": 0.71695715, "num_input_tokens_seen": 150522335, "step": 6968, "time_per_iteration": 2.798678398132324 }, { "auxiliary_loss_clip": 0.01306868, "auxiliary_loss_mlp": 0.01193206, "balance_loss_clip": 1.00658226, "balance_loss_mlp": 1.00018287, "epoch": 0.8379727048638249, "flos": 20703877048800.0, "grad_norm": 2.4412813319959685, "language_loss": 0.8824507, "learning_rate": 2.6898875533220946e-07, "loss": 0.90745145, "num_input_tokens_seen": 150541640, "step": 6969, "time_per_iteration": 2.779149293899536 }, { "auxiliary_loss_clip": 0.01346652, "auxiliary_loss_mlp": 0.01192976, "balance_loss_clip": 1.00763869, "balance_loss_mlp": 1.00014281, "epoch": 0.838092947754464, "flos": 20084094810720.0, "grad_norm": 1.7520460567809408, "language_loss": 0.81932461, "learning_rate": 2.685987003565171e-07, "loss": 0.84472096, "num_input_tokens_seen": 150559680, "step": 6970, "time_per_iteration": 2.7428669929504395 }, { "auxiliary_loss_clip": 0.01276417, "auxiliary_loss_mlp": 0.0119316, "balance_loss_clip": 1.00700557, "balance_loss_mlp": 1.0001359, "epoch": 0.8382131906451031, "flos": 18113280621120.0, "grad_norm": 2.514480034820588, "language_loss": 0.75087357, "learning_rate": 2.6820890803302566e-07, "loss": 0.77556932, "num_input_tokens_seen": 150575205, "step": 6971, "time_per_iteration": 2.8351778984069824 }, { "auxiliary_loss_clip": 0.01295833, "auxiliary_loss_mlp": 0.01193154, "balance_loss_clip": 1.00824487, "balance_loss_mlp": 1.00013041, "epoch": 0.8383334335357422, "flos": 17092663567200.0, "grad_norm": 2.0475897163694223, "language_loss": 0.81887221, "learning_rate": 2.6781937842086557e-07, "loss": 0.84376204, "num_input_tokens_seen": 150593995, "step": 6972, "time_per_iteration": 2.795254707336426 }, { "auxiliary_loss_clip": 0.01331059, "auxiliary_loss_mlp": 0.01193179, "balance_loss_clip": 1.00739098, "balance_loss_mlp": 1.00015521, "epoch": 0.8384536764263812, "flos": 20704739217120.0, "grad_norm": 1.7419665462443314, "language_loss": 0.67565429, "learning_rate": 2.6743011157912933e-07, "loss": 0.70089668, "num_input_tokens_seen": 150613715, "step": 6973, "time_per_iteration": 2.771221876144409 }, { "auxiliary_loss_clip": 0.01287997, "auxiliary_loss_mlp": 0.01193243, "balance_loss_clip": 1.00743318, "balance_loss_mlp": 1.00021958, "epoch": 0.8385739193170204, "flos": 28986854988960.0, "grad_norm": 1.7519013640490173, "language_loss": 0.65271461, "learning_rate": 2.6704110756686725e-07, "loss": 0.67752701, "num_input_tokens_seen": 150634540, "step": 6974, "time_per_iteration": 2.847808837890625 }, { "auxiliary_loss_clip": 0.0132398, "auxiliary_loss_mlp": 0.00872539, "balance_loss_clip": 1.00751829, "balance_loss_mlp": 1.00044131, "epoch": 0.8386941622076595, "flos": 23438079436320.0, "grad_norm": 1.95417405881964, "language_loss": 0.84146631, "learning_rate": 2.6665236644309085e-07, "loss": 0.86343157, "num_input_tokens_seen": 150654850, "step": 6975, "time_per_iteration": 2.892084836959839 }, { "auxiliary_loss_clip": 0.0133604, "auxiliary_loss_mlp": 0.01193196, "balance_loss_clip": 1.00806975, "balance_loss_mlp": 1.00017262, "epoch": 0.8388144050982985, "flos": 23002735173120.0, "grad_norm": 1.7137833924694128, "language_loss": 0.79451597, "learning_rate": 2.662638882667727e-07, "loss": 0.8198083, "num_input_tokens_seen": 150673790, "step": 6976, "time_per_iteration": 2.9212076663970947 }, { "auxiliary_loss_clip": 0.01349535, "auxiliary_loss_mlp": 0.01193125, "balance_loss_clip": 1.00796282, "balance_loss_mlp": 1.00019634, "epoch": 0.8389346479889377, "flos": 24280365540960.0, "grad_norm": 2.370038627203646, "language_loss": 0.72917134, "learning_rate": 2.658756730968443e-07, "loss": 0.7545979, "num_input_tokens_seen": 150692255, "step": 6977, "time_per_iteration": 3.6321356296539307 }, { "auxiliary_loss_clip": 0.01310437, "auxiliary_loss_mlp": 0.01193208, "balance_loss_clip": 1.00738311, "balance_loss_mlp": 1.00018394, "epoch": 0.8390548908795767, "flos": 21215029782240.0, "grad_norm": 2.1663481054223053, "language_loss": 0.88249063, "learning_rate": 2.654877209921975e-07, "loss": 0.90752709, "num_input_tokens_seen": 150709790, "step": 6978, "time_per_iteration": 2.7688138484954834 }, { "auxiliary_loss_clip": 0.01293071, "auxiliary_loss_mlp": 0.01193322, "balance_loss_clip": 1.00776911, "balance_loss_mlp": 1.00020313, "epoch": 0.8391751337702158, "flos": 35627314140000.0, "grad_norm": 2.1278158709348793, "language_loss": 0.63462561, "learning_rate": 2.651000320116843e-07, "loss": 0.65948957, "num_input_tokens_seen": 150730675, "step": 6979, "time_per_iteration": 3.928577423095703 }, { "auxiliary_loss_clip": 0.01295063, "auxiliary_loss_mlp": 0.00872578, "balance_loss_clip": 1.00688004, "balance_loss_mlp": 1.00045502, "epoch": 0.839295376660855, "flos": 21325239928800.0, "grad_norm": 1.8771967327395083, "language_loss": 0.75888848, "learning_rate": 2.647126062141163e-07, "loss": 0.7805649, "num_input_tokens_seen": 150749750, "step": 6980, "time_per_iteration": 2.801133632659912 }, { "auxiliary_loss_clip": 0.0132311, "auxiliary_loss_mlp": 0.01193168, "balance_loss_clip": 1.00755799, "balance_loss_mlp": 1.00014448, "epoch": 0.839415619551494, "flos": 18442545960960.0, "grad_norm": 1.6261396132606847, "language_loss": 0.83909577, "learning_rate": 2.643254436582669e-07, "loss": 0.86425859, "num_input_tokens_seen": 150769240, "step": 6981, "time_per_iteration": 2.7593445777893066 }, { "auxiliary_loss_clip": 0.01269035, "auxiliary_loss_mlp": 0.01193176, "balance_loss_clip": 1.00634825, "balance_loss_mlp": 1.00015223, "epoch": 0.8395358624421331, "flos": 23221969984800.0, "grad_norm": 1.8938146620616767, "language_loss": 0.82444155, "learning_rate": 2.6393854440286743e-07, "loss": 0.84906369, "num_input_tokens_seen": 150788410, "step": 6982, "time_per_iteration": 2.8075151443481445 }, { "auxiliary_loss_clip": 0.01348185, "auxiliary_loss_mlp": 0.01193018, "balance_loss_clip": 1.00840282, "balance_loss_mlp": 1.00018525, "epoch": 0.8396561053327722, "flos": 24381666614880.0, "grad_norm": 2.4170562150342794, "language_loss": 0.70689821, "learning_rate": 2.6355190850661045e-07, "loss": 0.73231018, "num_input_tokens_seen": 150805245, "step": 6983, "time_per_iteration": 2.733682632446289 }, { "auxiliary_loss_clip": 0.01310444, "auxiliary_loss_mlp": 0.01193062, "balance_loss_clip": 1.00779629, "balance_loss_mlp": 1.00013387, "epoch": 0.8397763482234113, "flos": 22237766333280.0, "grad_norm": 1.50454230415167, "language_loss": 0.86505687, "learning_rate": 2.631655360281486e-07, "loss": 0.8900919, "num_input_tokens_seen": 150824920, "step": 6984, "time_per_iteration": 2.7122440338134766 }, { "auxiliary_loss_clip": 0.01321004, "auxiliary_loss_mlp": 0.00872548, "balance_loss_clip": 1.00784934, "balance_loss_mlp": 1.00045943, "epoch": 0.8398965911140504, "flos": 22163754047040.0, "grad_norm": 2.0173825456636814, "language_loss": 0.65718019, "learning_rate": 2.6277942702609323e-07, "loss": 0.67911565, "num_input_tokens_seen": 150844400, "step": 6985, "time_per_iteration": 2.78650164604187 }, { "auxiliary_loss_clip": 0.01292285, "auxiliary_loss_mlp": 0.01193187, "balance_loss_clip": 1.00708413, "balance_loss_mlp": 1.00016356, "epoch": 0.8400168340046895, "flos": 21542786327520.0, "grad_norm": 1.915649663999392, "language_loss": 0.8725512, "learning_rate": 2.623935815590186e-07, "loss": 0.89740586, "num_input_tokens_seen": 150862780, "step": 6986, "time_per_iteration": 2.828745126724243 }, { "auxiliary_loss_clip": 0.01309859, "auxiliary_loss_mlp": 0.01193202, "balance_loss_clip": 1.00769877, "balance_loss_mlp": 1.00017798, "epoch": 0.8401370768953286, "flos": 22491977600160.0, "grad_norm": 1.7575924560269522, "language_loss": 0.80638194, "learning_rate": 2.6200799968545516e-07, "loss": 0.83141255, "num_input_tokens_seen": 150883075, "step": 6987, "time_per_iteration": 2.804914712905884 }, { "auxiliary_loss_clip": 0.01277665, "auxiliary_loss_mlp": 0.0119228, "balance_loss_clip": 1.00409484, "balance_loss_mlp": 1.00001931, "epoch": 0.8402573197859676, "flos": 59238923961600.0, "grad_norm": 0.7855107929031756, "language_loss": 0.56464994, "learning_rate": 2.616226814638969e-07, "loss": 0.58934939, "num_input_tokens_seen": 150948180, "step": 6988, "time_per_iteration": 3.347130298614502 }, { "auxiliary_loss_clip": 0.01296258, "auxiliary_loss_mlp": 0.01193068, "balance_loss_clip": 1.00730097, "balance_loss_mlp": 1.00013924, "epoch": 0.8403775626766068, "flos": 22674619391040.0, "grad_norm": 1.8691781065005661, "language_loss": 0.77586794, "learning_rate": 2.612376269527954e-07, "loss": 0.80076122, "num_input_tokens_seen": 150967885, "step": 6989, "time_per_iteration": 2.70778226852417 }, { "auxiliary_loss_clip": 0.01299939, "auxiliary_loss_mlp": 0.01193074, "balance_loss_clip": 1.00658464, "balance_loss_mlp": 1.00014544, "epoch": 0.8404978055672458, "flos": 19609714716480.0, "grad_norm": 2.097374137696859, "language_loss": 0.67632329, "learning_rate": 2.608528362105635e-07, "loss": 0.70125341, "num_input_tokens_seen": 150987255, "step": 6990, "time_per_iteration": 2.7721750736236572 }, { "auxiliary_loss_clip": 0.01302503, "auxiliary_loss_mlp": 0.01193137, "balance_loss_clip": 1.00709939, "balance_loss_mlp": 1.00011349, "epoch": 0.8406180484578849, "flos": 27526942067040.0, "grad_norm": 1.7529872565095468, "language_loss": 0.73554087, "learning_rate": 2.6046830929557374e-07, "loss": 0.76049727, "num_input_tokens_seen": 151006905, "step": 6991, "time_per_iteration": 2.8443686962127686 }, { "auxiliary_loss_clip": 0.01299365, "auxiliary_loss_mlp": 0.01193179, "balance_loss_clip": 1.00757265, "balance_loss_mlp": 1.00015521, "epoch": 0.8407382913485241, "flos": 22127484339360.0, "grad_norm": 1.8686744524517063, "language_loss": 0.8486352, "learning_rate": 2.6008404626615776e-07, "loss": 0.87356067, "num_input_tokens_seen": 151025405, "step": 6992, "time_per_iteration": 2.864189624786377 }, { "auxiliary_loss_clip": 0.01336323, "auxiliary_loss_mlp": 0.01193204, "balance_loss_clip": 1.00845408, "balance_loss_mlp": 1.00018048, "epoch": 0.8408585342391631, "flos": 13918482761760.0, "grad_norm": 2.556050488208227, "language_loss": 0.73727381, "learning_rate": 2.597000471806092e-07, "loss": 0.76256907, "num_input_tokens_seen": 151041970, "step": 6993, "time_per_iteration": 2.7346978187561035 }, { "auxiliary_loss_clip": 0.01297766, "auxiliary_loss_mlp": 0.01193236, "balance_loss_clip": 1.00763726, "balance_loss_mlp": 1.00021255, "epoch": 0.8409787771298022, "flos": 20187874618560.0, "grad_norm": 1.8526980114008633, "language_loss": 0.72867268, "learning_rate": 2.593163120971793e-07, "loss": 0.75358272, "num_input_tokens_seen": 151060835, "step": 6994, "time_per_iteration": 2.7951176166534424 }, { "auxiliary_loss_clip": 0.01286512, "auxiliary_loss_mlp": 0.01193088, "balance_loss_clip": 1.00789833, "balance_loss_mlp": 1.00015974, "epoch": 0.8410990200204413, "flos": 23142533222880.0, "grad_norm": 1.9194102424755213, "language_loss": 0.69049913, "learning_rate": 2.5893284107408165e-07, "loss": 0.71529514, "num_input_tokens_seen": 151078205, "step": 6995, "time_per_iteration": 2.9095051288604736 }, { "auxiliary_loss_clip": 0.01272672, "auxiliary_loss_mlp": 0.01193104, "balance_loss_clip": 1.00725174, "balance_loss_mlp": 1.00017595, "epoch": 0.8412192629110804, "flos": 24027231984480.0, "grad_norm": 1.6279502412122584, "language_loss": 0.77950388, "learning_rate": 2.5854963416948726e-07, "loss": 0.80416167, "num_input_tokens_seen": 151100470, "step": 6996, "time_per_iteration": 2.8851420879364014 }, { "auxiliary_loss_clip": 0.01283483, "auxiliary_loss_mlp": 0.01193201, "balance_loss_clip": 1.00776243, "balance_loss_mlp": 1.00017715, "epoch": 0.8413395058017195, "flos": 25591715035200.0, "grad_norm": 1.5845471192964455, "language_loss": 0.69567096, "learning_rate": 2.5816669144152816e-07, "loss": 0.72043782, "num_input_tokens_seen": 151121650, "step": 6997, "time_per_iteration": 2.96248722076416 }, { "auxiliary_loss_clip": 0.01316761, "auxiliary_loss_mlp": 0.01192276, "balance_loss_clip": 1.00372994, "balance_loss_mlp": 1.0000155, "epoch": 0.8414597486923585, "flos": 63635425876800.0, "grad_norm": 0.8482156743358168, "language_loss": 0.6636098, "learning_rate": 2.5778401294829777e-07, "loss": 0.6887002, "num_input_tokens_seen": 151180390, "step": 6998, "time_per_iteration": 3.3156111240386963 }, { "auxiliary_loss_clip": 0.01324317, "auxiliary_loss_mlp": 0.00872405, "balance_loss_clip": 1.00741649, "balance_loss_mlp": 1.0003612, "epoch": 0.8415799915829977, "flos": 19098741601440.0, "grad_norm": 1.7213900722173663, "language_loss": 0.65132403, "learning_rate": 2.574015987478473e-07, "loss": 0.67329127, "num_input_tokens_seen": 151198520, "step": 6999, "time_per_iteration": 2.7966015338897705 }, { "auxiliary_loss_clip": 0.01315581, "auxiliary_loss_mlp": 0.01193217, "balance_loss_clip": 1.00741625, "balance_loss_mlp": 1.000193, "epoch": 0.8417002344736367, "flos": 19821621097440.0, "grad_norm": 1.9445847404186976, "language_loss": 0.8707779, "learning_rate": 2.570194488981887e-07, "loss": 0.89586592, "num_input_tokens_seen": 151215065, "step": 7000, "time_per_iteration": 2.7424025535583496 }, { "auxiliary_loss_clip": 0.01316803, "auxiliary_loss_mlp": 0.01192281, "balance_loss_clip": 1.00377202, "balance_loss_mlp": 1.00002027, "epoch": 0.8418204773642758, "flos": 62161551852480.0, "grad_norm": 0.8386349086984956, "language_loss": 0.6041398, "learning_rate": 2.566375634572939e-07, "loss": 0.62923062, "num_input_tokens_seen": 151275705, "step": 7001, "time_per_iteration": 3.204256772994995 }, { "auxiliary_loss_clip": 0.01302798, "auxiliary_loss_mlp": 0.01193203, "balance_loss_clip": 1.00735164, "balance_loss_mlp": 1.00017893, "epoch": 0.841940720254915, "flos": 17092915032960.0, "grad_norm": 1.6669816417366763, "language_loss": 0.76252526, "learning_rate": 2.562559424830943e-07, "loss": 0.78748524, "num_input_tokens_seen": 151293665, "step": 7002, "time_per_iteration": 2.9446444511413574 }, { "auxiliary_loss_clip": 0.01324631, "auxiliary_loss_mlp": 0.01193159, "balance_loss_clip": 1.00823522, "balance_loss_mlp": 1.00023115, "epoch": 0.842060963145554, "flos": 16283593657440.0, "grad_norm": 1.9204429763080082, "language_loss": 0.70201242, "learning_rate": 2.5587458603348256e-07, "loss": 0.72719038, "num_input_tokens_seen": 151310955, "step": 7003, "time_per_iteration": 3.7181341648101807 }, { "auxiliary_loss_clip": 0.01296026, "auxiliary_loss_mlp": 0.01193198, "balance_loss_clip": 1.00710237, "balance_loss_mlp": 1.0001744, "epoch": 0.8421812060361931, "flos": 21908249527680.0, "grad_norm": 2.2392372448471445, "language_loss": 0.83956575, "learning_rate": 2.554934941663085e-07, "loss": 0.86445802, "num_input_tokens_seen": 151328490, "step": 7004, "time_per_iteration": 2.79948091506958 }, { "auxiliary_loss_clip": 0.01287824, "auxiliary_loss_mlp": 0.01193189, "balance_loss_clip": 1.00712752, "balance_loss_mlp": 1.00016522, "epoch": 0.8423014489268322, "flos": 27777704660640.0, "grad_norm": 1.934211491020799, "language_loss": 0.73577613, "learning_rate": 2.5511266693938484e-07, "loss": 0.76058626, "num_input_tokens_seen": 151346950, "step": 7005, "time_per_iteration": 2.882410764694214 }, { "auxiliary_loss_clip": 0.01299744, "auxiliary_loss_mlp": 0.01193167, "balance_loss_clip": 1.00744843, "balance_loss_mlp": 1.00014329, "epoch": 0.8424216918174713, "flos": 25117622330400.0, "grad_norm": 1.705613087016021, "language_loss": 0.77708113, "learning_rate": 2.547321044104822e-07, "loss": 0.8020103, "num_input_tokens_seen": 151368445, "step": 7006, "time_per_iteration": 5.71326470375061 }, { "auxiliary_loss_clip": 0.01349408, "auxiliary_loss_mlp": 0.0119323, "balance_loss_clip": 1.00812626, "balance_loss_mlp": 1.00020647, "epoch": 0.8425419347081103, "flos": 24748458991200.0, "grad_norm": 1.5752418593180764, "language_loss": 0.76275223, "learning_rate": 2.5435180663733113e-07, "loss": 0.78817856, "num_input_tokens_seen": 151388745, "step": 7007, "time_per_iteration": 2.7848174571990967 }, { "auxiliary_loss_clip": 0.01284753, "auxiliary_loss_mlp": 0.01193184, "balance_loss_clip": 1.00853419, "balance_loss_mlp": 1.00016046, "epoch": 0.8426621775987495, "flos": 24820926559200.0, "grad_norm": 2.286437900572088, "language_loss": 0.7195434, "learning_rate": 2.539717736776241e-07, "loss": 0.74432272, "num_input_tokens_seen": 151404970, "step": 7008, "time_per_iteration": 2.8137400150299072 }, { "auxiliary_loss_clip": 0.01322732, "auxiliary_loss_mlp": 0.01193088, "balance_loss_clip": 1.00775528, "balance_loss_mlp": 1.00015938, "epoch": 0.8427824204893886, "flos": 23550085766880.0, "grad_norm": 1.4054507642800387, "language_loss": 0.76344126, "learning_rate": 2.535920055890097e-07, "loss": 0.78859955, "num_input_tokens_seen": 151426265, "step": 7009, "time_per_iteration": 2.7664010524749756 }, { "auxiliary_loss_clip": 0.01283559, "auxiliary_loss_mlp": 0.01193213, "balance_loss_clip": 1.00768018, "balance_loss_mlp": 1.00018883, "epoch": 0.8429026633800276, "flos": 16143867455040.0, "grad_norm": 2.1008193250677314, "language_loss": 0.64745307, "learning_rate": 2.5321250242910006e-07, "loss": 0.67222071, "num_input_tokens_seen": 151444180, "step": 7010, "time_per_iteration": 2.8214938640594482 }, { "auxiliary_loss_clip": 0.01348264, "auxiliary_loss_mlp": 0.01193109, "balance_loss_clip": 1.00805998, "balance_loss_mlp": 1.00018048, "epoch": 0.8430229062706668, "flos": 22198550883840.0, "grad_norm": 1.67339963078894, "language_loss": 0.86507148, "learning_rate": 2.5283326425546493e-07, "loss": 0.89048523, "num_input_tokens_seen": 151463290, "step": 7011, "time_per_iteration": 2.7449467182159424 }, { "auxiliary_loss_clip": 0.01277604, "auxiliary_loss_mlp": 0.01193062, "balance_loss_clip": 1.00691307, "balance_loss_mlp": 1.0001334, "epoch": 0.8431431491613058, "flos": 35330330979360.0, "grad_norm": 2.720764918851105, "language_loss": 0.69629371, "learning_rate": 2.5245429112563443e-07, "loss": 0.72100031, "num_input_tokens_seen": 151483965, "step": 7012, "time_per_iteration": 2.8853094577789307 }, { "auxiliary_loss_clip": 0.0132521, "auxiliary_loss_mlp": 0.01193164, "balance_loss_clip": 1.00787354, "balance_loss_mlp": 1.00014019, "epoch": 0.8432633920519449, "flos": 25812386794080.0, "grad_norm": 1.7185460540876598, "language_loss": 0.8207531, "learning_rate": 2.5207558309709865e-07, "loss": 0.84593689, "num_input_tokens_seen": 151503700, "step": 7013, "time_per_iteration": 2.7628302574157715 }, { "auxiliary_loss_clip": 0.01280577, "auxiliary_loss_mlp": 0.00871861, "balance_loss_clip": 1.00377989, "balance_loss_mlp": 1.00008821, "epoch": 0.8433836349425841, "flos": 64959564201120.0, "grad_norm": 0.660142103896237, "language_loss": 0.5627501, "learning_rate": 2.516971402273065e-07, "loss": 0.58427453, "num_input_tokens_seen": 151569765, "step": 7014, "time_per_iteration": 3.361959457397461 }, { "auxiliary_loss_clip": 0.01311966, "auxiliary_loss_mlp": 0.01193056, "balance_loss_clip": 1.00724506, "balance_loss_mlp": 1.00012803, "epoch": 0.8435038778332231, "flos": 20229999886080.0, "grad_norm": 1.9535926521090001, "language_loss": 0.67271173, "learning_rate": 2.513189625736687e-07, "loss": 0.69776195, "num_input_tokens_seen": 151586660, "step": 7015, "time_per_iteration": 2.8184814453125 }, { "auxiliary_loss_clip": 0.01293617, "auxiliary_loss_mlp": 0.01193191, "balance_loss_clip": 1.00711131, "balance_loss_mlp": 1.00016713, "epoch": 0.8436241207238622, "flos": 20992238526240.0, "grad_norm": 2.007042433757586, "language_loss": 0.71277416, "learning_rate": 2.509410501935534e-07, "loss": 0.73764223, "num_input_tokens_seen": 151602295, "step": 7016, "time_per_iteration": 2.7884576320648193 }, { "auxiliary_loss_clip": 0.01312516, "auxiliary_loss_mlp": 0.01193195, "balance_loss_clip": 1.00773132, "balance_loss_mlp": 1.00017095, "epoch": 0.8437443636145013, "flos": 14682266120160.0, "grad_norm": 2.393665237533174, "language_loss": 0.75844848, "learning_rate": 2.5056340314429116e-07, "loss": 0.78350556, "num_input_tokens_seen": 151619760, "step": 7017, "time_per_iteration": 2.777461528778076 }, { "auxiliary_loss_clip": 0.01286625, "auxiliary_loss_mlp": 0.0119317, "balance_loss_clip": 1.00723398, "balance_loss_mlp": 1.00014615, "epoch": 0.8438646065051404, "flos": 21608823556800.0, "grad_norm": 6.201637076052838, "language_loss": 0.8051272, "learning_rate": 2.5018602148316904e-07, "loss": 0.82992506, "num_input_tokens_seen": 151635795, "step": 7018, "time_per_iteration": 2.9238367080688477 }, { "auxiliary_loss_clip": 0.0134732, "auxiliary_loss_mlp": 0.01193078, "balance_loss_clip": 1.00803971, "balance_loss_mlp": 1.0001502, "epoch": 0.8439848493957794, "flos": 23289947092800.0, "grad_norm": 1.8025104668897216, "language_loss": 0.80428809, "learning_rate": 2.498089052674359e-07, "loss": 0.82969213, "num_input_tokens_seen": 151653770, "step": 7019, "time_per_iteration": 2.7777228355407715 }, { "auxiliary_loss_clip": 0.01324175, "auxiliary_loss_mlp": 0.01193191, "balance_loss_clip": 1.00762165, "balance_loss_mlp": 1.00016749, "epoch": 0.8441050922864186, "flos": 19719350084160.0, "grad_norm": 1.8150496711925395, "language_loss": 0.75321078, "learning_rate": 2.494320545543007e-07, "loss": 0.77838445, "num_input_tokens_seen": 151673340, "step": 7020, "time_per_iteration": 2.780296564102173 }, { "auxiliary_loss_clip": 0.01349773, "auxiliary_loss_mlp": 0.01193205, "balance_loss_clip": 1.0082655, "balance_loss_mlp": 1.00018144, "epoch": 0.8442253351770577, "flos": 21835279028160.0, "grad_norm": 1.624585309717618, "language_loss": 0.66538316, "learning_rate": 2.490554694009308e-07, "loss": 0.69081295, "num_input_tokens_seen": 151694205, "step": 7021, "time_per_iteration": 2.909181594848633 }, { "auxiliary_loss_clip": 0.01336811, "auxiliary_loss_mlp": 0.01193165, "balance_loss_clip": 1.00779581, "balance_loss_mlp": 1.00014138, "epoch": 0.8443455780676967, "flos": 34346378793600.0, "grad_norm": 1.5591199305997787, "language_loss": 0.78420269, "learning_rate": 2.4867914986445426e-07, "loss": 0.80950236, "num_input_tokens_seen": 151716595, "step": 7022, "time_per_iteration": 2.842604875564575 }, { "auxiliary_loss_clip": 0.01323079, "auxiliary_loss_mlp": 0.01193197, "balance_loss_clip": 1.00779295, "balance_loss_mlp": 1.00017309, "epoch": 0.8444658209583359, "flos": 48214617536160.0, "grad_norm": 1.868728633405056, "language_loss": 0.70964217, "learning_rate": 2.483030960019581e-07, "loss": 0.73480487, "num_input_tokens_seen": 151740525, "step": 7023, "time_per_iteration": 2.99385929107666 }, { "auxiliary_loss_clip": 0.01248111, "auxiliary_loss_mlp": 0.01192308, "balance_loss_clip": 1.00246048, "balance_loss_mlp": 1.00004768, "epoch": 0.8445860638489749, "flos": 68484802811040.0, "grad_norm": 0.7300740317753379, "language_loss": 0.55471361, "learning_rate": 2.479273078704891e-07, "loss": 0.57911777, "num_input_tokens_seen": 151793890, "step": 7024, "time_per_iteration": 3.220360279083252 }, { "auxiliary_loss_clip": 0.01238528, "auxiliary_loss_mlp": 0.01192263, "balance_loss_clip": 1.00422502, "balance_loss_mlp": 1.00000215, "epoch": 0.844706306739614, "flos": 62833361084640.0, "grad_norm": 0.7791802240503424, "language_loss": 0.647856, "learning_rate": 2.475517855270552e-07, "loss": 0.6721639, "num_input_tokens_seen": 151853970, "step": 7025, "time_per_iteration": 3.3557257652282715 }, { "auxiliary_loss_clip": 0.01347616, "auxiliary_loss_mlp": 0.01193192, "balance_loss_clip": 1.0078311, "balance_loss_mlp": 1.0001682, "epoch": 0.8448265496302532, "flos": 14976123920640.0, "grad_norm": 1.8027293686141201, "language_loss": 0.72688317, "learning_rate": 2.4717652902862143e-07, "loss": 0.7522912, "num_input_tokens_seen": 151872945, "step": 7026, "time_per_iteration": 2.6670145988464355 }, { "auxiliary_loss_clip": 0.0129942, "auxiliary_loss_mlp": 0.01193058, "balance_loss_clip": 1.00757384, "balance_loss_mlp": 1.00012922, "epoch": 0.8449467925208922, "flos": 23441276643840.0, "grad_norm": 1.738094177367262, "language_loss": 0.81150579, "learning_rate": 2.4680153843211495e-07, "loss": 0.83643055, "num_input_tokens_seen": 151892875, "step": 7027, "time_per_iteration": 2.777514934539795 }, { "auxiliary_loss_clip": 0.01302015, "auxiliary_loss_mlp": 0.01193125, "balance_loss_clip": 1.00693333, "balance_loss_mlp": 1.00019717, "epoch": 0.8450670354115313, "flos": 22748056898400.0, "grad_norm": 1.6302290986221462, "language_loss": 0.72363746, "learning_rate": 2.464268137944212e-07, "loss": 0.74858892, "num_input_tokens_seen": 151914170, "step": 7028, "time_per_iteration": 2.841320276260376 }, { "auxiliary_loss_clip": 0.01279673, "auxiliary_loss_mlp": 0.01193221, "balance_loss_clip": 1.00781345, "balance_loss_mlp": 1.00019741, "epoch": 0.8451872783021703, "flos": 29825584649280.0, "grad_norm": 1.7390415967070034, "language_loss": 0.77878881, "learning_rate": 2.46052355172385e-07, "loss": 0.8035177, "num_input_tokens_seen": 151932210, "step": 7029, "time_per_iteration": 3.892211437225342 }, { "auxiliary_loss_clip": 0.01348753, "auxiliary_loss_mlp": 0.01193205, "balance_loss_clip": 1.007954, "balance_loss_mlp": 1.00018144, "epoch": 0.8453075211928095, "flos": 21870039941280.0, "grad_norm": 1.7816102279578268, "language_loss": 0.74542844, "learning_rate": 2.456781626228128e-07, "loss": 0.77084804, "num_input_tokens_seen": 151951715, "step": 7030, "time_per_iteration": 2.745089054107666 }, { "auxiliary_loss_clip": 0.01253097, "auxiliary_loss_mlp": 0.00871755, "balance_loss_clip": 1.00378108, "balance_loss_mlp": 1.00002885, "epoch": 0.8454277640834486, "flos": 58751898732000.0, "grad_norm": 0.9159264594555152, "language_loss": 0.66292244, "learning_rate": 2.453042362024675e-07, "loss": 0.68417096, "num_input_tokens_seen": 152004960, "step": 7031, "time_per_iteration": 4.305738210678101 }, { "auxiliary_loss_clip": 0.01347678, "auxiliary_loss_mlp": 0.01193194, "balance_loss_clip": 1.00742435, "balance_loss_mlp": 1.00016999, "epoch": 0.8455480069740876, "flos": 27090089009280.0, "grad_norm": 1.3952662345294549, "language_loss": 0.72854185, "learning_rate": 2.449305759680751e-07, "loss": 0.75395054, "num_input_tokens_seen": 152026285, "step": 7032, "time_per_iteration": 3.771552085876465 }, { "auxiliary_loss_clip": 0.01280271, "auxiliary_loss_mlp": 0.01193202, "balance_loss_clip": 1.00683367, "balance_loss_mlp": 1.00017869, "epoch": 0.8456682498647268, "flos": 27198682590240.0, "grad_norm": 1.4778438665403433, "language_loss": 0.75112939, "learning_rate": 2.445571819763188e-07, "loss": 0.77586412, "num_input_tokens_seen": 152048585, "step": 7033, "time_per_iteration": 2.8621346950531006 }, { "auxiliary_loss_clip": 0.01347759, "auxiliary_loss_mlp": 0.01193208, "balance_loss_clip": 1.00803864, "balance_loss_mlp": 1.00018442, "epoch": 0.8457884927553658, "flos": 20631912412320.0, "grad_norm": 1.6703238335185961, "language_loss": 0.58594382, "learning_rate": 2.4418405428384227e-07, "loss": 0.61135352, "num_input_tokens_seen": 152068795, "step": 7034, "time_per_iteration": 2.7266628742218018 }, { "auxiliary_loss_clip": 0.01347231, "auxiliary_loss_mlp": 0.0087254, "balance_loss_clip": 1.00764751, "balance_loss_mlp": 1.00043201, "epoch": 0.8459087356460049, "flos": 15299030769120.0, "grad_norm": 1.6722563002914077, "language_loss": 0.71840239, "learning_rate": 2.4381119294724864e-07, "loss": 0.74060011, "num_input_tokens_seen": 152086240, "step": 7035, "time_per_iteration": 2.6791138648986816 }, { "auxiliary_loss_clip": 0.01348455, "auxiliary_loss_mlp": 0.01193088, "balance_loss_clip": 1.00794601, "balance_loss_mlp": 1.00015998, "epoch": 0.846028978536644, "flos": 18843165234720.0, "grad_norm": 1.8544762762300318, "language_loss": 0.53750217, "learning_rate": 2.434385980231004e-07, "loss": 0.56291753, "num_input_tokens_seen": 152105080, "step": 7036, "time_per_iteration": 2.6627097129821777 }, { "auxiliary_loss_clip": 0.0132786, "auxiliary_loss_mlp": 0.01192965, "balance_loss_clip": 1.00709534, "balance_loss_mlp": 1.0001322, "epoch": 0.8461492214272831, "flos": 52661758631040.0, "grad_norm": 1.4880723959603057, "language_loss": 0.65582722, "learning_rate": 2.4306626956792043e-07, "loss": 0.68103546, "num_input_tokens_seen": 152130025, "step": 7037, "time_per_iteration": 3.0286543369293213 }, { "auxiliary_loss_clip": 0.01326743, "auxiliary_loss_mlp": 0.01193077, "balance_loss_clip": 1.00718677, "balance_loss_mlp": 1.00014889, "epoch": 0.8462694643179222, "flos": 18588415112640.0, "grad_norm": 1.5864893174547268, "language_loss": 0.7541883, "learning_rate": 2.4269420763819017e-07, "loss": 0.77938646, "num_input_tokens_seen": 152148070, "step": 7038, "time_per_iteration": 2.714428186416626 }, { "auxiliary_loss_clip": 0.01326508, "auxiliary_loss_mlp": 0.01193159, "balance_loss_clip": 1.00721848, "balance_loss_mlp": 1.00013554, "epoch": 0.8463897072085613, "flos": 24387091090560.0, "grad_norm": 2.588394061250433, "language_loss": 0.83444393, "learning_rate": 2.4232241229035223e-07, "loss": 0.85964066, "num_input_tokens_seen": 152165825, "step": 7039, "time_per_iteration": 2.7307016849517822 }, { "auxiliary_loss_clip": 0.0130421, "auxiliary_loss_mlp": 0.01192271, "balance_loss_clip": 1.00362515, "balance_loss_mlp": 1.00001037, "epoch": 0.8465099500992004, "flos": 68702169591360.0, "grad_norm": 0.7473285284907278, "language_loss": 0.56785035, "learning_rate": 2.419508835808064e-07, "loss": 0.59281516, "num_input_tokens_seen": 152222380, "step": 7040, "time_per_iteration": 3.243384599685669 }, { "auxiliary_loss_clip": 0.01303029, "auxiliary_loss_mlp": 0.01193062, "balance_loss_clip": 1.00675726, "balance_loss_mlp": 1.00013375, "epoch": 0.8466301929898394, "flos": 13735733199840.0, "grad_norm": 1.9049656356621647, "language_loss": 0.62787974, "learning_rate": 2.415796215659134e-07, "loss": 0.65284067, "num_input_tokens_seen": 152239085, "step": 7041, "time_per_iteration": 2.7303647994995117 }, { "auxiliary_loss_clip": 0.01312414, "auxiliary_loss_mlp": 0.0119307, "balance_loss_clip": 1.00794792, "balance_loss_mlp": 1.0001421, "epoch": 0.8467504358804786, "flos": 19241269850880.0, "grad_norm": 1.8375223674052548, "language_loss": 0.77236623, "learning_rate": 2.412086263019939e-07, "loss": 0.7974211, "num_input_tokens_seen": 152257110, "step": 7042, "time_per_iteration": 2.830460548400879 }, { "auxiliary_loss_clip": 0.01346856, "auxiliary_loss_mlp": 0.01193058, "balance_loss_clip": 1.00808477, "balance_loss_mlp": 1.0001297, "epoch": 0.8468706787711177, "flos": 21324126294720.0, "grad_norm": 1.5891414765781398, "language_loss": 0.79910445, "learning_rate": 2.408378978453276e-07, "loss": 0.8245036, "num_input_tokens_seen": 152277230, "step": 7043, "time_per_iteration": 2.8886258602142334 }, { "auxiliary_loss_clip": 0.01304227, "auxiliary_loss_mlp": 0.01192284, "balance_loss_clip": 1.00359845, "balance_loss_mlp": 1.00002313, "epoch": 0.8469909216617567, "flos": 64877469086880.0, "grad_norm": 0.8124063797796396, "language_loss": 0.64018416, "learning_rate": 2.404674362521533e-07, "loss": 0.66514927, "num_input_tokens_seen": 152335725, "step": 7044, "time_per_iteration": 3.2031455039978027 }, { "auxiliary_loss_clip": 0.01325283, "auxiliary_loss_mlp": 0.01193073, "balance_loss_clip": 1.00713575, "balance_loss_mlp": 1.00014496, "epoch": 0.8471111645523959, "flos": 19280593071360.0, "grad_norm": 2.109543010641283, "language_loss": 0.74524021, "learning_rate": 2.4009724157866997e-07, "loss": 0.77042377, "num_input_tokens_seen": 152352785, "step": 7045, "time_per_iteration": 2.7843167781829834 }, { "auxiliary_loss_clip": 0.01348305, "auxiliary_loss_mlp": 0.01193175, "balance_loss_clip": 1.00813651, "balance_loss_mlp": 1.00015175, "epoch": 0.8472314074430349, "flos": 22015837245600.0, "grad_norm": 1.7292750514815656, "language_loss": 0.76191938, "learning_rate": 2.3972731388103564e-07, "loss": 0.78733414, "num_input_tokens_seen": 152371265, "step": 7046, "time_per_iteration": 2.8539817333221436 }, { "auxiliary_loss_clip": 0.01218618, "auxiliary_loss_mlp": 0.01192259, "balance_loss_clip": 1.00309372, "balance_loss_mlp": 0.99999791, "epoch": 0.847351650333674, "flos": 57882611229120.0, "grad_norm": 0.7980947229015412, "language_loss": 0.62374479, "learning_rate": 2.393576532153687e-07, "loss": 0.64785355, "num_input_tokens_seen": 152435050, "step": 7047, "time_per_iteration": 3.6146528720855713 }, { "auxiliary_loss_clip": 0.01296195, "auxiliary_loss_mlp": 0.01192269, "balance_loss_clip": 1.00362146, "balance_loss_mlp": 1.00000846, "epoch": 0.8474718932243132, "flos": 41284265398560.0, "grad_norm": 0.9341255274366603, "language_loss": 0.57858789, "learning_rate": 2.389882596377453e-07, "loss": 0.60347253, "num_input_tokens_seen": 152489315, "step": 7048, "time_per_iteration": 3.562366485595703 }, { "auxiliary_loss_clip": 0.01347336, "auxiliary_loss_mlp": 0.01193064, "balance_loss_clip": 1.00726628, "balance_loss_mlp": 1.00013542, "epoch": 0.8475921361149522, "flos": 38180922386400.0, "grad_norm": 1.7479556420639688, "language_loss": 0.76363027, "learning_rate": 2.386191332042031e-07, "loss": 0.78903425, "num_input_tokens_seen": 152511210, "step": 7049, "time_per_iteration": 2.9082770347595215 }, { "auxiliary_loss_clip": 0.01348965, "auxiliary_loss_mlp": 0.01193159, "balance_loss_clip": 1.00821376, "balance_loss_mlp": 1.00013494, "epoch": 0.8477123790055913, "flos": 25375066728480.0, "grad_norm": 1.61431940144039, "language_loss": 0.72524804, "learning_rate": 2.3825027397073794e-07, "loss": 0.7506693, "num_input_tokens_seen": 152531685, "step": 7050, "time_per_iteration": 2.813762664794922 }, { "auxiliary_loss_clip": 0.0132219, "auxiliary_loss_mlp": 0.01193143, "balance_loss_clip": 1.00757563, "balance_loss_mlp": 1.0002147, "epoch": 0.8478326218962304, "flos": 30225198060000.0, "grad_norm": 3.190077639722568, "language_loss": 0.66924202, "learning_rate": 2.3788168199330515e-07, "loss": 0.6943953, "num_input_tokens_seen": 152553245, "step": 7051, "time_per_iteration": 2.8408265113830566 }, { "auxiliary_loss_clip": 0.01323701, "auxiliary_loss_mlp": 0.01193193, "balance_loss_clip": 1.00767481, "balance_loss_mlp": 1.00016952, "epoch": 0.8479528647868695, "flos": 38213815268160.0, "grad_norm": 1.5114235736847559, "language_loss": 0.72680342, "learning_rate": 2.3751335732782074e-07, "loss": 0.75197238, "num_input_tokens_seen": 152574505, "step": 7052, "time_per_iteration": 2.8929834365844727 }, { "auxiliary_loss_clip": 0.01323427, "auxiliary_loss_mlp": 0.01193176, "balance_loss_clip": 1.00773978, "balance_loss_mlp": 1.00015271, "epoch": 0.8480731076775085, "flos": 20957800926240.0, "grad_norm": 1.9094062584182137, "language_loss": 0.79687953, "learning_rate": 2.371453000301582e-07, "loss": 0.82204551, "num_input_tokens_seen": 152593190, "step": 7053, "time_per_iteration": 2.736107587814331 }, { "auxiliary_loss_clip": 0.01280712, "auxiliary_loss_mlp": 0.01193059, "balance_loss_clip": 1.00655425, "balance_loss_mlp": 1.00013053, "epoch": 0.8481933505681477, "flos": 32596523752320.0, "grad_norm": 1.7069386199045207, "language_loss": 0.74467295, "learning_rate": 2.3677751015615222e-07, "loss": 0.76941073, "num_input_tokens_seen": 152615265, "step": 7054, "time_per_iteration": 2.923427104949951 }, { "auxiliary_loss_clip": 0.01324872, "auxiliary_loss_mlp": 0.0119311, "balance_loss_clip": 1.0082407, "balance_loss_mlp": 1.00018167, "epoch": 0.8483135934587868, "flos": 20741188543200.0, "grad_norm": 1.8087355190669345, "language_loss": 0.85074401, "learning_rate": 2.3640998776159593e-07, "loss": 0.87592387, "num_input_tokens_seen": 152632770, "step": 7055, "time_per_iteration": 3.74428129196167 }, { "auxiliary_loss_clip": 0.01312323, "auxiliary_loss_mlp": 0.01192992, "balance_loss_clip": 1.00692558, "balance_loss_mlp": 1.00015926, "epoch": 0.8484338363494258, "flos": 21653068321440.0, "grad_norm": 1.7141649079126868, "language_loss": 0.81184399, "learning_rate": 2.3604273290224253e-07, "loss": 0.83689713, "num_input_tokens_seen": 152653485, "step": 7056, "time_per_iteration": 2.8241891860961914 }, { "auxiliary_loss_clip": 0.01305161, "auxiliary_loss_mlp": 0.01193149, "balance_loss_clip": 1.00735104, "balance_loss_mlp": 1.00022054, "epoch": 0.848554079240065, "flos": 15013974270240.0, "grad_norm": 1.9096985565062659, "language_loss": 0.74644399, "learning_rate": 2.356757456338039e-07, "loss": 0.77142709, "num_input_tokens_seen": 152670970, "step": 7057, "time_per_iteration": 2.74407696723938 }, { "auxiliary_loss_clip": 0.01274924, "auxiliary_loss_mlp": 0.01192282, "balance_loss_clip": 1.00431263, "balance_loss_mlp": 1.00002086, "epoch": 0.848674322130704, "flos": 68060487117600.0, "grad_norm": 0.750638840713125, "language_loss": 0.59070319, "learning_rate": 2.3530902601195147e-07, "loss": 0.61537528, "num_input_tokens_seen": 152739460, "step": 7058, "time_per_iteration": 5.270642995834351 }, { "auxiliary_loss_clip": 0.01322037, "auxiliary_loss_mlp": 0.01193178, "balance_loss_clip": 1.00757003, "balance_loss_mlp": 1.00015402, "epoch": 0.8487945650213431, "flos": 18475798079520.0, "grad_norm": 2.70950368967199, "language_loss": 0.78420019, "learning_rate": 2.34942574092317e-07, "loss": 0.8093524, "num_input_tokens_seen": 152754710, "step": 7059, "time_per_iteration": 2.609539031982422 }, { "auxiliary_loss_clip": 0.01331617, "auxiliary_loss_mlp": 0.01193169, "balance_loss_clip": 1.00735056, "balance_loss_mlp": 1.00014555, "epoch": 0.8489148079119821, "flos": 23473199586240.0, "grad_norm": 2.0313349591836283, "language_loss": 0.76888096, "learning_rate": 2.3457638993049045e-07, "loss": 0.79412884, "num_input_tokens_seen": 152772700, "step": 7060, "time_per_iteration": 2.678468704223633 }, { "auxiliary_loss_clip": 0.01235374, "auxiliary_loss_mlp": 0.01193216, "balance_loss_clip": 1.00546694, "balance_loss_mlp": 1.00019252, "epoch": 0.8490350508026213, "flos": 19937614956480.0, "grad_norm": 1.8670211257749953, "language_loss": 0.64414334, "learning_rate": 2.3421047358202252e-07, "loss": 0.66842926, "num_input_tokens_seen": 152791550, "step": 7061, "time_per_iteration": 2.7463061809539795 }, { "auxiliary_loss_clip": 0.0133015, "auxiliary_loss_mlp": 0.01193199, "balance_loss_clip": 1.00731063, "balance_loss_mlp": 1.00017571, "epoch": 0.8491552936932604, "flos": 24279970380480.0, "grad_norm": 2.200883701088875, "language_loss": 0.82969677, "learning_rate": 2.3384482510242144e-07, "loss": 0.85493016, "num_input_tokens_seen": 152809410, "step": 7062, "time_per_iteration": 2.7654876708984375 }, { "auxiliary_loss_clip": 0.01349385, "auxiliary_loss_mlp": 0.01193194, "balance_loss_clip": 1.00790393, "balance_loss_mlp": 1.00017071, "epoch": 0.8492755365838994, "flos": 22522535442720.0, "grad_norm": 1.771853936801501, "language_loss": 0.77067298, "learning_rate": 2.3347944454715575e-07, "loss": 0.79609877, "num_input_tokens_seen": 152825800, "step": 7063, "time_per_iteration": 2.682936906814575 }, { "auxiliary_loss_clip": 0.01348201, "auxiliary_loss_mlp": 0.01193129, "balance_loss_clip": 1.00769305, "balance_loss_mlp": 1.00020099, "epoch": 0.8493957794745386, "flos": 26980453641600.0, "grad_norm": 1.6318923502692688, "language_loss": 0.6725719, "learning_rate": 2.331143319716542e-07, "loss": 0.69798523, "num_input_tokens_seen": 152845330, "step": 7064, "time_per_iteration": 2.7315833568573 }, { "auxiliary_loss_clip": 0.01281208, "auxiliary_loss_mlp": 0.01193206, "balance_loss_clip": 1.00737405, "balance_loss_mlp": 1.00018239, "epoch": 0.8495160223651776, "flos": 29861998051680.0, "grad_norm": 2.054131568115967, "language_loss": 0.65844655, "learning_rate": 2.3274948743130363e-07, "loss": 0.6831907, "num_input_tokens_seen": 152865165, "step": 7065, "time_per_iteration": 2.932492971420288 }, { "auxiliary_loss_clip": 0.01348426, "auxiliary_loss_mlp": 0.01193207, "balance_loss_clip": 1.00774562, "balance_loss_mlp": 1.00018334, "epoch": 0.8496362652558167, "flos": 23075454206880.0, "grad_norm": 1.570210918598563, "language_loss": 0.79325604, "learning_rate": 2.3238491098145085e-07, "loss": 0.81867236, "num_input_tokens_seen": 152884695, "step": 7066, "time_per_iteration": 2.701232433319092 }, { "auxiliary_loss_clip": 0.01324627, "auxiliary_loss_mlp": 0.01193192, "balance_loss_clip": 1.00726366, "balance_loss_mlp": 1.00016832, "epoch": 0.8497565081464559, "flos": 14609116002240.0, "grad_norm": 2.208347523020116, "language_loss": 0.7302568, "learning_rate": 2.3202060267740141e-07, "loss": 0.75543505, "num_input_tokens_seen": 152902220, "step": 7067, "time_per_iteration": 2.7719595432281494 }, { "auxiliary_loss_clip": 0.01280209, "auxiliary_loss_mlp": 0.0119315, "balance_loss_clip": 1.00695407, "balance_loss_mlp": 1.00012612, "epoch": 0.8498767510370949, "flos": 21136455188640.0, "grad_norm": 2.3706140316280004, "language_loss": 0.77001655, "learning_rate": 2.3165656257442044e-07, "loss": 0.79475015, "num_input_tokens_seen": 152920740, "step": 7068, "time_per_iteration": 2.8263041973114014 }, { "auxiliary_loss_clip": 0.01323063, "auxiliary_loss_mlp": 0.01192989, "balance_loss_clip": 1.00725174, "balance_loss_mlp": 1.00015593, "epoch": 0.849996993927734, "flos": 23654548124640.0, "grad_norm": 2.115575417407835, "language_loss": 0.90234679, "learning_rate": 2.31292790727734e-07, "loss": 0.92750728, "num_input_tokens_seen": 152938305, "step": 7069, "time_per_iteration": 2.727121114730835 }, { "auxiliary_loss_clip": 0.01346656, "auxiliary_loss_mlp": 0.01193191, "balance_loss_clip": 1.00724185, "balance_loss_mlp": 1.00016725, "epoch": 0.8501172368183731, "flos": 20558079744480.0, "grad_norm": 2.118133521647112, "language_loss": 0.80154401, "learning_rate": 2.3092928719252392e-07, "loss": 0.82694244, "num_input_tokens_seen": 152956705, "step": 7070, "time_per_iteration": 2.735797166824341 }, { "auxiliary_loss_clip": 0.01336486, "auxiliary_loss_mlp": 0.01193175, "balance_loss_clip": 1.00811744, "balance_loss_mlp": 1.00015152, "epoch": 0.8502374797090122, "flos": 22272635017440.0, "grad_norm": 2.1322455651347036, "language_loss": 0.78082979, "learning_rate": 2.3056605202393475e-07, "loss": 0.80612636, "num_input_tokens_seen": 152974265, "step": 7071, "time_per_iteration": 2.7529823780059814 }, { "auxiliary_loss_clip": 0.01335928, "auxiliary_loss_mlp": 0.00872495, "balance_loss_clip": 1.00737309, "balance_loss_mlp": 1.00033677, "epoch": 0.8503577225996513, "flos": 23659828905600.0, "grad_norm": 1.7474143101292725, "language_loss": 0.66624993, "learning_rate": 2.3020308527706888e-07, "loss": 0.68833411, "num_input_tokens_seen": 152993680, "step": 7072, "time_per_iteration": 2.7851381301879883 }, { "auxiliary_loss_clip": 0.01323704, "auxiliary_loss_mlp": 0.0119315, "balance_loss_clip": 1.00777757, "balance_loss_mlp": 1.00012636, "epoch": 0.8504779654902904, "flos": 26758524553920.0, "grad_norm": 1.6085226435277722, "language_loss": 0.88807976, "learning_rate": 2.2984038700698715e-07, "loss": 0.9132483, "num_input_tokens_seen": 153012990, "step": 7073, "time_per_iteration": 2.8169310092926025 }, { "auxiliary_loss_clip": 0.01323446, "auxiliary_loss_mlp": 0.01193086, "balance_loss_clip": 1.00761545, "balance_loss_mlp": 1.00015771, "epoch": 0.8505982083809295, "flos": 26468259121440.0, "grad_norm": 1.7793144241733188, "language_loss": 0.78973907, "learning_rate": 2.2947795726871222e-07, "loss": 0.81490433, "num_input_tokens_seen": 153034015, "step": 7074, "time_per_iteration": 2.74436354637146 }, { "auxiliary_loss_clip": 0.01324472, "auxiliary_loss_mlp": 0.00872419, "balance_loss_clip": 1.008394, "balance_loss_mlp": 1.00043917, "epoch": 0.8507184512715685, "flos": 20303401469760.0, "grad_norm": 1.7448405197637287, "language_loss": 0.85584593, "learning_rate": 2.2911579611722253e-07, "loss": 0.87781483, "num_input_tokens_seen": 153053160, "step": 7075, "time_per_iteration": 2.7655959129333496 }, { "auxiliary_loss_clip": 0.01307413, "auxiliary_loss_mlp": 0.01193174, "balance_loss_clip": 1.00814772, "balance_loss_mlp": 1.00015044, "epoch": 0.8508386941622077, "flos": 19025196323040.0, "grad_norm": 1.736810169862416, "language_loss": 0.87346619, "learning_rate": 2.2875390360745905e-07, "loss": 0.89847207, "num_input_tokens_seen": 153072565, "step": 7076, "time_per_iteration": 2.752483367919922 }, { "auxiliary_loss_clip": 0.01294121, "auxiliary_loss_mlp": 0.01193226, "balance_loss_clip": 1.00684881, "balance_loss_mlp": 1.00020194, "epoch": 0.8509589370528468, "flos": 16433414413920.0, "grad_norm": 1.8118194122393567, "language_loss": 0.77439177, "learning_rate": 2.2839227979432008e-07, "loss": 0.79926521, "num_input_tokens_seen": 153090215, "step": 7077, "time_per_iteration": 2.791228771209717 }, { "auxiliary_loss_clip": 0.01311935, "auxiliary_loss_mlp": 0.01193161, "balance_loss_clip": 1.00706697, "balance_loss_mlp": 1.00013709, "epoch": 0.8510791799434858, "flos": 18259724551680.0, "grad_norm": 1.751940694485012, "language_loss": 0.85120142, "learning_rate": 2.2803092473266373e-07, "loss": 0.87625235, "num_input_tokens_seen": 153107740, "step": 7078, "time_per_iteration": 2.763108968734741 }, { "auxiliary_loss_clip": 0.01349282, "auxiliary_loss_mlp": 0.01193217, "balance_loss_clip": 1.00805569, "balance_loss_mlp": 1.00019288, "epoch": 0.851199422834125, "flos": 23441384414880.0, "grad_norm": 2.130525322919357, "language_loss": 0.86699319, "learning_rate": 2.2766983847730724e-07, "loss": 0.89241815, "num_input_tokens_seen": 153127410, "step": 7079, "time_per_iteration": 2.7247252464294434 }, { "auxiliary_loss_clip": 0.01306241, "auxiliary_loss_mlp": 0.01193126, "balance_loss_clip": 1.00728822, "balance_loss_mlp": 1.00019789, "epoch": 0.851319665724764, "flos": 16289413293600.0, "grad_norm": 1.7828835694521192, "language_loss": 0.66541326, "learning_rate": 2.2730902108302663e-07, "loss": 0.69040698, "num_input_tokens_seen": 153144325, "step": 7080, "time_per_iteration": 2.8860716819763184 }, { "auxiliary_loss_clip": 0.01324166, "auxiliary_loss_mlp": 0.01193163, "balance_loss_clip": 1.00784576, "balance_loss_mlp": 1.00013888, "epoch": 0.8514399086154031, "flos": 18989357699520.0, "grad_norm": 1.599356717405876, "language_loss": 0.68558031, "learning_rate": 2.269484726045583e-07, "loss": 0.71075356, "num_input_tokens_seen": 153163240, "step": 7081, "time_per_iteration": 2.777564525604248 }, { "auxiliary_loss_clip": 0.01294135, "auxiliary_loss_mlp": 0.01193165, "balance_loss_clip": 1.00711775, "balance_loss_mlp": 1.00014162, "epoch": 0.8515601515060423, "flos": 24571205752320.0, "grad_norm": 1.5917376521602653, "language_loss": 0.79189813, "learning_rate": 2.2658819309659672e-07, "loss": 0.81677115, "num_input_tokens_seen": 153183440, "step": 7082, "time_per_iteration": 3.729895830154419 }, { "auxiliary_loss_clip": 0.01305023, "auxiliary_loss_mlp": 0.01193168, "balance_loss_clip": 1.00750351, "balance_loss_mlp": 1.00014472, "epoch": 0.8516803943966813, "flos": 19529451709920.0, "grad_norm": 1.8540713005751255, "language_loss": 0.84670413, "learning_rate": 2.2622818261379706e-07, "loss": 0.87168598, "num_input_tokens_seen": 153200460, "step": 7083, "time_per_iteration": 2.707306146621704 }, { "auxiliary_loss_clip": 0.01324784, "auxiliary_loss_mlp": 0.01193267, "balance_loss_clip": 1.00804639, "balance_loss_mlp": 1.00024319, "epoch": 0.8518006372873204, "flos": 20265802585920.0, "grad_norm": 1.715743153878945, "language_loss": 0.74826938, "learning_rate": 2.2586844121077142e-07, "loss": 0.7734499, "num_input_tokens_seen": 153218970, "step": 7084, "time_per_iteration": 5.666215658187866 }, { "auxiliary_loss_clip": 0.0127777, "auxiliary_loss_mlp": 0.01193176, "balance_loss_clip": 1.00777578, "balance_loss_mlp": 1.00015247, "epoch": 0.8519208801779595, "flos": 24133239060480.0, "grad_norm": 1.6367942557720774, "language_loss": 0.71941209, "learning_rate": 2.2550896894209215e-07, "loss": 0.74412161, "num_input_tokens_seen": 153238485, "step": 7085, "time_per_iteration": 2.8847734928131104 }, { "auxiliary_loss_clip": 0.01236014, "auxiliary_loss_mlp": 0.01192277, "balance_loss_clip": 1.00344539, "balance_loss_mlp": 1.00001645, "epoch": 0.8520411230685986, "flos": 63035257968000.0, "grad_norm": 0.6783783380259077, "language_loss": 0.56668454, "learning_rate": 2.2514976586229184e-07, "loss": 0.59096748, "num_input_tokens_seen": 153306430, "step": 7086, "time_per_iteration": 3.537971019744873 }, { "auxiliary_loss_clip": 0.01300517, "auxiliary_loss_mlp": 0.01192264, "balance_loss_clip": 1.00342739, "balance_loss_mlp": 1.00000322, "epoch": 0.8521613659592376, "flos": 65836898608320.0, "grad_norm": 0.7750831805247268, "language_loss": 0.54729533, "learning_rate": 2.247908320258609e-07, "loss": 0.57222319, "num_input_tokens_seen": 153366520, "step": 7087, "time_per_iteration": 3.30433988571167 }, { "auxiliary_loss_clip": 0.01250566, "auxiliary_loss_mlp": 0.01193173, "balance_loss_clip": 1.0060544, "balance_loss_mlp": 1.00014985, "epoch": 0.8522816088498768, "flos": 23112334617120.0, "grad_norm": 2.082767100250735, "language_loss": 0.79577506, "learning_rate": 2.2443216748724914e-07, "loss": 0.82021248, "num_input_tokens_seen": 153387230, "step": 7088, "time_per_iteration": 2.8927249908447266 }, { "auxiliary_loss_clip": 0.01330518, "auxiliary_loss_mlp": 0.00872456, "balance_loss_clip": 1.00767779, "balance_loss_mlp": 1.00030494, "epoch": 0.8524018517405159, "flos": 31758153328800.0, "grad_norm": 2.5320086469319794, "language_loss": 0.74236846, "learning_rate": 2.2407377230086588e-07, "loss": 0.76439816, "num_input_tokens_seen": 153409585, "step": 7089, "time_per_iteration": 2.8244526386260986 }, { "auxiliary_loss_clip": 0.01290921, "auxiliary_loss_mlp": 0.01193137, "balance_loss_clip": 1.00776958, "balance_loss_mlp": 1.00020885, "epoch": 0.8525220946311549, "flos": 18690326889120.0, "grad_norm": 1.8390780064848344, "language_loss": 0.8341409, "learning_rate": 2.23715646521079e-07, "loss": 0.85898155, "num_input_tokens_seen": 153427105, "step": 7090, "time_per_iteration": 2.77779483795166 }, { "auxiliary_loss_clip": 0.01331704, "auxiliary_loss_mlp": 0.00872521, "balance_loss_clip": 1.00771832, "balance_loss_mlp": 1.00032878, "epoch": 0.852642337521794, "flos": 21793225608000.0, "grad_norm": 1.7978032433767768, "language_loss": 0.83942008, "learning_rate": 2.2335779020221724e-07, "loss": 0.8614623, "num_input_tokens_seen": 153443725, "step": 7091, "time_per_iteration": 2.8674864768981934 }, { "auxiliary_loss_clip": 0.01296352, "auxiliary_loss_mlp": 0.01192268, "balance_loss_clip": 1.0075984, "balance_loss_mlp": 1.00000727, "epoch": 0.8527625804124331, "flos": 69040164385440.0, "grad_norm": 0.8045691994013442, "language_loss": 0.56412065, "learning_rate": 2.2300020339856497e-07, "loss": 0.5890069, "num_input_tokens_seen": 153506410, "step": 7092, "time_per_iteration": 3.3582160472869873 }, { "auxiliary_loss_clip": 0.01303388, "auxiliary_loss_mlp": 0.01193087, "balance_loss_clip": 1.00684416, "balance_loss_mlp": 1.00015843, "epoch": 0.8528828233030722, "flos": 26979411854880.0, "grad_norm": 1.9705600349811825, "language_loss": 0.77681792, "learning_rate": 2.2264288616436966e-07, "loss": 0.80178261, "num_input_tokens_seen": 153526665, "step": 7093, "time_per_iteration": 2.8045198917388916 }, { "auxiliary_loss_clip": 0.01299097, "auxiliary_loss_mlp": 0.01193148, "balance_loss_clip": 1.00693536, "balance_loss_mlp": 1.00021958, "epoch": 0.8530030661937112, "flos": 17487606899520.0, "grad_norm": 1.944685835242286, "language_loss": 0.72413957, "learning_rate": 2.222858385538351e-07, "loss": 0.749062, "num_input_tokens_seen": 153543465, "step": 7094, "time_per_iteration": 2.7403950691223145 }, { "auxiliary_loss_clip": 0.01336857, "auxiliary_loss_mlp": 0.01193161, "balance_loss_clip": 1.00820243, "balance_loss_mlp": 1.00013733, "epoch": 0.8531233090843504, "flos": 22160808305280.0, "grad_norm": 1.5549712307088528, "language_loss": 0.67954308, "learning_rate": 2.2192906062112527e-07, "loss": 0.70484328, "num_input_tokens_seen": 153563340, "step": 7095, "time_per_iteration": 2.7752370834350586 }, { "auxiliary_loss_clip": 0.01347682, "auxiliary_loss_mlp": 0.01193192, "balance_loss_clip": 1.00766671, "balance_loss_mlp": 1.0001682, "epoch": 0.8532435519749895, "flos": 37635403900320.0, "grad_norm": 1.4174045476730206, "language_loss": 0.70316547, "learning_rate": 2.2157255242036377e-07, "loss": 0.72857416, "num_input_tokens_seen": 153587005, "step": 7096, "time_per_iteration": 2.80777645111084 }, { "auxiliary_loss_clip": 0.01288615, "auxiliary_loss_mlp": 0.01193077, "balance_loss_clip": 1.00669992, "balance_loss_mlp": 1.00014842, "epoch": 0.8533637948656285, "flos": 21398174504640.0, "grad_norm": 1.5774736656862243, "language_loss": 0.74232411, "learning_rate": 2.2121631400563135e-07, "loss": 0.76714098, "num_input_tokens_seen": 153606835, "step": 7097, "time_per_iteration": 2.795316219329834 }, { "auxiliary_loss_clip": 0.01291429, "auxiliary_loss_mlp": 0.01192271, "balance_loss_clip": 1.00404978, "balance_loss_mlp": 1.00001037, "epoch": 0.8534840377562677, "flos": 53345148497280.0, "grad_norm": 0.7650239111641736, "language_loss": 0.53024578, "learning_rate": 2.208603454309701e-07, "loss": 0.5550828, "num_input_tokens_seen": 153664925, "step": 7098, "time_per_iteration": 3.2703161239624023 }, { "auxiliary_loss_clip": 0.01268683, "auxiliary_loss_mlp": 0.01193198, "balance_loss_clip": 1.00674438, "balance_loss_mlp": 1.00017428, "epoch": 0.8536042806469067, "flos": 20814159042720.0, "grad_norm": 1.6888023248503414, "language_loss": 0.70945251, "learning_rate": 2.2050464675037994e-07, "loss": 0.73407125, "num_input_tokens_seen": 153683550, "step": 7099, "time_per_iteration": 2.822031259536743 }, { "auxiliary_loss_clip": 0.01302991, "auxiliary_loss_mlp": 0.01193178, "balance_loss_clip": 1.00677156, "balance_loss_mlp": 1.00015426, "epoch": 0.8537245235375458, "flos": 24681379975200.0, "grad_norm": 1.8118192612086008, "language_loss": 0.72920871, "learning_rate": 2.2014921801782016e-07, "loss": 0.75417042, "num_input_tokens_seen": 153703040, "step": 7100, "time_per_iteration": 2.8444159030914307 }, { "auxiliary_loss_clip": 0.01324015, "auxiliary_loss_mlp": 0.01193148, "balance_loss_clip": 1.00771356, "balance_loss_mlp": 1.00012469, "epoch": 0.853844766428185, "flos": 24384827898720.0, "grad_norm": 2.086890581329933, "language_loss": 0.74088842, "learning_rate": 2.1979405928720872e-07, "loss": 0.76606005, "num_input_tokens_seen": 153722695, "step": 7101, "time_per_iteration": 2.7701263427734375 }, { "auxiliary_loss_clip": 0.01323538, "auxiliary_loss_mlp": 0.01193043, "balance_loss_clip": 1.0081073, "balance_loss_mlp": 1.00011516, "epoch": 0.853965009318824, "flos": 20955717352800.0, "grad_norm": 1.4476406407208027, "language_loss": 0.79247022, "learning_rate": 2.1943917061242257e-07, "loss": 0.81763601, "num_input_tokens_seen": 153742550, "step": 7102, "time_per_iteration": 2.809633493423462 }, { "auxiliary_loss_clip": 0.01338244, "auxiliary_loss_mlp": 0.00872571, "balance_loss_clip": 1.00867367, "balance_loss_mlp": 1.00045753, "epoch": 0.8540852522094631, "flos": 24201826871040.0, "grad_norm": 1.5791661097403809, "language_loss": 0.66569209, "learning_rate": 2.1908455204729903e-07, "loss": 0.68780029, "num_input_tokens_seen": 153761700, "step": 7103, "time_per_iteration": 2.782167673110962 }, { "auxiliary_loss_clip": 0.01324722, "auxiliary_loss_mlp": 0.01193207, "balance_loss_clip": 1.0079875, "balance_loss_mlp": 1.00018311, "epoch": 0.8542054951001022, "flos": 25082933264640.0, "grad_norm": 1.9030988260608381, "language_loss": 0.78152442, "learning_rate": 2.1873020364563265e-07, "loss": 0.80670375, "num_input_tokens_seen": 153780765, "step": 7104, "time_per_iteration": 2.7939720153808594 }, { "auxiliary_loss_clip": 0.01322936, "auxiliary_loss_mlp": 0.01193183, "balance_loss_clip": 1.00754178, "balance_loss_mlp": 1.00015974, "epoch": 0.8543257379907413, "flos": 24316563401280.0, "grad_norm": 2.3394021800492113, "language_loss": 0.76018959, "learning_rate": 2.183761254611789e-07, "loss": 0.7853508, "num_input_tokens_seen": 153801090, "step": 7105, "time_per_iteration": 2.7445194721221924 }, { "auxiliary_loss_clip": 0.01326589, "auxiliary_loss_mlp": 0.01193074, "balance_loss_clip": 1.0077858, "balance_loss_mlp": 1.00014591, "epoch": 0.8544459808813804, "flos": 55286648964000.0, "grad_norm": 1.9701892867154878, "language_loss": 0.69991541, "learning_rate": 2.1802231754764987e-07, "loss": 0.72511208, "num_input_tokens_seen": 153826530, "step": 7106, "time_per_iteration": 3.0266120433807373 }, { "auxiliary_loss_clip": 0.01313796, "auxiliary_loss_mlp": 0.01193147, "balance_loss_clip": 1.00720096, "balance_loss_mlp": 1.0001229, "epoch": 0.8545662237720195, "flos": 25776260781120.0, "grad_norm": 3.4883153866683316, "language_loss": 0.76480728, "learning_rate": 2.17668779958718e-07, "loss": 0.7898767, "num_input_tokens_seen": 153849110, "step": 7107, "time_per_iteration": 2.8234047889709473 }, { "auxiliary_loss_clip": 0.0134793, "auxiliary_loss_mlp": 0.01193164, "balance_loss_clip": 1.00792897, "balance_loss_mlp": 1.00013995, "epoch": 0.8546864666626586, "flos": 11108328209280.0, "grad_norm": 2.041493965819227, "language_loss": 0.80357569, "learning_rate": 2.1731551274801553e-07, "loss": 0.82898664, "num_input_tokens_seen": 153865550, "step": 7108, "time_per_iteration": 3.6121528148651123 }, { "auxiliary_loss_clip": 0.01312536, "auxiliary_loss_mlp": 0.01193222, "balance_loss_clip": 1.00777209, "balance_loss_mlp": 1.00019825, "epoch": 0.8548067095532976, "flos": 25520181482880.0, "grad_norm": 2.140108016533087, "language_loss": 0.62060398, "learning_rate": 2.169625159691324e-07, "loss": 0.64566153, "num_input_tokens_seen": 153885425, "step": 7109, "time_per_iteration": 2.755014181137085 }, { "auxiliary_loss_clip": 0.01284504, "auxiliary_loss_mlp": 0.01193197, "balance_loss_clip": 1.0069201, "balance_loss_mlp": 1.00017285, "epoch": 0.8549269524439368, "flos": 24717865224960.0, "grad_norm": 12.452899974716589, "language_loss": 0.74037671, "learning_rate": 2.1660978967561784e-07, "loss": 0.76515371, "num_input_tokens_seen": 153904760, "step": 7110, "time_per_iteration": 4.690844297409058 }, { "auxiliary_loss_clip": 0.01348642, "auxiliary_loss_mlp": 0.01193205, "balance_loss_clip": 1.00783539, "balance_loss_mlp": 1.00018156, "epoch": 0.8550471953345758, "flos": 19825608625920.0, "grad_norm": 2.315884439620249, "language_loss": 0.7945295, "learning_rate": 2.1625733392098035e-07, "loss": 0.81994796, "num_input_tokens_seen": 153920370, "step": 7111, "time_per_iteration": 3.616292953491211 }, { "auxiliary_loss_clip": 0.01347943, "auxiliary_loss_mlp": 0.01193164, "balance_loss_clip": 1.00756383, "balance_loss_mlp": 1.00014043, "epoch": 0.8551674382252149, "flos": 22820452619040.0, "grad_norm": 1.57016546405824, "language_loss": 0.79195452, "learning_rate": 2.159051487586867e-07, "loss": 0.81736559, "num_input_tokens_seen": 153940500, "step": 7112, "time_per_iteration": 2.6834120750427246 }, { "auxiliary_loss_clip": 0.01307727, "auxiliary_loss_mlp": 0.01193122, "balance_loss_clip": 1.00750482, "balance_loss_mlp": 1.00019395, "epoch": 0.8552876811158541, "flos": 20631265786080.0, "grad_norm": 2.2120665024013895, "language_loss": 0.72567773, "learning_rate": 2.155532342421642e-07, "loss": 0.75068623, "num_input_tokens_seen": 153958500, "step": 7113, "time_per_iteration": 2.8107383251190186 }, { "auxiliary_loss_clip": 0.01335429, "auxiliary_loss_mlp": 0.01193194, "balance_loss_clip": 1.00789356, "balance_loss_mlp": 1.00017035, "epoch": 0.8554079240064931, "flos": 23112370540800.0, "grad_norm": 1.6472248690127236, "language_loss": 0.78158075, "learning_rate": 2.1520159042479636e-07, "loss": 0.80686694, "num_input_tokens_seen": 153976790, "step": 7114, "time_per_iteration": 2.712188959121704 }, { "auxiliary_loss_clip": 0.01325984, "auxiliary_loss_mlp": 0.0119319, "balance_loss_clip": 1.00732768, "balance_loss_mlp": 1.00016594, "epoch": 0.8555281668971322, "flos": 22128059118240.0, "grad_norm": 2.1925170603932718, "language_loss": 0.70969737, "learning_rate": 2.148502173599287e-07, "loss": 0.73488903, "num_input_tokens_seen": 153994930, "step": 7115, "time_per_iteration": 2.7352428436279297 }, { "auxiliary_loss_clip": 0.01310655, "auxiliary_loss_mlp": 0.01193069, "balance_loss_clip": 1.00807595, "balance_loss_mlp": 1.00014043, "epoch": 0.8556484097877713, "flos": 31139053640640.0, "grad_norm": 1.6410721344687924, "language_loss": 0.65583718, "learning_rate": 2.1449911510086372e-07, "loss": 0.68087441, "num_input_tokens_seen": 154014400, "step": 7116, "time_per_iteration": 2.8188905715942383 }, { "auxiliary_loss_clip": 0.01323777, "auxiliary_loss_mlp": 0.01193196, "balance_loss_clip": 1.00718915, "balance_loss_mlp": 1.00017262, "epoch": 0.8557686526784104, "flos": 24316563401280.0, "grad_norm": 2.704219129390866, "language_loss": 0.77018368, "learning_rate": 2.141482837008628e-07, "loss": 0.79535341, "num_input_tokens_seen": 154034940, "step": 7117, "time_per_iteration": 2.7517013549804688 }, { "auxiliary_loss_clip": 0.01336646, "auxiliary_loss_mlp": 0.01193199, "balance_loss_clip": 1.00811005, "balance_loss_mlp": 1.00017488, "epoch": 0.8558888955690495, "flos": 17712733194720.0, "grad_norm": 1.9802676993084045, "language_loss": 0.71683431, "learning_rate": 2.1379772321314826e-07, "loss": 0.74213272, "num_input_tokens_seen": 154052985, "step": 7118, "time_per_iteration": 2.6698224544525146 }, { "auxiliary_loss_clip": 0.01250882, "auxiliary_loss_mlp": 0.01193103, "balance_loss_clip": 1.00684333, "balance_loss_mlp": 1.000175, "epoch": 0.8560091384596886, "flos": 19171712100960.0, "grad_norm": 1.8774804495738908, "language_loss": 0.81611717, "learning_rate": 2.1344743369089802e-07, "loss": 0.84055698, "num_input_tokens_seen": 154068765, "step": 7119, "time_per_iteration": 2.8407928943634033 }, { "auxiliary_loss_clip": 0.01312017, "auxiliary_loss_mlp": 0.01193093, "balance_loss_clip": 1.0080992, "balance_loss_mlp": 1.00016475, "epoch": 0.8561293813503277, "flos": 23914866417120.0, "grad_norm": 1.5817843727893295, "language_loss": 0.81912196, "learning_rate": 2.130974151872522e-07, "loss": 0.84417307, "num_input_tokens_seen": 154089100, "step": 7120, "time_per_iteration": 2.7540736198425293 }, { "auxiliary_loss_clip": 0.01274713, "auxiliary_loss_mlp": 0.0119324, "balance_loss_clip": 1.00753605, "balance_loss_mlp": 1.00012112, "epoch": 0.8562496242409667, "flos": 22529217247200.0, "grad_norm": 1.6328181949626437, "language_loss": 0.78730232, "learning_rate": 2.1274766775530773e-07, "loss": 0.81198186, "num_input_tokens_seen": 154108965, "step": 7121, "time_per_iteration": 2.8050403594970703 }, { "auxiliary_loss_clip": 0.01349727, "auxiliary_loss_mlp": 0.01193207, "balance_loss_clip": 1.0079143, "balance_loss_mlp": 1.00018311, "epoch": 0.8563698671316058, "flos": 14712752115360.0, "grad_norm": 2.3579308182673526, "language_loss": 0.79225421, "learning_rate": 2.1239819144812077e-07, "loss": 0.81768352, "num_input_tokens_seen": 154123425, "step": 7122, "time_per_iteration": 2.6376407146453857 }, { "auxiliary_loss_clip": 0.01301824, "auxiliary_loss_mlp": 0.01193116, "balance_loss_clip": 1.00770545, "balance_loss_mlp": 1.00018775, "epoch": 0.856490110022245, "flos": 39167784390240.0, "grad_norm": 1.7116959531483764, "language_loss": 0.69708437, "learning_rate": 2.1204898631870716e-07, "loss": 0.72203374, "num_input_tokens_seen": 154148315, "step": 7123, "time_per_iteration": 2.9530062675476074 }, { "auxiliary_loss_clip": 0.01305975, "auxiliary_loss_mlp": 0.01193069, "balance_loss_clip": 1.00729764, "balance_loss_mlp": 1.00014067, "epoch": 0.856610352912884, "flos": 29059358480640.0, "grad_norm": 1.8572048636375125, "language_loss": 0.7625016, "learning_rate": 2.1170005242004006e-07, "loss": 0.78749204, "num_input_tokens_seen": 154169665, "step": 7124, "time_per_iteration": 2.7943906784057617 }, { "auxiliary_loss_clip": 0.01307543, "auxiliary_loss_mlp": 0.01193177, "balance_loss_clip": 1.007725, "balance_loss_mlp": 1.00015306, "epoch": 0.8567305958035231, "flos": 23878345243680.0, "grad_norm": 1.5719085173269478, "language_loss": 0.77910745, "learning_rate": 2.1135138980505384e-07, "loss": 0.80411464, "num_input_tokens_seen": 154190335, "step": 7125, "time_per_iteration": 2.7187693119049072 }, { "auxiliary_loss_clip": 0.0129876, "auxiliary_loss_mlp": 0.01193071, "balance_loss_clip": 1.00705218, "balance_loss_mlp": 1.00014234, "epoch": 0.8568508386941622, "flos": 22200131525760.0, "grad_norm": 1.6967396582081895, "language_loss": 0.72142363, "learning_rate": 2.110029985266395e-07, "loss": 0.74634188, "num_input_tokens_seen": 154210040, "step": 7126, "time_per_iteration": 2.763805627822876 }, { "auxiliary_loss_clip": 0.01309044, "auxiliary_loss_mlp": 0.01193151, "balance_loss_clip": 1.00804424, "balance_loss_mlp": 1.00012696, "epoch": 0.8569710815848013, "flos": 17307515689920.0, "grad_norm": 1.9734012699977188, "language_loss": 0.73691207, "learning_rate": 2.1065487863764787e-07, "loss": 0.76193404, "num_input_tokens_seen": 154228385, "step": 7127, "time_per_iteration": 2.7281033992767334 }, { "auxiliary_loss_clip": 0.01285592, "auxiliary_loss_mlp": 0.01193073, "balance_loss_clip": 1.00728011, "balance_loss_mlp": 1.00014472, "epoch": 0.8570913244754403, "flos": 23732296473600.0, "grad_norm": 1.5913813033702162, "language_loss": 0.85852724, "learning_rate": 2.1030703019088846e-07, "loss": 0.88331389, "num_input_tokens_seen": 154249015, "step": 7128, "time_per_iteration": 2.8787970542907715 }, { "auxiliary_loss_clip": 0.01323, "auxiliary_loss_mlp": 0.01193022, "balance_loss_clip": 1.00697601, "balance_loss_mlp": 1.00009418, "epoch": 0.8572115673660795, "flos": 20048759118720.0, "grad_norm": 1.7515638787388725, "language_loss": 0.70617348, "learning_rate": 2.099594532391291e-07, "loss": 0.73133373, "num_input_tokens_seen": 154267700, "step": 7129, "time_per_iteration": 2.7487902641296387 }, { "auxiliary_loss_clip": 0.01335144, "auxiliary_loss_mlp": 0.01193016, "balance_loss_clip": 1.00786412, "balance_loss_mlp": 1.00018299, "epoch": 0.8573318102567186, "flos": 27160401156480.0, "grad_norm": 1.4892071904041295, "language_loss": 0.78946126, "learning_rate": 2.0961214783509806e-07, "loss": 0.8147428, "num_input_tokens_seen": 154290580, "step": 7130, "time_per_iteration": 2.7521424293518066 }, { "auxiliary_loss_clip": 0.01319027, "auxiliary_loss_mlp": 0.01193117, "balance_loss_clip": 1.00734401, "balance_loss_mlp": 1.00018847, "epoch": 0.8574520531473576, "flos": 24936597105120.0, "grad_norm": 1.6351823868466613, "language_loss": 0.74716651, "learning_rate": 2.0926511403148051e-07, "loss": 0.77228808, "num_input_tokens_seen": 154309545, "step": 7131, "time_per_iteration": 2.8221023082733154 }, { "auxiliary_loss_clip": 0.012858, "auxiliary_loss_mlp": 0.01193106, "balance_loss_clip": 1.00756478, "balance_loss_mlp": 1.0001781, "epoch": 0.8575722960379968, "flos": 18771164674560.0, "grad_norm": 1.7870212625219375, "language_loss": 0.75983852, "learning_rate": 2.0891835188092143e-07, "loss": 0.78462756, "num_input_tokens_seen": 154326545, "step": 7132, "time_per_iteration": 2.7560715675354004 }, { "auxiliary_loss_clip": 0.01286445, "auxiliary_loss_mlp": 0.01193165, "balance_loss_clip": 1.00710022, "balance_loss_mlp": 1.00014102, "epoch": 0.8576925389286358, "flos": 22200311144160.0, "grad_norm": 1.7758543278810663, "language_loss": 0.81421745, "learning_rate": 2.0857186143602434e-07, "loss": 0.83901358, "num_input_tokens_seen": 154345190, "step": 7133, "time_per_iteration": 2.8351149559020996 }, { "auxiliary_loss_clip": 0.01303435, "auxiliary_loss_mlp": 0.01193233, "balance_loss_clip": 1.0077287, "balance_loss_mlp": 1.00020933, "epoch": 0.8578127818192749, "flos": 22894357134240.0, "grad_norm": 1.8047141492963292, "language_loss": 0.67517102, "learning_rate": 2.0822564274935094e-07, "loss": 0.70013767, "num_input_tokens_seen": 154364615, "step": 7134, "time_per_iteration": 3.7993950843811035 }, { "auxiliary_loss_clip": 0.01301673, "auxiliary_loss_mlp": 0.01193202, "balance_loss_clip": 1.00746477, "balance_loss_mlp": 1.00017869, "epoch": 0.8579330247099141, "flos": 34824854187360.0, "grad_norm": 1.6266438196754338, "language_loss": 0.66697991, "learning_rate": 2.078796958734239e-07, "loss": 0.69192863, "num_input_tokens_seen": 154387335, "step": 7135, "time_per_iteration": 2.908853530883789 }, { "auxiliary_loss_clip": 0.01324921, "auxiliary_loss_mlp": 0.0119315, "balance_loss_clip": 1.0074842, "balance_loss_mlp": 1.00012612, "epoch": 0.8580532676005531, "flos": 19755691639200.0, "grad_norm": 1.8293812126577222, "language_loss": 0.75013149, "learning_rate": 2.0753402086072124e-07, "loss": 0.77531219, "num_input_tokens_seen": 154405965, "step": 7136, "time_per_iteration": 5.512235641479492 }, { "auxiliary_loss_clip": 0.01231969, "auxiliary_loss_mlp": 0.01193193, "balance_loss_clip": 1.00652003, "balance_loss_mlp": 1.00016963, "epoch": 0.8581735104911922, "flos": 22739327444160.0, "grad_norm": 1.891815284729798, "language_loss": 0.75341815, "learning_rate": 2.071886177636828e-07, "loss": 0.77766979, "num_input_tokens_seen": 154422750, "step": 7137, "time_per_iteration": 2.90094256401062 }, { "auxiliary_loss_clip": 0.01323292, "auxiliary_loss_mlp": 0.01193202, "balance_loss_clip": 1.00725794, "balance_loss_mlp": 1.00017858, "epoch": 0.8582937533818313, "flos": 23149143180000.0, "grad_norm": 1.860325114572041, "language_loss": 0.83124971, "learning_rate": 2.0684348663470575e-07, "loss": 0.85641468, "num_input_tokens_seen": 154442930, "step": 7138, "time_per_iteration": 2.7140555381774902 }, { "auxiliary_loss_clip": 0.0132429, "auxiliary_loss_mlp": 0.01193167, "balance_loss_clip": 1.00761962, "balance_loss_mlp": 1.00014377, "epoch": 0.8584139962724704, "flos": 19498678325280.0, "grad_norm": 1.7374518986315337, "language_loss": 0.61196107, "learning_rate": 2.0649862752614555e-07, "loss": 0.63713562, "num_input_tokens_seen": 154461640, "step": 7139, "time_per_iteration": 2.8146612644195557 }, { "auxiliary_loss_clip": 0.01286497, "auxiliary_loss_mlp": 0.01192269, "balance_loss_clip": 1.00377011, "balance_loss_mlp": 1.00000787, "epoch": 0.8585342391631094, "flos": 71276613572160.0, "grad_norm": 0.7911963237070998, "language_loss": 0.57108963, "learning_rate": 2.0615404049031838e-07, "loss": 0.59587723, "num_input_tokens_seen": 154518610, "step": 7140, "time_per_iteration": 3.310847043991089 }, { "auxiliary_loss_clip": 0.01323933, "auxiliary_loss_mlp": 0.01193173, "balance_loss_clip": 1.00738192, "balance_loss_mlp": 1.00014901, "epoch": 0.8586544820537486, "flos": 10815440348160.0, "grad_norm": 2.298201630241026, "language_loss": 0.77982175, "learning_rate": 2.0580972557949616e-07, "loss": 0.80499279, "num_input_tokens_seen": 154533700, "step": 7141, "time_per_iteration": 2.751471996307373 }, { "auxiliary_loss_clip": 0.01304124, "auxiliary_loss_mlp": 0.0119227, "balance_loss_clip": 1.00360632, "balance_loss_mlp": 1.00000978, "epoch": 0.8587747249443877, "flos": 64811216315520.0, "grad_norm": 0.8451684697799144, "language_loss": 0.54221439, "learning_rate": 2.054656828459125e-07, "loss": 0.56717837, "num_input_tokens_seen": 154597810, "step": 7142, "time_per_iteration": 3.3211536407470703 }, { "auxiliary_loss_clip": 0.0127473, "auxiliary_loss_mlp": 0.01193177, "balance_loss_clip": 1.00738168, "balance_loss_mlp": 1.00015378, "epoch": 0.8588949678350267, "flos": 26834620413600.0, "grad_norm": 1.6023662995396573, "language_loss": 0.7698977, "learning_rate": 2.051219123417578e-07, "loss": 0.79457676, "num_input_tokens_seen": 154617870, "step": 7143, "time_per_iteration": 2.842841148376465 }, { "auxiliary_loss_clip": 0.01348933, "auxiliary_loss_mlp": 0.01193056, "balance_loss_clip": 1.00818849, "balance_loss_mlp": 1.00012732, "epoch": 0.8590152107256659, "flos": 26104268792160.0, "grad_norm": 2.041816573242237, "language_loss": 0.60392922, "learning_rate": 2.0477841411918196e-07, "loss": 0.62934911, "num_input_tokens_seen": 154637395, "step": 7144, "time_per_iteration": 2.755155086517334 }, { "auxiliary_loss_clip": 0.01325473, "auxiliary_loss_mlp": 0.01193137, "balance_loss_clip": 1.00717664, "balance_loss_mlp": 1.00011373, "epoch": 0.859135453616305, "flos": 26140897736640.0, "grad_norm": 1.832271446342935, "language_loss": 0.74484015, "learning_rate": 2.0443518823029326e-07, "loss": 0.77002633, "num_input_tokens_seen": 154657935, "step": 7145, "time_per_iteration": 2.73923659324646 }, { "auxiliary_loss_clip": 0.01292942, "auxiliary_loss_mlp": 0.01193076, "balance_loss_clip": 1.00680673, "balance_loss_mlp": 1.00014746, "epoch": 0.859255696506944, "flos": 12969327412800.0, "grad_norm": 3.5919650428859065, "language_loss": 0.7665121, "learning_rate": 2.0409223472715854e-07, "loss": 0.7913723, "num_input_tokens_seen": 154675080, "step": 7146, "time_per_iteration": 2.822370767593384 }, { "auxiliary_loss_clip": 0.01284941, "auxiliary_loss_mlp": 0.00872364, "balance_loss_clip": 1.00631714, "balance_loss_mlp": 1.00042105, "epoch": 0.8593759393975832, "flos": 18475762155840.0, "grad_norm": 1.8911997000310317, "language_loss": 0.74682522, "learning_rate": 2.0374955366180434e-07, "loss": 0.76839823, "num_input_tokens_seen": 154692720, "step": 7147, "time_per_iteration": 2.797111749649048 }, { "auxiliary_loss_clip": 0.01301227, "auxiliary_loss_mlp": 0.01193107, "balance_loss_clip": 1.00744355, "balance_loss_mlp": 1.00017858, "epoch": 0.8594961822882222, "flos": 22200167449440.0, "grad_norm": 1.7204296168086473, "language_loss": 0.72583044, "learning_rate": 2.034071450862147e-07, "loss": 0.75077379, "num_input_tokens_seen": 154710190, "step": 7148, "time_per_iteration": 2.765488862991333 }, { "auxiliary_loss_clip": 0.01325172, "auxiliary_loss_mlp": 0.01193271, "balance_loss_clip": 1.00804257, "balance_loss_mlp": 1.00024772, "epoch": 0.8596164251788613, "flos": 23294760865920.0, "grad_norm": 6.822234974699772, "language_loss": 0.76796222, "learning_rate": 2.030650090523327e-07, "loss": 0.79314667, "num_input_tokens_seen": 154729380, "step": 7149, "time_per_iteration": 2.8374104499816895 }, { "auxiliary_loss_clip": 0.01302483, "auxiliary_loss_mlp": 0.01193198, "balance_loss_clip": 1.00776792, "balance_loss_mlp": 1.00017393, "epoch": 0.8597366680695004, "flos": 31649918984640.0, "grad_norm": 1.5884299106982764, "language_loss": 0.59325707, "learning_rate": 2.0272314561205995e-07, "loss": 0.61821389, "num_input_tokens_seen": 154749775, "step": 7150, "time_per_iteration": 2.8955960273742676 }, { "auxiliary_loss_clip": 0.01301734, "auxiliary_loss_mlp": 0.01192986, "balance_loss_clip": 1.0070821, "balance_loss_mlp": 1.00015283, "epoch": 0.8598569109601395, "flos": 21287748816000.0, "grad_norm": 1.7855654734251614, "language_loss": 0.72788215, "learning_rate": 2.023815548172567e-07, "loss": 0.75282943, "num_input_tokens_seen": 154769845, "step": 7151, "time_per_iteration": 2.817812442779541 }, { "auxiliary_loss_clip": 0.01336108, "auxiliary_loss_mlp": 0.01193148, "balance_loss_clip": 1.0078212, "balance_loss_mlp": 1.00012422, "epoch": 0.8599771538507786, "flos": 25447821685920.0, "grad_norm": 1.5878856002844048, "language_loss": 0.65994054, "learning_rate": 2.0204023671974267e-07, "loss": 0.68523312, "num_input_tokens_seen": 154789230, "step": 7152, "time_per_iteration": 2.7294886112213135 }, { "auxiliary_loss_clip": 0.01322168, "auxiliary_loss_mlp": 0.01193208, "balance_loss_clip": 1.0072608, "balance_loss_mlp": 1.00018454, "epoch": 0.8600973967414177, "flos": 16723967235840.0, "grad_norm": 2.1256853814331853, "language_loss": 0.81147456, "learning_rate": 2.0169919137129532e-07, "loss": 0.83662832, "num_input_tokens_seen": 154807670, "step": 7153, "time_per_iteration": 2.6966564655303955 }, { "auxiliary_loss_clip": 0.01325059, "auxiliary_loss_mlp": 0.01193327, "balance_loss_clip": 1.0076983, "balance_loss_mlp": 1.0002079, "epoch": 0.8602176396320568, "flos": 25227940248000.0, "grad_norm": 2.1747840638851375, "language_loss": 0.70915729, "learning_rate": 2.013584188236508e-07, "loss": 0.73434114, "num_input_tokens_seen": 154825575, "step": 7154, "time_per_iteration": 2.7876503467559814 }, { "auxiliary_loss_clip": 0.01349792, "auxiliary_loss_mlp": 0.01193169, "balance_loss_clip": 1.00817323, "balance_loss_mlp": 1.00014544, "epoch": 0.8603378825226958, "flos": 20412246516480.0, "grad_norm": 2.468114619245318, "language_loss": 0.79460859, "learning_rate": 2.0101791912850396e-07, "loss": 0.82003826, "num_input_tokens_seen": 154845115, "step": 7155, "time_per_iteration": 2.841318130493164 }, { "auxiliary_loss_clip": 0.01306454, "auxiliary_loss_mlp": 0.0119322, "balance_loss_clip": 1.00714111, "balance_loss_mlp": 1.00019598, "epoch": 0.8604581254133349, "flos": 34930214637120.0, "grad_norm": 1.657306561292052, "language_loss": 0.63822103, "learning_rate": 2.006776923375082e-07, "loss": 0.66321778, "num_input_tokens_seen": 154866770, "step": 7156, "time_per_iteration": 2.929409980773926 }, { "auxiliary_loss_clip": 0.01348733, "auxiliary_loss_mlp": 0.01193058, "balance_loss_clip": 1.0080297, "balance_loss_mlp": 1.00013006, "epoch": 0.860578368303974, "flos": 22596547728960.0, "grad_norm": 1.5902525883017566, "language_loss": 0.71014583, "learning_rate": 2.003377385022764e-07, "loss": 0.73556376, "num_input_tokens_seen": 154885595, "step": 7157, "time_per_iteration": 2.7742843627929688 }, { "auxiliary_loss_clip": 0.01312133, "auxiliary_loss_mlp": 0.01193164, "balance_loss_clip": 1.0074594, "balance_loss_mlp": 1.00014043, "epoch": 0.8606986111946131, "flos": 21324341836800.0, "grad_norm": 1.8136549215885835, "language_loss": 0.77531242, "learning_rate": 1.9999805767437826e-07, "loss": 0.80036533, "num_input_tokens_seen": 154904485, "step": 7158, "time_per_iteration": 2.8645689487457275 }, { "auxiliary_loss_clip": 0.0131239, "auxiliary_loss_mlp": 0.01193114, "balance_loss_clip": 1.00674033, "balance_loss_mlp": 1.00018609, "epoch": 0.8608188540852522, "flos": 28877219621280.0, "grad_norm": 1.653121339642672, "language_loss": 0.7169081, "learning_rate": 1.9965864990534386e-07, "loss": 0.74196315, "num_input_tokens_seen": 154925010, "step": 7159, "time_per_iteration": 2.8175907135009766 }, { "auxiliary_loss_clip": 0.01299871, "auxiliary_loss_mlp": 0.01192978, "balance_loss_clip": 1.00709796, "balance_loss_mlp": 1.00014544, "epoch": 0.8609390969758913, "flos": 29716200747360.0, "grad_norm": 1.8626290082177965, "language_loss": 0.77236104, "learning_rate": 1.9931951524666092e-07, "loss": 0.79728949, "num_input_tokens_seen": 154946100, "step": 7160, "time_per_iteration": 3.7495412826538086 }, { "auxiliary_loss_clip": 0.01334608, "auxiliary_loss_mlp": 0.00872414, "balance_loss_clip": 1.00744748, "balance_loss_mlp": 1.00040948, "epoch": 0.8610593398665304, "flos": 21249359611200.0, "grad_norm": 1.5182598347186966, "language_loss": 0.81094831, "learning_rate": 1.9898065374977534e-07, "loss": 0.83301854, "num_input_tokens_seen": 154966305, "step": 7161, "time_per_iteration": 3.7171688079833984 }, { "auxiliary_loss_clip": 0.01280096, "auxiliary_loss_mlp": 0.01192762, "balance_loss_clip": 1.0070467, "balance_loss_mlp": 1.00011969, "epoch": 0.8611795827571694, "flos": 14830111074240.0, "grad_norm": 1.8387579776428498, "language_loss": 0.72610795, "learning_rate": 1.9864206546609342e-07, "loss": 0.75083661, "num_input_tokens_seen": 154985145, "step": 7162, "time_per_iteration": 3.7024447917938232 }, { "auxiliary_loss_clip": 0.01347983, "auxiliary_loss_mlp": 0.01193158, "balance_loss_clip": 1.00744367, "balance_loss_mlp": 1.00013423, "epoch": 0.8612998256478086, "flos": 24243269588640.0, "grad_norm": 2.063675908677525, "language_loss": 0.84137964, "learning_rate": 1.983037504469771e-07, "loss": 0.86679107, "num_input_tokens_seen": 155003855, "step": 7163, "time_per_iteration": 2.699903964996338 }, { "auxiliary_loss_clip": 0.01326619, "auxiliary_loss_mlp": 0.01193116, "balance_loss_clip": 1.00695825, "balance_loss_mlp": 1.00018787, "epoch": 0.8614200685384477, "flos": 21252664589760.0, "grad_norm": 1.6637778717652596, "language_loss": 0.66614991, "learning_rate": 1.9796570874374984e-07, "loss": 0.69134724, "num_input_tokens_seen": 155023960, "step": 7164, "time_per_iteration": 2.7750556468963623 }, { "auxiliary_loss_clip": 0.01315408, "auxiliary_loss_mlp": 0.01193158, "balance_loss_clip": 1.00767171, "balance_loss_mlp": 1.00013435, "epoch": 0.8615403114290867, "flos": 20007747485280.0, "grad_norm": 1.5908779392967154, "language_loss": 0.77519393, "learning_rate": 1.976279404076917e-07, "loss": 0.80027962, "num_input_tokens_seen": 155043360, "step": 7165, "time_per_iteration": 2.7501211166381836 }, { "auxiliary_loss_clip": 0.01281918, "auxiliary_loss_mlp": 0.01193169, "balance_loss_clip": 1.00648141, "balance_loss_mlp": 1.00014567, "epoch": 0.8616605543197259, "flos": 29789386788960.0, "grad_norm": 1.9643370518251475, "language_loss": 0.76057512, "learning_rate": 1.9729044549004193e-07, "loss": 0.785326, "num_input_tokens_seen": 155064745, "step": 7166, "time_per_iteration": 2.8996994495391846 }, { "auxiliary_loss_clip": 0.01325442, "auxiliary_loss_mlp": 0.01192992, "balance_loss_clip": 1.00720799, "balance_loss_mlp": 1.00015938, "epoch": 0.8617807972103649, "flos": 28911621297600.0, "grad_norm": 1.5334919973983432, "language_loss": 0.70364439, "learning_rate": 1.9695322404199822e-07, "loss": 0.72882867, "num_input_tokens_seen": 155086790, "step": 7167, "time_per_iteration": 2.8272528648376465 }, { "auxiliary_loss_clip": 0.01310939, "auxiliary_loss_mlp": 0.01193153, "balance_loss_clip": 1.00777161, "balance_loss_mlp": 1.00012922, "epoch": 0.861901040101004, "flos": 27673817081760.0, "grad_norm": 2.178334552046479, "language_loss": 0.82398868, "learning_rate": 1.9661627611471654e-07, "loss": 0.8490296, "num_input_tokens_seen": 155106585, "step": 7168, "time_per_iteration": 2.859924793243408 }, { "auxiliary_loss_clip": 0.01300034, "auxiliary_loss_mlp": 0.01193217, "balance_loss_clip": 1.00727153, "balance_loss_mlp": 1.00019312, "epoch": 0.8620212829916432, "flos": 49748075736480.0, "grad_norm": 1.813075445206514, "language_loss": 0.6994307, "learning_rate": 1.9627960175931246e-07, "loss": 0.72436321, "num_input_tokens_seen": 155131285, "step": 7169, "time_per_iteration": 3.136751651763916 }, { "auxiliary_loss_clip": 0.01327677, "auxiliary_loss_mlp": 0.01193153, "balance_loss_clip": 1.00744796, "balance_loss_mlp": 1.00012922, "epoch": 0.8621415258822822, "flos": 21138682456800.0, "grad_norm": 1.7356661244163643, "language_loss": 0.74221379, "learning_rate": 1.9594320102685847e-07, "loss": 0.76742208, "num_input_tokens_seen": 155150555, "step": 7170, "time_per_iteration": 2.7181692123413086 }, { "auxiliary_loss_clip": 0.01310678, "auxiliary_loss_mlp": 0.00872472, "balance_loss_clip": 1.00737643, "balance_loss_mlp": 1.00036311, "epoch": 0.8622617687729213, "flos": 21689050639680.0, "grad_norm": 1.9656628228926996, "language_loss": 0.63913238, "learning_rate": 1.956070739683864e-07, "loss": 0.66096389, "num_input_tokens_seen": 155169890, "step": 7171, "time_per_iteration": 2.818303346633911 }, { "auxiliary_loss_clip": 0.01299146, "auxiliary_loss_mlp": 0.01192967, "balance_loss_clip": 1.00716567, "balance_loss_mlp": 1.00013363, "epoch": 0.8623820116635604, "flos": 26250604951680.0, "grad_norm": 1.5446070837237147, "language_loss": 0.73847711, "learning_rate": 1.9527122063488678e-07, "loss": 0.76339823, "num_input_tokens_seen": 155191005, "step": 7172, "time_per_iteration": 2.832977533340454 }, { "auxiliary_loss_clip": 0.01318803, "auxiliary_loss_mlp": 0.01193082, "balance_loss_clip": 1.00700533, "balance_loss_mlp": 1.00015414, "epoch": 0.8625022545541995, "flos": 19647565066080.0, "grad_norm": 1.5241997999793104, "language_loss": 0.80151176, "learning_rate": 1.9493564107730755e-07, "loss": 0.82663059, "num_input_tokens_seen": 155211005, "step": 7173, "time_per_iteration": 2.8128280639648438 }, { "auxiliary_loss_clip": 0.01322738, "auxiliary_loss_mlp": 0.01192991, "balance_loss_clip": 1.00782514, "balance_loss_mlp": 1.00015831, "epoch": 0.8626224974448385, "flos": 21908393222400.0, "grad_norm": 1.7387848729604023, "language_loss": 0.60604763, "learning_rate": 1.9460033534655684e-07, "loss": 0.63120484, "num_input_tokens_seen": 155230365, "step": 7174, "time_per_iteration": 2.746507167816162 }, { "auxiliary_loss_clip": 0.01322673, "auxiliary_loss_mlp": 0.01192953, "balance_loss_clip": 1.00716901, "balance_loss_mlp": 1.00011992, "epoch": 0.8627427403354777, "flos": 23331210192000.0, "grad_norm": 1.6274321411185784, "language_loss": 0.83847106, "learning_rate": 1.9426530349349978e-07, "loss": 0.86362731, "num_input_tokens_seen": 155250815, "step": 7175, "time_per_iteration": 2.751678466796875 }, { "auxiliary_loss_clip": 0.01334217, "auxiliary_loss_mlp": 0.00872373, "balance_loss_clip": 1.00754094, "balance_loss_mlp": 1.00039363, "epoch": 0.8628629832261168, "flos": 16362886724640.0, "grad_norm": 1.7261901769480366, "language_loss": 0.64846742, "learning_rate": 1.9393054556896038e-07, "loss": 0.67053336, "num_input_tokens_seen": 155268515, "step": 7176, "time_per_iteration": 2.706644296646118 }, { "auxiliary_loss_clip": 0.0128673, "auxiliary_loss_mlp": 0.01192985, "balance_loss_clip": 1.00719547, "balance_loss_mlp": 1.00015187, "epoch": 0.8629832261167558, "flos": 28103952411360.0, "grad_norm": 2.2228306148699413, "language_loss": 0.69386196, "learning_rate": 1.9359606162372133e-07, "loss": 0.7186591, "num_input_tokens_seen": 155290120, "step": 7177, "time_per_iteration": 3.0433619022369385 }, { "auxiliary_loss_clip": 0.01347615, "auxiliary_loss_mlp": 0.01192972, "balance_loss_clip": 1.00786114, "balance_loss_mlp": 1.000139, "epoch": 0.863103469007395, "flos": 20230071733440.0, "grad_norm": 1.6203606930255858, "language_loss": 0.70875311, "learning_rate": 1.9326185170852293e-07, "loss": 0.73415905, "num_input_tokens_seen": 155309085, "step": 7178, "time_per_iteration": 2.786864995956421 }, { "auxiliary_loss_clip": 0.01335504, "auxiliary_loss_mlp": 0.01193213, "balance_loss_clip": 1.00788617, "balance_loss_mlp": 1.0001893, "epoch": 0.863223711898034, "flos": 24498558565920.0, "grad_norm": 1.7950003353473702, "language_loss": 0.72136605, "learning_rate": 1.9292791587406598e-07, "loss": 0.7466532, "num_input_tokens_seen": 155327945, "step": 7179, "time_per_iteration": 2.751068115234375 }, { "auxiliary_loss_clip": 0.01335529, "auxiliary_loss_mlp": 0.00872482, "balance_loss_clip": 1.00752878, "balance_loss_mlp": 1.00046229, "epoch": 0.8633439547886731, "flos": 17675385776640.0, "grad_norm": 3.600979141508382, "language_loss": 0.86906016, "learning_rate": 1.9259425417100661e-07, "loss": 0.89114028, "num_input_tokens_seen": 155344060, "step": 7180, "time_per_iteration": 2.744055986404419 }, { "auxiliary_loss_clip": 0.01282187, "auxiliary_loss_mlp": 0.01193199, "balance_loss_clip": 1.00696182, "balance_loss_mlp": 1.000175, "epoch": 0.8634641976793123, "flos": 12895063660800.0, "grad_norm": 2.2992725514939933, "language_loss": 0.74508953, "learning_rate": 1.9226086664996234e-07, "loss": 0.76984334, "num_input_tokens_seen": 155362305, "step": 7181, "time_per_iteration": 2.836923599243164 }, { "auxiliary_loss_clip": 0.01296623, "auxiliary_loss_mlp": 0.01193251, "balance_loss_clip": 1.00769353, "balance_loss_mlp": 1.00022733, "epoch": 0.8635844405699513, "flos": 23878991869920.0, "grad_norm": 1.774719434730148, "language_loss": 0.73972738, "learning_rate": 1.9192775336150712e-07, "loss": 0.76462609, "num_input_tokens_seen": 155382605, "step": 7182, "time_per_iteration": 2.793870449066162 }, { "auxiliary_loss_clip": 0.01294183, "auxiliary_loss_mlp": 0.01192293, "balance_loss_clip": 1.00314951, "balance_loss_mlp": 1.00003266, "epoch": 0.8637046834605904, "flos": 60453413709120.0, "grad_norm": 0.7621986470200145, "language_loss": 0.56342793, "learning_rate": 1.915949143561739e-07, "loss": 0.58829272, "num_input_tokens_seen": 155437280, "step": 7183, "time_per_iteration": 3.3217153549194336 }, { "auxiliary_loss_clip": 0.01326611, "auxiliary_loss_mlp": 0.01193081, "balance_loss_clip": 1.0077368, "balance_loss_mlp": 1.00015306, "epoch": 0.8638249263512295, "flos": 20558762294400.0, "grad_norm": 1.72847611679916, "language_loss": 0.78484684, "learning_rate": 1.9126234968445498e-07, "loss": 0.81004375, "num_input_tokens_seen": 155456970, "step": 7184, "time_per_iteration": 2.7981433868408203 }, { "auxiliary_loss_clip": 0.01348562, "auxiliary_loss_mlp": 0.01193093, "balance_loss_clip": 1.00794888, "balance_loss_mlp": 1.00016427, "epoch": 0.8639451692418686, "flos": 26615780762400.0, "grad_norm": 1.3689065256411668, "language_loss": 0.67818505, "learning_rate": 1.9093005939679884e-07, "loss": 0.7036016, "num_input_tokens_seen": 155478925, "step": 7185, "time_per_iteration": 2.813877820968628 }, { "auxiliary_loss_clip": 0.01325149, "auxiliary_loss_mlp": 0.01193087, "balance_loss_clip": 1.0072608, "balance_loss_mlp": 1.00015843, "epoch": 0.8640654121325076, "flos": 15122460080160.0, "grad_norm": 2.1621319706630135, "language_loss": 0.7657547, "learning_rate": 1.9059804354361452e-07, "loss": 0.79093707, "num_input_tokens_seen": 155496700, "step": 7186, "time_per_iteration": 3.6133909225463867 }, { "auxiliary_loss_clip": 0.01324409, "auxiliary_loss_mlp": 0.01193181, "balance_loss_clip": 1.00788164, "balance_loss_mlp": 1.000157, "epoch": 0.8641856550231467, "flos": 31869081948960.0, "grad_norm": 1.5618387674236653, "language_loss": 0.70394111, "learning_rate": 1.902663021752684e-07, "loss": 0.72911704, "num_input_tokens_seen": 155518130, "step": 7187, "time_per_iteration": 2.84519624710083 }, { "auxiliary_loss_clip": 0.01348795, "auxiliary_loss_mlp": 0.01193195, "balance_loss_clip": 1.00810361, "balance_loss_mlp": 1.00017095, "epoch": 0.8643058979137859, "flos": 14976555004800.0, "grad_norm": 2.603716342989882, "language_loss": 0.82108188, "learning_rate": 1.8993483534208556e-07, "loss": 0.84650177, "num_input_tokens_seen": 155537040, "step": 7188, "time_per_iteration": 3.6608288288116455 }, { "auxiliary_loss_clip": 0.01313353, "auxiliary_loss_mlp": 0.01193243, "balance_loss_clip": 1.00818205, "balance_loss_mlp": 1.00021935, "epoch": 0.8644261408044249, "flos": 13115735419680.0, "grad_norm": 2.530566196063271, "language_loss": 0.74896801, "learning_rate": 1.8960364309434884e-07, "loss": 0.77403396, "num_input_tokens_seen": 155554535, "step": 7189, "time_per_iteration": 5.021681785583496 }, { "auxiliary_loss_clip": 0.01253998, "auxiliary_loss_mlp": 0.00872405, "balance_loss_clip": 1.00640607, "balance_loss_mlp": 1.00046635, "epoch": 0.864546383695064, "flos": 20850931681920.0, "grad_norm": 1.6077008284600796, "language_loss": 0.7839334, "learning_rate": 1.8927272548229967e-07, "loss": 0.80519742, "num_input_tokens_seen": 155574225, "step": 7190, "time_per_iteration": 2.903751850128174 }, { "auxiliary_loss_clip": 0.01277018, "auxiliary_loss_mlp": 0.01193176, "balance_loss_clip": 1.00655794, "balance_loss_mlp": 1.00015259, "epoch": 0.8646666265857031, "flos": 21324593302560.0, "grad_norm": 2.4652103757430974, "language_loss": 0.83182836, "learning_rate": 1.8894208255613876e-07, "loss": 0.85653031, "num_input_tokens_seen": 155593540, "step": 7191, "time_per_iteration": 2.8550057411193848 }, { "auxiliary_loss_clip": 0.01348458, "auxiliary_loss_mlp": 0.01192996, "balance_loss_clip": 1.00789928, "balance_loss_mlp": 1.00016308, "epoch": 0.8647868694763422, "flos": 19750841942400.0, "grad_norm": 1.8205707524440822, "language_loss": 0.77506053, "learning_rate": 1.8861171436602397e-07, "loss": 0.80047506, "num_input_tokens_seen": 155610655, "step": 7192, "time_per_iteration": 2.653780698776245 }, { "auxiliary_loss_clip": 0.01329974, "auxiliary_loss_mlp": 0.01193221, "balance_loss_clip": 1.00760627, "balance_loss_mlp": 1.00019741, "epoch": 0.8649071123669813, "flos": 26176772283840.0, "grad_norm": 2.220799129474142, "language_loss": 0.79842567, "learning_rate": 1.882816209620719e-07, "loss": 0.82365763, "num_input_tokens_seen": 155627365, "step": 7193, "time_per_iteration": 2.781557559967041 }, { "auxiliary_loss_clip": 0.01312807, "auxiliary_loss_mlp": 0.01193145, "balance_loss_clip": 1.00840616, "balance_loss_mlp": 1.00021648, "epoch": 0.8650273552576204, "flos": 20302898538240.0, "grad_norm": 1.8363242019541608, "language_loss": 0.7663914, "learning_rate": 1.8795180239435738e-07, "loss": 0.79145098, "num_input_tokens_seen": 155646220, "step": 7194, "time_per_iteration": 2.747748374938965 }, { "auxiliary_loss_clip": 0.01316878, "auxiliary_loss_mlp": 0.01193182, "balance_loss_clip": 1.00753903, "balance_loss_mlp": 1.0002538, "epoch": 0.8651475981482595, "flos": 23951100201120.0, "grad_norm": 2.698571280446493, "language_loss": 0.76198626, "learning_rate": 1.8762225871291348e-07, "loss": 0.7870869, "num_input_tokens_seen": 155662095, "step": 7195, "time_per_iteration": 2.815847635269165 }, { "auxiliary_loss_clip": 0.01347909, "auxiliary_loss_mlp": 0.00872511, "balance_loss_clip": 1.0076412, "balance_loss_mlp": 1.00044179, "epoch": 0.8652678410388985, "flos": 21684632027040.0, "grad_norm": 1.6818452597921814, "language_loss": 0.8071326, "learning_rate": 1.8729298996773201e-07, "loss": 0.82933676, "num_input_tokens_seen": 155680845, "step": 7196, "time_per_iteration": 2.8545634746551514 }, { "auxiliary_loss_clip": 0.01294826, "auxiliary_loss_mlp": 0.01192266, "balance_loss_clip": 1.00371444, "balance_loss_mlp": 1.0000056, "epoch": 0.8653880839295377, "flos": 65224696266720.0, "grad_norm": 0.8346060040114264, "language_loss": 0.60934311, "learning_rate": 1.8696399620876301e-07, "loss": 0.63421404, "num_input_tokens_seen": 155737875, "step": 7197, "time_per_iteration": 3.2742717266082764 }, { "auxiliary_loss_clip": 0.01312322, "auxiliary_loss_mlp": 0.01193095, "balance_loss_clip": 1.00779641, "balance_loss_mlp": 1.00016701, "epoch": 0.8655083268201768, "flos": 17749182520800.0, "grad_norm": 2.152273529779656, "language_loss": 0.79339612, "learning_rate": 1.866352774859141e-07, "loss": 0.81845021, "num_input_tokens_seen": 155753100, "step": 7198, "time_per_iteration": 2.857724666595459 }, { "auxiliary_loss_clip": 0.01301077, "auxiliary_loss_mlp": 0.01193047, "balance_loss_clip": 1.00705886, "balance_loss_mlp": 1.00011861, "epoch": 0.8656285697108158, "flos": 20703984819840.0, "grad_norm": 2.233508013254648, "language_loss": 0.69291246, "learning_rate": 1.8630683384905188e-07, "loss": 0.71785367, "num_input_tokens_seen": 155772430, "step": 7199, "time_per_iteration": 2.7904205322265625 }, { "auxiliary_loss_clip": 0.01348007, "auxiliary_loss_mlp": 0.00872554, "balance_loss_clip": 1.00766921, "balance_loss_mlp": 1.00040913, "epoch": 0.865748812601455, "flos": 18653841639360.0, "grad_norm": 1.7730006126611817, "language_loss": 0.8867557, "learning_rate": 1.8597866534800045e-07, "loss": 0.9089613, "num_input_tokens_seen": 155787545, "step": 7200, "time_per_iteration": 2.7027487754821777 }, { "auxiliary_loss_clip": 0.01335965, "auxiliary_loss_mlp": 0.00872427, "balance_loss_clip": 1.00813317, "balance_loss_mlp": 1.0003556, "epoch": 0.865869055492094, "flos": 70652579130720.0, "grad_norm": 1.8370080469804289, "language_loss": 0.74387848, "learning_rate": 1.8565077203254398e-07, "loss": 0.76596248, "num_input_tokens_seen": 155813005, "step": 7201, "time_per_iteration": 3.1487464904785156 }, { "auxiliary_loss_clip": 0.01280268, "auxiliary_loss_mlp": 0.01193185, "balance_loss_clip": 1.00738537, "balance_loss_mlp": 1.00016117, "epoch": 0.8659892983827331, "flos": 17383970786400.0, "grad_norm": 3.154326830143186, "language_loss": 0.72571254, "learning_rate": 1.8532315395242203e-07, "loss": 0.75044703, "num_input_tokens_seen": 155829455, "step": 7202, "time_per_iteration": 2.742380142211914 }, { "auxiliary_loss_clip": 0.01293351, "auxiliary_loss_mlp": 0.01192998, "balance_loss_clip": 1.00647652, "balance_loss_mlp": 1.00016546, "epoch": 0.8661095412733723, "flos": 17895231290880.0, "grad_norm": 1.934840497801166, "language_loss": 0.72036779, "learning_rate": 1.849958111573353e-07, "loss": 0.74523127, "num_input_tokens_seen": 155848060, "step": 7203, "time_per_iteration": 2.8043484687805176 }, { "auxiliary_loss_clip": 0.01347068, "auxiliary_loss_mlp": 0.01193074, "balance_loss_clip": 1.00757027, "balance_loss_mlp": 1.00014591, "epoch": 0.8662297841640113, "flos": 18224173317600.0, "grad_norm": 1.7218992682564351, "language_loss": 0.64134663, "learning_rate": 1.8466874369694074e-07, "loss": 0.66674805, "num_input_tokens_seen": 155865755, "step": 7204, "time_per_iteration": 2.6703600883483887 }, { "auxiliary_loss_clip": 0.01311105, "auxiliary_loss_mlp": 0.01193102, "balance_loss_clip": 1.00773573, "balance_loss_mlp": 1.00017321, "epoch": 0.8663500270546504, "flos": 16362168251040.0, "grad_norm": 2.223154288975167, "language_loss": 0.70205343, "learning_rate": 1.843419516208542e-07, "loss": 0.72709554, "num_input_tokens_seen": 155882680, "step": 7205, "time_per_iteration": 2.78195858001709 }, { "auxiliary_loss_clip": 0.01324437, "auxiliary_loss_mlp": 0.01193207, "balance_loss_clip": 1.00785482, "balance_loss_mlp": 1.00018299, "epoch": 0.8664702699452895, "flos": 17894440969920.0, "grad_norm": 1.9981257666046832, "language_loss": 0.79771793, "learning_rate": 1.8401543497865047e-07, "loss": 0.82289439, "num_input_tokens_seen": 155900680, "step": 7206, "time_per_iteration": 2.6914241313934326 }, { "auxiliary_loss_clip": 0.01330501, "auxiliary_loss_mlp": 0.00872478, "balance_loss_clip": 1.00680888, "balance_loss_mlp": 1.00043726, "epoch": 0.8665905128359286, "flos": 30736386717120.0, "grad_norm": 2.0129194270375663, "language_loss": 0.637802, "learning_rate": 1.836891938198608e-07, "loss": 0.65983176, "num_input_tokens_seen": 155921105, "step": 7207, "time_per_iteration": 2.8344523906707764 }, { "auxiliary_loss_clip": 0.01312016, "auxiliary_loss_mlp": 0.01193004, "balance_loss_clip": 1.00790226, "balance_loss_mlp": 1.00017142, "epoch": 0.8667107557265676, "flos": 18656428144320.0, "grad_norm": 2.1628850351048823, "language_loss": 0.71037871, "learning_rate": 1.8336322819397677e-07, "loss": 0.73542893, "num_input_tokens_seen": 155938640, "step": 7208, "time_per_iteration": 2.7154624462127686 }, { "auxiliary_loss_clip": 0.01307497, "auxiliary_loss_mlp": 0.01193261, "balance_loss_clip": 1.0071516, "balance_loss_mlp": 1.00023746, "epoch": 0.8668309986172068, "flos": 20083735573920.0, "grad_norm": 1.7395541210769476, "language_loss": 0.62530386, "learning_rate": 1.8303753815044654e-07, "loss": 0.65031147, "num_input_tokens_seen": 155957945, "step": 7209, "time_per_iteration": 2.802504062652588 }, { "auxiliary_loss_clip": 0.01324311, "auxiliary_loss_mlp": 0.01193133, "balance_loss_clip": 1.00784743, "balance_loss_mlp": 1.00020528, "epoch": 0.8669512415078459, "flos": 21615110200800.0, "grad_norm": 4.29292408500298, "language_loss": 0.70670456, "learning_rate": 1.827121237386773e-07, "loss": 0.731879, "num_input_tokens_seen": 155975390, "step": 7210, "time_per_iteration": 2.8607966899871826 }, { "auxiliary_loss_clip": 0.01307732, "auxiliary_loss_mlp": 0.01193174, "balance_loss_clip": 1.00689328, "balance_loss_mlp": 1.00024569, "epoch": 0.8670714843984849, "flos": 17703608580000.0, "grad_norm": 2.148717680399221, "language_loss": 0.7497704, "learning_rate": 1.8238698500803374e-07, "loss": 0.77477944, "num_input_tokens_seen": 155988155, "step": 7211, "time_per_iteration": 2.733391761779785 }, { "auxiliary_loss_clip": 0.01304057, "auxiliary_loss_mlp": 0.01192276, "balance_loss_clip": 1.00353003, "balance_loss_mlp": 1.0000149, "epoch": 0.8671917272891241, "flos": 60705518688000.0, "grad_norm": 0.7149442758523716, "language_loss": 0.56288981, "learning_rate": 1.820621220078391e-07, "loss": 0.58785313, "num_input_tokens_seen": 156052065, "step": 7212, "time_per_iteration": 4.166049957275391 }, { "auxiliary_loss_clip": 0.01347834, "auxiliary_loss_mlp": 0.0119316, "balance_loss_clip": 1.00773621, "balance_loss_mlp": 1.00013638, "epoch": 0.8673119701797631, "flos": 20451893050080.0, "grad_norm": 1.7010760561910878, "language_loss": 0.67716682, "learning_rate": 1.8173753478737553e-07, "loss": 0.7025767, "num_input_tokens_seen": 156072500, "step": 7213, "time_per_iteration": 2.697641372680664 }, { "auxiliary_loss_clip": 0.01348372, "auxiliary_loss_mlp": 0.01193167, "balance_loss_clip": 1.00794041, "balance_loss_mlp": 1.00014293, "epoch": 0.8674322130704022, "flos": 19647421371360.0, "grad_norm": 1.9886997812822438, "language_loss": 0.79686052, "learning_rate": 1.8141322339588205e-07, "loss": 0.82227588, "num_input_tokens_seen": 156089840, "step": 7214, "time_per_iteration": 3.597181797027588 }, { "auxiliary_loss_clip": 0.01348108, "auxiliary_loss_mlp": 0.01193178, "balance_loss_clip": 1.00817788, "balance_loss_mlp": 1.00015426, "epoch": 0.8675524559610414, "flos": 26025011648640.0, "grad_norm": 2.6261254644955665, "language_loss": 0.70153397, "learning_rate": 1.810891878825569e-07, "loss": 0.72694683, "num_input_tokens_seen": 156109815, "step": 7215, "time_per_iteration": 3.6906917095184326 }, { "auxiliary_loss_clip": 0.01314461, "auxiliary_loss_mlp": 0.01193168, "balance_loss_clip": 1.00742018, "balance_loss_mlp": 1.00014448, "epoch": 0.8676726988516804, "flos": 15049453656960.0, "grad_norm": 2.1257157951332064, "language_loss": 0.71244955, "learning_rate": 1.8076542829655561e-07, "loss": 0.73752576, "num_input_tokens_seen": 156128620, "step": 7216, "time_per_iteration": 2.7180771827697754 }, { "auxiliary_loss_clip": 0.01303713, "auxiliary_loss_mlp": 0.01193204, "balance_loss_clip": 1.00755012, "balance_loss_mlp": 1.0001806, "epoch": 0.8677929417423195, "flos": 16288120041120.0, "grad_norm": 2.082863791177375, "language_loss": 0.79018712, "learning_rate": 1.8044194468699203e-07, "loss": 0.81515634, "num_input_tokens_seen": 156145930, "step": 7217, "time_per_iteration": 2.82075572013855 }, { "auxiliary_loss_clip": 0.01301163, "auxiliary_loss_mlp": 0.01193092, "balance_loss_clip": 1.0071696, "balance_loss_mlp": 1.00016403, "epoch": 0.8679131846329585, "flos": 18844171097760.0, "grad_norm": 2.1882717023191143, "language_loss": 0.75742686, "learning_rate": 1.8011873710293912e-07, "loss": 0.78236938, "num_input_tokens_seen": 156164435, "step": 7218, "time_per_iteration": 2.8687212467193604 }, { "auxiliary_loss_clip": 0.01323501, "auxiliary_loss_mlp": 0.01193232, "balance_loss_clip": 1.0079987, "balance_loss_mlp": 1.0002079, "epoch": 0.8680334275235977, "flos": 33620733174240.0, "grad_norm": 1.722185684351129, "language_loss": 0.69499266, "learning_rate": 1.7979580559342677e-07, "loss": 0.72016001, "num_input_tokens_seen": 156185165, "step": 7219, "time_per_iteration": 2.792182207107544 }, { "auxiliary_loss_clip": 0.01310307, "auxiliary_loss_mlp": 0.01193191, "balance_loss_clip": 1.00783086, "balance_loss_mlp": 1.00016737, "epoch": 0.8681536704142367, "flos": 24681164433120.0, "grad_norm": 1.5387445498042633, "language_loss": 0.66603756, "learning_rate": 1.7947315020744358e-07, "loss": 0.69107252, "num_input_tokens_seen": 156206260, "step": 7220, "time_per_iteration": 2.845686197280884 }, { "auxiliary_loss_clip": 0.01311509, "auxiliary_loss_mlp": 0.01193063, "balance_loss_clip": 1.00736403, "balance_loss_mlp": 1.00013471, "epoch": 0.8682739133048758, "flos": 20011052463840.0, "grad_norm": 1.788265649017377, "language_loss": 0.8050195, "learning_rate": 1.7915077099393594e-07, "loss": 0.83006525, "num_input_tokens_seen": 156222860, "step": 7221, "time_per_iteration": 2.7200639247894287 }, { "auxiliary_loss_clip": 0.01336699, "auxiliary_loss_mlp": 0.01193195, "balance_loss_clip": 1.00836825, "balance_loss_mlp": 1.00017142, "epoch": 0.868394156195515, "flos": 16654768722720.0, "grad_norm": 2.180752234642303, "language_loss": 0.73236167, "learning_rate": 1.788286680018083e-07, "loss": 0.75766057, "num_input_tokens_seen": 156241570, "step": 7222, "time_per_iteration": 2.7924959659576416 }, { "auxiliary_loss_clip": 0.01314624, "auxiliary_loss_mlp": 0.01193142, "balance_loss_clip": 1.00748158, "balance_loss_mlp": 1.00021398, "epoch": 0.868514399086154, "flos": 28001394008640.0, "grad_norm": 1.4780841462308578, "language_loss": 0.72049952, "learning_rate": 1.7850684127992443e-07, "loss": 0.7455771, "num_input_tokens_seen": 156261315, "step": 7223, "time_per_iteration": 2.8004183769226074 }, { "auxiliary_loss_clip": 0.01288, "auxiliary_loss_mlp": 0.01193081, "balance_loss_clip": 1.00690484, "balance_loss_mlp": 1.00015247, "epoch": 0.8686346419767931, "flos": 20084597742240.0, "grad_norm": 1.7447863570937803, "language_loss": 0.70226365, "learning_rate": 1.7818529087710378e-07, "loss": 0.7270745, "num_input_tokens_seen": 156281670, "step": 7224, "time_per_iteration": 2.8261189460754395 }, { "auxiliary_loss_clip": 0.0132634, "auxiliary_loss_mlp": 0.00872457, "balance_loss_clip": 1.00697327, "balance_loss_mlp": 1.00027573, "epoch": 0.8687548848674322, "flos": 18223526691360.0, "grad_norm": 1.8573899028192669, "language_loss": 0.83903253, "learning_rate": 1.7786401684212637e-07, "loss": 0.86102057, "num_input_tokens_seen": 156300500, "step": 7225, "time_per_iteration": 2.7233715057373047 }, { "auxiliary_loss_clip": 0.01248956, "auxiliary_loss_mlp": 0.01192261, "balance_loss_clip": 1.00405061, "balance_loss_mlp": 1.0, "epoch": 0.8688751277580713, "flos": 70457916116160.0, "grad_norm": 0.7372997957101698, "language_loss": 0.55963582, "learning_rate": 1.7754301922372883e-07, "loss": 0.58404797, "num_input_tokens_seen": 156350145, "step": 7226, "time_per_iteration": 3.188694477081299 }, { "auxiliary_loss_clip": 0.0126551, "auxiliary_loss_mlp": 0.01193169, "balance_loss_clip": 1.00696754, "balance_loss_mlp": 1.0001452, "epoch": 0.8689953706487104, "flos": 26906800592160.0, "grad_norm": 1.9943935242448598, "language_loss": 0.81161296, "learning_rate": 1.7722229807060617e-07, "loss": 0.8361997, "num_input_tokens_seen": 156368725, "step": 7227, "time_per_iteration": 2.944833993911743 }, { "auxiliary_loss_clip": 0.01298819, "auxiliary_loss_mlp": 0.01193062, "balance_loss_clip": 1.00763035, "balance_loss_mlp": 1.00013411, "epoch": 0.8691156135393495, "flos": 34637398623360.0, "grad_norm": 2.1448468327738777, "language_loss": 0.81818175, "learning_rate": 1.7690185343141172e-07, "loss": 0.84310049, "num_input_tokens_seen": 156388640, "step": 7228, "time_per_iteration": 2.9174063205718994 }, { "auxiliary_loss_clip": 0.0131715, "auxiliary_loss_mlp": 0.01192963, "balance_loss_clip": 1.00739002, "balance_loss_mlp": 1.00012958, "epoch": 0.8692358564299886, "flos": 18989824707360.0, "grad_norm": 1.939507005335603, "language_loss": 0.69673789, "learning_rate": 1.7658168535475615e-07, "loss": 0.72183901, "num_input_tokens_seen": 156406425, "step": 7229, "time_per_iteration": 2.7623536586761475 }, { "auxiliary_loss_clip": 0.01313122, "auxiliary_loss_mlp": 0.01193144, "balance_loss_clip": 1.00743616, "balance_loss_mlp": 1.00012064, "epoch": 0.8693560993206276, "flos": 30370851669600.0, "grad_norm": 1.4842361623451756, "language_loss": 0.64171636, "learning_rate": 1.7626179388920948e-07, "loss": 0.66677904, "num_input_tokens_seen": 156427705, "step": 7230, "time_per_iteration": 2.7852463722229004 }, { "auxiliary_loss_clip": 0.01306693, "auxiliary_loss_mlp": 0.00872403, "balance_loss_clip": 1.00701785, "balance_loss_mlp": 1.00032115, "epoch": 0.8694763422112668, "flos": 27200442850560.0, "grad_norm": 1.847845004481655, "language_loss": 0.80384898, "learning_rate": 1.7594217908329866e-07, "loss": 0.8256399, "num_input_tokens_seen": 156449890, "step": 7231, "time_per_iteration": 2.872817039489746 }, { "auxiliary_loss_clip": 0.01312778, "auxiliary_loss_mlp": 0.0119299, "balance_loss_clip": 1.00761104, "balance_loss_mlp": 1.00015736, "epoch": 0.8695965851019059, "flos": 26139173400000.0, "grad_norm": 1.9547237767316676, "language_loss": 0.73789132, "learning_rate": 1.7562284098550895e-07, "loss": 0.76294899, "num_input_tokens_seen": 156469600, "step": 7232, "time_per_iteration": 2.7835490703582764 }, { "auxiliary_loss_clip": 0.01271161, "auxiliary_loss_mlp": 0.01192271, "balance_loss_clip": 1.00386381, "balance_loss_mlp": 1.00001037, "epoch": 0.8697168279925449, "flos": 67332650153760.0, "grad_norm": 0.8359228885192218, "language_loss": 0.62358809, "learning_rate": 1.753037796442838e-07, "loss": 0.64822233, "num_input_tokens_seen": 156529040, "step": 7233, "time_per_iteration": 3.2808451652526855 }, { "auxiliary_loss_clip": 0.01347706, "auxiliary_loss_mlp": 0.01193124, "balance_loss_clip": 1.00746655, "balance_loss_mlp": 1.00019598, "epoch": 0.8698370708831841, "flos": 19718703457920.0, "grad_norm": 2.0199993848270528, "language_loss": 0.7518574, "learning_rate": 1.74984995108024e-07, "loss": 0.77726567, "num_input_tokens_seen": 156546970, "step": 7234, "time_per_iteration": 2.7288858890533447 }, { "auxiliary_loss_clip": 0.01327269, "auxiliary_loss_mlp": 0.01193105, "balance_loss_clip": 1.00732851, "balance_loss_mlp": 1.00017691, "epoch": 0.8699573137738231, "flos": 12859979434560.0, "grad_norm": 1.8368430223570278, "language_loss": 0.83373481, "learning_rate": 1.7466648742508981e-07, "loss": 0.85893857, "num_input_tokens_seen": 156563155, "step": 7235, "time_per_iteration": 2.668755292892456 }, { "auxiliary_loss_clip": 0.01302404, "auxiliary_loss_mlp": 0.01193046, "balance_loss_clip": 1.00691628, "balance_loss_mlp": 1.00011718, "epoch": 0.8700775566644622, "flos": 17420743425600.0, "grad_norm": 1.7528542185064977, "language_loss": 0.84520364, "learning_rate": 1.7434825664379837e-07, "loss": 0.87015808, "num_input_tokens_seen": 156581660, "step": 7236, "time_per_iteration": 2.76487398147583 }, { "auxiliary_loss_clip": 0.01324657, "auxiliary_loss_mlp": 0.01193209, "balance_loss_clip": 1.00729692, "balance_loss_mlp": 1.00018501, "epoch": 0.8701977995551013, "flos": 13735230268320.0, "grad_norm": 2.548715602580556, "language_loss": 0.86403, "learning_rate": 1.740303028124246e-07, "loss": 0.88920861, "num_input_tokens_seen": 156597720, "step": 7237, "time_per_iteration": 2.6778950691223145 }, { "auxiliary_loss_clip": 0.01269641, "auxiliary_loss_mlp": 0.0119307, "balance_loss_clip": 1.00755358, "balance_loss_mlp": 1.00014162, "epoch": 0.8703180424457404, "flos": 30555720728640.0, "grad_norm": 1.7734211197942968, "language_loss": 0.75470567, "learning_rate": 1.7371262597920212e-07, "loss": 0.77933276, "num_input_tokens_seen": 156619780, "step": 7238, "time_per_iteration": 3.8120477199554443 }, { "auxiliary_loss_clip": 0.01267192, "auxiliary_loss_mlp": 0.01192966, "balance_loss_clip": 1.00665748, "balance_loss_mlp": 1.0001334, "epoch": 0.8704382853363795, "flos": 19608996242880.0, "grad_norm": 2.008224409492657, "language_loss": 0.76427966, "learning_rate": 1.7339522619232195e-07, "loss": 0.78888118, "num_input_tokens_seen": 156638160, "step": 7239, "time_per_iteration": 2.820521831512451 }, { "auxiliary_loss_clip": 0.01324051, "auxiliary_loss_mlp": 0.01193062, "balance_loss_clip": 1.00787389, "balance_loss_mlp": 1.00013351, "epoch": 0.8705585282270186, "flos": 26613912731040.0, "grad_norm": 1.8381390897156384, "language_loss": 0.75582707, "learning_rate": 1.730781034999338e-07, "loss": 0.78099823, "num_input_tokens_seen": 156659740, "step": 7240, "time_per_iteration": 3.7161877155303955 }, { "auxiliary_loss_clip": 0.01347008, "auxiliary_loss_mlp": 0.01193182, "balance_loss_clip": 1.00840783, "balance_loss_mlp": 1.00015807, "epoch": 0.8706787711176577, "flos": 34090479113760.0, "grad_norm": 2.078154435471116, "language_loss": 0.73284, "learning_rate": 1.7276125795014497e-07, "loss": 0.75824189, "num_input_tokens_seen": 156678190, "step": 7241, "time_per_iteration": 4.647356271743774 }, { "auxiliary_loss_clip": 0.01324958, "auxiliary_loss_mlp": 0.01193199, "balance_loss_clip": 1.00804651, "balance_loss_mlp": 1.00017595, "epoch": 0.8707990140082967, "flos": 14611522888800.0, "grad_norm": 2.1689150966758697, "language_loss": 0.67677391, "learning_rate": 1.7244468959102054e-07, "loss": 0.70195544, "num_input_tokens_seen": 156695245, "step": 7242, "time_per_iteration": 2.710094451904297 }, { "auxiliary_loss_clip": 0.01326018, "auxiliary_loss_mlp": 0.0119322, "balance_loss_clip": 1.00726223, "balance_loss_mlp": 1.00019646, "epoch": 0.8709192568989359, "flos": 20084166658080.0, "grad_norm": 1.9848986583020607, "language_loss": 0.85162252, "learning_rate": 1.7212839847058348e-07, "loss": 0.87681496, "num_input_tokens_seen": 156710375, "step": 7243, "time_per_iteration": 2.6558523178100586 }, { "auxiliary_loss_clip": 0.0125318, "auxiliary_loss_mlp": 0.01192987, "balance_loss_clip": 1.0071131, "balance_loss_mlp": 1.00015426, "epoch": 0.871039499789575, "flos": 16727092596000.0, "grad_norm": 1.8069939454246093, "language_loss": 0.73870003, "learning_rate": 1.718123846368147e-07, "loss": 0.76316166, "num_input_tokens_seen": 156729420, "step": 7244, "time_per_iteration": 2.865694284439087 }, { "auxiliary_loss_clip": 0.0130706, "auxiliary_loss_mlp": 0.00872396, "balance_loss_clip": 1.00670123, "balance_loss_mlp": 1.00031161, "epoch": 0.871159742680214, "flos": 21068801393760.0, "grad_norm": 1.6946564044845815, "language_loss": 0.71775734, "learning_rate": 1.714966481376543e-07, "loss": 0.7395519, "num_input_tokens_seen": 156746100, "step": 7245, "time_per_iteration": 2.769991397857666 }, { "auxiliary_loss_clip": 0.01324715, "auxiliary_loss_mlp": 0.01193065, "balance_loss_clip": 1.00673401, "balance_loss_mlp": 1.00013685, "epoch": 0.8712799855708532, "flos": 28256539291200.0, "grad_norm": 1.7630213599233553, "language_loss": 0.82883924, "learning_rate": 1.7118118902099797e-07, "loss": 0.85401708, "num_input_tokens_seen": 156764185, "step": 7246, "time_per_iteration": 2.6915409564971924 }, { "auxiliary_loss_clip": 0.01329747, "auxiliary_loss_mlp": 0.01192998, "balance_loss_clip": 1.00712407, "balance_loss_mlp": 1.00016463, "epoch": 0.8714002284614922, "flos": 22236688622880.0, "grad_norm": 1.585068447861504, "language_loss": 0.80614269, "learning_rate": 1.7086600733470146e-07, "loss": 0.83137012, "num_input_tokens_seen": 156784855, "step": 7247, "time_per_iteration": 2.785215377807617 }, { "auxiliary_loss_clip": 0.01324001, "auxiliary_loss_mlp": 0.01193051, "balance_loss_clip": 1.00721335, "balance_loss_mlp": 1.00012302, "epoch": 0.8715204713521313, "flos": 21431929554720.0, "grad_norm": 1.7092761195728319, "language_loss": 0.76906568, "learning_rate": 1.7055110312657738e-07, "loss": 0.79423618, "num_input_tokens_seen": 156804350, "step": 7248, "time_per_iteration": 2.7146925926208496 }, { "auxiliary_loss_clip": 0.01314506, "auxiliary_loss_mlp": 0.01193125, "balance_loss_clip": 1.00773084, "balance_loss_mlp": 1.0001967, "epoch": 0.8716407142427703, "flos": 23440450399200.0, "grad_norm": 2.0062300666470385, "language_loss": 0.73744237, "learning_rate": 1.702364764443962e-07, "loss": 0.76251864, "num_input_tokens_seen": 156823425, "step": 7249, "time_per_iteration": 2.816884756088257 }, { "auxiliary_loss_clip": 0.01263917, "auxiliary_loss_mlp": 0.01193137, "balance_loss_clip": 1.00690317, "balance_loss_mlp": 1.00020897, "epoch": 0.8717609571334095, "flos": 27958693962240.0, "grad_norm": 2.1684422635462184, "language_loss": 0.72481799, "learning_rate": 1.6992212733588685e-07, "loss": 0.74938852, "num_input_tokens_seen": 156843090, "step": 7250, "time_per_iteration": 2.8712995052337646 }, { "auxiliary_loss_clip": 0.01323684, "auxiliary_loss_mlp": 0.01193221, "balance_loss_clip": 1.00835729, "balance_loss_mlp": 1.00019789, "epoch": 0.8718812000240486, "flos": 25479493162560.0, "grad_norm": 1.6818131983439595, "language_loss": 0.74821079, "learning_rate": 1.6960805584873538e-07, "loss": 0.7733798, "num_input_tokens_seen": 156861090, "step": 7251, "time_per_iteration": 2.752079963684082 }, { "auxiliary_loss_clip": 0.01275777, "auxiliary_loss_mlp": 0.01193078, "balance_loss_clip": 1.00623488, "balance_loss_mlp": 1.00024486, "epoch": 0.8720014429146876, "flos": 23403067057440.0, "grad_norm": 1.684210652032526, "language_loss": 0.77950615, "learning_rate": 1.6929426203058684e-07, "loss": 0.80419469, "num_input_tokens_seen": 156881515, "step": 7252, "time_per_iteration": 2.9314417839050293 }, { "auxiliary_loss_clip": 0.01349071, "auxiliary_loss_mlp": 0.00872587, "balance_loss_clip": 1.00756061, "balance_loss_mlp": 1.00044012, "epoch": 0.8721216858053268, "flos": 24352833108960.0, "grad_norm": 2.0022926365629017, "language_loss": 0.79908192, "learning_rate": 1.689807459290431e-07, "loss": 0.8212986, "num_input_tokens_seen": 156900170, "step": 7253, "time_per_iteration": 2.7449867725372314 }, { "auxiliary_loss_clip": 0.01309873, "auxiliary_loss_mlp": 0.01193194, "balance_loss_clip": 1.00764084, "balance_loss_mlp": 1.00017071, "epoch": 0.8722419286959658, "flos": 33869699583840.0, "grad_norm": 1.9722981956457968, "language_loss": 0.70983219, "learning_rate": 1.6866750759166437e-07, "loss": 0.73486292, "num_input_tokens_seen": 156920150, "step": 7254, "time_per_iteration": 2.898608446121216 }, { "auxiliary_loss_clip": 0.0130292, "auxiliary_loss_mlp": 0.01193144, "balance_loss_clip": 1.00709379, "balance_loss_mlp": 1.00021577, "epoch": 0.8723621715866049, "flos": 18369395843040.0, "grad_norm": 2.6510863921766865, "language_loss": 0.77241141, "learning_rate": 1.6835454706596865e-07, "loss": 0.7973721, "num_input_tokens_seen": 156937980, "step": 7255, "time_per_iteration": 2.753772020339966 }, { "auxiliary_loss_clip": 0.01347898, "auxiliary_loss_mlp": 0.01193218, "balance_loss_clip": 1.00806653, "balance_loss_mlp": 1.00019407, "epoch": 0.8724824144772441, "flos": 22013358511680.0, "grad_norm": 1.7709334413019773, "language_loss": 0.73761499, "learning_rate": 1.680418643994317e-07, "loss": 0.76302618, "num_input_tokens_seen": 156956550, "step": 7256, "time_per_iteration": 2.703627347946167 }, { "auxiliary_loss_clip": 0.01315861, "auxiliary_loss_mlp": 0.01192276, "balance_loss_clip": 1.00356424, "balance_loss_mlp": 1.00001562, "epoch": 0.8726026573678831, "flos": 66698749909440.0, "grad_norm": 0.8849673559705827, "language_loss": 0.6457516, "learning_rate": 1.6772945963948738e-07, "loss": 0.67083299, "num_input_tokens_seen": 157014715, "step": 7257, "time_per_iteration": 3.274528980255127 }, { "auxiliary_loss_clip": 0.01301563, "auxiliary_loss_mlp": 0.01193241, "balance_loss_clip": 1.00701702, "balance_loss_mlp": 1.00021696, "epoch": 0.8727229002585222, "flos": 13370916625920.0, "grad_norm": 8.847376046432208, "language_loss": 0.77246833, "learning_rate": 1.6741733283352733e-07, "loss": 0.79741639, "num_input_tokens_seen": 157032320, "step": 7258, "time_per_iteration": 2.7428488731384277 }, { "auxiliary_loss_clip": 0.01254384, "auxiliary_loss_mlp": 0.01193184, "balance_loss_clip": 1.00633335, "balance_loss_mlp": 1.00016057, "epoch": 0.8728431431491613, "flos": 21796997594400.0, "grad_norm": 3.6033223174250373, "language_loss": 0.83741403, "learning_rate": 1.6710548402890102e-07, "loss": 0.86188966, "num_input_tokens_seen": 157052845, "step": 7259, "time_per_iteration": 2.8308358192443848 }, { "auxiliary_loss_clip": 0.0135001, "auxiliary_loss_mlp": 0.01193214, "balance_loss_clip": 1.00816226, "balance_loss_mlp": 1.00019073, "epoch": 0.8729633860398004, "flos": 36173838489120.0, "grad_norm": 1.579211158147833, "language_loss": 0.66718656, "learning_rate": 1.6679391327291527e-07, "loss": 0.69261873, "num_input_tokens_seen": 157074050, "step": 7260, "time_per_iteration": 2.752610921859741 }, { "auxiliary_loss_clip": 0.01323777, "auxiliary_loss_mlp": 0.01193086, "balance_loss_clip": 1.00770843, "balance_loss_mlp": 1.00015807, "epoch": 0.8730836289304394, "flos": 16359689517120.0, "grad_norm": 2.6248714336785786, "language_loss": 0.67948127, "learning_rate": 1.6648262061283492e-07, "loss": 0.70464993, "num_input_tokens_seen": 157089350, "step": 7261, "time_per_iteration": 2.7368359565734863 }, { "auxiliary_loss_clip": 0.01301102, "auxiliary_loss_mlp": 0.01193171, "balance_loss_clip": 1.00676966, "balance_loss_mlp": 1.0001471, "epoch": 0.8732038718210786, "flos": 21215137553280.0, "grad_norm": 3.239787410162605, "language_loss": 0.73549527, "learning_rate": 1.6617160609588353e-07, "loss": 0.76043797, "num_input_tokens_seen": 157108525, "step": 7262, "time_per_iteration": 2.776904821395874 }, { "auxiliary_loss_clip": 0.01298154, "auxiliary_loss_mlp": 0.01193185, "balance_loss_clip": 1.00744653, "balance_loss_mlp": 1.00016165, "epoch": 0.8733241147117177, "flos": 16610703576480.0, "grad_norm": 2.0589568717168536, "language_loss": 0.71962976, "learning_rate": 1.6586086976924163e-07, "loss": 0.74454314, "num_input_tokens_seen": 157124025, "step": 7263, "time_per_iteration": 2.762094736099243 }, { "auxiliary_loss_clip": 0.0133547, "auxiliary_loss_mlp": 0.01193191, "balance_loss_clip": 1.00782156, "balance_loss_mlp": 1.00016689, "epoch": 0.8734443576023567, "flos": 20193945720480.0, "grad_norm": 1.732033979883326, "language_loss": 0.78184807, "learning_rate": 1.6555041168004747e-07, "loss": 0.80713463, "num_input_tokens_seen": 157143345, "step": 7264, "time_per_iteration": 3.6585187911987305 }, { "auxiliary_loss_clip": 0.0130977, "auxiliary_loss_mlp": 0.01192965, "balance_loss_clip": 1.00720918, "balance_loss_mlp": 1.00013185, "epoch": 0.8735646004929959, "flos": 18041172289920.0, "grad_norm": 1.7681617461486154, "language_loss": 0.69081295, "learning_rate": 1.6524023187539715e-07, "loss": 0.71584022, "num_input_tokens_seen": 157161630, "step": 7265, "time_per_iteration": 2.7472805976867676 }, { "auxiliary_loss_clip": 0.01315184, "auxiliary_loss_mlp": 0.0119315, "balance_loss_clip": 1.00750268, "balance_loss_mlp": 1.00012624, "epoch": 0.873684843383635, "flos": 20262353912640.0, "grad_norm": 1.8294424960115903, "language_loss": 0.75105464, "learning_rate": 1.649303304023446e-07, "loss": 0.77613795, "num_input_tokens_seen": 157181385, "step": 7266, "time_per_iteration": 3.841170310974121 }, { "auxiliary_loss_clip": 0.01277907, "auxiliary_loss_mlp": 0.01193139, "balance_loss_clip": 1.00636303, "balance_loss_mlp": 1.00011539, "epoch": 0.873805086274274, "flos": 16947297347040.0, "grad_norm": 1.589099056353662, "language_loss": 0.7863971, "learning_rate": 1.6462070730790246e-07, "loss": 0.81110764, "num_input_tokens_seen": 157200545, "step": 7267, "time_per_iteration": 4.778167009353638 }, { "auxiliary_loss_clip": 0.01323064, "auxiliary_loss_mlp": 0.01193111, "balance_loss_clip": 1.00773835, "balance_loss_mlp": 1.00018311, "epoch": 0.8739253291649132, "flos": 18041280060960.0, "grad_norm": 2.1386152064417323, "language_loss": 0.78435719, "learning_rate": 1.6431136263903912e-07, "loss": 0.80951899, "num_input_tokens_seen": 157219545, "step": 7268, "time_per_iteration": 2.7907803058624268 }, { "auxiliary_loss_clip": 0.01335767, "auxiliary_loss_mlp": 0.00872488, "balance_loss_clip": 1.00770283, "balance_loss_mlp": 1.0004046, "epoch": 0.8740455720555522, "flos": 21325096234080.0, "grad_norm": 1.7990055359189745, "language_loss": 0.73238719, "learning_rate": 1.6400229644268282e-07, "loss": 0.75446975, "num_input_tokens_seen": 157237900, "step": 7269, "time_per_iteration": 2.7342700958251953 }, { "auxiliary_loss_clip": 0.01275815, "auxiliary_loss_mlp": 0.01193183, "balance_loss_clip": 1.0073514, "balance_loss_mlp": 1.0001595, "epoch": 0.8741658149461913, "flos": 15158693864160.0, "grad_norm": 1.9644580231711437, "language_loss": 0.81047523, "learning_rate": 1.6369350876571852e-07, "loss": 0.83516526, "num_input_tokens_seen": 157256055, "step": 7270, "time_per_iteration": 2.7506442070007324 }, { "auxiliary_loss_clip": 0.01279868, "auxiliary_loss_mlp": 0.01193191, "balance_loss_clip": 1.00742722, "balance_loss_mlp": 1.00016713, "epoch": 0.8742860578368304, "flos": 23039867049120.0, "grad_norm": 3.705760862947914, "language_loss": 0.817101, "learning_rate": 1.6338499965498874e-07, "loss": 0.84183156, "num_input_tokens_seen": 157274785, "step": 7271, "time_per_iteration": 2.876481771469116 }, { "auxiliary_loss_clip": 0.01301884, "auxiliary_loss_mlp": 0.01193143, "balance_loss_clip": 1.00801766, "balance_loss_mlp": 1.00021505, "epoch": 0.8744063007274695, "flos": 28145359205280.0, "grad_norm": 1.5020379655898144, "language_loss": 0.77239507, "learning_rate": 1.630767691572943e-07, "loss": 0.79734534, "num_input_tokens_seen": 157294805, "step": 7272, "time_per_iteration": 2.943180561065674 }, { "auxiliary_loss_clip": 0.01286987, "auxiliary_loss_mlp": 0.01192283, "balance_loss_clip": 1.0035522, "balance_loss_mlp": 1.00002182, "epoch": 0.8745265436181086, "flos": 64034105271840.0, "grad_norm": 0.7312573421052742, "language_loss": 0.53540927, "learning_rate": 1.6276881731939306e-07, "loss": 0.56020206, "num_input_tokens_seen": 157356695, "step": 7273, "time_per_iteration": 3.4455409049987793 }, { "auxiliary_loss_clip": 0.01323528, "auxiliary_loss_mlp": 0.01193094, "balance_loss_clip": 1.0074811, "balance_loss_mlp": 1.0001657, "epoch": 0.8746467865087477, "flos": 28658631435840.0, "grad_norm": 1.7019511858664884, "language_loss": 0.7546106, "learning_rate": 1.6246114418800193e-07, "loss": 0.77977681, "num_input_tokens_seen": 157376975, "step": 7274, "time_per_iteration": 2.8077478408813477 }, { "auxiliary_loss_clip": 0.01337137, "auxiliary_loss_mlp": 0.0119317, "balance_loss_clip": 1.00838244, "balance_loss_mlp": 1.00014615, "epoch": 0.8747670293993868, "flos": 23985861114240.0, "grad_norm": 1.6424829712004534, "language_loss": 0.7676779, "learning_rate": 1.6215374980979423e-07, "loss": 0.79298103, "num_input_tokens_seen": 157397385, "step": 7275, "time_per_iteration": 2.815037488937378 }, { "auxiliary_loss_clip": 0.01321799, "auxiliary_loss_mlp": 0.01193089, "balance_loss_clip": 1.00774074, "balance_loss_mlp": 1.00016046, "epoch": 0.8748872722900258, "flos": 45221641574400.0, "grad_norm": 2.1418675926611592, "language_loss": 0.68509376, "learning_rate": 1.6184663423140133e-07, "loss": 0.71024257, "num_input_tokens_seen": 157417685, "step": 7276, "time_per_iteration": 2.9416134357452393 }, { "auxiliary_loss_clip": 0.01279995, "auxiliary_loss_mlp": 0.01193112, "balance_loss_clip": 1.00713563, "balance_loss_mlp": 1.00018382, "epoch": 0.875007515180665, "flos": 19754290615680.0, "grad_norm": 1.7366070237047495, "language_loss": 0.63957393, "learning_rate": 1.615397974994126e-07, "loss": 0.66430497, "num_input_tokens_seen": 157435490, "step": 7277, "time_per_iteration": 2.808642625808716 }, { "auxiliary_loss_clip": 0.01347577, "auxiliary_loss_mlp": 0.01192859, "balance_loss_clip": 1.00776207, "balance_loss_mlp": 1.00012088, "epoch": 0.875127758071304, "flos": 22710745404000.0, "grad_norm": 1.4317250243738806, "language_loss": 0.80702198, "learning_rate": 1.6123323966037438e-07, "loss": 0.83242631, "num_input_tokens_seen": 157454010, "step": 7278, "time_per_iteration": 2.741751194000244 }, { "auxiliary_loss_clip": 0.01347725, "auxiliary_loss_mlp": 0.01192975, "balance_loss_clip": 1.00781465, "balance_loss_mlp": 1.00014222, "epoch": 0.8752480009619431, "flos": 23403857378400.0, "grad_norm": 1.8222877884091562, "language_loss": 0.78622323, "learning_rate": 1.6092696076079216e-07, "loss": 0.81163025, "num_input_tokens_seen": 157472385, "step": 7279, "time_per_iteration": 2.8138344287872314 }, { "auxiliary_loss_clip": 0.01287818, "auxiliary_loss_mlp": 0.01192901, "balance_loss_clip": 1.00653625, "balance_loss_mlp": 1.00016379, "epoch": 0.8753682438525822, "flos": 26213113838880.0, "grad_norm": 1.5711409357277881, "language_loss": 0.73661321, "learning_rate": 1.6062096084712785e-07, "loss": 0.76142043, "num_input_tokens_seen": 157493735, "step": 7280, "time_per_iteration": 2.862201690673828 }, { "auxiliary_loss_clip": 0.01324485, "auxiliary_loss_mlp": 0.00872528, "balance_loss_clip": 1.00793278, "balance_loss_mlp": 1.00049126, "epoch": 0.8754884867432213, "flos": 23326755655680.0, "grad_norm": 1.772154567393373, "language_loss": 0.70498461, "learning_rate": 1.6031523996580098e-07, "loss": 0.7269547, "num_input_tokens_seen": 157511295, "step": 7281, "time_per_iteration": 2.7685208320617676 }, { "auxiliary_loss_clip": 0.01299521, "auxiliary_loss_mlp": 0.01193203, "balance_loss_clip": 1.00724912, "balance_loss_mlp": 1.00017977, "epoch": 0.8756087296338604, "flos": 12495234708000.0, "grad_norm": 1.9911437429132601, "language_loss": 0.65742016, "learning_rate": 1.6000979816318981e-07, "loss": 0.68234742, "num_input_tokens_seen": 157529760, "step": 7282, "time_per_iteration": 2.7290074825286865 }, { "auxiliary_loss_clip": 0.01322097, "auxiliary_loss_mlp": 0.01193175, "balance_loss_clip": 1.00769615, "balance_loss_mlp": 1.00015163, "epoch": 0.8757289725244994, "flos": 18952908373440.0, "grad_norm": 2.2845673709236887, "language_loss": 0.74863482, "learning_rate": 1.5970463548562886e-07, "loss": 0.77378756, "num_input_tokens_seen": 157548915, "step": 7283, "time_per_iteration": 2.7628846168518066 }, { "auxiliary_loss_clip": 0.01300499, "auxiliary_loss_mlp": 0.0119318, "balance_loss_clip": 1.00675249, "balance_loss_mlp": 1.00015593, "epoch": 0.8758492154151386, "flos": 25265970216000.0, "grad_norm": 1.5695378610215334, "language_loss": 0.70829499, "learning_rate": 1.5939975197941192e-07, "loss": 0.73323178, "num_input_tokens_seen": 157570570, "step": 7284, "time_per_iteration": 2.796290397644043 }, { "auxiliary_loss_clip": 0.01286841, "auxiliary_loss_mlp": 0.01192282, "balance_loss_clip": 1.00360179, "balance_loss_mlp": 1.00002122, "epoch": 0.8759694583057777, "flos": 65571694695360.0, "grad_norm": 0.8545865637864096, "language_loss": 0.53382754, "learning_rate": 1.5909514769078892e-07, "loss": 0.55861878, "num_input_tokens_seen": 157635675, "step": 7285, "time_per_iteration": 3.4455080032348633 }, { "auxiliary_loss_clip": 0.0128925, "auxiliary_loss_mlp": 0.01193176, "balance_loss_clip": 1.00828791, "balance_loss_mlp": 1.00015223, "epoch": 0.8760897011964167, "flos": 25446205120320.0, "grad_norm": 1.4856349470126808, "language_loss": 0.77581644, "learning_rate": 1.5879082266596867e-07, "loss": 0.80064064, "num_input_tokens_seen": 157657015, "step": 7286, "time_per_iteration": 2.863032341003418 }, { "auxiliary_loss_clip": 0.0132236, "auxiliary_loss_mlp": 0.0119307, "balance_loss_clip": 1.00749731, "balance_loss_mlp": 1.00014162, "epoch": 0.8762099440870559, "flos": 28984843262880.0, "grad_norm": 1.655955321690408, "language_loss": 0.71699351, "learning_rate": 1.5848677695111645e-07, "loss": 0.7421478, "num_input_tokens_seen": 157678615, "step": 7287, "time_per_iteration": 2.8245880603790283 }, { "auxiliary_loss_clip": 0.01298969, "auxiliary_loss_mlp": 0.01193184, "balance_loss_clip": 1.0073508, "balance_loss_mlp": 1.00015998, "epoch": 0.8763301869776949, "flos": 21609470183040.0, "grad_norm": 2.619986852266024, "language_loss": 0.70046628, "learning_rate": 1.5818301059235562e-07, "loss": 0.72538775, "num_input_tokens_seen": 157693790, "step": 7288, "time_per_iteration": 2.7829744815826416 }, { "auxiliary_loss_clip": 0.01302925, "auxiliary_loss_mlp": 0.01193191, "balance_loss_clip": 1.00645256, "balance_loss_mlp": 1.00016737, "epoch": 0.876450429868334, "flos": 24644427717600.0, "grad_norm": 1.4580704916173897, "language_loss": 0.81082636, "learning_rate": 1.578795236357684e-07, "loss": 0.83578753, "num_input_tokens_seen": 157715255, "step": 7289, "time_per_iteration": 2.814051628112793 }, { "auxiliary_loss_clip": 0.01305373, "auxiliary_loss_mlp": 0.01193073, "balance_loss_clip": 1.00692582, "balance_loss_mlp": 1.0001446, "epoch": 0.8765706727589732, "flos": 20260054797120.0, "grad_norm": 1.927996724131556, "language_loss": 0.85453731, "learning_rate": 1.5757631612739218e-07, "loss": 0.87952179, "num_input_tokens_seen": 157728800, "step": 7290, "time_per_iteration": 2.743807554244995 }, { "auxiliary_loss_clip": 0.01315819, "auxiliary_loss_mlp": 0.011923, "balance_loss_clip": 1.00353622, "balance_loss_mlp": 1.00003898, "epoch": 0.8766909156496122, "flos": 71371196917920.0, "grad_norm": 0.7824996072052404, "language_loss": 0.61479688, "learning_rate": 1.572733881132242e-07, "loss": 0.63987803, "num_input_tokens_seen": 157789445, "step": 7291, "time_per_iteration": 4.1116626262664795 }, { "auxiliary_loss_clip": 0.01258215, "auxiliary_loss_mlp": 0.0119227, "balance_loss_clip": 1.00363064, "balance_loss_mlp": 1.00000906, "epoch": 0.8768111585402513, "flos": 69523525399680.0, "grad_norm": 0.793898782781103, "language_loss": 0.58560705, "learning_rate": 1.5697073963921814e-07, "loss": 0.61011195, "num_input_tokens_seen": 157848685, "step": 7292, "time_per_iteration": 5.099247455596924 }, { "auxiliary_loss_clip": 0.01324682, "auxiliary_loss_mlp": 0.01192969, "balance_loss_clip": 1.00788379, "balance_loss_mlp": 1.00013649, "epoch": 0.8769314014308904, "flos": 18838567003680.0, "grad_norm": 2.1477495106727504, "language_loss": 0.85006756, "learning_rate": 1.566683707512857e-07, "loss": 0.87524402, "num_input_tokens_seen": 157866360, "step": 7293, "time_per_iteration": 3.634678840637207 }, { "auxiliary_loss_clip": 0.01316017, "auxiliary_loss_mlp": 0.01193217, "balance_loss_clip": 1.00799632, "balance_loss_mlp": 1.00019336, "epoch": 0.8770516443215295, "flos": 14976411310080.0, "grad_norm": 1.8293206490268916, "language_loss": 0.79473662, "learning_rate": 1.5636628149529553e-07, "loss": 0.81982899, "num_input_tokens_seen": 157884150, "step": 7294, "time_per_iteration": 2.8053998947143555 }, { "auxiliary_loss_clip": 0.01309831, "auxiliary_loss_mlp": 0.01193094, "balance_loss_clip": 1.00710249, "balance_loss_mlp": 1.00016594, "epoch": 0.8771718872121685, "flos": 31649667518880.0, "grad_norm": 2.412683562228917, "language_loss": 0.80197155, "learning_rate": 1.560644719170743e-07, "loss": 0.82700086, "num_input_tokens_seen": 157905020, "step": 7295, "time_per_iteration": 2.865830898284912 }, { "auxiliary_loss_clip": 0.01303351, "auxiliary_loss_mlp": 0.01193091, "balance_loss_clip": 1.00710452, "balance_loss_mlp": 1.00016308, "epoch": 0.8772921301028077, "flos": 36095479437600.0, "grad_norm": 2.091598185523857, "language_loss": 0.72017622, "learning_rate": 1.5576294206240692e-07, "loss": 0.74514055, "num_input_tokens_seen": 157924545, "step": 7296, "time_per_iteration": 3.078171968460083 }, { "auxiliary_loss_clip": 0.01312488, "auxiliary_loss_mlp": 0.01193091, "balance_loss_clip": 1.00800896, "balance_loss_mlp": 1.00016236, "epoch": 0.8774123729934468, "flos": 57116982553920.0, "grad_norm": 1.6281848181970144, "language_loss": 0.67540193, "learning_rate": 1.5546169197703507e-07, "loss": 0.70045769, "num_input_tokens_seen": 157950820, "step": 7297, "time_per_iteration": 3.135728597640991 }, { "auxiliary_loss_clip": 0.01323702, "auxiliary_loss_mlp": 0.01193061, "balance_loss_clip": 1.00759637, "balance_loss_mlp": 1.00013304, "epoch": 0.8775326158840858, "flos": 23914507180320.0, "grad_norm": 2.314993598123436, "language_loss": 0.77323866, "learning_rate": 1.5516072170665774e-07, "loss": 0.79840636, "num_input_tokens_seen": 157968790, "step": 7298, "time_per_iteration": 2.731895923614502 }, { "auxiliary_loss_clip": 0.01335145, "auxiliary_loss_mlp": 0.01193097, "balance_loss_clip": 1.00820565, "balance_loss_mlp": 1.00016904, "epoch": 0.877652858774725, "flos": 17123293257120.0, "grad_norm": 1.779120375466802, "language_loss": 0.86612415, "learning_rate": 1.5486003129693214e-07, "loss": 0.8914066, "num_input_tokens_seen": 157986155, "step": 7299, "time_per_iteration": 2.7846269607543945 }, { "auxiliary_loss_clip": 0.01329221, "auxiliary_loss_mlp": 0.01193081, "balance_loss_clip": 1.00752485, "balance_loss_mlp": 1.00015247, "epoch": 0.877773101665364, "flos": 16508971418400.0, "grad_norm": 2.050682024251946, "language_loss": 0.7814092, "learning_rate": 1.545596207934725e-07, "loss": 0.80663228, "num_input_tokens_seen": 158004640, "step": 7300, "time_per_iteration": 2.687459707260132 }, { "auxiliary_loss_clip": 0.01319345, "auxiliary_loss_mlp": 0.0119312, "balance_loss_clip": 1.00779212, "balance_loss_mlp": 1.00019193, "epoch": 0.8778933445560031, "flos": 22053220587360.0, "grad_norm": 1.60051633934377, "language_loss": 0.77794892, "learning_rate": 1.5425949024185147e-07, "loss": 0.80307353, "num_input_tokens_seen": 158024665, "step": 7301, "time_per_iteration": 2.8083462715148926 }, { "auxiliary_loss_clip": 0.01322853, "auxiliary_loss_mlp": 0.01193163, "balance_loss_clip": 1.00755703, "balance_loss_mlp": 1.000139, "epoch": 0.8780135874466423, "flos": 22564768481280.0, "grad_norm": 1.789845970192993, "language_loss": 0.67354399, "learning_rate": 1.5395963968759818e-07, "loss": 0.69870412, "num_input_tokens_seen": 158044940, "step": 7302, "time_per_iteration": 2.8694236278533936 }, { "auxiliary_loss_clip": 0.01313841, "auxiliary_loss_mlp": 0.01192973, "balance_loss_clip": 1.00706124, "balance_loss_mlp": 1.00013995, "epoch": 0.8781338303372813, "flos": 61532021088000.0, "grad_norm": 1.3750412309311608, "language_loss": 0.64158809, "learning_rate": 1.536600691761998e-07, "loss": 0.6666562, "num_input_tokens_seen": 158070770, "step": 7303, "time_per_iteration": 3.1264114379882812 }, { "auxiliary_loss_clip": 0.0129155, "auxiliary_loss_mlp": 0.01193071, "balance_loss_clip": 1.00709951, "balance_loss_mlp": 1.00014234, "epoch": 0.8782540732279204, "flos": 22674763085760.0, "grad_norm": 3.258635016730797, "language_loss": 0.7182883, "learning_rate": 1.5336077875310084e-07, "loss": 0.7431345, "num_input_tokens_seen": 158089995, "step": 7304, "time_per_iteration": 2.8858165740966797 }, { "auxiliary_loss_clip": 0.01276034, "auxiliary_loss_mlp": 0.01193175, "balance_loss_clip": 1.00623548, "balance_loss_mlp": 1.00015152, "epoch": 0.8783743161185595, "flos": 16070358100320.0, "grad_norm": 2.284335323880689, "language_loss": 0.73864841, "learning_rate": 1.5306176846370321e-07, "loss": 0.76334047, "num_input_tokens_seen": 158108140, "step": 7305, "time_per_iteration": 2.8044047355651855 }, { "auxiliary_loss_clip": 0.0132259, "auxiliary_loss_mlp": 0.01193174, "balance_loss_clip": 1.00769198, "balance_loss_mlp": 1.00015044, "epoch": 0.8784945590091986, "flos": 26067891313440.0, "grad_norm": 1.7670632527307237, "language_loss": 0.7364282, "learning_rate": 1.5276303835336712e-07, "loss": 0.76158583, "num_input_tokens_seen": 158128680, "step": 7306, "time_per_iteration": 2.8450570106506348 }, { "auxiliary_loss_clip": 0.01303844, "auxiliary_loss_mlp": 0.01192282, "balance_loss_clip": 1.00348067, "balance_loss_mlp": 1.00002086, "epoch": 0.8786148018998376, "flos": 62720672204160.0, "grad_norm": 0.7743041985332239, "language_loss": 0.53538501, "learning_rate": 1.524645884674094e-07, "loss": 0.56034631, "num_input_tokens_seen": 158185610, "step": 7307, "time_per_iteration": 3.422549247741699 }, { "auxiliary_loss_clip": 0.01348178, "auxiliary_loss_mlp": 0.00872656, "balance_loss_clip": 1.00809741, "balance_loss_mlp": 1.00045943, "epoch": 0.8787350447904768, "flos": 21652745008320.0, "grad_norm": 2.0414415297659567, "language_loss": 0.79285157, "learning_rate": 1.521664188511047e-07, "loss": 0.8150599, "num_input_tokens_seen": 158205635, "step": 7308, "time_per_iteration": 2.7895615100860596 }, { "auxiliary_loss_clip": 0.01306516, "auxiliary_loss_mlp": 0.00872389, "balance_loss_clip": 1.00763154, "balance_loss_mlp": 1.00038052, "epoch": 0.8788552876811159, "flos": 25478487299520.0, "grad_norm": 1.8668781073685325, "language_loss": 0.80249691, "learning_rate": 1.518685295496851e-07, "loss": 0.82428586, "num_input_tokens_seen": 158223495, "step": 7309, "time_per_iteration": 2.865213632583618 }, { "auxiliary_loss_clip": 0.01335463, "auxiliary_loss_mlp": 0.01193068, "balance_loss_clip": 1.00785363, "balance_loss_mlp": 1.00013995, "epoch": 0.8789755305717549, "flos": 22310233901280.0, "grad_norm": 1.5733358865422356, "language_loss": 0.85077769, "learning_rate": 1.5157092060833975e-07, "loss": 0.87606299, "num_input_tokens_seen": 158243145, "step": 7310, "time_per_iteration": 2.7351229190826416 }, { "auxiliary_loss_clip": 0.01307147, "auxiliary_loss_mlp": 0.01193124, "balance_loss_clip": 1.007707, "balance_loss_mlp": 1.00019562, "epoch": 0.879095773462394, "flos": 29310983242560.0, "grad_norm": 1.8910607035827993, "language_loss": 0.66262519, "learning_rate": 1.5127359207221658e-07, "loss": 0.68762791, "num_input_tokens_seen": 158262625, "step": 7311, "time_per_iteration": 2.8614628314971924 }, { "auxiliary_loss_clip": 0.01276822, "auxiliary_loss_mlp": 0.01193131, "balance_loss_clip": 1.00727141, "balance_loss_mlp": 1.00020242, "epoch": 0.8792160163530331, "flos": 16690032567360.0, "grad_norm": 1.8668955384563488, "language_loss": 0.73340404, "learning_rate": 1.5097654398641923e-07, "loss": 0.75810361, "num_input_tokens_seen": 158280530, "step": 7312, "time_per_iteration": 2.863267660140991 }, { "auxiliary_loss_clip": 0.01321307, "auxiliary_loss_mlp": 0.01193226, "balance_loss_clip": 1.00836611, "balance_loss_mlp": 1.00020254, "epoch": 0.8793362592436722, "flos": 24499312963200.0, "grad_norm": 1.455101839064863, "language_loss": 0.72994447, "learning_rate": 1.5067977639601014e-07, "loss": 0.75508982, "num_input_tokens_seen": 158303290, "step": 7313, "time_per_iteration": 2.8309924602508545 }, { "auxiliary_loss_clip": 0.01302829, "auxiliary_loss_mlp": 0.01193212, "balance_loss_clip": 1.0069927, "balance_loss_mlp": 1.00018787, "epoch": 0.8794565021343113, "flos": 14538408694560.0, "grad_norm": 2.008720721140028, "language_loss": 0.7088449, "learning_rate": 1.5038328934600864e-07, "loss": 0.7338053, "num_input_tokens_seen": 158319925, "step": 7314, "time_per_iteration": 2.76973819732666 }, { "auxiliary_loss_clip": 0.0130509, "auxiliary_loss_mlp": 0.01193209, "balance_loss_clip": 1.00693774, "balance_loss_mlp": 1.00018525, "epoch": 0.8795767450249504, "flos": 39530301848640.0, "grad_norm": 1.7635752257108344, "language_loss": 0.69613576, "learning_rate": 1.5008708288139161e-07, "loss": 0.72111881, "num_input_tokens_seen": 158342285, "step": 7315, "time_per_iteration": 2.95889949798584 }, { "auxiliary_loss_clip": 0.0132516, "auxiliary_loss_mlp": 0.0119322, "balance_loss_clip": 1.00818396, "balance_loss_mlp": 1.00019598, "epoch": 0.8796969879155895, "flos": 22960681752960.0, "grad_norm": 1.8800264040027532, "language_loss": 0.7306217, "learning_rate": 1.497911570470931e-07, "loss": 0.75580549, "num_input_tokens_seen": 158362290, "step": 7316, "time_per_iteration": 2.8123867511749268 }, { "auxiliary_loss_clip": 0.0128933, "auxiliary_loss_mlp": 0.01192969, "balance_loss_clip": 1.00671399, "balance_loss_mlp": 1.00013638, "epoch": 0.8798172308062285, "flos": 28362438596160.0, "grad_norm": 1.6974336179310792, "language_loss": 0.85870653, "learning_rate": 1.494955118880048e-07, "loss": 0.88352954, "num_input_tokens_seen": 158383275, "step": 7317, "time_per_iteration": 3.84443736076355 }, { "auxiliary_loss_clip": 0.01330185, "auxiliary_loss_mlp": 0.01192971, "balance_loss_clip": 1.00716817, "balance_loss_mlp": 1.0001384, "epoch": 0.8799374736968677, "flos": 23988986474400.0, "grad_norm": 1.655509867343498, "language_loss": 0.72787356, "learning_rate": 1.4920014744897634e-07, "loss": 0.75310504, "num_input_tokens_seen": 158402690, "step": 7318, "time_per_iteration": 3.7153561115264893 }, { "auxiliary_loss_clip": 0.01309813, "auxiliary_loss_mlp": 0.01193122, "balance_loss_clip": 1.00761294, "balance_loss_mlp": 1.00019336, "epoch": 0.8800577165875068, "flos": 25630283858400.0, "grad_norm": 1.7072467694068167, "language_loss": 0.86171305, "learning_rate": 1.4890506377481392e-07, "loss": 0.88674247, "num_input_tokens_seen": 158421780, "step": 7319, "time_per_iteration": 5.318173885345459 }, { "auxiliary_loss_clip": 0.01246616, "auxiliary_loss_mlp": 0.01193079, "balance_loss_clip": 1.00671029, "balance_loss_mlp": 1.00015068, "epoch": 0.8801779594781458, "flos": 23440342628160.0, "grad_norm": 1.5403029852707657, "language_loss": 0.64014292, "learning_rate": 1.486102609102815e-07, "loss": 0.66453981, "num_input_tokens_seen": 158442330, "step": 7320, "time_per_iteration": 2.9548516273498535 }, { "auxiliary_loss_clip": 0.0130108, "auxiliary_loss_mlp": 0.01192905, "balance_loss_clip": 1.00664139, "balance_loss_mlp": 1.00016773, "epoch": 0.880298202368785, "flos": 11508588246240.0, "grad_norm": 2.3544944744324123, "language_loss": 0.85729188, "learning_rate": 1.483157389001004e-07, "loss": 0.88223171, "num_input_tokens_seen": 158459890, "step": 7321, "time_per_iteration": 2.805549383163452 }, { "auxiliary_loss_clip": 0.01325059, "auxiliary_loss_mlp": 0.01193196, "balance_loss_clip": 1.00807357, "balance_loss_mlp": 1.00017262, "epoch": 0.880418445259424, "flos": 22671458107200.0, "grad_norm": 2.249081310062874, "language_loss": 0.78871179, "learning_rate": 1.4802149778894933e-07, "loss": 0.81389433, "num_input_tokens_seen": 158478680, "step": 7322, "time_per_iteration": 2.780913829803467 }, { "auxiliary_loss_clip": 0.01334989, "auxiliary_loss_mlp": 0.01193079, "balance_loss_clip": 1.00743699, "balance_loss_mlp": 1.00015044, "epoch": 0.8805386881500631, "flos": 20522169273600.0, "grad_norm": 1.5993659552371606, "language_loss": 0.87350899, "learning_rate": 1.4772753762146484e-07, "loss": 0.89878964, "num_input_tokens_seen": 158497935, "step": 7323, "time_per_iteration": 2.8179075717926025 }, { "auxiliary_loss_clip": 0.01336575, "auxiliary_loss_mlp": 0.01193084, "balance_loss_clip": 1.00741696, "balance_loss_mlp": 1.00015521, "epoch": 0.8806589310407023, "flos": 36538906528800.0, "grad_norm": 1.5356135867018086, "language_loss": 0.70384932, "learning_rate": 1.474338584422401e-07, "loss": 0.72914588, "num_input_tokens_seen": 158523145, "step": 7324, "time_per_iteration": 2.8885374069213867 }, { "auxiliary_loss_clip": 0.01323597, "auxiliary_loss_mlp": 0.01193171, "balance_loss_clip": 1.00753391, "balance_loss_mlp": 1.00014782, "epoch": 0.8807791739313413, "flos": 23440198933440.0, "grad_norm": 1.6149439279905773, "language_loss": 0.75569892, "learning_rate": 1.4714046029582595e-07, "loss": 0.78086656, "num_input_tokens_seen": 158542210, "step": 7325, "time_per_iteration": 2.915112257003784 }, { "auxiliary_loss_clip": 0.01299126, "auxiliary_loss_mlp": 0.01193235, "balance_loss_clip": 1.00680077, "balance_loss_mlp": 1.000211, "epoch": 0.8808994168219804, "flos": 25956854922240.0, "grad_norm": 1.5995959986117383, "language_loss": 0.75818098, "learning_rate": 1.46847343226731e-07, "loss": 0.78310454, "num_input_tokens_seen": 158563250, "step": 7326, "time_per_iteration": 2.8518991470336914 }, { "auxiliary_loss_clip": 0.01336141, "auxiliary_loss_mlp": 0.01193197, "balance_loss_clip": 1.00784004, "balance_loss_mlp": 1.00017285, "epoch": 0.8810196597126195, "flos": 17092088788320.0, "grad_norm": 1.76219784275212, "language_loss": 0.69355118, "learning_rate": 1.465545072794203e-07, "loss": 0.71884453, "num_input_tokens_seen": 158581125, "step": 7327, "time_per_iteration": 2.8001902103424072 }, { "auxiliary_loss_clip": 0.01247602, "auxiliary_loss_mlp": 0.01193081, "balance_loss_clip": 1.00723433, "balance_loss_mlp": 1.00015235, "epoch": 0.8811399026032586, "flos": 23002842944160.0, "grad_norm": 1.5443621671974004, "language_loss": 0.75379694, "learning_rate": 1.4626195249831774e-07, "loss": 0.77820373, "num_input_tokens_seen": 158602025, "step": 7328, "time_per_iteration": 2.8861794471740723 }, { "auxiliary_loss_clip": 0.01335999, "auxiliary_loss_mlp": 0.01193171, "balance_loss_clip": 1.00800049, "balance_loss_mlp": 1.0001477, "epoch": 0.8812601454938976, "flos": 14463821629440.0, "grad_norm": 1.6977886328993659, "language_loss": 0.7192294, "learning_rate": 1.4596967892780244e-07, "loss": 0.74452114, "num_input_tokens_seen": 158618355, "step": 7329, "time_per_iteration": 2.7417213916778564 }, { "auxiliary_loss_clip": 0.01347362, "auxiliary_loss_mlp": 0.0119322, "balance_loss_clip": 1.0078361, "balance_loss_mlp": 1.00019646, "epoch": 0.8813803883845368, "flos": 22493234928960.0, "grad_norm": 1.7449917107749637, "language_loss": 0.746894, "learning_rate": 1.4567768661221314e-07, "loss": 0.77229977, "num_input_tokens_seen": 158638925, "step": 7330, "time_per_iteration": 2.6600232124328613 }, { "auxiliary_loss_clip": 0.01330961, "auxiliary_loss_mlp": 0.00872543, "balance_loss_clip": 1.00724781, "balance_loss_mlp": 1.00038719, "epoch": 0.8815006312751759, "flos": 21506911780320.0, "grad_norm": 2.0562265529064536, "language_loss": 0.74168247, "learning_rate": 1.4538597559584442e-07, "loss": 0.76371753, "num_input_tokens_seen": 158656715, "step": 7331, "time_per_iteration": 2.8608107566833496 }, { "auxiliary_loss_clip": 0.0131269, "auxiliary_loss_mlp": 0.01193188, "balance_loss_clip": 1.00748491, "balance_loss_mlp": 1.00016415, "epoch": 0.8816208741658149, "flos": 22784578071840.0, "grad_norm": 1.8066793715790992, "language_loss": 0.78729028, "learning_rate": 1.4509454592294823e-07, "loss": 0.81234908, "num_input_tokens_seen": 158677200, "step": 7332, "time_per_iteration": 2.8066272735595703 }, { "auxiliary_loss_clip": 0.01289514, "auxiliary_loss_mlp": 0.00872541, "balance_loss_clip": 1.00770211, "balance_loss_mlp": 1.00040531, "epoch": 0.8817411170564541, "flos": 17779417050240.0, "grad_norm": 2.135110488270876, "language_loss": 0.78788072, "learning_rate": 1.448033976377354e-07, "loss": 0.80950129, "num_input_tokens_seen": 158692185, "step": 7333, "time_per_iteration": 2.804185628890991 }, { "auxiliary_loss_clip": 0.01334907, "auxiliary_loss_mlp": 0.01193099, "balance_loss_clip": 1.00695348, "balance_loss_mlp": 1.00017071, "epoch": 0.8818613599470931, "flos": 18551822091840.0, "grad_norm": 2.2319640034248778, "language_loss": 0.74221808, "learning_rate": 1.445125307843713e-07, "loss": 0.76749814, "num_input_tokens_seen": 158710410, "step": 7334, "time_per_iteration": 2.987504720687866 }, { "auxiliary_loss_clip": 0.01324453, "auxiliary_loss_mlp": 0.01192945, "balance_loss_clip": 1.00751746, "balance_loss_mlp": 1.00011253, "epoch": 0.8819816028377322, "flos": 27599804795520.0, "grad_norm": 2.4251972192780205, "language_loss": 0.75575268, "learning_rate": 1.442219454069813e-07, "loss": 0.78092664, "num_input_tokens_seen": 158731435, "step": 7335, "time_per_iteration": 2.760239839553833 }, { "auxiliary_loss_clip": 0.01279782, "auxiliary_loss_mlp": 0.01193097, "balance_loss_clip": 1.00700951, "balance_loss_mlp": 1.00016832, "epoch": 0.8821018457283714, "flos": 23404611775680.0, "grad_norm": 1.8715458808608494, "language_loss": 0.6598556, "learning_rate": 1.4393164154964676e-07, "loss": 0.68458438, "num_input_tokens_seen": 158750965, "step": 7336, "time_per_iteration": 2.856691360473633 }, { "auxiliary_loss_clip": 0.01325265, "auxiliary_loss_mlp": 0.01193027, "balance_loss_clip": 1.00792134, "balance_loss_mlp": 1.00019407, "epoch": 0.8822220886190104, "flos": 29132472674880.0, "grad_norm": 1.6495353364655332, "language_loss": 0.93721819, "learning_rate": 1.4364161925640649e-07, "loss": 0.96240103, "num_input_tokens_seen": 158772365, "step": 7337, "time_per_iteration": 2.849973440170288 }, { "auxiliary_loss_clip": 0.01347518, "auxiliary_loss_mlp": 0.01193004, "balance_loss_clip": 1.0075345, "balance_loss_mlp": 1.00017095, "epoch": 0.8823423315096495, "flos": 20485432558080.0, "grad_norm": 1.7602752721330308, "language_loss": 0.85177672, "learning_rate": 1.4335187857125663e-07, "loss": 0.87718195, "num_input_tokens_seen": 158791065, "step": 7338, "time_per_iteration": 2.698756217956543 }, { "auxiliary_loss_clip": 0.01335245, "auxiliary_loss_mlp": 0.01193212, "balance_loss_clip": 1.00776839, "balance_loss_mlp": 1.00018835, "epoch": 0.8824625744002886, "flos": 24206389178400.0, "grad_norm": 1.7006230558803905, "language_loss": 0.75404114, "learning_rate": 1.4306241953815023e-07, "loss": 0.77932566, "num_input_tokens_seen": 158812125, "step": 7339, "time_per_iteration": 2.7943270206451416 }, { "auxiliary_loss_clip": 0.01329585, "auxiliary_loss_mlp": 0.01193069, "balance_loss_clip": 1.00725675, "balance_loss_mlp": 1.00014067, "epoch": 0.8825828172909277, "flos": 24679511943840.0, "grad_norm": 1.629597976227382, "language_loss": 0.70719665, "learning_rate": 1.4277324220099862e-07, "loss": 0.73242319, "num_input_tokens_seen": 158834035, "step": 7340, "time_per_iteration": 2.764218330383301 }, { "auxiliary_loss_clip": 0.01296849, "auxiliary_loss_mlp": 0.01193177, "balance_loss_clip": 1.00733948, "balance_loss_mlp": 1.00015354, "epoch": 0.8827030601815667, "flos": 22456174900320.0, "grad_norm": 1.67698735477804, "language_loss": 0.74324369, "learning_rate": 1.4248434660366938e-07, "loss": 0.76814395, "num_input_tokens_seen": 158853510, "step": 7341, "time_per_iteration": 2.8528051376342773 }, { "auxiliary_loss_clip": 0.01304257, "auxiliary_loss_mlp": 0.01193025, "balance_loss_clip": 1.00654221, "balance_loss_mlp": 1.00019193, "epoch": 0.8828233030722058, "flos": 19865650320000.0, "grad_norm": 1.8678318513236278, "language_loss": 0.70494282, "learning_rate": 1.4219573278998808e-07, "loss": 0.72991562, "num_input_tokens_seen": 158871970, "step": 7342, "time_per_iteration": 2.765477418899536 }, { "auxiliary_loss_clip": 0.01323702, "auxiliary_loss_mlp": 0.01193003, "balance_loss_clip": 1.00788116, "balance_loss_mlp": 1.00017023, "epoch": 0.882943545962845, "flos": 39347228973600.0, "grad_norm": 2.3437581752130265, "language_loss": 0.64775658, "learning_rate": 1.4190740080373685e-07, "loss": 0.67292356, "num_input_tokens_seen": 158892250, "step": 7343, "time_per_iteration": 3.8156659603118896 }, { "auxiliary_loss_clip": 0.0127358, "auxiliary_loss_mlp": 0.01193092, "balance_loss_clip": 1.00683689, "balance_loss_mlp": 1.00016344, "epoch": 0.883063788853484, "flos": 19054532760480.0, "grad_norm": 1.7380242324102104, "language_loss": 0.83917522, "learning_rate": 1.4161935068865538e-07, "loss": 0.86384195, "num_input_tokens_seen": 158907395, "step": 7344, "time_per_iteration": 2.7990365028381348 }, { "auxiliary_loss_clip": 0.01347592, "auxiliary_loss_mlp": 0.01193088, "balance_loss_clip": 1.00764048, "balance_loss_mlp": 1.00015926, "epoch": 0.8831840317441231, "flos": 18733206553920.0, "grad_norm": 1.7911881675130528, "language_loss": 0.75976551, "learning_rate": 1.4133158248844113e-07, "loss": 0.78517228, "num_input_tokens_seen": 158926300, "step": 7345, "time_per_iteration": 4.662179470062256 }, { "auxiliary_loss_clip": 0.01272133, "auxiliary_loss_mlp": 0.01193194, "balance_loss_clip": 1.00669909, "balance_loss_mlp": 1.00017047, "epoch": 0.8833042746347622, "flos": 26827723067040.0, "grad_norm": 1.8017569717441293, "language_loss": 0.73873055, "learning_rate": 1.4104409624674785e-07, "loss": 0.76338387, "num_input_tokens_seen": 158946085, "step": 7346, "time_per_iteration": 2.8489878177642822 }, { "auxiliary_loss_clip": 0.01326455, "auxiliary_loss_mlp": 0.01193048, "balance_loss_clip": 1.00759244, "balance_loss_mlp": 1.00011981, "epoch": 0.8834245175254013, "flos": 26104089173760.0, "grad_norm": 1.621781751586165, "language_loss": 0.78533196, "learning_rate": 1.407568920071873e-07, "loss": 0.81052697, "num_input_tokens_seen": 158964950, "step": 7347, "time_per_iteration": 2.8762526512145996 }, { "auxiliary_loss_clip": 0.01349552, "auxiliary_loss_mlp": 0.01193203, "balance_loss_clip": 1.00823402, "balance_loss_mlp": 1.00017929, "epoch": 0.8835447604160404, "flos": 30629050464960.0, "grad_norm": 1.7725737654285048, "language_loss": 0.67964429, "learning_rate": 1.4046996981332782e-07, "loss": 0.70507181, "num_input_tokens_seen": 158984835, "step": 7348, "time_per_iteration": 2.812624216079712 }, { "auxiliary_loss_clip": 0.01271728, "auxiliary_loss_mlp": 0.0119308, "balance_loss_clip": 1.00651002, "balance_loss_mlp": 1.00015187, "epoch": 0.8836650033066795, "flos": 24718368156480.0, "grad_norm": 2.046581563347812, "language_loss": 0.78027308, "learning_rate": 1.4018332970869516e-07, "loss": 0.80492115, "num_input_tokens_seen": 159002775, "step": 7349, "time_per_iteration": 2.852652072906494 }, { "auxiliary_loss_clip": 0.01311797, "auxiliary_loss_mlp": 0.01193231, "balance_loss_clip": 1.0079149, "balance_loss_mlp": 1.00020719, "epoch": 0.8837852461973186, "flos": 25413384085920.0, "grad_norm": 1.724645969293984, "language_loss": 0.84495467, "learning_rate": 1.398969717367733e-07, "loss": 0.87000495, "num_input_tokens_seen": 159024100, "step": 7350, "time_per_iteration": 2.879894733428955 }, { "auxiliary_loss_clip": 0.01246644, "auxiliary_loss_mlp": 0.01193114, "balance_loss_clip": 1.00657094, "balance_loss_mlp": 1.00018525, "epoch": 0.8839054890879576, "flos": 17822584104480.0, "grad_norm": 1.622625025574331, "language_loss": 0.76173025, "learning_rate": 1.396108959410014e-07, "loss": 0.78612781, "num_input_tokens_seen": 159043315, "step": 7351, "time_per_iteration": 2.8439788818359375 }, { "auxiliary_loss_clip": 0.0132317, "auxiliary_loss_mlp": 0.00872568, "balance_loss_clip": 1.00765884, "balance_loss_mlp": 1.00040424, "epoch": 0.8840257319785968, "flos": 23769033189120.0, "grad_norm": 1.4731798570649481, "language_loss": 0.8146776, "learning_rate": 1.3932510236477745e-07, "loss": 0.83663493, "num_input_tokens_seen": 159063985, "step": 7352, "time_per_iteration": 2.7905521392822266 }, { "auxiliary_loss_clip": 0.01336503, "auxiliary_loss_mlp": 0.01193167, "balance_loss_clip": 1.00787807, "balance_loss_mlp": 1.00014281, "epoch": 0.8841459748692359, "flos": 29059789564800.0, "grad_norm": 2.234321740772522, "language_loss": 0.5576061, "learning_rate": 1.3903959105145636e-07, "loss": 0.58290273, "num_input_tokens_seen": 159084475, "step": 7353, "time_per_iteration": 2.879348039627075 }, { "auxiliary_loss_clip": 0.0134785, "auxiliary_loss_mlp": 0.01193171, "balance_loss_clip": 1.00775027, "balance_loss_mlp": 1.00014734, "epoch": 0.8842662177598749, "flos": 24311534086080.0, "grad_norm": 1.9622212567156911, "language_loss": 0.83253837, "learning_rate": 1.387543620443492e-07, "loss": 0.85794854, "num_input_tokens_seen": 159101320, "step": 7354, "time_per_iteration": 2.7577474117279053 }, { "auxiliary_loss_clip": 0.01346891, "auxiliary_loss_mlp": 0.01192968, "balance_loss_clip": 1.00751233, "balance_loss_mlp": 1.00013542, "epoch": 0.8843864606505141, "flos": 25007879191680.0, "grad_norm": 1.558043772846038, "language_loss": 0.84243327, "learning_rate": 1.3846941538672606e-07, "loss": 0.86783183, "num_input_tokens_seen": 159120025, "step": 7355, "time_per_iteration": 2.781362533569336 }, { "auxiliary_loss_clip": 0.01270698, "auxiliary_loss_mlp": 0.01192982, "balance_loss_clip": 1.00636649, "balance_loss_mlp": 1.00014961, "epoch": 0.8845067035411531, "flos": 28183928028480.0, "grad_norm": 2.0501761976004764, "language_loss": 0.81111109, "learning_rate": 1.3818475112181193e-07, "loss": 0.83574796, "num_input_tokens_seen": 159138820, "step": 7356, "time_per_iteration": 2.895402193069458 }, { "auxiliary_loss_clip": 0.01293829, "auxiliary_loss_mlp": 0.01193194, "balance_loss_clip": 1.00682092, "balance_loss_mlp": 1.00016987, "epoch": 0.8846269464317922, "flos": 12853225782720.0, "grad_norm": 1.979318725834885, "language_loss": 0.79847348, "learning_rate": 1.3790036929279091e-07, "loss": 0.82334375, "num_input_tokens_seen": 159155975, "step": 7357, "time_per_iteration": 2.670961856842041 }, { "auxiliary_loss_clip": 0.01327516, "auxiliary_loss_mlp": 0.00872368, "balance_loss_clip": 1.00740087, "balance_loss_mlp": 1.0003252, "epoch": 0.8847471893224313, "flos": 18624361507200.0, "grad_norm": 3.316799509193731, "language_loss": 0.58588898, "learning_rate": 1.3761626994280363e-07, "loss": 0.60788786, "num_input_tokens_seen": 159173445, "step": 7358, "time_per_iteration": 2.833467721939087 }, { "auxiliary_loss_clip": 0.01298171, "auxiliary_loss_mlp": 0.01193219, "balance_loss_clip": 1.00725472, "balance_loss_mlp": 1.0001955, "epoch": 0.8848674322130704, "flos": 35769447228960.0, "grad_norm": 1.6497415881178719, "language_loss": 0.73387218, "learning_rate": 1.3733245311494735e-07, "loss": 0.75878608, "num_input_tokens_seen": 159196100, "step": 7359, "time_per_iteration": 2.9032070636749268 }, { "auxiliary_loss_clip": 0.01328325, "auxiliary_loss_mlp": 0.01193105, "balance_loss_clip": 1.0076164, "balance_loss_mlp": 1.00017703, "epoch": 0.8849876751037095, "flos": 24243772520160.0, "grad_norm": 1.9239116803211345, "language_loss": 0.70710444, "learning_rate": 1.3704891885227676e-07, "loss": 0.73231876, "num_input_tokens_seen": 159216145, "step": 7360, "time_per_iteration": 2.919839859008789 }, { "auxiliary_loss_clip": 0.01312567, "auxiliary_loss_mlp": 0.01192987, "balance_loss_clip": 1.00793052, "balance_loss_mlp": 1.00015402, "epoch": 0.8851079179943486, "flos": 21500589212640.0, "grad_norm": 1.9039136138107262, "language_loss": 0.77847207, "learning_rate": 1.367656671978037e-07, "loss": 0.80352759, "num_input_tokens_seen": 159233610, "step": 7361, "time_per_iteration": 2.979762315750122 }, { "auxiliary_loss_clip": 0.01322805, "auxiliary_loss_mlp": 0.01193236, "balance_loss_clip": 1.00750566, "balance_loss_mlp": 1.00021219, "epoch": 0.8852281608849877, "flos": 15300719182080.0, "grad_norm": 2.662318098688091, "language_loss": 0.73413289, "learning_rate": 1.36482698194498e-07, "loss": 0.75929332, "num_input_tokens_seen": 159250155, "step": 7362, "time_per_iteration": 2.829312324523926 }, { "auxiliary_loss_clip": 0.01314999, "auxiliary_loss_mlp": 0.01193072, "balance_loss_clip": 1.00723827, "balance_loss_mlp": 1.000144, "epoch": 0.8853484037756267, "flos": 23295730805280.0, "grad_norm": 1.8050317341183826, "language_loss": 0.72158128, "learning_rate": 1.3620001188528506e-07, "loss": 0.74666202, "num_input_tokens_seen": 159270875, "step": 7363, "time_per_iteration": 2.807766914367676 }, { "auxiliary_loss_clip": 0.01334824, "auxiliary_loss_mlp": 0.01193178, "balance_loss_clip": 1.00765848, "balance_loss_mlp": 1.00015473, "epoch": 0.8854686466662659, "flos": 25114784359680.0, "grad_norm": 2.498278385470657, "language_loss": 0.73319501, "learning_rate": 1.3591760831304865e-07, "loss": 0.75847501, "num_input_tokens_seen": 159288565, "step": 7364, "time_per_iteration": 2.749239683151245 }, { "auxiliary_loss_clip": 0.0134759, "auxiliary_loss_mlp": 0.01193118, "balance_loss_clip": 1.00776243, "balance_loss_mlp": 1.00018954, "epoch": 0.885588889556905, "flos": 21390881997600.0, "grad_norm": 2.0707594249732315, "language_loss": 0.79284233, "learning_rate": 1.356354875206287e-07, "loss": 0.81824934, "num_input_tokens_seen": 159306400, "step": 7365, "time_per_iteration": 2.7515058517456055 }, { "auxiliary_loss_clip": 0.01279377, "auxiliary_loss_mlp": 0.01193099, "balance_loss_clip": 1.00650084, "balance_loss_mlp": 1.00017118, "epoch": 0.885709132447544, "flos": 26906764668480.0, "grad_norm": 1.8985320940564363, "language_loss": 0.69530153, "learning_rate": 1.3535364955082296e-07, "loss": 0.72002631, "num_input_tokens_seen": 159326250, "step": 7366, "time_per_iteration": 2.9403793811798096 }, { "auxiliary_loss_clip": 0.01347945, "auxiliary_loss_mlp": 0.0119301, "balance_loss_clip": 1.00815773, "balance_loss_mlp": 1.00017703, "epoch": 0.8858293753381832, "flos": 26103406623840.0, "grad_norm": 1.621023374798492, "language_loss": 0.64087015, "learning_rate": 1.3507209444638613e-07, "loss": 0.66627967, "num_input_tokens_seen": 159348250, "step": 7367, "time_per_iteration": 2.7434749603271484 }, { "auxiliary_loss_clip": 0.01327322, "auxiliary_loss_mlp": 0.01193193, "balance_loss_clip": 1.00753284, "balance_loss_mlp": 1.00016975, "epoch": 0.8859496182288222, "flos": 23292820987200.0, "grad_norm": 1.8129058174622925, "language_loss": 0.73827928, "learning_rate": 1.347908222500298e-07, "loss": 0.76348448, "num_input_tokens_seen": 159368325, "step": 7368, "time_per_iteration": 2.791800022125244 }, { "auxiliary_loss_clip": 0.01286297, "auxiliary_loss_mlp": 0.01192962, "balance_loss_clip": 1.0076232, "balance_loss_mlp": 1.00012946, "epoch": 0.8860698611194613, "flos": 16872925824000.0, "grad_norm": 1.8561981079307557, "language_loss": 0.69556606, "learning_rate": 1.3450983300442276e-07, "loss": 0.72035861, "num_input_tokens_seen": 159387555, "step": 7369, "time_per_iteration": 3.692492961883545 }, { "auxiliary_loss_clip": 0.0132911, "auxiliary_loss_mlp": 0.01193075, "balance_loss_clip": 1.0074147, "balance_loss_mlp": 1.0001471, "epoch": 0.8861901040101005, "flos": 24681415898880.0, "grad_norm": 1.7916413000327696, "language_loss": 0.73370862, "learning_rate": 1.3422912675219068e-07, "loss": 0.75893044, "num_input_tokens_seen": 159407310, "step": 7370, "time_per_iteration": 3.7295637130737305 }, { "auxiliary_loss_clip": 0.01346934, "auxiliary_loss_mlp": 0.01193105, "balance_loss_clip": 1.00771952, "balance_loss_mlp": 1.00017667, "epoch": 0.8863103469007395, "flos": 24423037485120.0, "grad_norm": 2.635432115003149, "language_loss": 0.78783566, "learning_rate": 1.339487035359166e-07, "loss": 0.81323606, "num_input_tokens_seen": 159427680, "step": 7371, "time_per_iteration": 4.966240406036377 }, { "auxiliary_loss_clip": 0.01310459, "auxiliary_loss_mlp": 0.00872371, "balance_loss_clip": 1.00692153, "balance_loss_mlp": 1.00037622, "epoch": 0.8864305897913786, "flos": 22053974984640.0, "grad_norm": 1.4909562951481787, "language_loss": 0.84830713, "learning_rate": 1.336685633981409e-07, "loss": 0.87013543, "num_input_tokens_seen": 159448765, "step": 7372, "time_per_iteration": 2.806387424468994 }, { "auxiliary_loss_clip": 0.01335216, "auxiliary_loss_mlp": 0.01192992, "balance_loss_clip": 1.00793803, "balance_loss_mlp": 1.00015926, "epoch": 0.8865508326820177, "flos": 19099460075040.0, "grad_norm": 1.7080862005277542, "language_loss": 0.74853611, "learning_rate": 1.333887063813597e-07, "loss": 0.77381819, "num_input_tokens_seen": 159466870, "step": 7373, "time_per_iteration": 2.8080546855926514 }, { "auxiliary_loss_clip": 0.01321018, "auxiliary_loss_mlp": 0.01193175, "balance_loss_clip": 1.00740945, "balance_loss_mlp": 1.00015116, "epoch": 0.8866710755726568, "flos": 15414198383520.0, "grad_norm": 2.4274235939421023, "language_loss": 0.66241288, "learning_rate": 1.331091325280278e-07, "loss": 0.68755484, "num_input_tokens_seen": 159485840, "step": 7374, "time_per_iteration": 2.794621229171753 }, { "auxiliary_loss_clip": 0.01275503, "auxiliary_loss_mlp": 0.01193184, "balance_loss_clip": 1.00742793, "balance_loss_mlp": 1.00016046, "epoch": 0.8867913184632958, "flos": 20083699650240.0, "grad_norm": 2.0803976645631512, "language_loss": 0.78601551, "learning_rate": 1.3282984188055625e-07, "loss": 0.81070238, "num_input_tokens_seen": 159505630, "step": 7375, "time_per_iteration": 2.846015453338623 }, { "auxiliary_loss_clip": 0.0134858, "auxiliary_loss_mlp": 0.01192965, "balance_loss_clip": 1.00807905, "balance_loss_mlp": 1.00013173, "epoch": 0.8869115613539349, "flos": 23365863334080.0, "grad_norm": 1.8427868319665, "language_loss": 0.79481459, "learning_rate": 1.3255083448131288e-07, "loss": 0.82023001, "num_input_tokens_seen": 159524675, "step": 7376, "time_per_iteration": 2.7769932746887207 }, { "auxiliary_loss_clip": 0.01334369, "auxiliary_loss_mlp": 0.01193157, "balance_loss_clip": 1.00704646, "balance_loss_mlp": 1.00013328, "epoch": 0.8870318042445741, "flos": 21286850724000.0, "grad_norm": 2.4141503114256064, "language_loss": 0.79009247, "learning_rate": 1.3227211037262365e-07, "loss": 0.8153677, "num_input_tokens_seen": 159541915, "step": 7377, "time_per_iteration": 2.760953187942505 }, { "auxiliary_loss_clip": 0.0126998, "auxiliary_loss_mlp": 0.01193191, "balance_loss_clip": 1.00691807, "balance_loss_mlp": 1.00016773, "epoch": 0.8871520471352131, "flos": 20010872845440.0, "grad_norm": 2.58597406050468, "language_loss": 0.85457605, "learning_rate": 1.319936695967696e-07, "loss": 0.87920779, "num_input_tokens_seen": 159559740, "step": 7378, "time_per_iteration": 2.857140302658081 }, { "auxiliary_loss_clip": 0.01350257, "auxiliary_loss_mlp": 0.01193218, "balance_loss_clip": 1.00815332, "balance_loss_mlp": 1.00019455, "epoch": 0.8872722900258522, "flos": 22601433349440.0, "grad_norm": 2.0149813142755137, "language_loss": 0.82017243, "learning_rate": 1.3171551219599097e-07, "loss": 0.84560716, "num_input_tokens_seen": 159578265, "step": 7379, "time_per_iteration": 2.777606725692749 }, { "auxiliary_loss_clip": 0.0134729, "auxiliary_loss_mlp": 0.01193096, "balance_loss_clip": 1.0081259, "balance_loss_mlp": 1.00016761, "epoch": 0.8873925329164913, "flos": 22163287039200.0, "grad_norm": 1.9034405212071759, "language_loss": 0.78119373, "learning_rate": 1.3143763821248377e-07, "loss": 0.80659759, "num_input_tokens_seen": 159595350, "step": 7380, "time_per_iteration": 2.729640245437622 }, { "auxiliary_loss_clip": 0.01347269, "auxiliary_loss_mlp": 0.01193078, "balance_loss_clip": 1.0077436, "balance_loss_mlp": 1.00014997, "epoch": 0.8875127758071304, "flos": 19208233274400.0, "grad_norm": 1.744983882383079, "language_loss": 0.71766317, "learning_rate": 1.3116004768840118e-07, "loss": 0.74306667, "num_input_tokens_seen": 159613725, "step": 7381, "time_per_iteration": 2.784649133682251 }, { "auxiliary_loss_clip": 0.01347896, "auxiliary_loss_mlp": 0.0119321, "balance_loss_clip": 1.00774336, "balance_loss_mlp": 1.00018644, "epoch": 0.8876330186977694, "flos": 18110909658240.0, "grad_norm": 1.6456344738296464, "language_loss": 0.74118304, "learning_rate": 1.3088274066585348e-07, "loss": 0.76659417, "num_input_tokens_seen": 159631335, "step": 7382, "time_per_iteration": 2.6790785789489746 }, { "auxiliary_loss_clip": 0.01302345, "auxiliary_loss_mlp": 0.01193214, "balance_loss_clip": 1.0069629, "balance_loss_mlp": 1.00019085, "epoch": 0.8877532615884086, "flos": 22009442830560.0, "grad_norm": 2.093903419776489, "language_loss": 0.90296388, "learning_rate": 1.3060571718690749e-07, "loss": 0.92791951, "num_input_tokens_seen": 159648830, "step": 7383, "time_per_iteration": 2.797640085220337 }, { "auxiliary_loss_clip": 0.01271867, "auxiliary_loss_mlp": 0.0087188, "balance_loss_clip": 1.00361216, "balance_loss_mlp": 1.0001471, "epoch": 0.8878735044790477, "flos": 72136956078720.0, "grad_norm": 0.7433950447928498, "language_loss": 0.56901139, "learning_rate": 1.3032897729358805e-07, "loss": 0.59044892, "num_input_tokens_seen": 159709785, "step": 7384, "time_per_iteration": 3.375669002532959 }, { "auxiliary_loss_clip": 0.01266713, "auxiliary_loss_mlp": 0.00872573, "balance_loss_clip": 1.00719976, "balance_loss_mlp": 1.00047827, "epoch": 0.8879937473696867, "flos": 27526367288160.0, "grad_norm": 1.8217040026867761, "language_loss": 0.80160028, "learning_rate": 1.3005252102787645e-07, "loss": 0.82299304, "num_input_tokens_seen": 159728725, "step": 7385, "time_per_iteration": 2.907381772994995 }, { "auxiliary_loss_clip": 0.01331081, "auxiliary_loss_mlp": 0.01193225, "balance_loss_clip": 1.00743091, "balance_loss_mlp": 1.00020111, "epoch": 0.8881139902603259, "flos": 22234102117920.0, "grad_norm": 1.5280836087788696, "language_loss": 0.73436177, "learning_rate": 1.297763484317105e-07, "loss": 0.75960481, "num_input_tokens_seen": 159747020, "step": 7386, "time_per_iteration": 2.768277168273926 }, { "auxiliary_loss_clip": 0.01272989, "auxiliary_loss_mlp": 0.00872441, "balance_loss_clip": 1.00772953, "balance_loss_mlp": 1.00034595, "epoch": 0.888234233150965, "flos": 20299557636000.0, "grad_norm": 2.2103570375989805, "language_loss": 0.70224667, "learning_rate": 1.2950045954698551e-07, "loss": 0.72370094, "num_input_tokens_seen": 159764855, "step": 7387, "time_per_iteration": 2.8533334732055664 }, { "auxiliary_loss_clip": 0.01289401, "auxiliary_loss_mlp": 0.0119317, "balance_loss_clip": 1.00730562, "balance_loss_mlp": 1.00014639, "epoch": 0.888354476041604, "flos": 18148005610560.0, "grad_norm": 3.2057018447075207, "language_loss": 0.75588775, "learning_rate": 1.2922485441555343e-07, "loss": 0.7807135, "num_input_tokens_seen": 159783935, "step": 7388, "time_per_iteration": 2.816767454147339 }, { "auxiliary_loss_clip": 0.01347976, "auxiliary_loss_mlp": 0.01193207, "balance_loss_clip": 1.00754595, "balance_loss_mlp": 1.00018358, "epoch": 0.8884747189322432, "flos": 22014292527360.0, "grad_norm": 1.7555763076256712, "language_loss": 0.81526077, "learning_rate": 1.2894953307922363e-07, "loss": 0.84067261, "num_input_tokens_seen": 159802895, "step": 7389, "time_per_iteration": 2.7174477577209473 }, { "auxiliary_loss_clip": 0.01289931, "auxiliary_loss_mlp": 0.01193047, "balance_loss_clip": 1.00703955, "balance_loss_mlp": 1.00011873, "epoch": 0.8885949618228822, "flos": 19786788336960.0, "grad_norm": 1.90291615764206, "language_loss": 0.83579957, "learning_rate": 1.2867449557976208e-07, "loss": 0.86062938, "num_input_tokens_seen": 159820995, "step": 7390, "time_per_iteration": 2.825777769088745 }, { "auxiliary_loss_clip": 0.01324462, "auxiliary_loss_mlp": 0.01193108, "balance_loss_clip": 1.00807214, "balance_loss_mlp": 1.00018001, "epoch": 0.8887152047135213, "flos": 20047609560960.0, "grad_norm": 2.822250643728068, "language_loss": 0.75748682, "learning_rate": 1.283997419588916e-07, "loss": 0.78266251, "num_input_tokens_seen": 159840465, "step": 7391, "time_per_iteration": 2.8732492923736572 }, { "auxiliary_loss_clip": 0.01334872, "auxiliary_loss_mlp": 0.01192967, "balance_loss_clip": 1.00755727, "balance_loss_mlp": 1.00013399, "epoch": 0.8888354476041604, "flos": 18588127723200.0, "grad_norm": 1.9000092966955662, "language_loss": 0.61430871, "learning_rate": 1.2812527225829216e-07, "loss": 0.63958704, "num_input_tokens_seen": 159858690, "step": 7392, "time_per_iteration": 2.7942566871643066 }, { "auxiliary_loss_clip": 0.01332006, "auxiliary_loss_mlp": 0.0119331, "balance_loss_clip": 1.00775921, "balance_loss_mlp": 1.00019062, "epoch": 0.8889556904947995, "flos": 21689805036960.0, "grad_norm": 2.5726450362615245, "language_loss": 0.7640456, "learning_rate": 1.2785108651960052e-07, "loss": 0.78929877, "num_input_tokens_seen": 159880325, "step": 7393, "time_per_iteration": 2.732783079147339 }, { "auxiliary_loss_clip": 0.01335965, "auxiliary_loss_mlp": 0.01193209, "balance_loss_clip": 1.00780976, "balance_loss_mlp": 1.00018525, "epoch": 0.8890759333854386, "flos": 27381216610080.0, "grad_norm": 1.7576195812186677, "language_loss": 0.80228108, "learning_rate": 1.2757718478441094e-07, "loss": 0.82757282, "num_input_tokens_seen": 159901070, "step": 7394, "time_per_iteration": 2.768995523452759 }, { "auxiliary_loss_clip": 0.01314176, "auxiliary_loss_mlp": 0.01192996, "balance_loss_clip": 1.0070405, "balance_loss_mlp": 1.00016284, "epoch": 0.8891961762760777, "flos": 24498845955360.0, "grad_norm": 2.1645634772134104, "language_loss": 0.77257228, "learning_rate": 1.2730356709427302e-07, "loss": 0.79764402, "num_input_tokens_seen": 159919750, "step": 7395, "time_per_iteration": 3.7698771953582764 }, { "auxiliary_loss_clip": 0.01324745, "auxiliary_loss_mlp": 0.0119314, "balance_loss_clip": 1.0080024, "balance_loss_mlp": 1.00021172, "epoch": 0.8893164191667168, "flos": 41499786862080.0, "grad_norm": 1.4387457632354195, "language_loss": 0.5980953, "learning_rate": 1.2703023349069542e-07, "loss": 0.62327409, "num_input_tokens_seen": 159944600, "step": 7396, "time_per_iteration": 3.7750558853149414 }, { "auxiliary_loss_clip": 0.01321874, "auxiliary_loss_mlp": 0.0119308, "balance_loss_clip": 1.00729144, "balance_loss_mlp": 1.00015163, "epoch": 0.8894366620573558, "flos": 33583637221920.0, "grad_norm": 1.7097171730204441, "language_loss": 0.61674249, "learning_rate": 1.2675718401514223e-07, "loss": 0.64189208, "num_input_tokens_seen": 159968780, "step": 7397, "time_per_iteration": 4.939866781234741 }, { "auxiliary_loss_clip": 0.01312401, "auxiliary_loss_mlp": 0.01193185, "balance_loss_clip": 1.00745058, "balance_loss_mlp": 1.00016093, "epoch": 0.889556904947995, "flos": 16909842157920.0, "grad_norm": 2.109326215874619, "language_loss": 0.7421416, "learning_rate": 1.264844187090346e-07, "loss": 0.76719743, "num_input_tokens_seen": 159985905, "step": 7398, "time_per_iteration": 2.702946424484253 }, { "auxiliary_loss_clip": 0.01322716, "auxiliary_loss_mlp": 0.01192984, "balance_loss_clip": 1.00749695, "balance_loss_mlp": 1.00015068, "epoch": 0.889677147838634, "flos": 26030867208480.0, "grad_norm": 1.6059031019441423, "language_loss": 0.74968964, "learning_rate": 1.262119376137516e-07, "loss": 0.77484667, "num_input_tokens_seen": 160006965, "step": 7399, "time_per_iteration": 2.823209524154663 }, { "auxiliary_loss_clip": 0.01334397, "auxiliary_loss_mlp": 0.01192975, "balance_loss_clip": 1.00732493, "balance_loss_mlp": 1.00014234, "epoch": 0.8897973907292731, "flos": 26468295045120.0, "grad_norm": 1.5629907353310855, "language_loss": 0.84879088, "learning_rate": 1.2593974077062707e-07, "loss": 0.87406462, "num_input_tokens_seen": 160028585, "step": 7400, "time_per_iteration": 2.816389560699463 }, { "auxiliary_loss_clip": 0.01288233, "auxiliary_loss_mlp": 0.01193195, "balance_loss_clip": 1.00701582, "balance_loss_mlp": 1.00017166, "epoch": 0.8899176336199123, "flos": 26249706859680.0, "grad_norm": 1.5795651362998588, "language_loss": 0.6351552, "learning_rate": 1.2566782822095423e-07, "loss": 0.65996951, "num_input_tokens_seen": 160048840, "step": 7401, "time_per_iteration": 2.8592820167541504 }, { "auxiliary_loss_clip": 0.01294298, "auxiliary_loss_mlp": 0.01193161, "balance_loss_clip": 1.00717306, "balance_loss_mlp": 1.00013721, "epoch": 0.8900378765105513, "flos": 20811752156160.0, "grad_norm": 1.7036986048886806, "language_loss": 0.71380424, "learning_rate": 1.2539620000598162e-07, "loss": 0.73867881, "num_input_tokens_seen": 160068175, "step": 7402, "time_per_iteration": 2.8174853324890137 }, { "auxiliary_loss_clip": 0.01347691, "auxiliary_loss_mlp": 0.01192998, "balance_loss_clip": 1.00778246, "balance_loss_mlp": 1.00016522, "epoch": 0.8901581194011904, "flos": 16472342473920.0, "grad_norm": 1.7910224220851163, "language_loss": 0.79837132, "learning_rate": 1.2512485616691492e-07, "loss": 0.82377815, "num_input_tokens_seen": 160085230, "step": 7403, "time_per_iteration": 2.6595160961151123 }, { "auxiliary_loss_clip": 0.0130101, "auxiliary_loss_mlp": 0.0119314, "balance_loss_clip": 1.00801086, "balance_loss_mlp": 1.00021195, "epoch": 0.8902783622918296, "flos": 35155269084960.0, "grad_norm": 1.4558044442650975, "language_loss": 0.80786812, "learning_rate": 1.2485379674491681e-07, "loss": 0.83280957, "num_input_tokens_seen": 160111425, "step": 7404, "time_per_iteration": 2.8388803005218506 }, { "auxiliary_loss_clip": 0.01301542, "auxiliary_loss_mlp": 0.01193076, "balance_loss_clip": 1.00721812, "balance_loss_mlp": 1.0001477, "epoch": 0.8903986051824686, "flos": 17201077529760.0, "grad_norm": 2.1408536393909903, "language_loss": 0.79266, "learning_rate": 1.2458302178110657e-07, "loss": 0.81760621, "num_input_tokens_seen": 160129790, "step": 7405, "time_per_iteration": 2.7283973693847656 }, { "auxiliary_loss_clip": 0.01290038, "auxiliary_loss_mlp": 0.01193151, "balance_loss_clip": 1.00668359, "balance_loss_mlp": 1.00012779, "epoch": 0.8905188480731077, "flos": 25483875851520.0, "grad_norm": 1.7486193180363598, "language_loss": 0.82254469, "learning_rate": 1.2431253131656118e-07, "loss": 0.84737653, "num_input_tokens_seen": 160149265, "step": 7406, "time_per_iteration": 2.779616355895996 }, { "auxiliary_loss_clip": 0.01296998, "auxiliary_loss_mlp": 0.01193076, "balance_loss_clip": 1.00683856, "balance_loss_mlp": 1.00014734, "epoch": 0.8906390909637467, "flos": 23365899257760.0, "grad_norm": 1.887710860161898, "language_loss": 0.76732373, "learning_rate": 1.240423253923133e-07, "loss": 0.79222453, "num_input_tokens_seen": 160168870, "step": 7407, "time_per_iteration": 2.7938597202301025 }, { "auxiliary_loss_clip": 0.01337906, "auxiliary_loss_mlp": 0.01193229, "balance_loss_clip": 1.00852013, "balance_loss_mlp": 1.00020552, "epoch": 0.8907593338543859, "flos": 21068801393760.0, "grad_norm": 2.0967414689274317, "language_loss": 0.69406396, "learning_rate": 1.237724040493533e-07, "loss": 0.71937525, "num_input_tokens_seen": 160187495, "step": 7408, "time_per_iteration": 2.7808566093444824 }, { "auxiliary_loss_clip": 0.0134866, "auxiliary_loss_mlp": 0.01193203, "balance_loss_clip": 1.00838351, "balance_loss_mlp": 1.00017953, "epoch": 0.8908795767450249, "flos": 21869572933440.0, "grad_norm": 2.608910085988645, "language_loss": 0.72760814, "learning_rate": 1.2350276732862773e-07, "loss": 0.75302678, "num_input_tokens_seen": 160208520, "step": 7409, "time_per_iteration": 2.709519624710083 }, { "auxiliary_loss_clip": 0.01303694, "auxiliary_loss_mlp": 0.01192295, "balance_loss_clip": 1.00352275, "balance_loss_mlp": 1.00003421, "epoch": 0.890999819635664, "flos": 66307901876640.0, "grad_norm": 0.8354211428456101, "language_loss": 0.56724203, "learning_rate": 1.2323341527103993e-07, "loss": 0.59220195, "num_input_tokens_seen": 160263720, "step": 7410, "time_per_iteration": 3.279371500015259 }, { "auxiliary_loss_clip": 0.01347919, "auxiliary_loss_mlp": 0.01193102, "balance_loss_clip": 1.00795913, "balance_loss_mlp": 1.00017381, "epoch": 0.8911200625263032, "flos": 26869920181920.0, "grad_norm": 1.8539486256034354, "language_loss": 0.85005236, "learning_rate": 1.2296434791745135e-07, "loss": 0.87546259, "num_input_tokens_seen": 160282170, "step": 7411, "time_per_iteration": 2.836059808731079 }, { "auxiliary_loss_clip": 0.01327097, "auxiliary_loss_mlp": 0.01193174, "balance_loss_clip": 1.00741875, "balance_loss_mlp": 1.00014997, "epoch": 0.8912403054169422, "flos": 20885836289760.0, "grad_norm": 5.134576267542496, "language_loss": 0.76651394, "learning_rate": 1.2269556530867875e-07, "loss": 0.79171664, "num_input_tokens_seen": 160300725, "step": 7412, "time_per_iteration": 2.8087258338928223 }, { "auxiliary_loss_clip": 0.01349718, "auxiliary_loss_mlp": 0.01193248, "balance_loss_clip": 1.00826335, "balance_loss_mlp": 1.00022471, "epoch": 0.8913605483075813, "flos": 27016579654560.0, "grad_norm": 2.0079086720392305, "language_loss": 0.81674373, "learning_rate": 1.2242706748549614e-07, "loss": 0.84217346, "num_input_tokens_seen": 160318720, "step": 7413, "time_per_iteration": 2.747103214263916 }, { "auxiliary_loss_clip": 0.01323799, "auxiliary_loss_mlp": 0.01193196, "balance_loss_clip": 1.00746489, "balance_loss_mlp": 1.0001719, "epoch": 0.8914807911982204, "flos": 23621511548160.0, "grad_norm": 1.7138982376842995, "language_loss": 0.82059312, "learning_rate": 1.2215885448863473e-07, "loss": 0.84576309, "num_input_tokens_seen": 160339595, "step": 7414, "time_per_iteration": 2.8645312786102295 }, { "auxiliary_loss_clip": 0.01301069, "auxiliary_loss_mlp": 0.01193138, "balance_loss_clip": 1.0065583, "balance_loss_mlp": 1.0001142, "epoch": 0.8916010340888595, "flos": 24462288858240.0, "grad_norm": 1.7789944325861797, "language_loss": 0.80500698, "learning_rate": 1.2189092635878152e-07, "loss": 0.82994902, "num_input_tokens_seen": 160361045, "step": 7415, "time_per_iteration": 2.815814256668091 }, { "auxiliary_loss_clip": 0.01286645, "auxiliary_loss_mlp": 0.01193214, "balance_loss_clip": 1.00736105, "balance_loss_mlp": 1.00019062, "epoch": 0.8917212769794985, "flos": 21215784179520.0, "grad_norm": 1.5806221340089954, "language_loss": 0.7733047, "learning_rate": 1.216232831365822e-07, "loss": 0.79810333, "num_input_tokens_seen": 160379990, "step": 7416, "time_per_iteration": 2.7709262371063232 }, { "auxiliary_loss_clip": 0.01318461, "auxiliary_loss_mlp": 0.01193228, "balance_loss_clip": 1.00752366, "balance_loss_mlp": 1.00020432, "epoch": 0.8918415198701377, "flos": 25514002609920.0, "grad_norm": 1.6618149556981852, "language_loss": 0.80680227, "learning_rate": 1.2135592486263678e-07, "loss": 0.83191919, "num_input_tokens_seen": 160399240, "step": 7417, "time_per_iteration": 2.8844475746154785 }, { "auxiliary_loss_clip": 0.01306243, "auxiliary_loss_mlp": 0.01193048, "balance_loss_clip": 1.00666606, "balance_loss_mlp": 1.00011945, "epoch": 0.8919617627607768, "flos": 37853022146400.0, "grad_norm": 1.6466958383141332, "language_loss": 0.61223853, "learning_rate": 1.2108885157750415e-07, "loss": 0.63723147, "num_input_tokens_seen": 160421600, "step": 7418, "time_per_iteration": 2.8471148014068604 }, { "auxiliary_loss_clip": 0.0127779, "auxiliary_loss_mlp": 0.00872475, "balance_loss_clip": 1.00634181, "balance_loss_mlp": 1.00057447, "epoch": 0.8920820056514158, "flos": 26213688617760.0, "grad_norm": 1.65636977135069, "language_loss": 0.80354893, "learning_rate": 1.2082206332169897e-07, "loss": 0.82505167, "num_input_tokens_seen": 160441695, "step": 7419, "time_per_iteration": 3.0334625244140625 }, { "auxiliary_loss_clip": 0.01299653, "auxiliary_loss_mlp": 0.01193062, "balance_loss_clip": 1.00714421, "balance_loss_mlp": 1.00013399, "epoch": 0.892202248542055, "flos": 17383144541760.0, "grad_norm": 2.5442906466431463, "language_loss": 0.73447806, "learning_rate": 1.2055556013569225e-07, "loss": 0.7594052, "num_input_tokens_seen": 160457205, "step": 7420, "time_per_iteration": 2.8559389114379883 }, { "auxiliary_loss_clip": 0.01297605, "auxiliary_loss_mlp": 0.01193076, "balance_loss_clip": 1.00726199, "balance_loss_mlp": 1.00014782, "epoch": 0.892322491432694, "flos": 21324234065760.0, "grad_norm": 1.6932479079405436, "language_loss": 0.81865197, "learning_rate": 1.2028934205991315e-07, "loss": 0.84355879, "num_input_tokens_seen": 160476525, "step": 7421, "time_per_iteration": 3.689985513687134 }, { "auxiliary_loss_clip": 0.01335391, "auxiliary_loss_mlp": 0.01193176, "balance_loss_clip": 1.00796294, "balance_loss_mlp": 1.00015271, "epoch": 0.8924427343233331, "flos": 24029387405280.0, "grad_norm": 1.4088896784857101, "language_loss": 0.76728177, "learning_rate": 1.2002340913474607e-07, "loss": 0.79256749, "num_input_tokens_seen": 160500160, "step": 7422, "time_per_iteration": 3.7969248294830322 }, { "auxiliary_loss_clip": 0.01348387, "auxiliary_loss_mlp": 0.01193178, "balance_loss_clip": 1.0079186, "balance_loss_mlp": 1.00015473, "epoch": 0.8925629772139723, "flos": 30008082745440.0, "grad_norm": 2.195916005617042, "language_loss": 0.73716807, "learning_rate": 1.1975776140053317e-07, "loss": 0.76258373, "num_input_tokens_seen": 160520130, "step": 7423, "time_per_iteration": 4.943364143371582 }, { "auxiliary_loss_clip": 0.01276777, "auxiliary_loss_mlp": 0.01193199, "balance_loss_clip": 1.00719583, "balance_loss_mlp": 1.00017512, "epoch": 0.8926832201046113, "flos": 22601720738880.0, "grad_norm": 2.095618695776606, "language_loss": 0.73333716, "learning_rate": 1.194923988975729e-07, "loss": 0.75803697, "num_input_tokens_seen": 160539730, "step": 7424, "time_per_iteration": 2.8949484825134277 }, { "auxiliary_loss_clip": 0.01278638, "auxiliary_loss_mlp": 0.01193035, "balance_loss_clip": 1.00648141, "balance_loss_mlp": 1.00010681, "epoch": 0.8928034629952504, "flos": 13297730584320.0, "grad_norm": 2.1429482885413327, "language_loss": 0.73527825, "learning_rate": 1.192273216661206e-07, "loss": 0.75999504, "num_input_tokens_seen": 160557820, "step": 7425, "time_per_iteration": 2.796539306640625 }, { "auxiliary_loss_clip": 0.01234222, "auxiliary_loss_mlp": 0.01192288, "balance_loss_clip": 1.00307989, "balance_loss_mlp": 1.00002706, "epoch": 0.8929237058858895, "flos": 54854586964800.0, "grad_norm": 0.7682357340424529, "language_loss": 0.57542849, "learning_rate": 1.189625297463881e-07, "loss": 0.5996936, "num_input_tokens_seen": 160619510, "step": 7426, "time_per_iteration": 3.466083288192749 }, { "auxiliary_loss_clip": 0.01260894, "auxiliary_loss_mlp": 0.0119311, "balance_loss_clip": 1.00682628, "balance_loss_mlp": 1.0001812, "epoch": 0.8930439487765286, "flos": 28883865502080.0, "grad_norm": 1.7290347812279172, "language_loss": 0.79617631, "learning_rate": 1.1869802317854394e-07, "loss": 0.82071644, "num_input_tokens_seen": 160643295, "step": 7427, "time_per_iteration": 3.155533790588379 }, { "auxiliary_loss_clip": 0.01269836, "auxiliary_loss_mlp": 0.0119312, "balance_loss_clip": 1.00593197, "balance_loss_mlp": 1.00019145, "epoch": 0.8931641916671677, "flos": 22419294490080.0, "grad_norm": 1.890145541922063, "language_loss": 0.72412938, "learning_rate": 1.1843380200271425e-07, "loss": 0.74875891, "num_input_tokens_seen": 160662495, "step": 7428, "time_per_iteration": 3.0048463344573975 }, { "auxiliary_loss_clip": 0.01283443, "auxiliary_loss_mlp": 0.0119307, "balance_loss_clip": 1.00778663, "balance_loss_mlp": 1.0001415, "epoch": 0.8932844345578068, "flos": 25843160178720.0, "grad_norm": 1.7200511605846789, "language_loss": 0.8040905, "learning_rate": 1.181698662589805e-07, "loss": 0.82885563, "num_input_tokens_seen": 160682080, "step": 7429, "time_per_iteration": 2.9316623210906982 }, { "auxiliary_loss_clip": 0.01326041, "auxiliary_loss_mlp": 0.01193088, "balance_loss_clip": 1.0068233, "balance_loss_mlp": 1.00015962, "epoch": 0.8934046774484459, "flos": 22925813068800.0, "grad_norm": 1.7552601454511605, "language_loss": 0.7571196, "learning_rate": 1.1790621598738249e-07, "loss": 0.7823109, "num_input_tokens_seen": 160700395, "step": 7430, "time_per_iteration": 2.7559244632720947 }, { "auxiliary_loss_clip": 0.0134738, "auxiliary_loss_mlp": 0.01192976, "balance_loss_clip": 1.00821948, "balance_loss_mlp": 1.00014281, "epoch": 0.8935249203390849, "flos": 24462109239840.0, "grad_norm": 1.9048698404160165, "language_loss": 0.74627113, "learning_rate": 1.1764285122791461e-07, "loss": 0.77167469, "num_input_tokens_seen": 160721115, "step": 7431, "time_per_iteration": 2.781411647796631 }, { "auxiliary_loss_clip": 0.0133538, "auxiliary_loss_mlp": 0.01193048, "balance_loss_clip": 1.00766301, "balance_loss_mlp": 1.00011933, "epoch": 0.8936451632297241, "flos": 15742745249760.0, "grad_norm": 1.8424573899160932, "language_loss": 0.76965141, "learning_rate": 1.173797720205294e-07, "loss": 0.7949357, "num_input_tokens_seen": 160739150, "step": 7432, "time_per_iteration": 2.7632155418395996 }, { "auxiliary_loss_clip": 0.01324268, "auxiliary_loss_mlp": 0.01193222, "balance_loss_clip": 1.00738382, "balance_loss_mlp": 1.00019801, "epoch": 0.8937654061203631, "flos": 35115514780320.0, "grad_norm": 2.6071501534931896, "language_loss": 0.7146197, "learning_rate": 1.1711697840513602e-07, "loss": 0.73979461, "num_input_tokens_seen": 160758585, "step": 7433, "time_per_iteration": 2.846707344055176 }, { "auxiliary_loss_clip": 0.01335375, "auxiliary_loss_mlp": 0.01193081, "balance_loss_clip": 1.00775778, "balance_loss_mlp": 1.00015306, "epoch": 0.8938856490110022, "flos": 16107454052640.0, "grad_norm": 1.8643690049296409, "language_loss": 0.70637381, "learning_rate": 1.1685447042160012e-07, "loss": 0.73165846, "num_input_tokens_seen": 160776620, "step": 7434, "time_per_iteration": 2.7259521484375 }, { "auxiliary_loss_clip": 0.01348793, "auxiliary_loss_mlp": 0.01193079, "balance_loss_clip": 1.00775349, "balance_loss_mlp": 1.00015092, "epoch": 0.8940058919016414, "flos": 20704200361920.0, "grad_norm": 1.8523465895242877, "language_loss": 0.71568429, "learning_rate": 1.1659224810974367e-07, "loss": 0.74110305, "num_input_tokens_seen": 160796580, "step": 7435, "time_per_iteration": 2.703260660171509 }, { "auxiliary_loss_clip": 0.01301665, "auxiliary_loss_mlp": 0.01193128, "balance_loss_clip": 1.00729954, "balance_loss_mlp": 1.00020003, "epoch": 0.8941261347922804, "flos": 25229053882080.0, "grad_norm": 2.9502852393126293, "language_loss": 0.68414569, "learning_rate": 1.1633031150934591e-07, "loss": 0.70909363, "num_input_tokens_seen": 160819610, "step": 7436, "time_per_iteration": 2.785146951675415 }, { "auxiliary_loss_clip": 0.01326134, "auxiliary_loss_mlp": 0.01192875, "balance_loss_clip": 1.00774014, "balance_loss_mlp": 1.00013781, "epoch": 0.8942463776829195, "flos": 19537247148480.0, "grad_norm": 1.8188840316538328, "language_loss": 0.79756212, "learning_rate": 1.1606866066014176e-07, "loss": 0.82275224, "num_input_tokens_seen": 160838660, "step": 7437, "time_per_iteration": 2.7116825580596924 }, { "auxiliary_loss_clip": 0.01282929, "auxiliary_loss_mlp": 0.01193111, "balance_loss_clip": 1.00635386, "balance_loss_mlp": 1.0000875, "epoch": 0.8943666205735585, "flos": 22301576294400.0, "grad_norm": 2.063121803087544, "language_loss": 0.7523517, "learning_rate": 1.1580729560182434e-07, "loss": 0.77711213, "num_input_tokens_seen": 160854515, "step": 7438, "time_per_iteration": 3.061939239501953 }, { "auxiliary_loss_clip": 0.01347288, "auxiliary_loss_mlp": 0.00872448, "balance_loss_clip": 1.00761604, "balance_loss_mlp": 1.00036502, "epoch": 0.8944868634641977, "flos": 18912902603040.0, "grad_norm": 1.615030093426913, "language_loss": 0.70971859, "learning_rate": 1.1554621637404171e-07, "loss": 0.73191595, "num_input_tokens_seen": 160872605, "step": 7439, "time_per_iteration": 2.671865224838257 }, { "auxiliary_loss_clip": 0.01327673, "auxiliary_loss_mlp": 0.01193088, "balance_loss_clip": 1.0072763, "balance_loss_mlp": 1.00015926, "epoch": 0.8946071063548368, "flos": 14460911811360.0, "grad_norm": 2.2277928587516556, "language_loss": 0.60637295, "learning_rate": 1.1528542301639999e-07, "loss": 0.63158053, "num_input_tokens_seen": 160889395, "step": 7440, "time_per_iteration": 2.750579595565796 }, { "auxiliary_loss_clip": 0.01289989, "auxiliary_loss_mlp": 0.01193184, "balance_loss_clip": 1.00621676, "balance_loss_mlp": 1.00016022, "epoch": 0.8947273492454758, "flos": 20084094810720.0, "grad_norm": 2.236103912819673, "language_loss": 0.82170361, "learning_rate": 1.1502491556846105e-07, "loss": 0.84653538, "num_input_tokens_seen": 160907890, "step": 7441, "time_per_iteration": 2.8709497451782227 }, { "auxiliary_loss_clip": 0.01306404, "auxiliary_loss_mlp": 0.01193165, "balance_loss_clip": 1.00700629, "balance_loss_mlp": 1.00014091, "epoch": 0.894847592136115, "flos": 18550564763040.0, "grad_norm": 2.29960575588685, "language_loss": 0.81199443, "learning_rate": 1.1476469406974331e-07, "loss": 0.83699018, "num_input_tokens_seen": 160923490, "step": 7442, "time_per_iteration": 2.8813250064849854 }, { "auxiliary_loss_clip": 0.01347546, "auxiliary_loss_mlp": 0.01192999, "balance_loss_clip": 1.00817919, "balance_loss_mlp": 1.00016618, "epoch": 0.894967835026754, "flos": 23478480367200.0, "grad_norm": 1.6538481237032787, "language_loss": 0.76950824, "learning_rate": 1.1450475855972341e-07, "loss": 0.79491377, "num_input_tokens_seen": 160944280, "step": 7443, "time_per_iteration": 2.7673377990722656 }, { "auxiliary_loss_clip": 0.01312391, "auxiliary_loss_mlp": 0.00872482, "balance_loss_clip": 1.00707698, "balance_loss_mlp": 1.000453, "epoch": 0.8950880779173931, "flos": 15188317691040.0, "grad_norm": 2.03595367083032, "language_loss": 0.70649362, "learning_rate": 1.1424510907783158e-07, "loss": 0.72834235, "num_input_tokens_seen": 160961560, "step": 7444, "time_per_iteration": 2.762951135635376 }, { "auxiliary_loss_clip": 0.01323577, "auxiliary_loss_mlp": 0.01193008, "balance_loss_clip": 1.0075798, "balance_loss_mlp": 1.00017524, "epoch": 0.8952083208080323, "flos": 22091969028960.0, "grad_norm": 1.5384327468896064, "language_loss": 0.82753372, "learning_rate": 1.1398574566345787e-07, "loss": 0.85269964, "num_input_tokens_seen": 160982195, "step": 7445, "time_per_iteration": 2.8159983158111572 }, { "auxiliary_loss_clip": 0.01317116, "auxiliary_loss_mlp": 0.01193067, "balance_loss_clip": 1.00720501, "balance_loss_mlp": 1.00013852, "epoch": 0.8953285636986713, "flos": 23254036621920.0, "grad_norm": 1.9066963572083153, "language_loss": 0.82446533, "learning_rate": 1.1372666835594702e-07, "loss": 0.84956712, "num_input_tokens_seen": 161000520, "step": 7446, "time_per_iteration": 2.83923077583313 }, { "auxiliary_loss_clip": 0.01303431, "auxiliary_loss_mlp": 0.01193079, "balance_loss_clip": 1.00694251, "balance_loss_mlp": 1.00015056, "epoch": 0.8954488065893104, "flos": 16362671182560.0, "grad_norm": 2.4723459945207473, "language_loss": 0.71562237, "learning_rate": 1.1346787719460071e-07, "loss": 0.74058747, "num_input_tokens_seen": 161019405, "step": 7447, "time_per_iteration": 2.8048715591430664 }, { "auxiliary_loss_clip": 0.0130524, "auxiliary_loss_mlp": 0.01193201, "balance_loss_clip": 1.00682318, "balance_loss_mlp": 1.00017715, "epoch": 0.8955690494799495, "flos": 18257892444000.0, "grad_norm": 1.7451721620394804, "language_loss": 0.72056901, "learning_rate": 1.1320937221867732e-07, "loss": 0.74555337, "num_input_tokens_seen": 161036985, "step": 7448, "time_per_iteration": 4.564257621765137 }, { "auxiliary_loss_clip": 0.01321693, "auxiliary_loss_mlp": 0.0119299, "balance_loss_clip": 1.00768661, "balance_loss_mlp": 1.00015688, "epoch": 0.8956892923705886, "flos": 25447498372800.0, "grad_norm": 1.6421837058077662, "language_loss": 0.79565048, "learning_rate": 1.1295115346739192e-07, "loss": 0.82079732, "num_input_tokens_seen": 161056985, "step": 7449, "time_per_iteration": 3.8836162090301514 }, { "auxiliary_loss_clip": 0.0130834, "auxiliary_loss_mlp": 0.01193186, "balance_loss_clip": 1.00678277, "balance_loss_mlp": 1.0001626, "epoch": 0.8958095352612276, "flos": 52661902325760.0, "grad_norm": 2.5813057132063544, "language_loss": 0.72918046, "learning_rate": 1.1269322097991629e-07, "loss": 0.75419569, "num_input_tokens_seen": 161080270, "step": 7450, "time_per_iteration": 3.9594690799713135 }, { "auxiliary_loss_clip": 0.0132543, "auxiliary_loss_mlp": 0.01193279, "balance_loss_clip": 1.00800061, "balance_loss_mlp": 1.00015998, "epoch": 0.8959297781518668, "flos": 23186346903360.0, "grad_norm": 1.9108385507720609, "language_loss": 0.67976189, "learning_rate": 1.1243557479537846e-07, "loss": 0.70494902, "num_input_tokens_seen": 161100160, "step": 7451, "time_per_iteration": 2.7380709648132324 }, { "auxiliary_loss_clip": 0.01347733, "auxiliary_loss_mlp": 0.01193073, "balance_loss_clip": 1.00785923, "balance_loss_mlp": 1.00014472, "epoch": 0.8960500210425059, "flos": 20334318549120.0, "grad_norm": 2.1565383815802535, "language_loss": 0.68628323, "learning_rate": 1.121782149528634e-07, "loss": 0.71169126, "num_input_tokens_seen": 161117260, "step": 7452, "time_per_iteration": 2.7075657844543457 }, { "auxiliary_loss_clip": 0.01297734, "auxiliary_loss_mlp": 0.01193209, "balance_loss_clip": 1.00810313, "balance_loss_mlp": 1.00018549, "epoch": 0.8961702639331449, "flos": 19901704485600.0, "grad_norm": 2.485879790141811, "language_loss": 0.78853875, "learning_rate": 1.1192114149141208e-07, "loss": 0.81344819, "num_input_tokens_seen": 161136895, "step": 7453, "time_per_iteration": 2.786773204803467 }, { "auxiliary_loss_clip": 0.01316268, "auxiliary_loss_mlp": 0.01193219, "balance_loss_clip": 1.00751448, "balance_loss_mlp": 1.00019538, "epoch": 0.8962905068237841, "flos": 12896356913280.0, "grad_norm": 2.1234828716812544, "language_loss": 0.65292442, "learning_rate": 1.1166435445002197e-07, "loss": 0.67801929, "num_input_tokens_seen": 161154565, "step": 7454, "time_per_iteration": 2.7335093021392822 }, { "auxiliary_loss_clip": 0.01324861, "auxiliary_loss_mlp": 0.01193065, "balance_loss_clip": 1.0072403, "balance_loss_mlp": 1.00013661, "epoch": 0.8964107497144231, "flos": 23440342628160.0, "grad_norm": 1.8348556216753151, "language_loss": 0.68669629, "learning_rate": 1.1140785386764818e-07, "loss": 0.71187556, "num_input_tokens_seen": 161173265, "step": 7455, "time_per_iteration": 2.7908389568328857 }, { "auxiliary_loss_clip": 0.0133543, "auxiliary_loss_mlp": 0.01193105, "balance_loss_clip": 1.0081501, "balance_loss_mlp": 1.00017631, "epoch": 0.8965309926050622, "flos": 19500187119840.0, "grad_norm": 1.9798889657358911, "language_loss": 0.69943142, "learning_rate": 1.1115163978320153e-07, "loss": 0.72471678, "num_input_tokens_seen": 161191995, "step": 7456, "time_per_iteration": 2.6741998195648193 }, { "auxiliary_loss_clip": 0.01336118, "auxiliary_loss_mlp": 0.00872532, "balance_loss_clip": 1.00801682, "balance_loss_mlp": 1.00045156, "epoch": 0.8966512354957014, "flos": 28658020733280.0, "grad_norm": 2.734539875763685, "language_loss": 0.82393968, "learning_rate": 1.1089571223554917e-07, "loss": 0.84602618, "num_input_tokens_seen": 161212880, "step": 7457, "time_per_iteration": 2.7721993923187256 }, { "auxiliary_loss_clip": 0.01335325, "auxiliary_loss_mlp": 0.011932, "balance_loss_clip": 1.00765848, "balance_loss_mlp": 1.00017607, "epoch": 0.8967714783863404, "flos": 23370928572960.0, "grad_norm": 1.7210014946312089, "language_loss": 0.85572302, "learning_rate": 1.1064007126351537e-07, "loss": 0.88100827, "num_input_tokens_seen": 161233595, "step": 7458, "time_per_iteration": 2.7787296772003174 }, { "auxiliary_loss_clip": 0.01299911, "auxiliary_loss_mlp": 0.01193161, "balance_loss_clip": 1.00687861, "balance_loss_mlp": 1.00013769, "epoch": 0.8968917212769795, "flos": 24535187510400.0, "grad_norm": 2.0189696885734816, "language_loss": 0.76391697, "learning_rate": 1.1038471690588003e-07, "loss": 0.78884763, "num_input_tokens_seen": 161252740, "step": 7459, "time_per_iteration": 2.780042886734009 }, { "auxiliary_loss_clip": 0.01268433, "auxiliary_loss_mlp": 0.01192998, "balance_loss_clip": 1.00673151, "balance_loss_mlp": 1.00016487, "epoch": 0.8970119641676186, "flos": 23475426854400.0, "grad_norm": 2.137319801991043, "language_loss": 0.79893959, "learning_rate": 1.1012964920138145e-07, "loss": 0.82355386, "num_input_tokens_seen": 161272325, "step": 7460, "time_per_iteration": 2.910996198654175 }, { "auxiliary_loss_clip": 0.0132371, "auxiliary_loss_mlp": 0.01193178, "balance_loss_clip": 1.00760579, "balance_loss_mlp": 1.00015438, "epoch": 0.8971322070582577, "flos": 24538205099520.0, "grad_norm": 1.4497581044775547, "language_loss": 0.75764096, "learning_rate": 1.0987486818871205e-07, "loss": 0.78280985, "num_input_tokens_seen": 161295915, "step": 7461, "time_per_iteration": 2.7950432300567627 }, { "auxiliary_loss_clip": 0.01325296, "auxiliary_loss_mlp": 0.00872493, "balance_loss_clip": 1.00755036, "balance_loss_mlp": 1.0004437, "epoch": 0.8972524499488967, "flos": 21797464602240.0, "grad_norm": 2.257466526003879, "language_loss": 0.73330188, "learning_rate": 1.0962037390652245e-07, "loss": 0.75527984, "num_input_tokens_seen": 161314935, "step": 7462, "time_per_iteration": 2.7464756965637207 }, { "auxiliary_loss_clip": 0.01312451, "auxiliary_loss_mlp": 0.01193187, "balance_loss_clip": 1.00795841, "balance_loss_mlp": 1.00016356, "epoch": 0.8973726928395359, "flos": 21726254363040.0, "grad_norm": 1.6146338601187666, "language_loss": 0.71985525, "learning_rate": 1.0936616639341911e-07, "loss": 0.74491167, "num_input_tokens_seen": 161335225, "step": 7463, "time_per_iteration": 2.787792921066284 }, { "auxiliary_loss_clip": 0.01289328, "auxiliary_loss_mlp": 0.01192267, "balance_loss_clip": 1.00399959, "balance_loss_mlp": 1.00000656, "epoch": 0.897492935730175, "flos": 53837131194720.0, "grad_norm": 0.7383578544613882, "language_loss": 0.54810178, "learning_rate": 1.0911224568796473e-07, "loss": 0.5729177, "num_input_tokens_seen": 161393420, "step": 7464, "time_per_iteration": 3.28662371635437 }, { "auxiliary_loss_clip": 0.01326099, "auxiliary_loss_mlp": 0.01193034, "balance_loss_clip": 1.00758028, "balance_loss_mlp": 1.00020051, "epoch": 0.897613178620814, "flos": 18290354241600.0, "grad_norm": 1.727691230078017, "language_loss": 0.71114677, "learning_rate": 1.0885861182867984e-07, "loss": 0.73633814, "num_input_tokens_seen": 161411525, "step": 7465, "time_per_iteration": 2.7309679985046387 }, { "auxiliary_loss_clip": 0.01313489, "auxiliary_loss_mlp": 0.01193065, "balance_loss_clip": 1.0073278, "balance_loss_mlp": 1.00013638, "epoch": 0.8977334215114532, "flos": 32993730276480.0, "grad_norm": 1.7790996895807367, "language_loss": 0.70912051, "learning_rate": 1.0860526485403942e-07, "loss": 0.73418605, "num_input_tokens_seen": 161432800, "step": 7466, "time_per_iteration": 2.8071985244750977 }, { "auxiliary_loss_clip": 0.01347752, "auxiliary_loss_mlp": 0.01193065, "balance_loss_clip": 1.00782847, "balance_loss_mlp": 1.00023222, "epoch": 0.8978536644020922, "flos": 15195646121760.0, "grad_norm": 1.5636692089397024, "language_loss": 0.77433932, "learning_rate": 1.0835220480247675e-07, "loss": 0.79974747, "num_input_tokens_seen": 161451295, "step": 7467, "time_per_iteration": 2.743534803390503 }, { "auxiliary_loss_clip": 0.0130351, "auxiliary_loss_mlp": 0.0119318, "balance_loss_clip": 1.00715137, "balance_loss_mlp": 1.00015628, "epoch": 0.8979739072927313, "flos": 18004399650720.0, "grad_norm": 2.0952097674933507, "language_loss": 0.83276194, "learning_rate": 1.0809943171238067e-07, "loss": 0.85772884, "num_input_tokens_seen": 161469220, "step": 7468, "time_per_iteration": 2.745332717895508 }, { "auxiliary_loss_clip": 0.01312835, "auxiliary_loss_mlp": 0.01193222, "balance_loss_clip": 1.00736845, "balance_loss_mlp": 1.00019789, "epoch": 0.8980941501833704, "flos": 22271557307040.0, "grad_norm": 1.963304667066628, "language_loss": 0.62927997, "learning_rate": 1.078469456220965e-07, "loss": 0.65434051, "num_input_tokens_seen": 161489375, "step": 7469, "time_per_iteration": 2.7880070209503174 }, { "auxiliary_loss_clip": 0.01318408, "auxiliary_loss_mlp": 0.01192985, "balance_loss_clip": 1.0076108, "balance_loss_mlp": 1.00015175, "epoch": 0.8982143930740095, "flos": 37560744987840.0, "grad_norm": 1.6899348027640633, "language_loss": 0.69855845, "learning_rate": 1.0759474656992606e-07, "loss": 0.72367233, "num_input_tokens_seen": 161512145, "step": 7470, "time_per_iteration": 2.9036498069763184 }, { "auxiliary_loss_clip": 0.01322423, "auxiliary_loss_mlp": 0.01193109, "balance_loss_clip": 1.00768447, "balance_loss_mlp": 1.00018096, "epoch": 0.8983346359646486, "flos": 18076903142400.0, "grad_norm": 2.182879964228036, "language_loss": 0.77471226, "learning_rate": 1.0734283459412785e-07, "loss": 0.79986763, "num_input_tokens_seen": 161528995, "step": 7471, "time_per_iteration": 2.9020256996154785 }, { "auxiliary_loss_clip": 0.01277529, "auxiliary_loss_mlp": 0.01193176, "balance_loss_clip": 1.00724936, "balance_loss_mlp": 1.00015247, "epoch": 0.8984548788552876, "flos": 20558905989120.0, "grad_norm": 1.718529903147449, "language_loss": 0.80439764, "learning_rate": 1.0709120973291707e-07, "loss": 0.82910472, "num_input_tokens_seen": 161548775, "step": 7472, "time_per_iteration": 2.815342664718628 }, { "auxiliary_loss_clip": 0.01348192, "auxiliary_loss_mlp": 0.01193182, "balance_loss_clip": 1.00808859, "balance_loss_mlp": 1.00015807, "epoch": 0.8985751217459268, "flos": 17785452228480.0, "grad_norm": 2.054565348491827, "language_loss": 0.7733084, "learning_rate": 1.0683987202446475e-07, "loss": 0.79872215, "num_input_tokens_seen": 161566960, "step": 7473, "time_per_iteration": 2.684540271759033 }, { "auxiliary_loss_clip": 0.0133612, "auxiliary_loss_mlp": 0.01193202, "balance_loss_clip": 1.00773382, "balance_loss_mlp": 1.00017834, "epoch": 0.8986953646365659, "flos": 21617014155840.0, "grad_norm": 2.106835018183573, "language_loss": 0.6990304, "learning_rate": 1.0658882150689862e-07, "loss": 0.72432357, "num_input_tokens_seen": 161585820, "step": 7474, "time_per_iteration": 4.553081274032593 }, { "auxiliary_loss_clip": 0.01291892, "auxiliary_loss_mlp": 0.01193191, "balance_loss_clip": 1.00734448, "balance_loss_mlp": 1.00016701, "epoch": 0.8988156075272049, "flos": 14027363732160.0, "grad_norm": 2.3143532371715114, "language_loss": 0.78177786, "learning_rate": 1.0633805821830288e-07, "loss": 0.80662864, "num_input_tokens_seen": 161602505, "step": 7475, "time_per_iteration": 3.884977340698242 }, { "auxiliary_loss_clip": 0.01310984, "auxiliary_loss_mlp": 0.01193191, "balance_loss_clip": 1.00789356, "balance_loss_mlp": 1.00016689, "epoch": 0.8989358504178441, "flos": 29059214785920.0, "grad_norm": 2.5444920184237354, "language_loss": 0.82582855, "learning_rate": 1.0608758219671753e-07, "loss": 0.85087025, "num_input_tokens_seen": 161621545, "step": 7476, "time_per_iteration": 3.7374839782714844 }, { "auxiliary_loss_clip": 0.01318271, "auxiliary_loss_mlp": 0.0119321, "balance_loss_clip": 1.00738525, "balance_loss_mlp": 1.00018668, "epoch": 0.8990560933084831, "flos": 20230430970240.0, "grad_norm": 1.6381221547394567, "language_loss": 0.70425487, "learning_rate": 1.0583739348014065e-07, "loss": 0.7293697, "num_input_tokens_seen": 161642630, "step": 7477, "time_per_iteration": 2.763503313064575 }, { "auxiliary_loss_clip": 0.0134885, "auxiliary_loss_mlp": 0.01193097, "balance_loss_clip": 1.00842023, "balance_loss_mlp": 1.0001688, "epoch": 0.8991763361991222, "flos": 25520684414400.0, "grad_norm": 1.7740006621664326, "language_loss": 0.84557837, "learning_rate": 1.0558749210652518e-07, "loss": 0.87099791, "num_input_tokens_seen": 161662560, "step": 7478, "time_per_iteration": 2.7849130630493164 }, { "auxiliary_loss_clip": 0.01296884, "auxiliary_loss_mlp": 0.01193172, "balance_loss_clip": 1.00678539, "balance_loss_mlp": 1.00014853, "epoch": 0.8992965790897613, "flos": 25119202972320.0, "grad_norm": 1.5477095332600348, "language_loss": 0.85545135, "learning_rate": 1.053378781137808e-07, "loss": 0.8803519, "num_input_tokens_seen": 161683480, "step": 7479, "time_per_iteration": 2.8510985374450684 }, { "auxiliary_loss_clip": 0.01296452, "auxiliary_loss_mlp": 0.01193137, "balance_loss_clip": 1.0072583, "balance_loss_mlp": 1.00020862, "epoch": 0.8994168219804004, "flos": 16070825108160.0, "grad_norm": 1.6544779301300954, "language_loss": 0.77574545, "learning_rate": 1.0508855153977392e-07, "loss": 0.80064136, "num_input_tokens_seen": 161699945, "step": 7480, "time_per_iteration": 2.7617156505584717 }, { "auxiliary_loss_clip": 0.01334994, "auxiliary_loss_mlp": 0.01193213, "balance_loss_clip": 1.0075444, "balance_loss_mlp": 1.0001893, "epoch": 0.8995370648710395, "flos": 24825776256000.0, "grad_norm": 2.139996532420149, "language_loss": 0.668176, "learning_rate": 1.0483951242232669e-07, "loss": 0.69345808, "num_input_tokens_seen": 161720420, "step": 7481, "time_per_iteration": 2.7582554817199707 }, { "auxiliary_loss_clip": 0.01315705, "auxiliary_loss_mlp": 0.01192278, "balance_loss_clip": 1.00357759, "balance_loss_mlp": 1.00001764, "epoch": 0.8996573077616786, "flos": 63116275371840.0, "grad_norm": 0.9769604526798834, "language_loss": 0.57819688, "learning_rate": 1.0459076079921936e-07, "loss": 0.60327673, "num_input_tokens_seen": 161773080, "step": 7482, "time_per_iteration": 3.3064281940460205 }, { "auxiliary_loss_clip": 0.01313015, "auxiliary_loss_mlp": 0.01192993, "balance_loss_clip": 1.00775361, "balance_loss_mlp": 1.00015974, "epoch": 0.8997775506523177, "flos": 18219682857600.0, "grad_norm": 2.2571387136116607, "language_loss": 0.85085249, "learning_rate": 1.0434229670818618e-07, "loss": 0.87591261, "num_input_tokens_seen": 161789755, "step": 7483, "time_per_iteration": 2.7091329097747803 }, { "auxiliary_loss_clip": 0.01309882, "auxiliary_loss_mlp": 0.01193158, "balance_loss_clip": 1.00760365, "balance_loss_mlp": 1.00013447, "epoch": 0.8998977935429567, "flos": 24166778568480.0, "grad_norm": 1.4078631468529632, "language_loss": 0.80144703, "learning_rate": 1.0409412018691944e-07, "loss": 0.82647747, "num_input_tokens_seen": 161810220, "step": 7484, "time_per_iteration": 2.8669536113739014 }, { "auxiliary_loss_clip": 0.01312609, "auxiliary_loss_mlp": 0.01193204, "balance_loss_clip": 1.00771832, "balance_loss_mlp": 1.0001806, "epoch": 0.9000180364335959, "flos": 20773039638240.0, "grad_norm": 3.0562474870357508, "language_loss": 0.75164509, "learning_rate": 1.0384623127306724e-07, "loss": 0.77670318, "num_input_tokens_seen": 161827565, "step": 7485, "time_per_iteration": 2.7905468940734863 }, { "auxiliary_loss_clip": 0.01303224, "auxiliary_loss_mlp": 0.01193099, "balance_loss_clip": 1.00755572, "balance_loss_mlp": 1.00017071, "epoch": 0.900138279324235, "flos": 19205754540480.0, "grad_norm": 1.702823099859963, "language_loss": 0.79556018, "learning_rate": 1.0359863000423397e-07, "loss": 0.82052338, "num_input_tokens_seen": 161845700, "step": 7486, "time_per_iteration": 2.8109142780303955 }, { "auxiliary_loss_clip": 0.01348226, "auxiliary_loss_mlp": 0.01193123, "balance_loss_clip": 1.00760508, "balance_loss_mlp": 1.00019503, "epoch": 0.900258522214874, "flos": 28731170851200.0, "grad_norm": 1.5553548049965893, "language_loss": 0.72016102, "learning_rate": 1.0335131641798112e-07, "loss": 0.74557453, "num_input_tokens_seen": 161867660, "step": 7487, "time_per_iteration": 2.7402968406677246 }, { "auxiliary_loss_clip": 0.01282332, "auxiliary_loss_mlp": 0.01192272, "balance_loss_clip": 1.00354528, "balance_loss_mlp": 1.00001097, "epoch": 0.9003787651055132, "flos": 58280715845280.0, "grad_norm": 0.8029056721890059, "language_loss": 0.55648685, "learning_rate": 1.0310429055182512e-07, "loss": 0.58123291, "num_input_tokens_seen": 161921980, "step": 7488, "time_per_iteration": 3.1328139305114746 }, { "auxiliary_loss_clip": 0.01294061, "auxiliary_loss_mlp": 0.01193171, "balance_loss_clip": 1.00708818, "balance_loss_mlp": 1.0001471, "epoch": 0.9004990079961522, "flos": 25556487114240.0, "grad_norm": 1.645471000203932, "language_loss": 0.73788726, "learning_rate": 1.0285755244324024e-07, "loss": 0.76275957, "num_input_tokens_seen": 161942725, "step": 7489, "time_per_iteration": 2.8085246086120605 }, { "auxiliary_loss_clip": 0.01317846, "auxiliary_loss_mlp": 0.00872399, "balance_loss_clip": 1.00697589, "balance_loss_mlp": 1.00038838, "epoch": 0.9006192508867913, "flos": 23335197720480.0, "grad_norm": 1.5182347860244283, "language_loss": 0.68761724, "learning_rate": 1.0261110212965629e-07, "loss": 0.70951974, "num_input_tokens_seen": 161964520, "step": 7490, "time_per_iteration": 2.841597557067871 }, { "auxiliary_loss_clip": 0.01314768, "auxiliary_loss_mlp": 0.01193225, "balance_loss_clip": 1.00730717, "balance_loss_mlp": 1.00020146, "epoch": 0.9007394937774305, "flos": 18040310121600.0, "grad_norm": 2.2773824071618494, "language_loss": 0.79243076, "learning_rate": 1.023649396484596e-07, "loss": 0.81751072, "num_input_tokens_seen": 161983575, "step": 7491, "time_per_iteration": 2.7336161136627197 }, { "auxiliary_loss_clip": 0.01347741, "auxiliary_loss_mlp": 0.01193191, "balance_loss_clip": 1.00775564, "balance_loss_mlp": 1.00016785, "epoch": 0.9008597366680695, "flos": 43068472983360.0, "grad_norm": 1.7778240098165745, "language_loss": 0.67579997, "learning_rate": 1.0211906503699275e-07, "loss": 0.70120931, "num_input_tokens_seen": 162006550, "step": 7492, "time_per_iteration": 2.928544759750366 }, { "auxiliary_loss_clip": 0.01327513, "auxiliary_loss_mlp": 0.01193227, "balance_loss_clip": 1.007833, "balance_loss_mlp": 1.00020373, "epoch": 0.9009799795587086, "flos": 14939063892000.0, "grad_norm": 2.2261580518004003, "language_loss": 0.82175595, "learning_rate": 1.0187347833255455e-07, "loss": 0.84696335, "num_input_tokens_seen": 162022455, "step": 7493, "time_per_iteration": 2.6893155574798584 }, { "auxiliary_loss_clip": 0.01347395, "auxiliary_loss_mlp": 0.01193084, "balance_loss_clip": 1.00809908, "balance_loss_mlp": 1.00015569, "epoch": 0.9011002224493477, "flos": 21579594890400.0, "grad_norm": 1.9761361409472111, "language_loss": 0.79153097, "learning_rate": 1.0162817957240056e-07, "loss": 0.81693572, "num_input_tokens_seen": 162042350, "step": 7494, "time_per_iteration": 2.7458927631378174 }, { "auxiliary_loss_clip": 0.01299996, "auxiliary_loss_mlp": 0.01192266, "balance_loss_clip": 1.00344491, "balance_loss_mlp": 1.00000525, "epoch": 0.9012204653399868, "flos": 71166403425600.0, "grad_norm": 0.8836455520400092, "language_loss": 0.63055146, "learning_rate": 1.0138316879374253e-07, "loss": 0.65547407, "num_input_tokens_seen": 162111640, "step": 7495, "time_per_iteration": 3.506521701812744 }, { "auxiliary_loss_clip": 0.01307643, "auxiliary_loss_mlp": 0.01193011, "balance_loss_clip": 1.007447, "balance_loss_mlp": 1.00017762, "epoch": 0.9013407082306258, "flos": 15594972143040.0, "grad_norm": 2.126395351034701, "language_loss": 0.74543536, "learning_rate": 1.0113844603374833e-07, "loss": 0.77044189, "num_input_tokens_seen": 162128165, "step": 7496, "time_per_iteration": 2.789257049560547 }, { "auxiliary_loss_clip": 0.01315013, "auxiliary_loss_mlp": 0.01192988, "balance_loss_clip": 1.00685608, "balance_loss_mlp": 1.00015497, "epoch": 0.901460951121265, "flos": 15049166267520.0, "grad_norm": 1.9381711937811106, "language_loss": 0.71752465, "learning_rate": 1.0089401132954178e-07, "loss": 0.74260467, "num_input_tokens_seen": 162146145, "step": 7497, "time_per_iteration": 2.7658751010894775 }, { "auxiliary_loss_clip": 0.01309938, "auxiliary_loss_mlp": 0.01193073, "balance_loss_clip": 1.00733757, "balance_loss_mlp": 1.00014448, "epoch": 0.9015811940119041, "flos": 22236868241280.0, "grad_norm": 1.9735294520098137, "language_loss": 0.72428024, "learning_rate": 1.006498647182037e-07, "loss": 0.74931031, "num_input_tokens_seen": 162164800, "step": 7498, "time_per_iteration": 2.8372225761413574 }, { "auxiliary_loss_clip": 0.01271763, "auxiliary_loss_mlp": 0.01193063, "balance_loss_clip": 1.00695038, "balance_loss_mlp": 1.00013494, "epoch": 0.9017014369025431, "flos": 24973836752160.0, "grad_norm": 2.0542451604848098, "language_loss": 0.71585822, "learning_rate": 1.004060062367713e-07, "loss": 0.74050653, "num_input_tokens_seen": 162185895, "step": 7499, "time_per_iteration": 2.8967525959014893 }, { "auxiliary_loss_clip": 0.01330401, "auxiliary_loss_mlp": 0.01193098, "balance_loss_clip": 1.007231, "balance_loss_mlp": 1.00016975, "epoch": 0.9018216797931822, "flos": 18114178713120.0, "grad_norm": 1.8007236428847486, "language_loss": 0.69573593, "learning_rate": 1.0016243592223728e-07, "loss": 0.72097093, "num_input_tokens_seen": 162206295, "step": 7500, "time_per_iteration": 4.567962646484375 }, { "auxiliary_loss_clip": 0.01256623, "auxiliary_loss_mlp": 0.01192977, "balance_loss_clip": 1.00709581, "balance_loss_mlp": 1.00014353, "epoch": 0.9019419226838213, "flos": 37268467829280.0, "grad_norm": 2.2061826363866652, "language_loss": 0.65566695, "learning_rate": 9.991915381155114e-08, "loss": 0.68016291, "num_input_tokens_seen": 162229275, "step": 7501, "time_per_iteration": 4.123645305633545 }, { "auxiliary_loss_clip": 0.01336674, "auxiliary_loss_mlp": 0.01192903, "balance_loss_clip": 1.00805759, "balance_loss_mlp": 1.0001657, "epoch": 0.9020621655744604, "flos": 23441132949120.0, "grad_norm": 2.0366564487310086, "language_loss": 0.75005585, "learning_rate": 9.967615994161871e-08, "loss": 0.77535164, "num_input_tokens_seen": 162248935, "step": 7502, "time_per_iteration": 3.6824898719787598 }, { "auxiliary_loss_clip": 0.0134777, "auxiliary_loss_mlp": 0.01192969, "balance_loss_clip": 1.00795674, "balance_loss_mlp": 1.00013578, "epoch": 0.9021824084650995, "flos": 22857476724000.0, "grad_norm": 2.3472218972936627, "language_loss": 0.7811501, "learning_rate": 9.943345434930161e-08, "loss": 0.80655742, "num_input_tokens_seen": 162269185, "step": 7503, "time_per_iteration": 2.6698484420776367 }, { "auxiliary_loss_clip": 0.01281529, "auxiliary_loss_mlp": 0.01193004, "balance_loss_clip": 1.00650012, "balance_loss_mlp": 1.00017095, "epoch": 0.9023026513557386, "flos": 22127592110400.0, "grad_norm": 1.843281827124278, "language_loss": 0.69127882, "learning_rate": 9.919103707141885e-08, "loss": 0.71602416, "num_input_tokens_seen": 162288065, "step": 7504, "time_per_iteration": 2.745854139328003 }, { "auxiliary_loss_clip": 0.01323237, "auxiliary_loss_mlp": 0.0119312, "balance_loss_clip": 1.00775826, "balance_loss_mlp": 1.00019169, "epoch": 0.9024228942463777, "flos": 24199096671360.0, "grad_norm": 2.448884545960563, "language_loss": 0.75961488, "learning_rate": 9.89489081447441e-08, "loss": 0.78477842, "num_input_tokens_seen": 162305265, "step": 7505, "time_per_iteration": 2.811375379562378 }, { "auxiliary_loss_clip": 0.013152, "auxiliary_loss_mlp": 0.01193216, "balance_loss_clip": 1.00733209, "balance_loss_mlp": 1.00019217, "epoch": 0.9025431371370167, "flos": 25008274352160.0, "grad_norm": 1.8323095940903695, "language_loss": 0.82663167, "learning_rate": 9.870706760600844e-08, "loss": 0.8517158, "num_input_tokens_seen": 162325215, "step": 7506, "time_per_iteration": 2.731666326522827 }, { "auxiliary_loss_clip": 0.01254648, "auxiliary_loss_mlp": 0.01193109, "balance_loss_clip": 1.00713181, "balance_loss_mlp": 1.0001806, "epoch": 0.9026633800276559, "flos": 18952872449760.0, "grad_norm": 1.9634140187317635, "language_loss": 0.72888482, "learning_rate": 9.846551549189918e-08, "loss": 0.75336242, "num_input_tokens_seen": 162344820, "step": 7507, "time_per_iteration": 2.863071918487549 }, { "auxiliary_loss_clip": 0.01301042, "auxiliary_loss_mlp": 0.01193173, "balance_loss_clip": 1.00665641, "balance_loss_mlp": 1.00014961, "epoch": 0.902783622918295, "flos": 32416073305920.0, "grad_norm": 1.8979191821485992, "language_loss": 0.68127376, "learning_rate": 9.822425183905902e-08, "loss": 0.70621592, "num_input_tokens_seen": 162365345, "step": 7508, "time_per_iteration": 2.783752679824829 }, { "auxiliary_loss_clip": 0.01266534, "auxiliary_loss_mlp": 0.01192271, "balance_loss_clip": 1.00331712, "balance_loss_mlp": 1.00001049, "epoch": 0.902903865808934, "flos": 63717485067360.0, "grad_norm": 0.9119328504213298, "language_loss": 0.75327241, "learning_rate": 9.798327668408823e-08, "loss": 0.77786052, "num_input_tokens_seen": 162426980, "step": 7509, "time_per_iteration": 3.4720702171325684 }, { "auxiliary_loss_clip": 0.0134985, "auxiliary_loss_mlp": 0.01193222, "balance_loss_clip": 1.00821757, "balance_loss_mlp": 1.00019836, "epoch": 0.9030241086995732, "flos": 23804045568000.0, "grad_norm": 1.785399412167222, "language_loss": 0.69013596, "learning_rate": 9.774259006354158e-08, "loss": 0.71556669, "num_input_tokens_seen": 162447050, "step": 7510, "time_per_iteration": 2.7158889770507812 }, { "auxiliary_loss_clip": 0.0131759, "auxiliary_loss_mlp": 0.01193142, "balance_loss_clip": 1.00701737, "balance_loss_mlp": 1.00011873, "epoch": 0.9031443515902122, "flos": 26395899324480.0, "grad_norm": 1.7722416437424582, "language_loss": 0.76366097, "learning_rate": 9.750219201393184e-08, "loss": 0.78876829, "num_input_tokens_seen": 162467015, "step": 7511, "time_per_iteration": 2.8108224868774414 }, { "auxiliary_loss_clip": 0.01326241, "auxiliary_loss_mlp": 0.01193259, "balance_loss_clip": 1.00707686, "balance_loss_mlp": 1.0002352, "epoch": 0.9032645944808513, "flos": 24939363228480.0, "grad_norm": 1.6762048343783733, "language_loss": 0.77679491, "learning_rate": 9.726208257172697e-08, "loss": 0.80198991, "num_input_tokens_seen": 162488710, "step": 7512, "time_per_iteration": 2.8014302253723145 }, { "auxiliary_loss_clip": 0.01348067, "auxiliary_loss_mlp": 0.01193097, "balance_loss_clip": 1.00819182, "balance_loss_mlp": 1.00016844, "epoch": 0.9033848373714904, "flos": 21178831921920.0, "grad_norm": 1.966920848318367, "language_loss": 0.74644643, "learning_rate": 9.702226177335115e-08, "loss": 0.7718581, "num_input_tokens_seen": 162507205, "step": 7513, "time_per_iteration": 2.6935534477233887 }, { "auxiliary_loss_clip": 0.01304657, "auxiliary_loss_mlp": 0.01193183, "balance_loss_clip": 1.00725722, "balance_loss_mlp": 1.00015903, "epoch": 0.9035050802621295, "flos": 26286371727840.0, "grad_norm": 1.6155750352202272, "language_loss": 0.7261914, "learning_rate": 9.67827296551853e-08, "loss": 0.75116974, "num_input_tokens_seen": 162528490, "step": 7514, "time_per_iteration": 2.8007655143737793 }, { "auxiliary_loss_clip": 0.01324228, "auxiliary_loss_mlp": 0.00872449, "balance_loss_clip": 1.00861549, "balance_loss_mlp": 1.00047386, "epoch": 0.9036253231527686, "flos": 24204557070720.0, "grad_norm": 1.8635541730980063, "language_loss": 0.68878865, "learning_rate": 9.65434862535659e-08, "loss": 0.71075547, "num_input_tokens_seen": 162547860, "step": 7515, "time_per_iteration": 2.812880039215088 }, { "auxiliary_loss_clip": 0.01308692, "auxiliary_loss_mlp": 0.01193083, "balance_loss_clip": 1.0063504, "balance_loss_mlp": 1.00015473, "epoch": 0.9037455660434077, "flos": 18072664148160.0, "grad_norm": 4.999985315109179, "language_loss": 0.64880675, "learning_rate": 9.630453160478635e-08, "loss": 0.67382455, "num_input_tokens_seen": 162563215, "step": 7516, "time_per_iteration": 2.755669116973877 }, { "auxiliary_loss_clip": 0.01280305, "auxiliary_loss_mlp": 0.01193188, "balance_loss_clip": 1.0065527, "balance_loss_mlp": 1.00016427, "epoch": 0.9038658089340468, "flos": 24060807416160.0, "grad_norm": 1.5462840242333786, "language_loss": 0.82392639, "learning_rate": 9.60658657450959e-08, "loss": 0.84866136, "num_input_tokens_seen": 162583515, "step": 7517, "time_per_iteration": 2.8606317043304443 }, { "auxiliary_loss_clip": 0.01309523, "auxiliary_loss_mlp": 0.01192986, "balance_loss_clip": 1.00693095, "balance_loss_mlp": 1.00015295, "epoch": 0.9039860518246858, "flos": 21834309088800.0, "grad_norm": 1.6312760415034244, "language_loss": 0.79424715, "learning_rate": 9.582748871069979e-08, "loss": 0.81927222, "num_input_tokens_seen": 162602955, "step": 7518, "time_per_iteration": 2.769043445587158 }, { "auxiliary_loss_clip": 0.01316416, "auxiliary_loss_mlp": 0.0087247, "balance_loss_clip": 1.00712085, "balance_loss_mlp": 1.0004108, "epoch": 0.904106294715325, "flos": 26614882670400.0, "grad_norm": 1.921296238057225, "language_loss": 0.8273623, "learning_rate": 9.558940053775954e-08, "loss": 0.84925121, "num_input_tokens_seen": 162621595, "step": 7519, "time_per_iteration": 2.8255293369293213 }, { "auxiliary_loss_clip": 0.01326275, "auxiliary_loss_mlp": 0.01193163, "balance_loss_clip": 1.00738728, "balance_loss_mlp": 1.00013936, "epoch": 0.904226537605964, "flos": 17785703694240.0, "grad_norm": 1.8058035212804944, "language_loss": 0.67610699, "learning_rate": 9.535160126239294e-08, "loss": 0.70130134, "num_input_tokens_seen": 162638220, "step": 7520, "time_per_iteration": 2.673847198486328 }, { "auxiliary_loss_clip": 0.01327091, "auxiliary_loss_mlp": 0.0119312, "balance_loss_clip": 1.00767159, "balance_loss_mlp": 1.00019133, "epoch": 0.9043467804966031, "flos": 24790440564000.0, "grad_norm": 1.5430709098198712, "language_loss": 0.7099539, "learning_rate": 9.511409092067424e-08, "loss": 0.73515606, "num_input_tokens_seen": 162658575, "step": 7521, "time_per_iteration": 2.780120849609375 }, { "auxiliary_loss_clip": 0.01302362, "auxiliary_loss_mlp": 0.01193072, "balance_loss_clip": 1.00690055, "balance_loss_mlp": 1.000144, "epoch": 0.9044670233872423, "flos": 22632134886720.0, "grad_norm": 1.6744720880070003, "language_loss": 0.67379129, "learning_rate": 9.487686954863327e-08, "loss": 0.69874561, "num_input_tokens_seen": 162678295, "step": 7522, "time_per_iteration": 2.747994899749756 }, { "auxiliary_loss_clip": 0.01326397, "auxiliary_loss_mlp": 0.01193188, "balance_loss_clip": 1.00734615, "balance_loss_mlp": 1.00016451, "epoch": 0.9045872662778813, "flos": 23771332304640.0, "grad_norm": 2.4140989109686974, "language_loss": 0.77275133, "learning_rate": 9.46399371822566e-08, "loss": 0.79794723, "num_input_tokens_seen": 162698070, "step": 7523, "time_per_iteration": 2.8061180114746094 }, { "auxiliary_loss_clip": 0.01348431, "auxiliary_loss_mlp": 0.01193088, "balance_loss_clip": 1.00811946, "balance_loss_mlp": 1.00015974, "epoch": 0.9047075091685204, "flos": 15191047890720.0, "grad_norm": 1.8682475872627522, "language_loss": 0.72254705, "learning_rate": 9.440329385748657e-08, "loss": 0.74796224, "num_input_tokens_seen": 162715140, "step": 7524, "time_per_iteration": 2.7194406986236572 }, { "auxiliary_loss_clip": 0.01290961, "auxiliary_loss_mlp": 0.01192869, "balance_loss_clip": 1.00720394, "balance_loss_mlp": 1.00013149, "epoch": 0.9048277520591596, "flos": 18003717100800.0, "grad_norm": 1.75008048270723, "language_loss": 0.70230061, "learning_rate": 9.416693961022137e-08, "loss": 0.72713894, "num_input_tokens_seen": 162733390, "step": 7525, "time_per_iteration": 2.8159923553466797 }, { "auxiliary_loss_clip": 0.01256223, "auxiliary_loss_mlp": 0.01192974, "balance_loss_clip": 1.00693977, "balance_loss_mlp": 1.00014067, "epoch": 0.9049479949497986, "flos": 21872482751520.0, "grad_norm": 1.5987444057763478, "language_loss": 0.77273822, "learning_rate": 9.393087447631654e-08, "loss": 0.79723012, "num_input_tokens_seen": 162751670, "step": 7526, "time_per_iteration": 4.731189966201782 }, { "auxiliary_loss_clip": 0.01300549, "auxiliary_loss_mlp": 0.01192997, "balance_loss_clip": 1.00705767, "balance_loss_mlp": 1.00016451, "epoch": 0.9050682378404377, "flos": 20773937730240.0, "grad_norm": 1.6465605287273586, "language_loss": 0.72577912, "learning_rate": 9.36950984915823e-08, "loss": 0.7507146, "num_input_tokens_seen": 162770025, "step": 7527, "time_per_iteration": 3.8407342433929443 }, { "auxiliary_loss_clip": 0.01348516, "auxiliary_loss_mlp": 0.01193204, "balance_loss_clip": 1.0080719, "balance_loss_mlp": 1.00018001, "epoch": 0.9051884807310768, "flos": 21580025974560.0, "grad_norm": 1.6805988023063767, "language_loss": 0.69160414, "learning_rate": 9.345961169178607e-08, "loss": 0.71702135, "num_input_tokens_seen": 162789710, "step": 7528, "time_per_iteration": 3.6747231483459473 }, { "auxiliary_loss_clip": 0.01286898, "auxiliary_loss_mlp": 0.01193089, "balance_loss_clip": 1.00793743, "balance_loss_mlp": 1.00016069, "epoch": 0.9053087236217159, "flos": 21908069909280.0, "grad_norm": 1.4960009280869722, "language_loss": 0.72870779, "learning_rate": 9.322441411265081e-08, "loss": 0.75350761, "num_input_tokens_seen": 162810695, "step": 7529, "time_per_iteration": 2.875988721847534 }, { "auxiliary_loss_clip": 0.01311561, "auxiliary_loss_mlp": 0.01193181, "balance_loss_clip": 1.00737786, "balance_loss_mlp": 1.00015736, "epoch": 0.9054289665123549, "flos": 17055819080640.0, "grad_norm": 1.8997165213988856, "language_loss": 0.73346329, "learning_rate": 9.298950578985554e-08, "loss": 0.75851065, "num_input_tokens_seen": 162827770, "step": 7530, "time_per_iteration": 2.7322747707366943 }, { "auxiliary_loss_clip": 0.01321316, "auxiliary_loss_mlp": 0.0087257, "balance_loss_clip": 1.00760078, "balance_loss_mlp": 1.00046217, "epoch": 0.905549209402994, "flos": 20777278632480.0, "grad_norm": 1.6568349667061617, "language_loss": 0.7119925, "learning_rate": 9.275488675903665e-08, "loss": 0.73393136, "num_input_tokens_seen": 162846715, "step": 7531, "time_per_iteration": 2.7375690937042236 }, { "auxiliary_loss_clip": 0.01271456, "auxiliary_loss_mlp": 0.01193186, "balance_loss_clip": 1.00722444, "balance_loss_mlp": 1.00016236, "epoch": 0.9056694522936332, "flos": 21686823371520.0, "grad_norm": 1.8597366322977353, "language_loss": 0.73492062, "learning_rate": 9.252055705578454e-08, "loss": 0.75956702, "num_input_tokens_seen": 162866215, "step": 7532, "time_per_iteration": 2.831214666366577 }, { "auxiliary_loss_clip": 0.01334369, "auxiliary_loss_mlp": 0.01192982, "balance_loss_clip": 1.00734496, "balance_loss_mlp": 1.00014901, "epoch": 0.9057896951842722, "flos": 29569146114240.0, "grad_norm": 1.5797380497488067, "language_loss": 0.72044647, "learning_rate": 9.228651671564747e-08, "loss": 0.74572003, "num_input_tokens_seen": 162888245, "step": 7533, "time_per_iteration": 2.7993156909942627 }, { "auxiliary_loss_clip": 0.01264204, "auxiliary_loss_mlp": 0.01193069, "balance_loss_clip": 1.00662994, "balance_loss_mlp": 1.00014055, "epoch": 0.9059099380749113, "flos": 27892261572480.0, "grad_norm": 1.4821843849765088, "language_loss": 0.77851439, "learning_rate": 9.205276577412901e-08, "loss": 0.80308717, "num_input_tokens_seen": 162911025, "step": 7534, "time_per_iteration": 2.8947854042053223 }, { "auxiliary_loss_clip": 0.01322745, "auxiliary_loss_mlp": 0.00872589, "balance_loss_clip": 1.00770164, "balance_loss_mlp": 1.00053239, "epoch": 0.9060301809655504, "flos": 17749002902400.0, "grad_norm": 2.866483129835721, "language_loss": 0.77206713, "learning_rate": 9.181930426668905e-08, "loss": 0.79402041, "num_input_tokens_seen": 162927820, "step": 7535, "time_per_iteration": 2.749624490737915 }, { "auxiliary_loss_clip": 0.01272195, "auxiliary_loss_mlp": 0.01193018, "balance_loss_clip": 1.00842714, "balance_loss_mlp": 1.00018466, "epoch": 0.9061504238561895, "flos": 31759446581280.0, "grad_norm": 1.5651501670930394, "language_loss": 0.67678565, "learning_rate": 9.158613222874346e-08, "loss": 0.70143777, "num_input_tokens_seen": 162949445, "step": 7536, "time_per_iteration": 2.946592092514038 }, { "auxiliary_loss_clip": 0.01311155, "auxiliary_loss_mlp": 0.01193209, "balance_loss_clip": 1.00711989, "balance_loss_mlp": 1.00018489, "epoch": 0.9062706667468285, "flos": 20048076568800.0, "grad_norm": 1.4716458241047727, "language_loss": 0.81945777, "learning_rate": 9.135324969566394e-08, "loss": 0.84450138, "num_input_tokens_seen": 162968945, "step": 7537, "time_per_iteration": 2.7557287216186523 }, { "auxiliary_loss_clip": 0.01336296, "auxiliary_loss_mlp": 0.01193163, "balance_loss_clip": 1.00824976, "balance_loss_mlp": 1.00013947, "epoch": 0.9063909096374677, "flos": 18437301103680.0, "grad_norm": 2.37739781294427, "language_loss": 0.75499189, "learning_rate": 9.112065670277913e-08, "loss": 0.78028655, "num_input_tokens_seen": 162985310, "step": 7538, "time_per_iteration": 2.8448307514190674 }, { "auxiliary_loss_clip": 0.01316075, "auxiliary_loss_mlp": 0.01192991, "balance_loss_clip": 1.00693893, "balance_loss_mlp": 1.00015807, "epoch": 0.9065111525281068, "flos": 33547367514240.0, "grad_norm": 1.6509872663030807, "language_loss": 0.72689474, "learning_rate": 9.088835328537303e-08, "loss": 0.75198543, "num_input_tokens_seen": 163006900, "step": 7539, "time_per_iteration": 2.9218010902404785 }, { "auxiliary_loss_clip": 0.01315046, "auxiliary_loss_mlp": 0.01193184, "balance_loss_clip": 1.00754666, "balance_loss_mlp": 1.00016081, "epoch": 0.9066313954187458, "flos": 23367875060160.0, "grad_norm": 2.202089242613318, "language_loss": 0.70941091, "learning_rate": 9.065633947868568e-08, "loss": 0.73449326, "num_input_tokens_seen": 163026505, "step": 7540, "time_per_iteration": 2.7886006832122803 }, { "auxiliary_loss_clip": 0.01279923, "auxiliary_loss_mlp": 0.00872397, "balance_loss_clip": 1.00629425, "balance_loss_mlp": 1.00035894, "epoch": 0.906751638309385, "flos": 26249635012320.0, "grad_norm": 1.9586004144674058, "language_loss": 0.79684818, "learning_rate": 9.042461531791379e-08, "loss": 0.81837142, "num_input_tokens_seen": 163044925, "step": 7541, "time_per_iteration": 2.904670000076294 }, { "auxiliary_loss_clip": 0.01346984, "auxiliary_loss_mlp": 0.01193082, "balance_loss_clip": 1.00779366, "balance_loss_mlp": 1.00015414, "epoch": 0.906871881200024, "flos": 16544486728800.0, "grad_norm": 1.7610022808801318, "language_loss": 0.77689171, "learning_rate": 9.019318083820903e-08, "loss": 0.80229235, "num_input_tokens_seen": 163063505, "step": 7542, "time_per_iteration": 2.6333487033843994 }, { "auxiliary_loss_clip": 0.01323649, "auxiliary_loss_mlp": 0.01193209, "balance_loss_clip": 1.00725079, "balance_loss_mlp": 1.00018501, "epoch": 0.9069921240906631, "flos": 24605140420800.0, "grad_norm": 1.567210610343266, "language_loss": 0.85316616, "learning_rate": 8.996203607468045e-08, "loss": 0.8783347, "num_input_tokens_seen": 163082505, "step": 7543, "time_per_iteration": 2.8908534049987793 }, { "auxiliary_loss_clip": 0.01336234, "auxiliary_loss_mlp": 0.01193153, "balance_loss_clip": 1.00744581, "balance_loss_mlp": 1.00012946, "epoch": 0.9071123669813023, "flos": 25374743415360.0, "grad_norm": 1.3386126090832458, "language_loss": 0.75492585, "learning_rate": 8.973118106239241e-08, "loss": 0.78021973, "num_input_tokens_seen": 163105110, "step": 7544, "time_per_iteration": 2.778120756149292 }, { "auxiliary_loss_clip": 0.01273896, "auxiliary_loss_mlp": 0.01193062, "balance_loss_clip": 1.00694656, "balance_loss_mlp": 1.00013375, "epoch": 0.9072326098719413, "flos": 26725811290560.0, "grad_norm": 1.899927088480527, "language_loss": 0.94474995, "learning_rate": 8.95006158363656e-08, "loss": 0.96941954, "num_input_tokens_seen": 163125295, "step": 7545, "time_per_iteration": 2.891512393951416 }, { "auxiliary_loss_clip": 0.01325277, "auxiliary_loss_mlp": 0.01193217, "balance_loss_clip": 1.00808764, "balance_loss_mlp": 1.00019348, "epoch": 0.9073528527625804, "flos": 23878812251520.0, "grad_norm": 1.6323567706326907, "language_loss": 0.77523047, "learning_rate": 8.9270340431576e-08, "loss": 0.8004154, "num_input_tokens_seen": 163144385, "step": 7546, "time_per_iteration": 2.7616052627563477 }, { "auxiliary_loss_clip": 0.01330373, "auxiliary_loss_mlp": 0.0119318, "balance_loss_clip": 1.00684738, "balance_loss_mlp": 1.00015688, "epoch": 0.9074730956532195, "flos": 37852159978080.0, "grad_norm": 2.0552604412057898, "language_loss": 0.7320019, "learning_rate": 8.904035488295658e-08, "loss": 0.75723749, "num_input_tokens_seen": 163163885, "step": 7547, "time_per_iteration": 2.827068567276001 }, { "auxiliary_loss_clip": 0.01298601, "auxiliary_loss_mlp": 0.00871955, "balance_loss_clip": 1.00352859, "balance_loss_mlp": 1.00020683, "epoch": 0.9075933385438586, "flos": 65173338613440.0, "grad_norm": 0.6578812069976426, "language_loss": 0.53287894, "learning_rate": 8.881065922539632e-08, "loss": 0.5545845, "num_input_tokens_seen": 163224325, "step": 7548, "time_per_iteration": 3.262723445892334 }, { "auxiliary_loss_clip": 0.01276027, "auxiliary_loss_mlp": 0.0119313, "balance_loss_clip": 1.00603771, "balance_loss_mlp": 1.00020158, "epoch": 0.9077135814344977, "flos": 19931579778240.0, "grad_norm": 1.8734873375639238, "language_loss": 0.73230815, "learning_rate": 8.85812534937389e-08, "loss": 0.75699967, "num_input_tokens_seen": 163242425, "step": 7549, "time_per_iteration": 2.7976181507110596 }, { "auxiliary_loss_clip": 0.01320822, "auxiliary_loss_mlp": 0.01193182, "balance_loss_clip": 1.00807393, "balance_loss_mlp": 1.00015795, "epoch": 0.9078338243251368, "flos": 17529660319680.0, "grad_norm": 2.385290661943391, "language_loss": 0.66727221, "learning_rate": 8.835213772278583e-08, "loss": 0.6924122, "num_input_tokens_seen": 163259280, "step": 7550, "time_per_iteration": 2.7275681495666504 }, { "auxiliary_loss_clip": 0.01275255, "auxiliary_loss_mlp": 0.01193168, "balance_loss_clip": 1.00638473, "balance_loss_mlp": 1.00014424, "epoch": 0.9079540672157759, "flos": 28803422877120.0, "grad_norm": 1.749012760435369, "language_loss": 0.78889, "learning_rate": 8.812331194729373e-08, "loss": 0.81357419, "num_input_tokens_seen": 163278925, "step": 7551, "time_per_iteration": 2.8479909896850586 }, { "auxiliary_loss_clip": 0.01348653, "auxiliary_loss_mlp": 0.01193242, "balance_loss_clip": 1.00826335, "balance_loss_mlp": 1.00021827, "epoch": 0.9080743101064149, "flos": 23513851982880.0, "grad_norm": 2.48987444665161, "language_loss": 0.71828723, "learning_rate": 8.789477620197461e-08, "loss": 0.74370623, "num_input_tokens_seen": 163298450, "step": 7552, "time_per_iteration": 4.505397796630859 }, { "auxiliary_loss_clip": 0.01312784, "auxiliary_loss_mlp": 0.01193202, "balance_loss_clip": 1.00755858, "balance_loss_mlp": 1.00017858, "epoch": 0.9081945529970541, "flos": 22778111809440.0, "grad_norm": 2.2973500019993827, "language_loss": 0.79136908, "learning_rate": 8.766653052149831e-08, "loss": 0.81642896, "num_input_tokens_seen": 163313635, "step": 7553, "time_per_iteration": 3.8811137676239014 }, { "auxiliary_loss_clip": 0.01305222, "auxiliary_loss_mlp": 0.01193157, "balance_loss_clip": 1.00695467, "balance_loss_mlp": 1.00013375, "epoch": 0.9083147958876931, "flos": 18873723077280.0, "grad_norm": 1.926606383960453, "language_loss": 0.74532449, "learning_rate": 8.743857494048823e-08, "loss": 0.77030826, "num_input_tokens_seen": 163330450, "step": 7554, "time_per_iteration": 3.783212900161743 }, { "auxiliary_loss_clip": 0.01297036, "auxiliary_loss_mlp": 0.01193165, "balance_loss_clip": 1.0079248, "balance_loss_mlp": 1.00014102, "epoch": 0.9084350387783322, "flos": 18909382082400.0, "grad_norm": 2.3314954776112833, "language_loss": 0.62473297, "learning_rate": 8.721090949352605e-08, "loss": 0.64963502, "num_input_tokens_seen": 163346690, "step": 7555, "time_per_iteration": 2.778292179107666 }, { "auxiliary_loss_clip": 0.01322777, "auxiliary_loss_mlp": 0.01193237, "balance_loss_clip": 1.00862944, "balance_loss_mlp": 1.00021362, "epoch": 0.9085552816689714, "flos": 20595499009920.0, "grad_norm": 2.0810754154019513, "language_loss": 0.72918516, "learning_rate": 8.698353421514793e-08, "loss": 0.7543453, "num_input_tokens_seen": 163365065, "step": 7556, "time_per_iteration": 2.7593045234680176 }, { "auxiliary_loss_clip": 0.01325606, "auxiliary_loss_mlp": 0.01193157, "balance_loss_clip": 1.00706422, "balance_loss_mlp": 1.00022888, "epoch": 0.9086755245596104, "flos": 18113172850080.0, "grad_norm": 4.260188119302614, "language_loss": 0.80140042, "learning_rate": 8.67564491398467e-08, "loss": 0.82658803, "num_input_tokens_seen": 163382070, "step": 7557, "time_per_iteration": 2.7368180751800537 }, { "auxiliary_loss_clip": 0.01335075, "auxiliary_loss_mlp": 0.01193095, "balance_loss_clip": 1.00758553, "balance_loss_mlp": 1.00016665, "epoch": 0.9087957674502495, "flos": 19129802375520.0, "grad_norm": 1.6494818209591169, "language_loss": 0.73191154, "learning_rate": 8.652965430207104e-08, "loss": 0.75719321, "num_input_tokens_seen": 163399975, "step": 7558, "time_per_iteration": 2.680337429046631 }, { "auxiliary_loss_clip": 0.01334127, "auxiliary_loss_mlp": 0.0119301, "balance_loss_clip": 1.00751019, "balance_loss_mlp": 1.00017643, "epoch": 0.9089160103408886, "flos": 18109939718880.0, "grad_norm": 1.93673611315213, "language_loss": 0.654504, "learning_rate": 8.630314973622521e-08, "loss": 0.67977536, "num_input_tokens_seen": 163417520, "step": 7559, "time_per_iteration": 2.7756588459014893 }, { "auxiliary_loss_clip": 0.01322884, "auxiliary_loss_mlp": 0.01192955, "balance_loss_clip": 1.0074079, "balance_loss_mlp": 1.00012207, "epoch": 0.9090362532315277, "flos": 33364869418080.0, "grad_norm": 2.0727575169019237, "language_loss": 0.70594317, "learning_rate": 8.607693547666995e-08, "loss": 0.73110151, "num_input_tokens_seen": 163440060, "step": 7560, "time_per_iteration": 2.8263044357299805 }, { "auxiliary_loss_clip": 0.01271671, "auxiliary_loss_mlp": 0.0119227, "balance_loss_clip": 1.00337362, "balance_loss_mlp": 1.00000978, "epoch": 0.9091564961221668, "flos": 71480616743520.0, "grad_norm": 0.8897118352891925, "language_loss": 0.58061165, "learning_rate": 8.585101155772201e-08, "loss": 0.60525107, "num_input_tokens_seen": 163502180, "step": 7561, "time_per_iteration": 3.442909002304077 }, { "auxiliary_loss_clip": 0.01324929, "auxiliary_loss_mlp": 0.01193093, "balance_loss_clip": 1.00798929, "balance_loss_mlp": 1.00016475, "epoch": 0.9092767390128058, "flos": 24712584444000.0, "grad_norm": 1.6170710786497544, "language_loss": 0.68519956, "learning_rate": 8.562537801365377e-08, "loss": 0.71037978, "num_input_tokens_seen": 163521915, "step": 7562, "time_per_iteration": 2.792391061782837 }, { "auxiliary_loss_clip": 0.01348441, "auxiliary_loss_mlp": 0.01193206, "balance_loss_clip": 1.00790024, "balance_loss_mlp": 1.00018263, "epoch": 0.909396981903445, "flos": 23586499169280.0, "grad_norm": 1.6136108069962547, "language_loss": 0.69961935, "learning_rate": 8.540003487869362e-08, "loss": 0.72503591, "num_input_tokens_seen": 163543585, "step": 7563, "time_per_iteration": 2.6954078674316406 }, { "auxiliary_loss_clip": 0.01287972, "auxiliary_loss_mlp": 0.01193167, "balance_loss_clip": 1.00738525, "balance_loss_mlp": 1.00014353, "epoch": 0.909517224794084, "flos": 23404180691520.0, "grad_norm": 1.7810447092862374, "language_loss": 0.79586393, "learning_rate": 8.517498218702557e-08, "loss": 0.82067525, "num_input_tokens_seen": 163561515, "step": 7564, "time_per_iteration": 2.8480803966522217 }, { "auxiliary_loss_clip": 0.0129794, "auxiliary_loss_mlp": 0.01193097, "balance_loss_clip": 1.00682724, "balance_loss_mlp": 1.00016844, "epoch": 0.9096374676847231, "flos": 19208628434880.0, "grad_norm": 1.7240592332822273, "language_loss": 0.69441092, "learning_rate": 8.49502199727905e-08, "loss": 0.71932125, "num_input_tokens_seen": 163579540, "step": 7565, "time_per_iteration": 2.767681121826172 }, { "auxiliary_loss_clip": 0.01336321, "auxiliary_loss_mlp": 0.01193118, "balance_loss_clip": 1.00771952, "balance_loss_mlp": 1.0001899, "epoch": 0.9097577105753623, "flos": 33292509621120.0, "grad_norm": 11.570352942243261, "language_loss": 0.65936255, "learning_rate": 8.472574827008428e-08, "loss": 0.68465698, "num_input_tokens_seen": 163600425, "step": 7566, "time_per_iteration": 2.796903610229492 }, { "auxiliary_loss_clip": 0.01328606, "auxiliary_loss_mlp": 0.01193179, "balance_loss_clip": 1.00724924, "balance_loss_mlp": 1.00015557, "epoch": 0.9098779534660013, "flos": 21906453343680.0, "grad_norm": 2.0276835652395153, "language_loss": 0.83795714, "learning_rate": 8.450156711295942e-08, "loss": 0.86317503, "num_input_tokens_seen": 163620595, "step": 7567, "time_per_iteration": 2.7457005977630615 }, { "auxiliary_loss_clip": 0.01301481, "auxiliary_loss_mlp": 0.01193062, "balance_loss_clip": 1.00687814, "balance_loss_mlp": 1.00013411, "epoch": 0.9099981963566404, "flos": 25730363527200.0, "grad_norm": 2.291579014942205, "language_loss": 0.86319542, "learning_rate": 8.427767653542383e-08, "loss": 0.8881408, "num_input_tokens_seen": 163635765, "step": 7568, "time_per_iteration": 2.829573154449463 }, { "auxiliary_loss_clip": 0.01274659, "auxiliary_loss_mlp": 0.0119307, "balance_loss_clip": 1.00683427, "balance_loss_mlp": 1.00014198, "epoch": 0.9101184392472795, "flos": 21069448020000.0, "grad_norm": 1.964408454415408, "language_loss": 0.69911849, "learning_rate": 8.405407657144125e-08, "loss": 0.72379577, "num_input_tokens_seen": 163654925, "step": 7569, "time_per_iteration": 2.8498525619506836 }, { "auxiliary_loss_clip": 0.01314208, "auxiliary_loss_mlp": 0.01192879, "balance_loss_clip": 1.00686264, "balance_loss_mlp": 1.00014114, "epoch": 0.9102386821379186, "flos": 24752626138080.0, "grad_norm": 1.9183093221637249, "language_loss": 0.72221637, "learning_rate": 8.383076725493232e-08, "loss": 0.74728727, "num_input_tokens_seen": 163672245, "step": 7570, "time_per_iteration": 2.7800650596618652 }, { "auxiliary_loss_clip": 0.01329688, "auxiliary_loss_mlp": 0.01193196, "balance_loss_clip": 1.00703192, "balance_loss_mlp": 1.00017273, "epoch": 0.9103589250285576, "flos": 22562828602560.0, "grad_norm": 2.0165391905749104, "language_loss": 0.6787709, "learning_rate": 8.360774861977216e-08, "loss": 0.7039997, "num_input_tokens_seen": 163691365, "step": 7571, "time_per_iteration": 2.7079365253448486 }, { "auxiliary_loss_clip": 0.01323885, "auxiliary_loss_mlp": 0.01193146, "balance_loss_clip": 1.00761557, "balance_loss_mlp": 1.00012255, "epoch": 0.9104791679191968, "flos": 25373486086560.0, "grad_norm": 1.7631531080305278, "language_loss": 0.74489963, "learning_rate": 8.338502069979281e-08, "loss": 0.77006996, "num_input_tokens_seen": 163711675, "step": 7572, "time_per_iteration": 2.8376736640930176 }, { "auxiliary_loss_clip": 0.01335975, "auxiliary_loss_mlp": 0.01193197, "balance_loss_clip": 1.00754273, "balance_loss_mlp": 1.00017357, "epoch": 0.9105994108098359, "flos": 14426689753440.0, "grad_norm": 2.579639220345727, "language_loss": 0.79028857, "learning_rate": 8.316258352878214e-08, "loss": 0.81558025, "num_input_tokens_seen": 163728095, "step": 7573, "time_per_iteration": 2.762842893600464 }, { "auxiliary_loss_clip": 0.01335459, "auxiliary_loss_mlp": 0.01193289, "balance_loss_clip": 1.007424, "balance_loss_mlp": 1.00016952, "epoch": 0.9107196537004749, "flos": 26718303241440.0, "grad_norm": 1.6974399190196292, "language_loss": 0.70809662, "learning_rate": 8.294043714048338e-08, "loss": 0.73338413, "num_input_tokens_seen": 163747175, "step": 7574, "time_per_iteration": 2.8213343620300293 }, { "auxiliary_loss_clip": 0.01285368, "auxiliary_loss_mlp": 0.01192272, "balance_loss_clip": 1.00360131, "balance_loss_mlp": 1.00001168, "epoch": 0.9108398965911141, "flos": 66532668935040.0, "grad_norm": 0.7487329175645548, "language_loss": 0.6056422, "learning_rate": 8.271858156859624e-08, "loss": 0.6304186, "num_input_tokens_seen": 163812545, "step": 7575, "time_per_iteration": 3.476358413696289 }, { "auxiliary_loss_clip": 0.01347997, "auxiliary_loss_mlp": 0.01193098, "balance_loss_clip": 1.00796628, "balance_loss_mlp": 1.00016975, "epoch": 0.9109601394817531, "flos": 25411084970400.0, "grad_norm": 1.5548392719791337, "language_loss": 0.74032092, "learning_rate": 8.249701684677557e-08, "loss": 0.76573187, "num_input_tokens_seen": 163833870, "step": 7576, "time_per_iteration": 2.761993646621704 }, { "auxiliary_loss_clip": 0.01326696, "auxiliary_loss_mlp": 0.01192978, "balance_loss_clip": 1.00783825, "balance_loss_mlp": 1.0001446, "epoch": 0.9110803823723922, "flos": 22747805432640.0, "grad_norm": 1.9037374323222962, "language_loss": 0.80651295, "learning_rate": 8.227574300863294e-08, "loss": 0.83170974, "num_input_tokens_seen": 163854040, "step": 7577, "time_per_iteration": 2.804461717605591 }, { "auxiliary_loss_clip": 0.01311854, "auxiliary_loss_mlp": 0.01193313, "balance_loss_clip": 1.00783825, "balance_loss_mlp": 1.00019431, "epoch": 0.9112006252630314, "flos": 48469942437120.0, "grad_norm": 2.077113664093788, "language_loss": 0.69641495, "learning_rate": 8.205476008773548e-08, "loss": 0.7214666, "num_input_tokens_seen": 163878040, "step": 7578, "time_per_iteration": 3.9664604663848877 }, { "auxiliary_loss_clip": 0.01276237, "auxiliary_loss_mlp": 0.01192992, "balance_loss_clip": 1.00653434, "balance_loss_mlp": 1.00015903, "epoch": 0.9113208681536704, "flos": 30009663387360.0, "grad_norm": 1.948388233372822, "language_loss": 0.82075787, "learning_rate": 8.183406811760596e-08, "loss": 0.84545016, "num_input_tokens_seen": 163897770, "step": 7579, "time_per_iteration": 4.737865209579468 }, { "auxiliary_loss_clip": 0.0129809, "auxiliary_loss_mlp": 0.01193111, "balance_loss_clip": 1.0075388, "balance_loss_mlp": 1.00018263, "epoch": 0.9114411110443095, "flos": 25594984090080.0, "grad_norm": 1.4482241636171964, "language_loss": 0.74206007, "learning_rate": 8.161366713172313e-08, "loss": 0.76697206, "num_input_tokens_seen": 163920160, "step": 7580, "time_per_iteration": 3.8267316818237305 }, { "auxiliary_loss_clip": 0.01308023, "auxiliary_loss_mlp": 0.01193203, "balance_loss_clip": 1.00767422, "balance_loss_mlp": 1.00017929, "epoch": 0.9115613539349486, "flos": 18399738143520.0, "grad_norm": 2.32249424211579, "language_loss": 0.84506398, "learning_rate": 8.139355716352137e-08, "loss": 0.8700763, "num_input_tokens_seen": 163935000, "step": 7581, "time_per_iteration": 2.814990520477295 }, { "auxiliary_loss_clip": 0.01308696, "auxiliary_loss_mlp": 0.01193239, "balance_loss_clip": 1.00796783, "balance_loss_mlp": 1.00021517, "epoch": 0.9116815968255877, "flos": 21726182515680.0, "grad_norm": 1.5426654581952817, "language_loss": 0.70085478, "learning_rate": 8.117373824639196e-08, "loss": 0.72587407, "num_input_tokens_seen": 163955265, "step": 7582, "time_per_iteration": 2.7730414867401123 }, { "auxiliary_loss_clip": 0.01315713, "auxiliary_loss_mlp": 0.01192273, "balance_loss_clip": 1.00352955, "balance_loss_mlp": 1.00001204, "epoch": 0.9118018397162267, "flos": 65363560300800.0, "grad_norm": 0.7220006862146787, "language_loss": 0.59278756, "learning_rate": 8.095421041368067e-08, "loss": 0.61786741, "num_input_tokens_seen": 164014680, "step": 7583, "time_per_iteration": 3.167635679244995 }, { "auxiliary_loss_clip": 0.01302292, "auxiliary_loss_mlp": 0.00872498, "balance_loss_clip": 1.00656915, "balance_loss_mlp": 1.00051689, "epoch": 0.9119220826068659, "flos": 20922896318400.0, "grad_norm": 2.217889164728865, "language_loss": 0.70827752, "learning_rate": 8.073497369868999e-08, "loss": 0.73002541, "num_input_tokens_seen": 164033140, "step": 7584, "time_per_iteration": 2.8089759349823 }, { "auxiliary_loss_clip": 0.01322556, "auxiliary_loss_mlp": 0.01193139, "balance_loss_clip": 1.00812006, "balance_loss_mlp": 1.00021088, "epoch": 0.912042325497505, "flos": 28366462048320.0, "grad_norm": 1.6986498586186751, "language_loss": 0.75521356, "learning_rate": 8.051602813467772e-08, "loss": 0.78037053, "num_input_tokens_seen": 164054995, "step": 7585, "time_per_iteration": 2.812810182571411 }, { "auxiliary_loss_clip": 0.01335467, "auxiliary_loss_mlp": 0.0119298, "balance_loss_clip": 1.00789237, "balance_loss_mlp": 1.00014687, "epoch": 0.912162568388144, "flos": 17566792195680.0, "grad_norm": 1.7196040975930507, "language_loss": 0.71221012, "learning_rate": 8.029737375485756e-08, "loss": 0.73749459, "num_input_tokens_seen": 164074225, "step": 7586, "time_per_iteration": 2.745479106903076 }, { "auxiliary_loss_clip": 0.01348663, "auxiliary_loss_mlp": 0.01193105, "balance_loss_clip": 1.00818038, "balance_loss_mlp": 1.00017715, "epoch": 0.9122828112787832, "flos": 19827907741440.0, "grad_norm": 1.6136936194947276, "language_loss": 0.72727931, "learning_rate": 8.007901059239986e-08, "loss": 0.75269705, "num_input_tokens_seen": 164093505, "step": 7587, "time_per_iteration": 2.6773202419281006 }, { "auxiliary_loss_clip": 0.0131151, "auxiliary_loss_mlp": 0.01193092, "balance_loss_clip": 1.00735211, "balance_loss_mlp": 1.00016379, "epoch": 0.9124030541694222, "flos": 20813799805920.0, "grad_norm": 1.594563682308803, "language_loss": 0.79861593, "learning_rate": 7.986093868042964e-08, "loss": 0.82366192, "num_input_tokens_seen": 164113750, "step": 7588, "time_per_iteration": 2.7635481357574463 }, { "auxiliary_loss_clip": 0.01325734, "auxiliary_loss_mlp": 0.01193067, "balance_loss_clip": 1.0068562, "balance_loss_mlp": 1.00013804, "epoch": 0.9125232970600613, "flos": 25192317166560.0, "grad_norm": 2.009094534962108, "language_loss": 0.67786932, "learning_rate": 7.964315805202826e-08, "loss": 0.70305741, "num_input_tokens_seen": 164134330, "step": 7589, "time_per_iteration": 2.8075499534606934 }, { "auxiliary_loss_clip": 0.0131167, "auxiliary_loss_mlp": 0.01193184, "balance_loss_clip": 1.00790095, "balance_loss_mlp": 1.00016046, "epoch": 0.9126435399507005, "flos": 19719601549920.0, "grad_norm": 1.7816422643869976, "language_loss": 0.73227227, "learning_rate": 7.942566874023304e-08, "loss": 0.75732082, "num_input_tokens_seen": 164153515, "step": 7590, "time_per_iteration": 2.8070614337921143 }, { "auxiliary_loss_clip": 0.01308444, "auxiliary_loss_mlp": 0.01193231, "balance_loss_clip": 1.00802338, "balance_loss_mlp": 1.00020754, "epoch": 0.9127637828413395, "flos": 19573624627200.0, "grad_norm": 2.269182213376734, "language_loss": 0.69898421, "learning_rate": 7.920847077803649e-08, "loss": 0.72400093, "num_input_tokens_seen": 164171305, "step": 7591, "time_per_iteration": 2.8471410274505615 }, { "auxiliary_loss_clip": 0.01291622, "auxiliary_loss_mlp": 0.01193121, "balance_loss_clip": 1.00723433, "balance_loss_mlp": 1.0001924, "epoch": 0.9128840257319786, "flos": 20230646512320.0, "grad_norm": 1.7450390066849837, "language_loss": 0.82211733, "learning_rate": 7.899156419838826e-08, "loss": 0.84696472, "num_input_tokens_seen": 164190275, "step": 7592, "time_per_iteration": 2.83929181098938 }, { "auxiliary_loss_clip": 0.01298264, "auxiliary_loss_mlp": 0.01193187, "balance_loss_clip": 1.00744152, "balance_loss_mlp": 1.00016379, "epoch": 0.9130042686226177, "flos": 24858669137760.0, "grad_norm": 1.6807284244922505, "language_loss": 0.6557979, "learning_rate": 7.87749490341918e-08, "loss": 0.6807124, "num_input_tokens_seen": 164210550, "step": 7593, "time_per_iteration": 2.8562910556793213 }, { "auxiliary_loss_clip": 0.01348784, "auxiliary_loss_mlp": 0.01193194, "balance_loss_clip": 1.00805783, "balance_loss_mlp": 1.00017023, "epoch": 0.9131245115132568, "flos": 23581757243520.0, "grad_norm": 1.9459443310468116, "language_loss": 0.83403546, "learning_rate": 7.855862531830836e-08, "loss": 0.85945523, "num_input_tokens_seen": 164226660, "step": 7594, "time_per_iteration": 2.7083425521850586 }, { "auxiliary_loss_clip": 0.01336371, "auxiliary_loss_mlp": 0.0119312, "balance_loss_clip": 1.00820363, "balance_loss_mlp": 1.00019193, "epoch": 0.9132447544038959, "flos": 19931615701920.0, "grad_norm": 1.6887726432944346, "language_loss": 0.72431487, "learning_rate": 7.834259308355373e-08, "loss": 0.74960977, "num_input_tokens_seen": 164245425, "step": 7595, "time_per_iteration": 2.7589759826660156 }, { "auxiliary_loss_clip": 0.01252021, "auxiliary_loss_mlp": 0.01193149, "balance_loss_clip": 1.0065825, "balance_loss_mlp": 1.00012565, "epoch": 0.9133649972945349, "flos": 21981758882400.0, "grad_norm": 2.9614081701396886, "language_loss": 0.75048167, "learning_rate": 7.812685236269989e-08, "loss": 0.7749334, "num_input_tokens_seen": 164264085, "step": 7596, "time_per_iteration": 2.883981227874756 }, { "auxiliary_loss_clip": 0.01259462, "auxiliary_loss_mlp": 0.01192268, "balance_loss_clip": 1.00399733, "balance_loss_mlp": 1.00000703, "epoch": 0.9134852401851741, "flos": 71240559406560.0, "grad_norm": 0.7894111389987567, "language_loss": 0.58717251, "learning_rate": 7.791140318847445e-08, "loss": 0.61168981, "num_input_tokens_seen": 164322220, "step": 7597, "time_per_iteration": 3.3281638622283936 }, { "auxiliary_loss_clip": 0.01301478, "auxiliary_loss_mlp": 0.01193195, "balance_loss_clip": 1.00795007, "balance_loss_mlp": 1.00017154, "epoch": 0.9136054830758131, "flos": 23626936023840.0, "grad_norm": 1.5480892945216462, "language_loss": 0.80302936, "learning_rate": 7.769624559356081e-08, "loss": 0.82797611, "num_input_tokens_seen": 164345615, "step": 7598, "time_per_iteration": 2.92148494720459 }, { "auxiliary_loss_clip": 0.01323564, "auxiliary_loss_mlp": 0.01193158, "balance_loss_clip": 1.00717258, "balance_loss_mlp": 1.00013435, "epoch": 0.9137257259664522, "flos": 23438869757280.0, "grad_norm": 2.5180281891393532, "language_loss": 0.75057131, "learning_rate": 7.748137961059842e-08, "loss": 0.77573854, "num_input_tokens_seen": 164359595, "step": 7599, "time_per_iteration": 2.807452917098999 }, { "auxiliary_loss_clip": 0.01346427, "auxiliary_loss_mlp": 0.01193062, "balance_loss_clip": 1.00780535, "balance_loss_mlp": 1.00013399, "epoch": 0.9138459688570914, "flos": 19127862496800.0, "grad_norm": 4.868159876173167, "language_loss": 0.65541452, "learning_rate": 7.726680527218211e-08, "loss": 0.68080938, "num_input_tokens_seen": 164376635, "step": 7600, "time_per_iteration": 2.7271785736083984 }, { "auxiliary_loss_clip": 0.01347927, "auxiliary_loss_mlp": 0.01193166, "balance_loss_clip": 1.00724483, "balance_loss_mlp": 1.00014186, "epoch": 0.9139662117477304, "flos": 46281258535680.0, "grad_norm": 1.7765390161767438, "language_loss": 0.75535071, "learning_rate": 7.70525226108627e-08, "loss": 0.78076166, "num_input_tokens_seen": 164400305, "step": 7601, "time_per_iteration": 2.966053009033203 }, { "auxiliary_loss_clip": 0.01330261, "auxiliary_loss_mlp": 0.01193, "balance_loss_clip": 1.00780582, "balance_loss_mlp": 1.00016677, "epoch": 0.9140864546383695, "flos": 22273209796320.0, "grad_norm": 1.6814291864127353, "language_loss": 0.79443419, "learning_rate": 7.683853165914666e-08, "loss": 0.8196668, "num_input_tokens_seen": 164418075, "step": 7602, "time_per_iteration": 2.827634334564209 }, { "auxiliary_loss_clip": 0.01277197, "auxiliary_loss_mlp": 0.01193194, "balance_loss_clip": 1.00747645, "balance_loss_mlp": 1.00017071, "epoch": 0.9142066975290086, "flos": 17530019556480.0, "grad_norm": 1.6530884205641891, "language_loss": 0.76993626, "learning_rate": 7.662483244949602e-08, "loss": 0.79464018, "num_input_tokens_seen": 164435335, "step": 7603, "time_per_iteration": 2.868844985961914 }, { "auxiliary_loss_clip": 0.01275885, "auxiliary_loss_mlp": 0.01192984, "balance_loss_clip": 1.00619519, "balance_loss_mlp": 1.00015068, "epoch": 0.9143269404196477, "flos": 17712158415840.0, "grad_norm": 2.2038123931826514, "language_loss": 0.8030498, "learning_rate": 7.641142501432951e-08, "loss": 0.82773852, "num_input_tokens_seen": 164451530, "step": 7604, "time_per_iteration": 3.740887403488159 }, { "auxiliary_loss_clip": 0.01315706, "auxiliary_loss_mlp": 0.01193061, "balance_loss_clip": 1.00747871, "balance_loss_mlp": 1.00013232, "epoch": 0.9144471833102867, "flos": 33323426700480.0, "grad_norm": 1.5512661458326706, "language_loss": 0.73726952, "learning_rate": 7.619830938602013e-08, "loss": 0.76235718, "num_input_tokens_seen": 164472755, "step": 7605, "time_per_iteration": 4.779296398162842 }, { "auxiliary_loss_clip": 0.01337487, "auxiliary_loss_mlp": 0.01193105, "balance_loss_clip": 1.00853825, "balance_loss_mlp": 1.00017667, "epoch": 0.9145674262009259, "flos": 21068981012160.0, "grad_norm": 2.154070871062401, "language_loss": 0.82706499, "learning_rate": 7.598548559689777e-08, "loss": 0.85237086, "num_input_tokens_seen": 164491155, "step": 7606, "time_per_iteration": 3.718104839324951 }, { "auxiliary_loss_clip": 0.01296865, "auxiliary_loss_mlp": 0.01193072, "balance_loss_clip": 1.00686085, "balance_loss_mlp": 1.000144, "epoch": 0.914687669091565, "flos": 16800278637600.0, "grad_norm": 2.346524329914989, "language_loss": 0.80999315, "learning_rate": 7.577295367924751e-08, "loss": 0.83489251, "num_input_tokens_seen": 164507555, "step": 7607, "time_per_iteration": 2.832712411880493 }, { "auxiliary_loss_clip": 0.01311091, "auxiliary_loss_mlp": 0.0119311, "balance_loss_clip": 1.00726211, "balance_loss_mlp": 1.00018144, "epoch": 0.914807911982204, "flos": 25773638352480.0, "grad_norm": 1.6852302174436842, "language_loss": 0.8225528, "learning_rate": 7.556071366531002e-08, "loss": 0.8475948, "num_input_tokens_seen": 164528525, "step": 7608, "time_per_iteration": 2.7898762226104736 }, { "auxiliary_loss_clip": 0.01323934, "auxiliary_loss_mlp": 0.0119321, "balance_loss_clip": 1.00747013, "balance_loss_mlp": 1.00018668, "epoch": 0.9149281548728432, "flos": 19208053656000.0, "grad_norm": 2.3460959897822415, "language_loss": 0.78985226, "learning_rate": 7.53487655872822e-08, "loss": 0.81502372, "num_input_tokens_seen": 164547695, "step": 7609, "time_per_iteration": 2.7655446529388428 }, { "auxiliary_loss_clip": 0.01280834, "auxiliary_loss_mlp": 0.01192877, "balance_loss_clip": 1.00644922, "balance_loss_mlp": 1.00013924, "epoch": 0.9150483977634822, "flos": 26870566808160.0, "grad_norm": 1.8568792492729345, "language_loss": 0.743402, "learning_rate": 7.513710947731656e-08, "loss": 0.76813906, "num_input_tokens_seen": 164568905, "step": 7610, "time_per_iteration": 2.840813398361206 }, { "auxiliary_loss_clip": 0.01310103, "auxiliary_loss_mlp": 0.01193092, "balance_loss_clip": 1.00753915, "balance_loss_mlp": 1.00016403, "epoch": 0.9151686406541213, "flos": 21908968001280.0, "grad_norm": 1.6896669356030576, "language_loss": 0.84926474, "learning_rate": 7.492574536752095e-08, "loss": 0.87429667, "num_input_tokens_seen": 164588895, "step": 7611, "time_per_iteration": 2.804960012435913 }, { "auxiliary_loss_clip": 0.01323543, "auxiliary_loss_mlp": 0.0119308, "balance_loss_clip": 1.00724268, "balance_loss_mlp": 1.00015223, "epoch": 0.9152888835447605, "flos": 27308569423680.0, "grad_norm": 1.8225356595595879, "language_loss": 0.78022361, "learning_rate": 7.471467328995907e-08, "loss": 0.80538988, "num_input_tokens_seen": 164607705, "step": 7612, "time_per_iteration": 2.810131311416626 }, { "auxiliary_loss_clip": 0.0121564, "auxiliary_loss_mlp": 0.0119312, "balance_loss_clip": 1.0065732, "balance_loss_mlp": 1.00019121, "epoch": 0.9154091264353995, "flos": 13370737007520.0, "grad_norm": 2.3065230134702115, "language_loss": 0.6116299, "learning_rate": 7.450389327665018e-08, "loss": 0.63571751, "num_input_tokens_seen": 164625540, "step": 7613, "time_per_iteration": 3.1817445755004883 }, { "auxiliary_loss_clip": 0.01282128, "auxiliary_loss_mlp": 0.01193137, "balance_loss_clip": 1.00761986, "balance_loss_mlp": 1.00020838, "epoch": 0.9155293693260386, "flos": 20193047628480.0, "grad_norm": 4.20996499656659, "language_loss": 0.6780324, "learning_rate": 7.429340535957029e-08, "loss": 0.70278507, "num_input_tokens_seen": 164640735, "step": 7614, "time_per_iteration": 3.945274829864502 }, { "auxiliary_loss_clip": 0.01314948, "auxiliary_loss_mlp": 0.0119315, "balance_loss_clip": 1.00752449, "balance_loss_mlp": 1.0001266, "epoch": 0.9156496122166777, "flos": 19355000518080.0, "grad_norm": 2.426351589023011, "language_loss": 0.70601338, "learning_rate": 7.40832095706494e-08, "loss": 0.73109436, "num_input_tokens_seen": 164657430, "step": 7615, "time_per_iteration": 2.778578281402588 }, { "auxiliary_loss_clip": 0.01297829, "auxiliary_loss_mlp": 0.01193128, "balance_loss_clip": 1.00688541, "balance_loss_mlp": 1.00020003, "epoch": 0.9157698551073168, "flos": 21107298369600.0, "grad_norm": 10.108164445195968, "language_loss": 0.79944628, "learning_rate": 7.387330594177443e-08, "loss": 0.82435584, "num_input_tokens_seen": 164679505, "step": 7616, "time_per_iteration": 2.8970773220062256 }, { "auxiliary_loss_clip": 0.01291437, "auxiliary_loss_mlp": 0.01193166, "balance_loss_clip": 1.00710487, "balance_loss_mlp": 1.00014257, "epoch": 0.9158900979979558, "flos": 25193179334880.0, "grad_norm": 1.5293880155103785, "language_loss": 0.79524291, "learning_rate": 7.366369450478749e-08, "loss": 0.82008886, "num_input_tokens_seen": 164700615, "step": 7617, "time_per_iteration": 2.955653429031372 }, { "auxiliary_loss_clip": 0.01288692, "auxiliary_loss_mlp": 0.01192868, "balance_loss_clip": 1.00678003, "balance_loss_mlp": 1.00013077, "epoch": 0.916010340888595, "flos": 30146659390080.0, "grad_norm": 1.5969306987694118, "language_loss": 0.66298699, "learning_rate": 7.345437529148646e-08, "loss": 0.68780255, "num_input_tokens_seen": 164719625, "step": 7618, "time_per_iteration": 2.836404323577881 }, { "auxiliary_loss_clip": 0.01296509, "auxiliary_loss_mlp": 0.01193063, "balance_loss_clip": 1.00771928, "balance_loss_mlp": 1.00013459, "epoch": 0.9161305837792341, "flos": 17091837322560.0, "grad_norm": 1.9759862574755345, "language_loss": 0.72410822, "learning_rate": 7.324534833362483e-08, "loss": 0.74900389, "num_input_tokens_seen": 164737200, "step": 7619, "time_per_iteration": 2.8214118480682373 }, { "auxiliary_loss_clip": 0.01306852, "auxiliary_loss_mlp": 0.01193089, "balance_loss_clip": 1.00651693, "balance_loss_mlp": 1.00016046, "epoch": 0.9162508266698731, "flos": 22893710508000.0, "grad_norm": 1.599882645205486, "language_loss": 0.68233585, "learning_rate": 7.303661366291192e-08, "loss": 0.70733523, "num_input_tokens_seen": 164757870, "step": 7620, "time_per_iteration": 2.814056396484375 }, { "auxiliary_loss_clip": 0.01289219, "auxiliary_loss_mlp": 0.01192998, "balance_loss_clip": 1.0080142, "balance_loss_mlp": 1.00016499, "epoch": 0.9163710695605123, "flos": 19974818679840.0, "grad_norm": 1.773600823873175, "language_loss": 0.81280166, "learning_rate": 7.28281713110126e-08, "loss": 0.83762383, "num_input_tokens_seen": 164775945, "step": 7621, "time_per_iteration": 2.8779900074005127 }, { "auxiliary_loss_clip": 0.01311574, "auxiliary_loss_mlp": 0.01193062, "balance_loss_clip": 1.00789714, "balance_loss_mlp": 1.00013363, "epoch": 0.9164913124511513, "flos": 22783823674560.0, "grad_norm": 1.8249240291865174, "language_loss": 0.7713486, "learning_rate": 7.262002130954759e-08, "loss": 0.796395, "num_input_tokens_seen": 164794400, "step": 7622, "time_per_iteration": 2.8972675800323486 }, { "auxiliary_loss_clip": 0.01288341, "auxiliary_loss_mlp": 0.0119317, "balance_loss_clip": 1.00706351, "balance_loss_mlp": 1.00014663, "epoch": 0.9166115553417904, "flos": 24900866252640.0, "grad_norm": 1.62195464840022, "language_loss": 0.79104334, "learning_rate": 7.241216369009296e-08, "loss": 0.81585842, "num_input_tokens_seen": 164814585, "step": 7623, "time_per_iteration": 2.9519875049591064 }, { "auxiliary_loss_clip": 0.01348524, "auxiliary_loss_mlp": 0.01193075, "balance_loss_clip": 1.0075748, "balance_loss_mlp": 1.00014639, "epoch": 0.9167317982324296, "flos": 25702931044800.0, "grad_norm": 1.6918866769942993, "language_loss": 0.66133565, "learning_rate": 7.220459848418037e-08, "loss": 0.6867516, "num_input_tokens_seen": 164834660, "step": 7624, "time_per_iteration": 2.7658634185791016 }, { "auxiliary_loss_clip": 0.01347632, "auxiliary_loss_mlp": 0.01192863, "balance_loss_clip": 1.00807309, "balance_loss_mlp": 1.00012589, "epoch": 0.9168520411230686, "flos": 15632822492640.0, "grad_norm": 1.7094932230706203, "language_loss": 0.79697037, "learning_rate": 7.199732572329708e-08, "loss": 0.8223753, "num_input_tokens_seen": 164852560, "step": 7625, "time_per_iteration": 2.735941171646118 }, { "auxiliary_loss_clip": 0.01291655, "auxiliary_loss_mlp": 0.01193065, "balance_loss_clip": 1.0079236, "balance_loss_mlp": 1.00013709, "epoch": 0.9169722840137077, "flos": 30258162789120.0, "grad_norm": 2.146439370723761, "language_loss": 0.7594294, "learning_rate": 7.179034543888684e-08, "loss": 0.7842766, "num_input_tokens_seen": 164872065, "step": 7626, "time_per_iteration": 2.910245180130005 }, { "auxiliary_loss_clip": 0.01335337, "auxiliary_loss_mlp": 0.0119298, "balance_loss_clip": 1.00759351, "balance_loss_mlp": 1.00014687, "epoch": 0.9170925269043467, "flos": 22491654287040.0, "grad_norm": 2.1899390475113925, "language_loss": 0.77560824, "learning_rate": 7.158365766234808e-08, "loss": 0.8008914, "num_input_tokens_seen": 164890915, "step": 7627, "time_per_iteration": 2.7249934673309326 }, { "auxiliary_loss_clip": 0.0130043, "auxiliary_loss_mlp": 0.01193231, "balance_loss_clip": 1.00747085, "balance_loss_mlp": 1.00020695, "epoch": 0.9172127697949859, "flos": 22893926050080.0, "grad_norm": 1.7411454463814882, "language_loss": 0.72176439, "learning_rate": 7.137726242503527e-08, "loss": 0.746701, "num_input_tokens_seen": 164909835, "step": 7628, "time_per_iteration": 2.8535895347595215 }, { "auxiliary_loss_clip": 0.01324023, "auxiliary_loss_mlp": 0.00872492, "balance_loss_clip": 1.00755572, "balance_loss_mlp": 1.00044775, "epoch": 0.917333012685625, "flos": 17451876047040.0, "grad_norm": 1.9621191178736344, "language_loss": 0.7837097, "learning_rate": 7.11711597582585e-08, "loss": 0.80567485, "num_input_tokens_seen": 164927195, "step": 7629, "time_per_iteration": 2.7861602306365967 }, { "auxiliary_loss_clip": 0.0130975, "auxiliary_loss_mlp": 0.01193121, "balance_loss_clip": 1.00750899, "balance_loss_mlp": 1.0001924, "epoch": 0.917453255576264, "flos": 14318958340800.0, "grad_norm": 1.8102179503626374, "language_loss": 0.79747605, "learning_rate": 7.096534969328271e-08, "loss": 0.82250476, "num_input_tokens_seen": 164944640, "step": 7630, "time_per_iteration": 3.7813611030578613 }, { "auxiliary_loss_clip": 0.01322528, "auxiliary_loss_mlp": 0.01193194, "balance_loss_clip": 1.00746226, "balance_loss_mlp": 1.00017047, "epoch": 0.9175734984669032, "flos": 20741188543200.0, "grad_norm": 2.007945562541722, "language_loss": 0.84141403, "learning_rate": 7.075983226132987e-08, "loss": 0.86657125, "num_input_tokens_seen": 164963570, "step": 7631, "time_per_iteration": 4.101221084594727 }, { "auxiliary_loss_clip": 0.01314374, "auxiliary_loss_mlp": 0.008725, "balance_loss_clip": 1.00718236, "balance_loss_mlp": 1.00042272, "epoch": 0.9176937413575422, "flos": 14830506234720.0, "grad_norm": 2.4584902019399837, "language_loss": 0.79329199, "learning_rate": 7.055460749357656e-08, "loss": 0.81516075, "num_input_tokens_seen": 164979850, "step": 7632, "time_per_iteration": 3.728854179382324 }, { "auxiliary_loss_clip": 0.01303457, "auxiliary_loss_mlp": 0.01193183, "balance_loss_clip": 1.00702572, "balance_loss_mlp": 1.00015926, "epoch": 0.9178139842481813, "flos": 18474612598080.0, "grad_norm": 1.5674760602075795, "language_loss": 0.70361495, "learning_rate": 7.034967542115521e-08, "loss": 0.72858131, "num_input_tokens_seen": 164998115, "step": 7633, "time_per_iteration": 2.763040542602539 }, { "auxiliary_loss_clip": 0.01335088, "auxiliary_loss_mlp": 0.00872461, "balance_loss_clip": 1.00777483, "balance_loss_mlp": 1.00039196, "epoch": 0.9179342271388204, "flos": 20047465866240.0, "grad_norm": 2.547949464041808, "language_loss": 0.75300133, "learning_rate": 7.014503607515388e-08, "loss": 0.77507687, "num_input_tokens_seen": 165017420, "step": 7634, "time_per_iteration": 2.776738166809082 }, { "auxiliary_loss_clip": 0.01303048, "auxiliary_loss_mlp": 0.01193017, "balance_loss_clip": 1.00716722, "balance_loss_mlp": 1.00018418, "epoch": 0.9180544700294595, "flos": 24676242888960.0, "grad_norm": 2.0054946887688887, "language_loss": 0.68322676, "learning_rate": 6.994068948661592e-08, "loss": 0.70818734, "num_input_tokens_seen": 165035575, "step": 7635, "time_per_iteration": 2.880732297897339 }, { "auxiliary_loss_clip": 0.01324513, "auxiliary_loss_mlp": 0.01193138, "balance_loss_clip": 1.00755501, "balance_loss_mlp": 1.00020933, "epoch": 0.9181747129200986, "flos": 16727487756480.0, "grad_norm": 1.957194476513802, "language_loss": 0.76268303, "learning_rate": 6.973663568654142e-08, "loss": 0.78785944, "num_input_tokens_seen": 165053280, "step": 7636, "time_per_iteration": 2.7789878845214844 }, { "auxiliary_loss_clip": 0.01347821, "auxiliary_loss_mlp": 0.01193175, "balance_loss_clip": 1.00790691, "balance_loss_mlp": 1.0001508, "epoch": 0.9182949558107377, "flos": 24271636086720.0, "grad_norm": 1.9910674211381483, "language_loss": 0.65430295, "learning_rate": 6.953287470588386e-08, "loss": 0.67971289, "num_input_tokens_seen": 165071235, "step": 7637, "time_per_iteration": 2.786527156829834 }, { "auxiliary_loss_clip": 0.01335658, "auxiliary_loss_mlp": 0.01193179, "balance_loss_clip": 1.0075686, "balance_loss_mlp": 1.00015497, "epoch": 0.9184151987013768, "flos": 22082125940640.0, "grad_norm": 2.092629054443856, "language_loss": 0.85959584, "learning_rate": 6.932940657555452e-08, "loss": 0.88488418, "num_input_tokens_seen": 165087365, "step": 7638, "time_per_iteration": 2.820809841156006 }, { "auxiliary_loss_clip": 0.0134674, "auxiliary_loss_mlp": 0.01193181, "balance_loss_clip": 1.00751495, "balance_loss_mlp": 1.00015783, "epoch": 0.9185354415920158, "flos": 32166747659520.0, "grad_norm": 1.3423615550910515, "language_loss": 0.76486903, "learning_rate": 6.912623132641938e-08, "loss": 0.7902683, "num_input_tokens_seen": 165112455, "step": 7639, "time_per_iteration": 2.892390727996826 }, { "auxiliary_loss_clip": 0.01310816, "auxiliary_loss_mlp": 0.01193122, "balance_loss_clip": 1.00774097, "balance_loss_mlp": 1.00019336, "epoch": 0.918655684482655, "flos": 20997842620320.0, "grad_norm": 1.8066632279992578, "language_loss": 0.76583475, "learning_rate": 6.892334898929952e-08, "loss": 0.79087412, "num_input_tokens_seen": 165132700, "step": 7640, "time_per_iteration": 2.873257637023926 }, { "auxiliary_loss_clip": 0.01335554, "auxiliary_loss_mlp": 0.01193144, "balance_loss_clip": 1.00790167, "balance_loss_mlp": 1.00011992, "epoch": 0.918775927373294, "flos": 15560714161440.0, "grad_norm": 1.785410642505534, "language_loss": 0.84740531, "learning_rate": 6.872075959497236e-08, "loss": 0.87269235, "num_input_tokens_seen": 165151475, "step": 7641, "time_per_iteration": 2.80936861038208 }, { "auxiliary_loss_clip": 0.01319343, "auxiliary_loss_mlp": 0.01192964, "balance_loss_clip": 1.00747001, "balance_loss_mlp": 1.00013077, "epoch": 0.9188961702639331, "flos": 29934070459200.0, "grad_norm": 1.8270355600585577, "language_loss": 0.82718307, "learning_rate": 6.85184631741702e-08, "loss": 0.85230613, "num_input_tokens_seen": 165172040, "step": 7642, "time_per_iteration": 2.861696720123291 }, { "auxiliary_loss_clip": 0.01325868, "auxiliary_loss_mlp": 0.011932, "balance_loss_clip": 1.00711417, "balance_loss_mlp": 1.00017595, "epoch": 0.9190164131545723, "flos": 20701254620160.0, "grad_norm": 1.967790687229265, "language_loss": 0.77258265, "learning_rate": 6.831645975758161e-08, "loss": 0.79777336, "num_input_tokens_seen": 165189980, "step": 7643, "time_per_iteration": 2.844475746154785 }, { "auxiliary_loss_clip": 0.01312362, "auxiliary_loss_mlp": 0.01193164, "balance_loss_clip": 1.00820637, "balance_loss_mlp": 1.00014007, "epoch": 0.9191366560452113, "flos": 25629924621600.0, "grad_norm": 2.043336869264446, "language_loss": 0.67254603, "learning_rate": 6.811474937585026e-08, "loss": 0.69760132, "num_input_tokens_seen": 165209770, "step": 7644, "time_per_iteration": 2.901489019393921 }, { "auxiliary_loss_clip": 0.01294535, "auxiliary_loss_mlp": 0.01193131, "balance_loss_clip": 1.00700188, "balance_loss_mlp": 1.00020301, "epoch": 0.9192568989358504, "flos": 21434336441280.0, "grad_norm": 1.5430573881083025, "language_loss": 0.79183763, "learning_rate": 6.79133320595755e-08, "loss": 0.81671429, "num_input_tokens_seen": 165229690, "step": 7645, "time_per_iteration": 2.9461288452148438 }, { "auxiliary_loss_clip": 0.0130979, "auxiliary_loss_mlp": 0.01193067, "balance_loss_clip": 1.0073514, "balance_loss_mlp": 1.0001384, "epoch": 0.9193771418264896, "flos": 23185089574560.0, "grad_norm": 1.7940226940561044, "language_loss": 0.75328839, "learning_rate": 6.771220783931198e-08, "loss": 0.77831697, "num_input_tokens_seen": 165249850, "step": 7646, "time_per_iteration": 2.861649990081787 }, { "auxiliary_loss_clip": 0.01207558, "auxiliary_loss_mlp": 0.00871722, "balance_loss_clip": 1.00869679, "balance_loss_mlp": 0.99989587, "epoch": 0.9194973847171286, "flos": 70582998666240.0, "grad_norm": 0.8389971660929747, "language_loss": 0.6462791, "learning_rate": 6.751137674556994e-08, "loss": 0.66707194, "num_input_tokens_seen": 165310235, "step": 7647, "time_per_iteration": 4.020045518875122 }, { "auxiliary_loss_clip": 0.0133591, "auxiliary_loss_mlp": 0.01193093, "balance_loss_clip": 1.00749493, "balance_loss_mlp": 1.0001651, "epoch": 0.9196176276077677, "flos": 14720691248640.0, "grad_norm": 1.90434833926509, "language_loss": 0.77450311, "learning_rate": 6.731083880881572e-08, "loss": 0.79979312, "num_input_tokens_seen": 165326455, "step": 7648, "time_per_iteration": 3.0629770755767822 }, { "auxiliary_loss_clip": 0.0131192, "auxiliary_loss_mlp": 0.01192972, "balance_loss_clip": 1.00703931, "balance_loss_mlp": 1.00013947, "epoch": 0.9197378704984068, "flos": 23294904560640.0, "grad_norm": 2.001537689177247, "language_loss": 0.81115723, "learning_rate": 6.711059405947072e-08, "loss": 0.8362062, "num_input_tokens_seen": 165344645, "step": 7649, "time_per_iteration": 2.8635454177856445 }, { "auxiliary_loss_clip": 0.01278687, "auxiliary_loss_mlp": 0.01193176, "balance_loss_clip": 1.00640488, "balance_loss_mlp": 1.00015259, "epoch": 0.9198581133890459, "flos": 20302575225120.0, "grad_norm": 1.860095897194376, "language_loss": 0.77167571, "learning_rate": 6.691064252791156e-08, "loss": 0.79639435, "num_input_tokens_seen": 165364120, "step": 7650, "time_per_iteration": 2.8070337772369385 }, { "auxiliary_loss_clip": 0.01252666, "auxiliary_loss_mlp": 0.01193166, "balance_loss_clip": 1.00615239, "balance_loss_mlp": 1.00014269, "epoch": 0.9199783562796849, "flos": 17675673166080.0, "grad_norm": 1.5088694594043282, "language_loss": 0.78006744, "learning_rate": 6.67109842444713e-08, "loss": 0.80452573, "num_input_tokens_seen": 165383050, "step": 7651, "time_per_iteration": 2.8421742916107178 }, { "auxiliary_loss_clip": 0.0132514, "auxiliary_loss_mlp": 0.00872488, "balance_loss_clip": 1.00825179, "balance_loss_mlp": 1.00033998, "epoch": 0.9200985991703241, "flos": 17676032402880.0, "grad_norm": 1.7186352163090535, "language_loss": 0.76390445, "learning_rate": 6.651161923943704e-08, "loss": 0.7858808, "num_input_tokens_seen": 165400955, "step": 7652, "time_per_iteration": 2.687711715698242 }, { "auxiliary_loss_clip": 0.01337321, "auxiliary_loss_mlp": 0.01193119, "balance_loss_clip": 1.00788569, "balance_loss_mlp": 1.0001905, "epoch": 0.9202188420609632, "flos": 20996585291520.0, "grad_norm": 1.6715693966819227, "language_loss": 0.76723862, "learning_rate": 6.631254754305326e-08, "loss": 0.79254299, "num_input_tokens_seen": 165420415, "step": 7653, "time_per_iteration": 2.76718807220459 }, { "auxiliary_loss_clip": 0.01348064, "auxiliary_loss_mlp": 0.0119314, "balance_loss_clip": 1.00764775, "balance_loss_mlp": 1.00021207, "epoch": 0.9203390849516022, "flos": 13918231296000.0, "grad_norm": 1.750093722897039, "language_loss": 0.7818495, "learning_rate": 6.611376918551848e-08, "loss": 0.80726159, "num_input_tokens_seen": 165439200, "step": 7654, "time_per_iteration": 2.713745594024658 }, { "auxiliary_loss_clip": 0.01284197, "auxiliary_loss_mlp": 0.00872498, "balance_loss_clip": 1.00781095, "balance_loss_mlp": 1.00037503, "epoch": 0.9204593278422414, "flos": 21175922103840.0, "grad_norm": 1.8786632640365917, "language_loss": 0.79223812, "learning_rate": 6.591528419698744e-08, "loss": 0.8138051, "num_input_tokens_seen": 165458985, "step": 7655, "time_per_iteration": 2.7739145755767822 }, { "auxiliary_loss_clip": 0.01323176, "auxiliary_loss_mlp": 0.01193052, "balance_loss_clip": 1.00738239, "balance_loss_mlp": 1.0001235, "epoch": 0.9205795707328804, "flos": 14501384589600.0, "grad_norm": 2.3650620271886114, "language_loss": 0.8351301, "learning_rate": 6.571709260756986e-08, "loss": 0.86029232, "num_input_tokens_seen": 165475630, "step": 7656, "time_per_iteration": 5.056414842605591 }, { "auxiliary_loss_clip": 0.01327184, "auxiliary_loss_mlp": 0.01192992, "balance_loss_clip": 1.0079484, "balance_loss_mlp": 1.00015926, "epoch": 0.9206998136235195, "flos": 22417570153440.0, "grad_norm": 2.3391890941171156, "language_loss": 0.76400656, "learning_rate": 6.551919444733122e-08, "loss": 0.78920835, "num_input_tokens_seen": 165493445, "step": 7657, "time_per_iteration": 3.866269826889038 }, { "auxiliary_loss_clip": 0.01303598, "auxiliary_loss_mlp": 0.01193016, "balance_loss_clip": 1.00702238, "balance_loss_mlp": 1.00018334, "epoch": 0.9208200565141585, "flos": 53358427049760.0, "grad_norm": 2.153952019751571, "language_loss": 0.65738595, "learning_rate": 6.53215897462931e-08, "loss": 0.68235219, "num_input_tokens_seen": 165517200, "step": 7658, "time_per_iteration": 3.9155571460723877 }, { "auxiliary_loss_clip": 0.01335903, "auxiliary_loss_mlp": 0.01193175, "balance_loss_clip": 1.00768018, "balance_loss_mlp": 1.00015163, "epoch": 0.9209402994047977, "flos": 30589152465600.0, "grad_norm": 1.7508889116661996, "language_loss": 0.74521017, "learning_rate": 6.512427853443103e-08, "loss": 0.77050102, "num_input_tokens_seen": 165539280, "step": 7659, "time_per_iteration": 2.8009181022644043 }, { "auxiliary_loss_clip": 0.01335062, "auxiliary_loss_mlp": 0.01193077, "balance_loss_clip": 1.00779629, "balance_loss_mlp": 1.0001483, "epoch": 0.9210605422954368, "flos": 29132724140640.0, "grad_norm": 1.4758339445946596, "language_loss": 0.75689173, "learning_rate": 6.492726084167799e-08, "loss": 0.78217316, "num_input_tokens_seen": 165561395, "step": 7660, "time_per_iteration": 2.8338067531585693 }, { "auxiliary_loss_clip": 0.01315654, "auxiliary_loss_mlp": 0.01192272, "balance_loss_clip": 1.00349855, "balance_loss_mlp": 1.00001132, "epoch": 0.9211807851860758, "flos": 54853868491200.0, "grad_norm": 0.7777189521786414, "language_loss": 0.57474154, "learning_rate": 6.473053669792072e-08, "loss": 0.59982079, "num_input_tokens_seen": 165616085, "step": 7661, "time_per_iteration": 3.165672540664673 }, { "auxiliary_loss_clip": 0.01335737, "auxiliary_loss_mlp": 0.01193056, "balance_loss_clip": 1.00796294, "balance_loss_mlp": 1.00012767, "epoch": 0.921301028076715, "flos": 19201982554080.0, "grad_norm": 2.120326852523119, "language_loss": 0.73026615, "learning_rate": 6.453410613300248e-08, "loss": 0.75555408, "num_input_tokens_seen": 165634015, "step": 7662, "time_per_iteration": 2.710139513015747 }, { "auxiliary_loss_clip": 0.01256927, "auxiliary_loss_mlp": 0.01193177, "balance_loss_clip": 1.00646162, "balance_loss_mlp": 1.0001533, "epoch": 0.921421270967354, "flos": 27526906143360.0, "grad_norm": 1.8752935513491846, "language_loss": 0.58225012, "learning_rate": 6.43379691767214e-08, "loss": 0.60675114, "num_input_tokens_seen": 165653220, "step": 7663, "time_per_iteration": 2.893526077270508 }, { "auxiliary_loss_clip": 0.01253884, "auxiliary_loss_mlp": 0.01192273, "balance_loss_clip": 1.00307703, "balance_loss_mlp": 1.00001192, "epoch": 0.9215415138579931, "flos": 70209344867040.0, "grad_norm": 0.722489201592284, "language_loss": 0.55165744, "learning_rate": 6.414212585883105e-08, "loss": 0.57611895, "num_input_tokens_seen": 165715850, "step": 7664, "time_per_iteration": 3.4000144004821777 }, { "auxiliary_loss_clip": 0.013135, "auxiliary_loss_mlp": 0.01193082, "balance_loss_clip": 1.00754642, "balance_loss_mlp": 1.0001539, "epoch": 0.9216617567486323, "flos": 35553122235360.0, "grad_norm": 1.5719848107185113, "language_loss": 0.69915271, "learning_rate": 6.394657620904143e-08, "loss": 0.72421849, "num_input_tokens_seen": 165738960, "step": 7665, "time_per_iteration": 2.8959767818450928 }, { "auxiliary_loss_clip": 0.01349462, "auxiliary_loss_mlp": 0.01193065, "balance_loss_clip": 1.00835752, "balance_loss_mlp": 1.00013649, "epoch": 0.9217819996392713, "flos": 29533343414400.0, "grad_norm": 2.31988044399076, "language_loss": 0.71898109, "learning_rate": 6.375132025701657e-08, "loss": 0.74440634, "num_input_tokens_seen": 165761260, "step": 7666, "time_per_iteration": 2.7457053661346436 }, { "auxiliary_loss_clip": 0.01348456, "auxiliary_loss_mlp": 0.01193015, "balance_loss_clip": 1.00828922, "balance_loss_mlp": 1.00018239, "epoch": 0.9219022425299104, "flos": 14574678402240.0, "grad_norm": 2.3285949188097743, "language_loss": 0.69704109, "learning_rate": 6.355635803237724e-08, "loss": 0.72245586, "num_input_tokens_seen": 165776960, "step": 7667, "time_per_iteration": 2.649806022644043 }, { "auxiliary_loss_clip": 0.01336166, "auxiliary_loss_mlp": 0.01193075, "balance_loss_clip": 1.00816107, "balance_loss_mlp": 1.0001471, "epoch": 0.9220224854205495, "flos": 18077513844960.0, "grad_norm": 1.827796153272531, "language_loss": 0.79201007, "learning_rate": 6.336168956469867e-08, "loss": 0.81730253, "num_input_tokens_seen": 165795435, "step": 7668, "time_per_iteration": 2.7438595294952393 }, { "auxiliary_loss_clip": 0.01312069, "auxiliary_loss_mlp": 0.01193068, "balance_loss_clip": 1.0076375, "balance_loss_mlp": 1.00013924, "epoch": 0.9221427283111886, "flos": 24790476487680.0, "grad_norm": 2.0191708562447492, "language_loss": 0.71780324, "learning_rate": 6.316731488351168e-08, "loss": 0.7428546, "num_input_tokens_seen": 165816625, "step": 7669, "time_per_iteration": 2.8084123134613037 }, { "auxiliary_loss_clip": 0.01325167, "auxiliary_loss_mlp": 0.01193078, "balance_loss_clip": 1.00709128, "balance_loss_mlp": 1.00014937, "epoch": 0.9222629712018277, "flos": 13845045254400.0, "grad_norm": 1.9308279996452797, "language_loss": 0.63175547, "learning_rate": 6.297323401830334e-08, "loss": 0.6569379, "num_input_tokens_seen": 165835410, "step": 7670, "time_per_iteration": 2.7085072994232178 }, { "auxiliary_loss_clip": 0.0133096, "auxiliary_loss_mlp": 0.01193119, "balance_loss_clip": 1.0072515, "balance_loss_mlp": 1.00019097, "epoch": 0.9223832140924668, "flos": 21616187911200.0, "grad_norm": 1.9915075617692608, "language_loss": 0.68758571, "learning_rate": 6.277944699851523e-08, "loss": 0.71282649, "num_input_tokens_seen": 165854930, "step": 7671, "time_per_iteration": 2.7157511711120605 }, { "auxiliary_loss_clip": 0.01347942, "auxiliary_loss_mlp": 0.01193191, "balance_loss_clip": 1.00796008, "balance_loss_mlp": 1.00016785, "epoch": 0.9225034569831059, "flos": 21142095206400.0, "grad_norm": 2.0306542639900336, "language_loss": 0.73418659, "learning_rate": 6.25859538535447e-08, "loss": 0.7595979, "num_input_tokens_seen": 165875725, "step": 7672, "time_per_iteration": 2.6524598598480225 }, { "auxiliary_loss_clip": 0.01302593, "auxiliary_loss_mlp": 0.01193167, "balance_loss_clip": 1.00644565, "balance_loss_mlp": 1.00014317, "epoch": 0.9226236998737449, "flos": 12495055089600.0, "grad_norm": 3.025253772191176, "language_loss": 0.7740503, "learning_rate": 6.239275461274474e-08, "loss": 0.79900789, "num_input_tokens_seen": 165892100, "step": 7673, "time_per_iteration": 2.7721619606018066 }, { "auxiliary_loss_clip": 0.01326446, "auxiliary_loss_mlp": 0.01193204, "balance_loss_clip": 1.00718117, "balance_loss_mlp": 1.00018048, "epoch": 0.9227439427643841, "flos": 26214083778240.0, "grad_norm": 1.6558821636855898, "language_loss": 0.85925847, "learning_rate": 6.219984930542299e-08, "loss": 0.88445497, "num_input_tokens_seen": 165912840, "step": 7674, "time_per_iteration": 2.8054792881011963 }, { "auxiliary_loss_clip": 0.01336251, "auxiliary_loss_mlp": 0.01193112, "balance_loss_clip": 1.00796711, "balance_loss_mlp": 1.00018358, "epoch": 0.9228641856550232, "flos": 17967591087840.0, "grad_norm": 2.231431814001006, "language_loss": 0.75627214, "learning_rate": 6.200723796084383e-08, "loss": 0.78156573, "num_input_tokens_seen": 165930935, "step": 7675, "time_per_iteration": 2.7229599952697754 }, { "auxiliary_loss_clip": 0.01278867, "auxiliary_loss_mlp": 0.0119231, "balance_loss_clip": 1.0035646, "balance_loss_mlp": 1.00004888, "epoch": 0.9229844285456622, "flos": 70420640545440.0, "grad_norm": 0.7562749718547628, "language_loss": 0.63036096, "learning_rate": 6.181492060822546e-08, "loss": 0.65507269, "num_input_tokens_seen": 165991110, "step": 7676, "time_per_iteration": 3.241671562194824 }, { "auxiliary_loss_clip": 0.01287455, "auxiliary_loss_mlp": 0.01193085, "balance_loss_clip": 1.00701571, "balance_loss_mlp": 1.00015676, "epoch": 0.9231046714363014, "flos": 17967842553600.0, "grad_norm": 1.93959096589573, "language_loss": 0.8179608, "learning_rate": 6.162289727674274e-08, "loss": 0.84276617, "num_input_tokens_seen": 166008790, "step": 7677, "time_per_iteration": 2.8001441955566406 }, { "auxiliary_loss_clip": 0.01291935, "auxiliary_loss_mlp": 0.01192866, "balance_loss_clip": 1.00621343, "balance_loss_mlp": 1.00012803, "epoch": 0.9232249143269404, "flos": 17858243109600.0, "grad_norm": 2.1973817030918394, "language_loss": 0.87637019, "learning_rate": 6.143116799552527e-08, "loss": 0.90121824, "num_input_tokens_seen": 166025035, "step": 7678, "time_per_iteration": 2.76754093170166 }, { "auxiliary_loss_clip": 0.01328696, "auxiliary_loss_mlp": 0.01193154, "balance_loss_clip": 1.00725627, "balance_loss_mlp": 1.00022578, "epoch": 0.9233451572175795, "flos": 23404396233600.0, "grad_norm": 2.4203615138823, "language_loss": 0.55929959, "learning_rate": 6.123973279365802e-08, "loss": 0.58451807, "num_input_tokens_seen": 166044010, "step": 7679, "time_per_iteration": 2.746596336364746 }, { "auxiliary_loss_clip": 0.01334332, "auxiliary_loss_mlp": 0.01193209, "balance_loss_clip": 1.00784862, "balance_loss_mlp": 1.00018501, "epoch": 0.9234654001082186, "flos": 17999334411840.0, "grad_norm": 1.7602846322635577, "language_loss": 0.77817291, "learning_rate": 6.10485917001824e-08, "loss": 0.80344832, "num_input_tokens_seen": 166061865, "step": 7680, "time_per_iteration": 2.7161948680877686 }, { "auxiliary_loss_clip": 0.01307501, "auxiliary_loss_mlp": 0.0119317, "balance_loss_clip": 1.00784111, "balance_loss_mlp": 1.00014603, "epoch": 0.9235856429988577, "flos": 24750758106720.0, "grad_norm": 1.602645503609418, "language_loss": 0.80899316, "learning_rate": 6.085774474409322e-08, "loss": 0.83399987, "num_input_tokens_seen": 166082425, "step": 7681, "time_per_iteration": 2.8132164478302 }, { "auxiliary_loss_clip": 0.01304262, "auxiliary_loss_mlp": 0.01193219, "balance_loss_clip": 1.0066191, "balance_loss_mlp": 1.00019526, "epoch": 0.9237058858894968, "flos": 14099903147520.0, "grad_norm": 1.9424412287015178, "language_loss": 0.69846815, "learning_rate": 6.066719195434267e-08, "loss": 0.72344291, "num_input_tokens_seen": 166100225, "step": 7682, "time_per_iteration": 3.63042950630188 }, { "auxiliary_loss_clip": 0.01327878, "auxiliary_loss_mlp": 0.01193209, "balance_loss_clip": 1.00750113, "balance_loss_mlp": 1.00018489, "epoch": 0.9238261287801359, "flos": 28694541906720.0, "grad_norm": 1.7407547075922214, "language_loss": 0.6617021, "learning_rate": 6.047693335983717e-08, "loss": 0.68691295, "num_input_tokens_seen": 166122570, "step": 7683, "time_per_iteration": 5.058806896209717 }, { "auxiliary_loss_clip": 0.01331017, "auxiliary_loss_mlp": 0.01193175, "balance_loss_clip": 1.00741577, "balance_loss_mlp": 1.00015128, "epoch": 0.923946371670775, "flos": 23111867609280.0, "grad_norm": 2.781251377633526, "language_loss": 0.82176423, "learning_rate": 6.028696898943853e-08, "loss": 0.84700614, "num_input_tokens_seen": 166141630, "step": 7684, "time_per_iteration": 3.685476303100586 }, { "auxiliary_loss_clip": 0.01315216, "auxiliary_loss_mlp": 0.00872558, "balance_loss_clip": 1.00717258, "balance_loss_mlp": 1.00045395, "epoch": 0.924066614561414, "flos": 21867130123200.0, "grad_norm": 1.9510066652565035, "language_loss": 0.70761001, "learning_rate": 6.00972988719648e-08, "loss": 0.72948778, "num_input_tokens_seen": 166159865, "step": 7685, "time_per_iteration": 2.7923760414123535 }, { "auxiliary_loss_clip": 0.01288364, "auxiliary_loss_mlp": 0.00872565, "balance_loss_clip": 1.00702655, "balance_loss_mlp": 1.00051427, "epoch": 0.9241868574520532, "flos": 28511900115840.0, "grad_norm": 2.720539496939064, "language_loss": 0.70548487, "learning_rate": 5.990792303618807e-08, "loss": 0.72709423, "num_input_tokens_seen": 166179445, "step": 7686, "time_per_iteration": 2.7917320728302 }, { "auxiliary_loss_clip": 0.01286992, "auxiliary_loss_mlp": 0.01193153, "balance_loss_clip": 1.00697398, "balance_loss_mlp": 1.00012958, "epoch": 0.9243071003426923, "flos": 30518337386880.0, "grad_norm": 1.8143361804078035, "language_loss": 0.69131595, "learning_rate": 5.971884151083695e-08, "loss": 0.71611738, "num_input_tokens_seen": 166201855, "step": 7687, "time_per_iteration": 2.899838924407959 }, { "auxiliary_loss_clip": 0.01322965, "auxiliary_loss_mlp": 0.01193168, "balance_loss_clip": 1.00832152, "balance_loss_mlp": 1.000144, "epoch": 0.9244273432333313, "flos": 28658344046400.0, "grad_norm": 1.6977241258926965, "language_loss": 0.74409932, "learning_rate": 5.9530054324595124e-08, "loss": 0.76926064, "num_input_tokens_seen": 166221970, "step": 7688, "time_per_iteration": 2.8007020950317383 }, { "auxiliary_loss_clip": 0.01294526, "auxiliary_loss_mlp": 0.00871752, "balance_loss_clip": 1.00351477, "balance_loss_mlp": 1.00000179, "epoch": 0.9245475861239704, "flos": 66230620535520.0, "grad_norm": 0.718684143366176, "language_loss": 0.57550299, "learning_rate": 5.934156150610103e-08, "loss": 0.5971657, "num_input_tokens_seen": 166279335, "step": 7689, "time_per_iteration": 3.3538458347320557 }, { "auxiliary_loss_clip": 0.01312484, "auxiliary_loss_mlp": 0.01193059, "balance_loss_clip": 1.0071497, "balance_loss_mlp": 1.00022626, "epoch": 0.9246678290146095, "flos": 24239928686400.0, "grad_norm": 2.073072673548774, "language_loss": 0.7923885, "learning_rate": 5.915336308394914e-08, "loss": 0.81744391, "num_input_tokens_seen": 166298170, "step": 7690, "time_per_iteration": 2.755587339401245 }, { "auxiliary_loss_clip": 0.01322662, "auxiliary_loss_mlp": 0.0119317, "balance_loss_clip": 1.00679517, "balance_loss_mlp": 1.00014615, "epoch": 0.9247880719052486, "flos": 18988818844320.0, "grad_norm": 1.5701354786767738, "language_loss": 0.77001637, "learning_rate": 5.89654590866886e-08, "loss": 0.79517472, "num_input_tokens_seen": 166317670, "step": 7691, "time_per_iteration": 2.7128090858459473 }, { "auxiliary_loss_clip": 0.01247525, "auxiliary_loss_mlp": 0.01193284, "balance_loss_clip": 1.00660133, "balance_loss_mlp": 1.00016546, "epoch": 0.9249083147958876, "flos": 24024106624320.0, "grad_norm": 1.9359168988239401, "language_loss": 0.88488245, "learning_rate": 5.877784954282483e-08, "loss": 0.90929055, "num_input_tokens_seen": 166337010, "step": 7692, "time_per_iteration": 2.8857178688049316 }, { "auxiliary_loss_clip": 0.01330539, "auxiliary_loss_mlp": 0.01193221, "balance_loss_clip": 1.0073384, "balance_loss_mlp": 1.00019789, "epoch": 0.9250285576865268, "flos": 30773985600960.0, "grad_norm": 1.8174555765537752, "language_loss": 0.72381884, "learning_rate": 5.8590534480817963e-08, "loss": 0.74905646, "num_input_tokens_seen": 166358735, "step": 7693, "time_per_iteration": 2.7883496284484863 }, { "auxiliary_loss_clip": 0.01348296, "auxiliary_loss_mlp": 0.01193168, "balance_loss_clip": 1.0082171, "balance_loss_mlp": 1.00014424, "epoch": 0.9251488005771659, "flos": 10633588878240.0, "grad_norm": 2.088414979339929, "language_loss": 0.71833313, "learning_rate": 5.840351392908349e-08, "loss": 0.74374771, "num_input_tokens_seen": 166374455, "step": 7694, "time_per_iteration": 2.7476847171783447 }, { "auxiliary_loss_clip": 0.01323758, "auxiliary_loss_mlp": 0.0087247, "balance_loss_clip": 1.00809288, "balance_loss_mlp": 1.00047541, "epoch": 0.9252690434678049, "flos": 23586427321920.0, "grad_norm": 2.1346352578062953, "language_loss": 0.70523685, "learning_rate": 5.821678791599205e-08, "loss": 0.7271992, "num_input_tokens_seen": 166393900, "step": 7695, "time_per_iteration": 2.754242420196533 }, { "auxiliary_loss_clip": 0.01305756, "auxiliary_loss_mlp": 0.01193055, "balance_loss_clip": 1.00689197, "balance_loss_mlp": 1.00012624, "epoch": 0.9253892863584441, "flos": 21469169201760.0, "grad_norm": 3.105440817756859, "language_loss": 0.80867565, "learning_rate": 5.803035646986965e-08, "loss": 0.83366376, "num_input_tokens_seen": 166413235, "step": 7696, "time_per_iteration": 2.811613082885742 }, { "auxiliary_loss_clip": 0.01349174, "auxiliary_loss_mlp": 0.01192962, "balance_loss_clip": 1.00813115, "balance_loss_mlp": 1.00012934, "epoch": 0.9255095292490831, "flos": 17456689820160.0, "grad_norm": 2.0353129235729277, "language_loss": 0.67352605, "learning_rate": 5.7844219618998766e-08, "loss": 0.69894743, "num_input_tokens_seen": 166427560, "step": 7697, "time_per_iteration": 2.642988443374634 }, { "auxiliary_loss_clip": 0.01302212, "auxiliary_loss_mlp": 0.0119316, "balance_loss_clip": 1.00690579, "balance_loss_mlp": 1.00013614, "epoch": 0.9256297721397222, "flos": 24750686259360.0, "grad_norm": 2.128004412306031, "language_loss": 0.71757758, "learning_rate": 5.765837739161505e-08, "loss": 0.7425313, "num_input_tokens_seen": 166446680, "step": 7698, "time_per_iteration": 2.7715864181518555 }, { "auxiliary_loss_clip": 0.01292137, "auxiliary_loss_mlp": 0.01192981, "balance_loss_clip": 1.00697112, "balance_loss_mlp": 1.0001483, "epoch": 0.9257500150303614, "flos": 23112226846080.0, "grad_norm": 1.6001743159321933, "language_loss": 0.74132061, "learning_rate": 5.7472829815911504e-08, "loss": 0.76617181, "num_input_tokens_seen": 166465505, "step": 7699, "time_per_iteration": 2.846774101257324 }, { "auxiliary_loss_clip": 0.01313007, "auxiliary_loss_mlp": 0.01193089, "balance_loss_clip": 1.00741398, "balance_loss_mlp": 1.00016093, "epoch": 0.9258702579210004, "flos": 22564696633920.0, "grad_norm": 1.6309343252802508, "language_loss": 0.81811988, "learning_rate": 5.7287576920035164e-08, "loss": 0.84318089, "num_input_tokens_seen": 166484520, "step": 7700, "time_per_iteration": 2.7189505100250244 }, { "auxiliary_loss_clip": 0.012817, "auxiliary_loss_mlp": 0.01192945, "balance_loss_clip": 1.00624061, "balance_loss_mlp": 1.00011206, "epoch": 0.9259905008116395, "flos": 30004310759040.0, "grad_norm": 1.7550826143359506, "language_loss": 0.76507837, "learning_rate": 5.7102618732088435e-08, "loss": 0.78982484, "num_input_tokens_seen": 166503850, "step": 7701, "time_per_iteration": 2.7759499549865723 }, { "auxiliary_loss_clip": 0.01307146, "auxiliary_loss_mlp": 0.01193081, "balance_loss_clip": 1.00789225, "balance_loss_mlp": 1.00015306, "epoch": 0.9261107437022786, "flos": 24572139768000.0, "grad_norm": 1.5318973437635883, "language_loss": 0.74531114, "learning_rate": 5.6917955280130216e-08, "loss": 0.77031338, "num_input_tokens_seen": 166525330, "step": 7702, "time_per_iteration": 2.6473450660705566 }, { "auxiliary_loss_clip": 0.01322095, "auxiliary_loss_mlp": 0.01193042, "balance_loss_clip": 1.00764656, "balance_loss_mlp": 1.00011337, "epoch": 0.9262309865929177, "flos": 22018459674240.0, "grad_norm": 2.742926602496671, "language_loss": 0.71816516, "learning_rate": 5.6733586592172755e-08, "loss": 0.74331653, "num_input_tokens_seen": 166544825, "step": 7703, "time_per_iteration": 2.627591848373413 }, { "auxiliary_loss_clip": 0.01309662, "auxiliary_loss_mlp": 0.00872372, "balance_loss_clip": 1.00695288, "balance_loss_mlp": 1.00041556, "epoch": 0.9263512294835567, "flos": 20339491559040.0, "grad_norm": 1.8593893337673018, "language_loss": 0.79950678, "learning_rate": 5.6549512696185244e-08, "loss": 0.82132709, "num_input_tokens_seen": 166563325, "step": 7704, "time_per_iteration": 2.840118408203125 }, { "auxiliary_loss_clip": 0.01347109, "auxiliary_loss_mlp": 0.01193008, "balance_loss_clip": 1.00780058, "balance_loss_mlp": 1.00017548, "epoch": 0.9264714723741959, "flos": 21215389019040.0, "grad_norm": 2.0074161803319197, "language_loss": 0.68821728, "learning_rate": 5.636573362009156e-08, "loss": 0.71361852, "num_input_tokens_seen": 166583385, "step": 7705, "time_per_iteration": 2.708686113357544 }, { "auxiliary_loss_clip": 0.01348909, "auxiliary_loss_mlp": 0.01193213, "balance_loss_clip": 1.00795448, "balance_loss_mlp": 1.0001893, "epoch": 0.926591715264835, "flos": 18004974429600.0, "grad_norm": 2.085684932970657, "language_loss": 0.77267104, "learning_rate": 5.618224939177074e-08, "loss": 0.79809225, "num_input_tokens_seen": 166601290, "step": 7706, "time_per_iteration": 2.6496543884277344 }, { "auxiliary_loss_clip": 0.01310978, "auxiliary_loss_mlp": 0.01192958, "balance_loss_clip": 1.0073185, "balance_loss_mlp": 1.00012517, "epoch": 0.926711958155474, "flos": 36167982929280.0, "grad_norm": 1.7694555625996577, "language_loss": 0.70094281, "learning_rate": 5.599906003905719e-08, "loss": 0.72598213, "num_input_tokens_seen": 166623835, "step": 7707, "time_per_iteration": 3.01910400390625 }, { "auxiliary_loss_clip": 0.01323255, "auxiliary_loss_mlp": 0.01192924, "balance_loss_clip": 1.00815558, "balance_loss_mlp": 1.00018644, "epoch": 0.9268322010461132, "flos": 21032747228160.0, "grad_norm": 2.2389651589118396, "language_loss": 0.81666082, "learning_rate": 5.581616558974023e-08, "loss": 0.84182262, "num_input_tokens_seen": 166642400, "step": 7708, "time_per_iteration": 3.6328530311584473 }, { "auxiliary_loss_clip": 0.01336724, "auxiliary_loss_mlp": 0.00872622, "balance_loss_clip": 1.00801182, "balance_loss_mlp": 1.00052071, "epoch": 0.9269524439367522, "flos": 22964848899840.0, "grad_norm": 1.7995074613757291, "language_loss": 0.78998709, "learning_rate": 5.5633566071565444e-08, "loss": 0.81208056, "num_input_tokens_seen": 166661640, "step": 7709, "time_per_iteration": 3.718916893005371 }, { "auxiliary_loss_clip": 0.01257143, "auxiliary_loss_mlp": 0.01193091, "balance_loss_clip": 1.00610614, "balance_loss_mlp": 1.00016296, "epoch": 0.9270726868273913, "flos": 41975567979840.0, "grad_norm": 1.980290166122422, "language_loss": 0.70831746, "learning_rate": 5.5451261512232896e-08, "loss": 0.7328198, "num_input_tokens_seen": 166684320, "step": 7710, "time_per_iteration": 3.991255283355713 }, { "auxiliary_loss_clip": 0.01336035, "auxiliary_loss_mlp": 0.01193071, "balance_loss_clip": 1.00728452, "balance_loss_mlp": 1.00014257, "epoch": 0.9271929297180305, "flos": 19791781728480.0, "grad_norm": 1.9099503865682157, "language_loss": 0.62591338, "learning_rate": 5.5269251939397576e-08, "loss": 0.65120447, "num_input_tokens_seen": 166703835, "step": 7711, "time_per_iteration": 3.8855533599853516 }, { "auxiliary_loss_clip": 0.01297585, "auxiliary_loss_mlp": 0.01193172, "balance_loss_clip": 1.00726533, "balance_loss_mlp": 1.00014782, "epoch": 0.9273131726086695, "flos": 19968352417440.0, "grad_norm": 1.861638145369941, "language_loss": 0.76343745, "learning_rate": 5.508753738067073e-08, "loss": 0.78834498, "num_input_tokens_seen": 166723375, "step": 7712, "time_per_iteration": 2.8000869750976562 }, { "auxiliary_loss_clip": 0.01335708, "auxiliary_loss_mlp": 0.01193182, "balance_loss_clip": 1.00754142, "balance_loss_mlp": 1.00015879, "epoch": 0.9274334154993086, "flos": 23258598929280.0, "grad_norm": 1.7555634526709907, "language_loss": 0.79301214, "learning_rate": 5.4906117863617875e-08, "loss": 0.81830102, "num_input_tokens_seen": 166742760, "step": 7713, "time_per_iteration": 2.772974967956543 }, { "auxiliary_loss_clip": 0.01299933, "auxiliary_loss_mlp": 0.01193059, "balance_loss_clip": 1.00744367, "balance_loss_mlp": 1.00013077, "epoch": 0.9275536583899477, "flos": 31795357052160.0, "grad_norm": 1.706025704672785, "language_loss": 0.78073239, "learning_rate": 5.4724993415760533e-08, "loss": 0.80566233, "num_input_tokens_seen": 166761115, "step": 7714, "time_per_iteration": 2.865473508834839 }, { "auxiliary_loss_clip": 0.01305648, "auxiliary_loss_mlp": 0.00872445, "balance_loss_clip": 1.00754225, "balance_loss_mlp": 1.00042641, "epoch": 0.9276739012805868, "flos": 18697008693600.0, "grad_norm": 4.62048036035117, "language_loss": 0.7478894, "learning_rate": 5.454416406457496e-08, "loss": 0.76967037, "num_input_tokens_seen": 166780210, "step": 7715, "time_per_iteration": 2.712131977081299 }, { "auxiliary_loss_clip": 0.01334615, "auxiliary_loss_mlp": 0.01192967, "balance_loss_clip": 1.00753915, "balance_loss_mlp": 1.00013399, "epoch": 0.9277941441712259, "flos": 13879087693920.0, "grad_norm": 2.7691192390864874, "language_loss": 0.74478579, "learning_rate": 5.436362983749299e-08, "loss": 0.77006161, "num_input_tokens_seen": 166795380, "step": 7716, "time_per_iteration": 2.7149651050567627 }, { "auxiliary_loss_clip": 0.01276109, "auxiliary_loss_mlp": 0.0119298, "balance_loss_clip": 1.00622439, "balance_loss_mlp": 1.00014687, "epoch": 0.927914387061865, "flos": 23258670776640.0, "grad_norm": 1.8808980031588483, "language_loss": 0.64363879, "learning_rate": 5.418339076190137e-08, "loss": 0.66832966, "num_input_tokens_seen": 166814890, "step": 7717, "time_per_iteration": 2.8279829025268555 }, { "auxiliary_loss_clip": 0.01314559, "auxiliary_loss_mlp": 0.011929, "balance_loss_clip": 1.00780606, "balance_loss_mlp": 1.00016236, "epoch": 0.9280346299525041, "flos": 18073741858560.0, "grad_norm": 1.7261368235222554, "language_loss": 0.88484919, "learning_rate": 5.400344686514202e-08, "loss": 0.90992379, "num_input_tokens_seen": 166832475, "step": 7718, "time_per_iteration": 2.7597172260284424 }, { "auxiliary_loss_clip": 0.01324939, "auxiliary_loss_mlp": 0.01193082, "balance_loss_clip": 1.00754666, "balance_loss_mlp": 1.00015414, "epoch": 0.9281548728431431, "flos": 22342911240960.0, "grad_norm": 1.7689799127692123, "language_loss": 0.66673785, "learning_rate": 5.38237981745131e-08, "loss": 0.69191802, "num_input_tokens_seen": 166850590, "step": 7719, "time_per_iteration": 2.718550443649292 }, { "auxiliary_loss_clip": 0.01335492, "auxiliary_loss_mlp": 0.00872367, "balance_loss_clip": 1.00795913, "balance_loss_mlp": 1.00046289, "epoch": 0.9282751157337822, "flos": 18843775937280.0, "grad_norm": 1.6523704790165024, "language_loss": 0.81471437, "learning_rate": 5.364444471726592e-08, "loss": 0.83679295, "num_input_tokens_seen": 166869795, "step": 7720, "time_per_iteration": 2.700269937515259 }, { "auxiliary_loss_clip": 0.01324441, "auxiliary_loss_mlp": 0.01193045, "balance_loss_clip": 1.00704527, "balance_loss_mlp": 1.00011683, "epoch": 0.9283953586244214, "flos": 25556846351040.0, "grad_norm": 2.368463965468393, "language_loss": 0.80237758, "learning_rate": 5.346538652060939e-08, "loss": 0.82755238, "num_input_tokens_seen": 166891150, "step": 7721, "time_per_iteration": 2.7693519592285156 }, { "auxiliary_loss_clip": 0.01309431, "auxiliary_loss_mlp": 0.01192987, "balance_loss_clip": 1.00785708, "balance_loss_mlp": 1.00015402, "epoch": 0.9285156015150604, "flos": 18223490767680.0, "grad_norm": 1.9299119677320744, "language_loss": 0.70242417, "learning_rate": 5.3286623611705994e-08, "loss": 0.72744834, "num_input_tokens_seen": 166909195, "step": 7722, "time_per_iteration": 2.7827017307281494 }, { "auxiliary_loss_clip": 0.01315726, "auxiliary_loss_mlp": 0.0119227, "balance_loss_clip": 1.0035429, "balance_loss_mlp": 1.00000954, "epoch": 0.9286358444056995, "flos": 66400055621280.0, "grad_norm": 0.8279447346386225, "language_loss": 0.60582459, "learning_rate": 5.3108156017673824e-08, "loss": 0.63090456, "num_input_tokens_seen": 166970955, "step": 7723, "time_per_iteration": 3.3558220863342285 }, { "auxiliary_loss_clip": 0.01297045, "auxiliary_loss_mlp": 0.01193156, "balance_loss_clip": 1.00723004, "balance_loss_mlp": 1.00013185, "epoch": 0.9287560872963386, "flos": 22345641440640.0, "grad_norm": 1.801482924274972, "language_loss": 0.7164315, "learning_rate": 5.2929983765586775e-08, "loss": 0.74133348, "num_input_tokens_seen": 166989735, "step": 7724, "time_per_iteration": 2.7956318855285645 }, { "auxiliary_loss_clip": 0.01347287, "auxiliary_loss_mlp": 0.01193089, "balance_loss_clip": 1.00770092, "balance_loss_mlp": 1.00016069, "epoch": 0.9288763301869777, "flos": 25700236768800.0, "grad_norm": 2.3220235527010304, "language_loss": 0.62167525, "learning_rate": 5.275210688247278e-08, "loss": 0.64707905, "num_input_tokens_seen": 167010060, "step": 7725, "time_per_iteration": 2.7892470359802246 }, { "auxiliary_loss_clip": 0.01270667, "auxiliary_loss_mlp": 0.01193069, "balance_loss_clip": 1.00689507, "balance_loss_mlp": 1.00014114, "epoch": 0.9289965730776167, "flos": 12312054061920.0, "grad_norm": 4.251607064529413, "language_loss": 0.84803581, "learning_rate": 5.257452539531604e-08, "loss": 0.87267321, "num_input_tokens_seen": 167027130, "step": 7726, "time_per_iteration": 2.8273732662200928 }, { "auxiliary_loss_clip": 0.01334935, "auxiliary_loss_mlp": 0.01193085, "balance_loss_clip": 1.00777471, "balance_loss_mlp": 1.00015712, "epoch": 0.9291168159682559, "flos": 26685985138560.0, "grad_norm": 1.526589621766338, "language_loss": 0.6853261, "learning_rate": 5.2397239331055445e-08, "loss": 0.71060634, "num_input_tokens_seen": 167049130, "step": 7727, "time_per_iteration": 2.8522403240203857 }, { "auxiliary_loss_clip": 0.01301664, "auxiliary_loss_mlp": 0.01193165, "balance_loss_clip": 1.00686324, "balance_loss_mlp": 1.00014138, "epoch": 0.929237058858895, "flos": 14538264999840.0, "grad_norm": 2.086843941835983, "language_loss": 0.81215537, "learning_rate": 5.2220248716585036e-08, "loss": 0.83710366, "num_input_tokens_seen": 167066810, "step": 7728, "time_per_iteration": 2.8458263874053955 }, { "auxiliary_loss_clip": 0.01335589, "auxiliary_loss_mlp": 0.01193088, "balance_loss_clip": 1.00760782, "balance_loss_mlp": 1.00015962, "epoch": 0.929357301749534, "flos": 23835465578880.0, "grad_norm": 5.329625280413745, "language_loss": 0.753497, "learning_rate": 5.204355357875445e-08, "loss": 0.77878374, "num_input_tokens_seen": 167085155, "step": 7729, "time_per_iteration": 2.7576780319213867 }, { "auxiliary_loss_clip": 0.01323462, "auxiliary_loss_mlp": 0.01193077, "balance_loss_clip": 1.00791025, "balance_loss_mlp": 1.00014853, "epoch": 0.9294775446401732, "flos": 12969327412800.0, "grad_norm": 2.2883412933147547, "language_loss": 0.70200086, "learning_rate": 5.1867153944367584e-08, "loss": 0.72716618, "num_input_tokens_seen": 167101545, "step": 7730, "time_per_iteration": 2.771510601043701 }, { "auxiliary_loss_clip": 0.01295656, "auxiliary_loss_mlp": 0.01193205, "balance_loss_clip": 1.00653589, "balance_loss_mlp": 1.00018084, "epoch": 0.9295977875308122, "flos": 26211820586400.0, "grad_norm": 1.556980322127914, "language_loss": 0.73296154, "learning_rate": 5.16910498401848e-08, "loss": 0.75785011, "num_input_tokens_seen": 167120995, "step": 7731, "time_per_iteration": 2.810930013656616 }, { "auxiliary_loss_clip": 0.01347859, "auxiliary_loss_mlp": 0.01192885, "balance_loss_clip": 1.0080241, "balance_loss_mlp": 1.00014782, "epoch": 0.9297180304214513, "flos": 16472306550240.0, "grad_norm": 1.9831730657275428, "language_loss": 0.83345306, "learning_rate": 5.151524129292073e-08, "loss": 0.85886043, "num_input_tokens_seen": 167138890, "step": 7732, "time_per_iteration": 2.6849448680877686 }, { "auxiliary_loss_clip": 0.01323244, "auxiliary_loss_mlp": 0.01193191, "balance_loss_clip": 1.0070672, "balance_loss_mlp": 1.00016689, "epoch": 0.9298382733120905, "flos": 24060448179360.0, "grad_norm": 1.847083925105161, "language_loss": 0.66308987, "learning_rate": 5.1339728329245155e-08, "loss": 0.68825424, "num_input_tokens_seen": 167159455, "step": 7733, "time_per_iteration": 2.737767457962036 }, { "auxiliary_loss_clip": 0.01349078, "auxiliary_loss_mlp": 0.01193072, "balance_loss_clip": 1.00785112, "balance_loss_mlp": 1.000144, "epoch": 0.9299585162027295, "flos": 22127663957760.0, "grad_norm": 1.9406842962187647, "language_loss": 0.79414129, "learning_rate": 5.116451097578367e-08, "loss": 0.81956279, "num_input_tokens_seen": 167178495, "step": 7734, "time_per_iteration": 3.8353726863861084 }, { "auxiliary_loss_clip": 0.01300134, "auxiliary_loss_mlp": 0.01193121, "balance_loss_clip": 1.00741339, "balance_loss_mlp": 1.0001924, "epoch": 0.9300787590933686, "flos": 21471791630400.0, "grad_norm": 1.5852622249166939, "language_loss": 0.74426413, "learning_rate": 5.0989589259115895e-08, "loss": 0.76919663, "num_input_tokens_seen": 167199380, "step": 7735, "time_per_iteration": 3.897481918334961 }, { "auxiliary_loss_clip": 0.01336309, "auxiliary_loss_mlp": 0.01193209, "balance_loss_clip": 1.00757027, "balance_loss_mlp": 1.00018513, "epoch": 0.9301990019840077, "flos": 17779596668640.0, "grad_norm": 1.832524323571213, "language_loss": 0.71668589, "learning_rate": 5.081496320577816e-08, "loss": 0.74198109, "num_input_tokens_seen": 167216500, "step": 7736, "time_per_iteration": 3.6409051418304443 }, { "auxiliary_loss_clip": 0.01284301, "auxiliary_loss_mlp": 0.01192267, "balance_loss_clip": 1.00770521, "balance_loss_mlp": 1.00000656, "epoch": 0.9303192448746468, "flos": 58896151318080.0, "grad_norm": 0.9174891135773311, "language_loss": 0.61232948, "learning_rate": 5.0640632842260835e-08, "loss": 0.63709521, "num_input_tokens_seen": 167276760, "step": 7737, "time_per_iteration": 4.308693885803223 }, { "auxiliary_loss_clip": 0.01279731, "auxiliary_loss_mlp": 0.0087236, "balance_loss_clip": 1.00642478, "balance_loss_mlp": 1.00037837, "epoch": 0.9304394877652858, "flos": 57663542826720.0, "grad_norm": 1.412457527040736, "language_loss": 0.72635347, "learning_rate": 5.0466598195009426e-08, "loss": 0.74787438, "num_input_tokens_seen": 167303630, "step": 7738, "time_per_iteration": 3.190739870071411 }, { "auxiliary_loss_clip": 0.01276482, "auxiliary_loss_mlp": 0.01193056, "balance_loss_clip": 1.00708079, "balance_loss_mlp": 1.00012779, "epoch": 0.930559730655925, "flos": 20996154207360.0, "grad_norm": 1.7500637536339, "language_loss": 0.70048338, "learning_rate": 5.0292859290425036e-08, "loss": 0.72517878, "num_input_tokens_seen": 167321500, "step": 7739, "time_per_iteration": 2.788811683654785 }, { "auxiliary_loss_clip": 0.01347412, "auxiliary_loss_mlp": 0.01193049, "balance_loss_clip": 1.00775862, "balance_loss_mlp": 1.0001204, "epoch": 0.9306799735465641, "flos": 23258275616160.0, "grad_norm": 1.8199472121920193, "language_loss": 0.77801633, "learning_rate": 5.011941615486348e-08, "loss": 0.8034209, "num_input_tokens_seen": 167340615, "step": 7740, "time_per_iteration": 2.7011971473693848 }, { "auxiliary_loss_clip": 0.01348368, "auxiliary_loss_mlp": 0.01192967, "balance_loss_clip": 1.00794351, "balance_loss_mlp": 1.00013423, "epoch": 0.9308002164372031, "flos": 15231556592640.0, "grad_norm": 1.9678377421479938, "language_loss": 0.84131682, "learning_rate": 4.994626881463659e-08, "loss": 0.86673021, "num_input_tokens_seen": 167356870, "step": 7741, "time_per_iteration": 2.670201301574707 }, { "auxiliary_loss_clip": 0.01279301, "auxiliary_loss_mlp": 0.01193002, "balance_loss_clip": 1.00693011, "balance_loss_mlp": 1.0001694, "epoch": 0.9309204593278423, "flos": 30847494955680.0, "grad_norm": 1.613476439944527, "language_loss": 0.70692772, "learning_rate": 4.9773417296009814e-08, "loss": 0.73165077, "num_input_tokens_seen": 167378390, "step": 7742, "time_per_iteration": 2.9314844608306885 }, { "auxiliary_loss_clip": 0.01336345, "auxiliary_loss_mlp": 0.01193133, "balance_loss_clip": 1.00820506, "balance_loss_mlp": 1.00020444, "epoch": 0.9310407022184813, "flos": 23037280544160.0, "grad_norm": 1.681095026730521, "language_loss": 0.65161532, "learning_rate": 4.960086162520527e-08, "loss": 0.6769101, "num_input_tokens_seen": 167398480, "step": 7743, "time_per_iteration": 2.7470054626464844 }, { "auxiliary_loss_clip": 0.0126219, "auxiliary_loss_mlp": 0.01193147, "balance_loss_clip": 1.0070473, "balance_loss_mlp": 1.0001229, "epoch": 0.9311609451091204, "flos": 22127987270880.0, "grad_norm": 1.8618795930226146, "language_loss": 0.82115966, "learning_rate": 4.942860182839936e-08, "loss": 0.84571308, "num_input_tokens_seen": 167416825, "step": 7744, "time_per_iteration": 2.850421190261841 }, { "auxiliary_loss_clip": 0.01315377, "auxiliary_loss_mlp": 0.01193215, "balance_loss_clip": 1.00752449, "balance_loss_mlp": 1.00019169, "epoch": 0.9312811879997596, "flos": 21099215541600.0, "grad_norm": 1.6624972360924268, "language_loss": 0.79423976, "learning_rate": 4.925663793172341e-08, "loss": 0.81932569, "num_input_tokens_seen": 167434785, "step": 7745, "time_per_iteration": 2.7679600715637207 }, { "auxiliary_loss_clip": 0.01273929, "auxiliary_loss_mlp": 0.00871848, "balance_loss_clip": 1.00318193, "balance_loss_mlp": 1.00008678, "epoch": 0.9314014308903986, "flos": 67148212178880.0, "grad_norm": 0.7850056508543384, "language_loss": 0.56541401, "learning_rate": 4.908496996126477e-08, "loss": 0.5868718, "num_input_tokens_seen": 167498245, "step": 7746, "time_per_iteration": 3.387826919555664 }, { "auxiliary_loss_clip": 0.01324822, "auxiliary_loss_mlp": 0.01193174, "balance_loss_clip": 1.00821376, "balance_loss_mlp": 1.00014997, "epoch": 0.9315216737810377, "flos": 22565594725920.0, "grad_norm": 1.5170914808815292, "language_loss": 0.76239759, "learning_rate": 4.89135979430646e-08, "loss": 0.78757757, "num_input_tokens_seen": 167518290, "step": 7747, "time_per_iteration": 2.7060701847076416 }, { "auxiliary_loss_clip": 0.01348295, "auxiliary_loss_mlp": 0.01193201, "balance_loss_clip": 1.0086031, "balance_loss_mlp": 1.00017762, "epoch": 0.9316419166716768, "flos": 23984064930240.0, "grad_norm": 1.9121132956580675, "language_loss": 0.85621536, "learning_rate": 4.874252190312078e-08, "loss": 0.8816303, "num_input_tokens_seen": 167538675, "step": 7748, "time_per_iteration": 2.7661306858062744 }, { "auxiliary_loss_clip": 0.01319438, "auxiliary_loss_mlp": 0.01193081, "balance_loss_clip": 1.00793338, "balance_loss_mlp": 1.00015235, "epoch": 0.9317621595623159, "flos": 30230478840960.0, "grad_norm": 4.177909665305289, "language_loss": 0.64782715, "learning_rate": 4.857174186738477e-08, "loss": 0.67295235, "num_input_tokens_seen": 167562025, "step": 7749, "time_per_iteration": 2.9235610961914062 }, { "auxiliary_loss_clip": 0.01348301, "auxiliary_loss_mlp": 0.01193168, "balance_loss_clip": 1.00854754, "balance_loss_mlp": 1.0001446, "epoch": 0.931882402452955, "flos": 15742745249760.0, "grad_norm": 1.9878914072465392, "language_loss": 0.73261797, "learning_rate": 4.840125786176408e-08, "loss": 0.75803262, "num_input_tokens_seen": 167578230, "step": 7750, "time_per_iteration": 2.7226598262786865 }, { "auxiliary_loss_clip": 0.01322006, "auxiliary_loss_mlp": 0.01193199, "balance_loss_clip": 1.00830424, "balance_loss_mlp": 1.00017524, "epoch": 0.932002645343594, "flos": 28366533895680.0, "grad_norm": 1.849595960328552, "language_loss": 0.77175438, "learning_rate": 4.823106991212067e-08, "loss": 0.79690641, "num_input_tokens_seen": 167597470, "step": 7751, "time_per_iteration": 2.8255372047424316 }, { "auxiliary_loss_clip": 0.01334729, "auxiliary_loss_mlp": 0.01192975, "balance_loss_clip": 1.00734186, "balance_loss_mlp": 1.00014257, "epoch": 0.9321228882342332, "flos": 15341156036640.0, "grad_norm": 1.8066630033590578, "language_loss": 0.83357704, "learning_rate": 4.806117804427212e-08, "loss": 0.85885406, "num_input_tokens_seen": 167615405, "step": 7752, "time_per_iteration": 2.771345853805542 }, { "auxiliary_loss_clip": 0.01335736, "auxiliary_loss_mlp": 0.01193089, "balance_loss_clip": 1.00824356, "balance_loss_mlp": 1.00016069, "epoch": 0.9322431311248722, "flos": 17895374985600.0, "grad_norm": 1.7311835284926869, "language_loss": 0.64370573, "learning_rate": 4.7891582283990926e-08, "loss": 0.66899395, "num_input_tokens_seen": 167634130, "step": 7753, "time_per_iteration": 2.802553653717041 }, { "auxiliary_loss_clip": 0.01286097, "auxiliary_loss_mlp": 0.01193099, "balance_loss_clip": 1.00764012, "balance_loss_mlp": 1.00017023, "epoch": 0.9323633740155113, "flos": 24169724310240.0, "grad_norm": 1.4931576570688847, "language_loss": 0.72580224, "learning_rate": 4.772228265700473e-08, "loss": 0.7505942, "num_input_tokens_seen": 167654990, "step": 7754, "time_per_iteration": 2.8571414947509766 }, { "auxiliary_loss_clip": 0.01335273, "auxiliary_loss_mlp": 0.01193164, "balance_loss_clip": 1.0080905, "balance_loss_mlp": 1.00014055, "epoch": 0.9324836169061504, "flos": 15043490326080.0, "grad_norm": 2.2530737437003907, "language_loss": 0.75703037, "learning_rate": 4.75532791889961e-08, "loss": 0.78231472, "num_input_tokens_seen": 167671690, "step": 7755, "time_per_iteration": 2.8177714347839355 }, { "auxiliary_loss_clip": 0.01336219, "auxiliary_loss_mlp": 0.01193183, "balance_loss_clip": 1.00772333, "balance_loss_mlp": 1.00015926, "epoch": 0.9326038597967895, "flos": 18624900362400.0, "grad_norm": 8.266982931369139, "language_loss": 0.6555016, "learning_rate": 4.738457190560252e-08, "loss": 0.68079561, "num_input_tokens_seen": 167690800, "step": 7756, "time_per_iteration": 2.7220237255096436 }, { "auxiliary_loss_clip": 0.01263096, "auxiliary_loss_mlp": 0.01193139, "balance_loss_clip": 1.00579381, "balance_loss_mlp": 1.00021076, "epoch": 0.9327241026874286, "flos": 18952656907680.0, "grad_norm": 1.9640290986383782, "language_loss": 0.78716153, "learning_rate": 4.721616083241664e-08, "loss": 0.81172383, "num_input_tokens_seen": 167709055, "step": 7757, "time_per_iteration": 2.8955438137054443 }, { "auxiliary_loss_clip": 0.01336025, "auxiliary_loss_mlp": 0.01193094, "balance_loss_clip": 1.00825214, "balance_loss_mlp": 1.00016546, "epoch": 0.9328443455780677, "flos": 29570295672000.0, "grad_norm": 1.6071990309534436, "language_loss": 0.77419543, "learning_rate": 4.7048045994986684e-08, "loss": 0.79948658, "num_input_tokens_seen": 167729915, "step": 7758, "time_per_iteration": 2.83378529548645 }, { "auxiliary_loss_clip": 0.01320951, "auxiliary_loss_mlp": 0.01193176, "balance_loss_clip": 1.00846767, "balance_loss_mlp": 1.00015211, "epoch": 0.9329645884687068, "flos": 30081735794880.0, "grad_norm": 2.0652556673075178, "language_loss": 0.91090035, "learning_rate": 4.688022741881559e-08, "loss": 0.93604153, "num_input_tokens_seen": 167750440, "step": 7759, "time_per_iteration": 2.827009439468384 }, { "auxiliary_loss_clip": 0.01324303, "auxiliary_loss_mlp": 0.01192969, "balance_loss_clip": 1.00694168, "balance_loss_mlp": 1.00013614, "epoch": 0.9330848313593458, "flos": 21867992291520.0, "grad_norm": 1.989831332688479, "language_loss": 0.75197113, "learning_rate": 4.671270512936076e-08, "loss": 0.7771439, "num_input_tokens_seen": 167769600, "step": 7760, "time_per_iteration": 3.7315754890441895 }, { "auxiliary_loss_clip": 0.01295343, "auxiliary_loss_mlp": 0.01192941, "balance_loss_clip": 1.00668669, "balance_loss_mlp": 1.000108, "epoch": 0.933205074249985, "flos": 22127232873600.0, "grad_norm": 1.8662403176117146, "language_loss": 0.82927936, "learning_rate": 4.6545479152035884e-08, "loss": 0.85416222, "num_input_tokens_seen": 167788770, "step": 7761, "time_per_iteration": 3.7108027935028076 }, { "auxiliary_loss_clip": 0.01329104, "auxiliary_loss_mlp": 0.01193078, "balance_loss_clip": 1.00759411, "balance_loss_mlp": 1.00014973, "epoch": 0.9333253171406241, "flos": 15341263807680.0, "grad_norm": 1.9379020268158218, "language_loss": 0.7644285, "learning_rate": 4.637854951220821e-08, "loss": 0.78965026, "num_input_tokens_seen": 167805555, "step": 7762, "time_per_iteration": 3.724290609359741 }, { "auxiliary_loss_clip": 0.01288976, "auxiliary_loss_mlp": 0.011932, "balance_loss_clip": 1.00691724, "balance_loss_mlp": 1.00017607, "epoch": 0.9334455600312631, "flos": 15706152228960.0, "grad_norm": 1.890116449259111, "language_loss": 0.75296009, "learning_rate": 4.621191623520171e-08, "loss": 0.77778178, "num_input_tokens_seen": 167823985, "step": 7763, "time_per_iteration": 3.807389259338379 }, { "auxiliary_loss_clip": 0.01252586, "auxiliary_loss_mlp": 0.01193081, "balance_loss_clip": 1.0065521, "balance_loss_mlp": 1.00015283, "epoch": 0.9335658029219023, "flos": 22163574428640.0, "grad_norm": 2.2168803576447855, "language_loss": 0.84365714, "learning_rate": 4.604557934629372e-08, "loss": 0.86811376, "num_input_tokens_seen": 167843060, "step": 7764, "time_per_iteration": 2.9064412117004395 }, { "auxiliary_loss_clip": 0.01298607, "auxiliary_loss_mlp": 0.01192969, "balance_loss_clip": 1.00673258, "balance_loss_mlp": 1.00013614, "epoch": 0.9336860458125413, "flos": 20266844372640.0, "grad_norm": 1.7410780916153583, "language_loss": 0.80231667, "learning_rate": 4.587953887071805e-08, "loss": 0.82723248, "num_input_tokens_seen": 167862880, "step": 7765, "time_per_iteration": 2.786167860031128 }, { "auxiliary_loss_clip": 0.01313662, "auxiliary_loss_mlp": 0.01192972, "balance_loss_clip": 1.00708747, "balance_loss_mlp": 1.00013888, "epoch": 0.9338062887031804, "flos": 20919699110880.0, "grad_norm": 1.7581744455200858, "language_loss": 0.85948694, "learning_rate": 4.5713794833662554e-08, "loss": 0.88455325, "num_input_tokens_seen": 167882095, "step": 7766, "time_per_iteration": 2.772237777709961 }, { "auxiliary_loss_clip": 0.01347676, "auxiliary_loss_mlp": 0.01193216, "balance_loss_clip": 1.00775278, "balance_loss_mlp": 1.00019193, "epoch": 0.9339265315938196, "flos": 23221646671680.0, "grad_norm": 1.8011566485554362, "language_loss": 0.63423884, "learning_rate": 4.5548347260270236e-08, "loss": 0.65964776, "num_input_tokens_seen": 167901385, "step": 7767, "time_per_iteration": 2.714564323425293 }, { "auxiliary_loss_clip": 0.01288304, "auxiliary_loss_mlp": 0.01193146, "balance_loss_clip": 1.00743377, "balance_loss_mlp": 1.00012243, "epoch": 0.9340467744844586, "flos": 22820273000640.0, "grad_norm": 1.6015888287442885, "language_loss": 0.69099134, "learning_rate": 4.538319617564012e-08, "loss": 0.71580583, "num_input_tokens_seen": 167920405, "step": 7768, "time_per_iteration": 2.8417413234710693 }, { "auxiliary_loss_clip": 0.01314375, "auxiliary_loss_mlp": 0.01193086, "balance_loss_clip": 1.00723124, "balance_loss_mlp": 1.00015759, "epoch": 0.9341670173750977, "flos": 23660439608160.0, "grad_norm": 1.8767267786517805, "language_loss": 0.74378216, "learning_rate": 4.521834160482485e-08, "loss": 0.76885676, "num_input_tokens_seen": 167939145, "step": 7769, "time_per_iteration": 2.8722856044769287 }, { "auxiliary_loss_clip": 0.01334717, "auxiliary_loss_mlp": 0.01193162, "balance_loss_clip": 1.00731516, "balance_loss_mlp": 1.00013876, "epoch": 0.9342872602657368, "flos": 24824267461440.0, "grad_norm": 1.483972239004575, "language_loss": 0.81961846, "learning_rate": 4.5053783572832846e-08, "loss": 0.84489727, "num_input_tokens_seen": 167959325, "step": 7770, "time_per_iteration": 2.8808772563934326 }, { "auxiliary_loss_clip": 0.01326085, "auxiliary_loss_mlp": 0.01193092, "balance_loss_clip": 1.00704479, "balance_loss_mlp": 1.00016356, "epoch": 0.9344075031563759, "flos": 25771842168480.0, "grad_norm": 1.6480968074718714, "language_loss": 0.76142156, "learning_rate": 4.488952210462771e-08, "loss": 0.78661335, "num_input_tokens_seen": 167979530, "step": 7771, "time_per_iteration": 2.810133218765259 }, { "auxiliary_loss_clip": 0.01347452, "auxiliary_loss_mlp": 0.01192977, "balance_loss_clip": 1.0079931, "balance_loss_mlp": 1.00014448, "epoch": 0.9345277460470149, "flos": 25551314104320.0, "grad_norm": 1.814558156102484, "language_loss": 0.85326958, "learning_rate": 4.4725557225127495e-08, "loss": 0.87867391, "num_input_tokens_seen": 167997870, "step": 7772, "time_per_iteration": 2.8028223514556885 }, { "auxiliary_loss_clip": 0.01333784, "auxiliary_loss_mlp": 0.01192967, "balance_loss_clip": 1.00746083, "balance_loss_mlp": 1.00013423, "epoch": 0.9346479889376541, "flos": 34313126675040.0, "grad_norm": 1.5706660641577586, "language_loss": 0.79423928, "learning_rate": 4.456188895920565e-08, "loss": 0.81950688, "num_input_tokens_seen": 168019625, "step": 7773, "time_per_iteration": 2.8867290019989014 }, { "auxiliary_loss_clip": 0.01347623, "auxiliary_loss_mlp": 0.01193178, "balance_loss_clip": 1.00797153, "balance_loss_mlp": 1.00015378, "epoch": 0.9347682318282932, "flos": 19093748209920.0, "grad_norm": 1.8455061673555937, "language_loss": 0.85010564, "learning_rate": 4.439851733169031e-08, "loss": 0.87551367, "num_input_tokens_seen": 168037415, "step": 7774, "time_per_iteration": 2.768423318862915 }, { "auxiliary_loss_clip": 0.01289292, "auxiliary_loss_mlp": 0.01193103, "balance_loss_clip": 1.006405, "balance_loss_mlp": 1.0001744, "epoch": 0.9348884747189322, "flos": 26249599088640.0, "grad_norm": 2.6789281432674827, "language_loss": 0.69572699, "learning_rate": 4.4235442367365204e-08, "loss": 0.72055095, "num_input_tokens_seen": 168057725, "step": 7775, "time_per_iteration": 2.8862993717193604 }, { "auxiliary_loss_clip": 0.01324481, "auxiliary_loss_mlp": 0.01192998, "balance_loss_clip": 1.00762987, "balance_loss_mlp": 1.00016463, "epoch": 0.9350087176095714, "flos": 18333090211680.0, "grad_norm": 2.0555753629474176, "language_loss": 0.79514706, "learning_rate": 4.4072664090968545e-08, "loss": 0.8203218, "num_input_tokens_seen": 168076110, "step": 7776, "time_per_iteration": 2.7985339164733887 }, { "auxiliary_loss_clip": 0.01323742, "auxiliary_loss_mlp": 0.01193175, "balance_loss_clip": 1.00760889, "balance_loss_mlp": 1.00015116, "epoch": 0.9351289605002104, "flos": 19318263802560.0, "grad_norm": 1.809208087579917, "language_loss": 0.84670234, "learning_rate": 4.391018252719347e-08, "loss": 0.87187159, "num_input_tokens_seen": 168095905, "step": 7777, "time_per_iteration": 2.833966016769409 }, { "auxiliary_loss_clip": 0.01324189, "auxiliary_loss_mlp": 0.01193168, "balance_loss_clip": 1.00816774, "balance_loss_mlp": 1.0001446, "epoch": 0.9352492033908495, "flos": 18799998180480.0, "grad_norm": 1.728103578352884, "language_loss": 0.69405055, "learning_rate": 4.374799770068849e-08, "loss": 0.7192241, "num_input_tokens_seen": 168112580, "step": 7778, "time_per_iteration": 2.7439420223236084 }, { "auxiliary_loss_clip": 0.01324473, "auxiliary_loss_mlp": 0.01193024, "balance_loss_clip": 1.00772452, "balance_loss_mlp": 1.00019073, "epoch": 0.9353694462814887, "flos": 29530146206880.0, "grad_norm": 2.1369214609424914, "language_loss": 0.74847031, "learning_rate": 4.358610963605658e-08, "loss": 0.77364528, "num_input_tokens_seen": 168133030, "step": 7779, "time_per_iteration": 2.7765817642211914 }, { "auxiliary_loss_clip": 0.01348723, "auxiliary_loss_mlp": 0.01193203, "balance_loss_clip": 1.00820494, "balance_loss_mlp": 1.00017965, "epoch": 0.9354896891721277, "flos": 30665463867360.0, "grad_norm": 2.280532586265116, "language_loss": 0.68230855, "learning_rate": 4.342451835785677e-08, "loss": 0.70772779, "num_input_tokens_seen": 168153940, "step": 7780, "time_per_iteration": 2.7666165828704834 }, { "auxiliary_loss_clip": 0.01315217, "auxiliary_loss_mlp": 0.01193071, "balance_loss_clip": 1.00735331, "balance_loss_mlp": 1.00014234, "epoch": 0.9356099320627668, "flos": 19463917412160.0, "grad_norm": 1.442821713696341, "language_loss": 0.74902999, "learning_rate": 4.3263223890601665e-08, "loss": 0.77411288, "num_input_tokens_seen": 168172650, "step": 7781, "time_per_iteration": 2.8512303829193115 }, { "auxiliary_loss_clip": 0.01321975, "auxiliary_loss_mlp": 0.00872442, "balance_loss_clip": 1.00793719, "balance_loss_mlp": 1.00041175, "epoch": 0.9357301749534058, "flos": 19098166822560.0, "grad_norm": 1.6494725644911268, "language_loss": 0.79169649, "learning_rate": 4.31022262587597e-08, "loss": 0.81364065, "num_input_tokens_seen": 168191325, "step": 7782, "time_per_iteration": 2.7602384090423584 }, { "auxiliary_loss_clip": 0.01324347, "auxiliary_loss_mlp": 0.01193171, "balance_loss_clip": 1.0074513, "balance_loss_mlp": 1.00014687, "epoch": 0.935850417844045, "flos": 23550373156320.0, "grad_norm": 1.5131210618727633, "language_loss": 0.66065264, "learning_rate": 4.2941525486754225e-08, "loss": 0.68582785, "num_input_tokens_seen": 168211645, "step": 7783, "time_per_iteration": 2.7318553924560547 }, { "auxiliary_loss_clip": 0.01285293, "auxiliary_loss_mlp": 0.01193172, "balance_loss_clip": 1.00696099, "balance_loss_mlp": 1.00014806, "epoch": 0.935970660734684, "flos": 18588343265280.0, "grad_norm": 1.73322344424154, "language_loss": 0.79407454, "learning_rate": 4.278112159896286e-08, "loss": 0.81885922, "num_input_tokens_seen": 168229485, "step": 7784, "time_per_iteration": 2.7804229259490967 }, { "auxiliary_loss_clip": 0.01322432, "auxiliary_loss_mlp": 0.01192794, "balance_loss_clip": 1.00738847, "balance_loss_mlp": 1.0001514, "epoch": 0.9360909036253231, "flos": 20631265786080.0, "grad_norm": 1.577404540043614, "language_loss": 0.67692167, "learning_rate": 4.2621014619719896e-08, "loss": 0.70207393, "num_input_tokens_seen": 168247250, "step": 7785, "time_per_iteration": 2.8060755729675293 }, { "auxiliary_loss_clip": 0.01282135, "auxiliary_loss_mlp": 0.01192279, "balance_loss_clip": 1.00308502, "balance_loss_mlp": 1.0000186, "epoch": 0.9362111465159623, "flos": 61791454497600.0, "grad_norm": 0.8158236295915164, "language_loss": 0.58633268, "learning_rate": 4.246120457331215e-08, "loss": 0.61107683, "num_input_tokens_seen": 168309425, "step": 7786, "time_per_iteration": 4.229833126068115 }, { "auxiliary_loss_clip": 0.01310512, "auxiliary_loss_mlp": 0.01193093, "balance_loss_clip": 1.00799978, "balance_loss_mlp": 1.00016451, "epoch": 0.9363313894066013, "flos": 24170406860160.0, "grad_norm": 1.7481377299933836, "language_loss": 0.71909928, "learning_rate": 4.2301691483983325e-08, "loss": 0.74413538, "num_input_tokens_seen": 168329545, "step": 7787, "time_per_iteration": 3.744265079498291 }, { "auxiliary_loss_clip": 0.01330916, "auxiliary_loss_mlp": 0.01192973, "balance_loss_clip": 1.0073514, "balance_loss_mlp": 1.00013971, "epoch": 0.9364516322972404, "flos": 20120364518400.0, "grad_norm": 8.07624641787012, "language_loss": 0.76024771, "learning_rate": 4.214247537593163e-08, "loss": 0.78548658, "num_input_tokens_seen": 168348795, "step": 7788, "time_per_iteration": 3.7568471431732178 }, { "auxiliary_loss_clip": 0.0132367, "auxiliary_loss_mlp": 0.01193083, "balance_loss_clip": 1.00760055, "balance_loss_mlp": 1.00015473, "epoch": 0.9365718751878795, "flos": 20703769277760.0, "grad_norm": 1.7522150945574866, "language_loss": 0.80222154, "learning_rate": 4.1983556273309293e-08, "loss": 0.827389, "num_input_tokens_seen": 168367545, "step": 7789, "time_per_iteration": 3.7291338443756104 }, { "auxiliary_loss_clip": 0.01348734, "auxiliary_loss_mlp": 0.0119322, "balance_loss_clip": 1.00811529, "balance_loss_mlp": 1.0001967, "epoch": 0.9366921180785186, "flos": 18655278586560.0, "grad_norm": 2.974125382378927, "language_loss": 0.68842721, "learning_rate": 4.182493420022526e-08, "loss": 0.7138468, "num_input_tokens_seen": 168383215, "step": 7790, "time_per_iteration": 2.8027937412261963 }, { "auxiliary_loss_clip": 0.01283132, "auxiliary_loss_mlp": 0.01193088, "balance_loss_clip": 1.00797772, "balance_loss_mlp": 1.0001595, "epoch": 0.9368123609691577, "flos": 25774967528640.0, "grad_norm": 1.6624899633604933, "language_loss": 0.7853452, "learning_rate": 4.166660918074139e-08, "loss": 0.81010747, "num_input_tokens_seen": 168403120, "step": 7791, "time_per_iteration": 2.92690110206604 }, { "auxiliary_loss_clip": 0.01292878, "auxiliary_loss_mlp": 0.01193084, "balance_loss_clip": 1.00704527, "balance_loss_mlp": 1.00015569, "epoch": 0.9369326038597968, "flos": 25553397677760.0, "grad_norm": 1.5783939634184354, "language_loss": 0.73353219, "learning_rate": 4.15085812388758e-08, "loss": 0.75839174, "num_input_tokens_seen": 168425340, "step": 7792, "time_per_iteration": 2.904799699783325 }, { "auxiliary_loss_clip": 0.01310869, "auxiliary_loss_mlp": 0.01193167, "balance_loss_clip": 1.00751579, "balance_loss_mlp": 1.00014353, "epoch": 0.9370528467504359, "flos": 23220030106080.0, "grad_norm": 1.7030370160876167, "language_loss": 0.78594577, "learning_rate": 4.135085039860153e-08, "loss": 0.8109861, "num_input_tokens_seen": 168444740, "step": 7793, "time_per_iteration": 2.777268409729004 }, { "auxiliary_loss_clip": 0.01301041, "auxiliary_loss_mlp": 0.011932, "balance_loss_clip": 1.00718069, "balance_loss_mlp": 1.00017643, "epoch": 0.9371730896410749, "flos": 24967478260800.0, "grad_norm": 2.0837264314992106, "language_loss": 0.78602135, "learning_rate": 4.1193416683845906e-08, "loss": 0.81096381, "num_input_tokens_seen": 168463670, "step": 7794, "time_per_iteration": 2.855586528778076 }, { "auxiliary_loss_clip": 0.01294101, "auxiliary_loss_mlp": 0.01193174, "balance_loss_clip": 1.00700426, "balance_loss_mlp": 1.00015068, "epoch": 0.9372933325317141, "flos": 15553098341280.0, "grad_norm": 2.541229067381737, "language_loss": 0.83376914, "learning_rate": 4.103628011849136e-08, "loss": 0.85864198, "num_input_tokens_seen": 168479030, "step": 7795, "time_per_iteration": 2.8638341426849365 }, { "auxiliary_loss_clip": 0.01306009, "auxiliary_loss_mlp": 0.01193186, "balance_loss_clip": 1.00706744, "balance_loss_mlp": 1.00016284, "epoch": 0.9374135754223532, "flos": 21871872048960.0, "grad_norm": 1.8400834382318234, "language_loss": 0.75632519, "learning_rate": 4.0879440726375506e-08, "loss": 0.78131711, "num_input_tokens_seen": 168496815, "step": 7796, "time_per_iteration": 2.8257575035095215 }, { "auxiliary_loss_clip": 0.01324078, "auxiliary_loss_mlp": 0.0119309, "balance_loss_clip": 1.00813246, "balance_loss_mlp": 1.00016189, "epoch": 0.9375338183129922, "flos": 22631057176320.0, "grad_norm": 2.433090264223968, "language_loss": 0.56072664, "learning_rate": 4.0722898531291074e-08, "loss": 0.58589828, "num_input_tokens_seen": 168514055, "step": 7797, "time_per_iteration": 2.814572334289551 }, { "auxiliary_loss_clip": 0.01313632, "auxiliary_loss_mlp": 0.01193114, "balance_loss_clip": 1.00693178, "balance_loss_mlp": 1.00018573, "epoch": 0.9376540612036314, "flos": 26104304715840.0, "grad_norm": 1.609493388932899, "language_loss": 0.76598608, "learning_rate": 4.0566653556985295e-08, "loss": 0.79105353, "num_input_tokens_seen": 168534600, "step": 7798, "time_per_iteration": 2.7765073776245117 }, { "auxiliary_loss_clip": 0.01233869, "auxiliary_loss_mlp": 0.01193316, "balance_loss_clip": 1.00618911, "balance_loss_mlp": 1.00019729, "epoch": 0.9377743040942704, "flos": 19717589823840.0, "grad_norm": 2.2066659623702765, "language_loss": 0.81677771, "learning_rate": 4.0410705827159886e-08, "loss": 0.84104967, "num_input_tokens_seen": 168551895, "step": 7799, "time_per_iteration": 2.898900032043457 }, { "auxiliary_loss_clip": 0.01323978, "auxiliary_loss_mlp": 0.01193082, "balance_loss_clip": 1.00744426, "balance_loss_mlp": 1.00015366, "epoch": 0.9378945469849095, "flos": 15267538910880.0, "grad_norm": 1.8655824061967412, "language_loss": 0.70692527, "learning_rate": 4.0255055365472356e-08, "loss": 0.73209584, "num_input_tokens_seen": 168569990, "step": 7800, "time_per_iteration": 2.8288376331329346 }, { "auxiliary_loss_clip": 0.01273709, "auxiliary_loss_mlp": 0.01193182, "balance_loss_clip": 1.00654125, "balance_loss_mlp": 1.00015855, "epoch": 0.9380147898755486, "flos": 20591403710400.0, "grad_norm": 2.0752541876350152, "language_loss": 0.74720132, "learning_rate": 4.009970219553471e-08, "loss": 0.77187014, "num_input_tokens_seen": 168586940, "step": 7801, "time_per_iteration": 2.833428144454956 }, { "auxiliary_loss_clip": 0.01327561, "auxiliary_loss_mlp": 0.01193201, "balance_loss_clip": 1.00725174, "balance_loss_mlp": 1.00017691, "epoch": 0.9381350327661877, "flos": 26281126870560.0, "grad_norm": 3.046484036056613, "language_loss": 0.75995445, "learning_rate": 3.99446463409141e-08, "loss": 0.78516215, "num_input_tokens_seen": 168604795, "step": 7802, "time_per_iteration": 2.8042995929718018 }, { "auxiliary_loss_clip": 0.01336411, "auxiliary_loss_mlp": 0.01193187, "balance_loss_clip": 1.00763011, "balance_loss_mlp": 1.00016308, "epoch": 0.9382552756568268, "flos": 23586355474560.0, "grad_norm": 2.0625385284042377, "language_loss": 0.6887179, "learning_rate": 3.978988782513215e-08, "loss": 0.71401387, "num_input_tokens_seen": 168622290, "step": 7803, "time_per_iteration": 2.7724759578704834 }, { "auxiliary_loss_clip": 0.01333858, "auxiliary_loss_mlp": 0.011931, "balance_loss_clip": 1.00714624, "balance_loss_mlp": 1.0001719, "epoch": 0.9383755185474659, "flos": 28438821845280.0, "grad_norm": 1.5690256731470544, "language_loss": 0.76450455, "learning_rate": 3.963542667166586e-08, "loss": 0.78977406, "num_input_tokens_seen": 168642395, "step": 7804, "time_per_iteration": 2.840113878250122 }, { "auxiliary_loss_clip": 0.01273368, "auxiliary_loss_mlp": 0.01193122, "balance_loss_clip": 1.00774074, "balance_loss_mlp": 1.0001936, "epoch": 0.938495761438105, "flos": 20449593934560.0, "grad_norm": 1.798516036479219, "language_loss": 0.68433237, "learning_rate": 3.9481262903946486e-08, "loss": 0.70899731, "num_input_tokens_seen": 168661840, "step": 7805, "time_per_iteration": 2.7260982990264893 }, { "auxiliary_loss_clip": 0.01258511, "auxiliary_loss_mlp": 0.01192267, "balance_loss_clip": 1.0032506, "balance_loss_mlp": 1.00000656, "epoch": 0.938616004328744, "flos": 69302745869760.0, "grad_norm": 0.76159841318484, "language_loss": 0.54474688, "learning_rate": 3.932739654536066e-08, "loss": 0.56925464, "num_input_tokens_seen": 168724540, "step": 7806, "time_per_iteration": 3.361557960510254 }, { "auxiliary_loss_clip": 0.01325429, "auxiliary_loss_mlp": 0.01192897, "balance_loss_clip": 1.00697148, "balance_loss_mlp": 1.00015974, "epoch": 0.9387362472193832, "flos": 18911645274240.0, "grad_norm": 1.980634507206391, "language_loss": 0.7417233, "learning_rate": 3.917382761925014e-08, "loss": 0.76690662, "num_input_tokens_seen": 168740375, "step": 7807, "time_per_iteration": 2.7168397903442383 }, { "auxiliary_loss_clip": 0.01324507, "auxiliary_loss_mlp": 0.01193057, "balance_loss_clip": 1.00709152, "balance_loss_mlp": 1.00012851, "epoch": 0.9388564901100223, "flos": 26501978247840.0, "grad_norm": 1.6791611901995527, "language_loss": 0.79187864, "learning_rate": 3.9020556148910754e-08, "loss": 0.81705427, "num_input_tokens_seen": 168759730, "step": 7808, "time_per_iteration": 2.7654409408569336 }, { "auxiliary_loss_clip": 0.01288773, "auxiliary_loss_mlp": 0.01192291, "balance_loss_clip": 1.00332689, "balance_loss_mlp": 1.00002992, "epoch": 0.9389767330006613, "flos": 58941114556320.0, "grad_norm": 0.709163909585703, "language_loss": 0.56690323, "learning_rate": 3.8867582157593895e-08, "loss": 0.59171385, "num_input_tokens_seen": 168813935, "step": 7809, "time_per_iteration": 3.1768715381622314 }, { "auxiliary_loss_clip": 0.01324937, "auxiliary_loss_mlp": 0.01193186, "balance_loss_clip": 1.00728226, "balance_loss_mlp": 1.00016248, "epoch": 0.9390969758913005, "flos": 31102568390880.0, "grad_norm": 2.084711972286713, "language_loss": 0.76343089, "learning_rate": 3.871490566850544e-08, "loss": 0.78861213, "num_input_tokens_seen": 168838145, "step": 7810, "time_per_iteration": 2.834681749343872 }, { "auxiliary_loss_clip": 0.01312343, "auxiliary_loss_mlp": 0.011931, "balance_loss_clip": 1.00731087, "balance_loss_mlp": 1.00017214, "epoch": 0.9392172187819395, "flos": 22419402261120.0, "grad_norm": 1.6363154123561314, "language_loss": 0.7069453, "learning_rate": 3.856252670480642e-08, "loss": 0.73199975, "num_input_tokens_seen": 168856805, "step": 7811, "time_per_iteration": 3.6878435611724854 }, { "auxiliary_loss_clip": 0.01323554, "auxiliary_loss_mlp": 0.01193076, "balance_loss_clip": 1.00757837, "balance_loss_mlp": 1.00014806, "epoch": 0.9393374616725786, "flos": 19719493778880.0, "grad_norm": 1.8174118649648727, "language_loss": 0.81169736, "learning_rate": 3.841044528961279e-08, "loss": 0.83686364, "num_input_tokens_seen": 168874600, "step": 7812, "time_per_iteration": 2.7736289501190186 }, { "auxiliary_loss_clip": 0.01348918, "auxiliary_loss_mlp": 0.01193062, "balance_loss_clip": 1.00796175, "balance_loss_mlp": 1.00013375, "epoch": 0.9394577045632178, "flos": 24170227241760.0, "grad_norm": 1.974969221830498, "language_loss": 0.78505051, "learning_rate": 3.825866144599477e-08, "loss": 0.81047028, "num_input_tokens_seen": 168893655, "step": 7813, "time_per_iteration": 3.7570483684539795 }, { "auxiliary_loss_clip": 0.01310986, "auxiliary_loss_mlp": 0.01193081, "balance_loss_clip": 1.0069232, "balance_loss_mlp": 1.00015235, "epoch": 0.9395779474538568, "flos": 19023938994240.0, "grad_norm": 1.9740839428545323, "language_loss": 0.75367773, "learning_rate": 3.8107175196978145e-08, "loss": 0.77871847, "num_input_tokens_seen": 168909960, "step": 7814, "time_per_iteration": 3.6604385375976562 }, { "auxiliary_loss_clip": 0.01300275, "auxiliary_loss_mlp": 0.01193191, "balance_loss_clip": 1.00833988, "balance_loss_mlp": 1.00016797, "epoch": 0.9396981903444959, "flos": 14319137959200.0, "grad_norm": 1.9151075003749638, "language_loss": 0.76831079, "learning_rate": 3.7955986565542996e-08, "loss": 0.79324549, "num_input_tokens_seen": 168928040, "step": 7815, "time_per_iteration": 3.720552444458008 }, { "auxiliary_loss_clip": 0.0129468, "auxiliary_loss_mlp": 0.01193068, "balance_loss_clip": 1.00655222, "balance_loss_mlp": 1.00014007, "epoch": 0.9398184332351349, "flos": 34787578616640.0, "grad_norm": 1.8388978497562813, "language_loss": 0.68223476, "learning_rate": 3.780509557462497e-08, "loss": 0.70711225, "num_input_tokens_seen": 168948240, "step": 7816, "time_per_iteration": 2.916214942932129 }, { "auxiliary_loss_clip": 0.01309869, "auxiliary_loss_mlp": 0.01193079, "balance_loss_clip": 1.00744617, "balance_loss_mlp": 1.00015068, "epoch": 0.9399386761257741, "flos": 25372264681440.0, "grad_norm": 1.641014150345167, "language_loss": 0.75599837, "learning_rate": 3.765450224711375e-08, "loss": 0.78102791, "num_input_tokens_seen": 168968745, "step": 7817, "time_per_iteration": 2.898871898651123 }, { "auxiliary_loss_clip": 0.01300448, "auxiliary_loss_mlp": 0.01193068, "balance_loss_clip": 1.00722837, "balance_loss_mlp": 1.00013995, "epoch": 0.9400589190164131, "flos": 27304977055680.0, "grad_norm": 5.675998900143367, "language_loss": 0.79706919, "learning_rate": 3.750420660585396e-08, "loss": 0.82200432, "num_input_tokens_seen": 168990685, "step": 7818, "time_per_iteration": 2.919905662536621 }, { "auxiliary_loss_clip": 0.0134808, "auxiliary_loss_mlp": 0.0119319, "balance_loss_clip": 1.0079236, "balance_loss_mlp": 1.00016594, "epoch": 0.9401791619070522, "flos": 23399869849920.0, "grad_norm": 1.6696822122332038, "language_loss": 0.79870826, "learning_rate": 3.735420867364603e-08, "loss": 0.82412094, "num_input_tokens_seen": 169011665, "step": 7819, "time_per_iteration": 2.7521181106567383 }, { "auxiliary_loss_clip": 0.012604, "auxiliary_loss_mlp": 0.0119288, "balance_loss_clip": 1.00627267, "balance_loss_mlp": 1.00014234, "epoch": 0.9402994047976914, "flos": 35881417635840.0, "grad_norm": 1.5732564290307836, "language_loss": 0.61697304, "learning_rate": 3.7204508473244186e-08, "loss": 0.6415059, "num_input_tokens_seen": 169035290, "step": 7820, "time_per_iteration": 3.053335428237915 }, { "auxiliary_loss_clip": 0.01219858, "auxiliary_loss_mlp": 0.01192982, "balance_loss_clip": 1.00565791, "balance_loss_mlp": 1.00014925, "epoch": 0.9404196476883304, "flos": 22236832317600.0, "grad_norm": 1.5650619798340715, "language_loss": 0.68924212, "learning_rate": 3.7055106027357395e-08, "loss": 0.7133705, "num_input_tokens_seen": 169055155, "step": 7821, "time_per_iteration": 2.9139902591705322 }, { "auxiliary_loss_clip": 0.01322197, "auxiliary_loss_mlp": 0.01193085, "balance_loss_clip": 1.0075804, "balance_loss_mlp": 1.00015628, "epoch": 0.9405398905789695, "flos": 18915812421120.0, "grad_norm": 2.1439318646863867, "language_loss": 0.7176851, "learning_rate": 3.690600135865063e-08, "loss": 0.74283791, "num_input_tokens_seen": 169072080, "step": 7822, "time_per_iteration": 3.007533073425293 }, { "auxiliary_loss_clip": 0.01237333, "auxiliary_loss_mlp": 0.01192292, "balance_loss_clip": 1.00356007, "balance_loss_mlp": 1.00003111, "epoch": 0.9406601334696086, "flos": 70274160691200.0, "grad_norm": 0.7895390815976767, "language_loss": 0.58128417, "learning_rate": 3.675719448974246e-08, "loss": 0.60558033, "num_input_tokens_seen": 169137170, "step": 7823, "time_per_iteration": 3.5089104175567627 }, { "auxiliary_loss_clip": 0.01273956, "auxiliary_loss_mlp": 0.00872408, "balance_loss_clip": 1.00701284, "balance_loss_mlp": 1.00033188, "epoch": 0.9407803763602477, "flos": 22165083223200.0, "grad_norm": 1.9717197316666477, "language_loss": 0.59939885, "learning_rate": 3.6608685443207054e-08, "loss": 0.62086248, "num_input_tokens_seen": 169156320, "step": 7824, "time_per_iteration": 2.954587936401367 }, { "auxiliary_loss_clip": 0.01297191, "auxiliary_loss_mlp": 0.01193061, "balance_loss_clip": 1.00759888, "balance_loss_mlp": 1.00013232, "epoch": 0.9409006192508867, "flos": 18879506789760.0, "grad_norm": 2.330250844522936, "language_loss": 0.66678584, "learning_rate": 3.646047424157306e-08, "loss": 0.6916883, "num_input_tokens_seen": 169173295, "step": 7825, "time_per_iteration": 2.9075963497161865 }, { "auxiliary_loss_clip": 0.01301587, "auxiliary_loss_mlp": 0.01193183, "balance_loss_clip": 1.00723362, "balance_loss_mlp": 1.00015914, "epoch": 0.9410208621415259, "flos": 23368270220640.0, "grad_norm": 2.390795926887446, "language_loss": 0.68670928, "learning_rate": 3.631256090732382e-08, "loss": 0.71165699, "num_input_tokens_seen": 169193755, "step": 7826, "time_per_iteration": 2.996636390686035 }, { "auxiliary_loss_clip": 0.0129145, "auxiliary_loss_mlp": 0.01192803, "balance_loss_clip": 1.00703585, "balance_loss_mlp": 1.00016081, "epoch": 0.941141105032165, "flos": 22742237262240.0, "grad_norm": 1.6798106932378454, "language_loss": 0.82561803, "learning_rate": 3.6164945462897833e-08, "loss": 0.85046059, "num_input_tokens_seen": 169213045, "step": 7827, "time_per_iteration": 2.915644407272339 }, { "auxiliary_loss_clip": 0.01323605, "auxiliary_loss_mlp": 0.00872438, "balance_loss_clip": 1.00741911, "balance_loss_mlp": 1.00040925, "epoch": 0.941261347922804, "flos": 20704918835520.0, "grad_norm": 1.657474437179124, "language_loss": 0.75874639, "learning_rate": 3.6017627930687856e-08, "loss": 0.78070676, "num_input_tokens_seen": 169232870, "step": 7828, "time_per_iteration": 2.912471294403076 }, { "auxiliary_loss_clip": 0.01277531, "auxiliary_loss_mlp": 0.01192892, "balance_loss_clip": 1.00644708, "balance_loss_mlp": 1.00015473, "epoch": 0.9413815908134432, "flos": 19422007686720.0, "grad_norm": 1.8755524691225762, "language_loss": 0.76476824, "learning_rate": 3.587060833304267e-08, "loss": 0.78947258, "num_input_tokens_seen": 169251060, "step": 7829, "time_per_iteration": 2.9025425910949707 }, { "auxiliary_loss_clip": 0.01330247, "auxiliary_loss_mlp": 0.01193151, "balance_loss_clip": 1.00732708, "balance_loss_mlp": 1.00012708, "epoch": 0.9415018337040822, "flos": 17493462459360.0, "grad_norm": 2.2432476297398485, "language_loss": 0.64199686, "learning_rate": 3.5723886692264225e-08, "loss": 0.66723084, "num_input_tokens_seen": 169268600, "step": 7830, "time_per_iteration": 2.833847761154175 }, { "auxiliary_loss_clip": 0.01311398, "auxiliary_loss_mlp": 0.01193205, "balance_loss_clip": 1.00697684, "balance_loss_mlp": 1.00018084, "epoch": 0.9416220765947213, "flos": 31831626759840.0, "grad_norm": 2.1698421686571088, "language_loss": 0.6158613, "learning_rate": 3.557746303061071e-08, "loss": 0.64090735, "num_input_tokens_seen": 169290355, "step": 7831, "time_per_iteration": 2.9259839057922363 }, { "auxiliary_loss_clip": 0.01313347, "auxiliary_loss_mlp": 0.01193066, "balance_loss_clip": 1.00717676, "balance_loss_mlp": 1.00013757, "epoch": 0.9417423194853605, "flos": 23511983951520.0, "grad_norm": 1.549513810714033, "language_loss": 0.72305441, "learning_rate": 3.543133737029391e-08, "loss": 0.74811852, "num_input_tokens_seen": 169310865, "step": 7832, "time_per_iteration": 2.861539840698242 }, { "auxiliary_loss_clip": 0.01335724, "auxiliary_loss_mlp": 0.01193103, "balance_loss_clip": 1.00787163, "balance_loss_mlp": 1.00017452, "epoch": 0.9418625623759995, "flos": 23915117882880.0, "grad_norm": 4.879017268009783, "language_loss": 0.68935227, "learning_rate": 3.5285509733481214e-08, "loss": 0.71464062, "num_input_tokens_seen": 169330590, "step": 7833, "time_per_iteration": 2.8036322593688965 }, { "auxiliary_loss_clip": 0.01335989, "auxiliary_loss_mlp": 0.01192963, "balance_loss_clip": 1.00784802, "balance_loss_mlp": 1.0001303, "epoch": 0.9419828052666386, "flos": 18076974989760.0, "grad_norm": 1.6367516137049576, "language_loss": 0.76647365, "learning_rate": 3.513998014229469e-08, "loss": 0.79176319, "num_input_tokens_seen": 169349540, "step": 7834, "time_per_iteration": 2.8767576217651367 }, { "auxiliary_loss_clip": 0.01296103, "auxiliary_loss_mlp": 0.01193005, "balance_loss_clip": 1.00732946, "balance_loss_mlp": 1.0001719, "epoch": 0.9421030481572777, "flos": 17712338034240.0, "grad_norm": 2.9008183429718346, "language_loss": 0.86557353, "learning_rate": 3.499474861881069e-08, "loss": 0.89046454, "num_input_tokens_seen": 169366765, "step": 7835, "time_per_iteration": 2.865274667739868 }, { "auxiliary_loss_clip": 0.01250835, "auxiliary_loss_mlp": 0.01193223, "balance_loss_clip": 1.00631344, "balance_loss_mlp": 1.00019956, "epoch": 0.9422232910479168, "flos": 20194125338880.0, "grad_norm": 1.9744004324584543, "language_loss": 0.68115848, "learning_rate": 3.4849815185061136e-08, "loss": 0.70559907, "num_input_tokens_seen": 169386655, "step": 7836, "time_per_iteration": 2.9396047592163086 }, { "auxiliary_loss_clip": 0.01334837, "auxiliary_loss_mlp": 0.01192942, "balance_loss_clip": 1.00761986, "balance_loss_mlp": 1.00010884, "epoch": 0.9423435339385559, "flos": 18442581884640.0, "grad_norm": 1.8720365735832347, "language_loss": 0.76048476, "learning_rate": 3.470517986303223e-08, "loss": 0.78576255, "num_input_tokens_seen": 169405640, "step": 7837, "time_per_iteration": 2.909226655960083 }, { "auxiliary_loss_clip": 0.01288075, "auxiliary_loss_mlp": 0.01193046, "balance_loss_clip": 1.007218, "balance_loss_mlp": 1.00011802, "epoch": 0.942463776829195, "flos": 20080646137440.0, "grad_norm": 2.2067576440016334, "language_loss": 0.79185927, "learning_rate": 3.4560842674664856e-08, "loss": 0.81667054, "num_input_tokens_seen": 169424155, "step": 7838, "time_per_iteration": 3.7692208290100098 }, { "auxiliary_loss_clip": 0.01335215, "auxiliary_loss_mlp": 0.01193173, "balance_loss_clip": 1.00752401, "balance_loss_mlp": 1.00014901, "epoch": 0.9425840197198341, "flos": 22636266109920.0, "grad_norm": 1.947002987763146, "language_loss": 0.75179493, "learning_rate": 3.441680364185506e-08, "loss": 0.77707887, "num_input_tokens_seen": 169444025, "step": 7839, "time_per_iteration": 3.750685930252075 }, { "auxiliary_loss_clip": 0.01304171, "auxiliary_loss_mlp": 0.01193207, "balance_loss_clip": 1.00715446, "balance_loss_mlp": 1.00018311, "epoch": 0.9427042626104731, "flos": 19937902345920.0, "grad_norm": 2.051424414235542, "language_loss": 0.74877954, "learning_rate": 3.427306278645314e-08, "loss": 0.7737534, "num_input_tokens_seen": 169462480, "step": 7840, "time_per_iteration": 3.804675340652466 }, { "auxiliary_loss_clip": 0.01263114, "auxiliary_loss_mlp": 0.01193096, "balance_loss_clip": 1.00588012, "balance_loss_mlp": 1.00016785, "epoch": 0.9428245055011123, "flos": 22857009716160.0, "grad_norm": 2.3278419909418138, "language_loss": 0.72806442, "learning_rate": 3.4129620130264767e-08, "loss": 0.75262654, "num_input_tokens_seen": 169480840, "step": 7841, "time_per_iteration": 2.8933639526367188 }, { "auxiliary_loss_clip": 0.01312831, "auxiliary_loss_mlp": 0.00872426, "balance_loss_clip": 1.00747848, "balance_loss_mlp": 1.0003947, "epoch": 0.9429447483917514, "flos": 20951765748000.0, "grad_norm": 2.113805527866903, "language_loss": 0.7781316, "learning_rate": 3.398647569505009e-08, "loss": 0.79998416, "num_input_tokens_seen": 169498265, "step": 7842, "time_per_iteration": 3.7671315670013428 }, { "auxiliary_loss_clip": 0.01287275, "auxiliary_loss_mlp": 0.01193165, "balance_loss_clip": 1.0062356, "balance_loss_mlp": 1.00014091, "epoch": 0.9430649912823904, "flos": 18843668166240.0, "grad_norm": 2.1202678436159625, "language_loss": 0.744102, "learning_rate": 3.384362950252373e-08, "loss": 0.76890641, "num_input_tokens_seen": 169515235, "step": 7843, "time_per_iteration": 2.866523027420044 }, { "auxiliary_loss_clip": 0.01313159, "auxiliary_loss_mlp": 0.01192953, "balance_loss_clip": 1.00695002, "balance_loss_mlp": 1.00021529, "epoch": 0.9431852341730296, "flos": 32556050974080.0, "grad_norm": 1.9263645353266556, "language_loss": 0.57086527, "learning_rate": 3.3701081574355473e-08, "loss": 0.5959264, "num_input_tokens_seen": 169537195, "step": 7844, "time_per_iteration": 2.9018280506134033 }, { "auxiliary_loss_clip": 0.01274483, "auxiliary_loss_mlp": 0.01192271, "balance_loss_clip": 1.00351846, "balance_loss_mlp": 1.00001013, "epoch": 0.9433054770636686, "flos": 66904526550240.0, "grad_norm": 0.6382095535315442, "language_loss": 0.51720917, "learning_rate": 3.3558831932169796e-08, "loss": 0.54187673, "num_input_tokens_seen": 169605865, "step": 7845, "time_per_iteration": 3.410651445388794 }, { "auxiliary_loss_clip": 0.01323865, "auxiliary_loss_mlp": 0.01193151, "balance_loss_clip": 1.00739646, "balance_loss_mlp": 1.00012684, "epoch": 0.9434257199543077, "flos": 26140358881440.0, "grad_norm": 1.8806170167128, "language_loss": 0.88488728, "learning_rate": 3.341688059754588e-08, "loss": 0.91005743, "num_input_tokens_seen": 169621520, "step": 7846, "time_per_iteration": 2.8034613132476807 }, { "auxiliary_loss_clip": 0.01309792, "auxiliary_loss_mlp": 0.00872532, "balance_loss_clip": 1.00785875, "balance_loss_mlp": 1.00043571, "epoch": 0.9435459628449467, "flos": 25003496502720.0, "grad_norm": 2.2448646206952647, "language_loss": 0.77035987, "learning_rate": 3.327522759201762e-08, "loss": 0.79218316, "num_input_tokens_seen": 169641390, "step": 7847, "time_per_iteration": 2.812870502471924 }, { "auxiliary_loss_clip": 0.01289592, "auxiliary_loss_mlp": 0.01193205, "balance_loss_clip": 1.00714111, "balance_loss_mlp": 1.00018167, "epoch": 0.9436662057355859, "flos": 22163251115520.0, "grad_norm": 2.0798125605255953, "language_loss": 0.66803718, "learning_rate": 3.313387293707359e-08, "loss": 0.69286519, "num_input_tokens_seen": 169660095, "step": 7848, "time_per_iteration": 2.834613084793091 }, { "auxiliary_loss_clip": 0.01292582, "auxiliary_loss_mlp": 0.01193068, "balance_loss_clip": 1.00786972, "balance_loss_mlp": 1.00014019, "epoch": 0.943786448626225, "flos": 20118532410720.0, "grad_norm": 1.8069305656348007, "language_loss": 0.68037844, "learning_rate": 3.29928166541571e-08, "loss": 0.70523489, "num_input_tokens_seen": 169679050, "step": 7849, "time_per_iteration": 2.8075175285339355 }, { "auxiliary_loss_clip": 0.01310181, "auxiliary_loss_mlp": 0.01193063, "balance_loss_clip": 1.00757921, "balance_loss_mlp": 1.00013447, "epoch": 0.943906691516864, "flos": 22090819471200.0, "grad_norm": 1.7965740777520038, "language_loss": 0.80514681, "learning_rate": 3.2852058764666346e-08, "loss": 0.83017921, "num_input_tokens_seen": 169698150, "step": 7850, "time_per_iteration": 2.7992491722106934 }, { "auxiliary_loss_clip": 0.01271027, "auxiliary_loss_mlp": 0.01193086, "balance_loss_clip": 1.00657308, "balance_loss_mlp": 1.00015807, "epoch": 0.9440269344075032, "flos": 35298515808000.0, "grad_norm": 1.7285423783573657, "language_loss": 0.6877588, "learning_rate": 3.2711599289954264e-08, "loss": 0.71239996, "num_input_tokens_seen": 169722185, "step": 7851, "time_per_iteration": 2.911449432373047 }, { "auxiliary_loss_clip": 0.01249979, "auxiliary_loss_mlp": 0.01193088, "balance_loss_clip": 1.00615788, "balance_loss_mlp": 1.00015986, "epoch": 0.9441471772981422, "flos": 19238144490720.0, "grad_norm": 1.730501975509585, "language_loss": 0.77545023, "learning_rate": 3.257143825132847e-08, "loss": 0.79988098, "num_input_tokens_seen": 169740355, "step": 7852, "time_per_iteration": 2.821498394012451 }, { "auxiliary_loss_clip": 0.01314092, "auxiliary_loss_mlp": 0.01192847, "balance_loss_clip": 1.00718141, "balance_loss_mlp": 1.00010908, "epoch": 0.9442674201887813, "flos": 25739811455040.0, "grad_norm": 1.5975037038802014, "language_loss": 0.75819588, "learning_rate": 3.243157567005106e-08, "loss": 0.78326523, "num_input_tokens_seen": 169758535, "step": 7853, "time_per_iteration": 2.806316614151001 }, { "auxiliary_loss_clip": 0.01348825, "auxiliary_loss_mlp": 0.01193204, "balance_loss_clip": 1.00873756, "balance_loss_mlp": 1.00018024, "epoch": 0.9443876630794205, "flos": 15523330819680.0, "grad_norm": 3.101376818271654, "language_loss": 0.63972473, "learning_rate": 3.2292011567339296e-08, "loss": 0.66514498, "num_input_tokens_seen": 169776340, "step": 7854, "time_per_iteration": 2.708848714828491 }, { "auxiliary_loss_clip": 0.01336172, "auxiliary_loss_mlp": 0.00872401, "balance_loss_clip": 1.00788713, "balance_loss_mlp": 1.00034714, "epoch": 0.9445079059700595, "flos": 13400827842240.0, "grad_norm": 1.9176249034905206, "language_loss": 0.56034946, "learning_rate": 3.21527459643649e-08, "loss": 0.58243525, "num_input_tokens_seen": 169793225, "step": 7855, "time_per_iteration": 2.7812671661376953 }, { "auxiliary_loss_clip": 0.01334192, "auxiliary_loss_mlp": 0.01193212, "balance_loss_clip": 1.00799465, "balance_loss_mlp": 1.00018787, "epoch": 0.9446281488606986, "flos": 23659254126720.0, "grad_norm": 1.8327227128646908, "language_loss": 0.73918086, "learning_rate": 3.2013778882254536e-08, "loss": 0.7644549, "num_input_tokens_seen": 169812020, "step": 7856, "time_per_iteration": 2.789335250854492 }, { "auxiliary_loss_clip": 0.01335823, "auxiliary_loss_mlp": 0.01193064, "balance_loss_clip": 1.00772953, "balance_loss_mlp": 1.00013602, "epoch": 0.9447483917513377, "flos": 25557349282560.0, "grad_norm": 1.7687906563213431, "language_loss": 0.75761855, "learning_rate": 3.1875110342088676e-08, "loss": 0.78290737, "num_input_tokens_seen": 169833470, "step": 7857, "time_per_iteration": 2.7891807556152344 }, { "auxiliary_loss_clip": 0.0129793, "auxiliary_loss_mlp": 0.01192991, "balance_loss_clip": 1.00711441, "balance_loss_mlp": 1.00015783, "epoch": 0.9448686346419768, "flos": 24535474899840.0, "grad_norm": 1.61461301531517, "language_loss": 0.65572, "learning_rate": 3.1736740364904035e-08, "loss": 0.68062913, "num_input_tokens_seen": 169854000, "step": 7858, "time_per_iteration": 2.826648712158203 }, { "auxiliary_loss_clip": 0.0128132, "auxiliary_loss_mlp": 0.00872512, "balance_loss_clip": 1.00705719, "balance_loss_mlp": 1.00050306, "epoch": 0.9449888775326158, "flos": 14721266027520.0, "grad_norm": 2.044491991818641, "language_loss": 0.77093995, "learning_rate": 3.159866897169094e-08, "loss": 0.79247826, "num_input_tokens_seen": 169872200, "step": 7859, "time_per_iteration": 2.8380424976348877 }, { "auxiliary_loss_clip": 0.01278687, "auxiliary_loss_mlp": 0.01193101, "balance_loss_clip": 1.00705278, "balance_loss_mlp": 1.00017262, "epoch": 0.945109120423255, "flos": 15447881586240.0, "grad_norm": 1.751916362268919, "language_loss": 0.75544447, "learning_rate": 3.146089618339487e-08, "loss": 0.78016233, "num_input_tokens_seen": 169889055, "step": 7860, "time_per_iteration": 2.806034803390503 }, { "auxiliary_loss_clip": 0.01295918, "auxiliary_loss_mlp": 0.01193226, "balance_loss_clip": 1.00687373, "balance_loss_mlp": 1.00020218, "epoch": 0.9452293633138941, "flos": 25448109075360.0, "grad_norm": 1.8809020306034483, "language_loss": 0.68189156, "learning_rate": 3.132342202091554e-08, "loss": 0.70678294, "num_input_tokens_seen": 169909280, "step": 7861, "time_per_iteration": 2.8333492279052734 }, { "auxiliary_loss_clip": 0.01348853, "auxiliary_loss_mlp": 0.01193222, "balance_loss_clip": 1.00766873, "balance_loss_mlp": 1.0001986, "epoch": 0.9453496062045331, "flos": 21215353095360.0, "grad_norm": 2.2140053726569993, "language_loss": 0.68574798, "learning_rate": 3.1186246505107595e-08, "loss": 0.71116877, "num_input_tokens_seen": 169928420, "step": 7862, "time_per_iteration": 2.717681407928467 }, { "auxiliary_loss_clip": 0.01325562, "auxiliary_loss_mlp": 0.01193153, "balance_loss_clip": 1.00763488, "balance_loss_mlp": 1.00012887, "epoch": 0.9454698490951723, "flos": 20010908769120.0, "grad_norm": 1.5762334133494782, "language_loss": 0.83530879, "learning_rate": 3.104936965678084e-08, "loss": 0.86049592, "num_input_tokens_seen": 169946750, "step": 7863, "time_per_iteration": 2.686002254486084 }, { "auxiliary_loss_clip": 0.01326787, "auxiliary_loss_mlp": 0.01193052, "balance_loss_clip": 1.00690329, "balance_loss_mlp": 1.00012398, "epoch": 0.9455900919858113, "flos": 21069663562080.0, "grad_norm": 1.8212665692287913, "language_loss": 0.8184002, "learning_rate": 3.091279149669956e-08, "loss": 0.8435986, "num_input_tokens_seen": 169965540, "step": 7864, "time_per_iteration": 3.6713168621063232 }, { "auxiliary_loss_clip": 0.01327657, "auxiliary_loss_mlp": 0.00872522, "balance_loss_clip": 1.00683558, "balance_loss_mlp": 1.00045967, "epoch": 0.9457103348764504, "flos": 20740865230080.0, "grad_norm": 1.7890416437479504, "language_loss": 0.73833972, "learning_rate": 3.0776512045581624e-08, "loss": 0.76034153, "num_input_tokens_seen": 169984330, "step": 7865, "time_per_iteration": 4.619750738143921 }, { "auxiliary_loss_clip": 0.01310976, "auxiliary_loss_mlp": 0.01193178, "balance_loss_clip": 1.00802016, "balance_loss_mlp": 1.00015402, "epoch": 0.9458305777670896, "flos": 21428373110400.0, "grad_norm": 1.8002960824712064, "language_loss": 0.77810335, "learning_rate": 3.0640531324101384e-08, "loss": 0.80314493, "num_input_tokens_seen": 170002095, "step": 7866, "time_per_iteration": 2.7246077060699463 }, { "auxiliary_loss_clip": 0.01326874, "auxiliary_loss_mlp": 0.01193084, "balance_loss_clip": 1.00807655, "balance_loss_mlp": 1.00015545, "epoch": 0.9459508206577286, "flos": 20011196158560.0, "grad_norm": 1.5595202874201217, "language_loss": 0.75808227, "learning_rate": 3.0504849352886554e-08, "loss": 0.7832818, "num_input_tokens_seen": 170020240, "step": 7867, "time_per_iteration": 2.7313735485076904 }, { "auxiliary_loss_clip": 0.01323915, "auxiliary_loss_mlp": 0.01193069, "balance_loss_clip": 1.00754142, "balance_loss_mlp": 1.00014114, "epoch": 0.9460710635483677, "flos": 12166436376000.0, "grad_norm": 2.3186071579374428, "language_loss": 0.71590912, "learning_rate": 3.036946615252023e-08, "loss": 0.74107891, "num_input_tokens_seen": 170035770, "step": 7868, "time_per_iteration": 3.665070056915283 }, { "auxiliary_loss_clip": 0.01305922, "auxiliary_loss_mlp": 0.01193072, "balance_loss_clip": 1.00776434, "balance_loss_mlp": 1.00014389, "epoch": 0.9461913064390068, "flos": 34276210341120.0, "grad_norm": 2.0675806241415566, "language_loss": 0.66731876, "learning_rate": 3.0234381743539984e-08, "loss": 0.69230872, "num_input_tokens_seen": 170053385, "step": 7869, "time_per_iteration": 2.813523292541504 }, { "auxiliary_loss_clip": 0.01324499, "auxiliary_loss_mlp": 0.01193093, "balance_loss_clip": 1.00826609, "balance_loss_mlp": 1.00016427, "epoch": 0.9463115493296459, "flos": 19463773717440.0, "grad_norm": 2.6416852270392326, "language_loss": 0.79796314, "learning_rate": 3.0099596146437863e-08, "loss": 0.82313907, "num_input_tokens_seen": 170070490, "step": 7870, "time_per_iteration": 2.8038787841796875 }, { "auxiliary_loss_clip": 0.01315735, "auxiliary_loss_mlp": 0.01192271, "balance_loss_clip": 1.00352359, "balance_loss_mlp": 1.00001037, "epoch": 0.946431792220285, "flos": 70570856462400.0, "grad_norm": 0.7680154175419668, "language_loss": 0.60115004, "learning_rate": 2.996510938166086e-08, "loss": 0.62623012, "num_input_tokens_seen": 170133465, "step": 7871, "time_per_iteration": 3.3464953899383545 }, { "auxiliary_loss_clip": 0.01326626, "auxiliary_loss_mlp": 0.0119306, "balance_loss_clip": 1.00750828, "balance_loss_mlp": 1.00013208, "epoch": 0.9465520351109241, "flos": 18947914981920.0, "grad_norm": 2.027449363359582, "language_loss": 0.73245931, "learning_rate": 2.983092146960997e-08, "loss": 0.75765622, "num_input_tokens_seen": 170150810, "step": 7872, "time_per_iteration": 2.7525908946990967 }, { "auxiliary_loss_clip": 0.01325419, "auxiliary_loss_mlp": 0.01193185, "balance_loss_clip": 1.00830412, "balance_loss_mlp": 1.00016093, "epoch": 0.9466722780015632, "flos": 19135657935360.0, "grad_norm": 1.9755987427837605, "language_loss": 0.80502629, "learning_rate": 2.9697032430642256e-08, "loss": 0.8302123, "num_input_tokens_seen": 170169025, "step": 7873, "time_per_iteration": 2.7984604835510254 }, { "auxiliary_loss_clip": 0.01346609, "auxiliary_loss_mlp": 0.01192876, "balance_loss_clip": 1.00766253, "balance_loss_mlp": 1.00013876, "epoch": 0.9467925208922022, "flos": 17237922016320.0, "grad_norm": 2.276496247931245, "language_loss": 0.73781967, "learning_rate": 2.9563442285067906e-08, "loss": 0.76321453, "num_input_tokens_seen": 170186070, "step": 7874, "time_per_iteration": 2.6377170085906982 }, { "auxiliary_loss_clip": 0.01327085, "auxiliary_loss_mlp": 0.01193066, "balance_loss_clip": 1.00737572, "balance_loss_mlp": 1.00013769, "epoch": 0.9469127637828414, "flos": 29169029772000.0, "grad_norm": 1.866447004744834, "language_loss": 0.79504275, "learning_rate": 2.943015105315294e-08, "loss": 0.82024425, "num_input_tokens_seen": 170206265, "step": 7875, "time_per_iteration": 2.8630709648132324 }, { "auxiliary_loss_clip": 0.01273919, "auxiliary_loss_mlp": 0.01193218, "balance_loss_clip": 1.00676596, "balance_loss_mlp": 1.00019479, "epoch": 0.9470330066734804, "flos": 26030472048000.0, "grad_norm": 2.3830949202432503, "language_loss": 0.661654, "learning_rate": 2.929715875511718e-08, "loss": 0.68632531, "num_input_tokens_seen": 170225300, "step": 7876, "time_per_iteration": 2.883143663406372 }, { "auxiliary_loss_clip": 0.01335775, "auxiliary_loss_mlp": 0.01193099, "balance_loss_clip": 1.00747371, "balance_loss_mlp": 1.00017071, "epoch": 0.9471532495641195, "flos": 23440917407040.0, "grad_norm": 1.7558153264739425, "language_loss": 0.69811988, "learning_rate": 2.9164465411135375e-08, "loss": 0.72340864, "num_input_tokens_seen": 170245070, "step": 7877, "time_per_iteration": 2.7972402572631836 }, { "auxiliary_loss_clip": 0.01324363, "auxiliary_loss_mlp": 0.01192996, "balance_loss_clip": 1.00754642, "balance_loss_mlp": 1.0001626, "epoch": 0.9472734924547586, "flos": 15815859444000.0, "grad_norm": 1.740948673424493, "language_loss": 0.80730999, "learning_rate": 2.9032071041337426e-08, "loss": 0.83248359, "num_input_tokens_seen": 170263305, "step": 7878, "time_per_iteration": 2.7082679271698 }, { "auxiliary_loss_clip": 0.01310904, "auxiliary_loss_mlp": 0.01192945, "balance_loss_clip": 1.00770235, "balance_loss_mlp": 1.00011241, "epoch": 0.9473937353453977, "flos": 11181801640320.0, "grad_norm": 1.6801028926770945, "language_loss": 0.72870231, "learning_rate": 2.889997566580704e-08, "loss": 0.75374079, "num_input_tokens_seen": 170281460, "step": 7879, "time_per_iteration": 2.730684757232666 }, { "auxiliary_loss_clip": 0.01348018, "auxiliary_loss_mlp": 0.01193187, "balance_loss_clip": 1.00773597, "balance_loss_mlp": 1.00016356, "epoch": 0.9475139782360368, "flos": 25775542307520.0, "grad_norm": 1.5889076221478695, "language_loss": 0.70170867, "learning_rate": 2.8768179304583086e-08, "loss": 0.72712064, "num_input_tokens_seen": 170303515, "step": 7880, "time_per_iteration": 2.7392666339874268 }, { "auxiliary_loss_clip": 0.01288142, "auxiliary_loss_mlp": 0.01193092, "balance_loss_clip": 1.00707304, "balance_loss_mlp": 1.00016356, "epoch": 0.9476342211266758, "flos": 22820057458560.0, "grad_norm": 1.692635507847481, "language_loss": 0.73451942, "learning_rate": 2.8636681977659117e-08, "loss": 0.7593317, "num_input_tokens_seen": 170323165, "step": 7881, "time_per_iteration": 2.8201160430908203 }, { "auxiliary_loss_clip": 0.01271249, "auxiliary_loss_mlp": 0.01193269, "balance_loss_clip": 1.00708735, "balance_loss_mlp": 1.00024581, "epoch": 0.947754464017315, "flos": 20193622407360.0, "grad_norm": 2.004660554728404, "language_loss": 0.77972251, "learning_rate": 2.850548370498318e-08, "loss": 0.80436766, "num_input_tokens_seen": 170341005, "step": 7882, "time_per_iteration": 2.844766855239868 }, { "auxiliary_loss_clip": 0.01334272, "auxiliary_loss_mlp": 0.01192996, "balance_loss_clip": 1.00752294, "balance_loss_mlp": 1.00016308, "epoch": 0.9478747069079541, "flos": 24717937072320.0, "grad_norm": 1.4527924652444846, "language_loss": 0.71422541, "learning_rate": 2.8374584506457798e-08, "loss": 0.73949808, "num_input_tokens_seen": 170362280, "step": 7883, "time_per_iteration": 2.7546768188476562 }, { "auxiliary_loss_clip": 0.01299883, "auxiliary_loss_mlp": 0.01193091, "balance_loss_clip": 1.00703788, "balance_loss_mlp": 1.00016296, "epoch": 0.9479949497985931, "flos": 21361365941760.0, "grad_norm": 2.4844847706329407, "language_loss": 0.67491293, "learning_rate": 2.824398440193998e-08, "loss": 0.69984269, "num_input_tokens_seen": 170381080, "step": 7884, "time_per_iteration": 2.7939062118530273 }, { "auxiliary_loss_clip": 0.01280038, "auxiliary_loss_mlp": 0.01193152, "balance_loss_clip": 1.00759113, "balance_loss_mlp": 1.00012851, "epoch": 0.9481151926892323, "flos": 18148616313120.0, "grad_norm": 2.3846690398819925, "language_loss": 0.71516621, "learning_rate": 2.811368341124232e-08, "loss": 0.73989809, "num_input_tokens_seen": 170400150, "step": 7885, "time_per_iteration": 2.7934725284576416 }, { "auxiliary_loss_clip": 0.01323574, "auxiliary_loss_mlp": 0.01193186, "balance_loss_clip": 1.00797069, "balance_loss_mlp": 1.00016284, "epoch": 0.9482354355798713, "flos": 22128023194560.0, "grad_norm": 2.0282441722962945, "language_loss": 0.6805805, "learning_rate": 2.7983681554131222e-08, "loss": 0.70574808, "num_input_tokens_seen": 170420410, "step": 7886, "time_per_iteration": 2.8335251808166504 }, { "auxiliary_loss_clip": 0.01324636, "auxiliary_loss_mlp": 0.01193105, "balance_loss_clip": 1.00812411, "balance_loss_mlp": 1.00017691, "epoch": 0.9483556784705104, "flos": 19063082596320.0, "grad_norm": 2.392693512916525, "language_loss": 0.7022835, "learning_rate": 2.7853978850327365e-08, "loss": 0.72746098, "num_input_tokens_seen": 170439580, "step": 7887, "time_per_iteration": 2.757148504257202 }, { "auxiliary_loss_clip": 0.01283287, "auxiliary_loss_mlp": 0.01193074, "balance_loss_clip": 1.00722694, "balance_loss_mlp": 1.00014603, "epoch": 0.9484759213611496, "flos": 25777122949440.0, "grad_norm": 1.7150624094627007, "language_loss": 0.87330544, "learning_rate": 2.7724575319507225e-08, "loss": 0.89806902, "num_input_tokens_seen": 170459290, "step": 7888, "time_per_iteration": 2.8487367630004883 }, { "auxiliary_loss_clip": 0.01334787, "auxiliary_loss_mlp": 0.01192994, "balance_loss_clip": 1.00729334, "balance_loss_mlp": 1.00016105, "epoch": 0.9485961642517886, "flos": 20667751035840.0, "grad_norm": 1.7939634223695138, "language_loss": 0.76839817, "learning_rate": 2.759547098130044e-08, "loss": 0.79367602, "num_input_tokens_seen": 170478020, "step": 7889, "time_per_iteration": 2.7523117065429688 }, { "auxiliary_loss_clip": 0.01347343, "auxiliary_loss_mlp": 0.01192987, "balance_loss_clip": 1.00749373, "balance_loss_mlp": 1.00015378, "epoch": 0.9487164071424277, "flos": 22674080535840.0, "grad_norm": 1.74893493315981, "language_loss": 0.76942384, "learning_rate": 2.746666585529267e-08, "loss": 0.7948271, "num_input_tokens_seen": 170498295, "step": 7890, "time_per_iteration": 3.5555732250213623 }, { "auxiliary_loss_clip": 0.01335585, "auxiliary_loss_mlp": 0.01193061, "balance_loss_clip": 1.00771105, "balance_loss_mlp": 1.00013232, "epoch": 0.9488366500330668, "flos": 38726476796160.0, "grad_norm": 1.956600368325723, "language_loss": 0.74230742, "learning_rate": 2.73381599610234e-08, "loss": 0.76759386, "num_input_tokens_seen": 170518695, "step": 7891, "time_per_iteration": 3.9075565338134766 }, { "auxiliary_loss_clip": 0.0133583, "auxiliary_loss_mlp": 0.01193202, "balance_loss_clip": 1.00765991, "balance_loss_mlp": 1.00017834, "epoch": 0.9489568929237059, "flos": 27890932396320.0, "grad_norm": 1.7448372373166117, "language_loss": 0.71697378, "learning_rate": 2.7209953317987033e-08, "loss": 0.74226415, "num_input_tokens_seen": 170539735, "step": 7892, "time_per_iteration": 3.6675751209259033 }, { "auxiliary_loss_clip": 0.01329219, "auxiliary_loss_mlp": 0.01192978, "balance_loss_clip": 1.00751102, "balance_loss_mlp": 1.0001452, "epoch": 0.9490771358143449, "flos": 33580655556480.0, "grad_norm": 1.7728136436862887, "language_loss": 0.78346777, "learning_rate": 2.7082045945631793e-08, "loss": 0.80868971, "num_input_tokens_seen": 170561950, "step": 7893, "time_per_iteration": 2.859116554260254 }, { "auxiliary_loss_clip": 0.01289903, "auxiliary_loss_mlp": 0.01193089, "balance_loss_clip": 1.00677574, "balance_loss_mlp": 1.00016081, "epoch": 0.9491973787049841, "flos": 14793805442880.0, "grad_norm": 1.878966145393492, "language_loss": 0.69239593, "learning_rate": 2.6954437863361712e-08, "loss": 0.71722579, "num_input_tokens_seen": 170579865, "step": 7894, "time_per_iteration": 3.744619131088257 }, { "auxiliary_loss_clip": 0.01255503, "auxiliary_loss_mlp": 0.01193097, "balance_loss_clip": 1.00605845, "balance_loss_mlp": 1.00016916, "epoch": 0.9493176215956232, "flos": 25332546300480.0, "grad_norm": 1.8654809285701022, "language_loss": 0.71201992, "learning_rate": 2.6827129090534862e-08, "loss": 0.73650593, "num_input_tokens_seen": 170600165, "step": 7895, "time_per_iteration": 3.013018846511841 }, { "auxiliary_loss_clip": 0.01304648, "auxiliary_loss_mlp": 0.0119318, "balance_loss_clip": 1.00731683, "balance_loss_mlp": 1.00015593, "epoch": 0.9494378644862622, "flos": 21029981104800.0, "grad_norm": 1.7705156158704352, "language_loss": 0.77536649, "learning_rate": 2.670011964646335e-08, "loss": 0.80034471, "num_input_tokens_seen": 170618845, "step": 7896, "time_per_iteration": 2.9066219329833984 }, { "auxiliary_loss_clip": 0.01266839, "auxiliary_loss_mlp": 0.01193152, "balance_loss_clip": 1.00755239, "balance_loss_mlp": 1.00012827, "epoch": 0.9495581073769014, "flos": 15195143190240.0, "grad_norm": 1.9025198011345588, "language_loss": 0.67720014, "learning_rate": 2.657340955041487e-08, "loss": 0.70179999, "num_input_tokens_seen": 170637620, "step": 7897, "time_per_iteration": 3.095885992050171 }, { "auxiliary_loss_clip": 0.01300628, "auxiliary_loss_mlp": 0.01193031, "balance_loss_clip": 1.00762105, "balance_loss_mlp": 1.00019813, "epoch": 0.9496783502675404, "flos": 28616578015680.0, "grad_norm": 2.0606363649527597, "language_loss": 0.71507692, "learning_rate": 2.6446998821611167e-08, "loss": 0.74001348, "num_input_tokens_seen": 170657815, "step": 7898, "time_per_iteration": 3.172239065170288 }, { "auxiliary_loss_clip": 0.01274337, "auxiliary_loss_mlp": 0.01193109, "balance_loss_clip": 1.00728559, "balance_loss_mlp": 1.0001812, "epoch": 0.9497985931581795, "flos": 14866883713440.0, "grad_norm": 2.3719840339239378, "language_loss": 0.71848595, "learning_rate": 2.6320887479228228e-08, "loss": 0.74316037, "num_input_tokens_seen": 170674415, "step": 7899, "time_per_iteration": 2.9026830196380615 }, { "auxiliary_loss_clip": 0.0131269, "auxiliary_loss_mlp": 0.01193093, "balance_loss_clip": 1.00756812, "balance_loss_mlp": 1.00016475, "epoch": 0.9499188360488187, "flos": 27193329961920.0, "grad_norm": 2.480566641320235, "language_loss": 0.72496945, "learning_rate": 2.619507554239786e-08, "loss": 0.7500273, "num_input_tokens_seen": 170692975, "step": 7900, "time_per_iteration": 2.9136435985565186 }, { "auxiliary_loss_clip": 0.01311506, "auxiliary_loss_mlp": 0.01193047, "balance_loss_clip": 1.00770319, "balance_loss_mlp": 1.00011897, "epoch": 0.9500390789394577, "flos": 24316491553920.0, "grad_norm": 1.5303221385293064, "language_loss": 0.69581705, "learning_rate": 2.606956303020502e-08, "loss": 0.72086263, "num_input_tokens_seen": 170713780, "step": 7901, "time_per_iteration": 2.8380231857299805 }, { "auxiliary_loss_clip": 0.01326661, "auxiliary_loss_mlp": 0.01193173, "balance_loss_clip": 1.00735188, "balance_loss_mlp": 1.00014973, "epoch": 0.9501593218300968, "flos": 14354761040640.0, "grad_norm": 1.6488784275018533, "language_loss": 0.84299403, "learning_rate": 2.5944349961690036e-08, "loss": 0.86819232, "num_input_tokens_seen": 170730800, "step": 7902, "time_per_iteration": 2.7818641662597656 }, { "auxiliary_loss_clip": 0.01285015, "auxiliary_loss_mlp": 0.01193026, "balance_loss_clip": 1.00629354, "balance_loss_mlp": 1.00009751, "epoch": 0.9502795647207359, "flos": 38728129285440.0, "grad_norm": 2.1612602327209074, "language_loss": 0.72914112, "learning_rate": 2.581943635584749e-08, "loss": 0.75392151, "num_input_tokens_seen": 170753630, "step": 7903, "time_per_iteration": 2.934218168258667 }, { "auxiliary_loss_clip": 0.01313535, "auxiliary_loss_mlp": 0.01192881, "balance_loss_clip": 1.00697529, "balance_loss_mlp": 1.00014377, "epoch": 0.950399807611375, "flos": 40808039987520.0, "grad_norm": 1.5267612223378952, "language_loss": 0.65279669, "learning_rate": 2.569482223162689e-08, "loss": 0.67786086, "num_input_tokens_seen": 170777605, "step": 7904, "time_per_iteration": 2.9191787242889404 }, { "auxiliary_loss_clip": 0.01324965, "auxiliary_loss_mlp": 0.01193163, "balance_loss_clip": 1.00713849, "balance_loss_mlp": 1.00013888, "epoch": 0.950520050502014, "flos": 23440414475520.0, "grad_norm": 1.604667548955669, "language_loss": 0.72438765, "learning_rate": 2.5570507607932e-08, "loss": 0.74956894, "num_input_tokens_seen": 170797520, "step": 7905, "time_per_iteration": 2.7690985202789307 }, { "auxiliary_loss_clip": 0.0133576, "auxiliary_loss_mlp": 0.01193108, "balance_loss_clip": 1.00770891, "balance_loss_mlp": 1.00017953, "epoch": 0.9506402933926532, "flos": 17783727891840.0, "grad_norm": 4.034936017489844, "language_loss": 0.63650727, "learning_rate": 2.54464925036213e-08, "loss": 0.66179603, "num_input_tokens_seen": 170814810, "step": 7906, "time_per_iteration": 2.6621291637420654 }, { "auxiliary_loss_clip": 0.01324996, "auxiliary_loss_mlp": 0.0119298, "balance_loss_clip": 1.00741458, "balance_loss_mlp": 1.00014734, "epoch": 0.9507605362832923, "flos": 32561942457600.0, "grad_norm": 2.1832275429438623, "language_loss": 0.61019123, "learning_rate": 2.532277693750773e-08, "loss": 0.63537103, "num_input_tokens_seen": 170835735, "step": 7907, "time_per_iteration": 2.851630926132202 }, { "auxiliary_loss_clip": 0.01274107, "auxiliary_loss_mlp": 0.01193085, "balance_loss_clip": 1.00805163, "balance_loss_mlp": 1.00015664, "epoch": 0.9508807791739313, "flos": 19602062972640.0, "grad_norm": 1.8194352543745473, "language_loss": 0.75569266, "learning_rate": 2.5199360928358948e-08, "loss": 0.78036457, "num_input_tokens_seen": 170852970, "step": 7908, "time_per_iteration": 2.7762973308563232 }, { "auxiliary_loss_clip": 0.01333296, "auxiliary_loss_mlp": 0.00872422, "balance_loss_clip": 1.00735497, "balance_loss_mlp": 1.00038528, "epoch": 0.9510010220645704, "flos": 21471863477760.0, "grad_norm": 1.665063216022236, "language_loss": 0.8693589, "learning_rate": 2.507624449489665e-08, "loss": 0.89141607, "num_input_tokens_seen": 170871600, "step": 7909, "time_per_iteration": 2.776085376739502 }, { "auxiliary_loss_clip": 0.01306163, "auxiliary_loss_mlp": 0.01192879, "balance_loss_clip": 1.00715685, "balance_loss_mlp": 1.00014114, "epoch": 0.9511212649552095, "flos": 18880009721280.0, "grad_norm": 1.7725922202545037, "language_loss": 0.64679164, "learning_rate": 2.495342765579811e-08, "loss": 0.67178208, "num_input_tokens_seen": 170890260, "step": 7910, "time_per_iteration": 2.739112138748169 }, { "auxiliary_loss_clip": 0.01264402, "auxiliary_loss_mlp": 0.01193094, "balance_loss_clip": 1.00647056, "balance_loss_mlp": 1.00016606, "epoch": 0.9512415078458486, "flos": 20810530751040.0, "grad_norm": 1.7957092154627976, "language_loss": 0.70794916, "learning_rate": 2.4830910429693984e-08, "loss": 0.73252416, "num_input_tokens_seen": 170910220, "step": 7911, "time_per_iteration": 2.930161237716675 }, { "auxiliary_loss_clip": 0.01347207, "auxiliary_loss_mlp": 0.01192977, "balance_loss_clip": 1.00749218, "balance_loss_mlp": 1.000144, "epoch": 0.9513617507364877, "flos": 18369575461440.0, "grad_norm": 1.9196754482337477, "language_loss": 0.79394186, "learning_rate": 2.470869283517052e-08, "loss": 0.81934369, "num_input_tokens_seen": 170928255, "step": 7912, "time_per_iteration": 2.7768585681915283 }, { "auxiliary_loss_clip": 0.01335087, "auxiliary_loss_mlp": 0.01193075, "balance_loss_clip": 1.00777757, "balance_loss_mlp": 1.00014722, "epoch": 0.9514819936271268, "flos": 25010178307200.0, "grad_norm": 1.5312665818507925, "language_loss": 0.77018082, "learning_rate": 2.458677489076777e-08, "loss": 0.79546243, "num_input_tokens_seen": 170949265, "step": 7913, "time_per_iteration": 2.8394675254821777 }, { "auxiliary_loss_clip": 0.01335376, "auxiliary_loss_mlp": 0.01192898, "balance_loss_clip": 1.00763953, "balance_loss_mlp": 1.00015998, "epoch": 0.9516022365177659, "flos": 18662140009440.0, "grad_norm": 1.5721294329373254, "language_loss": 0.82684565, "learning_rate": 2.446515661498072e-08, "loss": 0.85212839, "num_input_tokens_seen": 170968595, "step": 7914, "time_per_iteration": 3.1104745864868164 }, { "auxiliary_loss_clip": 0.01255999, "auxiliary_loss_mlp": 0.01192994, "balance_loss_clip": 1.0068469, "balance_loss_mlp": 1.00016141, "epoch": 0.9517224794084049, "flos": 25372120986720.0, "grad_norm": 3.3989185898736354, "language_loss": 0.74491721, "learning_rate": 2.434383802625861e-08, "loss": 0.76940715, "num_input_tokens_seen": 170987550, "step": 7915, "time_per_iteration": 2.955808639526367 }, { "auxiliary_loss_clip": 0.01302237, "auxiliary_loss_mlp": 0.01193184, "balance_loss_clip": 1.0071609, "balance_loss_mlp": 1.00016069, "epoch": 0.9518427222990441, "flos": 21470929462080.0, "grad_norm": 2.216383990558234, "language_loss": 0.73551202, "learning_rate": 2.4222819143005168e-08, "loss": 0.76046622, "num_input_tokens_seen": 171007145, "step": 7916, "time_per_iteration": 3.8341264724731445 }, { "auxiliary_loss_clip": 0.01347754, "auxiliary_loss_mlp": 0.01193066, "balance_loss_clip": 1.00832725, "balance_loss_mlp": 1.00013757, "epoch": 0.9519629651896832, "flos": 21033645320160.0, "grad_norm": 1.9532506544852737, "language_loss": 0.80770195, "learning_rate": 2.4102099983579706e-08, "loss": 0.83311015, "num_input_tokens_seen": 171026295, "step": 7917, "time_per_iteration": 3.7898592948913574 }, { "auxiliary_loss_clip": 0.01336473, "auxiliary_loss_mlp": 0.01193224, "balance_loss_clip": 1.00801706, "balance_loss_mlp": 1.00020051, "epoch": 0.9520832080803222, "flos": 21689230258080.0, "grad_norm": 1.5517122863184143, "language_loss": 0.77141243, "learning_rate": 2.3981680566294236e-08, "loss": 0.79670942, "num_input_tokens_seen": 171045895, "step": 7918, "time_per_iteration": 2.756014585494995 }, { "auxiliary_loss_clip": 0.01346434, "auxiliary_loss_mlp": 0.01192807, "balance_loss_clip": 1.00750911, "balance_loss_mlp": 1.00016499, "epoch": 0.9522034509709614, "flos": 23145299346240.0, "grad_norm": 2.3688691447419647, "language_loss": 0.73598087, "learning_rate": 2.3861560909416822e-08, "loss": 0.76137322, "num_input_tokens_seen": 171065445, "step": 7919, "time_per_iteration": 3.69897723197937 }, { "auxiliary_loss_clip": 0.0126349, "auxiliary_loss_mlp": 0.01193239, "balance_loss_clip": 1.00705934, "balance_loss_mlp": 1.00021493, "epoch": 0.9523236938616004, "flos": 24679440096480.0, "grad_norm": 1.646695484295272, "language_loss": 0.82542956, "learning_rate": 2.3741741031169325e-08, "loss": 0.84999692, "num_input_tokens_seen": 171085015, "step": 7920, "time_per_iteration": 2.9055745601654053 }, { "auxiliary_loss_clip": 0.01270704, "auxiliary_loss_mlp": 0.0119316, "balance_loss_clip": 1.00619185, "balance_loss_mlp": 1.00013638, "epoch": 0.9524439367522395, "flos": 22672320275520.0, "grad_norm": 1.6972906361578157, "language_loss": 0.71487594, "learning_rate": 2.3622220949728544e-08, "loss": 0.73951459, "num_input_tokens_seen": 171103900, "step": 7921, "time_per_iteration": 2.875546455383301 }, { "auxiliary_loss_clip": 0.01336203, "auxiliary_loss_mlp": 0.01193205, "balance_loss_clip": 1.00791228, "balance_loss_mlp": 1.00018108, "epoch": 0.9525641796428787, "flos": 34055538582240.0, "grad_norm": 2.4610486601081125, "language_loss": 0.61131322, "learning_rate": 2.3503000683225526e-08, "loss": 0.63660729, "num_input_tokens_seen": 171121615, "step": 7922, "time_per_iteration": 2.907595634460449 }, { "auxiliary_loss_clip": 0.01347876, "auxiliary_loss_mlp": 0.0119322, "balance_loss_clip": 1.00771642, "balance_loss_mlp": 1.00019646, "epoch": 0.9526844225335177, "flos": 16727092596000.0, "grad_norm": 2.0116920058876153, "language_loss": 0.84183478, "learning_rate": 2.3384080249745585e-08, "loss": 0.86724579, "num_input_tokens_seen": 171139505, "step": 7923, "time_per_iteration": 2.7421021461486816 }, { "auxiliary_loss_clip": 0.01284924, "auxiliary_loss_mlp": 0.01193163, "balance_loss_clip": 1.00683498, "balance_loss_mlp": 1.00013936, "epoch": 0.9528046654241568, "flos": 36939382107840.0, "grad_norm": 2.158076285952917, "language_loss": 0.82584977, "learning_rate": 2.3265459667329178e-08, "loss": 0.85063064, "num_input_tokens_seen": 171158995, "step": 7924, "time_per_iteration": 3.0500261783599854 }, { "auxiliary_loss_clip": 0.0130794, "auxiliary_loss_mlp": 0.01192953, "balance_loss_clip": 1.00681603, "balance_loss_mlp": 1.0001204, "epoch": 0.9529249083147959, "flos": 18255018549600.0, "grad_norm": 2.0918899580704755, "language_loss": 0.86406493, "learning_rate": 2.31471389539708e-08, "loss": 0.88907385, "num_input_tokens_seen": 171176120, "step": 7925, "time_per_iteration": 2.914233446121216 }, { "auxiliary_loss_clip": 0.01333491, "auxiliary_loss_mlp": 0.0087238, "balance_loss_clip": 1.00788856, "balance_loss_mlp": 1.0004456, "epoch": 0.953045151205435, "flos": 28658451817440.0, "grad_norm": 2.43912475510351, "language_loss": 0.72861797, "learning_rate": 2.3029118127619872e-08, "loss": 0.75067663, "num_input_tokens_seen": 171195835, "step": 7926, "time_per_iteration": 2.891077995300293 }, { "auxiliary_loss_clip": 0.01311796, "auxiliary_loss_mlp": 0.01193181, "balance_loss_clip": 1.007195, "balance_loss_mlp": 1.00015712, "epoch": 0.953165394096074, "flos": 21835243104480.0, "grad_norm": 1.9660731348111513, "language_loss": 0.8701092, "learning_rate": 2.2911397206179628e-08, "loss": 0.89515889, "num_input_tokens_seen": 171212585, "step": 7927, "time_per_iteration": 2.793295383453369 }, { "auxiliary_loss_clip": 0.01347694, "auxiliary_loss_mlp": 0.01193155, "balance_loss_clip": 1.00821292, "balance_loss_mlp": 1.00013113, "epoch": 0.9532856369867132, "flos": 19975070145600.0, "grad_norm": 1.7020235157100327, "language_loss": 0.62357879, "learning_rate": 2.279397620750845e-08, "loss": 0.64898729, "num_input_tokens_seen": 171231630, "step": 7928, "time_per_iteration": 2.9411187171936035 }, { "auxiliary_loss_clip": 0.01312315, "auxiliary_loss_mlp": 0.01192986, "balance_loss_clip": 1.00707352, "balance_loss_mlp": 1.0001533, "epoch": 0.9534058798773523, "flos": 15049597351680.0, "grad_norm": 1.9161152429761596, "language_loss": 0.78504729, "learning_rate": 2.2676855149419195e-08, "loss": 0.81010032, "num_input_tokens_seen": 171248800, "step": 7929, "time_per_iteration": 2.769664764404297 }, { "auxiliary_loss_clip": 0.01301755, "auxiliary_loss_mlp": 0.01193194, "balance_loss_clip": 1.00763023, "balance_loss_mlp": 1.00017023, "epoch": 0.9535261227679913, "flos": 17602810437600.0, "grad_norm": 2.2949306547481947, "language_loss": 0.74860299, "learning_rate": 2.2560034049678988e-08, "loss": 0.77355254, "num_input_tokens_seen": 171263150, "step": 7930, "time_per_iteration": 2.7550928592681885 }, { "auxiliary_loss_clip": 0.01348887, "auxiliary_loss_mlp": 0.01193172, "balance_loss_clip": 1.0083878, "balance_loss_mlp": 1.00014806, "epoch": 0.9536463656586305, "flos": 23142964307040.0, "grad_norm": 1.5520371597548754, "language_loss": 0.75255197, "learning_rate": 2.2443512926008988e-08, "loss": 0.77797258, "num_input_tokens_seen": 171282480, "step": 7931, "time_per_iteration": 2.7547872066497803 }, { "auxiliary_loss_clip": 0.01298479, "auxiliary_loss_mlp": 0.01192988, "balance_loss_clip": 1.00745666, "balance_loss_mlp": 1.00015521, "epoch": 0.9537666085492695, "flos": 18625044057120.0, "grad_norm": 2.148369883379328, "language_loss": 0.69858789, "learning_rate": 2.2327291796085946e-08, "loss": 0.72350264, "num_input_tokens_seen": 171300840, "step": 7932, "time_per_iteration": 2.778353214263916 }, { "auxiliary_loss_clip": 0.01348254, "auxiliary_loss_mlp": 0.01193006, "balance_loss_clip": 1.00798619, "balance_loss_mlp": 1.00017262, "epoch": 0.9538868514399086, "flos": 18989357699520.0, "grad_norm": 2.884164050132151, "language_loss": 0.77064157, "learning_rate": 2.2211370677540197e-08, "loss": 0.79605412, "num_input_tokens_seen": 171317365, "step": 7933, "time_per_iteration": 2.694207191467285 }, { "auxiliary_loss_clip": 0.01348382, "auxiliary_loss_mlp": 0.011931, "balance_loss_clip": 1.00791347, "balance_loss_mlp": 1.00017202, "epoch": 0.9540070943305478, "flos": 16800566027040.0, "grad_norm": 2.3008944206312347, "language_loss": 0.77984488, "learning_rate": 2.2095749587957012e-08, "loss": 0.80525976, "num_input_tokens_seen": 171335270, "step": 7934, "time_per_iteration": 2.6770260334014893 }, { "auxiliary_loss_clip": 0.0132355, "auxiliary_loss_mlp": 0.0119303, "balance_loss_clip": 1.00784385, "balance_loss_mlp": 1.0001018, "epoch": 0.9541273372211868, "flos": 20156921615520.0, "grad_norm": 2.870291173316631, "language_loss": 0.69062126, "learning_rate": 2.1980428544876138e-08, "loss": 0.71578705, "num_input_tokens_seen": 171353910, "step": 7935, "time_per_iteration": 2.7838261127471924 }, { "auxiliary_loss_clip": 0.01299671, "auxiliary_loss_mlp": 0.01193207, "balance_loss_clip": 1.00760615, "balance_loss_mlp": 1.00018358, "epoch": 0.9542475801118259, "flos": 26725523901120.0, "grad_norm": 1.4799654614847961, "language_loss": 0.74389744, "learning_rate": 2.1865407565791584e-08, "loss": 0.76882625, "num_input_tokens_seen": 171375480, "step": 7936, "time_per_iteration": 2.8670432567596436 }, { "auxiliary_loss_clip": 0.01311969, "auxiliary_loss_mlp": 0.01193023, "balance_loss_clip": 1.00691187, "balance_loss_mlp": 1.0001899, "epoch": 0.954367823002465, "flos": 23330922802560.0, "grad_norm": 1.7764006231825624, "language_loss": 0.7715348, "learning_rate": 2.175068666815183e-08, "loss": 0.79658467, "num_input_tokens_seen": 171396320, "step": 7937, "time_per_iteration": 2.786928653717041 }, { "auxiliary_loss_clip": 0.01302974, "auxiliary_loss_mlp": 0.0119299, "balance_loss_clip": 1.00751817, "balance_loss_mlp": 1.00015688, "epoch": 0.9544880658931041, "flos": 14902722336960.0, "grad_norm": 1.9786502408104536, "language_loss": 0.78466344, "learning_rate": 2.163626586935985e-08, "loss": 0.809623, "num_input_tokens_seen": 171412860, "step": 7938, "time_per_iteration": 2.8094112873077393 }, { "auxiliary_loss_clip": 0.01335794, "auxiliary_loss_mlp": 0.01193115, "balance_loss_clip": 1.00747907, "balance_loss_mlp": 1.00018716, "epoch": 0.9546083087837431, "flos": 29095915577760.0, "grad_norm": 2.7141630504438607, "language_loss": 0.63233531, "learning_rate": 2.1522145186773755e-08, "loss": 0.65762442, "num_input_tokens_seen": 171431780, "step": 7939, "time_per_iteration": 2.850344181060791 }, { "auxiliary_loss_clip": 0.01310307, "auxiliary_loss_mlp": 0.01193063, "balance_loss_clip": 1.00693703, "balance_loss_mlp": 1.00013483, "epoch": 0.9547285516743822, "flos": 21142346672160.0, "grad_norm": 1.6288225933883211, "language_loss": 0.8551985, "learning_rate": 2.140832463770481e-08, "loss": 0.88023221, "num_input_tokens_seen": 171450975, "step": 7940, "time_per_iteration": 2.772066831588745 }, { "auxiliary_loss_clip": 0.01318847, "auxiliary_loss_mlp": 0.01193011, "balance_loss_clip": 1.00740504, "balance_loss_mlp": 1.00017762, "epoch": 0.9548487945650214, "flos": 27490169427840.0, "grad_norm": 2.0206463185436982, "language_loss": 0.76003259, "learning_rate": 2.129480423941987e-08, "loss": 0.78515112, "num_input_tokens_seen": 171467645, "step": 7941, "time_per_iteration": 2.7456793785095215 }, { "auxiliary_loss_clip": 0.01305769, "auxiliary_loss_mlp": 0.01193081, "balance_loss_clip": 1.00824976, "balance_loss_mlp": 1.00015318, "epoch": 0.9549690374556604, "flos": 22273209796320.0, "grad_norm": 1.609382513471954, "language_loss": 0.80331576, "learning_rate": 2.1181584009140052e-08, "loss": 0.82830429, "num_input_tokens_seen": 171487185, "step": 7942, "time_per_iteration": 3.6508946418762207 }, { "auxiliary_loss_clip": 0.0130516, "auxiliary_loss_mlp": 0.01193089, "balance_loss_clip": 1.00715446, "balance_loss_mlp": 1.00016046, "epoch": 0.9550892803462995, "flos": 17595302388480.0, "grad_norm": 1.868458859981593, "language_loss": 0.83588499, "learning_rate": 2.10686639640405e-08, "loss": 0.8608675, "num_input_tokens_seen": 171501275, "step": 7943, "time_per_iteration": 3.716773748397827 }, { "auxiliary_loss_clip": 0.01319207, "auxiliary_loss_mlp": 0.01193217, "balance_loss_clip": 1.00748634, "balance_loss_mlp": 1.0001936, "epoch": 0.9552095232369386, "flos": 24353156422080.0, "grad_norm": 1.6691127810796167, "language_loss": 0.81205821, "learning_rate": 2.0956044121251294e-08, "loss": 0.8371824, "num_input_tokens_seen": 171520060, "step": 7944, "time_per_iteration": 2.7147364616394043 }, { "auxiliary_loss_clip": 0.01280784, "auxiliary_loss_mlp": 0.01193171, "balance_loss_clip": 1.00683427, "balance_loss_mlp": 1.00014746, "epoch": 0.9553297661275777, "flos": 22746871416960.0, "grad_norm": 1.8794845879469586, "language_loss": 0.80887973, "learning_rate": 2.084372449785654e-08, "loss": 0.83361936, "num_input_tokens_seen": 171539895, "step": 7945, "time_per_iteration": 3.706997871398926 }, { "auxiliary_loss_clip": 0.01323638, "auxiliary_loss_mlp": 0.01193045, "balance_loss_clip": 1.00837779, "balance_loss_mlp": 1.00011671, "epoch": 0.9554500090182168, "flos": 15413875070400.0, "grad_norm": 1.6274612621782136, "language_loss": 0.68848526, "learning_rate": 2.0731705110895282e-08, "loss": 0.71365201, "num_input_tokens_seen": 171557385, "step": 7946, "time_per_iteration": 2.7637557983398438 }, { "auxiliary_loss_clip": 0.01325888, "auxiliary_loss_mlp": 0.01193211, "balance_loss_clip": 1.00761855, "balance_loss_mlp": 1.00018716, "epoch": 0.9555702519088559, "flos": 23513528669760.0, "grad_norm": 2.342176636626769, "language_loss": 0.86673123, "learning_rate": 2.0619985977360587e-08, "loss": 0.89192224, "num_input_tokens_seen": 171575705, "step": 7947, "time_per_iteration": 2.732273817062378 }, { "auxiliary_loss_clip": 0.01285305, "auxiliary_loss_mlp": 0.01193071, "balance_loss_clip": 1.00758934, "balance_loss_mlp": 1.00014281, "epoch": 0.955690494799495, "flos": 22962082776480.0, "grad_norm": 1.76375318340877, "language_loss": 0.76865816, "learning_rate": 2.0508567114200237e-08, "loss": 0.79344189, "num_input_tokens_seen": 171595620, "step": 7948, "time_per_iteration": 2.791041374206543 }, { "auxiliary_loss_clip": 0.01316336, "auxiliary_loss_mlp": 0.01192928, "balance_loss_clip": 1.00748694, "balance_loss_mlp": 1.00019073, "epoch": 0.955810737690134, "flos": 26031262368960.0, "grad_norm": 1.742193692995385, "language_loss": 0.78598613, "learning_rate": 2.0397448538316485e-08, "loss": 0.81107879, "num_input_tokens_seen": 171616660, "step": 7949, "time_per_iteration": 2.8161585330963135 }, { "auxiliary_loss_clip": 0.01297525, "auxiliary_loss_mlp": 0.01193103, "balance_loss_clip": 1.0065937, "balance_loss_mlp": 1.0001744, "epoch": 0.9559309805807732, "flos": 20849961742560.0, "grad_norm": 2.429255011522086, "language_loss": 0.66889441, "learning_rate": 2.028663026656563e-08, "loss": 0.69380069, "num_input_tokens_seen": 171635515, "step": 7950, "time_per_iteration": 2.879469633102417 }, { "auxiliary_loss_clip": 0.01346588, "auxiliary_loss_mlp": 0.00872452, "balance_loss_clip": 1.00762081, "balance_loss_mlp": 1.00046659, "epoch": 0.9560512234714122, "flos": 21578229790560.0, "grad_norm": 1.8179826262795862, "language_loss": 0.71675754, "learning_rate": 2.0176112315758885e-08, "loss": 0.73894793, "num_input_tokens_seen": 171653305, "step": 7951, "time_per_iteration": 2.702577590942383 }, { "auxiliary_loss_clip": 0.01279741, "auxiliary_loss_mlp": 0.0119322, "balance_loss_clip": 1.00682461, "balance_loss_mlp": 1.00019598, "epoch": 0.9561714663620513, "flos": 17450151710400.0, "grad_norm": 3.9905827490359065, "language_loss": 0.69074619, "learning_rate": 2.0065894702661957e-08, "loss": 0.71547574, "num_input_tokens_seen": 171669980, "step": 7952, "time_per_iteration": 2.8133902549743652 }, { "auxiliary_loss_clip": 0.01298391, "auxiliary_loss_mlp": 0.00872501, "balance_loss_clip": 1.00731993, "balance_loss_mlp": 1.0004077, "epoch": 0.9562917092526905, "flos": 26098521003360.0, "grad_norm": 1.669667858725456, "language_loss": 0.77816296, "learning_rate": 1.9955977443994577e-08, "loss": 0.79987186, "num_input_tokens_seen": 171689970, "step": 7953, "time_per_iteration": 2.8388431072235107 }, { "auxiliary_loss_clip": 0.01314541, "auxiliary_loss_mlp": 0.01193143, "balance_loss_clip": 1.00749755, "balance_loss_mlp": 1.00021505, "epoch": 0.9564119521433295, "flos": 24096753810720.0, "grad_norm": 1.98828054124749, "language_loss": 0.61792934, "learning_rate": 1.9846360556430965e-08, "loss": 0.64300621, "num_input_tokens_seen": 171708270, "step": 7954, "time_per_iteration": 2.7631452083587646 }, { "auxiliary_loss_clip": 0.01347365, "auxiliary_loss_mlp": 0.01193095, "balance_loss_clip": 1.00779843, "balance_loss_mlp": 1.00016642, "epoch": 0.9565321950339686, "flos": 32008915922400.0, "grad_norm": 2.3401095072419706, "language_loss": 0.61764431, "learning_rate": 1.973704405660004e-08, "loss": 0.64304888, "num_input_tokens_seen": 171729385, "step": 7955, "time_per_iteration": 2.7703821659088135 }, { "auxiliary_loss_clip": 0.01247206, "auxiliary_loss_mlp": 0.01193075, "balance_loss_clip": 1.00548351, "balance_loss_mlp": 1.00014687, "epoch": 0.9566524379246077, "flos": 23588654590080.0, "grad_norm": 1.4579891129776816, "language_loss": 0.77987301, "learning_rate": 1.9628027961085203e-08, "loss": 0.80427587, "num_input_tokens_seen": 171752615, "step": 7956, "time_per_iteration": 2.889003276824951 }, { "auxiliary_loss_clip": 0.01299761, "auxiliary_loss_mlp": 0.01192992, "balance_loss_clip": 1.00712252, "balance_loss_mlp": 1.00015926, "epoch": 0.9567726808152468, "flos": 38067730574400.0, "grad_norm": 1.8241223909609199, "language_loss": 0.84095609, "learning_rate": 1.9519312286423894e-08, "loss": 0.86588371, "num_input_tokens_seen": 171775810, "step": 7957, "time_per_iteration": 2.918762445449829 }, { "auxiliary_loss_clip": 0.01323288, "auxiliary_loss_mlp": 0.01192977, "balance_loss_clip": 1.00727654, "balance_loss_mlp": 1.00014436, "epoch": 0.9568929237058859, "flos": 22744069369920.0, "grad_norm": 1.6015603624792127, "language_loss": 0.77625811, "learning_rate": 1.9410897049108255e-08, "loss": 0.80142075, "num_input_tokens_seen": 171795090, "step": 7958, "time_per_iteration": 2.719416618347168 }, { "auxiliary_loss_clip": 0.01350149, "auxiliary_loss_mlp": 0.01193092, "balance_loss_clip": 1.00847077, "balance_loss_mlp": 1.0001632, "epoch": 0.957013166596525, "flos": 23841644451840.0, "grad_norm": 2.274855327448688, "language_loss": 0.91044515, "learning_rate": 1.9302782265584905e-08, "loss": 0.93587756, "num_input_tokens_seen": 171815755, "step": 7959, "time_per_iteration": 2.7158641815185547 }, { "auxiliary_loss_clip": 0.01267895, "auxiliary_loss_mlp": 0.01192772, "balance_loss_clip": 1.00695217, "balance_loss_mlp": 1.00012946, "epoch": 0.9571334094871641, "flos": 17639295687360.0, "grad_norm": 2.1925836112808312, "language_loss": 0.87000453, "learning_rate": 1.9194967952254282e-08, "loss": 0.89461124, "num_input_tokens_seen": 171834330, "step": 7960, "time_per_iteration": 2.805774450302124 }, { "auxiliary_loss_clip": 0.01324204, "auxiliary_loss_mlp": 0.01192878, "balance_loss_clip": 1.00751817, "balance_loss_mlp": 1.00014067, "epoch": 0.9572536523778031, "flos": 15369630305760.0, "grad_norm": 2.1058470209888074, "language_loss": 0.80815959, "learning_rate": 1.9087454125472635e-08, "loss": 0.83333039, "num_input_tokens_seen": 171848805, "step": 7961, "time_per_iteration": 2.7701785564422607 }, { "auxiliary_loss_clip": 0.01347838, "auxiliary_loss_mlp": 0.01192986, "balance_loss_clip": 1.00786018, "balance_loss_mlp": 1.00015318, "epoch": 0.9573738952684423, "flos": 24969849223680.0, "grad_norm": 1.7096010575271519, "language_loss": 0.78530335, "learning_rate": 1.8980240801548696e-08, "loss": 0.81071156, "num_input_tokens_seen": 171867995, "step": 7962, "time_per_iteration": 2.7474424839019775 }, { "auxiliary_loss_clip": 0.01299962, "auxiliary_loss_mlp": 0.01193122, "balance_loss_clip": 1.0072701, "balance_loss_mlp": 1.00019324, "epoch": 0.9574941381590814, "flos": 25769471205600.0, "grad_norm": 1.8906677768818498, "language_loss": 0.74020231, "learning_rate": 1.8873327996747458e-08, "loss": 0.76513314, "num_input_tokens_seen": 171886495, "step": 7963, "time_per_iteration": 2.7512104511260986 }, { "auxiliary_loss_clip": 0.01336054, "auxiliary_loss_mlp": 0.01192996, "balance_loss_clip": 1.00783443, "balance_loss_mlp": 1.0001626, "epoch": 0.9576143810497204, "flos": 32307192335520.0, "grad_norm": 1.8261335320873895, "language_loss": 0.6587171, "learning_rate": 1.8766715727287053e-08, "loss": 0.68400764, "num_input_tokens_seen": 171908200, "step": 7964, "time_per_iteration": 2.8760013580322266 }, { "auxiliary_loss_clip": 0.01331504, "auxiliary_loss_mlp": 0.00872598, "balance_loss_clip": 1.00737166, "balance_loss_mlp": 1.00045562, "epoch": 0.9577346239403596, "flos": 27745745794560.0, "grad_norm": 1.721400235298662, "language_loss": 0.79424083, "learning_rate": 1.8660404009340546e-08, "loss": 0.81628186, "num_input_tokens_seen": 171928650, "step": 7965, "time_per_iteration": 2.7416415214538574 }, { "auxiliary_loss_clip": 0.01303805, "auxiliary_loss_mlp": 0.01192267, "balance_loss_clip": 1.00354254, "balance_loss_mlp": 1.00000596, "epoch": 0.9578548668309986, "flos": 57468340956960.0, "grad_norm": 0.8680365846447157, "language_loss": 0.5955236, "learning_rate": 1.8554392859035485e-08, "loss": 0.62048435, "num_input_tokens_seen": 171986400, "step": 7966, "time_per_iteration": 3.327317953109741 }, { "auxiliary_loss_clip": 0.01234971, "auxiliary_loss_mlp": 0.01193111, "balance_loss_clip": 1.00595522, "balance_loss_mlp": 1.00018263, "epoch": 0.9579751097216377, "flos": 19756050876000.0, "grad_norm": 1.7687134638712674, "language_loss": 0.79067361, "learning_rate": 1.8448682292453444e-08, "loss": 0.81495452, "num_input_tokens_seen": 172005475, "step": 7967, "time_per_iteration": 2.9283511638641357 }, { "auxiliary_loss_clip": 0.01348395, "auxiliary_loss_mlp": 0.01193096, "balance_loss_clip": 1.00823009, "balance_loss_mlp": 1.00016761, "epoch": 0.9580953526122769, "flos": 18041280060960.0, "grad_norm": 1.7580629054921948, "language_loss": 0.65792567, "learning_rate": 1.8343272325631154e-08, "loss": 0.68334055, "num_input_tokens_seen": 172024420, "step": 7968, "time_per_iteration": 3.7586629390716553 }, { "auxiliary_loss_clip": 0.01240527, "auxiliary_loss_mlp": 0.00872565, "balance_loss_clip": 1.00666034, "balance_loss_mlp": 1.00051785, "epoch": 0.9582155955029159, "flos": 24270163215840.0, "grad_norm": 2.1019432817615353, "language_loss": 0.78114998, "learning_rate": 1.8238162974558492e-08, "loss": 0.8022809, "num_input_tokens_seen": 172038350, "step": 7969, "time_per_iteration": 3.921724319458008 }, { "auxiliary_loss_clip": 0.013038, "auxiliary_loss_mlp": 0.01193186, "balance_loss_clip": 1.00680995, "balance_loss_mlp": 1.00016248, "epoch": 0.958335838393555, "flos": 22783320743040.0, "grad_norm": 1.9238983202796343, "language_loss": 0.74604183, "learning_rate": 1.8133354255181144e-08, "loss": 0.77101171, "num_input_tokens_seen": 172058665, "step": 7970, "time_per_iteration": 2.7583601474761963 }, { "auxiliary_loss_clip": 0.01335671, "auxiliary_loss_mlp": 0.01192957, "balance_loss_clip": 1.00749695, "balance_loss_mlp": 1.00012374, "epoch": 0.958456081284194, "flos": 16911494647200.0, "grad_norm": 1.7493472194854758, "language_loss": 0.74218917, "learning_rate": 1.802884618339795e-08, "loss": 0.76747549, "num_input_tokens_seen": 172077470, "step": 7971, "time_per_iteration": 3.756880521774292 }, { "auxiliary_loss_clip": 0.01325387, "auxiliary_loss_mlp": 0.01193218, "balance_loss_clip": 1.00764191, "balance_loss_mlp": 1.00019455, "epoch": 0.9585763241748332, "flos": 19974962374560.0, "grad_norm": 1.8788379428105844, "language_loss": 0.81003737, "learning_rate": 1.7924638775062894e-08, "loss": 0.83522344, "num_input_tokens_seen": 172096590, "step": 7972, "time_per_iteration": 2.713695764541626 }, { "auxiliary_loss_clip": 0.01276675, "auxiliary_loss_mlp": 0.01192867, "balance_loss_clip": 1.00617743, "balance_loss_mlp": 1.0001297, "epoch": 0.9586965670654722, "flos": 21395659847040.0, "grad_norm": 1.9828702846898203, "language_loss": 0.81558979, "learning_rate": 1.7820732045984444e-08, "loss": 0.84028518, "num_input_tokens_seen": 172116735, "step": 7973, "time_per_iteration": 2.8486976623535156 }, { "auxiliary_loss_clip": 0.01336929, "auxiliary_loss_mlp": 0.01193129, "balance_loss_clip": 1.00849295, "balance_loss_mlp": 1.00020051, "epoch": 0.9588168099561113, "flos": 21435126762240.0, "grad_norm": 1.935772626224602, "language_loss": 0.74013078, "learning_rate": 1.7717126011924655e-08, "loss": 0.76543134, "num_input_tokens_seen": 172138320, "step": 7974, "time_per_iteration": 2.7182364463806152 }, { "auxiliary_loss_clip": 0.01287001, "auxiliary_loss_mlp": 0.01193124, "balance_loss_clip": 1.00804758, "balance_loss_mlp": 1.0001955, "epoch": 0.9589370528467505, "flos": 11763769452480.0, "grad_norm": 2.354492361444599, "language_loss": 0.76899505, "learning_rate": 1.7613820688600957e-08, "loss": 0.7937963, "num_input_tokens_seen": 172154225, "step": 7975, "time_per_iteration": 2.7981767654418945 }, { "auxiliary_loss_clip": 0.01307839, "auxiliary_loss_mlp": 0.01193182, "balance_loss_clip": 1.00775599, "balance_loss_mlp": 1.00015855, "epoch": 0.9590572957373895, "flos": 23441528109600.0, "grad_norm": 1.7084034488345354, "language_loss": 0.78671724, "learning_rate": 1.7510816091684588e-08, "loss": 0.81172746, "num_input_tokens_seen": 172174150, "step": 7976, "time_per_iteration": 2.7461912631988525 }, { "auxiliary_loss_clip": 0.01303039, "auxiliary_loss_mlp": 0.01193181, "balance_loss_clip": 1.00686765, "balance_loss_mlp": 1.00015748, "epoch": 0.9591775386280286, "flos": 22528283231520.0, "grad_norm": 2.3579170014604878, "language_loss": 0.78270471, "learning_rate": 1.740811223680083e-08, "loss": 0.8076669, "num_input_tokens_seen": 172191005, "step": 7977, "time_per_iteration": 2.8327369689941406 }, { "auxiliary_loss_clip": 0.0134753, "auxiliary_loss_mlp": 0.01193168, "balance_loss_clip": 1.00783348, "balance_loss_mlp": 1.00014448, "epoch": 0.9592977815186677, "flos": 18186969594240.0, "grad_norm": 2.4352169921917493, "language_loss": 0.7456671, "learning_rate": 1.7305709139530334e-08, "loss": 0.77107412, "num_input_tokens_seen": 172209785, "step": 7978, "time_per_iteration": 2.72469162940979 }, { "auxiliary_loss_clip": 0.01336916, "auxiliary_loss_mlp": 0.01193094, "balance_loss_clip": 1.00795615, "balance_loss_mlp": 1.00016558, "epoch": 0.9594180244093068, "flos": 16537804924320.0, "grad_norm": 2.1714233172575126, "language_loss": 0.74309778, "learning_rate": 1.7203606815407334e-08, "loss": 0.76839787, "num_input_tokens_seen": 172224380, "step": 7979, "time_per_iteration": 2.8709797859191895 }, { "auxiliary_loss_clip": 0.01313976, "auxiliary_loss_mlp": 0.01193079, "balance_loss_clip": 1.00795233, "balance_loss_mlp": 1.00015092, "epoch": 0.9595382672999458, "flos": 20554343681760.0, "grad_norm": 1.8517498931573197, "language_loss": 0.79438937, "learning_rate": 1.7101805279920557e-08, "loss": 0.81945997, "num_input_tokens_seen": 172242540, "step": 7980, "time_per_iteration": 2.760164976119995 }, { "auxiliary_loss_clip": 0.01348375, "auxiliary_loss_mlp": 0.01193081, "balance_loss_clip": 1.00844812, "balance_loss_mlp": 1.00015235, "epoch": 0.959658510190585, "flos": 22638277836000.0, "grad_norm": 1.9884879501972028, "language_loss": 0.81069678, "learning_rate": 1.7000304548513643e-08, "loss": 0.83611131, "num_input_tokens_seen": 172262645, "step": 7981, "time_per_iteration": 2.8700785636901855 }, { "auxiliary_loss_clip": 0.0129549, "auxiliary_loss_mlp": 0.01192958, "balance_loss_clip": 1.00685155, "balance_loss_mlp": 1.00012541, "epoch": 0.9597787530812241, "flos": 19135262774880.0, "grad_norm": 3.4779110362074257, "language_loss": 0.82650036, "learning_rate": 1.6899104636583394e-08, "loss": 0.85138482, "num_input_tokens_seen": 172280695, "step": 7982, "time_per_iteration": 2.8577239513397217 }, { "auxiliary_loss_clip": 0.01303817, "auxiliary_loss_mlp": 0.0119228, "balance_loss_clip": 1.00356054, "balance_loss_mlp": 1.00001931, "epoch": 0.9598989959718631, "flos": 60098152834080.0, "grad_norm": 0.7254161384592547, "language_loss": 0.61961484, "learning_rate": 1.6798205559482638e-08, "loss": 0.64457583, "num_input_tokens_seen": 172343075, "step": 7983, "time_per_iteration": 3.470184087753296 }, { "auxiliary_loss_clip": 0.01289903, "auxiliary_loss_mlp": 0.01193112, "balance_loss_clip": 1.00720334, "balance_loss_mlp": 1.00018334, "epoch": 0.9600192388625023, "flos": 20886806229120.0, "grad_norm": 2.2687929361087886, "language_loss": 0.76283443, "learning_rate": 1.669760733251713e-08, "loss": 0.78766459, "num_input_tokens_seen": 172361950, "step": 7984, "time_per_iteration": 2.8973312377929688 }, { "auxiliary_loss_clip": 0.01248056, "auxiliary_loss_mlp": 0.01192958, "balance_loss_clip": 1.00655067, "balance_loss_mlp": 1.00012469, "epoch": 0.9601394817531413, "flos": 20445750100800.0, "grad_norm": 1.6955498565624623, "language_loss": 0.82582426, "learning_rate": 1.659730997094755e-08, "loss": 0.85023439, "num_input_tokens_seen": 172380440, "step": 7985, "time_per_iteration": 2.8169174194335938 }, { "auxiliary_loss_clip": 0.01335202, "auxiliary_loss_mlp": 0.01193174, "balance_loss_clip": 1.00821185, "balance_loss_mlp": 1.00014985, "epoch": 0.9602597246437804, "flos": 21507163246080.0, "grad_norm": 1.7280520290830679, "language_loss": 0.61834931, "learning_rate": 1.6497313489989283e-08, "loss": 0.64363307, "num_input_tokens_seen": 172400265, "step": 7986, "time_per_iteration": 2.75734806060791 }, { "auxiliary_loss_clip": 0.01299629, "auxiliary_loss_mlp": 0.01193191, "balance_loss_clip": 1.00741541, "balance_loss_mlp": 1.00016713, "epoch": 0.9603799675344196, "flos": 29935112245920.0, "grad_norm": 2.0083958176047267, "language_loss": 0.69890618, "learning_rate": 1.639761790481131e-08, "loss": 0.72383434, "num_input_tokens_seen": 172421145, "step": 7987, "time_per_iteration": 2.9121944904327393 }, { "auxiliary_loss_clip": 0.01319312, "auxiliary_loss_mlp": 0.01192906, "balance_loss_clip": 1.008003, "balance_loss_mlp": 1.00016809, "epoch": 0.9605002104250586, "flos": 28001537703360.0, "grad_norm": 1.8305596164574156, "language_loss": 0.793015, "learning_rate": 1.6298223230537754e-08, "loss": 0.81813717, "num_input_tokens_seen": 172438945, "step": 7988, "time_per_iteration": 2.7453107833862305 }, { "auxiliary_loss_clip": 0.01301866, "auxiliary_loss_mlp": 0.00872571, "balance_loss_clip": 1.00698948, "balance_loss_mlp": 1.00049114, "epoch": 0.9606204533156977, "flos": 35590505577120.0, "grad_norm": 1.7992193380728139, "language_loss": 0.69621336, "learning_rate": 1.619912948224611e-08, "loss": 0.71795774, "num_input_tokens_seen": 172460150, "step": 7989, "time_per_iteration": 2.8916780948638916 }, { "auxiliary_loss_clip": 0.01298401, "auxiliary_loss_mlp": 0.01193088, "balance_loss_clip": 1.00779819, "balance_loss_mlp": 1.00015926, "epoch": 0.9607406962063368, "flos": 26574625434240.0, "grad_norm": 2.5831192343548506, "language_loss": 0.61069185, "learning_rate": 1.6100336674969682e-08, "loss": 0.63560677, "num_input_tokens_seen": 172478990, "step": 7990, "time_per_iteration": 2.7845304012298584 }, { "auxiliary_loss_clip": 0.01272531, "auxiliary_loss_mlp": 0.01193077, "balance_loss_clip": 1.00721884, "balance_loss_mlp": 1.00014901, "epoch": 0.9608609390969759, "flos": 25331791903200.0, "grad_norm": 1.6445185934495352, "language_loss": 0.76711047, "learning_rate": 1.600184482369449e-08, "loss": 0.79176658, "num_input_tokens_seen": 172498905, "step": 7991, "time_per_iteration": 2.872159242630005 }, { "auxiliary_loss_clip": 0.0129565, "auxiliary_loss_mlp": 0.01193184, "balance_loss_clip": 1.00702286, "balance_loss_mlp": 1.00016069, "epoch": 0.960981181987615, "flos": 21069124706880.0, "grad_norm": 2.686863233112935, "language_loss": 0.88554704, "learning_rate": 1.5903653943362126e-08, "loss": 0.91043544, "num_input_tokens_seen": 172517900, "step": 7992, "time_per_iteration": 2.7433278560638428 }, { "auxiliary_loss_clip": 0.01309282, "auxiliary_loss_mlp": 0.0119309, "balance_loss_clip": 1.00698197, "balance_loss_mlp": 1.00016177, "epoch": 0.9611014248782541, "flos": 17823266654400.0, "grad_norm": 1.8321262792256208, "language_loss": 0.7690438, "learning_rate": 1.580576404886802e-08, "loss": 0.79406756, "num_input_tokens_seen": 172536430, "step": 7993, "time_per_iteration": 3.7770559787750244 }, { "auxiliary_loss_clip": 0.01334455, "auxiliary_loss_mlp": 0.01192959, "balance_loss_clip": 1.00793719, "balance_loss_mlp": 1.00012648, "epoch": 0.9612216677688932, "flos": 19354641281280.0, "grad_norm": 1.8413309726947265, "language_loss": 0.79904687, "learning_rate": 1.570817515506162e-08, "loss": 0.82432103, "num_input_tokens_seen": 172555120, "step": 7994, "time_per_iteration": 2.690265655517578 }, { "auxiliary_loss_clip": 0.01346784, "auxiliary_loss_mlp": 0.01193071, "balance_loss_clip": 1.00777423, "balance_loss_mlp": 1.00014305, "epoch": 0.9613419106595322, "flos": 15808746555360.0, "grad_norm": 1.8423450044526353, "language_loss": 0.81345087, "learning_rate": 1.561088727674753e-08, "loss": 0.83884943, "num_input_tokens_seen": 172569330, "step": 7995, "time_per_iteration": 3.4928994178771973 }, { "auxiliary_loss_clip": 0.01284286, "auxiliary_loss_mlp": 0.01193069, "balance_loss_clip": 1.00710607, "balance_loss_mlp": 1.00014067, "epoch": 0.9614621535501714, "flos": 25702499960640.0, "grad_norm": 2.083603299022249, "language_loss": 0.7165072, "learning_rate": 1.551390042868417e-08, "loss": 0.74128079, "num_input_tokens_seen": 172591100, "step": 7996, "time_per_iteration": 3.846255302429199 }, { "auxiliary_loss_clip": 0.01328498, "auxiliary_loss_mlp": 0.01193073, "balance_loss_clip": 1.00733924, "balance_loss_mlp": 1.00014472, "epoch": 0.9615823964408104, "flos": 17819063583840.0, "grad_norm": 1.8998496348845098, "language_loss": 0.70904338, "learning_rate": 1.5417214625584207e-08, "loss": 0.73425907, "num_input_tokens_seen": 172608755, "step": 7997, "time_per_iteration": 3.6302194595336914 }, { "auxiliary_loss_clip": 0.01335647, "auxiliary_loss_mlp": 0.01193123, "balance_loss_clip": 1.00787425, "balance_loss_mlp": 1.00019455, "epoch": 0.9617026393314495, "flos": 20190030039360.0, "grad_norm": 1.572942026639867, "language_loss": 0.85112453, "learning_rate": 1.5320829882114806e-08, "loss": 0.87641221, "num_input_tokens_seen": 172626830, "step": 7998, "time_per_iteration": 2.7438547611236572 }, { "auxiliary_loss_clip": 0.01347526, "auxiliary_loss_mlp": 0.01193062, "balance_loss_clip": 1.00731003, "balance_loss_mlp": 1.00013399, "epoch": 0.9618228822220887, "flos": 20267023991040.0, "grad_norm": 1.8716144212665793, "language_loss": 0.79131055, "learning_rate": 1.5224746212897378e-08, "loss": 0.81671643, "num_input_tokens_seen": 172646125, "step": 7999, "time_per_iteration": 2.654719352722168 }, { "auxiliary_loss_clip": 0.0134588, "auxiliary_loss_mlp": 0.01193142, "balance_loss_clip": 1.00762796, "balance_loss_mlp": 1.00011849, "epoch": 0.9619431251127277, "flos": 21031310280960.0, "grad_norm": 1.7625367195431427, "language_loss": 0.77290392, "learning_rate": 1.512896363250804e-08, "loss": 0.79829419, "num_input_tokens_seen": 172666235, "step": 8000, "time_per_iteration": 2.676295042037964 }, { "auxiliary_loss_clip": 0.01335317, "auxiliary_loss_mlp": 0.01193093, "balance_loss_clip": 1.00776064, "balance_loss_mlp": 1.00016475, "epoch": 0.9620633680033668, "flos": 22382665545600.0, "grad_norm": 1.8365876008570734, "language_loss": 0.75518394, "learning_rate": 1.503348215547673e-08, "loss": 0.78046799, "num_input_tokens_seen": 172687325, "step": 8001, "time_per_iteration": 2.7733378410339355 }, { "auxiliary_loss_clip": 0.01302646, "auxiliary_loss_mlp": 0.01193015, "balance_loss_clip": 1.00685287, "balance_loss_mlp": 1.00018179, "epoch": 0.962183610894006, "flos": 18471738703680.0, "grad_norm": 2.0224672893142412, "language_loss": 0.80818665, "learning_rate": 1.4938301796288078e-08, "loss": 0.83314329, "num_input_tokens_seen": 172703895, "step": 8002, "time_per_iteration": 2.835573673248291 }, { "auxiliary_loss_clip": 0.01347768, "auxiliary_loss_mlp": 0.01193172, "balance_loss_clip": 1.00803041, "balance_loss_mlp": 1.00014782, "epoch": 0.962303853784645, "flos": 18435253453920.0, "grad_norm": 2.449373908685522, "language_loss": 0.81668699, "learning_rate": 1.4843422569380537e-08, "loss": 0.84209639, "num_input_tokens_seen": 172720650, "step": 8003, "time_per_iteration": 2.737398386001587 }, { "auxiliary_loss_clip": 0.01262541, "auxiliary_loss_mlp": 0.01193095, "balance_loss_clip": 1.00707388, "balance_loss_mlp": 1.00016665, "epoch": 0.9624240966752841, "flos": 26391085551360.0, "grad_norm": 1.6647321547925193, "language_loss": 0.8259936, "learning_rate": 1.4748844489147483e-08, "loss": 0.85055, "num_input_tokens_seen": 172737640, "step": 8004, "time_per_iteration": 2.8833096027374268 }, { "auxiliary_loss_clip": 0.01313213, "auxiliary_loss_mlp": 0.01192869, "balance_loss_clip": 1.00682247, "balance_loss_mlp": 1.00013113, "epoch": 0.9625443395659231, "flos": 14647684825440.0, "grad_norm": 2.05212082737854, "language_loss": 0.70871979, "learning_rate": 1.4654567569936326e-08, "loss": 0.73378056, "num_input_tokens_seen": 172755215, "step": 8005, "time_per_iteration": 2.7332606315612793 }, { "auxiliary_loss_clip": 0.01280578, "auxiliary_loss_mlp": 0.01193148, "balance_loss_clip": 1.0072124, "balance_loss_mlp": 1.00021935, "epoch": 0.9626645824565623, "flos": 18367635582720.0, "grad_norm": 1.7690832064717459, "language_loss": 0.82965457, "learning_rate": 1.456059182604874e-08, "loss": 0.85439187, "num_input_tokens_seen": 172774020, "step": 8006, "time_per_iteration": 2.8757967948913574 }, { "auxiliary_loss_clip": 0.01347739, "auxiliary_loss_mlp": 0.01193195, "balance_loss_clip": 1.00787091, "balance_loss_mlp": 1.00017142, "epoch": 0.9627848253472013, "flos": 16580433123360.0, "grad_norm": 1.715821097341535, "language_loss": 0.76503336, "learning_rate": 1.4466917271740653e-08, "loss": 0.79044271, "num_input_tokens_seen": 172792220, "step": 8007, "time_per_iteration": 2.680748701095581 }, { "auxiliary_loss_clip": 0.01313319, "auxiliary_loss_mlp": 0.01193166, "balance_loss_clip": 1.00767469, "balance_loss_mlp": 1.00014198, "epoch": 0.9629050682378404, "flos": 20886878076480.0, "grad_norm": 2.3112020505124145, "language_loss": 0.6754151, "learning_rate": 1.4373543921222697e-08, "loss": 0.70047998, "num_input_tokens_seen": 172811805, "step": 8008, "time_per_iteration": 2.744431734085083 }, { "auxiliary_loss_clip": 0.01300897, "auxiliary_loss_mlp": 0.01193079, "balance_loss_clip": 1.0069809, "balance_loss_mlp": 1.00015092, "epoch": 0.9630253111284796, "flos": 17019261983520.0, "grad_norm": 1.6939314196547388, "language_loss": 0.77576077, "learning_rate": 1.428047178865932e-08, "loss": 0.80070055, "num_input_tokens_seen": 172828595, "step": 8009, "time_per_iteration": 2.7047741413116455 }, { "auxiliary_loss_clip": 0.0132316, "auxiliary_loss_mlp": 0.01193078, "balance_loss_clip": 1.00740981, "balance_loss_mlp": 1.00014961, "epoch": 0.9631455540191186, "flos": 20338952703840.0, "grad_norm": 1.774343510435628, "language_loss": 0.74538147, "learning_rate": 1.4187700888169451e-08, "loss": 0.77054387, "num_input_tokens_seen": 172847770, "step": 8010, "time_per_iteration": 2.734248161315918 }, { "auxiliary_loss_clip": 0.01297941, "auxiliary_loss_mlp": 0.01192266, "balance_loss_clip": 1.00330019, "balance_loss_mlp": 1.00000572, "epoch": 0.9632657969097577, "flos": 65956736301120.0, "grad_norm": 0.7513378831497214, "language_loss": 0.57052386, "learning_rate": 1.40952312338265e-08, "loss": 0.5954259, "num_input_tokens_seen": 172912415, "step": 8011, "time_per_iteration": 3.2989206314086914 }, { "auxiliary_loss_clip": 0.0129671, "auxiliary_loss_mlp": 0.01193067, "balance_loss_clip": 1.00705647, "balance_loss_mlp": 1.00013876, "epoch": 0.9633860398003968, "flos": 44419540858560.0, "grad_norm": 1.6685863794502813, "language_loss": 0.6894716, "learning_rate": 1.4003062839657909e-08, "loss": 0.71436936, "num_input_tokens_seen": 172934895, "step": 8012, "time_per_iteration": 2.9657883644104004 }, { "auxiliary_loss_clip": 0.01275499, "auxiliary_loss_mlp": 0.01193135, "balance_loss_clip": 1.00650787, "balance_loss_mlp": 1.00011122, "epoch": 0.9635062826910359, "flos": 24827716134720.0, "grad_norm": 1.5726608588829674, "language_loss": 0.80033445, "learning_rate": 1.391119571964583e-08, "loss": 0.82502079, "num_input_tokens_seen": 172955835, "step": 8013, "time_per_iteration": 2.8365933895111084 }, { "auxiliary_loss_clip": 0.01326353, "auxiliary_loss_mlp": 0.01192736, "balance_loss_clip": 1.00718117, "balance_loss_mlp": 1.00018883, "epoch": 0.9636265255816749, "flos": 15961369358880.0, "grad_norm": 2.1434676710070075, "language_loss": 0.73001248, "learning_rate": 1.3819629887726225e-08, "loss": 0.75520337, "num_input_tokens_seen": 172973925, "step": 8014, "time_per_iteration": 2.8136794567108154 }, { "auxiliary_loss_clip": 0.01313418, "auxiliary_loss_mlp": 0.01193187, "balance_loss_clip": 1.00797367, "balance_loss_mlp": 1.00016308, "epoch": 0.9637467684723141, "flos": 22601792586240.0, "grad_norm": 1.6677729272031536, "language_loss": 0.76258373, "learning_rate": 1.3728365357789317e-08, "loss": 0.78764981, "num_input_tokens_seen": 172993290, "step": 8015, "time_per_iteration": 2.7425243854522705 }, { "auxiliary_loss_clip": 0.01249975, "auxiliary_loss_mlp": 0.01193039, "balance_loss_clip": 1.00703502, "balance_loss_mlp": 1.00011122, "epoch": 0.9638670113629532, "flos": 17565822256320.0, "grad_norm": 2.4122059377314278, "language_loss": 0.76935947, "learning_rate": 1.3637402143680254e-08, "loss": 0.79378968, "num_input_tokens_seen": 173008190, "step": 8016, "time_per_iteration": 2.793513059616089 }, { "auxiliary_loss_clip": 0.01243332, "auxiliary_loss_mlp": 0.01192275, "balance_loss_clip": 1.00398207, "balance_loss_mlp": 1.00001383, "epoch": 0.9639872542535922, "flos": 55072169287200.0, "grad_norm": 0.7221686278287671, "language_loss": 0.55106741, "learning_rate": 1.3546740259197998e-08, "loss": 0.57542348, "num_input_tokens_seen": 173061000, "step": 8017, "time_per_iteration": 3.2629690170288086 }, { "auxiliary_loss_clip": 0.01310954, "auxiliary_loss_mlp": 0.0119316, "balance_loss_clip": 1.00814378, "balance_loss_mlp": 1.00013614, "epoch": 0.9641074971442314, "flos": 24134496389280.0, "grad_norm": 2.319661913915872, "language_loss": 0.69941652, "learning_rate": 1.3456379718095989e-08, "loss": 0.72445774, "num_input_tokens_seen": 173081415, "step": 8018, "time_per_iteration": 2.7697582244873047 }, { "auxiliary_loss_clip": 0.01282385, "auxiliary_loss_mlp": 0.01192265, "balance_loss_clip": 1.00360942, "balance_loss_mlp": 1.00000465, "epoch": 0.9642277400348704, "flos": 66747449210400.0, "grad_norm": 0.9718192776221128, "language_loss": 0.62073654, "learning_rate": 1.3366320534081487e-08, "loss": 0.64548308, "num_input_tokens_seen": 173144095, "step": 8019, "time_per_iteration": 4.233444929122925 }, { "auxiliary_loss_clip": 0.01336334, "auxiliary_loss_mlp": 0.01193179, "balance_loss_clip": 1.00837684, "balance_loss_mlp": 1.00015569, "epoch": 0.9643479829255095, "flos": 30920285836800.0, "grad_norm": 1.9810184927530803, "language_loss": 0.7560125, "learning_rate": 1.3276562720816675e-08, "loss": 0.78130764, "num_input_tokens_seen": 173165605, "step": 8020, "time_per_iteration": 2.715724468231201 }, { "auxiliary_loss_clip": 0.01347796, "auxiliary_loss_mlp": 0.01193179, "balance_loss_clip": 1.00753212, "balance_loss_mlp": 1.00015545, "epoch": 0.9644682258161487, "flos": 20048256187200.0, "grad_norm": 2.23500358818986, "language_loss": 0.82180989, "learning_rate": 1.3187106291917549e-08, "loss": 0.84721959, "num_input_tokens_seen": 173182595, "step": 8021, "time_per_iteration": 3.567566156387329 }, { "auxiliary_loss_clip": 0.01321849, "auxiliary_loss_mlp": 0.01193072, "balance_loss_clip": 1.00734913, "balance_loss_mlp": 1.00014317, "epoch": 0.9645884687067877, "flos": 21178724150880.0, "grad_norm": 1.7755154713606753, "language_loss": 0.71087116, "learning_rate": 1.309795126095503e-08, "loss": 0.73602045, "num_input_tokens_seen": 173200895, "step": 8022, "time_per_iteration": 3.6523118019104004 }, { "auxiliary_loss_clip": 0.01235548, "auxiliary_loss_mlp": 0.01192957, "balance_loss_clip": 1.00620079, "balance_loss_mlp": 1.00012374, "epoch": 0.9647087115974268, "flos": 18945975103200.0, "grad_norm": 1.90300663208895, "language_loss": 0.80523849, "learning_rate": 1.3009097641453192e-08, "loss": 0.82952356, "num_input_tokens_seen": 173218745, "step": 8023, "time_per_iteration": 2.838095188140869 }, { "auxiliary_loss_clip": 0.01303467, "auxiliary_loss_mlp": 0.01193207, "balance_loss_clip": 1.00717819, "balance_loss_mlp": 1.00018358, "epoch": 0.9648289544880659, "flos": 16545097431360.0, "grad_norm": 1.7969401485679783, "language_loss": 0.76075912, "learning_rate": 1.2920545446891474e-08, "loss": 0.78572595, "num_input_tokens_seen": 173235465, "step": 8024, "time_per_iteration": 3.783012866973877 }, { "auxiliary_loss_clip": 0.01304654, "auxiliary_loss_mlp": 0.0119313, "balance_loss_clip": 1.00682497, "balance_loss_mlp": 1.00020146, "epoch": 0.964949197378705, "flos": 24057538361280.0, "grad_norm": 1.5522784403727357, "language_loss": 0.70720088, "learning_rate": 1.2832294690703127e-08, "loss": 0.73217869, "num_input_tokens_seen": 173254440, "step": 8025, "time_per_iteration": 2.761101007461548 }, { "auxiliary_loss_clip": 0.01323784, "auxiliary_loss_mlp": 0.01192992, "balance_loss_clip": 1.00779891, "balance_loss_mlp": 1.00015903, "epoch": 0.965069440269344, "flos": 23365575944640.0, "grad_norm": 1.8172895766276294, "language_loss": 0.77551818, "learning_rate": 1.2744345386275668e-08, "loss": 0.800686, "num_input_tokens_seen": 173273980, "step": 8026, "time_per_iteration": 2.72056245803833 }, { "auxiliary_loss_clip": 0.01299324, "auxiliary_loss_mlp": 0.01193093, "balance_loss_clip": 1.0079217, "balance_loss_mlp": 1.00016427, "epoch": 0.9651896831599832, "flos": 25374887110080.0, "grad_norm": 1.5023750888598948, "language_loss": 0.78584909, "learning_rate": 1.265669754695109e-08, "loss": 0.81077331, "num_input_tokens_seen": 173293550, "step": 8027, "time_per_iteration": 2.7917606830596924 }, { "auxiliary_loss_clip": 0.01267335, "auxiliary_loss_mlp": 0.01193177, "balance_loss_clip": 1.00711977, "balance_loss_mlp": 1.00015366, "epoch": 0.9653099260506223, "flos": 22272886483200.0, "grad_norm": 1.838480310578686, "language_loss": 0.81993556, "learning_rate": 1.2569351186025201e-08, "loss": 0.84454072, "num_input_tokens_seen": 173312005, "step": 8028, "time_per_iteration": 2.8271005153656006 }, { "auxiliary_loss_clip": 0.01286846, "auxiliary_loss_mlp": 0.01193166, "balance_loss_clip": 1.00652385, "balance_loss_mlp": 1.00014269, "epoch": 0.9654301689412613, "flos": 26760859593120.0, "grad_norm": 1.6723215509153557, "language_loss": 0.7546885, "learning_rate": 1.2482306316748737e-08, "loss": 0.77948868, "num_input_tokens_seen": 173332450, "step": 8029, "time_per_iteration": 2.8346428871154785 }, { "auxiliary_loss_clip": 0.01336676, "auxiliary_loss_mlp": 0.01193207, "balance_loss_clip": 1.00765204, "balance_loss_mlp": 1.00018334, "epoch": 0.9655504118319005, "flos": 17412696521280.0, "grad_norm": 2.1283521525062685, "language_loss": 0.78292739, "learning_rate": 1.2395562952326021e-08, "loss": 0.80822623, "num_input_tokens_seen": 173349610, "step": 8030, "time_per_iteration": 2.6857681274414062 }, { "auxiliary_loss_clip": 0.01319836, "auxiliary_loss_mlp": 0.01193237, "balance_loss_clip": 1.00798464, "balance_loss_mlp": 1.00021362, "epoch": 0.9656706547225395, "flos": 22126981407840.0, "grad_norm": 2.0065953822554228, "language_loss": 0.81417733, "learning_rate": 1.2309121105916309e-08, "loss": 0.83930808, "num_input_tokens_seen": 173367900, "step": 8031, "time_per_iteration": 2.771347761154175 }, { "auxiliary_loss_clip": 0.0132937, "auxiliary_loss_mlp": 0.01193026, "balance_loss_clip": 1.00722969, "balance_loss_mlp": 1.00019264, "epoch": 0.9657908976131786, "flos": 37049304864960.0, "grad_norm": 1.6787759748604754, "language_loss": 0.69203603, "learning_rate": 1.222298079063222e-08, "loss": 0.71726, "num_input_tokens_seen": 173389040, "step": 8032, "time_per_iteration": 2.876927614212036 }, { "auxiliary_loss_clip": 0.01329056, "auxiliary_loss_mlp": 0.0119295, "balance_loss_clip": 1.00710297, "balance_loss_mlp": 1.00021207, "epoch": 0.9659111405038178, "flos": 24389821290240.0, "grad_norm": 2.0987563208228615, "language_loss": 0.72248662, "learning_rate": 1.2137142019541524e-08, "loss": 0.74770665, "num_input_tokens_seen": 173407595, "step": 8033, "time_per_iteration": 2.770453453063965 }, { "auxiliary_loss_clip": 0.01308089, "auxiliary_loss_mlp": 0.01192995, "balance_loss_clip": 1.00811672, "balance_loss_mlp": 1.00016236, "epoch": 0.9660313833944568, "flos": 25009423909920.0, "grad_norm": 1.9639635606354606, "language_loss": 0.73344839, "learning_rate": 1.2051604805666027e-08, "loss": 0.75845927, "num_input_tokens_seen": 173424720, "step": 8034, "time_per_iteration": 2.729830026626587 }, { "auxiliary_loss_clip": 0.01347425, "auxiliary_loss_mlp": 0.00872415, "balance_loss_clip": 1.00796819, "balance_loss_mlp": 1.00042498, "epoch": 0.9661516262850959, "flos": 11801583878400.0, "grad_norm": 2.0723308166395014, "language_loss": 0.78148818, "learning_rate": 1.196636916198135e-08, "loss": 0.80368662, "num_input_tokens_seen": 173442260, "step": 8035, "time_per_iteration": 2.66731858253479 }, { "auxiliary_loss_clip": 0.01347879, "auxiliary_loss_mlp": 0.0119307, "balance_loss_clip": 1.00748825, "balance_loss_mlp": 1.00014198, "epoch": 0.9662718691757349, "flos": 20047789179360.0, "grad_norm": 1.828864583209332, "language_loss": 0.76889515, "learning_rate": 1.1881435101418036e-08, "loss": 0.79430467, "num_input_tokens_seen": 173461675, "step": 8036, "time_per_iteration": 2.770745277404785 }, { "auxiliary_loss_clip": 0.01282918, "auxiliary_loss_mlp": 0.01192276, "balance_loss_clip": 1.00352383, "balance_loss_mlp": 1.00001526, "epoch": 0.9663921120663741, "flos": 68027737930560.0, "grad_norm": 0.8227151220671551, "language_loss": 0.65559816, "learning_rate": 1.1796802636860003e-08, "loss": 0.68035007, "num_input_tokens_seen": 173530205, "step": 8037, "time_per_iteration": 3.357020139694214 }, { "auxiliary_loss_clip": 0.0134837, "auxiliary_loss_mlp": 0.01193191, "balance_loss_clip": 1.00767732, "balance_loss_mlp": 1.00016737, "epoch": 0.9665123549570132, "flos": 26322928824960.0, "grad_norm": 2.056737102204042, "language_loss": 0.73774254, "learning_rate": 1.1712471781146316e-08, "loss": 0.7631582, "num_input_tokens_seen": 173549540, "step": 8038, "time_per_iteration": 2.7492191791534424 }, { "auxiliary_loss_clip": 0.01347305, "auxiliary_loss_mlp": 0.01193081, "balance_loss_clip": 1.00725329, "balance_loss_mlp": 1.00015259, "epoch": 0.9666325978476522, "flos": 43941137312160.0, "grad_norm": 1.7790965347058478, "language_loss": 0.6687724, "learning_rate": 1.1628442547069628e-08, "loss": 0.69417626, "num_input_tokens_seen": 173571740, "step": 8039, "time_per_iteration": 2.875148057937622 }, { "auxiliary_loss_clip": 0.01330982, "auxiliary_loss_mlp": 0.00872575, "balance_loss_clip": 1.00716984, "balance_loss_mlp": 1.00046074, "epoch": 0.9667528407382914, "flos": 21543432953760.0, "grad_norm": 1.8619780972650166, "language_loss": 0.77615571, "learning_rate": 1.1544714947377521e-08, "loss": 0.79819125, "num_input_tokens_seen": 173589425, "step": 8040, "time_per_iteration": 2.7375924587249756 }, { "auxiliary_loss_clip": 0.01348299, "auxiliary_loss_mlp": 0.01193179, "balance_loss_clip": 1.00820494, "balance_loss_mlp": 1.00015533, "epoch": 0.9668730836289304, "flos": 23878596709440.0, "grad_norm": 1.774386202908339, "language_loss": 0.69507712, "learning_rate": 1.1461288994770945e-08, "loss": 0.72049195, "num_input_tokens_seen": 173608500, "step": 8041, "time_per_iteration": 2.7027289867401123 }, { "auxiliary_loss_clip": 0.01348791, "auxiliary_loss_mlp": 0.01193164, "balance_loss_clip": 1.00781846, "balance_loss_mlp": 1.00014055, "epoch": 0.9669933265195695, "flos": 28293024540960.0, "grad_norm": 2.0694086765239708, "language_loss": 0.77234679, "learning_rate": 1.1378164701906002e-08, "loss": 0.79776633, "num_input_tokens_seen": 173630265, "step": 8042, "time_per_iteration": 2.7185590267181396 }, { "auxiliary_loss_clip": 0.01348983, "auxiliary_loss_mlp": 0.01193075, "balance_loss_clip": 1.00827932, "balance_loss_mlp": 1.0001471, "epoch": 0.9671135694102087, "flos": 22454773876800.0, "grad_norm": 1.5832487630787309, "language_loss": 0.66598719, "learning_rate": 1.1295342081392156e-08, "loss": 0.69140774, "num_input_tokens_seen": 173649625, "step": 8043, "time_per_iteration": 2.684791326522827 }, { "auxiliary_loss_clip": 0.01298263, "auxiliary_loss_mlp": 0.01193171, "balance_loss_clip": 1.00701976, "balance_loss_mlp": 1.0001471, "epoch": 0.9672338123008477, "flos": 20155951676160.0, "grad_norm": 1.594079870971177, "language_loss": 0.69415033, "learning_rate": 1.1212821145793804e-08, "loss": 0.71906471, "num_input_tokens_seen": 173669240, "step": 8044, "time_per_iteration": 2.6925582885742188 }, { "auxiliary_loss_clip": 0.01313776, "auxiliary_loss_mlp": 0.0119301, "balance_loss_clip": 1.00721788, "balance_loss_mlp": 1.00017679, "epoch": 0.9673540551914868, "flos": 16977495952800.0, "grad_norm": 1.936684642195157, "language_loss": 0.79066038, "learning_rate": 1.1130601907629156e-08, "loss": 0.81572819, "num_input_tokens_seen": 173686970, "step": 8045, "time_per_iteration": 2.7644333839416504 }, { "auxiliary_loss_clip": 0.01303696, "auxiliary_loss_mlp": 0.01192287, "balance_loss_clip": 1.00347686, "balance_loss_mlp": 1.00002658, "epoch": 0.9674742980821259, "flos": 61892935189920.0, "grad_norm": 0.8150095474810568, "language_loss": 0.6487236, "learning_rate": 1.1048684379370899e-08, "loss": 0.67368352, "num_input_tokens_seen": 173747655, "step": 8046, "time_per_iteration": 4.208838939666748 }, { "auxiliary_loss_clip": 0.01310984, "auxiliary_loss_mlp": 0.01192854, "balance_loss_clip": 1.0074904, "balance_loss_mlp": 1.00011611, "epoch": 0.967594540972765, "flos": 18697834938240.0, "grad_norm": 1.7986733994145476, "language_loss": 0.7420435, "learning_rate": 1.0967068573445759e-08, "loss": 0.76708186, "num_input_tokens_seen": 173765140, "step": 8047, "time_per_iteration": 2.816784143447876 }, { "auxiliary_loss_clip": 0.01311845, "auxiliary_loss_mlp": 0.01193184, "balance_loss_clip": 1.00753164, "balance_loss_mlp": 1.00016046, "epoch": 0.967714783863404, "flos": 20777422327200.0, "grad_norm": 2.3991147890650093, "language_loss": 0.65087944, "learning_rate": 1.0885754502234945e-08, "loss": 0.67592973, "num_input_tokens_seen": 173784800, "step": 8048, "time_per_iteration": 4.4332756996154785 }, { "auxiliary_loss_clip": 0.01269838, "auxiliary_loss_mlp": 0.0119298, "balance_loss_clip": 1.0068264, "balance_loss_mlp": 1.0001471, "epoch": 0.9678350267540432, "flos": 23185484735040.0, "grad_norm": 1.767952638525497, "language_loss": 0.78121817, "learning_rate": 1.08047421780737e-08, "loss": 0.80584633, "num_input_tokens_seen": 173803990, "step": 8049, "time_per_iteration": 3.8423032760620117 }, { "auxiliary_loss_clip": 0.01318389, "auxiliary_loss_mlp": 0.00872539, "balance_loss_clip": 1.00724268, "balance_loss_mlp": 1.00051427, "epoch": 0.9679552696446823, "flos": 21726074744640.0, "grad_norm": 2.1260819505265234, "language_loss": 0.74002761, "learning_rate": 1.0724031613251305e-08, "loss": 0.7619369, "num_input_tokens_seen": 173821890, "step": 8050, "time_per_iteration": 2.786623477935791 }, { "auxiliary_loss_clip": 0.01332199, "auxiliary_loss_mlp": 0.01193244, "balance_loss_clip": 1.00755215, "balance_loss_mlp": 1.00022054, "epoch": 0.9680755125353213, "flos": 26869058013600.0, "grad_norm": 1.9085895200096725, "language_loss": 0.66514385, "learning_rate": 1.0643622820011744e-08, "loss": 0.69039834, "num_input_tokens_seen": 173842945, "step": 8051, "time_per_iteration": 2.858285427093506 }, { "auxiliary_loss_clip": 0.01349295, "auxiliary_loss_mlp": 0.01193223, "balance_loss_clip": 1.00760269, "balance_loss_mlp": 1.00019908, "epoch": 0.9681957554259605, "flos": 28325019330720.0, "grad_norm": 5.7968015550081295, "language_loss": 0.68032217, "learning_rate": 1.0563515810552814e-08, "loss": 0.70574737, "num_input_tokens_seen": 173859915, "step": 8052, "time_per_iteration": 2.763852119445801 }, { "auxiliary_loss_clip": 0.0134929, "auxiliary_loss_mlp": 0.01193101, "balance_loss_clip": 1.00870252, "balance_loss_mlp": 1.00017297, "epoch": 0.9683159983165995, "flos": 20557684584000.0, "grad_norm": 1.6373251256545478, "language_loss": 0.73282981, "learning_rate": 1.0483710597026795e-08, "loss": 0.75825381, "num_input_tokens_seen": 173879775, "step": 8053, "time_per_iteration": 2.802534341812134 }, { "auxiliary_loss_clip": 0.0129248, "auxiliary_loss_mlp": 0.01193027, "balance_loss_clip": 1.00704551, "balance_loss_mlp": 1.00019431, "epoch": 0.9684362412072386, "flos": 24207969820320.0, "grad_norm": 1.957421971099857, "language_loss": 0.74184448, "learning_rate": 1.0404207191540227e-08, "loss": 0.76669955, "num_input_tokens_seen": 173900230, "step": 8054, "time_per_iteration": 2.8149428367614746 }, { "auxiliary_loss_clip": 0.01347808, "auxiliary_loss_mlp": 0.01193105, "balance_loss_clip": 1.00795066, "balance_loss_mlp": 1.00017679, "epoch": 0.9685564840978778, "flos": 22346252143200.0, "grad_norm": 1.8155849636884884, "language_loss": 0.74491525, "learning_rate": 1.0325005606153236e-08, "loss": 0.77032441, "num_input_tokens_seen": 173919690, "step": 8055, "time_per_iteration": 2.7744994163513184 }, { "auxiliary_loss_clip": 0.01273282, "auxiliary_loss_mlp": 0.01193123, "balance_loss_clip": 1.00737953, "balance_loss_mlp": 1.00019479, "epoch": 0.9686767269885168, "flos": 14386396593600.0, "grad_norm": 2.307638798869658, "language_loss": 0.78692484, "learning_rate": 1.0246105852881104e-08, "loss": 0.81158888, "num_input_tokens_seen": 173934790, "step": 8056, "time_per_iteration": 2.8582653999328613 }, { "auxiliary_loss_clip": 0.01347915, "auxiliary_loss_mlp": 0.01193085, "balance_loss_clip": 1.00749516, "balance_loss_mlp": 1.00015712, "epoch": 0.9687969698791559, "flos": 21287641044960.0, "grad_norm": 1.8008452064172862, "language_loss": 0.78893101, "learning_rate": 1.0167507943692476e-08, "loss": 0.81434101, "num_input_tokens_seen": 173953875, "step": 8057, "time_per_iteration": 2.7696452140808105 }, { "auxiliary_loss_clip": 0.01322712, "auxiliary_loss_mlp": 0.01193112, "balance_loss_clip": 1.00788116, "balance_loss_mlp": 1.00018382, "epoch": 0.968917212769795, "flos": 19828338825600.0, "grad_norm": 2.234301847069388, "language_loss": 0.71400553, "learning_rate": 1.008921189051093e-08, "loss": 0.73916376, "num_input_tokens_seen": 173971220, "step": 8058, "time_per_iteration": 2.7577924728393555 }, { "auxiliary_loss_clip": 0.01347877, "auxiliary_loss_mlp": 0.01193195, "balance_loss_clip": 1.00808263, "balance_loss_mlp": 1.00017118, "epoch": 0.9690374556604341, "flos": 21681758132640.0, "grad_norm": 1.931761235269739, "language_loss": 0.77112985, "learning_rate": 1.0011217705213848e-08, "loss": 0.79654056, "num_input_tokens_seen": 173989095, "step": 8059, "time_per_iteration": 2.623506546020508 }, { "auxiliary_loss_clip": 0.0132472, "auxiliary_loss_mlp": 0.01192866, "balance_loss_clip": 1.00704598, "balance_loss_mlp": 1.00012851, "epoch": 0.9691576985510731, "flos": 32635451812320.0, "grad_norm": 12.682384625149737, "language_loss": 0.74737132, "learning_rate": 9.933525399632658e-09, "loss": 0.77254713, "num_input_tokens_seen": 174007330, "step": 8060, "time_per_iteration": 2.835202693939209 }, { "auxiliary_loss_clip": 0.01299936, "auxiliary_loss_mlp": 0.01193207, "balance_loss_clip": 1.00685453, "balance_loss_mlp": 1.00018299, "epoch": 0.9692779414417123, "flos": 35663188687200.0, "grad_norm": 1.668456599396263, "language_loss": 0.64857709, "learning_rate": 9.856134985553488e-09, "loss": 0.67350852, "num_input_tokens_seen": 174027055, "step": 8061, "time_per_iteration": 2.8858988285064697 }, { "auxiliary_loss_clip": 0.01347754, "auxiliary_loss_mlp": 0.0119307, "balance_loss_clip": 1.00773239, "balance_loss_mlp": 1.0001415, "epoch": 0.9693981843323514, "flos": 28366964979840.0, "grad_norm": 1.6359243979713434, "language_loss": 0.73442602, "learning_rate": 9.77904647471628e-09, "loss": 0.75983429, "num_input_tokens_seen": 174050235, "step": 8062, "time_per_iteration": 2.7777798175811768 }, { "auxiliary_loss_clip": 0.01266702, "auxiliary_loss_mlp": 0.01193085, "balance_loss_clip": 1.00613856, "balance_loss_mlp": 1.0001564, "epoch": 0.9695184272229904, "flos": 23622876648000.0, "grad_norm": 1.446690415833889, "language_loss": 0.74029422, "learning_rate": 9.702259878815454e-09, "loss": 0.76489204, "num_input_tokens_seen": 174070560, "step": 8063, "time_per_iteration": 2.8075296878814697 }, { "auxiliary_loss_clip": 0.01326896, "auxiliary_loss_mlp": 0.01193147, "balance_loss_clip": 1.00771379, "balance_loss_mlp": 1.00021923, "epoch": 0.9696386701136296, "flos": 23294689018560.0, "grad_norm": 1.908307320439185, "language_loss": 0.74310935, "learning_rate": 9.625775209499254e-09, "loss": 0.76830977, "num_input_tokens_seen": 174090565, "step": 8064, "time_per_iteration": 2.7314765453338623 }, { "auxiliary_loss_clip": 0.01301765, "auxiliary_loss_mlp": 0.01193076, "balance_loss_clip": 1.00763464, "balance_loss_mlp": 1.00014734, "epoch": 0.9697589130042686, "flos": 15121885301280.0, "grad_norm": 1.855577746152744, "language_loss": 0.74272591, "learning_rate": 9.549592478370172e-09, "loss": 0.76767433, "num_input_tokens_seen": 174108745, "step": 8065, "time_per_iteration": 2.79099440574646 }, { "auxiliary_loss_clip": 0.0132919, "auxiliary_loss_mlp": 0.01193014, "balance_loss_clip": 1.00682056, "balance_loss_mlp": 1.00018096, "epoch": 0.9698791558949077, "flos": 18879542713440.0, "grad_norm": 1.67665144568529, "language_loss": 0.79351765, "learning_rate": 9.473711696985632e-09, "loss": 0.81873977, "num_input_tokens_seen": 174128075, "step": 8066, "time_per_iteration": 2.7149038314819336 }, { "auxiliary_loss_clip": 0.01293217, "auxiliary_loss_mlp": 0.01193171, "balance_loss_clip": 1.00726533, "balance_loss_mlp": 1.00014746, "epoch": 0.9699993987855468, "flos": 17931465074880.0, "grad_norm": 1.9326168062150628, "language_loss": 0.76066101, "learning_rate": 9.398132876856201e-09, "loss": 0.7855249, "num_input_tokens_seen": 174147040, "step": 8067, "time_per_iteration": 2.8037266731262207 }, { "auxiliary_loss_clip": 0.01249706, "auxiliary_loss_mlp": 0.01192271, "balance_loss_clip": 1.00336742, "balance_loss_mlp": 1.00001049, "epoch": 0.9701196416761859, "flos": 67182218694720.0, "grad_norm": 0.7736345504502693, "language_loss": 0.60796726, "learning_rate": 9.322856029447379e-09, "loss": 0.63238704, "num_input_tokens_seen": 174208225, "step": 8068, "time_per_iteration": 3.265042781829834 }, { "auxiliary_loss_clip": 0.01346755, "auxiliary_loss_mlp": 0.01193095, "balance_loss_clip": 1.00761461, "balance_loss_mlp": 1.00016677, "epoch": 0.970239884566825, "flos": 24277814959680.0, "grad_norm": 2.0709288449549885, "language_loss": 0.80318093, "learning_rate": 9.247881166178695e-09, "loss": 0.82857949, "num_input_tokens_seen": 174226935, "step": 8069, "time_per_iteration": 2.7671329975128174 }, { "auxiliary_loss_clip": 0.01300017, "auxiliary_loss_mlp": 0.01192986, "balance_loss_clip": 1.00666475, "balance_loss_mlp": 1.00015306, "epoch": 0.970360127457464, "flos": 25301701068480.0, "grad_norm": 1.9376916591555307, "language_loss": 0.7652694, "learning_rate": 9.173208298423274e-09, "loss": 0.7901994, "num_input_tokens_seen": 174248140, "step": 8070, "time_per_iteration": 2.803413152694702 }, { "auxiliary_loss_clip": 0.01259561, "auxiliary_loss_mlp": 0.00872463, "balance_loss_clip": 1.00757396, "balance_loss_mlp": 1.00041437, "epoch": 0.9704803703481032, "flos": 29572487016480.0, "grad_norm": 1.5927130073513838, "language_loss": 0.7622202, "learning_rate": 9.09883743750961e-09, "loss": 0.78354043, "num_input_tokens_seen": 174271030, "step": 8071, "time_per_iteration": 2.8915481567382812 }, { "auxiliary_loss_clip": 0.01304658, "auxiliary_loss_mlp": 0.01193146, "balance_loss_clip": 1.00657725, "balance_loss_mlp": 1.00012255, "epoch": 0.9706006132387422, "flos": 17380055105280.0, "grad_norm": 1.5238861946204554, "language_loss": 0.83770716, "learning_rate": 9.024768594719124e-09, "loss": 0.8626852, "num_input_tokens_seen": 174289410, "step": 8072, "time_per_iteration": 3.6424717903137207 }, { "auxiliary_loss_clip": 0.01287291, "auxiliary_loss_mlp": 0.01193161, "balance_loss_clip": 1.00673294, "balance_loss_mlp": 1.00013757, "epoch": 0.9707208561293813, "flos": 18186430739040.0, "grad_norm": 1.9303925632713541, "language_loss": 0.72700226, "learning_rate": 8.95100178128816e-09, "loss": 0.75180674, "num_input_tokens_seen": 174308550, "step": 8073, "time_per_iteration": 2.7343528270721436 }, { "auxiliary_loss_clip": 0.01317835, "auxiliary_loss_mlp": 0.01193182, "balance_loss_clip": 1.00748754, "balance_loss_mlp": 1.00015819, "epoch": 0.9708410990200205, "flos": 31248401618880.0, "grad_norm": 1.8705991735040772, "language_loss": 0.6992377, "learning_rate": 8.877537008407321e-09, "loss": 0.72434783, "num_input_tokens_seen": 174328600, "step": 8074, "time_per_iteration": 4.701359272003174 }, { "auxiliary_loss_clip": 0.01313624, "auxiliary_loss_mlp": 0.0119311, "balance_loss_clip": 1.00708139, "balance_loss_mlp": 1.00018167, "epoch": 0.9709613419106595, "flos": 30554463399840.0, "grad_norm": 1.5165861523269824, "language_loss": 0.68605363, "learning_rate": 8.804374287221028e-09, "loss": 0.71112096, "num_input_tokens_seen": 174349835, "step": 8075, "time_per_iteration": 3.7936172485351562 }, { "auxiliary_loss_clip": 0.0131075, "auxiliary_loss_mlp": 0.01193065, "balance_loss_clip": 1.0076251, "balance_loss_mlp": 1.00013697, "epoch": 0.9710815848012986, "flos": 23730176976480.0, "grad_norm": 1.653009818153071, "language_loss": 0.84803891, "learning_rate": 8.731513628827958e-09, "loss": 0.87307703, "num_input_tokens_seen": 174369200, "step": 8076, "time_per_iteration": 2.819793462753296 }, { "auxiliary_loss_clip": 0.01327334, "auxiliary_loss_mlp": 0.01193159, "balance_loss_clip": 1.00728369, "balance_loss_mlp": 1.00013471, "epoch": 0.9712018276919377, "flos": 23761884376800.0, "grad_norm": 1.8045492552976845, "language_loss": 0.82539219, "learning_rate": 8.658955044280825e-09, "loss": 0.85059714, "num_input_tokens_seen": 174388125, "step": 8077, "time_per_iteration": 2.718740701675415 }, { "auxiliary_loss_clip": 0.01325183, "auxiliary_loss_mlp": 0.01193062, "balance_loss_clip": 1.00817013, "balance_loss_mlp": 1.00013423, "epoch": 0.9713220705825768, "flos": 23330994649920.0, "grad_norm": 1.478739428364105, "language_loss": 0.77382642, "learning_rate": 8.586698544587268e-09, "loss": 0.79900885, "num_input_tokens_seen": 174409735, "step": 8078, "time_per_iteration": 2.8360722064971924 }, { "auxiliary_loss_clip": 0.01324757, "auxiliary_loss_mlp": 0.01193168, "balance_loss_clip": 1.00841606, "balance_loss_mlp": 1.00023937, "epoch": 0.9714423134732159, "flos": 22200957770400.0, "grad_norm": 1.7838682174716127, "language_loss": 0.74153972, "learning_rate": 8.514744140707853e-09, "loss": 0.76671898, "num_input_tokens_seen": 174428875, "step": 8079, "time_per_iteration": 2.738555431365967 }, { "auxiliary_loss_clip": 0.01347187, "auxiliary_loss_mlp": 0.01193053, "balance_loss_clip": 1.00776768, "balance_loss_mlp": 1.00012422, "epoch": 0.971562556363855, "flos": 20229928038720.0, "grad_norm": 1.512354102530658, "language_loss": 0.76484942, "learning_rate": 8.443091843558515e-09, "loss": 0.79025185, "num_input_tokens_seen": 174447960, "step": 8080, "time_per_iteration": 2.7488105297088623 }, { "auxiliary_loss_clip": 0.01312608, "auxiliary_loss_mlp": 0.01193003, "balance_loss_clip": 1.00743818, "balance_loss_mlp": 1.00017035, "epoch": 0.9716827992544941, "flos": 24970208460480.0, "grad_norm": 1.9723461984792188, "language_loss": 0.64963925, "learning_rate": 8.37174166400878e-09, "loss": 0.67469537, "num_input_tokens_seen": 174463535, "step": 8081, "time_per_iteration": 2.7743639945983887 }, { "auxiliary_loss_clip": 0.01348058, "auxiliary_loss_mlp": 0.01193187, "balance_loss_clip": 1.0082289, "balance_loss_mlp": 1.00016379, "epoch": 0.9718030421451331, "flos": 24681487746240.0, "grad_norm": 1.8526191102196088, "language_loss": 0.849401, "learning_rate": 8.300693612881992e-09, "loss": 0.87481344, "num_input_tokens_seen": 174483600, "step": 8082, "time_per_iteration": 2.695554733276367 }, { "auxiliary_loss_clip": 0.01323852, "auxiliary_loss_mlp": 0.00872559, "balance_loss_clip": 1.00731874, "balance_loss_mlp": 1.00046074, "epoch": 0.9719232850357723, "flos": 22090711700160.0, "grad_norm": 1.8666233568722497, "language_loss": 0.81211329, "learning_rate": 8.22994770095664e-09, "loss": 0.83407748, "num_input_tokens_seen": 174502175, "step": 8083, "time_per_iteration": 2.739772319793701 }, { "auxiliary_loss_clip": 0.01302142, "auxiliary_loss_mlp": 0.01193113, "balance_loss_clip": 1.00809312, "balance_loss_mlp": 1.00018525, "epoch": 0.9720435279264114, "flos": 23656919087520.0, "grad_norm": 2.117418871549902, "language_loss": 0.75184286, "learning_rate": 8.159503938964585e-09, "loss": 0.77679539, "num_input_tokens_seen": 174519495, "step": 8084, "time_per_iteration": 2.722308874130249 }, { "auxiliary_loss_clip": 0.01293739, "auxiliary_loss_mlp": 0.0119266, "balance_loss_clip": 1.00695932, "balance_loss_mlp": 1.00011289, "epoch": 0.9721637708170504, "flos": 28365923193120.0, "grad_norm": 1.7268553992732525, "language_loss": 0.70594049, "learning_rate": 8.089362337592164e-09, "loss": 0.73080444, "num_input_tokens_seen": 174543120, "step": 8085, "time_per_iteration": 2.8833694458007812 }, { "auxiliary_loss_clip": 0.01301609, "auxiliary_loss_mlp": 0.01193042, "balance_loss_clip": 1.00647402, "balance_loss_mlp": 1.00011373, "epoch": 0.9722840137076896, "flos": 29130820185600.0, "grad_norm": 1.7493395410832615, "language_loss": 0.72168016, "learning_rate": 8.019522907479536e-09, "loss": 0.74662662, "num_input_tokens_seen": 174563480, "step": 8086, "time_per_iteration": 2.7585155963897705 }, { "auxiliary_loss_clip": 0.01336184, "auxiliary_loss_mlp": 0.01193106, "balance_loss_clip": 1.00827289, "balance_loss_mlp": 1.00017738, "epoch": 0.9724042565983286, "flos": 19243964126880.0, "grad_norm": 2.1354328615398748, "language_loss": 0.77478147, "learning_rate": 7.949985659221558e-09, "loss": 0.80007434, "num_input_tokens_seen": 174580745, "step": 8087, "time_per_iteration": 2.756894111633301 }, { "auxiliary_loss_clip": 0.01312859, "auxiliary_loss_mlp": 0.01193175, "balance_loss_clip": 1.00716221, "balance_loss_mlp": 1.0001508, "epoch": 0.9725244994889677, "flos": 23039687430720.0, "grad_norm": 1.8609793996163566, "language_loss": 0.789114, "learning_rate": 7.880750603366904e-09, "loss": 0.81417435, "num_input_tokens_seen": 174599615, "step": 8088, "time_per_iteration": 2.760118007659912 }, { "auxiliary_loss_clip": 0.01301214, "auxiliary_loss_mlp": 0.01193065, "balance_loss_clip": 1.0069232, "balance_loss_mlp": 1.00013697, "epoch": 0.9726447423796069, "flos": 23367479899680.0, "grad_norm": 1.7131677522407034, "language_loss": 0.79380989, "learning_rate": 7.811817750418282e-09, "loss": 0.81875265, "num_input_tokens_seen": 174618375, "step": 8089, "time_per_iteration": 2.8285627365112305 }, { "auxiliary_loss_clip": 0.01278541, "auxiliary_loss_mlp": 0.01193191, "balance_loss_clip": 1.00693607, "balance_loss_mlp": 1.00016725, "epoch": 0.9727649852702459, "flos": 26541660705120.0, "grad_norm": 1.6506695205097963, "language_loss": 0.7998116, "learning_rate": 7.743187110833105e-09, "loss": 0.82452887, "num_input_tokens_seen": 174641135, "step": 8090, "time_per_iteration": 2.8734700679779053 }, { "auxiliary_loss_clip": 0.01316333, "auxiliary_loss_mlp": 0.01193004, "balance_loss_clip": 1.0069418, "balance_loss_mlp": 1.00007558, "epoch": 0.972885228160885, "flos": 20522348892000.0, "grad_norm": 1.5237620438300103, "language_loss": 0.80299979, "learning_rate": 7.674858695022602e-09, "loss": 0.82809317, "num_input_tokens_seen": 174659490, "step": 8091, "time_per_iteration": 2.764315128326416 }, { "auxiliary_loss_clip": 0.01348528, "auxiliary_loss_mlp": 0.01193207, "balance_loss_clip": 1.00796056, "balance_loss_mlp": 1.00018382, "epoch": 0.9730054710515241, "flos": 17566073722080.0, "grad_norm": 4.613210917664077, "language_loss": 0.75541782, "learning_rate": 7.606832513351591e-09, "loss": 0.78083515, "num_input_tokens_seen": 174677440, "step": 8092, "time_per_iteration": 2.6945765018463135 }, { "auxiliary_loss_clip": 0.01315439, "auxiliary_loss_mlp": 0.00871805, "balance_loss_clip": 1.00348186, "balance_loss_mlp": 1.00008714, "epoch": 0.9731257139421632, "flos": 68972043582720.0, "grad_norm": 0.8251307246611669, "language_loss": 0.63960648, "learning_rate": 7.539108576140264e-09, "loss": 0.66147894, "num_input_tokens_seen": 174741550, "step": 8093, "time_per_iteration": 3.3050994873046875 }, { "auxiliary_loss_clip": 0.0126737, "auxiliary_loss_mlp": 0.01193063, "balance_loss_clip": 1.00584412, "balance_loss_mlp": 1.00013471, "epoch": 0.9732459568328022, "flos": 18478851592320.0, "grad_norm": 1.9685291246912335, "language_loss": 0.70366263, "learning_rate": 7.471686893661732e-09, "loss": 0.72826695, "num_input_tokens_seen": 174759845, "step": 8094, "time_per_iteration": 2.851059675216675 }, { "auxiliary_loss_clip": 0.01301401, "auxiliary_loss_mlp": 0.01192879, "balance_loss_clip": 1.00673842, "balance_loss_mlp": 1.00014102, "epoch": 0.9733661997234414, "flos": 20883896411040.0, "grad_norm": 1.6769766265926431, "language_loss": 0.6427238, "learning_rate": 7.4045674761442636e-09, "loss": 0.66766661, "num_input_tokens_seen": 174777175, "step": 8095, "time_per_iteration": 2.7158255577087402 }, { "auxiliary_loss_clip": 0.0134756, "auxiliary_loss_mlp": 0.00872473, "balance_loss_clip": 1.00784707, "balance_loss_mlp": 1.00042343, "epoch": 0.9734864426140805, "flos": 23766805920960.0, "grad_norm": 1.6511732251251177, "language_loss": 0.74476695, "learning_rate": 7.337750333769488e-09, "loss": 0.76696724, "num_input_tokens_seen": 174796980, "step": 8096, "time_per_iteration": 2.7010819911956787 }, { "auxiliary_loss_clip": 0.01325032, "auxiliary_loss_mlp": 0.01193179, "balance_loss_clip": 1.00792587, "balance_loss_mlp": 1.00015545, "epoch": 0.9736066855047195, "flos": 35042436509760.0, "grad_norm": 2.1313702643070105, "language_loss": 0.72599107, "learning_rate": 7.2712354766737425e-09, "loss": 0.75117314, "num_input_tokens_seen": 174817310, "step": 8097, "time_per_iteration": 2.8421168327331543 }, { "auxiliary_loss_clip": 0.01287747, "auxiliary_loss_mlp": 0.01193188, "balance_loss_clip": 1.00800657, "balance_loss_mlp": 1.00016427, "epoch": 0.9737269283953586, "flos": 20410629950880.0, "grad_norm": 1.4695769504292582, "language_loss": 0.80805266, "learning_rate": 7.2050229149469565e-09, "loss": 0.83286202, "num_input_tokens_seen": 174837320, "step": 8098, "time_per_iteration": 3.6784660816192627 }, { "auxiliary_loss_clip": 0.01305102, "auxiliary_loss_mlp": 0.01193096, "balance_loss_clip": 1.0072242, "balance_loss_mlp": 1.00016809, "epoch": 0.9738471712859977, "flos": 28911693144960.0, "grad_norm": 1.656729969384536, "language_loss": 0.63530821, "learning_rate": 7.139112658633984e-09, "loss": 0.66029024, "num_input_tokens_seen": 174857470, "step": 8099, "time_per_iteration": 2.837224245071411 }, { "auxiliary_loss_clip": 0.01282782, "auxiliary_loss_mlp": 0.01192995, "balance_loss_clip": 1.00643384, "balance_loss_mlp": 1.00016189, "epoch": 0.9739674141766368, "flos": 27782338815360.0, "grad_norm": 8.511917084349925, "language_loss": 0.70597696, "learning_rate": 7.073504717733048e-09, "loss": 0.73073471, "num_input_tokens_seen": 174877035, "step": 8100, "time_per_iteration": 3.7556538581848145 }, { "auxiliary_loss_clip": 0.01236434, "auxiliary_loss_mlp": 0.01192267, "balance_loss_clip": 1.00394988, "balance_loss_mlp": 1.00000596, "epoch": 0.9740876570672758, "flos": 68863090764960.0, "grad_norm": 0.733918985751076, "language_loss": 0.57227123, "learning_rate": 7.008199102196855e-09, "loss": 0.59655821, "num_input_tokens_seen": 174938460, "step": 8101, "time_per_iteration": 4.163166284561157 }, { "auxiliary_loss_clip": 0.01268446, "auxiliary_loss_mlp": 0.01192273, "balance_loss_clip": 1.00306916, "balance_loss_mlp": 1.00001204, "epoch": 0.974207899957915, "flos": 58236650699040.0, "grad_norm": 0.7954205289953704, "language_loss": 0.59001994, "learning_rate": 6.9431958219321464e-09, "loss": 0.61462712, "num_input_tokens_seen": 174994625, "step": 8102, "time_per_iteration": 3.290287971496582 }, { "auxiliary_loss_clip": 0.01314744, "auxiliary_loss_mlp": 0.01193142, "balance_loss_clip": 1.00746775, "balance_loss_mlp": 1.00021338, "epoch": 0.9743281428485541, "flos": 22600068249600.0, "grad_norm": 1.485449495595708, "language_loss": 0.77474165, "learning_rate": 6.878494886800146e-09, "loss": 0.79982054, "num_input_tokens_seen": 175015400, "step": 8103, "time_per_iteration": 2.7949752807617188 }, { "auxiliary_loss_clip": 0.01294046, "auxiliary_loss_mlp": 0.01193198, "balance_loss_clip": 1.00774217, "balance_loss_mlp": 1.00017452, "epoch": 0.9744483857391931, "flos": 20008825195680.0, "grad_norm": 1.8479811764836238, "language_loss": 0.76452184, "learning_rate": 6.814096306615669e-09, "loss": 0.78939426, "num_input_tokens_seen": 175033540, "step": 8104, "time_per_iteration": 2.6810407638549805 }, { "auxiliary_loss_clip": 0.01318018, "auxiliary_loss_mlp": 0.01193175, "balance_loss_clip": 1.00693893, "balance_loss_mlp": 1.0001514, "epoch": 0.9745686286298323, "flos": 17675278005600.0, "grad_norm": 2.0076169543864433, "language_loss": 0.6536966, "learning_rate": 6.750000091148011e-09, "loss": 0.67880857, "num_input_tokens_seen": 175050835, "step": 8105, "time_per_iteration": 2.7010066509246826 }, { "auxiliary_loss_clip": 0.01348033, "auxiliary_loss_mlp": 0.01193108, "balance_loss_clip": 1.00812054, "balance_loss_mlp": 1.00017929, "epoch": 0.9746888715204713, "flos": 29460265143840.0, "grad_norm": 1.737031112599599, "language_loss": 0.72474968, "learning_rate": 6.686206250120729e-09, "loss": 0.75016105, "num_input_tokens_seen": 175072330, "step": 8106, "time_per_iteration": 2.737950325012207 }, { "auxiliary_loss_clip": 0.01302498, "auxiliary_loss_mlp": 0.01192994, "balance_loss_clip": 1.00701118, "balance_loss_mlp": 1.00016081, "epoch": 0.9748091144111104, "flos": 18479318600160.0, "grad_norm": 1.79119625685663, "language_loss": 0.74593914, "learning_rate": 6.622714793210749e-09, "loss": 0.77089405, "num_input_tokens_seen": 175091250, "step": 8107, "time_per_iteration": 2.7852537631988525 }, { "auxiliary_loss_clip": 0.01348334, "auxiliary_loss_mlp": 0.01192907, "balance_loss_clip": 1.00784397, "balance_loss_mlp": 1.00016916, "epoch": 0.9749293573017496, "flos": 20665164530880.0, "grad_norm": 1.6750149850655425, "language_loss": 0.78595346, "learning_rate": 6.559525730050364e-09, "loss": 0.8113659, "num_input_tokens_seen": 175111350, "step": 8108, "time_per_iteration": 2.644604206085205 }, { "auxiliary_loss_clip": 0.01291322, "auxiliary_loss_mlp": 0.01192887, "balance_loss_clip": 1.00663173, "balance_loss_mlp": 1.00014901, "epoch": 0.9750496001923886, "flos": 18478600126560.0, "grad_norm": 1.7423892650226382, "language_loss": 0.75668752, "learning_rate": 6.496639070224574e-09, "loss": 0.78152955, "num_input_tokens_seen": 175129835, "step": 8109, "time_per_iteration": 2.8099942207336426 }, { "auxiliary_loss_clip": 0.01335856, "auxiliary_loss_mlp": 0.011931, "balance_loss_clip": 1.00792181, "balance_loss_mlp": 1.00017142, "epoch": 0.9751698430830277, "flos": 19572978000960.0, "grad_norm": 2.182333202524766, "language_loss": 0.83624423, "learning_rate": 6.4340548232739714e-09, "loss": 0.86153376, "num_input_tokens_seen": 175146035, "step": 8110, "time_per_iteration": 2.674304723739624 }, { "auxiliary_loss_clip": 0.01307112, "auxiliary_loss_mlp": 0.01193173, "balance_loss_clip": 1.00776505, "balance_loss_mlp": 1.00014973, "epoch": 0.9752900859736668, "flos": 23550337232640.0, "grad_norm": 2.3263199332737043, "language_loss": 0.79417777, "learning_rate": 6.371772998692071e-09, "loss": 0.81918061, "num_input_tokens_seen": 175165290, "step": 8111, "time_per_iteration": 2.883229970932007 }, { "auxiliary_loss_clip": 0.01299358, "auxiliary_loss_mlp": 0.0119308, "balance_loss_clip": 1.00684285, "balance_loss_mlp": 1.00015211, "epoch": 0.9754103288643059, "flos": 20303221851360.0, "grad_norm": 2.6941064840199926, "language_loss": 0.64774078, "learning_rate": 6.309793605927094e-09, "loss": 0.67266512, "num_input_tokens_seen": 175183610, "step": 8112, "time_per_iteration": 2.7220022678375244 }, { "auxiliary_loss_clip": 0.01307539, "auxiliary_loss_mlp": 0.01192965, "balance_loss_clip": 1.00786662, "balance_loss_mlp": 1.00013256, "epoch": 0.975530571754945, "flos": 19350689676480.0, "grad_norm": 1.655689676431581, "language_loss": 0.80110943, "learning_rate": 6.248116654381297e-09, "loss": 0.82611448, "num_input_tokens_seen": 175202080, "step": 8113, "time_per_iteration": 2.7833008766174316 }, { "auxiliary_loss_clip": 0.01322874, "auxiliary_loss_mlp": 0.01193017, "balance_loss_clip": 1.00772977, "balance_loss_mlp": 1.00018442, "epoch": 0.9756508146455841, "flos": 23583409732800.0, "grad_norm": 1.6708057024411624, "language_loss": 0.72949672, "learning_rate": 6.186742153410751e-09, "loss": 0.75465566, "num_input_tokens_seen": 175221575, "step": 8114, "time_per_iteration": 2.7707207202911377 }, { "auxiliary_loss_clip": 0.01311016, "auxiliary_loss_mlp": 0.01193203, "balance_loss_clip": 1.00789976, "balance_loss_mlp": 1.00017929, "epoch": 0.9757710575362232, "flos": 22966932473280.0, "grad_norm": 1.9921447776924848, "language_loss": 0.87697434, "learning_rate": 6.125670112326453e-09, "loss": 0.90201658, "num_input_tokens_seen": 175240835, "step": 8115, "time_per_iteration": 2.7820560932159424 }, { "auxiliary_loss_clip": 0.01334994, "auxiliary_loss_mlp": 0.01193221, "balance_loss_clip": 1.0076735, "balance_loss_mlp": 1.00019729, "epoch": 0.9758913004268622, "flos": 27966022392960.0, "grad_norm": 1.6286292559581454, "language_loss": 0.69995683, "learning_rate": 6.064900540392548e-09, "loss": 0.72523898, "num_input_tokens_seen": 175262930, "step": 8116, "time_per_iteration": 2.8312904834747314 }, { "auxiliary_loss_clip": 0.01301206, "auxiliary_loss_mlp": 0.01193048, "balance_loss_clip": 1.00690198, "balance_loss_mlp": 1.00011945, "epoch": 0.9760115433175014, "flos": 22200167449440.0, "grad_norm": 1.8460606792439382, "language_loss": 0.78504688, "learning_rate": 6.0044334468278835e-09, "loss": 0.80998945, "num_input_tokens_seen": 175282275, "step": 8117, "time_per_iteration": 2.733245372772217 }, { "auxiliary_loss_clip": 0.01269631, "auxiliary_loss_mlp": 0.01193096, "balance_loss_clip": 1.00648367, "balance_loss_mlp": 1.00016773, "epoch": 0.9761317862081405, "flos": 26250748646400.0, "grad_norm": 1.7564937032064234, "language_loss": 0.71529913, "learning_rate": 5.944268840805345e-09, "loss": 0.73992646, "num_input_tokens_seen": 175303020, "step": 8118, "time_per_iteration": 2.8478662967681885 }, { "auxiliary_loss_clip": 0.01295367, "auxiliary_loss_mlp": 0.01193099, "balance_loss_clip": 1.00708747, "balance_loss_mlp": 1.00017059, "epoch": 0.9762520290987795, "flos": 26575451678880.0, "grad_norm": 1.8979704594441664, "language_loss": 0.64275581, "learning_rate": 5.88440673145163e-09, "loss": 0.66764051, "num_input_tokens_seen": 175324070, "step": 8119, "time_per_iteration": 2.8164851665496826 }, { "auxiliary_loss_clip": 0.01325216, "auxiliary_loss_mlp": 0.0119301, "balance_loss_clip": 1.00730658, "balance_loss_mlp": 1.0001775, "epoch": 0.9763722719894187, "flos": 18005046276960.0, "grad_norm": 2.199666645176423, "language_loss": 0.82336837, "learning_rate": 5.824847127848142e-09, "loss": 0.84855068, "num_input_tokens_seen": 175342595, "step": 8120, "time_per_iteration": 2.7097456455230713 }, { "auxiliary_loss_clip": 0.01270932, "auxiliary_loss_mlp": 0.01193271, "balance_loss_clip": 1.00602424, "balance_loss_mlp": 1.00024688, "epoch": 0.9764925148800577, "flos": 22455671968800.0, "grad_norm": 1.9315445134156288, "language_loss": 0.7895202, "learning_rate": 5.765590039029433e-09, "loss": 0.81416225, "num_input_tokens_seen": 175361915, "step": 8121, "time_per_iteration": 2.793196201324463 }, { "auxiliary_loss_clip": 0.01347707, "auxiliary_loss_mlp": 0.01192988, "balance_loss_clip": 1.00796747, "balance_loss_mlp": 1.00015473, "epoch": 0.9766127577706968, "flos": 36757099553760.0, "grad_norm": 1.6976665144802923, "language_loss": 0.71171772, "learning_rate": 5.706635473985422e-09, "loss": 0.73712462, "num_input_tokens_seen": 175385785, "step": 8122, "time_per_iteration": 2.8645260334014893 }, { "auxiliary_loss_clip": 0.01323425, "auxiliary_loss_mlp": 0.01193113, "balance_loss_clip": 1.00699043, "balance_loss_mlp": 1.00018454, "epoch": 0.976733000661336, "flos": 22309982435520.0, "grad_norm": 1.6651049464214758, "language_loss": 0.85228407, "learning_rate": 5.6479834416591764e-09, "loss": 0.87744945, "num_input_tokens_seen": 175405145, "step": 8123, "time_per_iteration": 3.6875197887420654 }, { "auxiliary_loss_clip": 0.01323207, "auxiliary_loss_mlp": 0.00872636, "balance_loss_clip": 1.00774431, "balance_loss_mlp": 1.0004338, "epoch": 0.976853243551975, "flos": 25810949846880.0, "grad_norm": 2.229291484478115, "language_loss": 0.68578422, "learning_rate": 5.589633950947803e-09, "loss": 0.70774269, "num_input_tokens_seen": 175422645, "step": 8124, "time_per_iteration": 2.691920280456543 }, { "auxiliary_loss_clip": 0.01314582, "auxiliary_loss_mlp": 0.01193197, "balance_loss_clip": 1.00751865, "balance_loss_mlp": 1.00017333, "epoch": 0.9769734864426141, "flos": 21397456031040.0, "grad_norm": 2.1643282014057847, "language_loss": 0.69933027, "learning_rate": 5.5315870107035535e-09, "loss": 0.72440809, "num_input_tokens_seen": 175440695, "step": 8125, "time_per_iteration": 2.728118658065796 }, { "auxiliary_loss_clip": 0.01299916, "auxiliary_loss_mlp": 0.01193097, "balance_loss_clip": 1.00715232, "balance_loss_mlp": 1.00016832, "epoch": 0.9770937293332532, "flos": 13990986253440.0, "grad_norm": 1.7223205148592287, "language_loss": 0.78641719, "learning_rate": 5.473842629731607e-09, "loss": 0.81134737, "num_input_tokens_seen": 175459195, "step": 8126, "time_per_iteration": 4.574214935302734 }, { "auxiliary_loss_clip": 0.01319871, "auxiliary_loss_mlp": 0.00872499, "balance_loss_clip": 1.0076859, "balance_loss_mlp": 1.00049245, "epoch": 0.9772139722238923, "flos": 17932003930080.0, "grad_norm": 2.0516620205091582, "language_loss": 0.77840948, "learning_rate": 5.416400816792066e-09, "loss": 0.80033314, "num_input_tokens_seen": 175476710, "step": 8127, "time_per_iteration": 2.71453595161438 }, { "auxiliary_loss_clip": 0.01347443, "auxiliary_loss_mlp": 0.01193093, "balance_loss_clip": 1.00771475, "balance_loss_mlp": 1.00016487, "epoch": 0.9773342151145313, "flos": 20446181184960.0, "grad_norm": 5.095307919894026, "language_loss": 0.78553092, "learning_rate": 5.359261580598407e-09, "loss": 0.81093627, "num_input_tokens_seen": 175492550, "step": 8128, "time_per_iteration": 3.5658111572265625 }, { "auxiliary_loss_clip": 0.01326543, "auxiliary_loss_mlp": 0.01193131, "balance_loss_clip": 1.00749552, "balance_loss_mlp": 1.00020242, "epoch": 0.9774544580051704, "flos": 11837314730880.0, "grad_norm": 2.2067696299410007, "language_loss": 0.78056324, "learning_rate": 5.302424929819027e-09, "loss": 0.80576003, "num_input_tokens_seen": 175506560, "step": 8129, "time_per_iteration": 2.625281810760498 }, { "auxiliary_loss_clip": 0.01336704, "auxiliary_loss_mlp": 0.01193069, "balance_loss_clip": 1.00751865, "balance_loss_mlp": 1.00014114, "epoch": 0.9775747008958096, "flos": 13479941291040.0, "grad_norm": 3.062222020587076, "language_loss": 0.72894877, "learning_rate": 5.24589087307592e-09, "loss": 0.75424653, "num_input_tokens_seen": 175524180, "step": 8130, "time_per_iteration": 2.682919979095459 }, { "auxiliary_loss_clip": 0.01348766, "auxiliary_loss_mlp": 0.0119317, "balance_loss_clip": 1.00788057, "balance_loss_mlp": 1.00014591, "epoch": 0.9776949437864486, "flos": 59532337468800.0, "grad_norm": 1.3982092703282036, "language_loss": 0.64930618, "learning_rate": 5.189659418944891e-09, "loss": 0.67472553, "num_input_tokens_seen": 175554355, "step": 8131, "time_per_iteration": 3.1185195446014404 }, { "auxiliary_loss_clip": 0.01348213, "auxiliary_loss_mlp": 0.01192898, "balance_loss_clip": 1.00791907, "balance_loss_mlp": 1.00015998, "epoch": 0.9778151866770877, "flos": 21178616379840.0, "grad_norm": 1.8552324337580781, "language_loss": 0.78497064, "learning_rate": 5.133730575956674e-09, "loss": 0.81038177, "num_input_tokens_seen": 175574025, "step": 8132, "time_per_iteration": 2.628188371658325 }, { "auxiliary_loss_clip": 0.01311886, "auxiliary_loss_mlp": 0.01193099, "balance_loss_clip": 1.00724113, "balance_loss_mlp": 1.00017095, "epoch": 0.9779354295677268, "flos": 20886806229120.0, "grad_norm": 1.9302316988373591, "language_loss": 0.72219789, "learning_rate": 5.0781043525953696e-09, "loss": 0.7472477, "num_input_tokens_seen": 175592090, "step": 8133, "time_per_iteration": 2.8266396522521973 }, { "auxiliary_loss_clip": 0.01297967, "auxiliary_loss_mlp": 0.01193175, "balance_loss_clip": 1.00704122, "balance_loss_mlp": 1.00024629, "epoch": 0.9780556724583659, "flos": 23440630017600.0, "grad_norm": 1.764325516863185, "language_loss": 0.74029601, "learning_rate": 5.0227807572995605e-09, "loss": 0.76520741, "num_input_tokens_seen": 175614065, "step": 8134, "time_per_iteration": 2.726468801498413 }, { "auxiliary_loss_clip": 0.01314494, "auxiliary_loss_mlp": 0.0119295, "balance_loss_clip": 1.00719619, "balance_loss_mlp": 1.00011683, "epoch": 0.9781759153490049, "flos": 20923255555200.0, "grad_norm": 1.8678910611265958, "language_loss": 0.67576611, "learning_rate": 4.967759798461646e-09, "loss": 0.70084053, "num_input_tokens_seen": 175632410, "step": 8135, "time_per_iteration": 2.7214839458465576 }, { "auxiliary_loss_clip": 0.01347804, "auxiliary_loss_mlp": 0.01193072, "balance_loss_clip": 1.00795197, "balance_loss_mlp": 1.00014424, "epoch": 0.9782961582396441, "flos": 28293204159360.0, "grad_norm": 1.919480749929684, "language_loss": 0.74866301, "learning_rate": 4.913041484428282e-09, "loss": 0.77407175, "num_input_tokens_seen": 175652885, "step": 8136, "time_per_iteration": 2.66823410987854 }, { "auxiliary_loss_clip": 0.01334419, "auxiliary_loss_mlp": 0.01193086, "balance_loss_clip": 1.00759888, "balance_loss_mlp": 1.00015771, "epoch": 0.9784164011302832, "flos": 25552966593600.0, "grad_norm": 2.9029951599531016, "language_loss": 0.74134731, "learning_rate": 4.858625823500384e-09, "loss": 0.76662242, "num_input_tokens_seen": 175670585, "step": 8137, "time_per_iteration": 2.9032492637634277 }, { "auxiliary_loss_clip": 0.01334666, "auxiliary_loss_mlp": 0.01193199, "balance_loss_clip": 1.00739753, "balance_loss_mlp": 1.00017548, "epoch": 0.9785366440209222, "flos": 29965957477920.0, "grad_norm": 1.821456008727623, "language_loss": 0.7340287, "learning_rate": 4.80451282393246e-09, "loss": 0.75930738, "num_input_tokens_seen": 175690570, "step": 8138, "time_per_iteration": 2.769200086593628 }, { "auxiliary_loss_clip": 0.01294745, "auxiliary_loss_mlp": 0.01193021, "balance_loss_clip": 1.00774419, "balance_loss_mlp": 1.00018787, "epoch": 0.9786568869115614, "flos": 32343605737920.0, "grad_norm": 1.9227721795497321, "language_loss": 0.67377663, "learning_rate": 4.750702493933722e-09, "loss": 0.69865429, "num_input_tokens_seen": 175710455, "step": 8139, "time_per_iteration": 2.792170286178589 }, { "auxiliary_loss_clip": 0.01311398, "auxiliary_loss_mlp": 0.00872494, "balance_loss_clip": 1.0079546, "balance_loss_mlp": 1.00038123, "epoch": 0.9787771298022004, "flos": 23331425734080.0, "grad_norm": 1.9479707233762382, "language_loss": 0.85289317, "learning_rate": 4.697194841666974e-09, "loss": 0.87473214, "num_input_tokens_seen": 175729380, "step": 8140, "time_per_iteration": 2.7731242179870605 }, { "auxiliary_loss_clip": 0.01324885, "auxiliary_loss_mlp": 0.01193206, "balance_loss_clip": 1.00723481, "balance_loss_mlp": 1.00018239, "epoch": 0.9788973726928395, "flos": 21468558499200.0, "grad_norm": 1.854064766479596, "language_loss": 0.81744945, "learning_rate": 4.6439898752492764e-09, "loss": 0.84263039, "num_input_tokens_seen": 175749520, "step": 8141, "time_per_iteration": 2.697253704071045 }, { "auxiliary_loss_clip": 0.01300681, "auxiliary_loss_mlp": 0.00871871, "balance_loss_clip": 1.00333309, "balance_loss_mlp": 1.00013089, "epoch": 0.9790176155834787, "flos": 68897492441280.0, "grad_norm": 0.7453258061342384, "language_loss": 0.63694131, "learning_rate": 4.591087602751731e-09, "loss": 0.65866679, "num_input_tokens_seen": 175811380, "step": 8142, "time_per_iteration": 3.3552794456481934 }, { "auxiliary_loss_clip": 0.01326688, "auxiliary_loss_mlp": 0.01192784, "balance_loss_clip": 1.0072093, "balance_loss_mlp": 1.00014198, "epoch": 0.9791378584741177, "flos": 21430887768000.0, "grad_norm": 1.6475750811992873, "language_loss": 0.71851957, "learning_rate": 4.538488032199916e-09, "loss": 0.74371433, "num_input_tokens_seen": 175829480, "step": 8143, "time_per_iteration": 2.721933364868164 }, { "auxiliary_loss_clip": 0.01336549, "auxiliary_loss_mlp": 0.0119319, "balance_loss_clip": 1.00761962, "balance_loss_mlp": 1.00016618, "epoch": 0.9792581013647568, "flos": 20153041858080.0, "grad_norm": 1.9193742117620334, "language_loss": 0.68667436, "learning_rate": 4.486191171572784e-09, "loss": 0.71197176, "num_input_tokens_seen": 175846750, "step": 8144, "time_per_iteration": 2.7263779640197754 }, { "auxiliary_loss_clip": 0.01334913, "auxiliary_loss_mlp": 0.01193085, "balance_loss_clip": 1.00792217, "balance_loss_mlp": 1.0001564, "epoch": 0.9793783442553959, "flos": 23728201174080.0, "grad_norm": 1.5347358916513631, "language_loss": 0.77541745, "learning_rate": 4.434197028803766e-09, "loss": 0.80069745, "num_input_tokens_seen": 175865975, "step": 8145, "time_per_iteration": 2.7200560569763184 }, { "auxiliary_loss_clip": 0.01303688, "auxiliary_loss_mlp": 0.01193097, "balance_loss_clip": 1.00691462, "balance_loss_mlp": 1.0001688, "epoch": 0.979498587146035, "flos": 23038753415040.0, "grad_norm": 1.9921259865613516, "language_loss": 0.81839794, "learning_rate": 4.3825056117805514e-09, "loss": 0.84336579, "num_input_tokens_seen": 175881860, "step": 8146, "time_per_iteration": 2.779805898666382 }, { "auxiliary_loss_clip": 0.01348634, "auxiliary_loss_mlp": 0.0119317, "balance_loss_clip": 1.00766325, "balance_loss_mlp": 1.00014591, "epoch": 0.979618830036674, "flos": 14318850569760.0, "grad_norm": 2.197420078609911, "language_loss": 0.79216766, "learning_rate": 4.331116928344425e-09, "loss": 0.81758571, "num_input_tokens_seen": 175898175, "step": 8147, "time_per_iteration": 2.6250338554382324 }, { "auxiliary_loss_clip": 0.01318469, "auxiliary_loss_mlp": 0.00872578, "balance_loss_clip": 1.00749409, "balance_loss_mlp": 1.00046635, "epoch": 0.9797390729273132, "flos": 16727523680160.0, "grad_norm": 2.0662153464809347, "language_loss": 0.62460291, "learning_rate": 4.28003098629115e-09, "loss": 0.64651334, "num_input_tokens_seen": 175914310, "step": 8148, "time_per_iteration": 2.7722721099853516 }, { "auxiliary_loss_clip": 0.01309642, "auxiliary_loss_mlp": 0.01193025, "balance_loss_clip": 1.00725842, "balance_loss_mlp": 1.00019205, "epoch": 0.9798593158179523, "flos": 24532672852800.0, "grad_norm": 12.23286381799526, "language_loss": 0.7842856, "learning_rate": 4.229247793370305e-09, "loss": 0.80931222, "num_input_tokens_seen": 175933435, "step": 8149, "time_per_iteration": 2.8335280418395996 }, { "auxiliary_loss_clip": 0.01348798, "auxiliary_loss_mlp": 0.0119301, "balance_loss_clip": 1.00801337, "balance_loss_mlp": 1.00017691, "epoch": 0.9799795587085913, "flos": 27308820889440.0, "grad_norm": 1.9756173999956725, "language_loss": 0.70306426, "learning_rate": 4.178767357285951e-09, "loss": 0.72848231, "num_input_tokens_seen": 175955065, "step": 8150, "time_per_iteration": 3.625020980834961 }, { "auxiliary_loss_clip": 0.01329775, "auxiliary_loss_mlp": 0.00872426, "balance_loss_clip": 1.00773191, "balance_loss_mlp": 1.00047803, "epoch": 0.9800998015992305, "flos": 26286587269920.0, "grad_norm": 2.0250744891352044, "language_loss": 0.71505719, "learning_rate": 4.128589685695516e-09, "loss": 0.73707914, "num_input_tokens_seen": 175975490, "step": 8151, "time_per_iteration": 2.7376787662506104 }, { "auxiliary_loss_clip": 0.01348964, "auxiliary_loss_mlp": 0.0119316, "balance_loss_clip": 1.0081315, "balance_loss_mlp": 1.00013661, "epoch": 0.9802200444898695, "flos": 16723643922720.0, "grad_norm": 2.068927860688158, "language_loss": 0.84511757, "learning_rate": 4.078714786211135e-09, "loss": 0.87053877, "num_input_tokens_seen": 175991340, "step": 8152, "time_per_iteration": 3.6041624546051025 }, { "auxiliary_loss_clip": 0.0132535, "auxiliary_loss_mlp": 0.01192963, "balance_loss_clip": 1.00705886, "balance_loss_mlp": 1.00012994, "epoch": 0.9803402873805086, "flos": 24900471092160.0, "grad_norm": 1.7253647011639102, "language_loss": 0.767434, "learning_rate": 4.029142666398977e-09, "loss": 0.7926172, "num_input_tokens_seen": 176011505, "step": 8153, "time_per_iteration": 3.6402997970581055 }, { "auxiliary_loss_clip": 0.01347254, "auxiliary_loss_mlp": 0.01192984, "balance_loss_clip": 1.00766635, "balance_loss_mlp": 1.00015068, "epoch": 0.9804605302711478, "flos": 22564948099680.0, "grad_norm": 1.7113096111162356, "language_loss": 0.79745412, "learning_rate": 3.979873333778805e-09, "loss": 0.82285655, "num_input_tokens_seen": 176029680, "step": 8154, "time_per_iteration": 2.6898810863494873 }, { "auxiliary_loss_clip": 0.01311991, "auxiliary_loss_mlp": 0.01193058, "balance_loss_clip": 1.00703859, "balance_loss_mlp": 1.00013006, "epoch": 0.9805807731617868, "flos": 38905382524320.0, "grad_norm": 1.7445837617770643, "language_loss": 0.73617446, "learning_rate": 3.930906795824862e-09, "loss": 0.76122499, "num_input_tokens_seen": 176050355, "step": 8155, "time_per_iteration": 2.9236655235290527 }, { "auxiliary_loss_clip": 0.01325409, "auxiliary_loss_mlp": 0.01193082, "balance_loss_clip": 1.00712609, "balance_loss_mlp": 1.00015366, "epoch": 0.9807010160524259, "flos": 17822009325600.0, "grad_norm": 1.9134076270492797, "language_loss": 0.76892322, "learning_rate": 3.882243059965207e-09, "loss": 0.79410809, "num_input_tokens_seen": 176068070, "step": 8156, "time_per_iteration": 2.737201690673828 }, { "auxiliary_loss_clip": 0.01336198, "auxiliary_loss_mlp": 0.01193216, "balance_loss_clip": 1.00777578, "balance_loss_mlp": 1.00019288, "epoch": 0.980821258943065, "flos": 13552983637920.0, "grad_norm": 2.445444576047278, "language_loss": 0.65711498, "learning_rate": 3.833882133582156e-09, "loss": 0.68240911, "num_input_tokens_seen": 176083730, "step": 8157, "time_per_iteration": 2.714611530303955 }, { "auxiliary_loss_clip": 0.01335603, "auxiliary_loss_mlp": 0.01193079, "balance_loss_clip": 1.00788939, "balance_loss_mlp": 1.00015068, "epoch": 0.9809415018337041, "flos": 21689805036960.0, "grad_norm": 1.600337352677251, "language_loss": 0.78297031, "learning_rate": 3.785824024012285e-09, "loss": 0.8082571, "num_input_tokens_seen": 176102730, "step": 8158, "time_per_iteration": 2.6893398761749268 }, { "auxiliary_loss_clip": 0.01296599, "auxiliary_loss_mlp": 0.0119299, "balance_loss_clip": 1.00665808, "balance_loss_mlp": 1.00015676, "epoch": 0.9810617447243432, "flos": 23294868636960.0, "grad_norm": 1.4637318645570634, "language_loss": 0.78442168, "learning_rate": 3.738068738545541e-09, "loss": 0.80931759, "num_input_tokens_seen": 176121815, "step": 8159, "time_per_iteration": 2.7550456523895264 }, { "auxiliary_loss_clip": 0.01335987, "auxiliary_loss_mlp": 0.0119311, "balance_loss_clip": 1.00800133, "balance_loss_mlp": 1.0001812, "epoch": 0.9811819876149822, "flos": 18332048424960.0, "grad_norm": 2.406859217149522, "language_loss": 0.78013754, "learning_rate": 3.6906162844265733e-09, "loss": 0.8054285, "num_input_tokens_seen": 176138900, "step": 8160, "time_per_iteration": 2.665985345840454 }, { "auxiliary_loss_clip": 0.01313125, "auxiliary_loss_mlp": 0.01193058, "balance_loss_clip": 1.00759029, "balance_loss_mlp": 1.00012946, "epoch": 0.9813022305056214, "flos": 22601972204640.0, "grad_norm": 2.1267663774899597, "language_loss": 0.70777929, "learning_rate": 3.643466668853845e-09, "loss": 0.73284113, "num_input_tokens_seen": 176156925, "step": 8161, "time_per_iteration": 2.772449016571045 }, { "auxiliary_loss_clip": 0.01322917, "auxiliary_loss_mlp": 0.01192994, "balance_loss_clip": 1.00827956, "balance_loss_mlp": 1.00016081, "epoch": 0.9814224733962604, "flos": 25413348162240.0, "grad_norm": 1.8249515323781311, "language_loss": 0.75228804, "learning_rate": 3.59661989898008e-09, "loss": 0.7774471, "num_input_tokens_seen": 176177980, "step": 8162, "time_per_iteration": 2.7688405513763428 }, { "auxiliary_loss_clip": 0.01275091, "auxiliary_loss_mlp": 0.01192957, "balance_loss_clip": 1.00628018, "balance_loss_mlp": 1.00012422, "epoch": 0.9815427162868995, "flos": 25007196641760.0, "grad_norm": 1.9281626109539238, "language_loss": 0.76626027, "learning_rate": 3.5500759819115934e-09, "loss": 0.79094082, "num_input_tokens_seen": 176198345, "step": 8163, "time_per_iteration": 2.8003628253936768 }, { "auxiliary_loss_clip": 0.01347957, "auxiliary_loss_mlp": 0.0119308, "balance_loss_clip": 1.00796008, "balance_loss_mlp": 1.00015211, "epoch": 0.9816629591775387, "flos": 20662613949600.0, "grad_norm": 1.951338773177117, "language_loss": 0.81362909, "learning_rate": 3.5038349247094034e-09, "loss": 0.83903944, "num_input_tokens_seen": 176215605, "step": 8164, "time_per_iteration": 2.6229984760284424 }, { "auxiliary_loss_clip": 0.01318279, "auxiliary_loss_mlp": 0.01193214, "balance_loss_clip": 1.0070591, "balance_loss_mlp": 1.00019002, "epoch": 0.9817832020681777, "flos": 17712230263200.0, "grad_norm": 2.5393020558553423, "language_loss": 0.77178967, "learning_rate": 3.4578967343878994e-09, "loss": 0.79690456, "num_input_tokens_seen": 176231810, "step": 8165, "time_per_iteration": 2.729719877243042 }, { "auxiliary_loss_clip": 0.01306254, "auxiliary_loss_mlp": 0.01193147, "balance_loss_clip": 1.00676298, "balance_loss_mlp": 1.00012279, "epoch": 0.9819034449588168, "flos": 22530043491840.0, "grad_norm": 1.7116686078661418, "language_loss": 0.80866027, "learning_rate": 3.4122614179161733e-09, "loss": 0.83365428, "num_input_tokens_seen": 176251770, "step": 8166, "time_per_iteration": 2.7232582569122314 }, { "auxiliary_loss_clip": 0.01299982, "auxiliary_loss_mlp": 0.0119308, "balance_loss_clip": 1.00784218, "balance_loss_mlp": 1.00015163, "epoch": 0.9820236878494559, "flos": 20011232082240.0, "grad_norm": 1.7203475464745845, "language_loss": 0.78195924, "learning_rate": 3.36692898221691e-09, "loss": 0.80688989, "num_input_tokens_seen": 176270135, "step": 8167, "time_per_iteration": 2.9954349994659424 }, { "auxiliary_loss_clip": 0.01335548, "auxiliary_loss_mlp": 0.0119305, "balance_loss_clip": 1.00801635, "balance_loss_mlp": 1.00012183, "epoch": 0.982143930740095, "flos": 18807326611200.0, "grad_norm": 1.6495627831803343, "language_loss": 0.73306954, "learning_rate": 3.3218994341668305e-09, "loss": 0.7583555, "num_input_tokens_seen": 176289065, "step": 8168, "time_per_iteration": 2.755990982055664 }, { "auxiliary_loss_clip": 0.0134735, "auxiliary_loss_mlp": 0.01193073, "balance_loss_clip": 1.00832105, "balance_loss_mlp": 1.00014472, "epoch": 0.982264173630734, "flos": 26578038183840.0, "grad_norm": 2.3370416204230433, "language_loss": 0.75305855, "learning_rate": 3.2771727805971373e-09, "loss": 0.77846277, "num_input_tokens_seen": 176310450, "step": 8169, "time_per_iteration": 2.7630162239074707 }, { "auxiliary_loss_clip": 0.01286012, "auxiliary_loss_mlp": 0.01193168, "balance_loss_clip": 1.00690508, "balance_loss_mlp": 1.00014424, "epoch": 0.9823844165213732, "flos": 22014472145760.0, "grad_norm": 2.0413295076722, "language_loss": 0.76868308, "learning_rate": 3.232749028292847e-09, "loss": 0.79347479, "num_input_tokens_seen": 176327415, "step": 8170, "time_per_iteration": 2.8518388271331787 }, { "auxiliary_loss_clip": 0.01348844, "auxiliary_loss_mlp": 0.01192879, "balance_loss_clip": 1.0079304, "balance_loss_mlp": 1.00014114, "epoch": 0.9825046594120123, "flos": 21908177680320.0, "grad_norm": 1.6968807197869513, "language_loss": 0.88313872, "learning_rate": 3.188628183992792e-09, "loss": 0.90855592, "num_input_tokens_seen": 176347680, "step": 8171, "time_per_iteration": 2.723768949508667 }, { "auxiliary_loss_clip": 0.01303717, "auxiliary_loss_mlp": 0.01192273, "balance_loss_clip": 1.00348234, "balance_loss_mlp": 1.00001228, "epoch": 0.9826249023026513, "flos": 59494644023040.0, "grad_norm": 0.7372675392277094, "language_loss": 0.62571692, "learning_rate": 3.1448102543902844e-09, "loss": 0.65067685, "num_input_tokens_seen": 176411595, "step": 8172, "time_per_iteration": 3.281357526779175 }, { "auxiliary_loss_clip": 0.01315988, "auxiliary_loss_mlp": 0.01192891, "balance_loss_clip": 1.00750279, "balance_loss_mlp": 1.00015306, "epoch": 0.9827451451932905, "flos": 16071040650240.0, "grad_norm": 1.8374600698069568, "language_loss": 0.67342734, "learning_rate": 3.1012952461324515e-09, "loss": 0.69851607, "num_input_tokens_seen": 176430570, "step": 8173, "time_per_iteration": 2.690239906311035 }, { "auxiliary_loss_clip": 0.0132282, "auxiliary_loss_mlp": 0.01192969, "balance_loss_clip": 1.00734973, "balance_loss_mlp": 1.00013614, "epoch": 0.9828653880839295, "flos": 20262784996800.0, "grad_norm": 3.1590735899680378, "language_loss": 0.73602664, "learning_rate": 3.0580831658204575e-09, "loss": 0.76118451, "num_input_tokens_seen": 176448150, "step": 8174, "time_per_iteration": 2.762669086456299 }, { "auxiliary_loss_clip": 0.01324464, "auxiliary_loss_mlp": 0.01193083, "balance_loss_clip": 1.00730658, "balance_loss_mlp": 1.00015461, "epoch": 0.9829856309745686, "flos": 21616151987520.0, "grad_norm": 1.5475018067476165, "language_loss": 0.77898955, "learning_rate": 3.015174020009281e-09, "loss": 0.80416501, "num_input_tokens_seen": 176467475, "step": 8175, "time_per_iteration": 2.753397226333618 }, { "auxiliary_loss_clip": 0.01307116, "auxiliary_loss_mlp": 0.01192968, "balance_loss_clip": 1.0077256, "balance_loss_mlp": 1.00013518, "epoch": 0.9831058738652078, "flos": 23764219416000.0, "grad_norm": 1.9787707948523119, "language_loss": 0.75083542, "learning_rate": 2.9725678152086043e-09, "loss": 0.77583623, "num_input_tokens_seen": 176486045, "step": 8176, "time_per_iteration": 3.7875397205352783 }, { "auxiliary_loss_clip": 0.01309315, "auxiliary_loss_mlp": 0.01192977, "balance_loss_clip": 1.00719595, "balance_loss_mlp": 1.00014424, "epoch": 0.9832261167558468, "flos": 11320917140160.0, "grad_norm": 2.5314321084928983, "language_loss": 0.82336164, "learning_rate": 2.930264557881257e-09, "loss": 0.8483845, "num_input_tokens_seen": 176501230, "step": 8177, "time_per_iteration": 2.779709815979004 }, { "auxiliary_loss_clip": 0.01315705, "auxiliary_loss_mlp": 0.01192271, "balance_loss_clip": 1.00343657, "balance_loss_mlp": 1.00001049, "epoch": 0.9833463596464859, "flos": 60000336357120.0, "grad_norm": 0.8397673595747853, "language_loss": 0.58188272, "learning_rate": 2.8882642544452163e-09, "loss": 0.60696244, "num_input_tokens_seen": 176565955, "step": 8178, "time_per_iteration": 4.154119491577148 }, { "auxiliary_loss_clip": 0.0131116, "auxiliary_loss_mlp": 0.01192996, "balance_loss_clip": 1.00722003, "balance_loss_mlp": 1.00016284, "epoch": 0.983466602537125, "flos": 13626708534720.0, "grad_norm": 2.9089490023384914, "language_loss": 0.74532533, "learning_rate": 2.8465669112716083e-09, "loss": 0.77036691, "num_input_tokens_seen": 176583480, "step": 8179, "time_per_iteration": 3.682330846786499 }, { "auxiliary_loss_clip": 0.01336544, "auxiliary_loss_mlp": 0.00872582, "balance_loss_clip": 1.00819969, "balance_loss_mlp": 1.00054109, "epoch": 0.9835868454277641, "flos": 22926854855520.0, "grad_norm": 5.242019889608141, "language_loss": 0.76411909, "learning_rate": 2.8051725346858177e-09, "loss": 0.7862103, "num_input_tokens_seen": 176603740, "step": 8180, "time_per_iteration": 3.625067710876465 }, { "auxiliary_loss_clip": 0.01349186, "auxiliary_loss_mlp": 0.01193115, "balance_loss_clip": 1.00788498, "balance_loss_mlp": 1.00018692, "epoch": 0.9837070883184031, "flos": 27673421921280.0, "grad_norm": 3.5488537406725165, "language_loss": 0.70772803, "learning_rate": 2.7640811309674883e-09, "loss": 0.73315102, "num_input_tokens_seen": 176623240, "step": 8181, "time_per_iteration": 2.773392677307129 }, { "auxiliary_loss_clip": 0.01290814, "auxiliary_loss_mlp": 0.01193007, "balance_loss_clip": 1.00710034, "balance_loss_mlp": 1.00017452, "epoch": 0.9838273312090423, "flos": 29241964347840.0, "grad_norm": 1.5641650288388016, "language_loss": 0.80841893, "learning_rate": 2.7232927063498557e-09, "loss": 0.83325714, "num_input_tokens_seen": 176643615, "step": 8182, "time_per_iteration": 2.9054808616638184 }, { "auxiliary_loss_clip": 0.01326714, "auxiliary_loss_mlp": 0.0119318, "balance_loss_clip": 1.00711596, "balance_loss_mlp": 1.00015688, "epoch": 0.9839475740996814, "flos": 40110222011040.0, "grad_norm": 1.8284137058678598, "language_loss": 0.69064075, "learning_rate": 2.682807267020859e-09, "loss": 0.71583974, "num_input_tokens_seen": 176666375, "step": 8183, "time_per_iteration": 2.9036173820495605 }, { "auxiliary_loss_clip": 0.01324568, "auxiliary_loss_mlp": 0.01193082, "balance_loss_clip": 1.00721169, "balance_loss_mlp": 1.00015354, "epoch": 0.9840678169903204, "flos": 24169400997120.0, "grad_norm": 1.6185624058388446, "language_loss": 0.62282193, "learning_rate": 2.642624819121808e-09, "loss": 0.64799845, "num_input_tokens_seen": 176686525, "step": 8184, "time_per_iteration": 2.7293505668640137 }, { "auxiliary_loss_clip": 0.01305018, "auxiliary_loss_mlp": 0.01192978, "balance_loss_clip": 1.00654984, "balance_loss_mlp": 1.0001446, "epoch": 0.9841880598809596, "flos": 14684493388320.0, "grad_norm": 2.034035891376712, "language_loss": 0.61813945, "learning_rate": 2.6027453687487154e-09, "loss": 0.64311934, "num_input_tokens_seen": 176703615, "step": 8185, "time_per_iteration": 2.7591986656188965 }, { "auxiliary_loss_clip": 0.01304328, "auxiliary_loss_mlp": 0.01193188, "balance_loss_clip": 1.00677514, "balance_loss_mlp": 1.00016475, "epoch": 0.9843083027715986, "flos": 22344779272320.0, "grad_norm": 2.221969368097397, "language_loss": 0.537489, "learning_rate": 2.5631689219509643e-09, "loss": 0.56246418, "num_input_tokens_seen": 176722295, "step": 8186, "time_per_iteration": 2.782470941543579 }, { "auxiliary_loss_clip": 0.01296591, "auxiliary_loss_mlp": 0.01192787, "balance_loss_clip": 1.00735676, "balance_loss_mlp": 1.00014448, "epoch": 0.9844285456622377, "flos": 21800122954560.0, "grad_norm": 1.581851662591011, "language_loss": 0.83225679, "learning_rate": 2.523895484732197e-09, "loss": 0.85715055, "num_input_tokens_seen": 176741750, "step": 8187, "time_per_iteration": 2.777181625366211 }, { "auxiliary_loss_clip": 0.01336862, "auxiliary_loss_mlp": 0.01193237, "balance_loss_clip": 1.00782228, "balance_loss_mlp": 1.00021386, "epoch": 0.9845487885528769, "flos": 18035388577440.0, "grad_norm": 1.8089089395124303, "language_loss": 0.7504667, "learning_rate": 2.4849250630505357e-09, "loss": 0.77576768, "num_input_tokens_seen": 176759995, "step": 8188, "time_per_iteration": 2.8911688327789307 }, { "auxiliary_loss_clip": 0.01229947, "auxiliary_loss_mlp": 0.0119304, "balance_loss_clip": 1.00641549, "balance_loss_mlp": 1.00011182, "epoch": 0.9846690314435159, "flos": 25228622797920.0, "grad_norm": 1.7831931873905917, "language_loss": 0.73355913, "learning_rate": 2.4462576628172528e-09, "loss": 0.75778902, "num_input_tokens_seen": 176778625, "step": 8189, "time_per_iteration": 2.9190430641174316 }, { "auxiliary_loss_clip": 0.01322386, "auxiliary_loss_mlp": 0.01193071, "balance_loss_clip": 1.00739861, "balance_loss_mlp": 1.00014305, "epoch": 0.984789274334155, "flos": 18552181328640.0, "grad_norm": 1.8347397028550971, "language_loss": 0.74075061, "learning_rate": 2.407893289898766e-09, "loss": 0.76590514, "num_input_tokens_seen": 176797655, "step": 8190, "time_per_iteration": 2.822592258453369 }, { "auxiliary_loss_clip": 0.01292966, "auxiliary_loss_mlp": 0.01193155, "balance_loss_clip": 1.00650454, "balance_loss_mlp": 1.00013089, "epoch": 0.984909517224794, "flos": 27345449833920.0, "grad_norm": 1.8314940037727119, "language_loss": 0.83963931, "learning_rate": 2.3698319501144202e-09, "loss": 0.86450052, "num_input_tokens_seen": 176818640, "step": 8191, "time_per_iteration": 2.789881944656372 }, { "auxiliary_loss_clip": 0.01320627, "auxiliary_loss_mlp": 0.01193199, "balance_loss_clip": 1.0079751, "balance_loss_mlp": 1.00017524, "epoch": 0.9850297601154332, "flos": 18734068722240.0, "grad_norm": 1.5036885238611257, "language_loss": 0.73137057, "learning_rate": 2.3320736492382644e-09, "loss": 0.75650883, "num_input_tokens_seen": 176837475, "step": 8192, "time_per_iteration": 2.730266809463501 }, { "auxiliary_loss_clip": 0.01346868, "auxiliary_loss_mlp": 0.01192831, "balance_loss_clip": 1.00774097, "balance_loss_mlp": 1.00018859, "epoch": 0.9851500030060723, "flos": 22308258098880.0, "grad_norm": 1.6778123657045603, "language_loss": 0.67910594, "learning_rate": 2.29461839299816e-09, "loss": 0.70450294, "num_input_tokens_seen": 176857190, "step": 8193, "time_per_iteration": 2.719755172729492 }, { "auxiliary_loss_clip": 0.0128365, "auxiliary_loss_mlp": 0.01193006, "balance_loss_clip": 1.00778067, "balance_loss_mlp": 1.00017321, "epoch": 0.9852702458967113, "flos": 26353702209600.0, "grad_norm": 1.5399924176170812, "language_loss": 0.80043042, "learning_rate": 2.257466187076229e-09, "loss": 0.82519698, "num_input_tokens_seen": 176876395, "step": 8194, "time_per_iteration": 2.826942205429077 }, { "auxiliary_loss_clip": 0.01336752, "auxiliary_loss_mlp": 0.00872505, "balance_loss_clip": 1.00796735, "balance_loss_mlp": 1.00041151, "epoch": 0.9853904887873505, "flos": 20883609021600.0, "grad_norm": 1.7780650415437993, "language_loss": 0.71274376, "learning_rate": 2.2206170371081854e-09, "loss": 0.73483634, "num_input_tokens_seen": 176894980, "step": 8195, "time_per_iteration": 2.7402865886688232 }, { "auxiliary_loss_clip": 0.01324889, "auxiliary_loss_mlp": 0.01192976, "balance_loss_clip": 1.00800276, "balance_loss_mlp": 1.00014329, "epoch": 0.9855107316779895, "flos": 25263455558400.0, "grad_norm": 1.5435772611854819, "language_loss": 0.84703702, "learning_rate": 2.1840709486842247e-09, "loss": 0.87221563, "num_input_tokens_seen": 176914600, "step": 8196, "time_per_iteration": 2.7373125553131104 }, { "auxiliary_loss_clip": 0.01310688, "auxiliary_loss_mlp": 0.01193189, "balance_loss_clip": 1.00739253, "balance_loss_mlp": 1.00016546, "epoch": 0.9856309745686286, "flos": 19062112656960.0, "grad_norm": 1.9262467021712582, "language_loss": 0.78813452, "learning_rate": 2.1478279273481335e-09, "loss": 0.81317329, "num_input_tokens_seen": 176933085, "step": 8197, "time_per_iteration": 2.7833547592163086 }, { "auxiliary_loss_clip": 0.013249, "auxiliary_loss_mlp": 0.01193023, "balance_loss_clip": 1.00786996, "balance_loss_mlp": 1.00019002, "epoch": 0.9857512174592677, "flos": 34130772273600.0, "grad_norm": 2.152313208130392, "language_loss": 0.80288219, "learning_rate": 2.1118879785981815e-09, "loss": 0.8280614, "num_input_tokens_seen": 176953225, "step": 8198, "time_per_iteration": 2.842956066131592 }, { "auxiliary_loss_clip": 0.01308462, "auxiliary_loss_mlp": 0.01192855, "balance_loss_clip": 1.00710297, "balance_loss_mlp": 1.00011742, "epoch": 0.9858714603499068, "flos": 25994705271840.0, "grad_norm": 1.7253613771174658, "language_loss": 0.79326761, "learning_rate": 2.0762511078862288e-09, "loss": 0.81828082, "num_input_tokens_seen": 176973570, "step": 8199, "time_per_iteration": 2.8989012241363525 }, { "auxiliary_loss_clip": 0.01306743, "auxiliary_loss_mlp": 0.01193205, "balance_loss_clip": 1.00736594, "balance_loss_mlp": 1.0001812, "epoch": 0.9859917032405459, "flos": 23696242308000.0, "grad_norm": 4.419729474561171, "language_loss": 0.65241623, "learning_rate": 2.0409173206186183e-09, "loss": 0.67741573, "num_input_tokens_seen": 176992810, "step": 8200, "time_per_iteration": 2.820096015930176 }, { "auxiliary_loss_clip": 0.01285113, "auxiliary_loss_mlp": 0.0119299, "balance_loss_clip": 1.00736952, "balance_loss_mlp": 1.00015676, "epoch": 0.986111946131185, "flos": 19938297506400.0, "grad_norm": 1.75980101132779, "language_loss": 0.86815959, "learning_rate": 2.0058866221550617e-09, "loss": 0.89294064, "num_input_tokens_seen": 177011050, "step": 8201, "time_per_iteration": 2.807339906692505 }, { "auxiliary_loss_clip": 0.01347931, "auxiliary_loss_mlp": 0.01193181, "balance_loss_clip": 1.00740957, "balance_loss_mlp": 1.00015688, "epoch": 0.9862321890218241, "flos": 19828841757120.0, "grad_norm": 1.8425733374855724, "language_loss": 0.74938226, "learning_rate": 1.971159017809976e-09, "loss": 0.77479339, "num_input_tokens_seen": 177029340, "step": 8202, "time_per_iteration": 3.690279722213745 }, { "auxiliary_loss_clip": 0.01323428, "auxiliary_loss_mlp": 0.01193207, "balance_loss_clip": 1.00731158, "balance_loss_mlp": 1.00018311, "epoch": 0.9863524319124631, "flos": 21652062458400.0, "grad_norm": 2.153105493190416, "language_loss": 0.77681756, "learning_rate": 1.93673451285159e-09, "loss": 0.80198383, "num_input_tokens_seen": 177048390, "step": 8203, "time_per_iteration": 2.762406826019287 }, { "auxiliary_loss_clip": 0.01291785, "auxiliary_loss_mlp": 0.0119227, "balance_loss_clip": 1.00353229, "balance_loss_mlp": 1.00000978, "epoch": 0.9864726748031023, "flos": 52770006184320.0, "grad_norm": 0.7339026504185374, "language_loss": 0.56508207, "learning_rate": 1.9026131125019495e-09, "loss": 0.58992267, "num_input_tokens_seen": 177105760, "step": 8204, "time_per_iteration": 4.1103901863098145 }, { "auxiliary_loss_clip": 0.01321424, "auxiliary_loss_mlp": 0.01192854, "balance_loss_clip": 1.00757217, "balance_loss_mlp": 1.00011635, "epoch": 0.9865929176937414, "flos": 23364641928960.0, "grad_norm": 1.7353111013978177, "language_loss": 0.86764503, "learning_rate": 1.8687948219371363e-09, "loss": 0.89278781, "num_input_tokens_seen": 177124985, "step": 8205, "time_per_iteration": 2.7449655532836914 }, { "auxiliary_loss_clip": 0.01349392, "auxiliary_loss_mlp": 0.0119317, "balance_loss_clip": 1.00746059, "balance_loss_mlp": 1.00014663, "epoch": 0.9867131605843804, "flos": 21616690842720.0, "grad_norm": 1.8622136616586866, "language_loss": 0.88567537, "learning_rate": 1.835279646287491e-09, "loss": 0.91110104, "num_input_tokens_seen": 177142995, "step": 8206, "time_per_iteration": 3.5945935249328613 }, { "auxiliary_loss_clip": 0.01337568, "auxiliary_loss_mlp": 0.01193033, "balance_loss_clip": 1.00808597, "balance_loss_mlp": 1.00020051, "epoch": 0.9868334034750196, "flos": 22271413612320.0, "grad_norm": 1.7320577854777965, "language_loss": 0.76243758, "learning_rate": 1.8020675906371685e-09, "loss": 0.78774357, "num_input_tokens_seen": 177162390, "step": 8207, "time_per_iteration": 2.781930446624756 }, { "auxiliary_loss_clip": 0.01271906, "auxiliary_loss_mlp": 0.01192969, "balance_loss_clip": 1.00624359, "balance_loss_mlp": 1.00013638, "epoch": 0.9869536463656586, "flos": 25809584747040.0, "grad_norm": 2.0229811623274414, "language_loss": 0.74733001, "learning_rate": 1.7691586600243612e-09, "loss": 0.7719788, "num_input_tokens_seen": 177181290, "step": 8208, "time_per_iteration": 2.8191351890563965 }, { "auxiliary_loss_clip": 0.01302635, "auxiliary_loss_mlp": 0.01193111, "balance_loss_clip": 1.00745595, "balance_loss_mlp": 1.00018263, "epoch": 0.9870738892562977, "flos": 16398509806080.0, "grad_norm": 2.7813122282688174, "language_loss": 0.87141192, "learning_rate": 1.7365528594415202e-09, "loss": 0.89636934, "num_input_tokens_seen": 177195360, "step": 8209, "time_per_iteration": 2.833746910095215 }, { "auxiliary_loss_clip": 0.01330836, "auxiliary_loss_mlp": 0.00872482, "balance_loss_clip": 1.00716615, "balance_loss_mlp": 1.00036955, "epoch": 0.9871941321469369, "flos": 35481373140960.0, "grad_norm": 1.7040921934968953, "language_loss": 0.67308623, "learning_rate": 1.7042501938346888e-09, "loss": 0.69511944, "num_input_tokens_seen": 177218090, "step": 8210, "time_per_iteration": 2.8891236782073975 }, { "auxiliary_loss_clip": 0.01322969, "auxiliary_loss_mlp": 0.01192958, "balance_loss_clip": 1.00748301, "balance_loss_mlp": 1.00012541, "epoch": 0.9873143750375759, "flos": 21434228670240.0, "grad_norm": 2.025852418531597, "language_loss": 0.76488048, "learning_rate": 1.6722506681043913e-09, "loss": 0.79003978, "num_input_tokens_seen": 177237050, "step": 8211, "time_per_iteration": 2.7562365531921387 }, { "auxiliary_loss_clip": 0.01315005, "auxiliary_loss_mlp": 0.01193089, "balance_loss_clip": 1.00755215, "balance_loss_mlp": 1.00016069, "epoch": 0.987434617928215, "flos": 16326509245920.0, "grad_norm": 2.5664925322189514, "language_loss": 0.69279975, "learning_rate": 1.640554287104745e-09, "loss": 0.71788067, "num_input_tokens_seen": 177255325, "step": 8212, "time_per_iteration": 2.792050838470459 }, { "auxiliary_loss_clip": 0.01303187, "auxiliary_loss_mlp": 0.01193208, "balance_loss_clip": 1.00711191, "balance_loss_mlp": 1.00018394, "epoch": 0.9875548608188541, "flos": 17851992389280.0, "grad_norm": 2.7603218118520045, "language_loss": 0.79782569, "learning_rate": 1.609161055644348e-09, "loss": 0.82278967, "num_input_tokens_seen": 177271250, "step": 8213, "time_per_iteration": 2.7709708213806152 }, { "auxiliary_loss_clip": 0.01336465, "auxiliary_loss_mlp": 0.01193228, "balance_loss_clip": 1.00736797, "balance_loss_mlp": 1.00020444, "epoch": 0.9876751037094932, "flos": 26132886756000.0, "grad_norm": 1.8896601802681687, "language_loss": 0.68133038, "learning_rate": 1.5780709784849467e-09, "loss": 0.70662731, "num_input_tokens_seen": 177288270, "step": 8214, "time_per_iteration": 2.7536401748657227 }, { "auxiliary_loss_clip": 0.01245524, "auxiliary_loss_mlp": 0.01193163, "balance_loss_clip": 1.00591159, "balance_loss_mlp": 1.00023437, "epoch": 0.9877953466001322, "flos": 15991352422560.0, "grad_norm": 2.015031595606834, "language_loss": 0.82113862, "learning_rate": 1.5472840603436565e-09, "loss": 0.8455255, "num_input_tokens_seen": 177305500, "step": 8215, "time_per_iteration": 2.8087213039398193 }, { "auxiliary_loss_clip": 0.01297699, "auxiliary_loss_mlp": 0.01193049, "balance_loss_clip": 1.00719666, "balance_loss_mlp": 1.00021577, "epoch": 0.9879155894907714, "flos": 18806787756000.0, "grad_norm": 1.8105491875647606, "language_loss": 0.77873766, "learning_rate": 1.5168003058900757e-09, "loss": 0.80364513, "num_input_tokens_seen": 177323500, "step": 8216, "time_per_iteration": 2.760378122329712 }, { "auxiliary_loss_clip": 0.01289862, "auxiliary_loss_mlp": 0.01192959, "balance_loss_clip": 1.0067668, "balance_loss_mlp": 1.00012612, "epoch": 0.9880358323814105, "flos": 22382054843040.0, "grad_norm": 1.9800830152343019, "language_loss": 0.91996038, "learning_rate": 1.4866197197491715e-09, "loss": 0.94478858, "num_input_tokens_seen": 177342860, "step": 8217, "time_per_iteration": 2.823136329650879 }, { "auxiliary_loss_clip": 0.01326151, "auxiliary_loss_mlp": 0.00872522, "balance_loss_clip": 1.00757861, "balance_loss_mlp": 1.00053883, "epoch": 0.9881560752720495, "flos": 15668840734560.0, "grad_norm": 3.500505227563257, "language_loss": 0.77920771, "learning_rate": 1.4567423064988371e-09, "loss": 0.80119443, "num_input_tokens_seen": 177360210, "step": 8218, "time_per_iteration": 2.7627313137054443 }, { "auxiliary_loss_clip": 0.0134901, "auxiliary_loss_mlp": 0.01193098, "balance_loss_clip": 1.00781155, "balance_loss_mlp": 1.00016952, "epoch": 0.9882763181626887, "flos": 21500122204800.0, "grad_norm": 2.147789614925889, "language_loss": 0.77625489, "learning_rate": 1.4271680706718913e-09, "loss": 0.80167598, "num_input_tokens_seen": 177377885, "step": 8219, "time_per_iteration": 2.7274813652038574 }, { "auxiliary_loss_clip": 0.01327446, "auxiliary_loss_mlp": 0.01193264, "balance_loss_clip": 1.0078342, "balance_loss_mlp": 1.00023985, "epoch": 0.9883965610533277, "flos": 28034610203520.0, "grad_norm": 1.733569528538533, "language_loss": 0.82608008, "learning_rate": 1.3978970167543013e-09, "loss": 0.85128719, "num_input_tokens_seen": 177398065, "step": 8220, "time_per_iteration": 2.765079975128174 }, { "auxiliary_loss_clip": 0.01313722, "auxiliary_loss_mlp": 0.01193072, "balance_loss_clip": 1.00776076, "balance_loss_mlp": 1.000144, "epoch": 0.9885168039439668, "flos": 14098609895040.0, "grad_norm": 1.831858431309808, "language_loss": 0.77523494, "learning_rate": 1.3689291491867372e-09, "loss": 0.8003028, "num_input_tokens_seen": 177416380, "step": 8221, "time_per_iteration": 2.862947940826416 }, { "auxiliary_loss_clip": 0.01347945, "auxiliary_loss_mlp": 0.01193175, "balance_loss_clip": 1.00759614, "balance_loss_mlp": 1.00015092, "epoch": 0.988637046834606, "flos": 26432025337440.0, "grad_norm": 1.7374291855206334, "language_loss": 0.7355904, "learning_rate": 1.3402644723636836e-09, "loss": 0.76100159, "num_input_tokens_seen": 177438410, "step": 8222, "time_per_iteration": 2.7009172439575195 }, { "auxiliary_loss_clip": 0.01297566, "auxiliary_loss_mlp": 0.0119316, "balance_loss_clip": 1.00700903, "balance_loss_mlp": 1.00013685, "epoch": 0.988757289725245, "flos": 25229125729440.0, "grad_norm": 1.8897405672004302, "language_loss": 0.83547711, "learning_rate": 1.311902990633218e-09, "loss": 0.86038446, "num_input_tokens_seen": 177457375, "step": 8223, "time_per_iteration": 2.7918407917022705 }, { "auxiliary_loss_clip": 0.01323106, "auxiliary_loss_mlp": 0.01193077, "balance_loss_clip": 1.00734115, "balance_loss_mlp": 1.00014877, "epoch": 0.9888775326158841, "flos": 26359054837920.0, "grad_norm": 1.6147433027291187, "language_loss": 0.71188271, "learning_rate": 1.2838447082978987e-09, "loss": 0.73704457, "num_input_tokens_seen": 177478530, "step": 8224, "time_per_iteration": 2.8097317218780518 }, { "auxiliary_loss_clip": 0.01335405, "auxiliary_loss_mlp": 0.01192965, "balance_loss_clip": 1.00755095, "balance_loss_mlp": 1.00013185, "epoch": 0.9889977755065231, "flos": 24316132317120.0, "grad_norm": 2.2224023874592005, "language_loss": 0.82850134, "learning_rate": 1.2560896296143208e-09, "loss": 0.85378504, "num_input_tokens_seen": 177496995, "step": 8225, "time_per_iteration": 2.7702383995056152 }, { "auxiliary_loss_clip": 0.01348153, "auxiliary_loss_mlp": 0.01193104, "balance_loss_clip": 1.0081681, "balance_loss_mlp": 1.00017524, "epoch": 0.9891180183971623, "flos": 18951076265760.0, "grad_norm": 3.89710117018191, "language_loss": 0.82325333, "learning_rate": 1.2286377587926722e-09, "loss": 0.84866589, "num_input_tokens_seen": 177513785, "step": 8226, "time_per_iteration": 2.6681435108184814 }, { "auxiliary_loss_clip": 0.01347543, "auxiliary_loss_mlp": 0.0119308, "balance_loss_clip": 1.00757909, "balance_loss_mlp": 1.0001514, "epoch": 0.9892382612878013, "flos": 26176592665440.0, "grad_norm": 1.9570061419394256, "language_loss": 0.75079489, "learning_rate": 1.2014890999973992e-09, "loss": 0.77620113, "num_input_tokens_seen": 177530705, "step": 8227, "time_per_iteration": 2.738816022872925 }, { "auxiliary_loss_clip": 0.01348002, "auxiliary_loss_mlp": 0.01192988, "balance_loss_clip": 1.0075866, "balance_loss_mlp": 1.00015473, "epoch": 0.9893585041784404, "flos": 25449617869920.0, "grad_norm": 2.67088936457448, "language_loss": 0.78850204, "learning_rate": 1.1746436573472073e-09, "loss": 0.81391191, "num_input_tokens_seen": 177552440, "step": 8228, "time_per_iteration": 3.6704719066619873 }, { "auxiliary_loss_clip": 0.01323957, "auxiliary_loss_mlp": 0.01193221, "balance_loss_clip": 1.00800776, "balance_loss_mlp": 1.00019753, "epoch": 0.9894787470690796, "flos": 20189311565760.0, "grad_norm": 1.9854260198678595, "language_loss": 0.69311315, "learning_rate": 1.1481014349141726e-09, "loss": 0.71828496, "num_input_tokens_seen": 177569660, "step": 8229, "time_per_iteration": 2.713901996612549 }, { "auxiliary_loss_clip": 0.01305238, "auxiliary_loss_mlp": 0.01193186, "balance_loss_clip": 1.00653696, "balance_loss_mlp": 1.00016189, "epoch": 0.9895989899597186, "flos": 24644319946560.0, "grad_norm": 1.654327035726437, "language_loss": 0.84329462, "learning_rate": 1.121862436724852e-09, "loss": 0.86827886, "num_input_tokens_seen": 177588500, "step": 8230, "time_per_iteration": 2.8405649662017822 }, { "auxiliary_loss_clip": 0.01337457, "auxiliary_loss_mlp": 0.01193229, "balance_loss_clip": 1.00877213, "balance_loss_mlp": 1.00020504, "epoch": 0.9897192328503577, "flos": 21799045244160.0, "grad_norm": 1.6285209495839494, "language_loss": 0.70644778, "learning_rate": 1.0959266667598388e-09, "loss": 0.73175466, "num_input_tokens_seen": 177607315, "step": 8231, "time_per_iteration": 3.7665693759918213 }, { "auxiliary_loss_clip": 0.01282261, "auxiliary_loss_mlp": 0.01193241, "balance_loss_clip": 1.00650334, "balance_loss_mlp": 1.00021744, "epoch": 0.9898394757409968, "flos": 21325239928800.0, "grad_norm": 1.8410152523326253, "language_loss": 0.74570119, "learning_rate": 1.0702941289533196e-09, "loss": 0.77045614, "num_input_tokens_seen": 177625990, "step": 8232, "time_per_iteration": 3.7474546432495117 }, { "auxiliary_loss_clip": 0.01289307, "auxiliary_loss_mlp": 0.0119301, "balance_loss_clip": 1.00677299, "balance_loss_mlp": 1.00017738, "epoch": 0.9899597186316359, "flos": 18545032516320.0, "grad_norm": 1.836045425560913, "language_loss": 0.8881765, "learning_rate": 1.0449648271939615e-09, "loss": 0.91299969, "num_input_tokens_seen": 177642335, "step": 8233, "time_per_iteration": 2.79705548286438 }, { "auxiliary_loss_clip": 0.01272669, "auxiliary_loss_mlp": 0.00872428, "balance_loss_clip": 1.0070796, "balance_loss_mlp": 1.00038409, "epoch": 0.990079961522275, "flos": 23766734073600.0, "grad_norm": 1.8712202482028213, "language_loss": 0.72775435, "learning_rate": 1.0199387653240243e-09, "loss": 0.74920535, "num_input_tokens_seen": 177662025, "step": 8234, "time_per_iteration": 2.833939790725708 }, { "auxiliary_loss_clip": 0.01311916, "auxiliary_loss_mlp": 0.01193176, "balance_loss_clip": 1.00777173, "balance_loss_mlp": 1.00015259, "epoch": 0.9902002044129141, "flos": 16399192356000.0, "grad_norm": 1.6551918285452134, "language_loss": 0.70518541, "learning_rate": 9.952159471400267e-10, "loss": 0.73023629, "num_input_tokens_seen": 177679065, "step": 8235, "time_per_iteration": 2.688305377960205 }, { "auxiliary_loss_clip": 0.01319622, "auxiliary_loss_mlp": 0.00872372, "balance_loss_clip": 1.00816703, "balance_loss_mlp": 1.0004319, "epoch": 0.9903204473035532, "flos": 22559667318720.0, "grad_norm": 1.878033129113982, "language_loss": 0.84655702, "learning_rate": 9.707963763923022e-10, "loss": 0.86847699, "num_input_tokens_seen": 177698115, "step": 8236, "time_per_iteration": 2.8743820190429688 }, { "auxiliary_loss_clip": 0.01321336, "auxiliary_loss_mlp": 0.0119312, "balance_loss_clip": 1.00747585, "balance_loss_mlp": 1.00019145, "epoch": 0.9904406901941922, "flos": 16144011149760.0, "grad_norm": 1.6883881271361245, "language_loss": 0.78974378, "learning_rate": 9.466800567854427e-10, "loss": 0.81488836, "num_input_tokens_seen": 177716715, "step": 8237, "time_per_iteration": 2.715198040008545 }, { "auxiliary_loss_clip": 0.01295956, "auxiliary_loss_mlp": 0.01193009, "balance_loss_clip": 1.00780749, "balance_loss_mlp": 1.00017643, "epoch": 0.9905609330848314, "flos": 26651511614880.0, "grad_norm": 2.2307611325642664, "language_loss": 0.67901653, "learning_rate": 9.228669919778553e-10, "loss": 0.70390612, "num_input_tokens_seen": 177735640, "step": 8238, "time_per_iteration": 2.8221278190612793 }, { "auxiliary_loss_clip": 0.01307392, "auxiliary_loss_mlp": 0.01192995, "balance_loss_clip": 1.00787044, "balance_loss_mlp": 1.00016201, "epoch": 0.9906811759754705, "flos": 23111831685600.0, "grad_norm": 5.785149774920269, "language_loss": 0.79682559, "learning_rate": 8.993571855817617e-10, "loss": 0.82182944, "num_input_tokens_seen": 177754470, "step": 8239, "time_per_iteration": 2.717641592025757 }, { "auxiliary_loss_clip": 0.0132452, "auxiliary_loss_mlp": 0.01193123, "balance_loss_clip": 1.00712585, "balance_loss_mlp": 1.00019503, "epoch": 0.9908014188661095, "flos": 22090603929120.0, "grad_norm": 1.8864466959430337, "language_loss": 0.74706352, "learning_rate": 8.761506411638642e-10, "loss": 0.77223992, "num_input_tokens_seen": 177773935, "step": 8240, "time_per_iteration": 2.7548673152923584 }, { "auxiliary_loss_clip": 0.01306507, "auxiliary_loss_mlp": 0.01193221, "balance_loss_clip": 1.00742757, "balance_loss_mlp": 1.00019741, "epoch": 0.9909216617567487, "flos": 19242958263840.0, "grad_norm": 1.6557027513716818, "language_loss": 0.73566288, "learning_rate": 8.53247362244236e-10, "loss": 0.76066017, "num_input_tokens_seen": 177792745, "step": 8241, "time_per_iteration": 2.783332109451294 }, { "auxiliary_loss_clip": 0.01306656, "auxiliary_loss_mlp": 0.01193084, "balance_loss_clip": 1.00709093, "balance_loss_mlp": 1.00015545, "epoch": 0.9910419046473877, "flos": 23621224158720.0, "grad_norm": 1.7835906801241141, "language_loss": 0.68309313, "learning_rate": 8.306473522976532e-10, "loss": 0.70809054, "num_input_tokens_seen": 177812150, "step": 8242, "time_per_iteration": 2.8996422290802 }, { "auxiliary_loss_clip": 0.013482, "auxiliary_loss_mlp": 0.01193083, "balance_loss_clip": 1.00768685, "balance_loss_mlp": 1.00015497, "epoch": 0.9911621475380268, "flos": 22711392030240.0, "grad_norm": 1.702942558141924, "language_loss": 0.71395552, "learning_rate": 8.083506147522623e-10, "loss": 0.73936832, "num_input_tokens_seen": 177831545, "step": 8243, "time_per_iteration": 2.687817096710205 }, { "auxiliary_loss_clip": 0.01335563, "auxiliary_loss_mlp": 0.0119308, "balance_loss_clip": 1.00779724, "balance_loss_mlp": 1.00015223, "epoch": 0.991282390428666, "flos": 13516965396000.0, "grad_norm": 1.8602235484798053, "language_loss": 0.84973419, "learning_rate": 7.863571529906909e-10, "loss": 0.87502062, "num_input_tokens_seen": 177847130, "step": 8244, "time_per_iteration": 2.718543767929077 }, { "auxiliary_loss_clip": 0.01303596, "auxiliary_loss_mlp": 0.01192266, "balance_loss_clip": 1.00347519, "balance_loss_mlp": 1.00000572, "epoch": 0.991402633319305, "flos": 61830526252320.0, "grad_norm": 0.7278581840399951, "language_loss": 0.59677309, "learning_rate": 7.646669703489372e-10, "loss": 0.6217317, "num_input_tokens_seen": 177911440, "step": 8245, "time_per_iteration": 3.3647568225860596 }, { "auxiliary_loss_clip": 0.01215702, "auxiliary_loss_mlp": 0.01193168, "balance_loss_clip": 1.00629258, "balance_loss_mlp": 1.00014436, "epoch": 0.9915228762099441, "flos": 18770158811520.0, "grad_norm": 2.0275129953371174, "language_loss": 0.57270992, "learning_rate": 7.432800701177023e-10, "loss": 0.5967986, "num_input_tokens_seen": 177929440, "step": 8246, "time_per_iteration": 3.139094352722168 }, { "auxiliary_loss_clip": 0.01285063, "auxiliary_loss_mlp": 0.01192277, "balance_loss_clip": 1.00329137, "balance_loss_mlp": 1.00001621, "epoch": 0.9916431191005832, "flos": 65936690887680.0, "grad_norm": 0.7860971836114783, "language_loss": 0.57889056, "learning_rate": 7.221964555415017e-10, "loss": 0.60366398, "num_input_tokens_seen": 177989100, "step": 8247, "time_per_iteration": 4.602272272109985 }, { "auxiliary_loss_clip": 0.01305402, "auxiliary_loss_mlp": 0.01192958, "balance_loss_clip": 1.00678015, "balance_loss_mlp": 1.00012469, "epoch": 0.9917633619912223, "flos": 16581582681120.0, "grad_norm": 1.6587602141140152, "language_loss": 0.74889886, "learning_rate": 7.01416129818222e-10, "loss": 0.77388245, "num_input_tokens_seen": 178006720, "step": 8248, "time_per_iteration": 2.780911922454834 }, { "auxiliary_loss_clip": 0.01298576, "auxiliary_loss_mlp": 0.01193237, "balance_loss_clip": 1.00665689, "balance_loss_mlp": 1.00021338, "epoch": 0.9918836048818613, "flos": 25411120894080.0, "grad_norm": 2.393440472875627, "language_loss": 0.58529115, "learning_rate": 6.809390961006745e-10, "loss": 0.61020923, "num_input_tokens_seen": 178026850, "step": 8249, "time_per_iteration": 2.8088831901550293 }, { "auxiliary_loss_clip": 0.0130844, "auxiliary_loss_mlp": 0.01193185, "balance_loss_clip": 1.00718093, "balance_loss_mlp": 1.00016117, "epoch": 0.9920038477725005, "flos": 25046879099040.0, "grad_norm": 1.650352478398733, "language_loss": 0.68559921, "learning_rate": 6.607653574948191e-10, "loss": 0.71061552, "num_input_tokens_seen": 178047630, "step": 8250, "time_per_iteration": 2.7855372428894043 }, { "auxiliary_loss_clip": 0.01334655, "auxiliary_loss_mlp": 0.01192876, "balance_loss_clip": 1.00754762, "balance_loss_mlp": 1.00013876, "epoch": 0.9921240906631396, "flos": 21829782705120.0, "grad_norm": 1.8589144897245553, "language_loss": 0.82149237, "learning_rate": 6.408949170613187e-10, "loss": 0.84676766, "num_input_tokens_seen": 178066895, "step": 8251, "time_per_iteration": 2.7673702239990234 }, { "auxiliary_loss_clip": 0.01318993, "auxiliary_loss_mlp": 0.0119317, "balance_loss_clip": 1.00774515, "balance_loss_mlp": 1.00014591, "epoch": 0.9922443335537786, "flos": 24864237308160.0, "grad_norm": 1.5151025286077289, "language_loss": 0.81787872, "learning_rate": 6.213277778144288e-10, "loss": 0.84300035, "num_input_tokens_seen": 178088540, "step": 8252, "time_per_iteration": 2.807115077972412 }, { "auxiliary_loss_clip": 0.01252533, "auxiliary_loss_mlp": 0.01193059, "balance_loss_clip": 1.00683582, "balance_loss_mlp": 1.00013101, "epoch": 0.9923645764444178, "flos": 21613098474720.0, "grad_norm": 1.9776653386609422, "language_loss": 0.66988707, "learning_rate": 6.020639427224416e-10, "loss": 0.69434297, "num_input_tokens_seen": 178106185, "step": 8253, "time_per_iteration": 3.987434148788452 }, { "auxiliary_loss_clip": 0.01306112, "auxiliary_loss_mlp": 0.01193184, "balance_loss_clip": 1.00689101, "balance_loss_mlp": 1.00015998, "epoch": 0.9924848193350568, "flos": 25001808089760.0, "grad_norm": 2.014467436725267, "language_loss": 0.72886932, "learning_rate": 5.831034147076864e-10, "loss": 0.75386226, "num_input_tokens_seen": 178123435, "step": 8254, "time_per_iteration": 2.8050639629364014 }, { "auxiliary_loss_clip": 0.01293523, "auxiliary_loss_mlp": 0.01192271, "balance_loss_clip": 1.00304592, "balance_loss_mlp": 1.00001013, "epoch": 0.9926050622256959, "flos": 68912580386880.0, "grad_norm": 0.6840861651981207, "language_loss": 0.55733424, "learning_rate": 5.644461966463065e-10, "loss": 0.58219218, "num_input_tokens_seen": 178191045, "step": 8255, "time_per_iteration": 3.3547122478485107 }, { "auxiliary_loss_clip": 0.01305406, "auxiliary_loss_mlp": 0.01192894, "balance_loss_clip": 1.00675416, "balance_loss_mlp": 1.00015652, "epoch": 0.9927253051163349, "flos": 20923686639360.0, "grad_norm": 1.6538200454107221, "language_loss": 0.75585544, "learning_rate": 5.460922913687049e-10, "loss": 0.78083849, "num_input_tokens_seen": 178210135, "step": 8256, "time_per_iteration": 2.8187873363494873 }, { "auxiliary_loss_clip": 0.01287261, "auxiliary_loss_mlp": 0.00872651, "balance_loss_clip": 1.00718832, "balance_loss_mlp": 1.00047851, "epoch": 0.9928455480069741, "flos": 22308222175200.0, "grad_norm": 5.280439319824326, "language_loss": 0.75477433, "learning_rate": 5.280417016593208e-10, "loss": 0.77637351, "num_input_tokens_seen": 178229925, "step": 8257, "time_per_iteration": 4.763352394104004 }, { "auxiliary_loss_clip": 0.01322993, "auxiliary_loss_mlp": 0.00872403, "balance_loss_clip": 1.0080564, "balance_loss_mlp": 1.00031614, "epoch": 0.9929657908976132, "flos": 17383898939040.0, "grad_norm": 1.9190005331861222, "language_loss": 0.74731845, "learning_rate": 5.102944302559642e-10, "loss": 0.76927239, "num_input_tokens_seen": 178247420, "step": 8258, "time_per_iteration": 3.580207109451294 }, { "auxiliary_loss_clip": 0.01259294, "auxiliary_loss_mlp": 0.0119302, "balance_loss_clip": 1.00713611, "balance_loss_mlp": 1.00018728, "epoch": 0.9930860337882522, "flos": 22674691238400.0, "grad_norm": 1.9158074969337133, "language_loss": 0.79564047, "learning_rate": 4.9285047985137e-10, "loss": 0.82016361, "num_input_tokens_seen": 178266840, "step": 8259, "time_per_iteration": 2.9460043907165527 }, { "auxiliary_loss_clip": 0.01336584, "auxiliary_loss_mlp": 0.01193158, "balance_loss_clip": 1.00839245, "balance_loss_mlp": 1.00013435, "epoch": 0.9932062766788914, "flos": 28147802015520.0, "grad_norm": 2.1125453324042742, "language_loss": 0.74560642, "learning_rate": 4.757098530916436e-10, "loss": 0.77090383, "num_input_tokens_seen": 178287285, "step": 8260, "time_per_iteration": 2.7719521522521973 }, { "auxiliary_loss_clip": 0.01324154, "auxiliary_loss_mlp": 0.01193181, "balance_loss_clip": 1.00722146, "balance_loss_mlp": 1.00015759, "epoch": 0.9933265195695304, "flos": 20156670149760.0, "grad_norm": 3.176769461574425, "language_loss": 0.77449584, "learning_rate": 4.5887255257670563e-10, "loss": 0.79966915, "num_input_tokens_seen": 178304325, "step": 8261, "time_per_iteration": 2.738800048828125 }, { "auxiliary_loss_clip": 0.01347526, "auxiliary_loss_mlp": 0.01193094, "balance_loss_clip": 1.00747538, "balance_loss_mlp": 1.00016594, "epoch": 0.9934467624601695, "flos": 21362048491680.0, "grad_norm": 2.166445331218028, "language_loss": 0.77184761, "learning_rate": 4.4233858086117906e-10, "loss": 0.79725373, "num_input_tokens_seen": 178322850, "step": 8262, "time_per_iteration": 2.764404058456421 }, { "auxiliary_loss_clip": 0.01255471, "auxiliary_loss_mlp": 0.01193124, "balance_loss_clip": 1.00612783, "balance_loss_mlp": 1.00019574, "epoch": 0.9935670053508087, "flos": 19756050876000.0, "grad_norm": 1.9433842366867327, "language_loss": 0.67368728, "learning_rate": 4.261079404528356e-10, "loss": 0.69817317, "num_input_tokens_seen": 178342330, "step": 8263, "time_per_iteration": 2.8064370155334473 }, { "auxiliary_loss_clip": 0.01337358, "auxiliary_loss_mlp": 0.01193223, "balance_loss_clip": 1.00834274, "balance_loss_mlp": 1.00019979, "epoch": 0.9936872482414477, "flos": 21978849064320.0, "grad_norm": 2.7226199358764673, "language_loss": 0.69071817, "learning_rate": 4.1018063381437205e-10, "loss": 0.71602404, "num_input_tokens_seen": 178362715, "step": 8264, "time_per_iteration": 2.7627806663513184 }, { "auxiliary_loss_clip": 0.01288955, "auxiliary_loss_mlp": 0.01192266, "balance_loss_clip": 1.00386691, "balance_loss_mlp": 1.00000513, "epoch": 0.9938074911320868, "flos": 69810701395680.0, "grad_norm": 0.8621475722424462, "language_loss": 0.61186683, "learning_rate": 3.9455666336141167e-10, "loss": 0.63667905, "num_input_tokens_seen": 178426495, "step": 8265, "time_per_iteration": 3.2860615253448486 }, { "auxiliary_loss_clip": 0.0134662, "auxiliary_loss_mlp": 0.01193069, "balance_loss_clip": 1.00814009, "balance_loss_mlp": 1.00014091, "epoch": 0.9939277340227259, "flos": 15084178646400.0, "grad_norm": 4.688713192537586, "language_loss": 0.83039033, "learning_rate": 3.7923603146450267e-10, "loss": 0.85578722, "num_input_tokens_seen": 178442555, "step": 8266, "time_per_iteration": 2.7070202827453613 }, { "auxiliary_loss_clip": 0.012928, "auxiliary_loss_mlp": 0.01193168, "balance_loss_clip": 1.00632429, "balance_loss_mlp": 1.0001446, "epoch": 0.994047976913365, "flos": 17712373957920.0, "grad_norm": 2.126027429667034, "language_loss": 0.81024086, "learning_rate": 3.642187404473418e-10, "loss": 0.83510053, "num_input_tokens_seen": 178460715, "step": 8267, "time_per_iteration": 2.749077796936035 }, { "auxiliary_loss_clip": 0.01330468, "auxiliary_loss_mlp": 0.01193219, "balance_loss_clip": 1.00723016, "balance_loss_mlp": 1.00019538, "epoch": 0.994168219804004, "flos": 19171568406240.0, "grad_norm": 1.9284117424030571, "language_loss": 0.85920346, "learning_rate": 3.495047925885508e-10, "loss": 0.8844403, "num_input_tokens_seen": 178479050, "step": 8268, "time_per_iteration": 2.729548692703247 }, { "auxiliary_loss_clip": 0.01324184, "auxiliary_loss_mlp": 0.0119322, "balance_loss_clip": 1.00841832, "balance_loss_mlp": 1.00019646, "epoch": 0.9942884626946432, "flos": 17851597228800.0, "grad_norm": 2.117206677678143, "language_loss": 0.82864261, "learning_rate": 3.350941901199e-10, "loss": 0.85381657, "num_input_tokens_seen": 178495970, "step": 8269, "time_per_iteration": 2.7256569862365723 }, { "auxiliary_loss_clip": 0.01318428, "auxiliary_loss_mlp": 0.01193129, "balance_loss_clip": 1.00770521, "balance_loss_mlp": 1.00020027, "epoch": 0.9944087055852823, "flos": 18796585430880.0, "grad_norm": 2.4557496315661322, "language_loss": 0.83388799, "learning_rate": 3.2098693522764066e-10, "loss": 0.85900354, "num_input_tokens_seen": 178509170, "step": 8270, "time_per_iteration": 2.758970022201538 }, { "auxiliary_loss_clip": 0.01316958, "auxiliary_loss_mlp": 0.00872525, "balance_loss_clip": 1.00745308, "balance_loss_mlp": 1.00044417, "epoch": 0.9945289484759213, "flos": 20996980452000.0, "grad_norm": 1.8155936330163114, "language_loss": 0.80890226, "learning_rate": 3.071830300516165e-10, "loss": 0.83079708, "num_input_tokens_seen": 178527000, "step": 8271, "time_per_iteration": 2.796658754348755 }, { "auxiliary_loss_clip": 0.01331234, "auxiliary_loss_mlp": 0.01193167, "balance_loss_clip": 1.0071553, "balance_loss_mlp": 1.00014305, "epoch": 0.9946491913665605, "flos": 14756960956320.0, "grad_norm": 1.9475284889320705, "language_loss": 0.70838851, "learning_rate": 2.9368247668615234e-10, "loss": 0.7336325, "num_input_tokens_seen": 178545590, "step": 8272, "time_per_iteration": 2.672780990600586 }, { "auxiliary_loss_clip": 0.01349662, "auxiliary_loss_mlp": 0.011931, "balance_loss_clip": 1.00867391, "balance_loss_mlp": 1.00017202, "epoch": 0.9947694342571995, "flos": 12669937365600.0, "grad_norm": 2.2084720876425874, "language_loss": 0.61026192, "learning_rate": 2.804852771789434e-10, "loss": 0.6356895, "num_input_tokens_seen": 178558890, "step": 8273, "time_per_iteration": 2.7390193939208984 }, { "auxiliary_loss_clip": 0.01347659, "auxiliary_loss_mlp": 0.01193126, "balance_loss_clip": 1.00740921, "balance_loss_mlp": 1.00019813, "epoch": 0.9948896771478386, "flos": 18843452624160.0, "grad_norm": 1.8584028029461017, "language_loss": 0.55893874, "learning_rate": 2.675914335321661e-10, "loss": 0.58434653, "num_input_tokens_seen": 178577645, "step": 8274, "time_per_iteration": 2.7407493591308594 }, { "auxiliary_loss_clip": 0.01328716, "auxiliary_loss_mlp": 0.01193163, "balance_loss_clip": 1.00741982, "balance_loss_mlp": 1.00013947, "epoch": 0.9950099200384778, "flos": 24900219626400.0, "grad_norm": 2.585676849468516, "language_loss": 0.79387486, "learning_rate": 2.550009477018111e-10, "loss": 0.8190937, "num_input_tokens_seen": 178596415, "step": 8275, "time_per_iteration": 2.768768787384033 }, { "auxiliary_loss_clip": 0.01301479, "auxiliary_loss_mlp": 0.00872524, "balance_loss_clip": 1.00692892, "balance_loss_mlp": 1.00038314, "epoch": 0.9951301629291168, "flos": 23733625649760.0, "grad_norm": 1.8607283132652552, "language_loss": 0.62617385, "learning_rate": 2.4271382159790634e-10, "loss": 0.64791393, "num_input_tokens_seen": 178613845, "step": 8276, "time_per_iteration": 2.7737317085266113 }, { "auxiliary_loss_clip": 0.01254733, "auxiliary_loss_mlp": 0.0119319, "balance_loss_clip": 1.00750005, "balance_loss_mlp": 1.00016594, "epoch": 0.9952504058197559, "flos": 22236904164960.0, "grad_norm": 10.255940094856152, "language_loss": 0.8590064, "learning_rate": 2.3073005708429406e-10, "loss": 0.88348556, "num_input_tokens_seen": 178633490, "step": 8277, "time_per_iteration": 2.8525147438049316 }, { "auxiliary_loss_clip": 0.01281226, "auxiliary_loss_mlp": 0.0119299, "balance_loss_clip": 1.00665498, "balance_loss_mlp": 1.00015736, "epoch": 0.995370648710395, "flos": 21211042253760.0, "grad_norm": 1.8528692344477184, "language_loss": 0.72304529, "learning_rate": 2.190496559788535e-10, "loss": 0.74778748, "num_input_tokens_seen": 178651775, "step": 8278, "time_per_iteration": 2.767650604248047 }, { "auxiliary_loss_clip": 0.01302653, "auxiliary_loss_mlp": 0.01193241, "balance_loss_clip": 1.00682878, "balance_loss_mlp": 1.00031245, "epoch": 0.9954908916010341, "flos": 14866740018720.0, "grad_norm": 2.444574280574553, "language_loss": 0.76406968, "learning_rate": 2.0767262005372265e-10, "loss": 0.78902864, "num_input_tokens_seen": 178669290, "step": 8279, "time_per_iteration": 3.675097703933716 }, { "auxiliary_loss_clip": 0.01291441, "auxiliary_loss_mlp": 0.01193051, "balance_loss_clip": 1.00726652, "balance_loss_mlp": 1.00021803, "epoch": 0.9956111344916732, "flos": 19208269198080.0, "grad_norm": 1.8851916493275533, "language_loss": 0.75028551, "learning_rate": 1.965989510346322e-10, "loss": 0.77513045, "num_input_tokens_seen": 178688410, "step": 8280, "time_per_iteration": 2.794388771057129 }, { "auxiliary_loss_clip": 0.01269204, "auxiliary_loss_mlp": 0.01193077, "balance_loss_clip": 1.00682628, "balance_loss_mlp": 1.00014853, "epoch": 0.9957313773823123, "flos": 20047070705760.0, "grad_norm": 1.9871996041408675, "language_loss": 0.70951253, "learning_rate": 1.8582865060134955e-10, "loss": 0.73413539, "num_input_tokens_seen": 178706600, "step": 8281, "time_per_iteration": 2.8463618755340576 }, { "auxiliary_loss_clip": 0.0131551, "auxiliary_loss_mlp": 0.0119226, "balance_loss_clip": 1.00343966, "balance_loss_mlp": 0.99999964, "epoch": 0.9958516202729514, "flos": 57483285207840.0, "grad_norm": 0.7836674556864769, "language_loss": 0.55765986, "learning_rate": 1.7536172038790098e-10, "loss": 0.58273757, "num_input_tokens_seen": 178766910, "step": 8282, "time_per_iteration": 3.394526720046997 }, { "auxiliary_loss_clip": 0.01306359, "auxiliary_loss_mlp": 0.01193188, "balance_loss_clip": 1.00719595, "balance_loss_mlp": 1.00016451, "epoch": 0.9959718631635904, "flos": 27782913594240.0, "grad_norm": 2.005873566292674, "language_loss": 0.69484377, "learning_rate": 1.651981619819054e-10, "loss": 0.71983927, "num_input_tokens_seen": 178784060, "step": 8283, "time_per_iteration": 3.8457911014556885 }, { "auxiliary_loss_clip": 0.0126893, "auxiliary_loss_mlp": 0.01193084, "balance_loss_clip": 1.00612473, "balance_loss_mlp": 1.00015569, "epoch": 0.9960921060542296, "flos": 24024106624320.0, "grad_norm": 2.431704576387608, "language_loss": 0.70781946, "learning_rate": 1.5533797692546257e-10, "loss": 0.73243964, "num_input_tokens_seen": 178802795, "step": 8284, "time_per_iteration": 3.8957836627960205 }, { "auxiliary_loss_clip": 0.01336571, "auxiliary_loss_mlp": 0.01193086, "balance_loss_clip": 1.00790489, "balance_loss_mlp": 1.00015795, "epoch": 0.9962123489448687, "flos": 18697403854080.0, "grad_norm": 1.9042730693295349, "language_loss": 0.84113365, "learning_rate": 1.4578116671404296e-10, "loss": 0.86643022, "num_input_tokens_seen": 178821075, "step": 8285, "time_per_iteration": 2.681568145751953 }, { "auxiliary_loss_clip": 0.01321325, "auxiliary_loss_mlp": 0.01193102, "balance_loss_clip": 1.00764906, "balance_loss_mlp": 1.00017369, "epoch": 0.9963325918355077, "flos": 20010765074400.0, "grad_norm": 2.6203340511260658, "language_loss": 0.71232015, "learning_rate": 1.3652773279759777e-10, "loss": 0.73746443, "num_input_tokens_seen": 178837725, "step": 8286, "time_per_iteration": 2.7221741676330566 }, { "auxiliary_loss_clip": 0.01336361, "auxiliary_loss_mlp": 0.01193168, "balance_loss_clip": 1.00809836, "balance_loss_mlp": 1.00014389, "epoch": 0.9964528347261468, "flos": 33108502730400.0, "grad_norm": 1.5179528478839792, "language_loss": 0.6298936, "learning_rate": 1.2757767657989305e-10, "loss": 0.65518886, "num_input_tokens_seen": 178861515, "step": 8287, "time_per_iteration": 2.8151803016662598 }, { "auxiliary_loss_clip": 0.01325924, "auxiliary_loss_mlp": 0.01193085, "balance_loss_clip": 1.00781631, "balance_loss_mlp": 1.00015652, "epoch": 0.9965730776167859, "flos": 23109352951680.0, "grad_norm": 1.8014769995369588, "language_loss": 0.8722955, "learning_rate": 1.1893099941850948e-10, "loss": 0.89748561, "num_input_tokens_seen": 178880410, "step": 8288, "time_per_iteration": 2.7512381076812744 }, { "auxiliary_loss_clip": 0.01315601, "auxiliary_loss_mlp": 0.01193099, "balance_loss_clip": 1.00710571, "balance_loss_mlp": 1.00017118, "epoch": 0.996693320507425, "flos": 22965854762880.0, "grad_norm": 2.9068696457161374, "language_loss": 0.77264023, "learning_rate": 1.105877026252866e-10, "loss": 0.79772723, "num_input_tokens_seen": 178898740, "step": 8289, "time_per_iteration": 2.7402114868164062 }, { "auxiliary_loss_clip": 0.01348352, "auxiliary_loss_mlp": 0.01193234, "balance_loss_clip": 1.00791109, "balance_loss_mlp": 1.00021064, "epoch": 0.996813563398064, "flos": 13222748358720.0, "grad_norm": 1.8643711529929146, "language_loss": 0.72418928, "learning_rate": 1.0254778746565663e-10, "loss": 0.74960518, "num_input_tokens_seen": 178914015, "step": 8290, "time_per_iteration": 2.6377317905426025 }, { "auxiliary_loss_clip": 0.01285379, "auxiliary_loss_mlp": 0.01192718, "balance_loss_clip": 1.00667536, "balance_loss_mlp": 1.00017083, "epoch": 0.9969338062887032, "flos": 14647864443840.0, "grad_norm": 1.8411670341432476, "language_loss": 0.73274553, "learning_rate": 9.481125515953259e-11, "loss": 0.7575264, "num_input_tokens_seen": 178932075, "step": 8291, "time_per_iteration": 2.7475438117980957 }, { "auxiliary_loss_clip": 0.01290292, "auxiliary_loss_mlp": 0.01193074, "balance_loss_clip": 1.0072006, "balance_loss_mlp": 1.00014532, "epoch": 0.9970540491793423, "flos": 25735752079200.0, "grad_norm": 1.7133489355682472, "language_loss": 0.79627311, "learning_rate": 8.737810688064228e-11, "loss": 0.82110679, "num_input_tokens_seen": 178951910, "step": 8292, "time_per_iteration": 2.8633170127868652 }, { "auxiliary_loss_clip": 0.0129125, "auxiliary_loss_mlp": 0.01193185, "balance_loss_clip": 1.0074718, "balance_loss_mlp": 1.00016129, "epoch": 0.9971742920699813, "flos": 21470246912160.0, "grad_norm": 1.856294647985463, "language_loss": 0.78991699, "learning_rate": 8.024834375608414e-11, "loss": 0.81476128, "num_input_tokens_seen": 178970500, "step": 8293, "time_per_iteration": 2.8159220218658447 }, { "auxiliary_loss_clip": 0.01315498, "auxiliary_loss_mlp": 0.0119227, "balance_loss_clip": 1.00345552, "balance_loss_mlp": 1.00000882, "epoch": 0.9972945349606205, "flos": 72211255754400.0, "grad_norm": 0.8190510445108706, "language_loss": 0.62903559, "learning_rate": 7.342196686788149e-11, "loss": 0.65411329, "num_input_tokens_seen": 179023665, "step": 8294, "time_per_iteration": 3.194286346435547 }, { "auxiliary_loss_clip": 0.0129978, "auxiliary_loss_mlp": 0.01193218, "balance_loss_clip": 1.00820982, "balance_loss_mlp": 1.00019395, "epoch": 0.9974147778512595, "flos": 19678302527040.0, "grad_norm": 2.097458853069931, "language_loss": 0.68527836, "learning_rate": 6.689897725142834e-11, "loss": 0.7102083, "num_input_tokens_seen": 179043140, "step": 8295, "time_per_iteration": 2.8862862586975098 }, { "auxiliary_loss_clip": 0.01312038, "auxiliary_loss_mlp": 0.01193183, "balance_loss_clip": 1.00705481, "balance_loss_mlp": 1.00015926, "epoch": 0.9975350207418986, "flos": 15960830503680.0, "grad_norm": 2.29158512005285, "language_loss": 0.88538301, "learning_rate": 6.067937589615545e-11, "loss": 0.91043526, "num_input_tokens_seen": 179061215, "step": 8296, "time_per_iteration": 2.7451016902923584 }, { "auxiliary_loss_clip": 0.01290765, "auxiliary_loss_mlp": 0.01192279, "balance_loss_clip": 1.00351703, "balance_loss_mlp": 1.00001776, "epoch": 0.9976552636325378, "flos": 59961911228640.0, "grad_norm": 0.7527879357312254, "language_loss": 0.57692218, "learning_rate": 5.476316374575241e-11, "loss": 0.60175258, "num_input_tokens_seen": 179124700, "step": 8297, "time_per_iteration": 3.27937912940979 }, { "auxiliary_loss_clip": 0.01348445, "auxiliary_loss_mlp": 0.01193108, "balance_loss_clip": 1.00804377, "balance_loss_mlp": 1.00017977, "epoch": 0.9977755065231768, "flos": 22487882300640.0, "grad_norm": 1.879237231831825, "language_loss": 0.72419155, "learning_rate": 4.9150341697723476e-11, "loss": 0.74960709, "num_input_tokens_seen": 179144590, "step": 8298, "time_per_iteration": 2.710256814956665 }, { "auxiliary_loss_clip": 0.01319319, "auxiliary_loss_mlp": 0.01193119, "balance_loss_clip": 1.00790191, "balance_loss_mlp": 1.00019073, "epoch": 0.9978957494138159, "flos": 26030292429600.0, "grad_norm": 1.7234455238238064, "language_loss": 0.66406953, "learning_rate": 4.384091060338768e-11, "loss": 0.68919396, "num_input_tokens_seen": 179165060, "step": 8299, "time_per_iteration": 2.7978363037109375 }, { "auxiliary_loss_clip": 0.01325893, "auxiliary_loss_mlp": 0.01193053, "balance_loss_clip": 1.0069828, "balance_loss_mlp": 1.00012529, "epoch": 0.998015992304455, "flos": 22637846751840.0, "grad_norm": 2.1685688210145164, "language_loss": 0.7353214, "learning_rate": 3.883487126810081e-11, "loss": 0.76051086, "num_input_tokens_seen": 179184320, "step": 8300, "time_per_iteration": 2.7732865810394287 }, { "auxiliary_loss_clip": 0.01335242, "auxiliary_loss_mlp": 0.01193094, "balance_loss_clip": 1.00753188, "balance_loss_mlp": 1.00016582, "epoch": 0.9981362351950941, "flos": 18223454844000.0, "grad_norm": 1.6441604921559134, "language_loss": 0.79063964, "learning_rate": 3.41322244516995e-11, "loss": 0.81592304, "num_input_tokens_seen": 179202265, "step": 8301, "time_per_iteration": 2.7127208709716797 }, { "auxiliary_loss_clip": 0.01263608, "auxiliary_loss_mlp": 0.01193098, "balance_loss_clip": 1.00687051, "balance_loss_mlp": 1.00017023, "epoch": 0.9982564780857331, "flos": 33474109625280.0, "grad_norm": 1.5020187843937758, "language_loss": 0.63292181, "learning_rate": 2.9732970866946925e-11, "loss": 0.65748888, "num_input_tokens_seen": 179222145, "step": 8302, "time_per_iteration": 2.9558424949645996 }, { "auxiliary_loss_clip": 0.01301142, "auxiliary_loss_mlp": 0.01193219, "balance_loss_clip": 1.00767827, "balance_loss_mlp": 1.0001955, "epoch": 0.9983767209763723, "flos": 15523474514400.0, "grad_norm": 4.260649671680703, "language_loss": 0.78019696, "learning_rate": 2.563711118175327e-11, "loss": 0.80514061, "num_input_tokens_seen": 179239030, "step": 8303, "time_per_iteration": 2.7751479148864746 }, { "auxiliary_loss_clip": 0.01291889, "auxiliary_loss_mlp": 0.01192946, "balance_loss_clip": 1.00724947, "balance_loss_mlp": 1.00020885, "epoch": 0.9984969638670114, "flos": 19974387595680.0, "grad_norm": 2.084719750120342, "language_loss": 0.83528292, "learning_rate": 2.184464601717728e-11, "loss": 0.86013132, "num_input_tokens_seen": 179257345, "step": 8304, "time_per_iteration": 2.777301549911499 }, { "auxiliary_loss_clip": 0.01327867, "auxiliary_loss_mlp": 0.01193218, "balance_loss_clip": 1.00793147, "balance_loss_mlp": 1.00019455, "epoch": 0.9986172067576504, "flos": 20375761266720.0, "grad_norm": 2.3105724667657968, "language_loss": 0.78239411, "learning_rate": 1.8355575948758585e-11, "loss": 0.80760497, "num_input_tokens_seen": 179275330, "step": 8305, "time_per_iteration": 2.7836179733276367 }, { "auxiliary_loss_clip": 0.01323849, "auxiliary_loss_mlp": 0.01193186, "balance_loss_clip": 1.00770867, "balance_loss_mlp": 1.00016284, "epoch": 0.9987374496482896, "flos": 23727913784640.0, "grad_norm": 2.0301994747617575, "language_loss": 0.73557711, "learning_rate": 1.5169901505407424e-11, "loss": 0.76074755, "num_input_tokens_seen": 179292395, "step": 8306, "time_per_iteration": 3.8325719833374023 }, { "auxiliary_loss_clip": 0.01304261, "auxiliary_loss_mlp": 0.01192985, "balance_loss_clip": 1.00705218, "balance_loss_mlp": 1.00015187, "epoch": 0.9988576925389286, "flos": 25044041128320.0, "grad_norm": 1.6672733880588515, "language_loss": 0.74108016, "learning_rate": 1.228762317073695e-11, "loss": 0.7660526, "num_input_tokens_seen": 179311225, "step": 8307, "time_per_iteration": 2.8813955783843994 }, { "auxiliary_loss_clip": 0.01311101, "auxiliary_loss_mlp": 0.01193109, "balance_loss_clip": 1.00752079, "balance_loss_mlp": 1.00018048, "epoch": 0.9989779354295677, "flos": 31285641265920.0, "grad_norm": 1.9472056088877099, "language_loss": 0.78733516, "learning_rate": 9.70874138195299e-12, "loss": 0.81237733, "num_input_tokens_seen": 179333135, "step": 8308, "time_per_iteration": 2.882819414138794 }, { "auxiliary_loss_clip": 0.01348837, "auxiliary_loss_mlp": 0.01193099, "balance_loss_clip": 1.00797772, "balance_loss_mlp": 1.00017023, "epoch": 0.9990981783202069, "flos": 19573409085120.0, "grad_norm": 1.6679670758037959, "language_loss": 0.74647498, "learning_rate": 7.433256530076093e-12, "loss": 0.77189434, "num_input_tokens_seen": 179353090, "step": 8309, "time_per_iteration": 3.592623233795166 }, { "auxiliary_loss_clip": 0.01283868, "auxiliary_loss_mlp": 0.01192859, "balance_loss_clip": 1.00645232, "balance_loss_mlp": 1.00012147, "epoch": 0.9992184212108459, "flos": 17199676506240.0, "grad_norm": 2.3559197648627492, "language_loss": 0.75941288, "learning_rate": 5.46116896038562e-12, "loss": 0.78418016, "num_input_tokens_seen": 179367500, "step": 8310, "time_per_iteration": 4.650662899017334 }, { "auxiliary_loss_clip": 0.01312397, "auxiliary_loss_mlp": 0.01193072, "balance_loss_clip": 1.00842261, "balance_loss_mlp": 1.00014377, "epoch": 0.999338664101485, "flos": 46497870918720.0, "grad_norm": 1.9454705830710974, "language_loss": 0.61695254, "learning_rate": 3.792478972197699e-12, "loss": 0.64200723, "num_input_tokens_seen": 179388085, "step": 8311, "time_per_iteration": 2.990351915359497 }, { "auxiliary_loss_clip": 0.01348032, "auxiliary_loss_mlp": 0.01193075, "balance_loss_clip": 1.00768876, "balance_loss_mlp": 1.00014675, "epoch": 0.9994589069921241, "flos": 15158262780000.0, "grad_norm": 2.3708217582261706, "language_loss": 0.70051026, "learning_rate": 2.4271868181990895e-12, "loss": 0.72592127, "num_input_tokens_seen": 179405250, "step": 8312, "time_per_iteration": 2.692991018295288 }, { "auxiliary_loss_clip": 0.01330087, "auxiliary_loss_mlp": 0.01193082, "balance_loss_clip": 1.00723839, "balance_loss_mlp": 1.00015378, "epoch": 0.9995791498827632, "flos": 12531468492000.0, "grad_norm": 2.1961408289228657, "language_loss": 0.81410611, "learning_rate": 1.3652927060014973e-12, "loss": 0.83933783, "num_input_tokens_seen": 179420845, "step": 8313, "time_per_iteration": 2.763441801071167 }, { "auxiliary_loss_clip": 0.01290767, "auxiliary_loss_mlp": 0.01193164, "balance_loss_clip": 1.00728202, "balance_loss_mlp": 1.00013995, "epoch": 0.9996993927734023, "flos": 19245185532000.0, "grad_norm": 1.8924863418601383, "language_loss": 0.63654423, "learning_rate": 6.067967965872612e-13, "loss": 0.66138351, "num_input_tokens_seen": 179440455, "step": 8314, "time_per_iteration": 2.7763118743896484 }, { "auxiliary_loss_clip": 0.01284693, "auxiliary_loss_mlp": 0.01193156, "balance_loss_clip": 1.00718951, "balance_loss_mlp": 1.00013232, "epoch": 0.9998196356640414, "flos": 62952826331520.0, "grad_norm": 1.5391795693201575, "language_loss": 0.77113426, "learning_rate": 1.5169920497548615e-13, "loss": 0.79591274, "num_input_tokens_seen": 179465075, "step": 8315, "time_per_iteration": 3.2519493103027344 }, { "auxiliary_loss_clip": 0.01320516, "auxiliary_loss_mlp": 0.01192427, "balance_loss_clip": 1.00553989, "balance_loss_mlp": 1.00007081, "epoch": 0.9999398785546805, "flos": 50922406513440.0, "grad_norm": 1.1688300905593985, "language_loss": 0.55095315, "learning_rate": 0.0, "loss": 0.57608259, "num_input_tokens_seen": 179513955, "step": 8316, "time_per_iteration": 3.378781318664551 }, { "epoch": 0.9999398785546805, "num_input_tokens_seen": 179513955, "step": 8316, "total_flos": 6.996752710280151e+17, "train_loss": 0.7923522646923716, "train_runtime": 26228.9362, "train_samples_per_second": 12.682, "train_steps_per_second": 0.317 } ], "logging_steps": 1.0, "max_steps": 8316, "num_input_tokens_seen": 179513955, "num_train_epochs": 1, "save_steps": 1664, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.996752710280151e+17, "train_batch_size": 5, "trial_name": null, "trial_params": null }