{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 6553, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0015260477271426664, "grad_norm": 36.718162536621094, "learning_rate": 1.3719512195121953e-07, "loss": 1.7945, "step": 10 }, { "epoch": 0.0030520954542853328, "grad_norm": 33.861351013183594, "learning_rate": 2.896341463414635e-07, "loss": 1.7388, "step": 20 }, { "epoch": 0.004578143181427999, "grad_norm": 32.35844802856445, "learning_rate": 4.4207317073170735e-07, "loss": 1.6264, "step": 30 }, { "epoch": 0.0061041909085706655, "grad_norm": 18.111286163330078, "learning_rate": 5.945121951219512e-07, "loss": 1.4267, "step": 40 }, { "epoch": 0.007630238635713332, "grad_norm": 12.056061744689941, "learning_rate": 7.469512195121952e-07, "loss": 1.1626, "step": 50 }, { "epoch": 0.009156286362855998, "grad_norm": 9.076939582824707, "learning_rate": 8.993902439024391e-07, "loss": 0.9372, "step": 60 }, { "epoch": 0.010682334089998665, "grad_norm": 8.621874809265137, "learning_rate": 1.051829268292683e-06, "loss": 0.8972, "step": 70 }, { "epoch": 0.012208381817141331, "grad_norm": 6.114352226257324, "learning_rate": 1.204268292682927e-06, "loss": 0.771, "step": 80 }, { "epoch": 0.013734429544283997, "grad_norm": 7.380756855010986, "learning_rate": 1.356707317073171e-06, "loss": 0.7386, "step": 90 }, { "epoch": 0.015260477271426664, "grad_norm": 6.799697399139404, "learning_rate": 1.5091463414634146e-06, "loss": 0.7209, "step": 100 }, { "epoch": 0.016786524998569332, "grad_norm": 7.576807975769043, "learning_rate": 1.6615853658536587e-06, "loss": 0.6985, "step": 110 }, { "epoch": 0.018312572725711997, "grad_norm": 7.400335788726807, "learning_rate": 1.8140243902439026e-06, "loss": 0.6846, "step": 120 }, { "epoch": 0.019838620452854665, "grad_norm": 6.237104892730713, "learning_rate": 1.9664634146341467e-06, "loss": 0.6377, "step": 130 }, { "epoch": 0.02136466817999733, "grad_norm": 6.404150485992432, "learning_rate": 2.1189024390243905e-06, "loss": 0.6354, "step": 140 }, { "epoch": 0.022890715907139998, "grad_norm": 7.284428596496582, "learning_rate": 2.2713414634146344e-06, "loss": 0.6543, "step": 150 }, { "epoch": 0.024416763634282662, "grad_norm": 5.5003156661987305, "learning_rate": 2.4237804878048783e-06, "loss": 0.6016, "step": 160 }, { "epoch": 0.02594281136142533, "grad_norm": 7.994759559631348, "learning_rate": 2.576219512195122e-06, "loss": 0.6073, "step": 170 }, { "epoch": 0.027468859088567995, "grad_norm": 7.063292980194092, "learning_rate": 2.7286585365853664e-06, "loss": 0.5698, "step": 180 }, { "epoch": 0.028994906815710663, "grad_norm": 5.892914295196533, "learning_rate": 2.88109756097561e-06, "loss": 0.5679, "step": 190 }, { "epoch": 0.030520954542853328, "grad_norm": 5.543047904968262, "learning_rate": 3.0335365853658537e-06, "loss": 0.5655, "step": 200 }, { "epoch": 0.032047002269995996, "grad_norm": 6.323866844177246, "learning_rate": 3.185975609756098e-06, "loss": 0.5619, "step": 210 }, { "epoch": 0.033573049997138664, "grad_norm": 6.957103729248047, "learning_rate": 3.338414634146342e-06, "loss": 0.553, "step": 220 }, { "epoch": 0.035099097724281325, "grad_norm": 5.986090183258057, "learning_rate": 3.4908536585365853e-06, "loss": 0.5503, "step": 230 }, { "epoch": 0.03662514545142399, "grad_norm": 5.522103786468506, "learning_rate": 3.6432926829268296e-06, "loss": 0.5493, "step": 240 }, { "epoch": 0.03815119317856666, "grad_norm": 5.235415935516357, "learning_rate": 3.7957317073170735e-06, "loss": 0.5534, "step": 250 }, { "epoch": 0.03967724090570933, "grad_norm": 5.9757819175720215, "learning_rate": 3.948170731707318e-06, "loss": 0.5368, "step": 260 }, { "epoch": 0.04120328863285199, "grad_norm": 5.101258754730225, "learning_rate": 4.100609756097561e-06, "loss": 0.5254, "step": 270 }, { "epoch": 0.04272933635999466, "grad_norm": 6.364596843719482, "learning_rate": 4.2530487804878055e-06, "loss": 0.5423, "step": 280 }, { "epoch": 0.04425538408713733, "grad_norm": 6.032998085021973, "learning_rate": 4.405487804878049e-06, "loss": 0.5257, "step": 290 }, { "epoch": 0.045781431814279995, "grad_norm": 5.804418563842773, "learning_rate": 4.557926829268293e-06, "loss": 0.5432, "step": 300 }, { "epoch": 0.047307479541422656, "grad_norm": 5.553419589996338, "learning_rate": 4.710365853658536e-06, "loss": 0.5413, "step": 310 }, { "epoch": 0.048833527268565324, "grad_norm": 7.154079914093018, "learning_rate": 4.862804878048781e-06, "loss": 0.5407, "step": 320 }, { "epoch": 0.05035957499570799, "grad_norm": 7.198996543884277, "learning_rate": 5.015243902439024e-06, "loss": 0.5432, "step": 330 }, { "epoch": 0.05188562272285066, "grad_norm": 5.0478434562683105, "learning_rate": 5.167682926829268e-06, "loss": 0.5125, "step": 340 }, { "epoch": 0.05341167044999332, "grad_norm": 6.1683573722839355, "learning_rate": 5.320121951219513e-06, "loss": 0.5321, "step": 350 }, { "epoch": 0.05493771817713599, "grad_norm": 7.884255409240723, "learning_rate": 5.4725609756097565e-06, "loss": 0.5399, "step": 360 }, { "epoch": 0.05646376590427866, "grad_norm": 5.224252700805664, "learning_rate": 5.625e-06, "loss": 0.5539, "step": 370 }, { "epoch": 0.057989813631421326, "grad_norm": 4.74583101272583, "learning_rate": 5.777439024390244e-06, "loss": 0.5503, "step": 380 }, { "epoch": 0.05951586135856399, "grad_norm": 5.914930820465088, "learning_rate": 5.929878048780489e-06, "loss": 0.5398, "step": 390 }, { "epoch": 0.061041909085706655, "grad_norm": 4.671189308166504, "learning_rate": 6.082317073170733e-06, "loss": 0.5481, "step": 400 }, { "epoch": 0.06256795681284932, "grad_norm": 4.802231788635254, "learning_rate": 6.234756097560977e-06, "loss": 0.568, "step": 410 }, { "epoch": 0.06409400453999199, "grad_norm": 4.851030349731445, "learning_rate": 6.38719512195122e-06, "loss": 0.5599, "step": 420 }, { "epoch": 0.06562005226713466, "grad_norm": 5.160627365112305, "learning_rate": 6.5396341463414636e-06, "loss": 0.5352, "step": 430 }, { "epoch": 0.06714609999427733, "grad_norm": 5.0278754234313965, "learning_rate": 6.6920731707317074e-06, "loss": 0.545, "step": 440 }, { "epoch": 0.06867214772141998, "grad_norm": 5.1211090087890625, "learning_rate": 6.844512195121952e-06, "loss": 0.5321, "step": 450 }, { "epoch": 0.07019819544856265, "grad_norm": 4.986229419708252, "learning_rate": 6.996951219512196e-06, "loss": 0.5584, "step": 460 }, { "epoch": 0.07172424317570532, "grad_norm": 4.192492485046387, "learning_rate": 7.14939024390244e-06, "loss": 0.5462, "step": 470 }, { "epoch": 0.07325029090284799, "grad_norm": 3.977277994155884, "learning_rate": 7.301829268292684e-06, "loss": 0.5366, "step": 480 }, { "epoch": 0.07477633862999065, "grad_norm": 4.757632732391357, "learning_rate": 7.454268292682928e-06, "loss": 0.5493, "step": 490 }, { "epoch": 0.07630238635713332, "grad_norm": 4.987619400024414, "learning_rate": 7.606707317073171e-06, "loss": 0.5282, "step": 500 }, { "epoch": 0.07630238635713332, "eval_loss": 0.5320242047309875, "eval_runtime": 100.1496, "eval_samples_per_second": 5.292, "eval_steps_per_second": 2.646, "step": 500 }, { "epoch": 0.07782843408427599, "grad_norm": 5.458449840545654, "learning_rate": 7.759146341463415e-06, "loss": 0.5464, "step": 510 }, { "epoch": 0.07935448181141866, "grad_norm": 5.1865081787109375, "learning_rate": 7.911585365853658e-06, "loss": 0.5456, "step": 520 }, { "epoch": 0.08088052953856131, "grad_norm": 4.639908313751221, "learning_rate": 8.064024390243903e-06, "loss": 0.5226, "step": 530 }, { "epoch": 0.08240657726570398, "grad_norm": 4.7745537757873535, "learning_rate": 8.216463414634148e-06, "loss": 0.5623, "step": 540 }, { "epoch": 0.08393262499284665, "grad_norm": 4.4498515129089355, "learning_rate": 8.36890243902439e-06, "loss": 0.5402, "step": 550 }, { "epoch": 0.08545867271998932, "grad_norm": 4.127673149108887, "learning_rate": 8.521341463414636e-06, "loss": 0.5526, "step": 560 }, { "epoch": 0.08698472044713199, "grad_norm": 5.61818790435791, "learning_rate": 8.673780487804879e-06, "loss": 0.5413, "step": 570 }, { "epoch": 0.08851076817427465, "grad_norm": 3.518537998199463, "learning_rate": 8.826219512195122e-06, "loss": 0.5536, "step": 580 }, { "epoch": 0.09003681590141732, "grad_norm": 4.232659339904785, "learning_rate": 8.978658536585366e-06, "loss": 0.5572, "step": 590 }, { "epoch": 0.09156286362855999, "grad_norm": 4.013524532318115, "learning_rate": 9.131097560975611e-06, "loss": 0.5236, "step": 600 }, { "epoch": 0.09308891135570264, "grad_norm": 3.9425785541534424, "learning_rate": 9.283536585365854e-06, "loss": 0.5643, "step": 610 }, { "epoch": 0.09461495908284531, "grad_norm": 4.538547515869141, "learning_rate": 9.435975609756099e-06, "loss": 0.5565, "step": 620 }, { "epoch": 0.09614100680998798, "grad_norm": 3.850074291229248, "learning_rate": 9.588414634146342e-06, "loss": 0.5319, "step": 630 }, { "epoch": 0.09766705453713065, "grad_norm": 5.455791473388672, "learning_rate": 9.740853658536586e-06, "loss": 0.5412, "step": 640 }, { "epoch": 0.09919310226427332, "grad_norm": 3.3886823654174805, "learning_rate": 9.893292682926831e-06, "loss": 0.566, "step": 650 }, { "epoch": 0.10071914999141598, "grad_norm": 3.8116540908813477, "learning_rate": 9.999993614132319e-06, "loss": 0.5627, "step": 660 }, { "epoch": 0.10224519771855865, "grad_norm": 3.609804630279541, "learning_rate": 9.99988008804953e-06, "loss": 0.5576, "step": 670 }, { "epoch": 0.10377124544570132, "grad_norm": 3.923624038696289, "learning_rate": 9.999624657504754e-06, "loss": 0.5728, "step": 680 }, { "epoch": 0.10529729317284398, "grad_norm": 4.188690185546875, "learning_rate": 9.99922732974751e-06, "loss": 0.5676, "step": 690 }, { "epoch": 0.10682334089998664, "grad_norm": 3.9573514461517334, "learning_rate": 9.998688116054583e-06, "loss": 0.5362, "step": 700 }, { "epoch": 0.10834938862712931, "grad_norm": 3.8924996852874756, "learning_rate": 9.99800703172971e-06, "loss": 0.557, "step": 710 }, { "epoch": 0.10987543635427198, "grad_norm": 4.514781475067139, "learning_rate": 9.997184096103133e-06, "loss": 0.5729, "step": 720 }, { "epoch": 0.11140148408141465, "grad_norm": 3.555657148361206, "learning_rate": 9.996219332531059e-06, "loss": 0.5735, "step": 730 }, { "epoch": 0.11292753180855732, "grad_norm": 3.8453004360198975, "learning_rate": 9.995112768394996e-06, "loss": 0.5492, "step": 740 }, { "epoch": 0.11445357953569998, "grad_norm": 4.1153435707092285, "learning_rate": 9.993864435100976e-06, "loss": 0.5273, "step": 750 }, { "epoch": 0.11597962726284265, "grad_norm": 3.793621063232422, "learning_rate": 9.992474368078664e-06, "loss": 0.5744, "step": 760 }, { "epoch": 0.1175056749899853, "grad_norm": 3.394721508026123, "learning_rate": 9.990942606780344e-06, "loss": 0.5554, "step": 770 }, { "epoch": 0.11903172271712797, "grad_norm": 4.319253444671631, "learning_rate": 9.989269194679814e-06, "loss": 0.5161, "step": 780 }, { "epoch": 0.12055777044427064, "grad_norm": 3.6960957050323486, "learning_rate": 9.987454179271138e-06, "loss": 0.5467, "step": 790 }, { "epoch": 0.12208381817141331, "grad_norm": 4.272833347320557, "learning_rate": 9.985497612067315e-06, "loss": 0.5764, "step": 800 }, { "epoch": 0.12360986589855598, "grad_norm": 3.716593027114868, "learning_rate": 9.983399548598795e-06, "loss": 0.5644, "step": 810 }, { "epoch": 0.12513591362569865, "grad_norm": 3.8028016090393066, "learning_rate": 9.981160048411922e-06, "loss": 0.5442, "step": 820 }, { "epoch": 0.1266619613528413, "grad_norm": 3.634533405303955, "learning_rate": 9.978779175067232e-06, "loss": 0.5642, "step": 830 }, { "epoch": 0.12818800907998398, "grad_norm": 4.484054088592529, "learning_rate": 9.976256996137657e-06, "loss": 0.5826, "step": 840 }, { "epoch": 0.12971405680712664, "grad_norm": 3.2868642807006836, "learning_rate": 9.973593583206602e-06, "loss": 0.5643, "step": 850 }, { "epoch": 0.13124010453426932, "grad_norm": 3.641526937484741, "learning_rate": 9.970789011865916e-06, "loss": 0.5577, "step": 860 }, { "epoch": 0.13276615226141197, "grad_norm": 3.82144832611084, "learning_rate": 9.967843361713747e-06, "loss": 0.5374, "step": 870 }, { "epoch": 0.13429219998855466, "grad_norm": 3.786513090133667, "learning_rate": 9.964756716352277e-06, "loss": 0.5523, "step": 880 }, { "epoch": 0.1358182477156973, "grad_norm": 3.7389333248138428, "learning_rate": 9.96152916338536e-06, "loss": 0.5708, "step": 890 }, { "epoch": 0.13734429544283996, "grad_norm": 3.5762176513671875, "learning_rate": 9.958160794416022e-06, "loss": 0.5481, "step": 900 }, { "epoch": 0.13887034316998265, "grad_norm": 3.7925140857696533, "learning_rate": 9.954651705043878e-06, "loss": 0.5814, "step": 910 }, { "epoch": 0.1403963908971253, "grad_norm": 4.10577917098999, "learning_rate": 9.951001994862402e-06, "loss": 0.5574, "step": 920 }, { "epoch": 0.14192243862426798, "grad_norm": 3.477315664291382, "learning_rate": 9.947211767456111e-06, "loss": 0.5472, "step": 930 }, { "epoch": 0.14344848635141064, "grad_norm": 3.1365439891815186, "learning_rate": 9.943281130397619e-06, "loss": 0.5759, "step": 940 }, { "epoch": 0.14497453407855332, "grad_norm": 4.209506988525391, "learning_rate": 9.93921019524459e-06, "loss": 0.5544, "step": 950 }, { "epoch": 0.14650058180569597, "grad_norm": 3.269994020462036, "learning_rate": 9.934999077536567e-06, "loss": 0.5577, "step": 960 }, { "epoch": 0.14802662953283863, "grad_norm": 3.4426379203796387, "learning_rate": 9.930647896791696e-06, "loss": 0.5498, "step": 970 }, { "epoch": 0.1495526772599813, "grad_norm": 3.949375629425049, "learning_rate": 9.92615677650333e-06, "loss": 0.5452, "step": 980 }, { "epoch": 0.15107872498712396, "grad_norm": 3.42270827293396, "learning_rate": 9.92152584413653e-06, "loss": 0.5393, "step": 990 }, { "epoch": 0.15260477271426665, "grad_norm": 4.055193901062012, "learning_rate": 9.916755231124437e-06, "loss": 0.5294, "step": 1000 }, { "epoch": 0.15260477271426665, "eval_loss": 0.5252559781074524, "eval_runtime": 99.9603, "eval_samples_per_second": 5.302, "eval_steps_per_second": 2.651, "step": 1000 }, { "epoch": 0.1541308204414093, "grad_norm": 3.2706804275512695, "learning_rate": 9.911845072864556e-06, "loss": 0.522, "step": 1010 }, { "epoch": 0.15565686816855198, "grad_norm": 3.859898090362549, "learning_rate": 9.906795508714901e-06, "loss": 0.5373, "step": 1020 }, { "epoch": 0.15718291589569464, "grad_norm": 3.1161351203918457, "learning_rate": 9.901606681990048e-06, "loss": 0.5471, "step": 1030 }, { "epoch": 0.15870896362283732, "grad_norm": 3.452761650085449, "learning_rate": 9.89627873995706e-06, "loss": 0.5532, "step": 1040 }, { "epoch": 0.16023501134997997, "grad_norm": 3.6008405685424805, "learning_rate": 9.890811833831312e-06, "loss": 0.5075, "step": 1050 }, { "epoch": 0.16176105907712263, "grad_norm": 4.958362579345703, "learning_rate": 9.885206118772201e-06, "loss": 0.5404, "step": 1060 }, { "epoch": 0.1632871068042653, "grad_norm": 3.1011452674865723, "learning_rate": 9.879461753878738e-06, "loss": 0.5456, "step": 1070 }, { "epoch": 0.16481315453140796, "grad_norm": 3.445542097091675, "learning_rate": 9.873578902185034e-06, "loss": 0.5299, "step": 1080 }, { "epoch": 0.16633920225855064, "grad_norm": 3.2124245166778564, "learning_rate": 9.867557730655676e-06, "loss": 0.5555, "step": 1090 }, { "epoch": 0.1678652499856933, "grad_norm": 3.3892674446105957, "learning_rate": 9.861398410180985e-06, "loss": 0.5429, "step": 1100 }, { "epoch": 0.16939129771283598, "grad_norm": 3.534641981124878, "learning_rate": 9.855101115572161e-06, "loss": 0.5564, "step": 1110 }, { "epoch": 0.17091734543997864, "grad_norm": 2.8586363792419434, "learning_rate": 9.848666025556332e-06, "loss": 0.5155, "step": 1120 }, { "epoch": 0.1724433931671213, "grad_norm": 4.134357452392578, "learning_rate": 9.842093322771479e-06, "loss": 0.5292, "step": 1130 }, { "epoch": 0.17396944089426397, "grad_norm": 2.917952060699463, "learning_rate": 9.83538319376124e-06, "loss": 0.5471, "step": 1140 }, { "epoch": 0.17549548862140663, "grad_norm": 3.148503065109253, "learning_rate": 9.82853582896964e-06, "loss": 0.5084, "step": 1150 }, { "epoch": 0.1770215363485493, "grad_norm": 2.9326202869415283, "learning_rate": 9.821551422735655e-06, "loss": 0.5303, "step": 1160 }, { "epoch": 0.17854758407569196, "grad_norm": 2.8527936935424805, "learning_rate": 9.814430173287724e-06, "loss": 0.5265, "step": 1170 }, { "epoch": 0.18007363180283464, "grad_norm": 3.373987913131714, "learning_rate": 9.807172282738109e-06, "loss": 0.5267, "step": 1180 }, { "epoch": 0.1815996795299773, "grad_norm": 3.886758804321289, "learning_rate": 9.799777957077161e-06, "loss": 0.5449, "step": 1190 }, { "epoch": 0.18312572725711998, "grad_norm": 2.8181774616241455, "learning_rate": 9.792247406167471e-06, "loss": 0.5477, "step": 1200 }, { "epoch": 0.18465177498426263, "grad_norm": 3.1215786933898926, "learning_rate": 9.784580843737924e-06, "loss": 0.5317, "step": 1210 }, { "epoch": 0.1861778227114053, "grad_norm": 3.4757418632507324, "learning_rate": 9.776778487377622e-06, "loss": 0.5312, "step": 1220 }, { "epoch": 0.18770387043854797, "grad_norm": 2.941584587097168, "learning_rate": 9.768840558529708e-06, "loss": 0.5372, "step": 1230 }, { "epoch": 0.18922991816569062, "grad_norm": 3.1221237182617188, "learning_rate": 9.760767282485091e-06, "loss": 0.5246, "step": 1240 }, { "epoch": 0.1907559658928333, "grad_norm": 2.970177173614502, "learning_rate": 9.752558888376045e-06, "loss": 0.5269, "step": 1250 }, { "epoch": 0.19228201361997596, "grad_norm": 3.686633348464966, "learning_rate": 9.744215609169709e-06, "loss": 0.5239, "step": 1260 }, { "epoch": 0.19380806134711864, "grad_norm": 2.774200439453125, "learning_rate": 9.735737681661467e-06, "loss": 0.5175, "step": 1270 }, { "epoch": 0.1953341090742613, "grad_norm": 3.612818717956543, "learning_rate": 9.727125346468243e-06, "loss": 0.5144, "step": 1280 }, { "epoch": 0.19686015680140395, "grad_norm": 2.7817158699035645, "learning_rate": 9.718378848021655e-06, "loss": 0.5417, "step": 1290 }, { "epoch": 0.19838620452854663, "grad_norm": 3.400517463684082, "learning_rate": 9.709498434561086e-06, "loss": 0.4989, "step": 1300 }, { "epoch": 0.1999122522556893, "grad_norm": 2.9461493492126465, "learning_rate": 9.70048435812664e-06, "loss": 0.5235, "step": 1310 }, { "epoch": 0.20143829998283197, "grad_norm": 3.0229151248931885, "learning_rate": 9.69133687455199e-06, "loss": 0.4988, "step": 1320 }, { "epoch": 0.20296434770997462, "grad_norm": 4.051263332366943, "learning_rate": 9.682056243457105e-06, "loss": 0.5394, "step": 1330 }, { "epoch": 0.2044903954371173, "grad_norm": 2.998913049697876, "learning_rate": 9.672642728240896e-06, "loss": 0.549, "step": 1340 }, { "epoch": 0.20601644316425996, "grad_norm": 3.4110162258148193, "learning_rate": 9.663096596073732e-06, "loss": 0.4888, "step": 1350 }, { "epoch": 0.20754249089140264, "grad_norm": 3.530796766281128, "learning_rate": 9.653418117889862e-06, "loss": 0.5272, "step": 1360 }, { "epoch": 0.2090685386185453, "grad_norm": 3.0355026721954346, "learning_rate": 9.64360756837972e-06, "loss": 0.5322, "step": 1370 }, { "epoch": 0.21059458634568795, "grad_norm": 2.8864426612854004, "learning_rate": 9.633665225982134e-06, "loss": 0.49, "step": 1380 }, { "epoch": 0.21212063407283063, "grad_norm": 4.0944132804870605, "learning_rate": 9.623591372876422e-06, "loss": 0.5502, "step": 1390 }, { "epoch": 0.2136466817999733, "grad_norm": 2.89972186088562, "learning_rate": 9.61338629497438e-06, "loss": 0.5178, "step": 1400 }, { "epoch": 0.21517272952711597, "grad_norm": 2.904897689819336, "learning_rate": 9.603050281912175e-06, "loss": 0.5471, "step": 1410 }, { "epoch": 0.21669877725425862, "grad_norm": 2.9118282794952393, "learning_rate": 9.592583627042115e-06, "loss": 0.5214, "step": 1420 }, { "epoch": 0.2182248249814013, "grad_norm": 3.003340244293213, "learning_rate": 9.581986627424329e-06, "loss": 0.5266, "step": 1430 }, { "epoch": 0.21975087270854396, "grad_norm": 2.8239073753356934, "learning_rate": 9.571259583818337e-06, "loss": 0.5305, "step": 1440 }, { "epoch": 0.2212769204356866, "grad_norm": 3.4803032875061035, "learning_rate": 9.56040280067451e-06, "loss": 0.5336, "step": 1450 }, { "epoch": 0.2228029681628293, "grad_norm": 2.8414413928985596, "learning_rate": 9.549416586125435e-06, "loss": 0.5341, "step": 1460 }, { "epoch": 0.22432901588997195, "grad_norm": 3.3560791015625, "learning_rate": 9.538301251977158e-06, "loss": 0.5175, "step": 1470 }, { "epoch": 0.22585506361711463, "grad_norm": 3.5832326412200928, "learning_rate": 9.52705711370035e-06, "loss": 0.5453, "step": 1480 }, { "epoch": 0.2273811113442573, "grad_norm": 2.907194137573242, "learning_rate": 9.515684490421342e-06, "loss": 0.5256, "step": 1490 }, { "epoch": 0.22890715907139997, "grad_norm": 3.446336269378662, "learning_rate": 9.504183704913075e-06, "loss": 0.5116, "step": 1500 }, { "epoch": 0.22890715907139997, "eval_loss": 0.5038516521453857, "eval_runtime": 99.9985, "eval_samples_per_second": 5.3, "eval_steps_per_second": 2.65, "step": 1500 }, { "epoch": 0.23043320679854262, "grad_norm": 3.575700044631958, "learning_rate": 9.492555083585928e-06, "loss": 0.5215, "step": 1510 }, { "epoch": 0.2319592545256853, "grad_norm": 2.7438437938690186, "learning_rate": 9.480798956478473e-06, "loss": 0.5381, "step": 1520 }, { "epoch": 0.23348530225282796, "grad_norm": 3.586581230163574, "learning_rate": 9.468915657248083e-06, "loss": 0.5361, "step": 1530 }, { "epoch": 0.2350113499799706, "grad_norm": 2.979966878890991, "learning_rate": 9.45690552316149e-06, "loss": 0.5385, "step": 1540 }, { "epoch": 0.2365373977071133, "grad_norm": 3.4089224338531494, "learning_rate": 9.44476889508519e-06, "loss": 0.5122, "step": 1550 }, { "epoch": 0.23806344543425595, "grad_norm": 3.400916337966919, "learning_rate": 9.432506117475777e-06, "loss": 0.4855, "step": 1560 }, { "epoch": 0.23958949316139863, "grad_norm": 3.0944440364837646, "learning_rate": 9.420117538370173e-06, "loss": 0.5314, "step": 1570 }, { "epoch": 0.24111554088854129, "grad_norm": 3.072086811065674, "learning_rate": 9.407603509375737e-06, "loss": 0.5487, "step": 1580 }, { "epoch": 0.24264158861568397, "grad_norm": 2.866974353790283, "learning_rate": 9.394964385660302e-06, "loss": 0.5199, "step": 1590 }, { "epoch": 0.24416763634282662, "grad_norm": 3.2548046112060547, "learning_rate": 9.382200525942076e-06, "loss": 0.536, "step": 1600 }, { "epoch": 0.24569368406996928, "grad_norm": 3.636455535888672, "learning_rate": 9.369312292479479e-06, "loss": 0.5102, "step": 1610 }, { "epoch": 0.24721973179711196, "grad_norm": 2.7913310527801514, "learning_rate": 9.35630005106085e-06, "loss": 0.5174, "step": 1620 }, { "epoch": 0.2487457795242546, "grad_norm": 3.905932903289795, "learning_rate": 9.34316417099407e-06, "loss": 0.5399, "step": 1630 }, { "epoch": 0.2502718272513973, "grad_norm": 2.9708142280578613, "learning_rate": 9.329905025096079e-06, "loss": 0.5139, "step": 1640 }, { "epoch": 0.25179787497853995, "grad_norm": 2.849421977996826, "learning_rate": 9.316522989682293e-06, "loss": 0.4887, "step": 1650 }, { "epoch": 0.2533239227056826, "grad_norm": 3.2548842430114746, "learning_rate": 9.30301844455593e-06, "loss": 0.5173, "step": 1660 }, { "epoch": 0.2548499704328253, "grad_norm": 3.9787535667419434, "learning_rate": 9.289391772997223e-06, "loss": 0.5295, "step": 1670 }, { "epoch": 0.25637601815996797, "grad_norm": 2.5555968284606934, "learning_rate": 9.275643361752546e-06, "loss": 0.5371, "step": 1680 }, { "epoch": 0.2579020658871106, "grad_norm": 3.158561944961548, "learning_rate": 9.261773601023439e-06, "loss": 0.5305, "step": 1690 }, { "epoch": 0.2594281136142533, "grad_norm": 3.1799142360687256, "learning_rate": 9.247782884455526e-06, "loss": 0.5293, "step": 1700 }, { "epoch": 0.26095416134139593, "grad_norm": 2.7630088329315186, "learning_rate": 9.233671609127352e-06, "loss": 0.5102, "step": 1710 }, { "epoch": 0.26248020906853864, "grad_norm": 3.3492443561553955, "learning_rate": 9.219440175539113e-06, "loss": 0.516, "step": 1720 }, { "epoch": 0.2640062567956813, "grad_norm": 3.2532637119293213, "learning_rate": 9.205088987601277e-06, "loss": 0.5063, "step": 1730 }, { "epoch": 0.26553230452282395, "grad_norm": 3.098576068878174, "learning_rate": 9.190618452623135e-06, "loss": 0.4973, "step": 1740 }, { "epoch": 0.2670583522499666, "grad_norm": 3.852489709854126, "learning_rate": 9.176028981301229e-06, "loss": 0.4778, "step": 1750 }, { "epoch": 0.2685843999771093, "grad_norm": 3.133566379547119, "learning_rate": 9.16132098770771e-06, "loss": 0.5031, "step": 1760 }, { "epoch": 0.27011044770425197, "grad_norm": 3.0958361625671387, "learning_rate": 9.146494889278568e-06, "loss": 0.4784, "step": 1770 }, { "epoch": 0.2716364954313946, "grad_norm": 3.499459743499756, "learning_rate": 9.131551106801803e-06, "loss": 0.5071, "step": 1780 }, { "epoch": 0.2731625431585373, "grad_norm": 2.8909738063812256, "learning_rate": 9.116490064405467e-06, "loss": 0.5116, "step": 1790 }, { "epoch": 0.27468859088567993, "grad_norm": 2.8877241611480713, "learning_rate": 9.101312189545636e-06, "loss": 0.4888, "step": 1800 }, { "epoch": 0.27621463861282264, "grad_norm": 2.978130578994751, "learning_rate": 9.086017912994272e-06, "loss": 0.5325, "step": 1810 }, { "epoch": 0.2777406863399653, "grad_norm": 3.5364253520965576, "learning_rate": 9.070607668827003e-06, "loss": 0.5285, "step": 1820 }, { "epoch": 0.27926673406710795, "grad_norm": 2.8093996047973633, "learning_rate": 9.055081894410802e-06, "loss": 0.4959, "step": 1830 }, { "epoch": 0.2807927817942506, "grad_norm": 2.98183274269104, "learning_rate": 9.03944103039157e-06, "loss": 0.501, "step": 1840 }, { "epoch": 0.28231882952139326, "grad_norm": 3.1950182914733887, "learning_rate": 9.023685520681626e-06, "loss": 0.5204, "step": 1850 }, { "epoch": 0.28384487724853597, "grad_norm": 2.9772353172302246, "learning_rate": 9.007815812447126e-06, "loss": 0.4989, "step": 1860 }, { "epoch": 0.2853709249756786, "grad_norm": 3.50301194190979, "learning_rate": 8.991832356095351e-06, "loss": 0.5032, "step": 1870 }, { "epoch": 0.2868969727028213, "grad_norm": 2.9427924156188965, "learning_rate": 8.975735605261936e-06, "loss": 0.5196, "step": 1880 }, { "epoch": 0.2884230204299639, "grad_norm": 2.9805080890655518, "learning_rate": 8.95952601679799e-06, "loss": 0.5092, "step": 1890 }, { "epoch": 0.28994906815710664, "grad_norm": 2.65608811378479, "learning_rate": 8.943204050757133e-06, "loss": 0.5106, "step": 1900 }, { "epoch": 0.2914751158842493, "grad_norm": 2.7500367164611816, "learning_rate": 8.926770170382434e-06, "loss": 0.4999, "step": 1910 }, { "epoch": 0.29300116361139195, "grad_norm": 3.063328266143799, "learning_rate": 8.910224842093275e-06, "loss": 0.5164, "step": 1920 }, { "epoch": 0.2945272113385346, "grad_norm": 3.1675572395324707, "learning_rate": 8.893568535472094e-06, "loss": 0.4857, "step": 1930 }, { "epoch": 0.29605325906567725, "grad_norm": 4.051036357879639, "learning_rate": 8.876801723251077e-06, "loss": 0.4995, "step": 1940 }, { "epoch": 0.29757930679281996, "grad_norm": 3.0843279361724854, "learning_rate": 8.859924881298729e-06, "loss": 0.4869, "step": 1950 }, { "epoch": 0.2991053545199626, "grad_norm": 3.1268937587738037, "learning_rate": 8.842938488606373e-06, "loss": 0.516, "step": 1960 }, { "epoch": 0.3006314022471053, "grad_norm": 2.659601926803589, "learning_rate": 8.825843027274554e-06, "loss": 0.4658, "step": 1970 }, { "epoch": 0.3021574499742479, "grad_norm": 2.756713628768921, "learning_rate": 8.80863898249936e-06, "loss": 0.5115, "step": 1980 }, { "epoch": 0.30368349770139064, "grad_norm": 2.470733404159546, "learning_rate": 8.79132684255864e-06, "loss": 0.5134, "step": 1990 }, { "epoch": 0.3052095454285333, "grad_norm": 3.007568597793579, "learning_rate": 8.773907098798158e-06, "loss": 0.5121, "step": 2000 }, { "epoch": 0.3052095454285333, "eval_loss": 0.48503902554512024, "eval_runtime": 99.9999, "eval_samples_per_second": 5.3, "eval_steps_per_second": 2.65, "step": 2000 }, { "epoch": 0.30673559315567595, "grad_norm": 3.3363804817199707, "learning_rate": 8.756380245617645e-06, "loss": 0.4861, "step": 2010 }, { "epoch": 0.3082616408828186, "grad_norm": 3.0314290523529053, "learning_rate": 8.73874678045677e-06, "loss": 0.4963, "step": 2020 }, { "epoch": 0.30978768860996125, "grad_norm": 2.7498457431793213, "learning_rate": 8.721007203781008e-06, "loss": 0.5125, "step": 2030 }, { "epoch": 0.31131373633710396, "grad_norm": 3.2180142402648926, "learning_rate": 8.703162019067451e-06, "loss": 0.5148, "step": 2040 }, { "epoch": 0.3128397840642466, "grad_norm": 3.160686492919922, "learning_rate": 8.685211732790513e-06, "loss": 0.4915, "step": 2050 }, { "epoch": 0.31436583179138927, "grad_norm": 2.880316972732544, "learning_rate": 8.667156854407555e-06, "loss": 0.5154, "step": 2060 }, { "epoch": 0.3158918795185319, "grad_norm": 3.6688642501831055, "learning_rate": 8.648997896344429e-06, "loss": 0.5257, "step": 2070 }, { "epoch": 0.31741792724567464, "grad_norm": 2.8660826683044434, "learning_rate": 8.630735373980926e-06, "loss": 0.5134, "step": 2080 }, { "epoch": 0.3189439749728173, "grad_norm": 2.69706392288208, "learning_rate": 8.612369805636161e-06, "loss": 0.4847, "step": 2090 }, { "epoch": 0.32047002269995994, "grad_norm": 2.907597064971924, "learning_rate": 8.593901712553853e-06, "loss": 0.5196, "step": 2100 }, { "epoch": 0.3219960704271026, "grad_norm": 3.240574836730957, "learning_rate": 8.575331618887537e-06, "loss": 0.5104, "step": 2110 }, { "epoch": 0.32352211815424525, "grad_norm": 3.215815782546997, "learning_rate": 8.556660051685679e-06, "loss": 0.4973, "step": 2120 }, { "epoch": 0.32504816588138796, "grad_norm": 2.7997822761535645, "learning_rate": 8.537887540876732e-06, "loss": 0.5099, "step": 2130 }, { "epoch": 0.3265742136085306, "grad_norm": 2.945455551147461, "learning_rate": 8.519014619254081e-06, "loss": 0.5112, "step": 2140 }, { "epoch": 0.32810026133567327, "grad_norm": 3.219611644744873, "learning_rate": 8.50004182246093e-06, "loss": 0.5086, "step": 2150 }, { "epoch": 0.3296263090628159, "grad_norm": 2.8295400142669678, "learning_rate": 8.480969688975094e-06, "loss": 0.4897, "step": 2160 }, { "epoch": 0.3311523567899586, "grad_norm": 2.7116832733154297, "learning_rate": 8.461798760093728e-06, "loss": 0.4751, "step": 2170 }, { "epoch": 0.3326784045171013, "grad_norm": 3.04294490814209, "learning_rate": 8.44252957991795e-06, "loss": 0.4784, "step": 2180 }, { "epoch": 0.33420445224424394, "grad_norm": 2.6779112815856934, "learning_rate": 8.423162695337402e-06, "loss": 0.5155, "step": 2190 }, { "epoch": 0.3357304999713866, "grad_norm": 3.5296261310577393, "learning_rate": 8.403698656014734e-06, "loss": 0.509, "step": 2200 }, { "epoch": 0.33725654769852925, "grad_norm": 3.0861217975616455, "learning_rate": 8.384138014370003e-06, "loss": 0.4961, "step": 2210 }, { "epoch": 0.33878259542567196, "grad_norm": 2.9720520973205566, "learning_rate": 8.364481325564983e-06, "loss": 0.4759, "step": 2220 }, { "epoch": 0.3403086431528146, "grad_norm": 2.8393325805664062, "learning_rate": 8.344729147487431e-06, "loss": 0.502, "step": 2230 }, { "epoch": 0.34183469087995727, "grad_norm": 2.930060386657715, "learning_rate": 8.324882040735227e-06, "loss": 0.4914, "step": 2240 }, { "epoch": 0.3433607386070999, "grad_norm": 3.05932879447937, "learning_rate": 8.304940568600482e-06, "loss": 0.5184, "step": 2250 }, { "epoch": 0.3448867863342426, "grad_norm": 3.455152750015259, "learning_rate": 8.284905297053544e-06, "loss": 0.4881, "step": 2260 }, { "epoch": 0.3464128340613853, "grad_norm": 3.1637487411499023, "learning_rate": 8.264776794726938e-06, "loss": 0.5001, "step": 2270 }, { "epoch": 0.34793888178852794, "grad_norm": 3.0358266830444336, "learning_rate": 8.244555632899223e-06, "loss": 0.4795, "step": 2280 }, { "epoch": 0.3494649295156706, "grad_norm": 2.930438756942749, "learning_rate": 8.224242385478778e-06, "loss": 0.4814, "step": 2290 }, { "epoch": 0.35099097724281325, "grad_norm": 2.8416764736175537, "learning_rate": 8.203837628987523e-06, "loss": 0.463, "step": 2300 }, { "epoch": 0.35251702496995596, "grad_norm": 2.61423921585083, "learning_rate": 8.18334194254454e-06, "loss": 0.4984, "step": 2310 }, { "epoch": 0.3540430726970986, "grad_norm": 2.8394827842712402, "learning_rate": 8.162755907849653e-06, "loss": 0.4884, "step": 2320 }, { "epoch": 0.35556912042424127, "grad_norm": 2.678520917892456, "learning_rate": 8.142080109166912e-06, "loss": 0.493, "step": 2330 }, { "epoch": 0.3570951681513839, "grad_norm": 3.0217721462249756, "learning_rate": 8.121315133308008e-06, "loss": 0.483, "step": 2340 }, { "epoch": 0.3586212158785266, "grad_norm": 2.741018533706665, "learning_rate": 8.100461569615615e-06, "loss": 0.5032, "step": 2350 }, { "epoch": 0.3601472636056693, "grad_norm": 3.2703857421875, "learning_rate": 8.079520009946678e-06, "loss": 0.4871, "step": 2360 }, { "epoch": 0.36167331133281194, "grad_norm": 2.5858423709869385, "learning_rate": 8.058491048655603e-06, "loss": 0.5108, "step": 2370 }, { "epoch": 0.3631993590599546, "grad_norm": 3.112304925918579, "learning_rate": 8.037375282577391e-06, "loss": 0.5019, "step": 2380 }, { "epoch": 0.36472540678709725, "grad_norm": 2.5532495975494385, "learning_rate": 8.016173311010697e-06, "loss": 0.4929, "step": 2390 }, { "epoch": 0.36625145451423996, "grad_norm": 2.342569589614868, "learning_rate": 7.994885735700832e-06, "loss": 0.4905, "step": 2400 }, { "epoch": 0.3677775022413826, "grad_norm": 3.1640357971191406, "learning_rate": 7.973513160822664e-06, "loss": 0.4745, "step": 2410 }, { "epoch": 0.36930354996852527, "grad_norm": 2.825571298599243, "learning_rate": 7.952056192963488e-06, "loss": 0.4843, "step": 2420 }, { "epoch": 0.3708295976956679, "grad_norm": 2.888791084289551, "learning_rate": 7.93051544110581e-06, "loss": 0.488, "step": 2430 }, { "epoch": 0.3723556454228106, "grad_norm": 2.8343687057495117, "learning_rate": 7.908891516610048e-06, "loss": 0.4568, "step": 2440 }, { "epoch": 0.3738816931499533, "grad_norm": 3.1001429557800293, "learning_rate": 7.887185033197198e-06, "loss": 0.4695, "step": 2450 }, { "epoch": 0.37540774087709594, "grad_norm": 2.368257761001587, "learning_rate": 7.8653966069314e-06, "loss": 0.5056, "step": 2460 }, { "epoch": 0.3769337886042386, "grad_norm": 2.99737286567688, "learning_rate": 7.843526856202472e-06, "loss": 0.4855, "step": 2470 }, { "epoch": 0.37845983633138125, "grad_norm": 3.0355498790740967, "learning_rate": 7.821576401708339e-06, "loss": 0.4784, "step": 2480 }, { "epoch": 0.3799858840585239, "grad_norm": 2.5011422634124756, "learning_rate": 7.799545866437429e-06, "loss": 0.518, "step": 2490 }, { "epoch": 0.3815119317856666, "grad_norm": 2.846738338470459, "learning_rate": 7.777435875650986e-06, "loss": 0.4778, "step": 2500 }, { "epoch": 0.3815119317856666, "eval_loss": 0.4665885865688324, "eval_runtime": 99.9829, "eval_samples_per_second": 5.301, "eval_steps_per_second": 2.65, "step": 2500 }, { "epoch": 0.38303797951280927, "grad_norm": 2.755929708480835, "learning_rate": 7.755247056865332e-06, "loss": 0.4726, "step": 2510 }, { "epoch": 0.3845640272399519, "grad_norm": 3.3097054958343506, "learning_rate": 7.732980039834048e-06, "loss": 0.4839, "step": 2520 }, { "epoch": 0.3860900749670946, "grad_norm": 2.85306978225708, "learning_rate": 7.710635456530102e-06, "loss": 0.4888, "step": 2530 }, { "epoch": 0.3876161226942373, "grad_norm": 3.0452582836151123, "learning_rate": 7.68821394112792e-06, "loss": 0.477, "step": 2540 }, { "epoch": 0.38914217042137994, "grad_norm": 2.805910348892212, "learning_rate": 7.665716129985379e-06, "loss": 0.5016, "step": 2550 }, { "epoch": 0.3906682181485226, "grad_norm": 3.086005687713623, "learning_rate": 7.64314266162575e-06, "loss": 0.4842, "step": 2560 }, { "epoch": 0.39219426587566525, "grad_norm": 2.4592947959899902, "learning_rate": 7.620494176719572e-06, "loss": 0.4977, "step": 2570 }, { "epoch": 0.3937203136028079, "grad_norm": 2.79803729057312, "learning_rate": 7.597771318066476e-06, "loss": 0.4974, "step": 2580 }, { "epoch": 0.3952463613299506, "grad_norm": 3.157926559448242, "learning_rate": 7.574974730576936e-06, "loss": 0.4815, "step": 2590 }, { "epoch": 0.39677240905709327, "grad_norm": 4.1300458908081055, "learning_rate": 7.552105061253962e-06, "loss": 0.4876, "step": 2600 }, { "epoch": 0.3982984567842359, "grad_norm": 3.6879398822784424, "learning_rate": 7.529162959174746e-06, "loss": 0.4905, "step": 2610 }, { "epoch": 0.3998245045113786, "grad_norm": 2.8170852661132812, "learning_rate": 7.5061490754722355e-06, "loss": 0.4849, "step": 2620 }, { "epoch": 0.4013505522385213, "grad_norm": 2.4680588245391846, "learning_rate": 7.4830640633166516e-06, "loss": 0.4541, "step": 2630 }, { "epoch": 0.40287659996566394, "grad_norm": 2.8768832683563232, "learning_rate": 7.4599085778969524e-06, "loss": 0.4773, "step": 2640 }, { "epoch": 0.4044026476928066, "grad_norm": 2.7005279064178467, "learning_rate": 7.436683276402239e-06, "loss": 0.47, "step": 2650 }, { "epoch": 0.40592869541994925, "grad_norm": 3.036959409713745, "learning_rate": 7.413388818003104e-06, "loss": 0.4992, "step": 2660 }, { "epoch": 0.4074547431470919, "grad_norm": 3.3453052043914795, "learning_rate": 7.3900258638329196e-06, "loss": 0.4713, "step": 2670 }, { "epoch": 0.4089807908742346, "grad_norm": 3.5486536026000977, "learning_rate": 7.366595076969073e-06, "loss": 0.4724, "step": 2680 }, { "epoch": 0.41050683860137727, "grad_norm": 2.4689748287200928, "learning_rate": 7.343097122414159e-06, "loss": 0.4972, "step": 2690 }, { "epoch": 0.4120328863285199, "grad_norm": 3.1132616996765137, "learning_rate": 7.319532667077088e-06, "loss": 0.4766, "step": 2700 }, { "epoch": 0.4135589340556626, "grad_norm": 2.663201332092285, "learning_rate": 7.295902379754172e-06, "loss": 0.4833, "step": 2710 }, { "epoch": 0.4150849817828053, "grad_norm": 3.298428773880005, "learning_rate": 7.272206931110135e-06, "loss": 0.4533, "step": 2720 }, { "epoch": 0.41661102950994794, "grad_norm": 2.6843929290771484, "learning_rate": 7.248446993659086e-06, "loss": 0.4337, "step": 2730 }, { "epoch": 0.4181370772370906, "grad_norm": 2.778803825378418, "learning_rate": 7.224623241745423e-06, "loss": 0.4992, "step": 2740 }, { "epoch": 0.41966312496423325, "grad_norm": 3.245333194732666, "learning_rate": 7.200736351524705e-06, "loss": 0.483, "step": 2750 }, { "epoch": 0.4211891726913759, "grad_norm": 3.0267553329467773, "learning_rate": 7.176787000944449e-06, "loss": 0.4927, "step": 2760 }, { "epoch": 0.4227152204185186, "grad_norm": 2.559861183166504, "learning_rate": 7.152775869724902e-06, "loss": 0.4803, "step": 2770 }, { "epoch": 0.42424126814566127, "grad_norm": 2.693279504776001, "learning_rate": 7.128703639339732e-06, "loss": 0.4622, "step": 2780 }, { "epoch": 0.4257673158728039, "grad_norm": 2.9602789878845215, "learning_rate": 7.104570992996711e-06, "loss": 0.5023, "step": 2790 }, { "epoch": 0.4272933635999466, "grad_norm": 3.0726730823516846, "learning_rate": 7.080378615618299e-06, "loss": 0.4737, "step": 2800 }, { "epoch": 0.42881941132708923, "grad_norm": 2.7500321865081787, "learning_rate": 7.0561271938222275e-06, "loss": 0.4669, "step": 2810 }, { "epoch": 0.43034545905423194, "grad_norm": 2.4754300117492676, "learning_rate": 7.031817415901991e-06, "loss": 0.4597, "step": 2820 }, { "epoch": 0.4318715067813746, "grad_norm": 2.9996578693389893, "learning_rate": 7.007449971807331e-06, "loss": 0.4693, "step": 2830 }, { "epoch": 0.43339755450851725, "grad_norm": 2.9249792098999023, "learning_rate": 6.983025553124638e-06, "loss": 0.4778, "step": 2840 }, { "epoch": 0.4349236022356599, "grad_norm": 3.4900503158569336, "learning_rate": 6.958544853057339e-06, "loss": 0.4768, "step": 2850 }, { "epoch": 0.4364496499628026, "grad_norm": 2.6894686222076416, "learning_rate": 6.934008566406211e-06, "loss": 0.4828, "step": 2860 }, { "epoch": 0.43797569768994526, "grad_norm": 2.501004934310913, "learning_rate": 6.909417389549664e-06, "loss": 0.4719, "step": 2870 }, { "epoch": 0.4395017454170879, "grad_norm": 2.8693606853485107, "learning_rate": 6.8847720204239835e-06, "loss": 0.4464, "step": 2880 }, { "epoch": 0.4410277931442306, "grad_norm": 3.2556777000427246, "learning_rate": 6.860073158503511e-06, "loss": 0.4667, "step": 2890 }, { "epoch": 0.4425538408713732, "grad_norm": 3.124281406402588, "learning_rate": 6.8353215047808006e-06, "loss": 0.4647, "step": 2900 }, { "epoch": 0.44407988859851594, "grad_norm": 3.2909648418426514, "learning_rate": 6.810517761746724e-06, "loss": 0.4543, "step": 2910 }, { "epoch": 0.4456059363256586, "grad_norm": 3.157102584838867, "learning_rate": 6.785662633370521e-06, "loss": 0.4802, "step": 2920 }, { "epoch": 0.44713198405280125, "grad_norm": 3.0583925247192383, "learning_rate": 6.760756825079838e-06, "loss": 0.4788, "step": 2930 }, { "epoch": 0.4486580317799439, "grad_norm": 3.167233943939209, "learning_rate": 6.735801043740691e-06, "loss": 0.4607, "step": 2940 }, { "epoch": 0.4501840795070866, "grad_norm": 2.5458922386169434, "learning_rate": 6.710795997637412e-06, "loss": 0.4574, "step": 2950 }, { "epoch": 0.45171012723422926, "grad_norm": 2.914799690246582, "learning_rate": 6.68574239645254e-06, "loss": 0.4686, "step": 2960 }, { "epoch": 0.4532361749613719, "grad_norm": 2.5135247707366943, "learning_rate": 6.660640951246691e-06, "loss": 0.4771, "step": 2970 }, { "epoch": 0.4547622226885146, "grad_norm": 2.9928719997406006, "learning_rate": 6.635492374438366e-06, "loss": 0.4602, "step": 2980 }, { "epoch": 0.4562882704156572, "grad_norm": 2.775026321411133, "learning_rate": 6.6102973797837324e-06, "loss": 0.4585, "step": 2990 }, { "epoch": 0.45781431814279994, "grad_norm": 2.5386760234832764, "learning_rate": 6.585056682356375e-06, "loss": 0.4823, "step": 3000 }, { "epoch": 0.45781431814279994, "eval_loss": 0.45332393050193787, "eval_runtime": 100.0205, "eval_samples_per_second": 5.299, "eval_steps_per_second": 2.649, "step": 3000 }, { "epoch": 0.4593403658699426, "grad_norm": 2.6468212604522705, "learning_rate": 6.5597709985269895e-06, "loss": 0.4769, "step": 3010 }, { "epoch": 0.46086641359708524, "grad_norm": 2.3556416034698486, "learning_rate": 6.534441045943059e-06, "loss": 0.4712, "step": 3020 }, { "epoch": 0.4623924613242279, "grad_norm": 2.7787866592407227, "learning_rate": 6.509067543508483e-06, "loss": 0.4574, "step": 3030 }, { "epoch": 0.4639185090513706, "grad_norm": 3.103032112121582, "learning_rate": 6.483651211363175e-06, "loss": 0.4505, "step": 3040 }, { "epoch": 0.46544455677851326, "grad_norm": 2.5027055740356445, "learning_rate": 6.4581927708626235e-06, "loss": 0.4669, "step": 3050 }, { "epoch": 0.4669706045056559, "grad_norm": 3.0449085235595703, "learning_rate": 6.432692944557416e-06, "loss": 0.4616, "step": 3060 }, { "epoch": 0.46849665223279857, "grad_norm": 2.4839391708374023, "learning_rate": 6.407152456172736e-06, "loss": 0.4435, "step": 3070 }, { "epoch": 0.4700226999599412, "grad_norm": 2.436234474182129, "learning_rate": 6.381572030587822e-06, "loss": 0.4635, "step": 3080 }, { "epoch": 0.47154874768708394, "grad_norm": 2.8912863731384277, "learning_rate": 6.355952393815388e-06, "loss": 0.4652, "step": 3090 }, { "epoch": 0.4730747954142266, "grad_norm": 2.5968050956726074, "learning_rate": 6.33029427298103e-06, "loss": 0.4729, "step": 3100 }, { "epoch": 0.47460084314136924, "grad_norm": 3.2073683738708496, "learning_rate": 6.304598396302578e-06, "loss": 0.4953, "step": 3110 }, { "epoch": 0.4761268908685119, "grad_norm": 3.0304925441741943, "learning_rate": 6.278865493069434e-06, "loss": 0.4474, "step": 3120 }, { "epoch": 0.47765293859565455, "grad_norm": 2.594212293624878, "learning_rate": 6.25309629362187e-06, "loss": 0.4613, "step": 3130 }, { "epoch": 0.47917898632279726, "grad_norm": 3.9076614379882812, "learning_rate": 6.227291529330302e-06, "loss": 0.4581, "step": 3140 }, { "epoch": 0.4807050340499399, "grad_norm": 3.236703634262085, "learning_rate": 6.201451932574533e-06, "loss": 0.491, "step": 3150 }, { "epoch": 0.48223108177708257, "grad_norm": 3.039473056793213, "learning_rate": 6.175578236722968e-06, "loss": 0.4632, "step": 3160 }, { "epoch": 0.4837571295042252, "grad_norm": 2.9076642990112305, "learning_rate": 6.149671176111793e-06, "loss": 0.4587, "step": 3170 }, { "epoch": 0.48528317723136793, "grad_norm": 2.884756565093994, "learning_rate": 6.123731486024146e-06, "loss": 0.4576, "step": 3180 }, { "epoch": 0.4868092249585106, "grad_norm": 2.9610495567321777, "learning_rate": 6.097759902669232e-06, "loss": 0.4562, "step": 3190 }, { "epoch": 0.48833527268565324, "grad_norm": 2.4013702869415283, "learning_rate": 6.071757163161443e-06, "loss": 0.4451, "step": 3200 }, { "epoch": 0.4898613204127959, "grad_norm": 2.82401180267334, "learning_rate": 6.045724005499429e-06, "loss": 0.4658, "step": 3210 }, { "epoch": 0.49138736813993855, "grad_norm": 3.196622133255005, "learning_rate": 6.019661168545159e-06, "loss": 0.4443, "step": 3220 }, { "epoch": 0.49291341586708126, "grad_norm": 2.9927377700805664, "learning_rate": 5.9935693920029405e-06, "loss": 0.4538, "step": 3230 }, { "epoch": 0.4944394635942239, "grad_norm": 2.5958001613616943, "learning_rate": 5.967449416398438e-06, "loss": 0.4481, "step": 3240 }, { "epoch": 0.49596551132136657, "grad_norm": 3.0835108757019043, "learning_rate": 5.941301983057646e-06, "loss": 0.4629, "step": 3250 }, { "epoch": 0.4974915590485092, "grad_norm": 2.6167795658111572, "learning_rate": 5.915127834085853e-06, "loss": 0.4626, "step": 3260 }, { "epoch": 0.49901760677565193, "grad_norm": 2.769148826599121, "learning_rate": 5.888927712346582e-06, "loss": 0.4574, "step": 3270 }, { "epoch": 0.5005436545027946, "grad_norm": 2.5669050216674805, "learning_rate": 5.862702361440502e-06, "loss": 0.4594, "step": 3280 }, { "epoch": 0.5020697022299372, "grad_norm": 3.04659104347229, "learning_rate": 5.836452525684326e-06, "loss": 0.4495, "step": 3290 }, { "epoch": 0.5035957499570799, "grad_norm": 2.9052836894989014, "learning_rate": 5.8101789500896855e-06, "loss": 0.4302, "step": 3300 }, { "epoch": 0.5051217976842226, "grad_norm": 2.720864772796631, "learning_rate": 5.783882380341985e-06, "loss": 0.4522, "step": 3310 }, { "epoch": 0.5066478454113652, "grad_norm": 2.5311403274536133, "learning_rate": 5.7575635627792384e-06, "loss": 0.4807, "step": 3320 }, { "epoch": 0.5081738931385079, "grad_norm": 3.34335994720459, "learning_rate": 5.731223244370891e-06, "loss": 0.4576, "step": 3330 }, { "epoch": 0.5096999408656506, "grad_norm": 3.448711633682251, "learning_rate": 5.704862172696612e-06, "loss": 0.4498, "step": 3340 }, { "epoch": 0.5112259885927932, "grad_norm": 3.406877279281616, "learning_rate": 5.678481095925087e-06, "loss": 0.4633, "step": 3350 }, { "epoch": 0.5127520363199359, "grad_norm": 2.611567974090576, "learning_rate": 5.65208076279277e-06, "loss": 0.4687, "step": 3360 }, { "epoch": 0.5142780840470785, "grad_norm": 2.893051862716675, "learning_rate": 5.625661922582646e-06, "loss": 0.4572, "step": 3370 }, { "epoch": 0.5158041317742212, "grad_norm": 2.572845935821533, "learning_rate": 5.599225325102957e-06, "loss": 0.461, "step": 3380 }, { "epoch": 0.517330179501364, "grad_norm": 2.917742967605591, "learning_rate": 5.572771720665928e-06, "loss": 0.4679, "step": 3390 }, { "epoch": 0.5188562272285066, "grad_norm": 2.3963794708251953, "learning_rate": 5.546301860066463e-06, "loss": 0.4423, "step": 3400 }, { "epoch": 0.5203822749556493, "grad_norm": 3.034247636795044, "learning_rate": 5.519816494560848e-06, "loss": 0.4689, "step": 3410 }, { "epoch": 0.5219083226827919, "grad_norm": 2.448772668838501, "learning_rate": 5.493316375845417e-06, "loss": 0.4295, "step": 3420 }, { "epoch": 0.5234343704099346, "grad_norm": 2.448565721511841, "learning_rate": 5.466802256035225e-06, "loss": 0.4405, "step": 3430 }, { "epoch": 0.5249604181370773, "grad_norm": 3.043869733810425, "learning_rate": 5.4402748876427e-06, "loss": 0.4114, "step": 3440 }, { "epoch": 0.5264864658642199, "grad_norm": 2.8036019802093506, "learning_rate": 5.413735023556288e-06, "loss": 0.4556, "step": 3450 }, { "epoch": 0.5280125135913626, "grad_norm": 2.72363018989563, "learning_rate": 5.387183417019079e-06, "loss": 0.458, "step": 3460 }, { "epoch": 0.5295385613185052, "grad_norm": 2.6680619716644287, "learning_rate": 5.360620821607433e-06, "loss": 0.4522, "step": 3470 }, { "epoch": 0.5310646090456479, "grad_norm": 3.0074567794799805, "learning_rate": 5.334047991209594e-06, "loss": 0.4683, "step": 3480 }, { "epoch": 0.5325906567727906, "grad_norm": 2.9890754222869873, "learning_rate": 5.307465680004289e-06, "loss": 0.4417, "step": 3490 }, { "epoch": 0.5341167044999332, "grad_norm": 2.6241977214813232, "learning_rate": 5.280874642439323e-06, "loss": 0.4292, "step": 3500 }, { "epoch": 0.5341167044999332, "eval_loss": 0.4343341290950775, "eval_runtime": 99.9521, "eval_samples_per_second": 5.303, "eval_steps_per_second": 2.651, "step": 3500 }, { "epoch": 0.5356427522270759, "grad_norm": 2.369849920272827, "learning_rate": 5.254275633210175e-06, "loss": 0.4566, "step": 3510 }, { "epoch": 0.5371687999542186, "grad_norm": 2.8073859214782715, "learning_rate": 5.227669407238565e-06, "loss": 0.4706, "step": 3520 }, { "epoch": 0.5386948476813612, "grad_norm": 2.9969775676727295, "learning_rate": 5.201056719651042e-06, "loss": 0.4656, "step": 3530 }, { "epoch": 0.5402208954085039, "grad_norm": 2.4478580951690674, "learning_rate": 5.174438325757542e-06, "loss": 0.4419, "step": 3540 }, { "epoch": 0.5417469431356465, "grad_norm": 2.7153167724609375, "learning_rate": 5.147814981029956e-06, "loss": 0.4913, "step": 3550 }, { "epoch": 0.5432729908627892, "grad_norm": 2.724480390548706, "learning_rate": 5.121187441080687e-06, "loss": 0.4523, "step": 3560 }, { "epoch": 0.544799038589932, "grad_norm": 2.984091281890869, "learning_rate": 5.094556461641205e-06, "loss": 0.4696, "step": 3570 }, { "epoch": 0.5463250863170745, "grad_norm": 2.665983200073242, "learning_rate": 5.0679227985406e-06, "loss": 0.4405, "step": 3580 }, { "epoch": 0.5478511340442173, "grad_norm": 2.8179776668548584, "learning_rate": 5.041287207684125e-06, "loss": 0.4503, "step": 3590 }, { "epoch": 0.5493771817713599, "grad_norm": 3.146925687789917, "learning_rate": 5.014650445031749e-06, "loss": 0.4908, "step": 3600 }, { "epoch": 0.5509032294985026, "grad_norm": 3.112048864364624, "learning_rate": 4.988013266576699e-06, "loss": 0.4404, "step": 3610 }, { "epoch": 0.5524292772256453, "grad_norm": 2.5898945331573486, "learning_rate": 4.961376428323997e-06, "loss": 0.4422, "step": 3620 }, { "epoch": 0.5539553249527879, "grad_norm": 2.4043118953704834, "learning_rate": 4.934740686269016e-06, "loss": 0.4688, "step": 3630 }, { "epoch": 0.5554813726799306, "grad_norm": 2.969442367553711, "learning_rate": 4.908106796376015e-06, "loss": 0.4497, "step": 3640 }, { "epoch": 0.5570074204070732, "grad_norm": 2.687509298324585, "learning_rate": 4.881475514556689e-06, "loss": 0.4356, "step": 3650 }, { "epoch": 0.5585334681342159, "grad_norm": 2.7803378105163574, "learning_rate": 4.854847596648704e-06, "loss": 0.468, "step": 3660 }, { "epoch": 0.5600595158613586, "grad_norm": 2.924004316329956, "learning_rate": 4.828223798394257e-06, "loss": 0.4478, "step": 3670 }, { "epoch": 0.5615855635885012, "grad_norm": 3.0467331409454346, "learning_rate": 4.8016048754186265e-06, "loss": 0.4515, "step": 3680 }, { "epoch": 0.5631116113156439, "grad_norm": 2.7318050861358643, "learning_rate": 4.77499158320872e-06, "loss": 0.4441, "step": 3690 }, { "epoch": 0.5646376590427865, "grad_norm": 2.8721094131469727, "learning_rate": 4.748384677091631e-06, "loss": 0.442, "step": 3700 }, { "epoch": 0.5661637067699292, "grad_norm": 2.5854904651641846, "learning_rate": 4.721784912213209e-06, "loss": 0.4523, "step": 3710 }, { "epoch": 0.5676897544970719, "grad_norm": 2.962846279144287, "learning_rate": 4.695193043516629e-06, "loss": 0.4208, "step": 3720 }, { "epoch": 0.5692158022242145, "grad_norm": 3.385815382003784, "learning_rate": 4.668609825720953e-06, "loss": 0.4212, "step": 3730 }, { "epoch": 0.5707418499513572, "grad_norm": 3.1869301795959473, "learning_rate": 4.642036013299716e-06, "loss": 0.4366, "step": 3740 }, { "epoch": 0.5722678976785, "grad_norm": 2.8463032245635986, "learning_rate": 4.615472360459519e-06, "loss": 0.4318, "step": 3750 }, { "epoch": 0.5737939454056425, "grad_norm": 2.9622936248779297, "learning_rate": 4.588919621118615e-06, "loss": 0.4352, "step": 3760 }, { "epoch": 0.5753199931327853, "grad_norm": 2.8792457580566406, "learning_rate": 4.562378548885519e-06, "loss": 0.4473, "step": 3770 }, { "epoch": 0.5768460408599279, "grad_norm": 2.265306234359741, "learning_rate": 4.535849897037607e-06, "loss": 0.4603, "step": 3780 }, { "epoch": 0.5783720885870706, "grad_norm": 2.707305431365967, "learning_rate": 4.50933441849975e-06, "loss": 0.4211, "step": 3790 }, { "epoch": 0.5798981363142133, "grad_norm": 2.334364891052246, "learning_rate": 4.4828328658229406e-06, "loss": 0.4478, "step": 3800 }, { "epoch": 0.5814241840413559, "grad_norm": 2.8786513805389404, "learning_rate": 4.456345991162933e-06, "loss": 0.4451, "step": 3810 }, { "epoch": 0.5829502317684986, "grad_norm": 2.4972634315490723, "learning_rate": 4.429874546258893e-06, "loss": 0.4675, "step": 3820 }, { "epoch": 0.5844762794956412, "grad_norm": 3.0034310817718506, "learning_rate": 4.40341928241207e-06, "loss": 0.4294, "step": 3830 }, { "epoch": 0.5860023272227839, "grad_norm": 3.1585464477539062, "learning_rate": 4.376980950464467e-06, "loss": 0.4143, "step": 3840 }, { "epoch": 0.5875283749499266, "grad_norm": 2.7901790142059326, "learning_rate": 4.350560300777533e-06, "loss": 0.459, "step": 3850 }, { "epoch": 0.5890544226770692, "grad_norm": 2.7392337322235107, "learning_rate": 4.324158083210867e-06, "loss": 0.4349, "step": 3860 }, { "epoch": 0.5905804704042119, "grad_norm": 2.9482500553131104, "learning_rate": 4.297775047100935e-06, "loss": 0.4362, "step": 3870 }, { "epoch": 0.5921065181313545, "grad_norm": 2.583388566970825, "learning_rate": 4.271411941239809e-06, "loss": 0.4702, "step": 3880 }, { "epoch": 0.5936325658584972, "grad_norm": 2.743952512741089, "learning_rate": 4.245069513853897e-06, "loss": 0.4401, "step": 3890 }, { "epoch": 0.5951586135856399, "grad_norm": 2.7953054904937744, "learning_rate": 4.218748512582732e-06, "loss": 0.4425, "step": 3900 }, { "epoch": 0.5966846613127825, "grad_norm": 2.898608684539795, "learning_rate": 4.19244968445773e-06, "loss": 0.4488, "step": 3910 }, { "epoch": 0.5982107090399252, "grad_norm": 2.4456114768981934, "learning_rate": 4.166173775881007e-06, "loss": 0.4226, "step": 3920 }, { "epoch": 0.5997367567670678, "grad_norm": 3.178201913833618, "learning_rate": 4.139921532604177e-06, "loss": 0.4267, "step": 3930 }, { "epoch": 0.6012628044942105, "grad_norm": 2.3235249519348145, "learning_rate": 4.113693699707203e-06, "loss": 0.4486, "step": 3940 }, { "epoch": 0.6027888522213533, "grad_norm": 2.7228195667266846, "learning_rate": 4.0874910215772426e-06, "loss": 0.4271, "step": 3950 }, { "epoch": 0.6043148999484959, "grad_norm": 2.8123674392700195, "learning_rate": 4.061314241887516e-06, "loss": 0.4111, "step": 3960 }, { "epoch": 0.6058409476756386, "grad_norm": 2.610856533050537, "learning_rate": 4.03516410357621e-06, "loss": 0.4229, "step": 3970 }, { "epoch": 0.6073669954027813, "grad_norm": 2.711467742919922, "learning_rate": 4.009041348825383e-06, "loss": 0.4265, "step": 3980 }, { "epoch": 0.6088930431299239, "grad_norm": 3.1023402214050293, "learning_rate": 3.982946719039911e-06, "loss": 0.4249, "step": 3990 }, { "epoch": 0.6104190908570666, "grad_norm": 2.8750693798065186, "learning_rate": 3.956880954826435e-06, "loss": 0.4333, "step": 4000 }, { "epoch": 0.6104190908570666, "eval_loss": 0.4167136251926422, "eval_runtime": 99.9809, "eval_samples_per_second": 5.301, "eval_steps_per_second": 2.651, "step": 4000 }, { "epoch": 0.6119451385842092, "grad_norm": 2.5793118476867676, "learning_rate": 3.930844795972344e-06, "loss": 0.4512, "step": 4010 }, { "epoch": 0.6134711863113519, "grad_norm": 3.219802141189575, "learning_rate": 3.904838981424785e-06, "loss": 0.4203, "step": 4020 }, { "epoch": 0.6149972340384946, "grad_norm": 3.2610111236572266, "learning_rate": 3.878864249269681e-06, "loss": 0.439, "step": 4030 }, { "epoch": 0.6165232817656372, "grad_norm": 2.9082117080688477, "learning_rate": 3.852921336710794e-06, "loss": 0.4365, "step": 4040 }, { "epoch": 0.6180493294927799, "grad_norm": 3.1823573112487793, "learning_rate": 3.827010980048787e-06, "loss": 0.4377, "step": 4050 }, { "epoch": 0.6195753772199225, "grad_norm": 2.5103094577789307, "learning_rate": 3.801133914660344e-06, "loss": 0.4389, "step": 4060 }, { "epoch": 0.6211014249470652, "grad_norm": 2.895665168762207, "learning_rate": 3.7752908749772865e-06, "loss": 0.4417, "step": 4070 }, { "epoch": 0.6226274726742079, "grad_norm": 3.2190115451812744, "learning_rate": 3.749482594465733e-06, "loss": 0.4232, "step": 4080 }, { "epoch": 0.6241535204013505, "grad_norm": 2.946439504623413, "learning_rate": 3.7237098056052816e-06, "loss": 0.4205, "step": 4090 }, { "epoch": 0.6256795681284932, "grad_norm": 2.475071907043457, "learning_rate": 3.6979732398682223e-06, "loss": 0.4627, "step": 4100 }, { "epoch": 0.6272056158556358, "grad_norm": 3.475555181503296, "learning_rate": 3.672273627698775e-06, "loss": 0.4135, "step": 4110 }, { "epoch": 0.6287316635827785, "grad_norm": 2.8138234615325928, "learning_rate": 3.646611698492364e-06, "loss": 0.4203, "step": 4120 }, { "epoch": 0.6302577113099213, "grad_norm": 3.4674346446990967, "learning_rate": 3.6209881805749025e-06, "loss": 0.4225, "step": 4130 }, { "epoch": 0.6317837590370639, "grad_norm": 2.8863394260406494, "learning_rate": 3.59540380118214e-06, "loss": 0.4256, "step": 4140 }, { "epoch": 0.6333098067642066, "grad_norm": 2.5023999214172363, "learning_rate": 3.5698592864390085e-06, "loss": 0.4494, "step": 4150 }, { "epoch": 0.6348358544913493, "grad_norm": 3.108616828918457, "learning_rate": 3.544355361339017e-06, "loss": 0.4278, "step": 4160 }, { "epoch": 0.6363619022184919, "grad_norm": 3.5009264945983887, "learning_rate": 3.51889274972368e-06, "loss": 0.4145, "step": 4170 }, { "epoch": 0.6378879499456346, "grad_norm": 4.06900691986084, "learning_rate": 3.4934721742619714e-06, "loss": 0.4327, "step": 4180 }, { "epoch": 0.6394139976727772, "grad_norm": 3.3994603157043457, "learning_rate": 3.46809435642981e-06, "loss": 0.4123, "step": 4190 }, { "epoch": 0.6409400453999199, "grad_norm": 2.9589715003967285, "learning_rate": 3.442760016489586e-06, "loss": 0.3977, "step": 4200 }, { "epoch": 0.6424660931270626, "grad_norm": 2.398531436920166, "learning_rate": 3.4174698734697177e-06, "loss": 0.4197, "step": 4210 }, { "epoch": 0.6439921408542052, "grad_norm": 3.2008326053619385, "learning_rate": 3.3922246451442474e-06, "loss": 0.4286, "step": 4220 }, { "epoch": 0.6455181885813479, "grad_norm": 3.441654920578003, "learning_rate": 3.3670250480124712e-06, "loss": 0.4568, "step": 4230 }, { "epoch": 0.6470442363084905, "grad_norm": 2.481886386871338, "learning_rate": 3.3418717972785906e-06, "loss": 0.403, "step": 4240 }, { "epoch": 0.6485702840356332, "grad_norm": 2.639709949493408, "learning_rate": 3.316765606831432e-06, "loss": 0.4567, "step": 4250 }, { "epoch": 0.6500963317627759, "grad_norm": 2.857813596725464, "learning_rate": 3.2917071892241714e-06, "loss": 0.4147, "step": 4260 }, { "epoch": 0.6516223794899185, "grad_norm": 2.872697353363037, "learning_rate": 3.266697255654123e-06, "loss": 0.4138, "step": 4270 }, { "epoch": 0.6531484272170612, "grad_norm": 3.090141773223877, "learning_rate": 3.2417365159425383e-06, "loss": 0.4238, "step": 4280 }, { "epoch": 0.6546744749442038, "grad_norm": 3.7141196727752686, "learning_rate": 3.216825678514478e-06, "loss": 0.4198, "step": 4290 }, { "epoch": 0.6562005226713465, "grad_norm": 2.859290838241577, "learning_rate": 3.1919654503786935e-06, "loss": 0.4159, "step": 4300 }, { "epoch": 0.6577265703984893, "grad_norm": 3.016757011413574, "learning_rate": 3.1671565371075687e-06, "loss": 0.4218, "step": 4310 }, { "epoch": 0.6592526181256319, "grad_norm": 2.7048025131225586, "learning_rate": 3.142399642817084e-06, "loss": 0.4327, "step": 4320 }, { "epoch": 0.6607786658527746, "grad_norm": 2.9763782024383545, "learning_rate": 3.117695470146844e-06, "loss": 0.4071, "step": 4330 }, { "epoch": 0.6623047135799172, "grad_norm": 2.885979652404785, "learning_rate": 3.0930447202401303e-06, "loss": 0.4279, "step": 4340 }, { "epoch": 0.6638307613070599, "grad_norm": 2.6588382720947266, "learning_rate": 3.0684480927240057e-06, "loss": 0.4199, "step": 4350 }, { "epoch": 0.6653568090342026, "grad_norm": 3.2519760131835938, "learning_rate": 3.0439062856894463e-06, "loss": 0.4067, "step": 4360 }, { "epoch": 0.6668828567613452, "grad_norm": 5.050004482269287, "learning_rate": 3.0194199956715443e-06, "loss": 0.4256, "step": 4370 }, { "epoch": 0.6684089044884879, "grad_norm": 2.7873661518096924, "learning_rate": 2.994989917629726e-06, "loss": 0.4203, "step": 4380 }, { "epoch": 0.6699349522156306, "grad_norm": 2.685523509979248, "learning_rate": 2.9706167449280404e-06, "loss": 0.4546, "step": 4390 }, { "epoch": 0.6714609999427732, "grad_norm": 2.6410155296325684, "learning_rate": 2.9463011693154643e-06, "loss": 0.4257, "step": 4400 }, { "epoch": 0.6729870476699159, "grad_norm": 2.6280877590179443, "learning_rate": 2.9220438809062855e-06, "loss": 0.4154, "step": 4410 }, { "epoch": 0.6745130953970585, "grad_norm": 2.6469454765319824, "learning_rate": 2.897845568160508e-06, "loss": 0.4245, "step": 4420 }, { "epoch": 0.6760391431242012, "grad_norm": 3.425985097885132, "learning_rate": 2.873706917864314e-06, "loss": 0.4173, "step": 4430 }, { "epoch": 0.6775651908513439, "grad_norm": 2.8293681144714355, "learning_rate": 2.8496286151105644e-06, "loss": 0.4337, "step": 4440 }, { "epoch": 0.6790912385784865, "grad_norm": 2.9185335636138916, "learning_rate": 2.825611343279374e-06, "loss": 0.4132, "step": 4450 }, { "epoch": 0.6806172863056292, "grad_norm": 2.4097468852996826, "learning_rate": 2.801655784018696e-06, "loss": 0.4206, "step": 4460 }, { "epoch": 0.6821433340327718, "grad_norm": 2.4502408504486084, "learning_rate": 2.777762617224985e-06, "loss": 0.4192, "step": 4470 }, { "epoch": 0.6836693817599145, "grad_norm": 2.847097873687744, "learning_rate": 2.7539325210239e-06, "loss": 0.4347, "step": 4480 }, { "epoch": 0.6851954294870573, "grad_norm": 3.346667766571045, "learning_rate": 2.730166171751056e-06, "loss": 0.4328, "step": 4490 }, { "epoch": 0.6867214772141998, "grad_norm": 2.8015189170837402, "learning_rate": 2.706464243932836e-06, "loss": 0.4153, "step": 4500 }, { "epoch": 0.6867214772141998, "eval_loss": 0.40170425176620483, "eval_runtime": 99.9566, "eval_samples_per_second": 5.302, "eval_steps_per_second": 2.651, "step": 4500 }, { "epoch": 0.6882475249413426, "grad_norm": 3.115753173828125, "learning_rate": 2.6828274102672292e-06, "loss": 0.4192, "step": 4510 }, { "epoch": 0.6897735726684852, "grad_norm": 3.2326269149780273, "learning_rate": 2.6592563416047616e-06, "loss": 0.4203, "step": 4520 }, { "epoch": 0.6912996203956279, "grad_norm": 2.6536991596221924, "learning_rate": 2.6357517069294397e-06, "loss": 0.4023, "step": 4530 }, { "epoch": 0.6928256681227706, "grad_norm": 3.132383346557617, "learning_rate": 2.6123141733397695e-06, "loss": 0.4251, "step": 4540 }, { "epoch": 0.6943517158499132, "grad_norm": 2.5857174396514893, "learning_rate": 2.5889444060298217e-06, "loss": 0.3893, "step": 4550 }, { "epoch": 0.6958777635770559, "grad_norm": 2.402454137802124, "learning_rate": 2.5656430682703547e-06, "loss": 0.3777, "step": 4560 }, { "epoch": 0.6974038113041985, "grad_norm": 3.208631992340088, "learning_rate": 2.5424108213899902e-06, "loss": 0.4077, "step": 4570 }, { "epoch": 0.6989298590313412, "grad_norm": 3.2448372840881348, "learning_rate": 2.5192483247564393e-06, "loss": 0.4218, "step": 4580 }, { "epoch": 0.7004559067584839, "grad_norm": 2.6008667945861816, "learning_rate": 2.496156235757792e-06, "loss": 0.4466, "step": 4590 }, { "epoch": 0.7019819544856265, "grad_norm": 2.548492431640625, "learning_rate": 2.47313520978386e-06, "loss": 0.3949, "step": 4600 }, { "epoch": 0.7035080022127692, "grad_norm": 2.979745626449585, "learning_rate": 2.4501859002075713e-06, "loss": 0.4244, "step": 4610 }, { "epoch": 0.7050340499399119, "grad_norm": 2.696276903152466, "learning_rate": 2.4273089583664376e-06, "loss": 0.4144, "step": 4620 }, { "epoch": 0.7065600976670545, "grad_norm": 2.8598382472991943, "learning_rate": 2.404505033544048e-06, "loss": 0.391, "step": 4630 }, { "epoch": 0.7080861453941972, "grad_norm": 3.1868958473205566, "learning_rate": 2.381774772951666e-06, "loss": 0.4227, "step": 4640 }, { "epoch": 0.7096121931213398, "grad_norm": 3.3258581161499023, "learning_rate": 2.359118821709842e-06, "loss": 0.4306, "step": 4650 }, { "epoch": 0.7111382408484825, "grad_norm": 2.390016794204712, "learning_rate": 2.3365378228301107e-06, "loss": 0.4245, "step": 4660 }, { "epoch": 0.7126642885756252, "grad_norm": 2.966630220413208, "learning_rate": 2.314032417196742e-06, "loss": 0.4135, "step": 4670 }, { "epoch": 0.7141903363027678, "grad_norm": 3.0981130599975586, "learning_rate": 2.2916032435485477e-06, "loss": 0.4195, "step": 4680 }, { "epoch": 0.7157163840299106, "grad_norm": 2.610236644744873, "learning_rate": 2.269250938460762e-06, "loss": 0.4101, "step": 4690 }, { "epoch": 0.7172424317570532, "grad_norm": 2.589944839477539, "learning_rate": 2.246976136326963e-06, "loss": 0.4167, "step": 4700 }, { "epoch": 0.7187684794841959, "grad_norm": 2.9684040546417236, "learning_rate": 2.2247794693410746e-06, "loss": 0.3946, "step": 4710 }, { "epoch": 0.7202945272113386, "grad_norm": 2.7830798625946045, "learning_rate": 2.202661567479423e-06, "loss": 0.4112, "step": 4720 }, { "epoch": 0.7218205749384812, "grad_norm": 3.1135716438293457, "learning_rate": 2.180623058482853e-06, "loss": 0.4371, "step": 4730 }, { "epoch": 0.7233466226656239, "grad_norm": 2.432995557785034, "learning_rate": 2.158664567838924e-06, "loss": 0.3921, "step": 4740 }, { "epoch": 0.7248726703927665, "grad_norm": 2.704394817352295, "learning_rate": 2.136786718764135e-06, "loss": 0.4117, "step": 4750 }, { "epoch": 0.7263987181199092, "grad_norm": 3.2139337062835693, "learning_rate": 2.1149901321862624e-06, "loss": 0.3888, "step": 4760 }, { "epoch": 0.7279247658470519, "grad_norm": 2.8158254623413086, "learning_rate": 2.093275426726722e-06, "loss": 0.3953, "step": 4770 }, { "epoch": 0.7294508135741945, "grad_norm": 2.6510894298553467, "learning_rate": 2.0716432186830064e-06, "loss": 0.4003, "step": 4780 }, { "epoch": 0.7309768613013372, "grad_norm": 3.2726964950561523, "learning_rate": 2.0500941220112153e-06, "loss": 0.4097, "step": 4790 }, { "epoch": 0.7325029090284799, "grad_norm": 2.590909242630005, "learning_rate": 2.0286287483086046e-06, "loss": 0.4016, "step": 4800 }, { "epoch": 0.7340289567556225, "grad_norm": 2.8233532905578613, "learning_rate": 2.007247706796254e-06, "loss": 0.4012, "step": 4810 }, { "epoch": 0.7355550044827652, "grad_norm": 2.646611452102661, "learning_rate": 1.985951604301746e-06, "loss": 0.402, "step": 4820 }, { "epoch": 0.7370810522099078, "grad_norm": 2.817006826400757, "learning_rate": 1.9647410452419763e-06, "loss": 0.4361, "step": 4830 }, { "epoch": 0.7386070999370505, "grad_norm": 2.7627429962158203, "learning_rate": 1.943616631605973e-06, "loss": 0.4032, "step": 4840 }, { "epoch": 0.7401331476641932, "grad_norm": 2.847055673599243, "learning_rate": 1.922578962937826e-06, "loss": 0.3871, "step": 4850 }, { "epoch": 0.7416591953913358, "grad_norm": 3.007472515106201, "learning_rate": 1.9016286363196656e-06, "loss": 0.3938, "step": 4860 }, { "epoch": 0.7431852431184786, "grad_norm": 3.840334177017212, "learning_rate": 1.8807662463547156e-06, "loss": 0.416, "step": 4870 }, { "epoch": 0.7447112908456212, "grad_norm": 2.870105504989624, "learning_rate": 1.8599923851504237e-06, "loss": 0.4244, "step": 4880 }, { "epoch": 0.7462373385727639, "grad_norm": 2.780932664871216, "learning_rate": 1.8393076423016493e-06, "loss": 0.4187, "step": 4890 }, { "epoch": 0.7477633862999066, "grad_norm": 3.142162561416626, "learning_rate": 1.8187126048739284e-06, "loss": 0.3896, "step": 4900 }, { "epoch": 0.7492894340270492, "grad_norm": 2.7128045558929443, "learning_rate": 1.7982078573868245e-06, "loss": 0.4062, "step": 4910 }, { "epoch": 0.7508154817541919, "grad_norm": 2.843827247619629, "learning_rate": 1.7777939817973238e-06, "loss": 0.4194, "step": 4920 }, { "epoch": 0.7523415294813345, "grad_norm": 2.7338247299194336, "learning_rate": 1.7574715574833324e-06, "loss": 0.4018, "step": 4930 }, { "epoch": 0.7538675772084772, "grad_norm": 2.4693098068237305, "learning_rate": 1.7372411612272149e-06, "loss": 0.4073, "step": 4940 }, { "epoch": 0.7553936249356199, "grad_norm": 2.3850250244140625, "learning_rate": 1.7171033671994418e-06, "loss": 0.3987, "step": 4950 }, { "epoch": 0.7569196726627625, "grad_norm": 2.474433183670044, "learning_rate": 1.6970587469422889e-06, "loss": 0.4142, "step": 4960 }, { "epoch": 0.7584457203899052, "grad_norm": 2.722198247909546, "learning_rate": 1.6771078693536075e-06, "loss": 0.3817, "step": 4970 }, { "epoch": 0.7599717681170478, "grad_norm": 2.9978530406951904, "learning_rate": 1.657251300670688e-06, "loss": 0.4322, "step": 4980 }, { "epoch": 0.7614978158441905, "grad_norm": 2.862351179122925, "learning_rate": 1.6374896044541854e-06, "loss": 0.4025, "step": 4990 }, { "epoch": 0.7630238635713332, "grad_norm": 3.3892769813537598, "learning_rate": 1.6178233415721228e-06, "loss": 0.3973, "step": 5000 }, { "epoch": 0.7630238635713332, "eval_loss": 0.39075401425361633, "eval_runtime": 99.9532, "eval_samples_per_second": 5.302, "eval_steps_per_second": 2.651, "step": 5000 }, { "epoch": 0.7645499112984758, "grad_norm": 3.2107391357421875, "learning_rate": 1.5982530701839815e-06, "loss": 0.4204, "step": 5010 }, { "epoch": 0.7660759590256185, "grad_norm": 2.611640691757202, "learning_rate": 1.5787793457248425e-06, "loss": 0.3878, "step": 5020 }, { "epoch": 0.7676020067527612, "grad_norm": 3.0495364665985107, "learning_rate": 1.5594027208896433e-06, "loss": 0.4075, "step": 5030 }, { "epoch": 0.7691280544799038, "grad_norm": 3.080049991607666, "learning_rate": 1.5401237456174755e-06, "loss": 0.4277, "step": 5040 }, { "epoch": 0.7706541022070466, "grad_norm": 2.9949750900268555, "learning_rate": 1.5209429670759874e-06, "loss": 0.4008, "step": 5050 }, { "epoch": 0.7721801499341892, "grad_norm": 2.866619348526001, "learning_rate": 1.5018609296458425e-06, "loss": 0.3935, "step": 5060 }, { "epoch": 0.7737061976613319, "grad_norm": 2.5663259029388428, "learning_rate": 1.4828781749052807e-06, "loss": 0.3999, "step": 5070 }, { "epoch": 0.7752322453884746, "grad_norm": 2.572970151901245, "learning_rate": 1.4639952416147457e-06, "loss": 0.4095, "step": 5080 }, { "epoch": 0.7767582931156172, "grad_norm": 2.68562650680542, "learning_rate": 1.445212665701583e-06, "loss": 0.419, "step": 5090 }, { "epoch": 0.7782843408427599, "grad_norm": 3.246164321899414, "learning_rate": 1.426530980244848e-06, "loss": 0.4138, "step": 5100 }, { "epoch": 0.7798103885699025, "grad_norm": 3.2083702087402344, "learning_rate": 1.407950715460159e-06, "loss": 0.4208, "step": 5110 }, { "epoch": 0.7813364362970452, "grad_norm": 3.3285470008850098, "learning_rate": 1.389472398684658e-06, "loss": 0.3787, "step": 5120 }, { "epoch": 0.7828624840241879, "grad_norm": 2.917363405227661, "learning_rate": 1.3710965543620442e-06, "loss": 0.4057, "step": 5130 }, { "epoch": 0.7843885317513305, "grad_norm": 4.440506935119629, "learning_rate": 1.3528237040276825e-06, "loss": 0.3901, "step": 5140 }, { "epoch": 0.7859145794784732, "grad_norm": 4.1051788330078125, "learning_rate": 1.3346543662938132e-06, "loss": 0.4323, "step": 5150 }, { "epoch": 0.7874406272056158, "grad_norm": 2.771667242050171, "learning_rate": 1.316589056834821e-06, "loss": 0.4127, "step": 5160 }, { "epoch": 0.7889666749327585, "grad_norm": 2.9452033042907715, "learning_rate": 1.2986282883726065e-06, "loss": 0.4235, "step": 5170 }, { "epoch": 0.7904927226599012, "grad_norm": 3.01763916015625, "learning_rate": 1.2807725706620317e-06, "loss": 0.4059, "step": 5180 }, { "epoch": 0.7920187703870438, "grad_norm": 3.0742299556732178, "learning_rate": 1.2630224104764516e-06, "loss": 0.416, "step": 5190 }, { "epoch": 0.7935448181141865, "grad_norm": 2.9382846355438232, "learning_rate": 1.2453783115933387e-06, "loss": 0.3976, "step": 5200 }, { "epoch": 0.7950708658413291, "grad_norm": 2.8469552993774414, "learning_rate": 1.2278407747799687e-06, "loss": 0.407, "step": 5210 }, { "epoch": 0.7965969135684718, "grad_norm": 2.8619937896728516, "learning_rate": 1.2104102977792282e-06, "loss": 0.3873, "step": 5220 }, { "epoch": 0.7981229612956146, "grad_norm": 3.257063388824463, "learning_rate": 1.1930873752954725e-06, "loss": 0.3868, "step": 5230 }, { "epoch": 0.7996490090227572, "grad_norm": 3.016730785369873, "learning_rate": 1.1758724989804908e-06, "loss": 0.4081, "step": 5240 }, { "epoch": 0.8011750567498999, "grad_norm": 3.1905858516693115, "learning_rate": 1.1587661574195536e-06, "loss": 0.3895, "step": 5250 }, { "epoch": 0.8027011044770426, "grad_norm": 2.977105140686035, "learning_rate": 1.1417688361175422e-06, "loss": 0.4028, "step": 5260 }, { "epoch": 0.8042271522041852, "grad_norm": 2.6317853927612305, "learning_rate": 1.1248810174851755e-06, "loss": 0.377, "step": 5270 }, { "epoch": 0.8057531999313279, "grad_norm": 3.0427422523498535, "learning_rate": 1.1081031808253096e-06, "loss": 0.3763, "step": 5280 }, { "epoch": 0.8072792476584705, "grad_norm": 3.044510841369629, "learning_rate": 1.0914358023193428e-06, "loss": 0.382, "step": 5290 }, { "epoch": 0.8088052953856132, "grad_norm": 2.3702402114868164, "learning_rate": 1.0748793550136949e-06, "loss": 0.4003, "step": 5300 }, { "epoch": 0.8103313431127559, "grad_norm": 2.994016647338867, "learning_rate": 1.0584343088063837e-06, "loss": 0.3966, "step": 5310 }, { "epoch": 0.8118573908398985, "grad_norm": 3.0523016452789307, "learning_rate": 1.0421011304336932e-06, "loss": 0.4129, "step": 5320 }, { "epoch": 0.8133834385670412, "grad_norm": 3.0171058177948, "learning_rate": 1.0258802834569137e-06, "loss": 0.3687, "step": 5330 }, { "epoch": 0.8149094862941838, "grad_norm": 2.748992681503296, "learning_rate": 1.0097722282492023e-06, "loss": 0.3936, "step": 5340 }, { "epoch": 0.8164355340213265, "grad_norm": 2.96637225151062, "learning_rate": 9.93777421982503e-07, "loss": 0.4234, "step": 5350 }, { "epoch": 0.8179615817484692, "grad_norm": 3.161268949508667, "learning_rate": 9.778963186145796e-07, "loss": 0.3938, "step": 5360 }, { "epoch": 0.8194876294756118, "grad_norm": 2.630280017852783, "learning_rate": 9.621293688761263e-07, "loss": 0.3739, "step": 5370 }, { "epoch": 0.8210136772027545, "grad_norm": 3.382284641265869, "learning_rate": 9.464770202579787e-07, "loss": 0.3918, "step": 5380 }, { "epoch": 0.8225397249298971, "grad_norm": 3.014678716659546, "learning_rate": 9.309397169984158e-07, "loss": 0.4185, "step": 5390 }, { "epoch": 0.8240657726570398, "grad_norm": 2.7957093715667725, "learning_rate": 9.155179000705399e-07, "loss": 0.3897, "step": 5400 }, { "epoch": 0.8255918203841826, "grad_norm": 3.4030938148498535, "learning_rate": 9.00212007169779e-07, "loss": 0.3989, "step": 5410 }, { "epoch": 0.8271178681113251, "grad_norm": 3.2532286643981934, "learning_rate": 8.850224727014489e-07, "loss": 0.4053, "step": 5420 }, { "epoch": 0.8286439158384679, "grad_norm": 3.2262065410614014, "learning_rate": 8.699497277684326e-07, "loss": 0.413, "step": 5430 }, { "epoch": 0.8301699635656106, "grad_norm": 3.0282540321350098, "learning_rate": 8.549942001589406e-07, "loss": 0.3965, "step": 5440 }, { "epoch": 0.8316960112927532, "grad_norm": 2.6417813301086426, "learning_rate": 8.401563143343721e-07, "loss": 0.4071, "step": 5450 }, { "epoch": 0.8332220590198959, "grad_norm": 3.082578182220459, "learning_rate": 8.254364914172697e-07, "loss": 0.3975, "step": 5460 }, { "epoch": 0.8347481067470385, "grad_norm": 3.2389848232269287, "learning_rate": 8.108351491793615e-07, "loss": 0.404, "step": 5470 }, { "epoch": 0.8362741544741812, "grad_norm": 3.0996053218841553, "learning_rate": 7.963527020297085e-07, "loss": 0.3817, "step": 5480 }, { "epoch": 0.8378002022013239, "grad_norm": 2.964110851287842, "learning_rate": 7.819895610029433e-07, "loss": 0.3821, "step": 5490 }, { "epoch": 0.8393262499284665, "grad_norm": 3.0734763145446777, "learning_rate": 7.677461337476005e-07, "loss": 0.4168, "step": 5500 }, { "epoch": 0.8393262499284665, "eval_loss": 0.3822996914386749, "eval_runtime": 99.942, "eval_samples_per_second": 5.303, "eval_steps_per_second": 2.652, "step": 5500 }, { "epoch": 0.8408522976556092, "grad_norm": 3.2727982997894287, "learning_rate": 7.536228245145554e-07, "loss": 0.3812, "step": 5510 }, { "epoch": 0.8423783453827518, "grad_norm": 2.7713711261749268, "learning_rate": 7.396200341455356e-07, "loss": 0.3969, "step": 5520 }, { "epoch": 0.8439043931098945, "grad_norm": 3.2900307178497314, "learning_rate": 7.25738160061763e-07, "loss": 0.4094, "step": 5530 }, { "epoch": 0.8454304408370372, "grad_norm": 2.305082321166992, "learning_rate": 7.119775962526593e-07, "loss": 0.4075, "step": 5540 }, { "epoch": 0.8469564885641798, "grad_norm": 2.9151320457458496, "learning_rate": 6.983387332646718e-07, "loss": 0.3881, "step": 5550 }, { "epoch": 0.8484825362913225, "grad_norm": 2.791234254837036, "learning_rate": 6.848219581901866e-07, "loss": 0.3946, "step": 5560 }, { "epoch": 0.8500085840184651, "grad_norm": 3.1531620025634766, "learning_rate": 6.714276546565423e-07, "loss": 0.4137, "step": 5570 }, { "epoch": 0.8515346317456078, "grad_norm": 2.58180570602417, "learning_rate": 6.581562028151451e-07, "loss": 0.4073, "step": 5580 }, { "epoch": 0.8530606794727505, "grad_norm": 2.863215208053589, "learning_rate": 6.450079793306735e-07, "loss": 0.4321, "step": 5590 }, { "epoch": 0.8545867271998931, "grad_norm": 2.909364700317383, "learning_rate": 6.319833573703938e-07, "loss": 0.3989, "step": 5600 }, { "epoch": 0.8561127749270359, "grad_norm": 2.7926652431488037, "learning_rate": 6.190827065935645e-07, "loss": 0.404, "step": 5610 }, { "epoch": 0.8576388226541785, "grad_norm": 2.7227327823638916, "learning_rate": 6.06306393140948e-07, "loss": 0.3815, "step": 5620 }, { "epoch": 0.8591648703813212, "grad_norm": 2.705726385116577, "learning_rate": 5.936547796244207e-07, "loss": 0.3741, "step": 5630 }, { "epoch": 0.8606909181084639, "grad_norm": 3.357189178466797, "learning_rate": 5.811282251166716e-07, "loss": 0.398, "step": 5640 }, { "epoch": 0.8622169658356065, "grad_norm": 2.5761477947235107, "learning_rate": 5.687270851410265e-07, "loss": 0.396, "step": 5650 }, { "epoch": 0.8637430135627492, "grad_norm": 4.029236316680908, "learning_rate": 5.564517116613433e-07, "loss": 0.4209, "step": 5660 }, { "epoch": 0.8652690612898919, "grad_norm": 3.4346041679382324, "learning_rate": 5.443024530720326e-07, "loss": 0.3933, "step": 5670 }, { "epoch": 0.8667951090170345, "grad_norm": 2.5683741569519043, "learning_rate": 5.32279654188163e-07, "loss": 0.3668, "step": 5680 }, { "epoch": 0.8683211567441772, "grad_norm": 2.6289405822753906, "learning_rate": 5.203836562356795e-07, "loss": 0.3781, "step": 5690 }, { "epoch": 0.8698472044713198, "grad_norm": 2.9439804553985596, "learning_rate": 5.086147968417199e-07, "loss": 0.4096, "step": 5700 }, { "epoch": 0.8713732521984625, "grad_norm": 2.9020564556121826, "learning_rate": 4.969734100250229e-07, "loss": 0.413, "step": 5710 }, { "epoch": 0.8728992999256052, "grad_norm": 3.4045605659484863, "learning_rate": 4.854598261864618e-07, "loss": 0.3765, "step": 5720 }, { "epoch": 0.8744253476527478, "grad_norm": 3.001821517944336, "learning_rate": 4.74074372099656e-07, "loss": 0.3817, "step": 5730 }, { "epoch": 0.8759513953798905, "grad_norm": 2.8464105129241943, "learning_rate": 4.628173709017031e-07, "loss": 0.3914, "step": 5740 }, { "epoch": 0.8774774431070331, "grad_norm": 3.0081145763397217, "learning_rate": 4.516891420840047e-07, "loss": 0.4129, "step": 5750 }, { "epoch": 0.8790034908341758, "grad_norm": 2.8356759548187256, "learning_rate": 4.4069000148319885e-07, "loss": 0.3986, "step": 5760 }, { "epoch": 0.8805295385613185, "grad_norm": 2.759951114654541, "learning_rate": 4.298202612722008e-07, "loss": 0.3763, "step": 5770 }, { "epoch": 0.8820555862884611, "grad_norm": 3.221383571624756, "learning_rate": 4.1908022995133526e-07, "loss": 0.3885, "step": 5780 }, { "epoch": 0.8835816340156039, "grad_norm": 2.8623874187469482, "learning_rate": 4.084702123395834e-07, "loss": 0.3748, "step": 5790 }, { "epoch": 0.8851076817427465, "grad_norm": 2.868194341659546, "learning_rate": 3.979905095659381e-07, "loss": 0.3913, "step": 5800 }, { "epoch": 0.8866337294698892, "grad_norm": 3.0048179626464844, "learning_rate": 3.8764141906084794e-07, "loss": 0.3843, "step": 5810 }, { "epoch": 0.8881597771970319, "grad_norm": 2.4534389972686768, "learning_rate": 3.7742323454778296e-07, "loss": 0.3878, "step": 5820 }, { "epoch": 0.8896858249241745, "grad_norm": 2.534501552581787, "learning_rate": 3.6733624603489e-07, "loss": 0.3899, "step": 5830 }, { "epoch": 0.8912118726513172, "grad_norm": 3.0121171474456787, "learning_rate": 3.5738073980677355e-07, "loss": 0.407, "step": 5840 }, { "epoch": 0.8927379203784598, "grad_norm": 2.708857774734497, "learning_rate": 3.475569984163596e-07, "loss": 0.4173, "step": 5850 }, { "epoch": 0.8942639681056025, "grad_norm": 3.2787721157073975, "learning_rate": 3.378653006768823e-07, "loss": 0.3978, "step": 5860 }, { "epoch": 0.8957900158327452, "grad_norm": 2.9193058013916016, "learning_rate": 3.2830592165396913e-07, "loss": 0.3986, "step": 5870 }, { "epoch": 0.8973160635598878, "grad_norm": 2.758446455001831, "learning_rate": 3.188791326578339e-07, "loss": 0.3574, "step": 5880 }, { "epoch": 0.8988421112870305, "grad_norm": 2.8080217838287354, "learning_rate": 3.0958520123557767e-07, "loss": 0.3749, "step": 5890 }, { "epoch": 0.9003681590141732, "grad_norm": 3.7036428451538086, "learning_rate": 3.0042439116359455e-07, "loss": 0.3887, "step": 5900 }, { "epoch": 0.9018942067413158, "grad_norm": 3.0411696434020996, "learning_rate": 2.9139696244008255e-07, "loss": 0.3961, "step": 5910 }, { "epoch": 0.9034202544684585, "grad_norm": 2.942746877670288, "learning_rate": 2.8250317127767213e-07, "loss": 0.3875, "step": 5920 }, { "epoch": 0.9049463021956011, "grad_norm": 3.230517864227295, "learning_rate": 2.7374327009614456e-07, "loss": 0.3981, "step": 5930 }, { "epoch": 0.9064723499227438, "grad_norm": 2.7047958374023438, "learning_rate": 2.651175075152784e-07, "loss": 0.3996, "step": 5940 }, { "epoch": 0.9079983976498865, "grad_norm": 3.278867244720459, "learning_rate": 2.5662612834778164e-07, "loss": 0.3625, "step": 5950 }, { "epoch": 0.9095244453770291, "grad_norm": 2.955040693283081, "learning_rate": 2.4826937359235305e-07, "loss": 0.3662, "step": 5960 }, { "epoch": 0.9110504931041719, "grad_norm": 2.9156110286712646, "learning_rate": 2.4004748042683933e-07, "loss": 0.405, "step": 5970 }, { "epoch": 0.9125765408313145, "grad_norm": 3.318800926208496, "learning_rate": 2.3196068220150025e-07, "loss": 0.4027, "step": 5980 }, { "epoch": 0.9141025885584572, "grad_norm": 4.083282470703125, "learning_rate": 2.2400920843239194e-07, "loss": 0.3634, "step": 5990 }, { "epoch": 0.9156286362855999, "grad_norm": 3.180896759033203, "learning_rate": 2.161932847948478e-07, "loss": 0.3918, "step": 6000 }, { "epoch": 0.9156286362855999, "eval_loss": 0.37766218185424805, "eval_runtime": 99.9689, "eval_samples_per_second": 5.302, "eval_steps_per_second": 2.651, "step": 6000 }, { "epoch": 0.9171546840127425, "grad_norm": 3.35513973236084, "learning_rate": 2.0851313311707532e-07, "loss": 0.3857, "step": 6010 }, { "epoch": 0.9186807317398852, "grad_norm": 2.6778345108032227, "learning_rate": 2.0096897137386052e-07, "loss": 0.3725, "step": 6020 }, { "epoch": 0.9202067794670278, "grad_norm": 2.9898831844329834, "learning_rate": 1.9356101368038005e-07, "loss": 0.4051, "step": 6030 }, { "epoch": 0.9217328271941705, "grad_norm": 2.924804449081421, "learning_rate": 1.8628947028612788e-07, "loss": 0.3742, "step": 6040 }, { "epoch": 0.9232588749213132, "grad_norm": 2.871389389038086, "learning_rate": 1.791545475689438e-07, "loss": 0.3881, "step": 6050 }, { "epoch": 0.9247849226484558, "grad_norm": 3.050419569015503, "learning_rate": 1.721564480291571e-07, "loss": 0.3972, "step": 6060 }, { "epoch": 0.9263109703755985, "grad_norm": 3.090453863143921, "learning_rate": 1.652953702838428e-07, "loss": 0.3852, "step": 6070 }, { "epoch": 0.9278370181027412, "grad_norm": 3.4830410480499268, "learning_rate": 1.585715090611778e-07, "loss": 0.3964, "step": 6080 }, { "epoch": 0.9293630658298838, "grad_norm": 3.4521846771240234, "learning_rate": 1.5198505519492368e-07, "loss": 0.4143, "step": 6090 }, { "epoch": 0.9308891135570265, "grad_norm": 3.032611131668091, "learning_rate": 1.4553619561899935e-07, "loss": 0.3881, "step": 6100 }, { "epoch": 0.9324151612841691, "grad_norm": 3.2053749561309814, "learning_rate": 1.3922511336218524e-07, "loss": 0.397, "step": 6110 }, { "epoch": 0.9339412090113118, "grad_norm": 3.1718320846557617, "learning_rate": 1.330519875429237e-07, "loss": 0.4042, "step": 6120 }, { "epoch": 0.9354672567384545, "grad_norm": 2.4236302375793457, "learning_rate": 1.2701699336423513e-07, "loss": 0.3791, "step": 6130 }, { "epoch": 0.9369933044655971, "grad_norm": 2.4723575115203857, "learning_rate": 1.211203021087487e-07, "loss": 0.4056, "step": 6140 }, { "epoch": 0.9385193521927399, "grad_norm": 3.279250383377075, "learning_rate": 1.1536208113383684e-07, "loss": 0.4043, "step": 6150 }, { "epoch": 0.9400453999198825, "grad_norm": 2.6275689601898193, "learning_rate": 1.0974249386687064e-07, "loss": 0.3986, "step": 6160 }, { "epoch": 0.9415714476470252, "grad_norm": 2.8793821334838867, "learning_rate": 1.042616998005752e-07, "loss": 0.3891, "step": 6170 }, { "epoch": 0.9430974953741679, "grad_norm": 3.2804644107818604, "learning_rate": 9.891985448850839e-08, "loss": 0.3963, "step": 6180 }, { "epoch": 0.9446235431013105, "grad_norm": 2.7859580516815186, "learning_rate": 9.37171095406425e-08, "loss": 0.3832, "step": 6190 }, { "epoch": 0.9461495908284532, "grad_norm": 2.705620288848877, "learning_rate": 8.865361261906402e-08, "loss": 0.3851, "step": 6200 }, { "epoch": 0.9476756385555958, "grad_norm": 2.792065382003784, "learning_rate": 8.372950743378128e-08, "loss": 0.403, "step": 6210 }, { "epoch": 0.9492016862827385, "grad_norm": 3.0607993602752686, "learning_rate": 7.894493373864332e-08, "loss": 0.4104, "step": 6220 }, { "epoch": 0.9507277340098812, "grad_norm": 3.5412745475769043, "learning_rate": 7.430002732737973e-08, "loss": 0.3937, "step": 6230 }, { "epoch": 0.9522537817370238, "grad_norm": 3.418365955352783, "learning_rate": 6.979492002974098e-08, "loss": 0.3954, "step": 6240 }, { "epoch": 0.9537798294641665, "grad_norm": 2.9009461402893066, "learning_rate": 6.542973970775912e-08, "loss": 0.3904, "step": 6250 }, { "epoch": 0.9553058771913091, "grad_norm": 2.7992494106292725, "learning_rate": 6.120461025211744e-08, "loss": 0.3897, "step": 6260 }, { "epoch": 0.9568319249184518, "grad_norm": 2.860257625579834, "learning_rate": 5.711965157863597e-08, "loss": 0.3673, "step": 6270 }, { "epoch": 0.9583579726455945, "grad_norm": 2.605058431625366, "learning_rate": 5.317497962486984e-08, "loss": 0.3961, "step": 6280 }, { "epoch": 0.9598840203727371, "grad_norm": 3.152754545211792, "learning_rate": 4.937070634681185e-08, "loss": 0.3884, "step": 6290 }, { "epoch": 0.9614100680998798, "grad_norm": 2.6811258792877197, "learning_rate": 4.570693971572393e-08, "loss": 0.3968, "step": 6300 }, { "epoch": 0.9629361158270225, "grad_norm": 3.2074809074401855, "learning_rate": 4.218378371506515e-08, "loss": 0.4024, "step": 6310 }, { "epoch": 0.9644621635541651, "grad_norm": 3.393841028213501, "learning_rate": 3.880133833754518e-08, "loss": 0.3852, "step": 6320 }, { "epoch": 0.9659882112813079, "grad_norm": 3.0189993381500244, "learning_rate": 3.555969958228489e-08, "loss": 0.3804, "step": 6330 }, { "epoch": 0.9675142590084505, "grad_norm": 2.8455655574798584, "learning_rate": 3.245895945209132e-08, "loss": 0.4228, "step": 6340 }, { "epoch": 0.9690403067355932, "grad_norm": 3.668877601623535, "learning_rate": 2.949920595084643e-08, "loss": 0.369, "step": 6350 }, { "epoch": 0.9705663544627359, "grad_norm": 3.0246024131774902, "learning_rate": 2.6680523081011878e-08, "loss": 0.3899, "step": 6360 }, { "epoch": 0.9720924021898785, "grad_norm": 2.8767731189727783, "learning_rate": 2.4002990841239804e-08, "loss": 0.393, "step": 6370 }, { "epoch": 0.9736184499170212, "grad_norm": 3.3473153114318848, "learning_rate": 2.1466685224107995e-08, "loss": 0.3722, "step": 6380 }, { "epoch": 0.9751444976441638, "grad_norm": 2.7829878330230713, "learning_rate": 1.9071678213959388e-08, "loss": 0.3788, "step": 6390 }, { "epoch": 0.9766705453713065, "grad_norm": 2.5925357341766357, "learning_rate": 1.6818037784860908e-08, "loss": 0.3862, "step": 6400 }, { "epoch": 0.9781965930984492, "grad_norm": 3.0968399047851562, "learning_rate": 1.4705827898672254e-08, "loss": 0.3738, "step": 6410 }, { "epoch": 0.9797226408255918, "grad_norm": 3.40238881111145, "learning_rate": 1.2735108503232896e-08, "loss": 0.4121, "step": 6420 }, { "epoch": 0.9812486885527345, "grad_norm": 2.9040355682373047, "learning_rate": 1.0905935530658996e-08, "loss": 0.4003, "step": 6430 }, { "epoch": 0.9827747362798771, "grad_norm": 2.6801180839538574, "learning_rate": 9.218360895758006e-09, "loss": 0.3973, "step": 6440 }, { "epoch": 0.9843007840070198, "grad_norm": 2.591391086578369, "learning_rate": 7.672432494551518e-09, "loss": 0.3936, "step": 6450 }, { "epoch": 0.9858268317341625, "grad_norm": 2.7946035861968994, "learning_rate": 6.268194202920241e-09, "loss": 0.3641, "step": 6460 }, { "epoch": 0.9873528794613051, "grad_norm": 4.159729480743408, "learning_rate": 5.005685875354993e-09, "loss": 0.3685, "step": 6470 }, { "epoch": 0.9888789271884478, "grad_norm": 2.7406532764434814, "learning_rate": 3.884943343829273e-09, "loss": 0.4149, "step": 6480 }, { "epoch": 0.9904049749155904, "grad_norm": 3.1383161544799805, "learning_rate": 2.9059984167778553e-09, "loss": 0.3814, "step": 6490 }, { "epoch": 0.9919310226427331, "grad_norm": 2.687572956085205, "learning_rate": 2.0688788781980664e-09, "loss": 0.3942, "step": 6500 }, { "epoch": 0.9919310226427331, "eval_loss": 0.377034068107605, "eval_runtime": 100.214, "eval_samples_per_second": 5.289, "eval_steps_per_second": 2.644, "step": 6500 }, { "epoch": 0.9934570703698758, "grad_norm": 2.9962236881256104, "learning_rate": 1.3736084868598564e-09, "loss": 0.3747, "step": 6510 }, { "epoch": 0.9949831180970184, "grad_norm": 2.946183204650879, "learning_rate": 8.202069756302333e-10, "loss": 0.3763, "step": 6520 }, { "epoch": 0.9965091658241612, "grad_norm": 3.0049428939819336, "learning_rate": 4.0869005091481727e-10, "loss": 0.4033, "step": 6530 }, { "epoch": 0.9980352135513039, "grad_norm": 3.199441432952881, "learning_rate": 1.3906939221042247e-10, "loss": 0.3847, "step": 6540 }, { "epoch": 0.9995612612784465, "grad_norm": 2.7879321575164795, "learning_rate": 1.1352651776985746e-11, "loss": 0.4005, "step": 6550 }, { "epoch": 1.0, "step": 6553, "total_flos": 2.1597904813481902e+18, "train_loss": 0.4742196609496197, "train_runtime": 41833.5481, "train_samples_per_second": 1.253, "train_steps_per_second": 0.157 } ], "logging_steps": 10, "max_steps": 6553, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.1597904813481902e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }