{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 6553,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0015260477271426664,
      "grad_norm": 36.718162536621094,
      "learning_rate": 1.3719512195121953e-07,
      "loss": 1.7945,
      "step": 10
    },
    {
      "epoch": 0.0030520954542853328,
      "grad_norm": 33.861351013183594,
      "learning_rate": 2.896341463414635e-07,
      "loss": 1.7388,
      "step": 20
    },
    {
      "epoch": 0.004578143181427999,
      "grad_norm": 32.35844802856445,
      "learning_rate": 4.4207317073170735e-07,
      "loss": 1.6264,
      "step": 30
    },
    {
      "epoch": 0.0061041909085706655,
      "grad_norm": 18.111286163330078,
      "learning_rate": 5.945121951219512e-07,
      "loss": 1.4267,
      "step": 40
    },
    {
      "epoch": 0.007630238635713332,
      "grad_norm": 12.056061744689941,
      "learning_rate": 7.469512195121952e-07,
      "loss": 1.1626,
      "step": 50
    },
    {
      "epoch": 0.009156286362855998,
      "grad_norm": 9.076939582824707,
      "learning_rate": 8.993902439024391e-07,
      "loss": 0.9372,
      "step": 60
    },
    {
      "epoch": 0.010682334089998665,
      "grad_norm": 8.621874809265137,
      "learning_rate": 1.051829268292683e-06,
      "loss": 0.8972,
      "step": 70
    },
    {
      "epoch": 0.012208381817141331,
      "grad_norm": 6.114352226257324,
      "learning_rate": 1.204268292682927e-06,
      "loss": 0.771,
      "step": 80
    },
    {
      "epoch": 0.013734429544283997,
      "grad_norm": 7.380756855010986,
      "learning_rate": 1.356707317073171e-06,
      "loss": 0.7386,
      "step": 90
    },
    {
      "epoch": 0.015260477271426664,
      "grad_norm": 6.799697399139404,
      "learning_rate": 1.5091463414634146e-06,
      "loss": 0.7209,
      "step": 100
    },
    {
      "epoch": 0.016786524998569332,
      "grad_norm": 7.576807975769043,
      "learning_rate": 1.6615853658536587e-06,
      "loss": 0.6985,
      "step": 110
    },
    {
      "epoch": 0.018312572725711997,
      "grad_norm": 7.400335788726807,
      "learning_rate": 1.8140243902439026e-06,
      "loss": 0.6846,
      "step": 120
    },
    {
      "epoch": 0.019838620452854665,
      "grad_norm": 6.237104892730713,
      "learning_rate": 1.9664634146341467e-06,
      "loss": 0.6377,
      "step": 130
    },
    {
      "epoch": 0.02136466817999733,
      "grad_norm": 6.404150485992432,
      "learning_rate": 2.1189024390243905e-06,
      "loss": 0.6354,
      "step": 140
    },
    {
      "epoch": 0.022890715907139998,
      "grad_norm": 7.284428596496582,
      "learning_rate": 2.2713414634146344e-06,
      "loss": 0.6543,
      "step": 150
    },
    {
      "epoch": 0.024416763634282662,
      "grad_norm": 5.5003156661987305,
      "learning_rate": 2.4237804878048783e-06,
      "loss": 0.6016,
      "step": 160
    },
    {
      "epoch": 0.02594281136142533,
      "grad_norm": 7.994759559631348,
      "learning_rate": 2.576219512195122e-06,
      "loss": 0.6073,
      "step": 170
    },
    {
      "epoch": 0.027468859088567995,
      "grad_norm": 7.063292980194092,
      "learning_rate": 2.7286585365853664e-06,
      "loss": 0.5698,
      "step": 180
    },
    {
      "epoch": 0.028994906815710663,
      "grad_norm": 5.892914295196533,
      "learning_rate": 2.88109756097561e-06,
      "loss": 0.5679,
      "step": 190
    },
    {
      "epoch": 0.030520954542853328,
      "grad_norm": 5.543047904968262,
      "learning_rate": 3.0335365853658537e-06,
      "loss": 0.5655,
      "step": 200
    },
    {
      "epoch": 0.032047002269995996,
      "grad_norm": 6.323866844177246,
      "learning_rate": 3.185975609756098e-06,
      "loss": 0.5619,
      "step": 210
    },
    {
      "epoch": 0.033573049997138664,
      "grad_norm": 6.957103729248047,
      "learning_rate": 3.338414634146342e-06,
      "loss": 0.553,
      "step": 220
    },
    {
      "epoch": 0.035099097724281325,
      "grad_norm": 5.986090183258057,
      "learning_rate": 3.4908536585365853e-06,
      "loss": 0.5503,
      "step": 230
    },
    {
      "epoch": 0.03662514545142399,
      "grad_norm": 5.522103786468506,
      "learning_rate": 3.6432926829268296e-06,
      "loss": 0.5493,
      "step": 240
    },
    {
      "epoch": 0.03815119317856666,
      "grad_norm": 5.235415935516357,
      "learning_rate": 3.7957317073170735e-06,
      "loss": 0.5534,
      "step": 250
    },
    {
      "epoch": 0.03967724090570933,
      "grad_norm": 5.9757819175720215,
      "learning_rate": 3.948170731707318e-06,
      "loss": 0.5368,
      "step": 260
    },
    {
      "epoch": 0.04120328863285199,
      "grad_norm": 5.101258754730225,
      "learning_rate": 4.100609756097561e-06,
      "loss": 0.5254,
      "step": 270
    },
    {
      "epoch": 0.04272933635999466,
      "grad_norm": 6.364596843719482,
      "learning_rate": 4.2530487804878055e-06,
      "loss": 0.5423,
      "step": 280
    },
    {
      "epoch": 0.04425538408713733,
      "grad_norm": 6.032998085021973,
      "learning_rate": 4.405487804878049e-06,
      "loss": 0.5257,
      "step": 290
    },
    {
      "epoch": 0.045781431814279995,
      "grad_norm": 5.804418563842773,
      "learning_rate": 4.557926829268293e-06,
      "loss": 0.5432,
      "step": 300
    },
    {
      "epoch": 0.047307479541422656,
      "grad_norm": 5.553419589996338,
      "learning_rate": 4.710365853658536e-06,
      "loss": 0.5413,
      "step": 310
    },
    {
      "epoch": 0.048833527268565324,
      "grad_norm": 7.154079914093018,
      "learning_rate": 4.862804878048781e-06,
      "loss": 0.5407,
      "step": 320
    },
    {
      "epoch": 0.05035957499570799,
      "grad_norm": 7.198996543884277,
      "learning_rate": 5.015243902439024e-06,
      "loss": 0.5432,
      "step": 330
    },
    {
      "epoch": 0.05188562272285066,
      "grad_norm": 5.0478434562683105,
      "learning_rate": 5.167682926829268e-06,
      "loss": 0.5125,
      "step": 340
    },
    {
      "epoch": 0.05341167044999332,
      "grad_norm": 6.1683573722839355,
      "learning_rate": 5.320121951219513e-06,
      "loss": 0.5321,
      "step": 350
    },
    {
      "epoch": 0.05493771817713599,
      "grad_norm": 7.884255409240723,
      "learning_rate": 5.4725609756097565e-06,
      "loss": 0.5399,
      "step": 360
    },
    {
      "epoch": 0.05646376590427866,
      "grad_norm": 5.224252700805664,
      "learning_rate": 5.625e-06,
      "loss": 0.5539,
      "step": 370
    },
    {
      "epoch": 0.057989813631421326,
      "grad_norm": 4.74583101272583,
      "learning_rate": 5.777439024390244e-06,
      "loss": 0.5503,
      "step": 380
    },
    {
      "epoch": 0.05951586135856399,
      "grad_norm": 5.914930820465088,
      "learning_rate": 5.929878048780489e-06,
      "loss": 0.5398,
      "step": 390
    },
    {
      "epoch": 0.061041909085706655,
      "grad_norm": 4.671189308166504,
      "learning_rate": 6.082317073170733e-06,
      "loss": 0.5481,
      "step": 400
    },
    {
      "epoch": 0.06256795681284932,
      "grad_norm": 4.802231788635254,
      "learning_rate": 6.234756097560977e-06,
      "loss": 0.568,
      "step": 410
    },
    {
      "epoch": 0.06409400453999199,
      "grad_norm": 4.851030349731445,
      "learning_rate": 6.38719512195122e-06,
      "loss": 0.5599,
      "step": 420
    },
    {
      "epoch": 0.06562005226713466,
      "grad_norm": 5.160627365112305,
      "learning_rate": 6.5396341463414636e-06,
      "loss": 0.5352,
      "step": 430
    },
    {
      "epoch": 0.06714609999427733,
      "grad_norm": 5.0278754234313965,
      "learning_rate": 6.6920731707317074e-06,
      "loss": 0.545,
      "step": 440
    },
    {
      "epoch": 0.06867214772141998,
      "grad_norm": 5.1211090087890625,
      "learning_rate": 6.844512195121952e-06,
      "loss": 0.5321,
      "step": 450
    },
    {
      "epoch": 0.07019819544856265,
      "grad_norm": 4.986229419708252,
      "learning_rate": 6.996951219512196e-06,
      "loss": 0.5584,
      "step": 460
    },
    {
      "epoch": 0.07172424317570532,
      "grad_norm": 4.192492485046387,
      "learning_rate": 7.14939024390244e-06,
      "loss": 0.5462,
      "step": 470
    },
    {
      "epoch": 0.07325029090284799,
      "grad_norm": 3.977277994155884,
      "learning_rate": 7.301829268292684e-06,
      "loss": 0.5366,
      "step": 480
    },
    {
      "epoch": 0.07477633862999065,
      "grad_norm": 4.757632732391357,
      "learning_rate": 7.454268292682928e-06,
      "loss": 0.5493,
      "step": 490
    },
    {
      "epoch": 0.07630238635713332,
      "grad_norm": 4.987619400024414,
      "learning_rate": 7.606707317073171e-06,
      "loss": 0.5282,
      "step": 500
    },
    {
      "epoch": 0.07630238635713332,
      "eval_loss": 0.5320242047309875,
      "eval_runtime": 100.1496,
      "eval_samples_per_second": 5.292,
      "eval_steps_per_second": 2.646,
      "step": 500
    },
    {
      "epoch": 0.07782843408427599,
      "grad_norm": 5.458449840545654,
      "learning_rate": 7.759146341463415e-06,
      "loss": 0.5464,
      "step": 510
    },
    {
      "epoch": 0.07935448181141866,
      "grad_norm": 5.1865081787109375,
      "learning_rate": 7.911585365853658e-06,
      "loss": 0.5456,
      "step": 520
    },
    {
      "epoch": 0.08088052953856131,
      "grad_norm": 4.639908313751221,
      "learning_rate": 8.064024390243903e-06,
      "loss": 0.5226,
      "step": 530
    },
    {
      "epoch": 0.08240657726570398,
      "grad_norm": 4.7745537757873535,
      "learning_rate": 8.216463414634148e-06,
      "loss": 0.5623,
      "step": 540
    },
    {
      "epoch": 0.08393262499284665,
      "grad_norm": 4.4498515129089355,
      "learning_rate": 8.36890243902439e-06,
      "loss": 0.5402,
      "step": 550
    },
    {
      "epoch": 0.08545867271998932,
      "grad_norm": 4.127673149108887,
      "learning_rate": 8.521341463414636e-06,
      "loss": 0.5526,
      "step": 560
    },
    {
      "epoch": 0.08698472044713199,
      "grad_norm": 5.61818790435791,
      "learning_rate": 8.673780487804879e-06,
      "loss": 0.5413,
      "step": 570
    },
    {
      "epoch": 0.08851076817427465,
      "grad_norm": 3.518537998199463,
      "learning_rate": 8.826219512195122e-06,
      "loss": 0.5536,
      "step": 580
    },
    {
      "epoch": 0.09003681590141732,
      "grad_norm": 4.232659339904785,
      "learning_rate": 8.978658536585366e-06,
      "loss": 0.5572,
      "step": 590
    },
    {
      "epoch": 0.09156286362855999,
      "grad_norm": 4.013524532318115,
      "learning_rate": 9.131097560975611e-06,
      "loss": 0.5236,
      "step": 600
    },
    {
      "epoch": 0.09308891135570264,
      "grad_norm": 3.9425785541534424,
      "learning_rate": 9.283536585365854e-06,
      "loss": 0.5643,
      "step": 610
    },
    {
      "epoch": 0.09461495908284531,
      "grad_norm": 4.538547515869141,
      "learning_rate": 9.435975609756099e-06,
      "loss": 0.5565,
      "step": 620
    },
    {
      "epoch": 0.09614100680998798,
      "grad_norm": 3.850074291229248,
      "learning_rate": 9.588414634146342e-06,
      "loss": 0.5319,
      "step": 630
    },
    {
      "epoch": 0.09766705453713065,
      "grad_norm": 5.455791473388672,
      "learning_rate": 9.740853658536586e-06,
      "loss": 0.5412,
      "step": 640
    },
    {
      "epoch": 0.09919310226427332,
      "grad_norm": 3.3886823654174805,
      "learning_rate": 9.893292682926831e-06,
      "loss": 0.566,
      "step": 650
    },
    {
      "epoch": 0.10071914999141598,
      "grad_norm": 3.8116540908813477,
      "learning_rate": 9.999993614132319e-06,
      "loss": 0.5627,
      "step": 660
    },
    {
      "epoch": 0.10224519771855865,
      "grad_norm": 3.609804630279541,
      "learning_rate": 9.99988008804953e-06,
      "loss": 0.5576,
      "step": 670
    },
    {
      "epoch": 0.10377124544570132,
      "grad_norm": 3.923624038696289,
      "learning_rate": 9.999624657504754e-06,
      "loss": 0.5728,
      "step": 680
    },
    {
      "epoch": 0.10529729317284398,
      "grad_norm": 4.188690185546875,
      "learning_rate": 9.99922732974751e-06,
      "loss": 0.5676,
      "step": 690
    },
    {
      "epoch": 0.10682334089998664,
      "grad_norm": 3.9573514461517334,
      "learning_rate": 9.998688116054583e-06,
      "loss": 0.5362,
      "step": 700
    },
    {
      "epoch": 0.10834938862712931,
      "grad_norm": 3.8924996852874756,
      "learning_rate": 9.99800703172971e-06,
      "loss": 0.557,
      "step": 710
    },
    {
      "epoch": 0.10987543635427198,
      "grad_norm": 4.514781475067139,
      "learning_rate": 9.997184096103133e-06,
      "loss": 0.5729,
      "step": 720
    },
    {
      "epoch": 0.11140148408141465,
      "grad_norm": 3.555657148361206,
      "learning_rate": 9.996219332531059e-06,
      "loss": 0.5735,
      "step": 730
    },
    {
      "epoch": 0.11292753180855732,
      "grad_norm": 3.8453004360198975,
      "learning_rate": 9.995112768394996e-06,
      "loss": 0.5492,
      "step": 740
    },
    {
      "epoch": 0.11445357953569998,
      "grad_norm": 4.1153435707092285,
      "learning_rate": 9.993864435100976e-06,
      "loss": 0.5273,
      "step": 750
    },
    {
      "epoch": 0.11597962726284265,
      "grad_norm": 3.793621063232422,
      "learning_rate": 9.992474368078664e-06,
      "loss": 0.5744,
      "step": 760
    },
    {
      "epoch": 0.1175056749899853,
      "grad_norm": 3.394721508026123,
      "learning_rate": 9.990942606780344e-06,
      "loss": 0.5554,
      "step": 770
    },
    {
      "epoch": 0.11903172271712797,
      "grad_norm": 4.319253444671631,
      "learning_rate": 9.989269194679814e-06,
      "loss": 0.5161,
      "step": 780
    },
    {
      "epoch": 0.12055777044427064,
      "grad_norm": 3.6960957050323486,
      "learning_rate": 9.987454179271138e-06,
      "loss": 0.5467,
      "step": 790
    },
    {
      "epoch": 0.12208381817141331,
      "grad_norm": 4.272833347320557,
      "learning_rate": 9.985497612067315e-06,
      "loss": 0.5764,
      "step": 800
    },
    {
      "epoch": 0.12360986589855598,
      "grad_norm": 3.716593027114868,
      "learning_rate": 9.983399548598795e-06,
      "loss": 0.5644,
      "step": 810
    },
    {
      "epoch": 0.12513591362569865,
      "grad_norm": 3.8028016090393066,
      "learning_rate": 9.981160048411922e-06,
      "loss": 0.5442,
      "step": 820
    },
    {
      "epoch": 0.1266619613528413,
      "grad_norm": 3.634533405303955,
      "learning_rate": 9.978779175067232e-06,
      "loss": 0.5642,
      "step": 830
    },
    {
      "epoch": 0.12818800907998398,
      "grad_norm": 4.484054088592529,
      "learning_rate": 9.976256996137657e-06,
      "loss": 0.5826,
      "step": 840
    },
    {
      "epoch": 0.12971405680712664,
      "grad_norm": 3.2868642807006836,
      "learning_rate": 9.973593583206602e-06,
      "loss": 0.5643,
      "step": 850
    },
    {
      "epoch": 0.13124010453426932,
      "grad_norm": 3.641526937484741,
      "learning_rate": 9.970789011865916e-06,
      "loss": 0.5577,
      "step": 860
    },
    {
      "epoch": 0.13276615226141197,
      "grad_norm": 3.82144832611084,
      "learning_rate": 9.967843361713747e-06,
      "loss": 0.5374,
      "step": 870
    },
    {
      "epoch": 0.13429219998855466,
      "grad_norm": 3.786513090133667,
      "learning_rate": 9.964756716352277e-06,
      "loss": 0.5523,
      "step": 880
    },
    {
      "epoch": 0.1358182477156973,
      "grad_norm": 3.7389333248138428,
      "learning_rate": 9.96152916338536e-06,
      "loss": 0.5708,
      "step": 890
    },
    {
      "epoch": 0.13734429544283996,
      "grad_norm": 3.5762176513671875,
      "learning_rate": 9.958160794416022e-06,
      "loss": 0.5481,
      "step": 900
    },
    {
      "epoch": 0.13887034316998265,
      "grad_norm": 3.7925140857696533,
      "learning_rate": 9.954651705043878e-06,
      "loss": 0.5814,
      "step": 910
    },
    {
      "epoch": 0.1403963908971253,
      "grad_norm": 4.10577917098999,
      "learning_rate": 9.951001994862402e-06,
      "loss": 0.5574,
      "step": 920
    },
    {
      "epoch": 0.14192243862426798,
      "grad_norm": 3.477315664291382,
      "learning_rate": 9.947211767456111e-06,
      "loss": 0.5472,
      "step": 930
    },
    {
      "epoch": 0.14344848635141064,
      "grad_norm": 3.1365439891815186,
      "learning_rate": 9.943281130397619e-06,
      "loss": 0.5759,
      "step": 940
    },
    {
      "epoch": 0.14497453407855332,
      "grad_norm": 4.209506988525391,
      "learning_rate": 9.93921019524459e-06,
      "loss": 0.5544,
      "step": 950
    },
    {
      "epoch": 0.14650058180569597,
      "grad_norm": 3.269994020462036,
      "learning_rate": 9.934999077536567e-06,
      "loss": 0.5577,
      "step": 960
    },
    {
      "epoch": 0.14802662953283863,
      "grad_norm": 3.4426379203796387,
      "learning_rate": 9.930647896791696e-06,
      "loss": 0.5498,
      "step": 970
    },
    {
      "epoch": 0.1495526772599813,
      "grad_norm": 3.949375629425049,
      "learning_rate": 9.92615677650333e-06,
      "loss": 0.5452,
      "step": 980
    },
    {
      "epoch": 0.15107872498712396,
      "grad_norm": 3.42270827293396,
      "learning_rate": 9.92152584413653e-06,
      "loss": 0.5393,
      "step": 990
    },
    {
      "epoch": 0.15260477271426665,
      "grad_norm": 4.055193901062012,
      "learning_rate": 9.916755231124437e-06,
      "loss": 0.5294,
      "step": 1000
    },
    {
      "epoch": 0.15260477271426665,
      "eval_loss": 0.5252559781074524,
      "eval_runtime": 99.9603,
      "eval_samples_per_second": 5.302,
      "eval_steps_per_second": 2.651,
      "step": 1000
    },
    {
      "epoch": 0.1541308204414093,
      "grad_norm": 3.2706804275512695,
      "learning_rate": 9.911845072864556e-06,
      "loss": 0.522,
      "step": 1010
    },
    {
      "epoch": 0.15565686816855198,
      "grad_norm": 3.859898090362549,
      "learning_rate": 9.906795508714901e-06,
      "loss": 0.5373,
      "step": 1020
    },
    {
      "epoch": 0.15718291589569464,
      "grad_norm": 3.1161351203918457,
      "learning_rate": 9.901606681990048e-06,
      "loss": 0.5471,
      "step": 1030
    },
    {
      "epoch": 0.15870896362283732,
      "grad_norm": 3.452761650085449,
      "learning_rate": 9.89627873995706e-06,
      "loss": 0.5532,
      "step": 1040
    },
    {
      "epoch": 0.16023501134997997,
      "grad_norm": 3.6008405685424805,
      "learning_rate": 9.890811833831312e-06,
      "loss": 0.5075,
      "step": 1050
    },
    {
      "epoch": 0.16176105907712263,
      "grad_norm": 4.958362579345703,
      "learning_rate": 9.885206118772201e-06,
      "loss": 0.5404,
      "step": 1060
    },
    {
      "epoch": 0.1632871068042653,
      "grad_norm": 3.1011452674865723,
      "learning_rate": 9.879461753878738e-06,
      "loss": 0.5456,
      "step": 1070
    },
    {
      "epoch": 0.16481315453140796,
      "grad_norm": 3.445542097091675,
      "learning_rate": 9.873578902185034e-06,
      "loss": 0.5299,
      "step": 1080
    },
    {
      "epoch": 0.16633920225855064,
      "grad_norm": 3.2124245166778564,
      "learning_rate": 9.867557730655676e-06,
      "loss": 0.5555,
      "step": 1090
    },
    {
      "epoch": 0.1678652499856933,
      "grad_norm": 3.3892674446105957,
      "learning_rate": 9.861398410180985e-06,
      "loss": 0.5429,
      "step": 1100
    },
    {
      "epoch": 0.16939129771283598,
      "grad_norm": 3.534641981124878,
      "learning_rate": 9.855101115572161e-06,
      "loss": 0.5564,
      "step": 1110
    },
    {
      "epoch": 0.17091734543997864,
      "grad_norm": 2.8586363792419434,
      "learning_rate": 9.848666025556332e-06,
      "loss": 0.5155,
      "step": 1120
    },
    {
      "epoch": 0.1724433931671213,
      "grad_norm": 4.134357452392578,
      "learning_rate": 9.842093322771479e-06,
      "loss": 0.5292,
      "step": 1130
    },
    {
      "epoch": 0.17396944089426397,
      "grad_norm": 2.917952060699463,
      "learning_rate": 9.83538319376124e-06,
      "loss": 0.5471,
      "step": 1140
    },
    {
      "epoch": 0.17549548862140663,
      "grad_norm": 3.148503065109253,
      "learning_rate": 9.82853582896964e-06,
      "loss": 0.5084,
      "step": 1150
    },
    {
      "epoch": 0.1770215363485493,
      "grad_norm": 2.9326202869415283,
      "learning_rate": 9.821551422735655e-06,
      "loss": 0.5303,
      "step": 1160
    },
    {
      "epoch": 0.17854758407569196,
      "grad_norm": 2.8527936935424805,
      "learning_rate": 9.814430173287724e-06,
      "loss": 0.5265,
      "step": 1170
    },
    {
      "epoch": 0.18007363180283464,
      "grad_norm": 3.373987913131714,
      "learning_rate": 9.807172282738109e-06,
      "loss": 0.5267,
      "step": 1180
    },
    {
      "epoch": 0.1815996795299773,
      "grad_norm": 3.886758804321289,
      "learning_rate": 9.799777957077161e-06,
      "loss": 0.5449,
      "step": 1190
    },
    {
      "epoch": 0.18312572725711998,
      "grad_norm": 2.8181774616241455,
      "learning_rate": 9.792247406167471e-06,
      "loss": 0.5477,
      "step": 1200
    },
    {
      "epoch": 0.18465177498426263,
      "grad_norm": 3.1215786933898926,
      "learning_rate": 9.784580843737924e-06,
      "loss": 0.5317,
      "step": 1210
    },
    {
      "epoch": 0.1861778227114053,
      "grad_norm": 3.4757418632507324,
      "learning_rate": 9.776778487377622e-06,
      "loss": 0.5312,
      "step": 1220
    },
    {
      "epoch": 0.18770387043854797,
      "grad_norm": 2.941584587097168,
      "learning_rate": 9.768840558529708e-06,
      "loss": 0.5372,
      "step": 1230
    },
    {
      "epoch": 0.18922991816569062,
      "grad_norm": 3.1221237182617188,
      "learning_rate": 9.760767282485091e-06,
      "loss": 0.5246,
      "step": 1240
    },
    {
      "epoch": 0.1907559658928333,
      "grad_norm": 2.970177173614502,
      "learning_rate": 9.752558888376045e-06,
      "loss": 0.5269,
      "step": 1250
    },
    {
      "epoch": 0.19228201361997596,
      "grad_norm": 3.686633348464966,
      "learning_rate": 9.744215609169709e-06,
      "loss": 0.5239,
      "step": 1260
    },
    {
      "epoch": 0.19380806134711864,
      "grad_norm": 2.774200439453125,
      "learning_rate": 9.735737681661467e-06,
      "loss": 0.5175,
      "step": 1270
    },
    {
      "epoch": 0.1953341090742613,
      "grad_norm": 3.612818717956543,
      "learning_rate": 9.727125346468243e-06,
      "loss": 0.5144,
      "step": 1280
    },
    {
      "epoch": 0.19686015680140395,
      "grad_norm": 2.7817158699035645,
      "learning_rate": 9.718378848021655e-06,
      "loss": 0.5417,
      "step": 1290
    },
    {
      "epoch": 0.19838620452854663,
      "grad_norm": 3.400517463684082,
      "learning_rate": 9.709498434561086e-06,
      "loss": 0.4989,
      "step": 1300
    },
    {
      "epoch": 0.1999122522556893,
      "grad_norm": 2.9461493492126465,
      "learning_rate": 9.70048435812664e-06,
      "loss": 0.5235,
      "step": 1310
    },
    {
      "epoch": 0.20143829998283197,
      "grad_norm": 3.0229151248931885,
      "learning_rate": 9.69133687455199e-06,
      "loss": 0.4988,
      "step": 1320
    },
    {
      "epoch": 0.20296434770997462,
      "grad_norm": 4.051263332366943,
      "learning_rate": 9.682056243457105e-06,
      "loss": 0.5394,
      "step": 1330
    },
    {
      "epoch": 0.2044903954371173,
      "grad_norm": 2.998913049697876,
      "learning_rate": 9.672642728240896e-06,
      "loss": 0.549,
      "step": 1340
    },
    {
      "epoch": 0.20601644316425996,
      "grad_norm": 3.4110162258148193,
      "learning_rate": 9.663096596073732e-06,
      "loss": 0.4888,
      "step": 1350
    },
    {
      "epoch": 0.20754249089140264,
      "grad_norm": 3.530796766281128,
      "learning_rate": 9.653418117889862e-06,
      "loss": 0.5272,
      "step": 1360
    },
    {
      "epoch": 0.2090685386185453,
      "grad_norm": 3.0355026721954346,
      "learning_rate": 9.64360756837972e-06,
      "loss": 0.5322,
      "step": 1370
    },
    {
      "epoch": 0.21059458634568795,
      "grad_norm": 2.8864426612854004,
      "learning_rate": 9.633665225982134e-06,
      "loss": 0.49,
      "step": 1380
    },
    {
      "epoch": 0.21212063407283063,
      "grad_norm": 4.0944132804870605,
      "learning_rate": 9.623591372876422e-06,
      "loss": 0.5502,
      "step": 1390
    },
    {
      "epoch": 0.2136466817999733,
      "grad_norm": 2.89972186088562,
      "learning_rate": 9.61338629497438e-06,
      "loss": 0.5178,
      "step": 1400
    },
    {
      "epoch": 0.21517272952711597,
      "grad_norm": 2.904897689819336,
      "learning_rate": 9.603050281912175e-06,
      "loss": 0.5471,
      "step": 1410
    },
    {
      "epoch": 0.21669877725425862,
      "grad_norm": 2.9118282794952393,
      "learning_rate": 9.592583627042115e-06,
      "loss": 0.5214,
      "step": 1420
    },
    {
      "epoch": 0.2182248249814013,
      "grad_norm": 3.003340244293213,
      "learning_rate": 9.581986627424329e-06,
      "loss": 0.5266,
      "step": 1430
    },
    {
      "epoch": 0.21975087270854396,
      "grad_norm": 2.8239073753356934,
      "learning_rate": 9.571259583818337e-06,
      "loss": 0.5305,
      "step": 1440
    },
    {
      "epoch": 0.2212769204356866,
      "grad_norm": 3.4803032875061035,
      "learning_rate": 9.56040280067451e-06,
      "loss": 0.5336,
      "step": 1450
    },
    {
      "epoch": 0.2228029681628293,
      "grad_norm": 2.8414413928985596,
      "learning_rate": 9.549416586125435e-06,
      "loss": 0.5341,
      "step": 1460
    },
    {
      "epoch": 0.22432901588997195,
      "grad_norm": 3.3560791015625,
      "learning_rate": 9.538301251977158e-06,
      "loss": 0.5175,
      "step": 1470
    },
    {
      "epoch": 0.22585506361711463,
      "grad_norm": 3.5832326412200928,
      "learning_rate": 9.52705711370035e-06,
      "loss": 0.5453,
      "step": 1480
    },
    {
      "epoch": 0.2273811113442573,
      "grad_norm": 2.907194137573242,
      "learning_rate": 9.515684490421342e-06,
      "loss": 0.5256,
      "step": 1490
    },
    {
      "epoch": 0.22890715907139997,
      "grad_norm": 3.446336269378662,
      "learning_rate": 9.504183704913075e-06,
      "loss": 0.5116,
      "step": 1500
    },
    {
      "epoch": 0.22890715907139997,
      "eval_loss": 0.5038516521453857,
      "eval_runtime": 99.9985,
      "eval_samples_per_second": 5.3,
      "eval_steps_per_second": 2.65,
      "step": 1500
    },
    {
      "epoch": 0.23043320679854262,
      "grad_norm": 3.575700044631958,
      "learning_rate": 9.492555083585928e-06,
      "loss": 0.5215,
      "step": 1510
    },
    {
      "epoch": 0.2319592545256853,
      "grad_norm": 2.7438437938690186,
      "learning_rate": 9.480798956478473e-06,
      "loss": 0.5381,
      "step": 1520
    },
    {
      "epoch": 0.23348530225282796,
      "grad_norm": 3.586581230163574,
      "learning_rate": 9.468915657248083e-06,
      "loss": 0.5361,
      "step": 1530
    },
    {
      "epoch": 0.2350113499799706,
      "grad_norm": 2.979966878890991,
      "learning_rate": 9.45690552316149e-06,
      "loss": 0.5385,
      "step": 1540
    },
    {
      "epoch": 0.2365373977071133,
      "grad_norm": 3.4089224338531494,
      "learning_rate": 9.44476889508519e-06,
      "loss": 0.5122,
      "step": 1550
    },
    {
      "epoch": 0.23806344543425595,
      "grad_norm": 3.400916337966919,
      "learning_rate": 9.432506117475777e-06,
      "loss": 0.4855,
      "step": 1560
    },
    {
      "epoch": 0.23958949316139863,
      "grad_norm": 3.0944440364837646,
      "learning_rate": 9.420117538370173e-06,
      "loss": 0.5314,
      "step": 1570
    },
    {
      "epoch": 0.24111554088854129,
      "grad_norm": 3.072086811065674,
      "learning_rate": 9.407603509375737e-06,
      "loss": 0.5487,
      "step": 1580
    },
    {
      "epoch": 0.24264158861568397,
      "grad_norm": 2.866974353790283,
      "learning_rate": 9.394964385660302e-06,
      "loss": 0.5199,
      "step": 1590
    },
    {
      "epoch": 0.24416763634282662,
      "grad_norm": 3.2548046112060547,
      "learning_rate": 9.382200525942076e-06,
      "loss": 0.536,
      "step": 1600
    },
    {
      "epoch": 0.24569368406996928,
      "grad_norm": 3.636455535888672,
      "learning_rate": 9.369312292479479e-06,
      "loss": 0.5102,
      "step": 1610
    },
    {
      "epoch": 0.24721973179711196,
      "grad_norm": 2.7913310527801514,
      "learning_rate": 9.35630005106085e-06,
      "loss": 0.5174,
      "step": 1620
    },
    {
      "epoch": 0.2487457795242546,
      "grad_norm": 3.905932903289795,
      "learning_rate": 9.34316417099407e-06,
      "loss": 0.5399,
      "step": 1630
    },
    {
      "epoch": 0.2502718272513973,
      "grad_norm": 2.9708142280578613,
      "learning_rate": 9.329905025096079e-06,
      "loss": 0.5139,
      "step": 1640
    },
    {
      "epoch": 0.25179787497853995,
      "grad_norm": 2.849421977996826,
      "learning_rate": 9.316522989682293e-06,
      "loss": 0.4887,
      "step": 1650
    },
    {
      "epoch": 0.2533239227056826,
      "grad_norm": 3.2548842430114746,
      "learning_rate": 9.30301844455593e-06,
      "loss": 0.5173,
      "step": 1660
    },
    {
      "epoch": 0.2548499704328253,
      "grad_norm": 3.9787535667419434,
      "learning_rate": 9.289391772997223e-06,
      "loss": 0.5295,
      "step": 1670
    },
    {
      "epoch": 0.25637601815996797,
      "grad_norm": 2.5555968284606934,
      "learning_rate": 9.275643361752546e-06,
      "loss": 0.5371,
      "step": 1680
    },
    {
      "epoch": 0.2579020658871106,
      "grad_norm": 3.158561944961548,
      "learning_rate": 9.261773601023439e-06,
      "loss": 0.5305,
      "step": 1690
    },
    {
      "epoch": 0.2594281136142533,
      "grad_norm": 3.1799142360687256,
      "learning_rate": 9.247782884455526e-06,
      "loss": 0.5293,
      "step": 1700
    },
    {
      "epoch": 0.26095416134139593,
      "grad_norm": 2.7630088329315186,
      "learning_rate": 9.233671609127352e-06,
      "loss": 0.5102,
      "step": 1710
    },
    {
      "epoch": 0.26248020906853864,
      "grad_norm": 3.3492443561553955,
      "learning_rate": 9.219440175539113e-06,
      "loss": 0.516,
      "step": 1720
    },
    {
      "epoch": 0.2640062567956813,
      "grad_norm": 3.2532637119293213,
      "learning_rate": 9.205088987601277e-06,
      "loss": 0.5063,
      "step": 1730
    },
    {
      "epoch": 0.26553230452282395,
      "grad_norm": 3.098576068878174,
      "learning_rate": 9.190618452623135e-06,
      "loss": 0.4973,
      "step": 1740
    },
    {
      "epoch": 0.2670583522499666,
      "grad_norm": 3.852489709854126,
      "learning_rate": 9.176028981301229e-06,
      "loss": 0.4778,
      "step": 1750
    },
    {
      "epoch": 0.2685843999771093,
      "grad_norm": 3.133566379547119,
      "learning_rate": 9.16132098770771e-06,
      "loss": 0.5031,
      "step": 1760
    },
    {
      "epoch": 0.27011044770425197,
      "grad_norm": 3.0958361625671387,
      "learning_rate": 9.146494889278568e-06,
      "loss": 0.4784,
      "step": 1770
    },
    {
      "epoch": 0.2716364954313946,
      "grad_norm": 3.499459743499756,
      "learning_rate": 9.131551106801803e-06,
      "loss": 0.5071,
      "step": 1780
    },
    {
      "epoch": 0.2731625431585373,
      "grad_norm": 2.8909738063812256,
      "learning_rate": 9.116490064405467e-06,
      "loss": 0.5116,
      "step": 1790
    },
    {
      "epoch": 0.27468859088567993,
      "grad_norm": 2.8877241611480713,
      "learning_rate": 9.101312189545636e-06,
      "loss": 0.4888,
      "step": 1800
    },
    {
      "epoch": 0.27621463861282264,
      "grad_norm": 2.978130578994751,
      "learning_rate": 9.086017912994272e-06,
      "loss": 0.5325,
      "step": 1810
    },
    {
      "epoch": 0.2777406863399653,
      "grad_norm": 3.5364253520965576,
      "learning_rate": 9.070607668827003e-06,
      "loss": 0.5285,
      "step": 1820
    },
    {
      "epoch": 0.27926673406710795,
      "grad_norm": 2.8093996047973633,
      "learning_rate": 9.055081894410802e-06,
      "loss": 0.4959,
      "step": 1830
    },
    {
      "epoch": 0.2807927817942506,
      "grad_norm": 2.98183274269104,
      "learning_rate": 9.03944103039157e-06,
      "loss": 0.501,
      "step": 1840
    },
    {
      "epoch": 0.28231882952139326,
      "grad_norm": 3.1950182914733887,
      "learning_rate": 9.023685520681626e-06,
      "loss": 0.5204,
      "step": 1850
    },
    {
      "epoch": 0.28384487724853597,
      "grad_norm": 2.9772353172302246,
      "learning_rate": 9.007815812447126e-06,
      "loss": 0.4989,
      "step": 1860
    },
    {
      "epoch": 0.2853709249756786,
      "grad_norm": 3.50301194190979,
      "learning_rate": 8.991832356095351e-06,
      "loss": 0.5032,
      "step": 1870
    },
    {
      "epoch": 0.2868969727028213,
      "grad_norm": 2.9427924156188965,
      "learning_rate": 8.975735605261936e-06,
      "loss": 0.5196,
      "step": 1880
    },
    {
      "epoch": 0.2884230204299639,
      "grad_norm": 2.9805080890655518,
      "learning_rate": 8.95952601679799e-06,
      "loss": 0.5092,
      "step": 1890
    },
    {
      "epoch": 0.28994906815710664,
      "grad_norm": 2.65608811378479,
      "learning_rate": 8.943204050757133e-06,
      "loss": 0.5106,
      "step": 1900
    },
    {
      "epoch": 0.2914751158842493,
      "grad_norm": 2.7500367164611816,
      "learning_rate": 8.926770170382434e-06,
      "loss": 0.4999,
      "step": 1910
    },
    {
      "epoch": 0.29300116361139195,
      "grad_norm": 3.063328266143799,
      "learning_rate": 8.910224842093275e-06,
      "loss": 0.5164,
      "step": 1920
    },
    {
      "epoch": 0.2945272113385346,
      "grad_norm": 3.1675572395324707,
      "learning_rate": 8.893568535472094e-06,
      "loss": 0.4857,
      "step": 1930
    },
    {
      "epoch": 0.29605325906567725,
      "grad_norm": 4.051036357879639,
      "learning_rate": 8.876801723251077e-06,
      "loss": 0.4995,
      "step": 1940
    },
    {
      "epoch": 0.29757930679281996,
      "grad_norm": 3.0843279361724854,
      "learning_rate": 8.859924881298729e-06,
      "loss": 0.4869,
      "step": 1950
    },
    {
      "epoch": 0.2991053545199626,
      "grad_norm": 3.1268937587738037,
      "learning_rate": 8.842938488606373e-06,
      "loss": 0.516,
      "step": 1960
    },
    {
      "epoch": 0.3006314022471053,
      "grad_norm": 2.659601926803589,
      "learning_rate": 8.825843027274554e-06,
      "loss": 0.4658,
      "step": 1970
    },
    {
      "epoch": 0.3021574499742479,
      "grad_norm": 2.756713628768921,
      "learning_rate": 8.80863898249936e-06,
      "loss": 0.5115,
      "step": 1980
    },
    {
      "epoch": 0.30368349770139064,
      "grad_norm": 2.470733404159546,
      "learning_rate": 8.79132684255864e-06,
      "loss": 0.5134,
      "step": 1990
    },
    {
      "epoch": 0.3052095454285333,
      "grad_norm": 3.007568597793579,
      "learning_rate": 8.773907098798158e-06,
      "loss": 0.5121,
      "step": 2000
    },
    {
      "epoch": 0.3052095454285333,
      "eval_loss": 0.48503902554512024,
      "eval_runtime": 99.9999,
      "eval_samples_per_second": 5.3,
      "eval_steps_per_second": 2.65,
      "step": 2000
    },
    {
      "epoch": 0.30673559315567595,
      "grad_norm": 3.3363804817199707,
      "learning_rate": 8.756380245617645e-06,
      "loss": 0.4861,
      "step": 2010
    },
    {
      "epoch": 0.3082616408828186,
      "grad_norm": 3.0314290523529053,
      "learning_rate": 8.73874678045677e-06,
      "loss": 0.4963,
      "step": 2020
    },
    {
      "epoch": 0.30978768860996125,
      "grad_norm": 2.7498457431793213,
      "learning_rate": 8.721007203781008e-06,
      "loss": 0.5125,
      "step": 2030
    },
    {
      "epoch": 0.31131373633710396,
      "grad_norm": 3.2180142402648926,
      "learning_rate": 8.703162019067451e-06,
      "loss": 0.5148,
      "step": 2040
    },
    {
      "epoch": 0.3128397840642466,
      "grad_norm": 3.160686492919922,
      "learning_rate": 8.685211732790513e-06,
      "loss": 0.4915,
      "step": 2050
    },
    {
      "epoch": 0.31436583179138927,
      "grad_norm": 2.880316972732544,
      "learning_rate": 8.667156854407555e-06,
      "loss": 0.5154,
      "step": 2060
    },
    {
      "epoch": 0.3158918795185319,
      "grad_norm": 3.6688642501831055,
      "learning_rate": 8.648997896344429e-06,
      "loss": 0.5257,
      "step": 2070
    },
    {
      "epoch": 0.31741792724567464,
      "grad_norm": 2.8660826683044434,
      "learning_rate": 8.630735373980926e-06,
      "loss": 0.5134,
      "step": 2080
    },
    {
      "epoch": 0.3189439749728173,
      "grad_norm": 2.69706392288208,
      "learning_rate": 8.612369805636161e-06,
      "loss": 0.4847,
      "step": 2090
    },
    {
      "epoch": 0.32047002269995994,
      "grad_norm": 2.907597064971924,
      "learning_rate": 8.593901712553853e-06,
      "loss": 0.5196,
      "step": 2100
    },
    {
      "epoch": 0.3219960704271026,
      "grad_norm": 3.240574836730957,
      "learning_rate": 8.575331618887537e-06,
      "loss": 0.5104,
      "step": 2110
    },
    {
      "epoch": 0.32352211815424525,
      "grad_norm": 3.215815782546997,
      "learning_rate": 8.556660051685679e-06,
      "loss": 0.4973,
      "step": 2120
    },
    {
      "epoch": 0.32504816588138796,
      "grad_norm": 2.7997822761535645,
      "learning_rate": 8.537887540876732e-06,
      "loss": 0.5099,
      "step": 2130
    },
    {
      "epoch": 0.3265742136085306,
      "grad_norm": 2.945455551147461,
      "learning_rate": 8.519014619254081e-06,
      "loss": 0.5112,
      "step": 2140
    },
    {
      "epoch": 0.32810026133567327,
      "grad_norm": 3.219611644744873,
      "learning_rate": 8.50004182246093e-06,
      "loss": 0.5086,
      "step": 2150
    },
    {
      "epoch": 0.3296263090628159,
      "grad_norm": 2.8295400142669678,
      "learning_rate": 8.480969688975094e-06,
      "loss": 0.4897,
      "step": 2160
    },
    {
      "epoch": 0.3311523567899586,
      "grad_norm": 2.7116832733154297,
      "learning_rate": 8.461798760093728e-06,
      "loss": 0.4751,
      "step": 2170
    },
    {
      "epoch": 0.3326784045171013,
      "grad_norm": 3.04294490814209,
      "learning_rate": 8.44252957991795e-06,
      "loss": 0.4784,
      "step": 2180
    },
    {
      "epoch": 0.33420445224424394,
      "grad_norm": 2.6779112815856934,
      "learning_rate": 8.423162695337402e-06,
      "loss": 0.5155,
      "step": 2190
    },
    {
      "epoch": 0.3357304999713866,
      "grad_norm": 3.5296261310577393,
      "learning_rate": 8.403698656014734e-06,
      "loss": 0.509,
      "step": 2200
    },
    {
      "epoch": 0.33725654769852925,
      "grad_norm": 3.0861217975616455,
      "learning_rate": 8.384138014370003e-06,
      "loss": 0.4961,
      "step": 2210
    },
    {
      "epoch": 0.33878259542567196,
      "grad_norm": 2.9720520973205566,
      "learning_rate": 8.364481325564983e-06,
      "loss": 0.4759,
      "step": 2220
    },
    {
      "epoch": 0.3403086431528146,
      "grad_norm": 2.8393325805664062,
      "learning_rate": 8.344729147487431e-06,
      "loss": 0.502,
      "step": 2230
    },
    {
      "epoch": 0.34183469087995727,
      "grad_norm": 2.930060386657715,
      "learning_rate": 8.324882040735227e-06,
      "loss": 0.4914,
      "step": 2240
    },
    {
      "epoch": 0.3433607386070999,
      "grad_norm": 3.05932879447937,
      "learning_rate": 8.304940568600482e-06,
      "loss": 0.5184,
      "step": 2250
    },
    {
      "epoch": 0.3448867863342426,
      "grad_norm": 3.455152750015259,
      "learning_rate": 8.284905297053544e-06,
      "loss": 0.4881,
      "step": 2260
    },
    {
      "epoch": 0.3464128340613853,
      "grad_norm": 3.1637487411499023,
      "learning_rate": 8.264776794726938e-06,
      "loss": 0.5001,
      "step": 2270
    },
    {
      "epoch": 0.34793888178852794,
      "grad_norm": 3.0358266830444336,
      "learning_rate": 8.244555632899223e-06,
      "loss": 0.4795,
      "step": 2280
    },
    {
      "epoch": 0.3494649295156706,
      "grad_norm": 2.930438756942749,
      "learning_rate": 8.224242385478778e-06,
      "loss": 0.4814,
      "step": 2290
    },
    {
      "epoch": 0.35099097724281325,
      "grad_norm": 2.8416764736175537,
      "learning_rate": 8.203837628987523e-06,
      "loss": 0.463,
      "step": 2300
    },
    {
      "epoch": 0.35251702496995596,
      "grad_norm": 2.61423921585083,
      "learning_rate": 8.18334194254454e-06,
      "loss": 0.4984,
      "step": 2310
    },
    {
      "epoch": 0.3540430726970986,
      "grad_norm": 2.8394827842712402,
      "learning_rate": 8.162755907849653e-06,
      "loss": 0.4884,
      "step": 2320
    },
    {
      "epoch": 0.35556912042424127,
      "grad_norm": 2.678520917892456,
      "learning_rate": 8.142080109166912e-06,
      "loss": 0.493,
      "step": 2330
    },
    {
      "epoch": 0.3570951681513839,
      "grad_norm": 3.0217721462249756,
      "learning_rate": 8.121315133308008e-06,
      "loss": 0.483,
      "step": 2340
    },
    {
      "epoch": 0.3586212158785266,
      "grad_norm": 2.741018533706665,
      "learning_rate": 8.100461569615615e-06,
      "loss": 0.5032,
      "step": 2350
    },
    {
      "epoch": 0.3601472636056693,
      "grad_norm": 3.2703857421875,
      "learning_rate": 8.079520009946678e-06,
      "loss": 0.4871,
      "step": 2360
    },
    {
      "epoch": 0.36167331133281194,
      "grad_norm": 2.5858423709869385,
      "learning_rate": 8.058491048655603e-06,
      "loss": 0.5108,
      "step": 2370
    },
    {
      "epoch": 0.3631993590599546,
      "grad_norm": 3.112304925918579,
      "learning_rate": 8.037375282577391e-06,
      "loss": 0.5019,
      "step": 2380
    },
    {
      "epoch": 0.36472540678709725,
      "grad_norm": 2.5532495975494385,
      "learning_rate": 8.016173311010697e-06,
      "loss": 0.4929,
      "step": 2390
    },
    {
      "epoch": 0.36625145451423996,
      "grad_norm": 2.342569589614868,
      "learning_rate": 7.994885735700832e-06,
      "loss": 0.4905,
      "step": 2400
    },
    {
      "epoch": 0.3677775022413826,
      "grad_norm": 3.1640357971191406,
      "learning_rate": 7.973513160822664e-06,
      "loss": 0.4745,
      "step": 2410
    },
    {
      "epoch": 0.36930354996852527,
      "grad_norm": 2.825571298599243,
      "learning_rate": 7.952056192963488e-06,
      "loss": 0.4843,
      "step": 2420
    },
    {
      "epoch": 0.3708295976956679,
      "grad_norm": 2.888791084289551,
      "learning_rate": 7.93051544110581e-06,
      "loss": 0.488,
      "step": 2430
    },
    {
      "epoch": 0.3723556454228106,
      "grad_norm": 2.8343687057495117,
      "learning_rate": 7.908891516610048e-06,
      "loss": 0.4568,
      "step": 2440
    },
    {
      "epoch": 0.3738816931499533,
      "grad_norm": 3.1001429557800293,
      "learning_rate": 7.887185033197198e-06,
      "loss": 0.4695,
      "step": 2450
    },
    {
      "epoch": 0.37540774087709594,
      "grad_norm": 2.368257761001587,
      "learning_rate": 7.8653966069314e-06,
      "loss": 0.5056,
      "step": 2460
    },
    {
      "epoch": 0.3769337886042386,
      "grad_norm": 2.99737286567688,
      "learning_rate": 7.843526856202472e-06,
      "loss": 0.4855,
      "step": 2470
    },
    {
      "epoch": 0.37845983633138125,
      "grad_norm": 3.0355498790740967,
      "learning_rate": 7.821576401708339e-06,
      "loss": 0.4784,
      "step": 2480
    },
    {
      "epoch": 0.3799858840585239,
      "grad_norm": 2.5011422634124756,
      "learning_rate": 7.799545866437429e-06,
      "loss": 0.518,
      "step": 2490
    },
    {
      "epoch": 0.3815119317856666,
      "grad_norm": 2.846738338470459,
      "learning_rate": 7.777435875650986e-06,
      "loss": 0.4778,
      "step": 2500
    },
    {
      "epoch": 0.3815119317856666,
      "eval_loss": 0.4665885865688324,
      "eval_runtime": 99.9829,
      "eval_samples_per_second": 5.301,
      "eval_steps_per_second": 2.65,
      "step": 2500
    },
    {
      "epoch": 0.38303797951280927,
      "grad_norm": 2.755929708480835,
      "learning_rate": 7.755247056865332e-06,
      "loss": 0.4726,
      "step": 2510
    },
    {
      "epoch": 0.3845640272399519,
      "grad_norm": 3.3097054958343506,
      "learning_rate": 7.732980039834048e-06,
      "loss": 0.4839,
      "step": 2520
    },
    {
      "epoch": 0.3860900749670946,
      "grad_norm": 2.85306978225708,
      "learning_rate": 7.710635456530102e-06,
      "loss": 0.4888,
      "step": 2530
    },
    {
      "epoch": 0.3876161226942373,
      "grad_norm": 3.0452582836151123,
      "learning_rate": 7.68821394112792e-06,
      "loss": 0.477,
      "step": 2540
    },
    {
      "epoch": 0.38914217042137994,
      "grad_norm": 2.805910348892212,
      "learning_rate": 7.665716129985379e-06,
      "loss": 0.5016,
      "step": 2550
    },
    {
      "epoch": 0.3906682181485226,
      "grad_norm": 3.086005687713623,
      "learning_rate": 7.64314266162575e-06,
      "loss": 0.4842,
      "step": 2560
    },
    {
      "epoch": 0.39219426587566525,
      "grad_norm": 2.4592947959899902,
      "learning_rate": 7.620494176719572e-06,
      "loss": 0.4977,
      "step": 2570
    },
    {
      "epoch": 0.3937203136028079,
      "grad_norm": 2.79803729057312,
      "learning_rate": 7.597771318066476e-06,
      "loss": 0.4974,
      "step": 2580
    },
    {
      "epoch": 0.3952463613299506,
      "grad_norm": 3.157926559448242,
      "learning_rate": 7.574974730576936e-06,
      "loss": 0.4815,
      "step": 2590
    },
    {
      "epoch": 0.39677240905709327,
      "grad_norm": 4.1300458908081055,
      "learning_rate": 7.552105061253962e-06,
      "loss": 0.4876,
      "step": 2600
    },
    {
      "epoch": 0.3982984567842359,
      "grad_norm": 3.6879398822784424,
      "learning_rate": 7.529162959174746e-06,
      "loss": 0.4905,
      "step": 2610
    },
    {
      "epoch": 0.3998245045113786,
      "grad_norm": 2.8170852661132812,
      "learning_rate": 7.5061490754722355e-06,
      "loss": 0.4849,
      "step": 2620
    },
    {
      "epoch": 0.4013505522385213,
      "grad_norm": 2.4680588245391846,
      "learning_rate": 7.4830640633166516e-06,
      "loss": 0.4541,
      "step": 2630
    },
    {
      "epoch": 0.40287659996566394,
      "grad_norm": 2.8768832683563232,
      "learning_rate": 7.4599085778969524e-06,
      "loss": 0.4773,
      "step": 2640
    },
    {
      "epoch": 0.4044026476928066,
      "grad_norm": 2.7005279064178467,
      "learning_rate": 7.436683276402239e-06,
      "loss": 0.47,
      "step": 2650
    },
    {
      "epoch": 0.40592869541994925,
      "grad_norm": 3.036959409713745,
      "learning_rate": 7.413388818003104e-06,
      "loss": 0.4992,
      "step": 2660
    },
    {
      "epoch": 0.4074547431470919,
      "grad_norm": 3.3453052043914795,
      "learning_rate": 7.3900258638329196e-06,
      "loss": 0.4713,
      "step": 2670
    },
    {
      "epoch": 0.4089807908742346,
      "grad_norm": 3.5486536026000977,
      "learning_rate": 7.366595076969073e-06,
      "loss": 0.4724,
      "step": 2680
    },
    {
      "epoch": 0.41050683860137727,
      "grad_norm": 2.4689748287200928,
      "learning_rate": 7.343097122414159e-06,
      "loss": 0.4972,
      "step": 2690
    },
    {
      "epoch": 0.4120328863285199,
      "grad_norm": 3.1132616996765137,
      "learning_rate": 7.319532667077088e-06,
      "loss": 0.4766,
      "step": 2700
    },
    {
      "epoch": 0.4135589340556626,
      "grad_norm": 2.663201332092285,
      "learning_rate": 7.295902379754172e-06,
      "loss": 0.4833,
      "step": 2710
    },
    {
      "epoch": 0.4150849817828053,
      "grad_norm": 3.298428773880005,
      "learning_rate": 7.272206931110135e-06,
      "loss": 0.4533,
      "step": 2720
    },
    {
      "epoch": 0.41661102950994794,
      "grad_norm": 2.6843929290771484,
      "learning_rate": 7.248446993659086e-06,
      "loss": 0.4337,
      "step": 2730
    },
    {
      "epoch": 0.4181370772370906,
      "grad_norm": 2.778803825378418,
      "learning_rate": 7.224623241745423e-06,
      "loss": 0.4992,
      "step": 2740
    },
    {
      "epoch": 0.41966312496423325,
      "grad_norm": 3.245333194732666,
      "learning_rate": 7.200736351524705e-06,
      "loss": 0.483,
      "step": 2750
    },
    {
      "epoch": 0.4211891726913759,
      "grad_norm": 3.0267553329467773,
      "learning_rate": 7.176787000944449e-06,
      "loss": 0.4927,
      "step": 2760
    },
    {
      "epoch": 0.4227152204185186,
      "grad_norm": 2.559861183166504,
      "learning_rate": 7.152775869724902e-06,
      "loss": 0.4803,
      "step": 2770
    },
    {
      "epoch": 0.42424126814566127,
      "grad_norm": 2.693279504776001,
      "learning_rate": 7.128703639339732e-06,
      "loss": 0.4622,
      "step": 2780
    },
    {
      "epoch": 0.4257673158728039,
      "grad_norm": 2.9602789878845215,
      "learning_rate": 7.104570992996711e-06,
      "loss": 0.5023,
      "step": 2790
    },
    {
      "epoch": 0.4272933635999466,
      "grad_norm": 3.0726730823516846,
      "learning_rate": 7.080378615618299e-06,
      "loss": 0.4737,
      "step": 2800
    },
    {
      "epoch": 0.42881941132708923,
      "grad_norm": 2.7500321865081787,
      "learning_rate": 7.0561271938222275e-06,
      "loss": 0.4669,
      "step": 2810
    },
    {
      "epoch": 0.43034545905423194,
      "grad_norm": 2.4754300117492676,
      "learning_rate": 7.031817415901991e-06,
      "loss": 0.4597,
      "step": 2820
    },
    {
      "epoch": 0.4318715067813746,
      "grad_norm": 2.9996578693389893,
      "learning_rate": 7.007449971807331e-06,
      "loss": 0.4693,
      "step": 2830
    },
    {
      "epoch": 0.43339755450851725,
      "grad_norm": 2.9249792098999023,
      "learning_rate": 6.983025553124638e-06,
      "loss": 0.4778,
      "step": 2840
    },
    {
      "epoch": 0.4349236022356599,
      "grad_norm": 3.4900503158569336,
      "learning_rate": 6.958544853057339e-06,
      "loss": 0.4768,
      "step": 2850
    },
    {
      "epoch": 0.4364496499628026,
      "grad_norm": 2.6894686222076416,
      "learning_rate": 6.934008566406211e-06,
      "loss": 0.4828,
      "step": 2860
    },
    {
      "epoch": 0.43797569768994526,
      "grad_norm": 2.501004934310913,
      "learning_rate": 6.909417389549664e-06,
      "loss": 0.4719,
      "step": 2870
    },
    {
      "epoch": 0.4395017454170879,
      "grad_norm": 2.8693606853485107,
      "learning_rate": 6.8847720204239835e-06,
      "loss": 0.4464,
      "step": 2880
    },
    {
      "epoch": 0.4410277931442306,
      "grad_norm": 3.2556777000427246,
      "learning_rate": 6.860073158503511e-06,
      "loss": 0.4667,
      "step": 2890
    },
    {
      "epoch": 0.4425538408713732,
      "grad_norm": 3.124281406402588,
      "learning_rate": 6.8353215047808006e-06,
      "loss": 0.4647,
      "step": 2900
    },
    {
      "epoch": 0.44407988859851594,
      "grad_norm": 3.2909648418426514,
      "learning_rate": 6.810517761746724e-06,
      "loss": 0.4543,
      "step": 2910
    },
    {
      "epoch": 0.4456059363256586,
      "grad_norm": 3.157102584838867,
      "learning_rate": 6.785662633370521e-06,
      "loss": 0.4802,
      "step": 2920
    },
    {
      "epoch": 0.44713198405280125,
      "grad_norm": 3.0583925247192383,
      "learning_rate": 6.760756825079838e-06,
      "loss": 0.4788,
      "step": 2930
    },
    {
      "epoch": 0.4486580317799439,
      "grad_norm": 3.167233943939209,
      "learning_rate": 6.735801043740691e-06,
      "loss": 0.4607,
      "step": 2940
    },
    {
      "epoch": 0.4501840795070866,
      "grad_norm": 2.5458922386169434,
      "learning_rate": 6.710795997637412e-06,
      "loss": 0.4574,
      "step": 2950
    },
    {
      "epoch": 0.45171012723422926,
      "grad_norm": 2.914799690246582,
      "learning_rate": 6.68574239645254e-06,
      "loss": 0.4686,
      "step": 2960
    },
    {
      "epoch": 0.4532361749613719,
      "grad_norm": 2.5135247707366943,
      "learning_rate": 6.660640951246691e-06,
      "loss": 0.4771,
      "step": 2970
    },
    {
      "epoch": 0.4547622226885146,
      "grad_norm": 2.9928719997406006,
      "learning_rate": 6.635492374438366e-06,
      "loss": 0.4602,
      "step": 2980
    },
    {
      "epoch": 0.4562882704156572,
      "grad_norm": 2.775026321411133,
      "learning_rate": 6.6102973797837324e-06,
      "loss": 0.4585,
      "step": 2990
    },
    {
      "epoch": 0.45781431814279994,
      "grad_norm": 2.5386760234832764,
      "learning_rate": 6.585056682356375e-06,
      "loss": 0.4823,
      "step": 3000
    },
    {
      "epoch": 0.45781431814279994,
      "eval_loss": 0.45332393050193787,
      "eval_runtime": 100.0205,
      "eval_samples_per_second": 5.299,
      "eval_steps_per_second": 2.649,
      "step": 3000
    },
    {
      "epoch": 0.4593403658699426,
      "grad_norm": 2.6468212604522705,
      "learning_rate": 6.5597709985269895e-06,
      "loss": 0.4769,
      "step": 3010
    },
    {
      "epoch": 0.46086641359708524,
      "grad_norm": 2.3556416034698486,
      "learning_rate": 6.534441045943059e-06,
      "loss": 0.4712,
      "step": 3020
    },
    {
      "epoch": 0.4623924613242279,
      "grad_norm": 2.7787866592407227,
      "learning_rate": 6.509067543508483e-06,
      "loss": 0.4574,
      "step": 3030
    },
    {
      "epoch": 0.4639185090513706,
      "grad_norm": 3.103032112121582,
      "learning_rate": 6.483651211363175e-06,
      "loss": 0.4505,
      "step": 3040
    },
    {
      "epoch": 0.46544455677851326,
      "grad_norm": 2.5027055740356445,
      "learning_rate": 6.4581927708626235e-06,
      "loss": 0.4669,
      "step": 3050
    },
    {
      "epoch": 0.4669706045056559,
      "grad_norm": 3.0449085235595703,
      "learning_rate": 6.432692944557416e-06,
      "loss": 0.4616,
      "step": 3060
    },
    {
      "epoch": 0.46849665223279857,
      "grad_norm": 2.4839391708374023,
      "learning_rate": 6.407152456172736e-06,
      "loss": 0.4435,
      "step": 3070
    },
    {
      "epoch": 0.4700226999599412,
      "grad_norm": 2.436234474182129,
      "learning_rate": 6.381572030587822e-06,
      "loss": 0.4635,
      "step": 3080
    },
    {
      "epoch": 0.47154874768708394,
      "grad_norm": 2.8912863731384277,
      "learning_rate": 6.355952393815388e-06,
      "loss": 0.4652,
      "step": 3090
    },
    {
      "epoch": 0.4730747954142266,
      "grad_norm": 2.5968050956726074,
      "learning_rate": 6.33029427298103e-06,
      "loss": 0.4729,
      "step": 3100
    },
    {
      "epoch": 0.47460084314136924,
      "grad_norm": 3.2073683738708496,
      "learning_rate": 6.304598396302578e-06,
      "loss": 0.4953,
      "step": 3110
    },
    {
      "epoch": 0.4761268908685119,
      "grad_norm": 3.0304925441741943,
      "learning_rate": 6.278865493069434e-06,
      "loss": 0.4474,
      "step": 3120
    },
    {
      "epoch": 0.47765293859565455,
      "grad_norm": 2.594212293624878,
      "learning_rate": 6.25309629362187e-06,
      "loss": 0.4613,
      "step": 3130
    },
    {
      "epoch": 0.47917898632279726,
      "grad_norm": 3.9076614379882812,
      "learning_rate": 6.227291529330302e-06,
      "loss": 0.4581,
      "step": 3140
    },
    {
      "epoch": 0.4807050340499399,
      "grad_norm": 3.236703634262085,
      "learning_rate": 6.201451932574533e-06,
      "loss": 0.491,
      "step": 3150
    },
    {
      "epoch": 0.48223108177708257,
      "grad_norm": 3.039473056793213,
      "learning_rate": 6.175578236722968e-06,
      "loss": 0.4632,
      "step": 3160
    },
    {
      "epoch": 0.4837571295042252,
      "grad_norm": 2.9076642990112305,
      "learning_rate": 6.149671176111793e-06,
      "loss": 0.4587,
      "step": 3170
    },
    {
      "epoch": 0.48528317723136793,
      "grad_norm": 2.884756565093994,
      "learning_rate": 6.123731486024146e-06,
      "loss": 0.4576,
      "step": 3180
    },
    {
      "epoch": 0.4868092249585106,
      "grad_norm": 2.9610495567321777,
      "learning_rate": 6.097759902669232e-06,
      "loss": 0.4562,
      "step": 3190
    },
    {
      "epoch": 0.48833527268565324,
      "grad_norm": 2.4013702869415283,
      "learning_rate": 6.071757163161443e-06,
      "loss": 0.4451,
      "step": 3200
    },
    {
      "epoch": 0.4898613204127959,
      "grad_norm": 2.82401180267334,
      "learning_rate": 6.045724005499429e-06,
      "loss": 0.4658,
      "step": 3210
    },
    {
      "epoch": 0.49138736813993855,
      "grad_norm": 3.196622133255005,
      "learning_rate": 6.019661168545159e-06,
      "loss": 0.4443,
      "step": 3220
    },
    {
      "epoch": 0.49291341586708126,
      "grad_norm": 2.9927377700805664,
      "learning_rate": 5.9935693920029405e-06,
      "loss": 0.4538,
      "step": 3230
    },
    {
      "epoch": 0.4944394635942239,
      "grad_norm": 2.5958001613616943,
      "learning_rate": 5.967449416398438e-06,
      "loss": 0.4481,
      "step": 3240
    },
    {
      "epoch": 0.49596551132136657,
      "grad_norm": 3.0835108757019043,
      "learning_rate": 5.941301983057646e-06,
      "loss": 0.4629,
      "step": 3250
    },
    {
      "epoch": 0.4974915590485092,
      "grad_norm": 2.6167795658111572,
      "learning_rate": 5.915127834085853e-06,
      "loss": 0.4626,
      "step": 3260
    },
    {
      "epoch": 0.49901760677565193,
      "grad_norm": 2.769148826599121,
      "learning_rate": 5.888927712346582e-06,
      "loss": 0.4574,
      "step": 3270
    },
    {
      "epoch": 0.5005436545027946,
      "grad_norm": 2.5669050216674805,
      "learning_rate": 5.862702361440502e-06,
      "loss": 0.4594,
      "step": 3280
    },
    {
      "epoch": 0.5020697022299372,
      "grad_norm": 3.04659104347229,
      "learning_rate": 5.836452525684326e-06,
      "loss": 0.4495,
      "step": 3290
    },
    {
      "epoch": 0.5035957499570799,
      "grad_norm": 2.9052836894989014,
      "learning_rate": 5.8101789500896855e-06,
      "loss": 0.4302,
      "step": 3300
    },
    {
      "epoch": 0.5051217976842226,
      "grad_norm": 2.720864772796631,
      "learning_rate": 5.783882380341985e-06,
      "loss": 0.4522,
      "step": 3310
    },
    {
      "epoch": 0.5066478454113652,
| "grad_norm": 2.5311403274536133, | |
| "learning_rate": 5.7575635627792384e-06, | |
| "loss": 0.4807, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 0.5081738931385079, | |
| "grad_norm": 3.34335994720459, | |
| "learning_rate": 5.731223244370891e-06, | |
| "loss": 0.4576, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 0.5096999408656506, | |
| "grad_norm": 3.448711633682251, | |
| "learning_rate": 5.704862172696612e-06, | |
| "loss": 0.4498, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 0.5112259885927932, | |
| "grad_norm": 3.406877279281616, | |
| "learning_rate": 5.678481095925087e-06, | |
| "loss": 0.4633, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 0.5127520363199359, | |
| "grad_norm": 2.611567974090576, | |
| "learning_rate": 5.65208076279277e-06, | |
| "loss": 0.4687, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 0.5142780840470785, | |
| "grad_norm": 2.893051862716675, | |
| "learning_rate": 5.625661922582646e-06, | |
| "loss": 0.4572, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 0.5158041317742212, | |
| "grad_norm": 2.572845935821533, | |
| "learning_rate": 5.599225325102957e-06, | |
| "loss": 0.461, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 0.517330179501364, | |
| "grad_norm": 2.917742967605591, | |
| "learning_rate": 5.572771720665928e-06, | |
| "loss": 0.4679, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 0.5188562272285066, | |
| "grad_norm": 2.3963794708251953, | |
| "learning_rate": 5.546301860066463e-06, | |
| "loss": 0.4423, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.5203822749556493, | |
| "grad_norm": 3.034247636795044, | |
| "learning_rate": 5.519816494560848e-06, | |
| "loss": 0.4689, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 0.5219083226827919, | |
| "grad_norm": 2.448772668838501, | |
| "learning_rate": 5.493316375845417e-06, | |
| "loss": 0.4295, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 0.5234343704099346, | |
| "grad_norm": 2.448565721511841, | |
| "learning_rate": 5.466802256035225e-06, | |
| "loss": 0.4405, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 0.5249604181370773, | |
| "grad_norm": 3.043869733810425, | |
| "learning_rate": 5.4402748876427e-06, | |
| "loss": 0.4114, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 0.5264864658642199, | |
| "grad_norm": 2.8036019802093506, | |
| "learning_rate": 5.413735023556288e-06, | |
| "loss": 0.4556, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 0.5280125135913626, | |
| "grad_norm": 2.72363018989563, | |
| "learning_rate": 5.387183417019079e-06, | |
| "loss": 0.458, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 0.5295385613185052, | |
| "grad_norm": 2.6680619716644287, | |
| "learning_rate": 5.360620821607433e-06, | |
| "loss": 0.4522, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 0.5310646090456479, | |
| "grad_norm": 3.0074567794799805, | |
| "learning_rate": 5.334047991209594e-06, | |
| "loss": 0.4683, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 0.5325906567727906, | |
| "grad_norm": 2.9890754222869873, | |
| "learning_rate": 5.307465680004289e-06, | |
| "loss": 0.4417, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 0.5341167044999332, | |
| "grad_norm": 2.6241977214813232, | |
| "learning_rate": 5.280874642439323e-06, | |
| "loss": 0.4292, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.5341167044999332, | |
| "eval_loss": 0.4343341290950775, | |
| "eval_runtime": 99.9521, | |
| "eval_samples_per_second": 5.303, | |
| "eval_steps_per_second": 2.651, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.5356427522270759, | |
| "grad_norm": 2.369849920272827, | |
| "learning_rate": 5.254275633210175e-06, | |
| "loss": 0.4566, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 0.5371687999542186, | |
| "grad_norm": 2.8073859214782715, | |
| "learning_rate": 5.227669407238565e-06, | |
| "loss": 0.4706, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 0.5386948476813612, | |
| "grad_norm": 2.9969775676727295, | |
| "learning_rate": 5.201056719651042e-06, | |
| "loss": 0.4656, | |
| "step": 3530 | |
| }, | |
| { | |
| "epoch": 0.5402208954085039, | |
| "grad_norm": 2.4478580951690674, | |
| "learning_rate": 5.174438325757542e-06, | |
| "loss": 0.4419, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 0.5417469431356465, | |
| "grad_norm": 2.7153167724609375, | |
| "learning_rate": 5.147814981029956e-06, | |
| "loss": 0.4913, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 0.5432729908627892, | |
| "grad_norm": 2.724480390548706, | |
| "learning_rate": 5.121187441080687e-06, | |
| "loss": 0.4523, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 0.544799038589932, | |
| "grad_norm": 2.984091281890869, | |
| "learning_rate": 5.094556461641205e-06, | |
| "loss": 0.4696, | |
| "step": 3570 | |
| }, | |
| { | |
| "epoch": 0.5463250863170745, | |
| "grad_norm": 2.665983200073242, | |
| "learning_rate": 5.0679227985406e-06, | |
| "loss": 0.4405, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 0.5478511340442173, | |
| "grad_norm": 2.8179776668548584, | |
| "learning_rate": 5.041287207684125e-06, | |
| "loss": 0.4503, | |
| "step": 3590 | |
| }, | |
| { | |
| "epoch": 0.5493771817713599, | |
| "grad_norm": 3.146925687789917, | |
| "learning_rate": 5.014650445031749e-06, | |
| "loss": 0.4908, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.5509032294985026, | |
| "grad_norm": 3.112048864364624, | |
| "learning_rate": 4.988013266576699e-06, | |
| "loss": 0.4404, | |
| "step": 3610 | |
| }, | |
| { | |
| "epoch": 0.5524292772256453, | |
| "grad_norm": 2.5898945331573486, | |
| "learning_rate": 4.961376428323997e-06, | |
| "loss": 0.4422, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 0.5539553249527879, | |
| "grad_norm": 2.4043118953704834, | |
| "learning_rate": 4.934740686269016e-06, | |
| "loss": 0.4688, | |
| "step": 3630 | |
| }, | |
| { | |
| "epoch": 0.5554813726799306, | |
| "grad_norm": 2.969442367553711, | |
| "learning_rate": 4.908106796376015e-06, | |
| "loss": 0.4497, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 0.5570074204070732, | |
| "grad_norm": 2.687509298324585, | |
| "learning_rate": 4.881475514556689e-06, | |
| "loss": 0.4356, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 0.5585334681342159, | |
| "grad_norm": 2.7803378105163574, | |
| "learning_rate": 4.854847596648704e-06, | |
| "loss": 0.468, | |
| "step": 3660 | |
| }, | |
| { | |
| "epoch": 0.5600595158613586, | |
| "grad_norm": 2.924004316329956, | |
| "learning_rate": 4.828223798394257e-06, | |
| "loss": 0.4478, | |
| "step": 3670 | |
| }, | |
| { | |
| "epoch": 0.5615855635885012, | |
| "grad_norm": 3.0467331409454346, | |
| "learning_rate": 4.8016048754186265e-06, | |
| "loss": 0.4515, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 0.5631116113156439, | |
| "grad_norm": 2.7318050861358643, | |
| "learning_rate": 4.77499158320872e-06, | |
| "loss": 0.4441, | |
| "step": 3690 | |
| }, | |
| { | |
| "epoch": 0.5646376590427865, | |
| "grad_norm": 2.8721094131469727, | |
| "learning_rate": 4.748384677091631e-06, | |
| "loss": 0.442, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.5661637067699292, | |
| "grad_norm": 2.5854904651641846, | |
| "learning_rate": 4.721784912213209e-06, | |
| "loss": 0.4523, | |
| "step": 3710 | |
| }, | |
| { | |
| "epoch": 0.5676897544970719, | |
| "grad_norm": 2.962846279144287, | |
| "learning_rate": 4.695193043516629e-06, | |
| "loss": 0.4208, | |
| "step": 3720 | |
| }, | |
| { | |
| "epoch": 0.5692158022242145, | |
| "grad_norm": 3.385815382003784, | |
| "learning_rate": 4.668609825720953e-06, | |
| "loss": 0.4212, | |
| "step": 3730 | |
| }, | |
| { | |
| "epoch": 0.5707418499513572, | |
| "grad_norm": 3.1869301795959473, | |
| "learning_rate": 4.642036013299716e-06, | |
| "loss": 0.4366, | |
| "step": 3740 | |
| }, | |
| { | |
| "epoch": 0.5722678976785, | |
| "grad_norm": 2.8463032245635986, | |
| "learning_rate": 4.615472360459519e-06, | |
| "loss": 0.4318, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 0.5737939454056425, | |
| "grad_norm": 2.9622936248779297, | |
| "learning_rate": 4.588919621118615e-06, | |
| "loss": 0.4352, | |
| "step": 3760 | |
| }, | |
| { | |
| "epoch": 0.5753199931327853, | |
| "grad_norm": 2.8792457580566406, | |
| "learning_rate": 4.562378548885519e-06, | |
| "loss": 0.4473, | |
| "step": 3770 | |
| }, | |
| { | |
| "epoch": 0.5768460408599279, | |
| "grad_norm": 2.265306234359741, | |
| "learning_rate": 4.535849897037607e-06, | |
| "loss": 0.4603, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 0.5783720885870706, | |
| "grad_norm": 2.707305431365967, | |
| "learning_rate": 4.50933441849975e-06, | |
| "loss": 0.4211, | |
| "step": 3790 | |
| }, | |
| { | |
| "epoch": 0.5798981363142133, | |
| "grad_norm": 2.334364891052246, | |
| "learning_rate": 4.4828328658229406e-06, | |
| "loss": 0.4478, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.5814241840413559, | |
| "grad_norm": 2.8786513805389404, | |
| "learning_rate": 4.456345991162933e-06, | |
| "loss": 0.4451, | |
| "step": 3810 | |
| }, | |
| { | |
| "epoch": 0.5829502317684986, | |
| "grad_norm": 2.4972634315490723, | |
| "learning_rate": 4.429874546258893e-06, | |
| "loss": 0.4675, | |
| "step": 3820 | |
| }, | |
| { | |
| "epoch": 0.5844762794956412, | |
| "grad_norm": 3.0034310817718506, | |
| "learning_rate": 4.40341928241207e-06, | |
| "loss": 0.4294, | |
| "step": 3830 | |
| }, | |
| { | |
| "epoch": 0.5860023272227839, | |
| "grad_norm": 3.1585464477539062, | |
| "learning_rate": 4.376980950464467e-06, | |
| "loss": 0.4143, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 0.5875283749499266, | |
| "grad_norm": 2.7901790142059326, | |
| "learning_rate": 4.350560300777533e-06, | |
| "loss": 0.459, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 0.5890544226770692, | |
| "grad_norm": 2.7392337322235107, | |
| "learning_rate": 4.324158083210867e-06, | |
| "loss": 0.4349, | |
| "step": 3860 | |
| }, | |
| { | |
| "epoch": 0.5905804704042119, | |
| "grad_norm": 2.9482500553131104, | |
| "learning_rate": 4.297775047100935e-06, | |
| "loss": 0.4362, | |
| "step": 3870 | |
| }, | |
| { | |
| "epoch": 0.5921065181313545, | |
| "grad_norm": 2.583388566970825, | |
| "learning_rate": 4.271411941239809e-06, | |
| "loss": 0.4702, | |
| "step": 3880 | |
| }, | |
| { | |
| "epoch": 0.5936325658584972, | |
| "grad_norm": 2.743952512741089, | |
| "learning_rate": 4.245069513853897e-06, | |
| "loss": 0.4401, | |
| "step": 3890 | |
| }, | |
| { | |
| "epoch": 0.5951586135856399, | |
| "grad_norm": 2.7953054904937744, | |
| "learning_rate": 4.218748512582732e-06, | |
| "loss": 0.4425, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.5966846613127825, | |
| "grad_norm": 2.898608684539795, | |
| "learning_rate": 4.19244968445773e-06, | |
| "loss": 0.4488, | |
| "step": 3910 | |
| }, | |
| { | |
| "epoch": 0.5982107090399252, | |
| "grad_norm": 2.4456114768981934, | |
| "learning_rate": 4.166173775881007e-06, | |
| "loss": 0.4226, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 0.5997367567670678, | |
| "grad_norm": 3.178201913833618, | |
| "learning_rate": 4.139921532604177e-06, | |
| "loss": 0.4267, | |
| "step": 3930 | |
| }, | |
| { | |
| "epoch": 0.6012628044942105, | |
| "grad_norm": 2.3235249519348145, | |
| "learning_rate": 4.113693699707203e-06, | |
| "loss": 0.4486, | |
| "step": 3940 | |
| }, | |
| { | |
| "epoch": 0.6027888522213533, | |
| "grad_norm": 2.7228195667266846, | |
| "learning_rate": 4.0874910215772426e-06, | |
| "loss": 0.4271, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 0.6043148999484959, | |
| "grad_norm": 2.8123674392700195, | |
| "learning_rate": 4.061314241887516e-06, | |
| "loss": 0.4111, | |
| "step": 3960 | |
| }, | |
| { | |
| "epoch": 0.6058409476756386, | |
| "grad_norm": 2.610856533050537, | |
| "learning_rate": 4.03516410357621e-06, | |
| "loss": 0.4229, | |
| "step": 3970 | |
| }, | |
| { | |
| "epoch": 0.6073669954027813, | |
| "grad_norm": 2.711467742919922, | |
| "learning_rate": 4.009041348825383e-06, | |
| "loss": 0.4265, | |
| "step": 3980 | |
| }, | |
| { | |
| "epoch": 0.6088930431299239, | |
| "grad_norm": 3.1023402214050293, | |
| "learning_rate": 3.982946719039911e-06, | |
| "loss": 0.4249, | |
| "step": 3990 | |
| }, | |
| { | |
| "epoch": 0.6104190908570666, | |
| "grad_norm": 2.8750693798065186, | |
| "learning_rate": 3.956880954826435e-06, | |
| "loss": 0.4333, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.6104190908570666, | |
| "eval_loss": 0.4167136251926422, | |
| "eval_runtime": 99.9809, | |
| "eval_samples_per_second": 5.301, | |
| "eval_steps_per_second": 2.651, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.6119451385842092, | |
| "grad_norm": 2.5793118476867676, | |
| "learning_rate": 3.930844795972344e-06, | |
| "loss": 0.4512, | |
| "step": 4010 | |
| }, | |
| { | |
| "epoch": 0.6134711863113519, | |
| "grad_norm": 3.219802141189575, | |
| "learning_rate": 3.904838981424785e-06, | |
| "loss": 0.4203, | |
| "step": 4020 | |
| }, | |
| { | |
| "epoch": 0.6149972340384946, | |
| "grad_norm": 3.2610111236572266, | |
| "learning_rate": 3.878864249269681e-06, | |
| "loss": 0.439, | |
| "step": 4030 | |
| }, | |
| { | |
| "epoch": 0.6165232817656372, | |
| "grad_norm": 2.9082117080688477, | |
| "learning_rate": 3.852921336710794e-06, | |
| "loss": 0.4365, | |
| "step": 4040 | |
| }, | |
| { | |
| "epoch": 0.6180493294927799, | |
| "grad_norm": 3.1823573112487793, | |
| "learning_rate": 3.827010980048787e-06, | |
| "loss": 0.4377, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 0.6195753772199225, | |
| "grad_norm": 2.5103094577789307, | |
| "learning_rate": 3.801133914660344e-06, | |
| "loss": 0.4389, | |
| "step": 4060 | |
| }, | |
| { | |
| "epoch": 0.6211014249470652, | |
| "grad_norm": 2.895665168762207, | |
| "learning_rate": 3.7752908749772865e-06, | |
| "loss": 0.4417, | |
| "step": 4070 | |
| }, | |
| { | |
| "epoch": 0.6226274726742079, | |
| "grad_norm": 3.2190115451812744, | |
| "learning_rate": 3.749482594465733e-06, | |
| "loss": 0.4232, | |
| "step": 4080 | |
| }, | |
| { | |
| "epoch": 0.6241535204013505, | |
| "grad_norm": 2.946439504623413, | |
| "learning_rate": 3.7237098056052816e-06, | |
| "loss": 0.4205, | |
| "step": 4090 | |
| }, | |
| { | |
| "epoch": 0.6256795681284932, | |
| "grad_norm": 2.475071907043457, | |
| "learning_rate": 3.6979732398682223e-06, | |
| "loss": 0.4627, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.6272056158556358, | |
| "grad_norm": 3.475555181503296, | |
| "learning_rate": 3.672273627698775e-06, | |
| "loss": 0.4135, | |
| "step": 4110 | |
| }, | |
| { | |
| "epoch": 0.6287316635827785, | |
| "grad_norm": 2.8138234615325928, | |
| "learning_rate": 3.646611698492364e-06, | |
| "loss": 0.4203, | |
| "step": 4120 | |
| }, | |
| { | |
| "epoch": 0.6302577113099213, | |
| "grad_norm": 3.4674346446990967, | |
| "learning_rate": 3.6209881805749025e-06, | |
| "loss": 0.4225, | |
| "step": 4130 | |
| }, | |
| { | |
| "epoch": 0.6317837590370639, | |
| "grad_norm": 2.8863394260406494, | |
| "learning_rate": 3.59540380118214e-06, | |
| "loss": 0.4256, | |
| "step": 4140 | |
| }, | |
| { | |
| "epoch": 0.6333098067642066, | |
| "grad_norm": 2.5023999214172363, | |
| "learning_rate": 3.5698592864390085e-06, | |
| "loss": 0.4494, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 0.6348358544913493, | |
| "grad_norm": 3.108616828918457, | |
| "learning_rate": 3.544355361339017e-06, | |
| "loss": 0.4278, | |
| "step": 4160 | |
| }, | |
| { | |
| "epoch": 0.6363619022184919, | |
| "grad_norm": 3.5009264945983887, | |
| "learning_rate": 3.51889274972368e-06, | |
| "loss": 0.4145, | |
| "step": 4170 | |
| }, | |
| { | |
| "epoch": 0.6378879499456346, | |
| "grad_norm": 4.06900691986084, | |
| "learning_rate": 3.4934721742619714e-06, | |
| "loss": 0.4327, | |
| "step": 4180 | |
| }, | |
| { | |
| "epoch": 0.6394139976727772, | |
| "grad_norm": 3.3994603157043457, | |
| "learning_rate": 3.46809435642981e-06, | |
| "loss": 0.4123, | |
| "step": 4190 | |
| }, | |
| { | |
| "epoch": 0.6409400453999199, | |
| "grad_norm": 2.9589715003967285, | |
| "learning_rate": 3.442760016489586e-06, | |
| "loss": 0.3977, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.6424660931270626, | |
| "grad_norm": 2.398531436920166, | |
| "learning_rate": 3.4174698734697177e-06, | |
| "loss": 0.4197, | |
| "step": 4210 | |
| }, | |
| { | |
| "epoch": 0.6439921408542052, | |
| "grad_norm": 3.2008326053619385, | |
| "learning_rate": 3.3922246451442474e-06, | |
| "loss": 0.4286, | |
| "step": 4220 | |
| }, | |
| { | |
| "epoch": 0.6455181885813479, | |
| "grad_norm": 3.441654920578003, | |
| "learning_rate": 3.3670250480124712e-06, | |
| "loss": 0.4568, | |
| "step": 4230 | |
| }, | |
| { | |
| "epoch": 0.6470442363084905, | |
| "grad_norm": 2.481886386871338, | |
| "learning_rate": 3.3418717972785906e-06, | |
| "loss": 0.403, | |
| "step": 4240 | |
| }, | |
| { | |
| "epoch": 0.6485702840356332, | |
| "grad_norm": 2.639709949493408, | |
| "learning_rate": 3.316765606831432e-06, | |
| "loss": 0.4567, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 0.6500963317627759, | |
| "grad_norm": 2.857813596725464, | |
| "learning_rate": 3.2917071892241714e-06, | |
| "loss": 0.4147, | |
| "step": 4260 | |
| }, | |
| { | |
| "epoch": 0.6516223794899185, | |
| "grad_norm": 2.872697353363037, | |
| "learning_rate": 3.266697255654123e-06, | |
| "loss": 0.4138, | |
| "step": 4270 | |
| }, | |
| { | |
| "epoch": 0.6531484272170612, | |
| "grad_norm": 3.090141773223877, | |
| "learning_rate": 3.2417365159425383e-06, | |
| "loss": 0.4238, | |
| "step": 4280 | |
| }, | |
| { | |
| "epoch": 0.6546744749442038, | |
| "grad_norm": 3.7141196727752686, | |
| "learning_rate": 3.216825678514478e-06, | |
| "loss": 0.4198, | |
| "step": 4290 | |
| }, | |
| { | |
| "epoch": 0.6562005226713465, | |
| "grad_norm": 2.859290838241577, | |
| "learning_rate": 3.1919654503786935e-06, | |
| "loss": 0.4159, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.6577265703984893, | |
| "grad_norm": 3.016757011413574, | |
| "learning_rate": 3.1671565371075687e-06, | |
| "loss": 0.4218, | |
| "step": 4310 | |
| }, | |
| { | |
| "epoch": 0.6592526181256319, | |
| "grad_norm": 2.7048025131225586, | |
| "learning_rate": 3.142399642817084e-06, | |
| "loss": 0.4327, | |
| "step": 4320 | |
| }, | |
| { | |
| "epoch": 0.6607786658527746, | |
| "grad_norm": 2.9763782024383545, | |
| "learning_rate": 3.117695470146844e-06, | |
| "loss": 0.4071, | |
| "step": 4330 | |
| }, | |
| { | |
| "epoch": 0.6623047135799172, | |
| "grad_norm": 2.885979652404785, | |
| "learning_rate": 3.0930447202401303e-06, | |
| "loss": 0.4279, | |
| "step": 4340 | |
| }, | |
| { | |
| "epoch": 0.6638307613070599, | |
| "grad_norm": 2.6588382720947266, | |
| "learning_rate": 3.0684480927240057e-06, | |
| "loss": 0.4199, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 0.6653568090342026, | |
| "grad_norm": 3.2519760131835938, | |
| "learning_rate": 3.0439062856894463e-06, | |
| "loss": 0.4067, | |
| "step": 4360 | |
| }, | |
| { | |
| "epoch": 0.6668828567613452, | |
| "grad_norm": 5.050004482269287, | |
| "learning_rate": 3.0194199956715443e-06, | |
| "loss": 0.4256, | |
| "step": 4370 | |
| }, | |
| { | |
| "epoch": 0.6684089044884879, | |
| "grad_norm": 2.7873661518096924, | |
| "learning_rate": 2.994989917629726e-06, | |
| "loss": 0.4203, | |
| "step": 4380 | |
| }, | |
| { | |
| "epoch": 0.6699349522156306, | |
| "grad_norm": 2.685523509979248, | |
| "learning_rate": 2.9706167449280404e-06, | |
| "loss": 0.4546, | |
| "step": 4390 | |
| }, | |
| { | |
| "epoch": 0.6714609999427732, | |
| "grad_norm": 2.6410155296325684, | |
| "learning_rate": 2.9463011693154643e-06, | |
| "loss": 0.4257, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.6729870476699159, | |
| "grad_norm": 2.6280877590179443, | |
| "learning_rate": 2.9220438809062855e-06, | |
| "loss": 0.4154, | |
| "step": 4410 | |
| }, | |
| { | |
| "epoch": 0.6745130953970585, | |
| "grad_norm": 2.6469454765319824, | |
| "learning_rate": 2.897845568160508e-06, | |
| "loss": 0.4245, | |
| "step": 4420 | |
| }, | |
| { | |
| "epoch": 0.6760391431242012, | |
| "grad_norm": 3.425985097885132, | |
| "learning_rate": 2.873706917864314e-06, | |
| "loss": 0.4173, | |
| "step": 4430 | |
| }, | |
| { | |
| "epoch": 0.6775651908513439, | |
| "grad_norm": 2.8293681144714355, | |
| "learning_rate": 2.8496286151105644e-06, | |
| "loss": 0.4337, | |
| "step": 4440 | |
| }, | |
| { | |
| "epoch": 0.6790912385784865, | |
| "grad_norm": 2.9185335636138916, | |
| "learning_rate": 2.825611343279374e-06, | |
| "loss": 0.4132, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 0.6806172863056292, | |
| "grad_norm": 2.4097468852996826, | |
| "learning_rate": 2.801655784018696e-06, | |
| "loss": 0.4206, | |
| "step": 4460 | |
| }, | |
| { | |
| "epoch": 0.6821433340327718, | |
| "grad_norm": 2.4502408504486084, | |
| "learning_rate": 2.777762617224985e-06, | |
| "loss": 0.4192, | |
| "step": 4470 | |
| }, | |
| { | |
| "epoch": 0.6836693817599145, | |
| "grad_norm": 2.847097873687744, | |
| "learning_rate": 2.7539325210239e-06, | |
| "loss": 0.4347, | |
| "step": 4480 | |
| }, | |
| { | |
| "epoch": 0.6851954294870573, | |
| "grad_norm": 3.346667766571045, | |
| "learning_rate": 2.730166171751056e-06, | |
| "loss": 0.4328, | |
| "step": 4490 | |
| }, | |
| { | |
| "epoch": 0.6867214772141998, | |
| "grad_norm": 2.8015189170837402, | |
| "learning_rate": 2.706464243932836e-06, | |
| "loss": 0.4153, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.6867214772141998, | |
| "eval_loss": 0.40170425176620483, | |
| "eval_runtime": 99.9566, | |
| "eval_samples_per_second": 5.302, | |
| "eval_steps_per_second": 2.651, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.6882475249413426, | |
| "grad_norm": 3.115753173828125, | |
| "learning_rate": 2.6828274102672292e-06, | |
| "loss": 0.4192, | |
| "step": 4510 | |
| }, | |
| { | |
| "epoch": 0.6897735726684852, | |
| "grad_norm": 3.2326269149780273, | |
| "learning_rate": 2.6592563416047616e-06, | |
| "loss": 0.4203, | |
| "step": 4520 | |
| }, | |
| { | |
| "epoch": 0.6912996203956279, | |
| "grad_norm": 2.6536991596221924, | |
| "learning_rate": 2.6357517069294397e-06, | |
| "loss": 0.4023, | |
| "step": 4530 | |
| }, | |
| { | |
| "epoch": 0.6928256681227706, | |
| "grad_norm": 3.132383346557617, | |
| "learning_rate": 2.6123141733397695e-06, | |
| "loss": 0.4251, | |
| "step": 4540 | |
| }, | |
| { | |
| "epoch": 0.6943517158499132, | |
| "grad_norm": 2.5857174396514893, | |
| "learning_rate": 2.5889444060298217e-06, | |
| "loss": 0.3893, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 0.6958777635770559, | |
| "grad_norm": 2.402454137802124, | |
| "learning_rate": 2.5656430682703547e-06, | |
| "loss": 0.3777, | |
| "step": 4560 | |
| }, | |
| { | |
| "epoch": 0.6974038113041985, | |
| "grad_norm": 3.208631992340088, | |
| "learning_rate": 2.5424108213899902e-06, | |
| "loss": 0.4077, | |
| "step": 4570 | |
| }, | |
| { | |
| "epoch": 0.6989298590313412, | |
| "grad_norm": 3.2448372840881348, | |
| "learning_rate": 2.5192483247564393e-06, | |
| "loss": 0.4218, | |
| "step": 4580 | |
| }, | |
| { | |
| "epoch": 0.7004559067584839, | |
| "grad_norm": 2.6008667945861816, | |
| "learning_rate": 2.496156235757792e-06, | |
| "loss": 0.4466, | |
| "step": 4590 | |
| }, | |
| { | |
| "epoch": 0.7019819544856265, | |
| "grad_norm": 2.548492431640625, | |
| "learning_rate": 2.47313520978386e-06, | |
| "loss": 0.3949, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.7035080022127692, | |
| "grad_norm": 2.979745626449585, | |
| "learning_rate": 2.4501859002075713e-06, | |
| "loss": 0.4244, | |
| "step": 4610 | |
| }, | |
| { | |
| "epoch": 0.7050340499399119, | |
| "grad_norm": 2.696276903152466, | |
| "learning_rate": 2.4273089583664376e-06, | |
| "loss": 0.4144, | |
| "step": 4620 | |
| }, | |
| { | |
| "epoch": 0.7065600976670545, | |
| "grad_norm": 2.8598382472991943, | |
| "learning_rate": 2.404505033544048e-06, | |
| "loss": 0.391, | |
| "step": 4630 | |
| }, | |
| { | |
| "epoch": 0.7080861453941972, | |
| "grad_norm": 3.1868958473205566, | |
| "learning_rate": 2.381774772951666e-06, | |
| "loss": 0.4227, | |
| "step": 4640 | |
| }, | |
| { | |
| "epoch": 0.7096121931213398, | |
| "grad_norm": 3.3258581161499023, | |
| "learning_rate": 2.359118821709842e-06, | |
| "loss": 0.4306, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 0.7111382408484825, | |
| "grad_norm": 2.390016794204712, | |
| "learning_rate": 2.3365378228301107e-06, | |
| "loss": 0.4245, | |
| "step": 4660 | |
| }, | |
| { | |
| "epoch": 0.7126642885756252, | |
| "grad_norm": 2.966630220413208, | |
| "learning_rate": 2.314032417196742e-06, | |
| "loss": 0.4135, | |
| "step": 4670 | |
| }, | |
| { | |
| "epoch": 0.7141903363027678, | |
| "grad_norm": 3.0981130599975586, | |
| "learning_rate": 2.2916032435485477e-06, | |
| "loss": 0.4195, | |
| "step": 4680 | |
| }, | |
| { | |
| "epoch": 0.7157163840299106, | |
| "grad_norm": 2.610236644744873, | |
| "learning_rate": 2.269250938460762e-06, | |
| "loss": 0.4101, | |
| "step": 4690 | |
| }, | |
| { | |
| "epoch": 0.7172424317570532, | |
| "grad_norm": 2.589944839477539, | |
| "learning_rate": 2.246976136326963e-06, | |
| "loss": 0.4167, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 0.7187684794841959, | |
| "grad_norm": 2.9684040546417236, | |
| "learning_rate": 2.2247794693410746e-06, | |
| "loss": 0.3946, | |
| "step": 4710 | |
| }, | |
| { | |
| "epoch": 0.7202945272113386, | |
| "grad_norm": 2.7830798625946045, | |
| "learning_rate": 2.202661567479423e-06, | |
| "loss": 0.4112, | |
| "step": 4720 | |
| }, | |
| { | |
| "epoch": 0.7218205749384812, | |
| "grad_norm": 3.1135716438293457, | |
| "learning_rate": 2.180623058482853e-06, | |
| "loss": 0.4371, | |
| "step": 4730 | |
| }, | |
| { | |
| "epoch": 0.7233466226656239, | |
| "grad_norm": 2.432995557785034, | |
| "learning_rate": 2.158664567838924e-06, | |
| "loss": 0.3921, | |
| "step": 4740 | |
| }, | |
| { | |
| "epoch": 0.7248726703927665, | |
| "grad_norm": 2.704394817352295, | |
| "learning_rate": 2.136786718764135e-06, | |
| "loss": 0.4117, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 0.7263987181199092, | |
| "grad_norm": 3.2139337062835693, | |
| "learning_rate": 2.1149901321862624e-06, | |
| "loss": 0.3888, | |
| "step": 4760 | |
| }, | |
| { | |
| "epoch": 0.7279247658470519, | |
| "grad_norm": 2.8158254623413086, | |
| "learning_rate": 2.093275426726722e-06, | |
| "loss": 0.3953, | |
| "step": 4770 | |
| }, | |
| { | |
| "epoch": 0.7294508135741945, | |
| "grad_norm": 2.6510894298553467, | |
| "learning_rate": 2.0716432186830064e-06, | |
| "loss": 0.4003, | |
| "step": 4780 | |
| }, | |
| { | |
| "epoch": 0.7309768613013372, | |
| "grad_norm": 3.2726964950561523, | |
| "learning_rate": 2.0500941220112153e-06, | |
| "loss": 0.4097, | |
| "step": 4790 | |
| }, | |
| { | |
| "epoch": 0.7325029090284799, | |
| "grad_norm": 2.590909242630005, | |
| "learning_rate": 2.0286287483086046e-06, | |
| "loss": 0.4016, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.7340289567556225, | |
| "grad_norm": 2.8233532905578613, | |
| "learning_rate": 2.007247706796254e-06, | |
| "loss": 0.4012, | |
| "step": 4810 | |
| }, | |
| { | |
| "epoch": 0.7355550044827652, | |
| "grad_norm": 2.646611452102661, | |
| "learning_rate": 1.985951604301746e-06, | |
| "loss": 0.402, | |
| "step": 4820 | |
| }, | |
| { | |
| "epoch": 0.7370810522099078, | |
| "grad_norm": 2.817006826400757, | |
| "learning_rate": 1.9647410452419763e-06, | |
| "loss": 0.4361, | |
| "step": 4830 | |
| }, | |
| { | |
| "epoch": 0.7386070999370505, | |
| "grad_norm": 2.7627429962158203, | |
| "learning_rate": 1.943616631605973e-06, | |
| "loss": 0.4032, | |
| "step": 4840 | |
| }, | |
| { | |
| "epoch": 0.7401331476641932, | |
| "grad_norm": 2.847055673599243, | |
| "learning_rate": 1.922578962937826e-06, | |
| "loss": 0.3871, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 0.7416591953913358, | |
| "grad_norm": 3.007472515106201, | |
| "learning_rate": 1.9016286363196656e-06, | |
| "loss": 0.3938, | |
| "step": 4860 | |
| }, | |
| { | |
| "epoch": 0.7431852431184786, | |
| "grad_norm": 3.840334177017212, | |
| "learning_rate": 1.8807662463547156e-06, | |
| "loss": 0.416, | |
| "step": 4870 | |
| }, | |
| { | |
| "epoch": 0.7447112908456212, | |
| "grad_norm": 2.870105504989624, | |
| "learning_rate": 1.8599923851504237e-06, | |
| "loss": 0.4244, | |
| "step": 4880 | |
| }, | |
| { | |
| "epoch": 0.7462373385727639, | |
| "grad_norm": 2.780932664871216, | |
| "learning_rate": 1.8393076423016493e-06, | |
| "loss": 0.4187, | |
| "step": 4890 | |
| }, | |
| { | |
| "epoch": 0.7477633862999066, | |
| "grad_norm": 3.142162561416626, | |
| "learning_rate": 1.8187126048739284e-06, | |
| "loss": 0.3896, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 0.7492894340270492, | |
| "grad_norm": 2.7128045558929443, | |
| "learning_rate": 1.7982078573868245e-06, | |
| "loss": 0.4062, | |
| "step": 4910 | |
| }, | |
| { | |
| "epoch": 0.7508154817541919, | |
| "grad_norm": 2.843827247619629, | |
| "learning_rate": 1.7777939817973238e-06, | |
| "loss": 0.4194, | |
| "step": 4920 | |
| }, | |
| { | |
| "epoch": 0.7523415294813345, | |
| "grad_norm": 2.7338247299194336, | |
| "learning_rate": 1.7574715574833324e-06, | |
| "loss": 0.4018, | |
| "step": 4930 | |
| }, | |
| { | |
| "epoch": 0.7538675772084772, | |
| "grad_norm": 2.4693098068237305, | |
| "learning_rate": 1.7372411612272149e-06, | |
| "loss": 0.4073, | |
| "step": 4940 | |
| }, | |
| { | |
| "epoch": 0.7553936249356199, | |
| "grad_norm": 2.3850250244140625, | |
| "learning_rate": 1.7171033671994418e-06, | |
| "loss": 0.3987, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 0.7569196726627625, | |
| "grad_norm": 2.474433183670044, | |
| "learning_rate": 1.6970587469422889e-06, | |
| "loss": 0.4142, | |
| "step": 4960 | |
| }, | |
| { | |
| "epoch": 0.7584457203899052, | |
| "grad_norm": 2.722198247909546, | |
| "learning_rate": 1.6771078693536075e-06, | |
| "loss": 0.3817, | |
| "step": 4970 | |
| }, | |
| { | |
| "epoch": 0.7599717681170478, | |
| "grad_norm": 2.9978530406951904, | |
| "learning_rate": 1.657251300670688e-06, | |
| "loss": 0.4322, | |
| "step": 4980 | |
| }, | |
| { | |
| "epoch": 0.7614978158441905, | |
| "grad_norm": 2.862351179122925, | |
| "learning_rate": 1.6374896044541854e-06, | |
| "loss": 0.4025, | |
| "step": 4990 | |
| }, | |
| { | |
| "epoch": 0.7630238635713332, | |
| "grad_norm": 3.3892769813537598, | |
| "learning_rate": 1.6178233415721228e-06, | |
| "loss": 0.3973, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.7630238635713332, | |
| "eval_loss": 0.39075401425361633, | |
| "eval_runtime": 99.9532, | |
| "eval_samples_per_second": 5.302, | |
| "eval_steps_per_second": 2.651, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.7645499112984758, | |
| "grad_norm": 3.2107391357421875, | |
| "learning_rate": 1.5982530701839815e-06, | |
| "loss": 0.4204, | |
| "step": 5010 | |
| }, | |
| { | |
| "epoch": 0.7660759590256185, | |
| "grad_norm": 2.611640691757202, | |
| "learning_rate": 1.5787793457248425e-06, | |
| "loss": 0.3878, | |
| "step": 5020 | |
| }, | |
| { | |
| "epoch": 0.7676020067527612, | |
| "grad_norm": 3.0495364665985107, | |
| "learning_rate": 1.5594027208896433e-06, | |
| "loss": 0.4075, | |
| "step": 5030 | |
| }, | |
| { | |
| "epoch": 0.7691280544799038, | |
| "grad_norm": 3.080049991607666, | |
| "learning_rate": 1.5401237456174755e-06, | |
| "loss": 0.4277, | |
| "step": 5040 | |
| }, | |
| { | |
| "epoch": 0.7706541022070466, | |
| "grad_norm": 2.9949750900268555, | |
| "learning_rate": 1.5209429670759874e-06, | |
| "loss": 0.4008, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 0.7721801499341892, | |
| "grad_norm": 2.866619348526001, | |
| "learning_rate": 1.5018609296458425e-06, | |
| "loss": 0.3935, | |
| "step": 5060 | |
| }, | |
| { | |
| "epoch": 0.7737061976613319, | |
| "grad_norm": 2.5663259029388428, | |
| "learning_rate": 1.4828781749052807e-06, | |
| "loss": 0.3999, | |
| "step": 5070 | |
| }, | |
| { | |
| "epoch": 0.7752322453884746, | |
| "grad_norm": 2.572970151901245, | |
| "learning_rate": 1.4639952416147457e-06, | |
| "loss": 0.4095, | |
| "step": 5080 | |
| }, | |
| { | |
| "epoch": 0.7767582931156172, | |
| "grad_norm": 2.68562650680542, | |
| "learning_rate": 1.445212665701583e-06, | |
| "loss": 0.419, | |
| "step": 5090 | |
| }, | |
| { | |
| "epoch": 0.7782843408427599, | |
| "grad_norm": 3.246164321899414, | |
| "learning_rate": 1.426530980244848e-06, | |
| "loss": 0.4138, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 0.7798103885699025, | |
| "grad_norm": 3.2083702087402344, | |
| "learning_rate": 1.407950715460159e-06, | |
| "loss": 0.4208, | |
| "step": 5110 | |
| }, | |
| { | |
| "epoch": 0.7813364362970452, | |
| "grad_norm": 3.3285470008850098, | |
| "learning_rate": 1.389472398684658e-06, | |
| "loss": 0.3787, | |
| "step": 5120 | |
| }, | |
| { | |
| "epoch": 0.7828624840241879, | |
| "grad_norm": 2.917363405227661, | |
| "learning_rate": 1.3710965543620442e-06, | |
| "loss": 0.4057, | |
| "step": 5130 | |
| }, | |
| { | |
| "epoch": 0.7843885317513305, | |
| "grad_norm": 4.440506935119629, | |
| "learning_rate": 1.3528237040276825e-06, | |
| "loss": 0.3901, | |
| "step": 5140 | |
| }, | |
| { | |
| "epoch": 0.7859145794784732, | |
| "grad_norm": 4.1051788330078125, | |
| "learning_rate": 1.3346543662938132e-06, | |
| "loss": 0.4323, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 0.7874406272056158, | |
| "grad_norm": 2.771667242050171, | |
| "learning_rate": 1.316589056834821e-06, | |
| "loss": 0.4127, | |
| "step": 5160 | |
| }, | |
| { | |
| "epoch": 0.7889666749327585, | |
| "grad_norm": 2.9452033042907715, | |
| "learning_rate": 1.2986282883726065e-06, | |
| "loss": 0.4235, | |
| "step": 5170 | |
| }, | |
| { | |
| "epoch": 0.7904927226599012, | |
| "grad_norm": 3.01763916015625, | |
| "learning_rate": 1.2807725706620317e-06, | |
| "loss": 0.4059, | |
| "step": 5180 | |
| }, | |
| { | |
| "epoch": 0.7920187703870438, | |
| "grad_norm": 3.0742299556732178, | |
| "learning_rate": 1.2630224104764516e-06, | |
| "loss": 0.416, | |
| "step": 5190 | |
| }, | |
| { | |
| "epoch": 0.7935448181141865, | |
| "grad_norm": 2.9382846355438232, | |
| "learning_rate": 1.2453783115933387e-06, | |
| "loss": 0.3976, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 0.7950708658413291, | |
| "grad_norm": 2.8469552993774414, | |
| "learning_rate": 1.2278407747799687e-06, | |
| "loss": 0.407, | |
| "step": 5210 | |
| }, | |
| { | |
| "epoch": 0.7965969135684718, | |
| "grad_norm": 2.8619937896728516, | |
| "learning_rate": 1.2104102977792282e-06, | |
| "loss": 0.3873, | |
| "step": 5220 | |
| }, | |
| { | |
| "epoch": 0.7981229612956146, | |
| "grad_norm": 3.257063388824463, | |
| "learning_rate": 1.1930873752954725e-06, | |
| "loss": 0.3868, | |
| "step": 5230 | |
| }, | |
| { | |
| "epoch": 0.7996490090227572, | |
| "grad_norm": 3.016730785369873, | |
| "learning_rate": 1.1758724989804908e-06, | |
| "loss": 0.4081, | |
| "step": 5240 | |
| }, | |
| { | |
| "epoch": 0.8011750567498999, | |
| "grad_norm": 3.1905858516693115, | |
| "learning_rate": 1.1587661574195536e-06, | |
| "loss": 0.3895, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 0.8027011044770426, | |
| "grad_norm": 2.977105140686035, | |
| "learning_rate": 1.1417688361175422e-06, | |
| "loss": 0.4028, | |
| "step": 5260 | |
| }, | |
| { | |
| "epoch": 0.8042271522041852, | |
| "grad_norm": 2.6317853927612305, | |
| "learning_rate": 1.1248810174851755e-06, | |
| "loss": 0.377, | |
| "step": 5270 | |
| }, | |
| { | |
| "epoch": 0.8057531999313279, | |
| "grad_norm": 3.0427422523498535, | |
| "learning_rate": 1.1081031808253096e-06, | |
| "loss": 0.3763, | |
| "step": 5280 | |
| }, | |
| { | |
| "epoch": 0.8072792476584705, | |
| "grad_norm": 3.044510841369629, | |
| "learning_rate": 1.0914358023193428e-06, | |
| "loss": 0.382, | |
| "step": 5290 | |
| }, | |
| { | |
| "epoch": 0.8088052953856132, | |
| "grad_norm": 2.3702402114868164, | |
| "learning_rate": 1.0748793550136949e-06, | |
| "loss": 0.4003, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 0.8103313431127559, | |
| "grad_norm": 2.994016647338867, | |
| "learning_rate": 1.0584343088063837e-06, | |
| "loss": 0.3966, | |
| "step": 5310 | |
| }, | |
| { | |
| "epoch": 0.8118573908398985, | |
| "grad_norm": 3.0523016452789307, | |
| "learning_rate": 1.0421011304336932e-06, | |
| "loss": 0.4129, | |
| "step": 5320 | |
| }, | |
| { | |
| "epoch": 0.8133834385670412, | |
| "grad_norm": 3.0171058177948, | |
| "learning_rate": 1.0258802834569137e-06, | |
| "loss": 0.3687, | |
| "step": 5330 | |
| }, | |
| { | |
| "epoch": 0.8149094862941838, | |
| "grad_norm": 2.748992681503296, | |
| "learning_rate": 1.0097722282492023e-06, | |
| "loss": 0.3936, | |
| "step": 5340 | |
| }, | |
| { | |
| "epoch": 0.8164355340213265, | |
| "grad_norm": 2.96637225151062, | |
| "learning_rate": 9.93777421982503e-07, | |
| "loss": 0.4234, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 0.8179615817484692, | |
| "grad_norm": 3.161268949508667, | |
| "learning_rate": 9.778963186145796e-07, | |
| "loss": 0.3938, | |
| "step": 5360 | |
| }, | |
| { | |
| "epoch": 0.8194876294756118, | |
| "grad_norm": 2.630280017852783, | |
| "learning_rate": 9.621293688761263e-07, | |
| "loss": 0.3739, | |
| "step": 5370 | |
| }, | |
| { | |
| "epoch": 0.8210136772027545, | |
| "grad_norm": 3.382284641265869, | |
| "learning_rate": 9.464770202579787e-07, | |
| "loss": 0.3918, | |
| "step": 5380 | |
| }, | |
| { | |
| "epoch": 0.8225397249298971, | |
| "grad_norm": 3.014678716659546, | |
| "learning_rate": 9.309397169984158e-07, | |
| "loss": 0.4185, | |
| "step": 5390 | |
| }, | |
| { | |
| "epoch": 0.8240657726570398, | |
| "grad_norm": 2.7957093715667725, | |
| "learning_rate": 9.155179000705399e-07, | |
| "loss": 0.3897, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 0.8255918203841826, | |
| "grad_norm": 3.4030938148498535, | |
| "learning_rate": 9.00212007169779e-07, | |
| "loss": 0.3989, | |
| "step": 5410 | |
| }, | |
| { | |
| "epoch": 0.8271178681113251, | |
| "grad_norm": 3.2532286643981934, | |
| "learning_rate": 8.850224727014489e-07, | |
| "loss": 0.4053, | |
| "step": 5420 | |
| }, | |
| { | |
| "epoch": 0.8286439158384679, | |
| "grad_norm": 3.2262065410614014, | |
| "learning_rate": 8.699497277684326e-07, | |
| "loss": 0.413, | |
| "step": 5430 | |
| }, | |
| { | |
| "epoch": 0.8301699635656106, | |
| "grad_norm": 3.0282540321350098, | |
| "learning_rate": 8.549942001589406e-07, | |
| "loss": 0.3965, | |
| "step": 5440 | |
| }, | |
| { | |
| "epoch": 0.8316960112927532, | |
| "grad_norm": 2.6417813301086426, | |
| "learning_rate": 8.401563143343721e-07, | |
| "loss": 0.4071, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 0.8332220590198959, | |
| "grad_norm": 3.082578182220459, | |
| "learning_rate": 8.254364914172697e-07, | |
| "loss": 0.3975, | |
| "step": 5460 | |
| }, | |
| { | |
| "epoch": 0.8347481067470385, | |
| "grad_norm": 3.2389848232269287, | |
| "learning_rate": 8.108351491793615e-07, | |
| "loss": 0.404, | |
| "step": 5470 | |
| }, | |
| { | |
| "epoch": 0.8362741544741812, | |
| "grad_norm": 3.0996053218841553, | |
| "learning_rate": 7.963527020297085e-07, | |
| "loss": 0.3817, | |
| "step": 5480 | |
| }, | |
| { | |
| "epoch": 0.8378002022013239, | |
| "grad_norm": 2.964110851287842, | |
| "learning_rate": 7.819895610029433e-07, | |
| "loss": 0.3821, | |
| "step": 5490 | |
| }, | |
| { | |
| "epoch": 0.8393262499284665, | |
| "grad_norm": 3.0734763145446777, | |
| "learning_rate": 7.677461337476005e-07, | |
| "loss": 0.4168, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.8393262499284665, | |
| "eval_loss": 0.3822996914386749, | |
| "eval_runtime": 99.942, | |
| "eval_samples_per_second": 5.303, | |
| "eval_steps_per_second": 2.652, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.8408522976556092, | |
| "grad_norm": 3.2727982997894287, | |
| "learning_rate": 7.536228245145554e-07, | |
| "loss": 0.3812, | |
| "step": 5510 | |
| }, | |
| { | |
| "epoch": 0.8423783453827518, | |
| "grad_norm": 2.7713711261749268, | |
| "learning_rate": 7.396200341455356e-07, | |
| "loss": 0.3969, | |
| "step": 5520 | |
| }, | |
| { | |
| "epoch": 0.8439043931098945, | |
| "grad_norm": 3.2900307178497314, | |
| "learning_rate": 7.25738160061763e-07, | |
| "loss": 0.4094, | |
| "step": 5530 | |
| }, | |
| { | |
| "epoch": 0.8454304408370372, | |
| "grad_norm": 2.305082321166992, | |
| "learning_rate": 7.119775962526593e-07, | |
| "loss": 0.4075, | |
| "step": 5540 | |
| }, | |
| { | |
| "epoch": 0.8469564885641798, | |
| "grad_norm": 2.9151320457458496, | |
| "learning_rate": 6.983387332646718e-07, | |
| "loss": 0.3881, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 0.8484825362913225, | |
| "grad_norm": 2.791234254837036, | |
| "learning_rate": 6.848219581901866e-07, | |
| "loss": 0.3946, | |
| "step": 5560 | |
| }, | |
| { | |
| "epoch": 0.8500085840184651, | |
| "grad_norm": 3.1531620025634766, | |
| "learning_rate": 6.714276546565423e-07, | |
| "loss": 0.4137, | |
| "step": 5570 | |
| }, | |
| { | |
| "epoch": 0.8515346317456078, | |
| "grad_norm": 2.58180570602417, | |
| "learning_rate": 6.581562028151451e-07, | |
| "loss": 0.4073, | |
| "step": 5580 | |
| }, | |
| { | |
| "epoch": 0.8530606794727505, | |
| "grad_norm": 2.863215208053589, | |
| "learning_rate": 6.450079793306735e-07, | |
| "loss": 0.4321, | |
| "step": 5590 | |
| }, | |
| { | |
| "epoch": 0.8545867271998931, | |
| "grad_norm": 2.909364700317383, | |
| "learning_rate": 6.319833573703938e-07, | |
| "loss": 0.3989, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 0.8561127749270359, | |
| "grad_norm": 2.7926652431488037, | |
| "learning_rate": 6.190827065935645e-07, | |
| "loss": 0.404, | |
| "step": 5610 | |
| }, | |
| { | |
| "epoch": 0.8576388226541785, | |
| "grad_norm": 2.7227327823638916, | |
| "learning_rate": 6.06306393140948e-07, | |
| "loss": 0.3815, | |
| "step": 5620 | |
| }, | |
| { | |
| "epoch": 0.8591648703813212, | |
| "grad_norm": 2.705726385116577, | |
| "learning_rate": 5.936547796244207e-07, | |
| "loss": 0.3741, | |
| "step": 5630 | |
| }, | |
| { | |
| "epoch": 0.8606909181084639, | |
| "grad_norm": 3.357189178466797, | |
| "learning_rate": 5.811282251166716e-07, | |
| "loss": 0.398, | |
| "step": 5640 | |
| }, | |
| { | |
| "epoch": 0.8622169658356065, | |
| "grad_norm": 2.5761477947235107, | |
| "learning_rate": 5.687270851410265e-07, | |
| "loss": 0.396, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 0.8637430135627492, | |
| "grad_norm": 4.029236316680908, | |
| "learning_rate": 5.564517116613433e-07, | |
| "loss": 0.4209, | |
| "step": 5660 | |
| }, | |
| { | |
| "epoch": 0.8652690612898919, | |
| "grad_norm": 3.4346041679382324, | |
| "learning_rate": 5.443024530720326e-07, | |
| "loss": 0.3933, | |
| "step": 5670 | |
| }, | |
| { | |
| "epoch": 0.8667951090170345, | |
| "grad_norm": 2.5683741569519043, | |
| "learning_rate": 5.32279654188163e-07, | |
| "loss": 0.3668, | |
| "step": 5680 | |
| }, | |
| { | |
| "epoch": 0.8683211567441772, | |
| "grad_norm": 2.6289405822753906, | |
| "learning_rate": 5.203836562356795e-07, | |
| "loss": 0.3781, | |
| "step": 5690 | |
| }, | |
| { | |
| "epoch": 0.8698472044713198, | |
| "grad_norm": 2.9439804553985596, | |
| "learning_rate": 5.086147968417199e-07, | |
| "loss": 0.4096, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 0.8713732521984625, | |
| "grad_norm": 2.9020564556121826, | |
| "learning_rate": 4.969734100250229e-07, | |
| "loss": 0.413, | |
| "step": 5710 | |
| }, | |
| { | |
| "epoch": 0.8728992999256052, | |
| "grad_norm": 3.4045605659484863, | |
| "learning_rate": 4.854598261864618e-07, | |
| "loss": 0.3765, | |
| "step": 5720 | |
| }, | |
| { | |
| "epoch": 0.8744253476527478, | |
| "grad_norm": 3.001821517944336, | |
| "learning_rate": 4.74074372099656e-07, | |
| "loss": 0.3817, | |
| "step": 5730 | |
| }, | |
| { | |
| "epoch": 0.8759513953798905, | |
| "grad_norm": 2.8464105129241943, | |
| "learning_rate": 4.628173709017031e-07, | |
| "loss": 0.3914, | |
| "step": 5740 | |
| }, | |
| { | |
| "epoch": 0.8774774431070331, | |
| "grad_norm": 3.0081145763397217, | |
| "learning_rate": 4.516891420840047e-07, | |
| "loss": 0.4129, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 0.8790034908341758, | |
| "grad_norm": 2.8356759548187256, | |
| "learning_rate": 4.4069000148319885e-07, | |
| "loss": 0.3986, | |
| "step": 5760 | |
| }, | |
| { | |
| "epoch": 0.8805295385613185, | |
| "grad_norm": 2.759951114654541, | |
| "learning_rate": 4.298202612722008e-07, | |
| "loss": 0.3763, | |
| "step": 5770 | |
| }, | |
| { | |
| "epoch": 0.8820555862884611, | |
| "grad_norm": 3.221383571624756, | |
| "learning_rate": 4.1908022995133526e-07, | |
| "loss": 0.3885, | |
| "step": 5780 | |
| }, | |
| { | |
| "epoch": 0.8835816340156039, | |
| "grad_norm": 2.8623874187469482, | |
| "learning_rate": 4.084702123395834e-07, | |
| "loss": 0.3748, | |
| "step": 5790 | |
| }, | |
| { | |
| "epoch": 0.8851076817427465, | |
| "grad_norm": 2.868194341659546, | |
| "learning_rate": 3.979905095659381e-07, | |
| "loss": 0.3913, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 0.8866337294698892, | |
| "grad_norm": 3.0048179626464844, | |
| "learning_rate": 3.8764141906084794e-07, | |
| "loss": 0.3843, | |
| "step": 5810 | |
| }, | |
| { | |
| "epoch": 0.8881597771970319, | |
| "grad_norm": 2.4534389972686768, | |
| "learning_rate": 3.7742323454778296e-07, | |
| "loss": 0.3878, | |
| "step": 5820 | |
| }, | |
| { | |
| "epoch": 0.8896858249241745, | |
| "grad_norm": 2.534501552581787, | |
| "learning_rate": 3.6733624603489e-07, | |
| "loss": 0.3899, | |
| "step": 5830 | |
| }, | |
| { | |
| "epoch": 0.8912118726513172, | |
| "grad_norm": 3.0121171474456787, | |
| "learning_rate": 3.5738073980677355e-07, | |
| "loss": 0.407, | |
| "step": 5840 | |
| }, | |
| { | |
| "epoch": 0.8927379203784598, | |
| "grad_norm": 2.708857774734497, | |
| "learning_rate": 3.475569984163596e-07, | |
| "loss": 0.4173, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 0.8942639681056025, | |
| "grad_norm": 3.2787721157073975, | |
| "learning_rate": 3.378653006768823e-07, | |
| "loss": 0.3978, | |
| "step": 5860 | |
| }, | |
| { | |
| "epoch": 0.8957900158327452, | |
| "grad_norm": 2.9193058013916016, | |
| "learning_rate": 3.2830592165396913e-07, | |
| "loss": 0.3986, | |
| "step": 5870 | |
| }, | |
| { | |
| "epoch": 0.8973160635598878, | |
| "grad_norm": 2.758446455001831, | |
| "learning_rate": 3.188791326578339e-07, | |
| "loss": 0.3574, | |
| "step": 5880 | |
| }, | |
| { | |
| "epoch": 0.8988421112870305, | |
| "grad_norm": 2.8080217838287354, | |
| "learning_rate": 3.0958520123557767e-07, | |
| "loss": 0.3749, | |
| "step": 5890 | |
| }, | |
| { | |
| "epoch": 0.9003681590141732, | |
| "grad_norm": 3.7036428451538086, | |
| "learning_rate": 3.0042439116359455e-07, | |
| "loss": 0.3887, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 0.9018942067413158, | |
| "grad_norm": 3.0411696434020996, | |
| "learning_rate": 2.9139696244008255e-07, | |
| "loss": 0.3961, | |
| "step": 5910 | |
| }, | |
| { | |
| "epoch": 0.9034202544684585, | |
| "grad_norm": 2.942746877670288, | |
| "learning_rate": 2.8250317127767213e-07, | |
| "loss": 0.3875, | |
| "step": 5920 | |
| }, | |
| { | |
| "epoch": 0.9049463021956011, | |
| "grad_norm": 3.230517864227295, | |
| "learning_rate": 2.7374327009614456e-07, | |
| "loss": 0.3981, | |
| "step": 5930 | |
| }, | |
| { | |
| "epoch": 0.9064723499227438, | |
| "grad_norm": 2.7047958374023438, | |
| "learning_rate": 2.651175075152784e-07, | |
| "loss": 0.3996, | |
| "step": 5940 | |
| }, | |
| { | |
| "epoch": 0.9079983976498865, | |
| "grad_norm": 3.278867244720459, | |
| "learning_rate": 2.5662612834778164e-07, | |
| "loss": 0.3625, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 0.9095244453770291, | |
| "grad_norm": 2.955040693283081, | |
| "learning_rate": 2.4826937359235305e-07, | |
| "loss": 0.3662, | |
| "step": 5960 | |
| }, | |
| { | |
| "epoch": 0.9110504931041719, | |
| "grad_norm": 2.9156110286712646, | |
| "learning_rate": 2.4004748042683933e-07, | |
| "loss": 0.405, | |
| "step": 5970 | |
| }, | |
| { | |
| "epoch": 0.9125765408313145, | |
| "grad_norm": 3.318800926208496, | |
| "learning_rate": 2.3196068220150025e-07, | |
| "loss": 0.4027, | |
| "step": 5980 | |
| }, | |
| { | |
| "epoch": 0.9141025885584572, | |
| "grad_norm": 4.083282470703125, | |
| "learning_rate": 2.2400920843239194e-07, | |
| "loss": 0.3634, | |
| "step": 5990 | |
| }, | |
| { | |
| "epoch": 0.9156286362855999, | |
| "grad_norm": 3.180896759033203, | |
| "learning_rate": 2.161932847948478e-07, | |
| "loss": 0.3918, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.9156286362855999, | |
| "eval_loss": 0.37766218185424805, | |
| "eval_runtime": 99.9689, | |
| "eval_samples_per_second": 5.302, | |
| "eval_steps_per_second": 2.651, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.9171546840127425, | |
| "grad_norm": 3.35513973236084, | |
| "learning_rate": 2.0851313311707532e-07, | |
| "loss": 0.3857, | |
| "step": 6010 | |
| }, | |
| { | |
| "epoch": 0.9186807317398852, | |
| "grad_norm": 2.6778345108032227, | |
| "learning_rate": 2.0096897137386052e-07, | |
| "loss": 0.3725, | |
| "step": 6020 | |
| }, | |
| { | |
| "epoch": 0.9202067794670278, | |
| "grad_norm": 2.9898831844329834, | |
| "learning_rate": 1.9356101368038005e-07, | |
| "loss": 0.4051, | |
| "step": 6030 | |
| }, | |
| { | |
| "epoch": 0.9217328271941705, | |
| "grad_norm": 2.924804449081421, | |
| "learning_rate": 1.8628947028612788e-07, | |
| "loss": 0.3742, | |
| "step": 6040 | |
| }, | |
| { | |
| "epoch": 0.9232588749213132, | |
| "grad_norm": 2.871389389038086, | |
| "learning_rate": 1.791545475689438e-07, | |
| "loss": 0.3881, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 0.9247849226484558, | |
| "grad_norm": 3.050419569015503, | |
| "learning_rate": 1.721564480291571e-07, | |
| "loss": 0.3972, | |
| "step": 6060 | |
| }, | |
| { | |
| "epoch": 0.9263109703755985, | |
| "grad_norm": 3.090453863143921, | |
| "learning_rate": 1.652953702838428e-07, | |
| "loss": 0.3852, | |
| "step": 6070 | |
| }, | |
| { | |
| "epoch": 0.9278370181027412, | |
| "grad_norm": 3.4830410480499268, | |
| "learning_rate": 1.585715090611778e-07, | |
| "loss": 0.3964, | |
| "step": 6080 | |
| }, | |
| { | |
| "epoch": 0.9293630658298838, | |
| "grad_norm": 3.4521846771240234, | |
| "learning_rate": 1.5198505519492368e-07, | |
| "loss": 0.4143, | |
| "step": 6090 | |
| }, | |
| { | |
| "epoch": 0.9308891135570265, | |
| "grad_norm": 3.032611131668091, | |
| "learning_rate": 1.4553619561899935e-07, | |
| "loss": 0.3881, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 0.9324151612841691, | |
| "grad_norm": 3.2053749561309814, | |
| "learning_rate": 1.3922511336218524e-07, | |
| "loss": 0.397, | |
| "step": 6110 | |
| }, | |
| { | |
| "epoch": 0.9339412090113118, | |
| "grad_norm": 3.1718320846557617, | |
| "learning_rate": 1.330519875429237e-07, | |
| "loss": 0.4042, | |
| "step": 6120 | |
| }, | |
| { | |
| "epoch": 0.9354672567384545, | |
| "grad_norm": 2.4236302375793457, | |
| "learning_rate": 1.2701699336423513e-07, | |
| "loss": 0.3791, | |
| "step": 6130 | |
| }, | |
| { | |
| "epoch": 0.9369933044655971, | |
| "grad_norm": 2.4723575115203857, | |
| "learning_rate": 1.211203021087487e-07, | |
| "loss": 0.4056, | |
| "step": 6140 | |
| }, | |
| { | |
| "epoch": 0.9385193521927399, | |
| "grad_norm": 3.279250383377075, | |
| "learning_rate": 1.1536208113383684e-07, | |
| "loss": 0.4043, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 0.9400453999198825, | |
| "grad_norm": 2.6275689601898193, | |
| "learning_rate": 1.0974249386687064e-07, | |
| "loss": 0.3986, | |
| "step": 6160 | |
| }, | |
| { | |
| "epoch": 0.9415714476470252, | |
| "grad_norm": 2.8793821334838867, | |
| "learning_rate": 1.042616998005752e-07, | |
| "loss": 0.3891, | |
| "step": 6170 | |
| }, | |
| { | |
| "epoch": 0.9430974953741679, | |
| "grad_norm": 3.2804644107818604, | |
| "learning_rate": 9.891985448850839e-08, | |
| "loss": 0.3963, | |
| "step": 6180 | |
| }, | |
| { | |
| "epoch": 0.9446235431013105, | |
| "grad_norm": 2.7859580516815186, | |
| "learning_rate": 9.37171095406425e-08, | |
| "loss": 0.3832, | |
| "step": 6190 | |
| }, | |
| { | |
| "epoch": 0.9461495908284532, | |
| "grad_norm": 2.705620288848877, | |
| "learning_rate": 8.865361261906402e-08, | |
| "loss": 0.3851, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 0.9476756385555958, | |
| "grad_norm": 2.792065382003784, | |
| "learning_rate": 8.372950743378128e-08, | |
| "loss": 0.403, | |
| "step": 6210 | |
| }, | |
| { | |
| "epoch": 0.9492016862827385, | |
| "grad_norm": 3.0607993602752686, | |
| "learning_rate": 7.894493373864332e-08, | |
| "loss": 0.4104, | |
| "step": 6220 | |
| }, | |
| { | |
| "epoch": 0.9507277340098812, | |
| "grad_norm": 3.5412745475769043, | |
| "learning_rate": 7.430002732737973e-08, | |
| "loss": 0.3937, | |
| "step": 6230 | |
| }, | |
| { | |
| "epoch": 0.9522537817370238, | |
| "grad_norm": 3.418365955352783, | |
| "learning_rate": 6.979492002974098e-08, | |
| "loss": 0.3954, | |
| "step": 6240 | |
| }, | |
| { | |
| "epoch": 0.9537798294641665, | |
| "grad_norm": 2.9009461402893066, | |
| "learning_rate": 6.542973970775912e-08, | |
| "loss": 0.3904, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 0.9553058771913091, | |
| "grad_norm": 2.7992494106292725, | |
| "learning_rate": 6.120461025211744e-08, | |
| "loss": 0.3897, | |
| "step": 6260 | |
| }, | |
| { | |
| "epoch": 0.9568319249184518, | |
| "grad_norm": 2.860257625579834, | |
| "learning_rate": 5.711965157863597e-08, | |
| "loss": 0.3673, | |
| "step": 6270 | |
| }, | |
| { | |
| "epoch": 0.9583579726455945, | |
| "grad_norm": 2.605058431625366, | |
| "learning_rate": 5.317497962486984e-08, | |
| "loss": 0.3961, | |
| "step": 6280 | |
| }, | |
| { | |
| "epoch": 0.9598840203727371, | |
| "grad_norm": 3.152754545211792, | |
| "learning_rate": 4.937070634681185e-08, | |
| "loss": 0.3884, | |
| "step": 6290 | |
| }, | |
| { | |
| "epoch": 0.9614100680998798, | |
| "grad_norm": 2.6811258792877197, | |
| "learning_rate": 4.570693971572393e-08, | |
| "loss": 0.3968, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 0.9629361158270225, | |
| "grad_norm": 3.2074809074401855, | |
| "learning_rate": 4.218378371506515e-08, | |
| "loss": 0.4024, | |
| "step": 6310 | |
| }, | |
| { | |
| "epoch": 0.9644621635541651, | |
| "grad_norm": 3.393841028213501, | |
| "learning_rate": 3.880133833754518e-08, | |
| "loss": 0.3852, | |
| "step": 6320 | |
| }, | |
| { | |
| "epoch": 0.9659882112813079, | |
| "grad_norm": 3.0189993381500244, | |
| "learning_rate": 3.555969958228489e-08, | |
| "loss": 0.3804, | |
| "step": 6330 | |
| }, | |
| { | |
| "epoch": 0.9675142590084505, | |
| "grad_norm": 2.8455655574798584, | |
| "learning_rate": 3.245895945209132e-08, | |
| "loss": 0.4228, | |
| "step": 6340 | |
| }, | |
| { | |
| "epoch": 0.9690403067355932, | |
| "grad_norm": 3.668877601623535, | |
| "learning_rate": 2.949920595084643e-08, | |
| "loss": 0.369, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 0.9705663544627359, | |
| "grad_norm": 3.0246024131774902, | |
| "learning_rate": 2.6680523081011878e-08, | |
| "loss": 0.3899, | |
| "step": 6360 | |
| }, | |
| { | |
| "epoch": 0.9720924021898785, | |
| "grad_norm": 2.8767731189727783, | |
| "learning_rate": 2.4002990841239804e-08, | |
| "loss": 0.393, | |
| "step": 6370 | |
| }, | |
| { | |
| "epoch": 0.9736184499170212, | |
| "grad_norm": 3.3473153114318848, | |
| "learning_rate": 2.1466685224107995e-08, | |
| "loss": 0.3722, | |
| "step": 6380 | |
| }, | |
| { | |
| "epoch": 0.9751444976441638, | |
| "grad_norm": 2.7829878330230713, | |
| "learning_rate": 1.9071678213959388e-08, | |
| "loss": 0.3788, | |
| "step": 6390 | |
| }, | |
| { | |
| "epoch": 0.9766705453713065, | |
| "grad_norm": 2.5925357341766357, | |
| "learning_rate": 1.6818037784860908e-08, | |
| "loss": 0.3862, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 0.9781965930984492, | |
| "grad_norm": 3.0968399047851562, | |
| "learning_rate": 1.4705827898672254e-08, | |
| "loss": 0.3738, | |
| "step": 6410 | |
| }, | |
| { | |
| "epoch": 0.9797226408255918, | |
| "grad_norm": 3.40238881111145, | |
| "learning_rate": 1.2735108503232896e-08, | |
| "loss": 0.4121, | |
| "step": 6420 | |
| }, | |
| { | |
| "epoch": 0.9812486885527345, | |
| "grad_norm": 2.9040355682373047, | |
| "learning_rate": 1.0905935530658996e-08, | |
| "loss": 0.4003, | |
| "step": 6430 | |
| }, | |
| { | |
| "epoch": 0.9827747362798771, | |
| "grad_norm": 2.6801180839538574, | |
| "learning_rate": 9.218360895758006e-09, | |
| "loss": 0.3973, | |
| "step": 6440 | |
| }, | |
| { | |
| "epoch": 0.9843007840070198, | |
| "grad_norm": 2.591391086578369, | |
| "learning_rate": 7.672432494551518e-09, | |
| "loss": 0.3936, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 0.9858268317341625, | |
| "grad_norm": 2.7946035861968994, | |
| "learning_rate": 6.268194202920241e-09, | |
| "loss": 0.3641, | |
| "step": 6460 | |
| }, | |
| { | |
| "epoch": 0.9873528794613051, | |
| "grad_norm": 4.159729480743408, | |
| "learning_rate": 5.005685875354993e-09, | |
| "loss": 0.3685, | |
| "step": 6470 | |
| }, | |
| { | |
| "epoch": 0.9888789271884478, | |
| "grad_norm": 2.7406532764434814, | |
| "learning_rate": 3.884943343829273e-09, | |
| "loss": 0.4149, | |
| "step": 6480 | |
| }, | |
| { | |
| "epoch": 0.9904049749155904, | |
| "grad_norm": 3.1383161544799805, | |
| "learning_rate": 2.9059984167778553e-09, | |
| "loss": 0.3814, | |
| "step": 6490 | |
| }, | |
| { | |
| "epoch": 0.9919310226427331, | |
| "grad_norm": 2.687572956085205, | |
| "learning_rate": 2.0688788781980664e-09, | |
| "loss": 0.3942, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.9919310226427331, | |
| "eval_loss": 0.377034068107605, | |
| "eval_runtime": 100.214, | |
| "eval_samples_per_second": 5.289, | |
| "eval_steps_per_second": 2.644, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.9934570703698758, | |
| "grad_norm": 2.9962236881256104, | |
| "learning_rate": 1.3736084868598564e-09, | |
| "loss": 0.3747, | |
| "step": 6510 | |
| }, | |
| { | |
| "epoch": 0.9949831180970184, | |
| "grad_norm": 2.946183204650879, | |
| "learning_rate": 8.202069756302333e-10, | |
| "loss": 0.3763, | |
| "step": 6520 | |
| }, | |
| { | |
| "epoch": 0.9965091658241612, | |
| "grad_norm": 3.0049428939819336, | |
| "learning_rate": 4.0869005091481727e-10, | |
| "loss": 0.4033, | |
| "step": 6530 | |
| }, | |
| { | |
| "epoch": 0.9980352135513039, | |
| "grad_norm": 3.199441432952881, | |
| "learning_rate": 1.3906939221042247e-10, | |
| "loss": 0.3847, | |
| "step": 6540 | |
| }, | |
| { | |
| "epoch": 0.9995612612784465, | |
| "grad_norm": 2.7879321575164795, | |
| "learning_rate": 1.1352651776985746e-11, | |
| "loss": 0.4005, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "step": 6553, | |
| "total_flos": 2.1597904813481902e+18, | |
| "train_loss": 0.4742196609496197, | |
| "train_runtime": 41833.5481, | |
| "train_samples_per_second": 1.253, | |
| "train_steps_per_second": 0.157 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 6553, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.1597904813481902e+18, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |