SCOPE-CoT-sft-v2 / trainer_state.json
Cooolder's picture
Upload folder using huggingface_hub
fa7cea5 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 6553,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0015260477271426664,
"grad_norm": 36.718162536621094,
"learning_rate": 1.3719512195121953e-07,
"loss": 1.7945,
"step": 10
},
{
"epoch": 0.0030520954542853328,
"grad_norm": 33.861351013183594,
"learning_rate": 2.896341463414635e-07,
"loss": 1.7388,
"step": 20
},
{
"epoch": 0.004578143181427999,
"grad_norm": 32.35844802856445,
"learning_rate": 4.4207317073170735e-07,
"loss": 1.6264,
"step": 30
},
{
"epoch": 0.0061041909085706655,
"grad_norm": 18.111286163330078,
"learning_rate": 5.945121951219512e-07,
"loss": 1.4267,
"step": 40
},
{
"epoch": 0.007630238635713332,
"grad_norm": 12.056061744689941,
"learning_rate": 7.469512195121952e-07,
"loss": 1.1626,
"step": 50
},
{
"epoch": 0.009156286362855998,
"grad_norm": 9.076939582824707,
"learning_rate": 8.993902439024391e-07,
"loss": 0.9372,
"step": 60
},
{
"epoch": 0.010682334089998665,
"grad_norm": 8.621874809265137,
"learning_rate": 1.051829268292683e-06,
"loss": 0.8972,
"step": 70
},
{
"epoch": 0.012208381817141331,
"grad_norm": 6.114352226257324,
"learning_rate": 1.204268292682927e-06,
"loss": 0.771,
"step": 80
},
{
"epoch": 0.013734429544283997,
"grad_norm": 7.380756855010986,
"learning_rate": 1.356707317073171e-06,
"loss": 0.7386,
"step": 90
},
{
"epoch": 0.015260477271426664,
"grad_norm": 6.799697399139404,
"learning_rate": 1.5091463414634146e-06,
"loss": 0.7209,
"step": 100
},
{
"epoch": 0.016786524998569332,
"grad_norm": 7.576807975769043,
"learning_rate": 1.6615853658536587e-06,
"loss": 0.6985,
"step": 110
},
{
"epoch": 0.018312572725711997,
"grad_norm": 7.400335788726807,
"learning_rate": 1.8140243902439026e-06,
"loss": 0.6846,
"step": 120
},
{
"epoch": 0.019838620452854665,
"grad_norm": 6.237104892730713,
"learning_rate": 1.9664634146341467e-06,
"loss": 0.6377,
"step": 130
},
{
"epoch": 0.02136466817999733,
"grad_norm": 6.404150485992432,
"learning_rate": 2.1189024390243905e-06,
"loss": 0.6354,
"step": 140
},
{
"epoch": 0.022890715907139998,
"grad_norm": 7.284428596496582,
"learning_rate": 2.2713414634146344e-06,
"loss": 0.6543,
"step": 150
},
{
"epoch": 0.024416763634282662,
"grad_norm": 5.5003156661987305,
"learning_rate": 2.4237804878048783e-06,
"loss": 0.6016,
"step": 160
},
{
"epoch": 0.02594281136142533,
"grad_norm": 7.994759559631348,
"learning_rate": 2.576219512195122e-06,
"loss": 0.6073,
"step": 170
},
{
"epoch": 0.027468859088567995,
"grad_norm": 7.063292980194092,
"learning_rate": 2.7286585365853664e-06,
"loss": 0.5698,
"step": 180
},
{
"epoch": 0.028994906815710663,
"grad_norm": 5.892914295196533,
"learning_rate": 2.88109756097561e-06,
"loss": 0.5679,
"step": 190
},
{
"epoch": 0.030520954542853328,
"grad_norm": 5.543047904968262,
"learning_rate": 3.0335365853658537e-06,
"loss": 0.5655,
"step": 200
},
{
"epoch": 0.032047002269995996,
"grad_norm": 6.323866844177246,
"learning_rate": 3.185975609756098e-06,
"loss": 0.5619,
"step": 210
},
{
"epoch": 0.033573049997138664,
"grad_norm": 6.957103729248047,
"learning_rate": 3.338414634146342e-06,
"loss": 0.553,
"step": 220
},
{
"epoch": 0.035099097724281325,
"grad_norm": 5.986090183258057,
"learning_rate": 3.4908536585365853e-06,
"loss": 0.5503,
"step": 230
},
{
"epoch": 0.03662514545142399,
"grad_norm": 5.522103786468506,
"learning_rate": 3.6432926829268296e-06,
"loss": 0.5493,
"step": 240
},
{
"epoch": 0.03815119317856666,
"grad_norm": 5.235415935516357,
"learning_rate": 3.7957317073170735e-06,
"loss": 0.5534,
"step": 250
},
{
"epoch": 0.03967724090570933,
"grad_norm": 5.9757819175720215,
"learning_rate": 3.948170731707318e-06,
"loss": 0.5368,
"step": 260
},
{
"epoch": 0.04120328863285199,
"grad_norm": 5.101258754730225,
"learning_rate": 4.100609756097561e-06,
"loss": 0.5254,
"step": 270
},
{
"epoch": 0.04272933635999466,
"grad_norm": 6.364596843719482,
"learning_rate": 4.2530487804878055e-06,
"loss": 0.5423,
"step": 280
},
{
"epoch": 0.04425538408713733,
"grad_norm": 6.032998085021973,
"learning_rate": 4.405487804878049e-06,
"loss": 0.5257,
"step": 290
},
{
"epoch": 0.045781431814279995,
"grad_norm": 5.804418563842773,
"learning_rate": 4.557926829268293e-06,
"loss": 0.5432,
"step": 300
},
{
"epoch": 0.047307479541422656,
"grad_norm": 5.553419589996338,
"learning_rate": 4.710365853658536e-06,
"loss": 0.5413,
"step": 310
},
{
"epoch": 0.048833527268565324,
"grad_norm": 7.154079914093018,
"learning_rate": 4.862804878048781e-06,
"loss": 0.5407,
"step": 320
},
{
"epoch": 0.05035957499570799,
"grad_norm": 7.198996543884277,
"learning_rate": 5.015243902439024e-06,
"loss": 0.5432,
"step": 330
},
{
"epoch": 0.05188562272285066,
"grad_norm": 5.0478434562683105,
"learning_rate": 5.167682926829268e-06,
"loss": 0.5125,
"step": 340
},
{
"epoch": 0.05341167044999332,
"grad_norm": 6.1683573722839355,
"learning_rate": 5.320121951219513e-06,
"loss": 0.5321,
"step": 350
},
{
"epoch": 0.05493771817713599,
"grad_norm": 7.884255409240723,
"learning_rate": 5.4725609756097565e-06,
"loss": 0.5399,
"step": 360
},
{
"epoch": 0.05646376590427866,
"grad_norm": 5.224252700805664,
"learning_rate": 5.625e-06,
"loss": 0.5539,
"step": 370
},
{
"epoch": 0.057989813631421326,
"grad_norm": 4.74583101272583,
"learning_rate": 5.777439024390244e-06,
"loss": 0.5503,
"step": 380
},
{
"epoch": 0.05951586135856399,
"grad_norm": 5.914930820465088,
"learning_rate": 5.929878048780489e-06,
"loss": 0.5398,
"step": 390
},
{
"epoch": 0.061041909085706655,
"grad_norm": 4.671189308166504,
"learning_rate": 6.082317073170733e-06,
"loss": 0.5481,
"step": 400
},
{
"epoch": 0.06256795681284932,
"grad_norm": 4.802231788635254,
"learning_rate": 6.234756097560977e-06,
"loss": 0.568,
"step": 410
},
{
"epoch": 0.06409400453999199,
"grad_norm": 4.851030349731445,
"learning_rate": 6.38719512195122e-06,
"loss": 0.5599,
"step": 420
},
{
"epoch": 0.06562005226713466,
"grad_norm": 5.160627365112305,
"learning_rate": 6.5396341463414636e-06,
"loss": 0.5352,
"step": 430
},
{
"epoch": 0.06714609999427733,
"grad_norm": 5.0278754234313965,
"learning_rate": 6.6920731707317074e-06,
"loss": 0.545,
"step": 440
},
{
"epoch": 0.06867214772141998,
"grad_norm": 5.1211090087890625,
"learning_rate": 6.844512195121952e-06,
"loss": 0.5321,
"step": 450
},
{
"epoch": 0.07019819544856265,
"grad_norm": 4.986229419708252,
"learning_rate": 6.996951219512196e-06,
"loss": 0.5584,
"step": 460
},
{
"epoch": 0.07172424317570532,
"grad_norm": 4.192492485046387,
"learning_rate": 7.14939024390244e-06,
"loss": 0.5462,
"step": 470
},
{
"epoch": 0.07325029090284799,
"grad_norm": 3.977277994155884,
"learning_rate": 7.301829268292684e-06,
"loss": 0.5366,
"step": 480
},
{
"epoch": 0.07477633862999065,
"grad_norm": 4.757632732391357,
"learning_rate": 7.454268292682928e-06,
"loss": 0.5493,
"step": 490
},
{
"epoch": 0.07630238635713332,
"grad_norm": 4.987619400024414,
"learning_rate": 7.606707317073171e-06,
"loss": 0.5282,
"step": 500
},
{
"epoch": 0.07630238635713332,
"eval_loss": 0.5320242047309875,
"eval_runtime": 100.1496,
"eval_samples_per_second": 5.292,
"eval_steps_per_second": 2.646,
"step": 500
},
{
"epoch": 0.07782843408427599,
"grad_norm": 5.458449840545654,
"learning_rate": 7.759146341463415e-06,
"loss": 0.5464,
"step": 510
},
{
"epoch": 0.07935448181141866,
"grad_norm": 5.1865081787109375,
"learning_rate": 7.911585365853658e-06,
"loss": 0.5456,
"step": 520
},
{
"epoch": 0.08088052953856131,
"grad_norm": 4.639908313751221,
"learning_rate": 8.064024390243903e-06,
"loss": 0.5226,
"step": 530
},
{
"epoch": 0.08240657726570398,
"grad_norm": 4.7745537757873535,
"learning_rate": 8.216463414634148e-06,
"loss": 0.5623,
"step": 540
},
{
"epoch": 0.08393262499284665,
"grad_norm": 4.4498515129089355,
"learning_rate": 8.36890243902439e-06,
"loss": 0.5402,
"step": 550
},
{
"epoch": 0.08545867271998932,
"grad_norm": 4.127673149108887,
"learning_rate": 8.521341463414636e-06,
"loss": 0.5526,
"step": 560
},
{
"epoch": 0.08698472044713199,
"grad_norm": 5.61818790435791,
"learning_rate": 8.673780487804879e-06,
"loss": 0.5413,
"step": 570
},
{
"epoch": 0.08851076817427465,
"grad_norm": 3.518537998199463,
"learning_rate": 8.826219512195122e-06,
"loss": 0.5536,
"step": 580
},
{
"epoch": 0.09003681590141732,
"grad_norm": 4.232659339904785,
"learning_rate": 8.978658536585366e-06,
"loss": 0.5572,
"step": 590
},
{
"epoch": 0.09156286362855999,
"grad_norm": 4.013524532318115,
"learning_rate": 9.131097560975611e-06,
"loss": 0.5236,
"step": 600
},
{
"epoch": 0.09308891135570264,
"grad_norm": 3.9425785541534424,
"learning_rate": 9.283536585365854e-06,
"loss": 0.5643,
"step": 610
},
{
"epoch": 0.09461495908284531,
"grad_norm": 4.538547515869141,
"learning_rate": 9.435975609756099e-06,
"loss": 0.5565,
"step": 620
},
{
"epoch": 0.09614100680998798,
"grad_norm": 3.850074291229248,
"learning_rate": 9.588414634146342e-06,
"loss": 0.5319,
"step": 630
},
{
"epoch": 0.09766705453713065,
"grad_norm": 5.455791473388672,
"learning_rate": 9.740853658536586e-06,
"loss": 0.5412,
"step": 640
},
{
"epoch": 0.09919310226427332,
"grad_norm": 3.3886823654174805,
"learning_rate": 9.893292682926831e-06,
"loss": 0.566,
"step": 650
},
{
"epoch": 0.10071914999141598,
"grad_norm": 3.8116540908813477,
"learning_rate": 9.999993614132319e-06,
"loss": 0.5627,
"step": 660
},
{
"epoch": 0.10224519771855865,
"grad_norm": 3.609804630279541,
"learning_rate": 9.99988008804953e-06,
"loss": 0.5576,
"step": 670
},
{
"epoch": 0.10377124544570132,
"grad_norm": 3.923624038696289,
"learning_rate": 9.999624657504754e-06,
"loss": 0.5728,
"step": 680
},
{
"epoch": 0.10529729317284398,
"grad_norm": 4.188690185546875,
"learning_rate": 9.99922732974751e-06,
"loss": 0.5676,
"step": 690
},
{
"epoch": 0.10682334089998664,
"grad_norm": 3.9573514461517334,
"learning_rate": 9.998688116054583e-06,
"loss": 0.5362,
"step": 700
},
{
"epoch": 0.10834938862712931,
"grad_norm": 3.8924996852874756,
"learning_rate": 9.99800703172971e-06,
"loss": 0.557,
"step": 710
},
{
"epoch": 0.10987543635427198,
"grad_norm": 4.514781475067139,
"learning_rate": 9.997184096103133e-06,
"loss": 0.5729,
"step": 720
},
{
"epoch": 0.11140148408141465,
"grad_norm": 3.555657148361206,
"learning_rate": 9.996219332531059e-06,
"loss": 0.5735,
"step": 730
},
{
"epoch": 0.11292753180855732,
"grad_norm": 3.8453004360198975,
"learning_rate": 9.995112768394996e-06,
"loss": 0.5492,
"step": 740
},
{
"epoch": 0.11445357953569998,
"grad_norm": 4.1153435707092285,
"learning_rate": 9.993864435100976e-06,
"loss": 0.5273,
"step": 750
},
{
"epoch": 0.11597962726284265,
"grad_norm": 3.793621063232422,
"learning_rate": 9.992474368078664e-06,
"loss": 0.5744,
"step": 760
},
{
"epoch": 0.1175056749899853,
"grad_norm": 3.394721508026123,
"learning_rate": 9.990942606780344e-06,
"loss": 0.5554,
"step": 770
},
{
"epoch": 0.11903172271712797,
"grad_norm": 4.319253444671631,
"learning_rate": 9.989269194679814e-06,
"loss": 0.5161,
"step": 780
},
{
"epoch": 0.12055777044427064,
"grad_norm": 3.6960957050323486,
"learning_rate": 9.987454179271138e-06,
"loss": 0.5467,
"step": 790
},
{
"epoch": 0.12208381817141331,
"grad_norm": 4.272833347320557,
"learning_rate": 9.985497612067315e-06,
"loss": 0.5764,
"step": 800
},
{
"epoch": 0.12360986589855598,
"grad_norm": 3.716593027114868,
"learning_rate": 9.983399548598795e-06,
"loss": 0.5644,
"step": 810
},
{
"epoch": 0.12513591362569865,
"grad_norm": 3.8028016090393066,
"learning_rate": 9.981160048411922e-06,
"loss": 0.5442,
"step": 820
},
{
"epoch": 0.1266619613528413,
"grad_norm": 3.634533405303955,
"learning_rate": 9.978779175067232e-06,
"loss": 0.5642,
"step": 830
},
{
"epoch": 0.12818800907998398,
"grad_norm": 4.484054088592529,
"learning_rate": 9.976256996137657e-06,
"loss": 0.5826,
"step": 840
},
{
"epoch": 0.12971405680712664,
"grad_norm": 3.2868642807006836,
"learning_rate": 9.973593583206602e-06,
"loss": 0.5643,
"step": 850
},
{
"epoch": 0.13124010453426932,
"grad_norm": 3.641526937484741,
"learning_rate": 9.970789011865916e-06,
"loss": 0.5577,
"step": 860
},
{
"epoch": 0.13276615226141197,
"grad_norm": 3.82144832611084,
"learning_rate": 9.967843361713747e-06,
"loss": 0.5374,
"step": 870
},
{
"epoch": 0.13429219998855466,
"grad_norm": 3.786513090133667,
"learning_rate": 9.964756716352277e-06,
"loss": 0.5523,
"step": 880
},
{
"epoch": 0.1358182477156973,
"grad_norm": 3.7389333248138428,
"learning_rate": 9.96152916338536e-06,
"loss": 0.5708,
"step": 890
},
{
"epoch": 0.13734429544283996,
"grad_norm": 3.5762176513671875,
"learning_rate": 9.958160794416022e-06,
"loss": 0.5481,
"step": 900
},
{
"epoch": 0.13887034316998265,
"grad_norm": 3.7925140857696533,
"learning_rate": 9.954651705043878e-06,
"loss": 0.5814,
"step": 910
},
{
"epoch": 0.1403963908971253,
"grad_norm": 4.10577917098999,
"learning_rate": 9.951001994862402e-06,
"loss": 0.5574,
"step": 920
},
{
"epoch": 0.14192243862426798,
"grad_norm": 3.477315664291382,
"learning_rate": 9.947211767456111e-06,
"loss": 0.5472,
"step": 930
},
{
"epoch": 0.14344848635141064,
"grad_norm": 3.1365439891815186,
"learning_rate": 9.943281130397619e-06,
"loss": 0.5759,
"step": 940
},
{
"epoch": 0.14497453407855332,
"grad_norm": 4.209506988525391,
"learning_rate": 9.93921019524459e-06,
"loss": 0.5544,
"step": 950
},
{
"epoch": 0.14650058180569597,
"grad_norm": 3.269994020462036,
"learning_rate": 9.934999077536567e-06,
"loss": 0.5577,
"step": 960
},
{
"epoch": 0.14802662953283863,
"grad_norm": 3.4426379203796387,
"learning_rate": 9.930647896791696e-06,
"loss": 0.5498,
"step": 970
},
{
"epoch": 0.1495526772599813,
"grad_norm": 3.949375629425049,
"learning_rate": 9.92615677650333e-06,
"loss": 0.5452,
"step": 980
},
{
"epoch": 0.15107872498712396,
"grad_norm": 3.42270827293396,
"learning_rate": 9.92152584413653e-06,
"loss": 0.5393,
"step": 990
},
{
"epoch": 0.15260477271426665,
"grad_norm": 4.055193901062012,
"learning_rate": 9.916755231124437e-06,
"loss": 0.5294,
"step": 1000
},
{
"epoch": 0.15260477271426665,
"eval_loss": 0.5252559781074524,
"eval_runtime": 99.9603,
"eval_samples_per_second": 5.302,
"eval_steps_per_second": 2.651,
"step": 1000
},
{
"epoch": 0.1541308204414093,
"grad_norm": 3.2706804275512695,
"learning_rate": 9.911845072864556e-06,
"loss": 0.522,
"step": 1010
},
{
"epoch": 0.15565686816855198,
"grad_norm": 3.859898090362549,
"learning_rate": 9.906795508714901e-06,
"loss": 0.5373,
"step": 1020
},
{
"epoch": 0.15718291589569464,
"grad_norm": 3.1161351203918457,
"learning_rate": 9.901606681990048e-06,
"loss": 0.5471,
"step": 1030
},
{
"epoch": 0.15870896362283732,
"grad_norm": 3.452761650085449,
"learning_rate": 9.89627873995706e-06,
"loss": 0.5532,
"step": 1040
},
{
"epoch": 0.16023501134997997,
"grad_norm": 3.6008405685424805,
"learning_rate": 9.890811833831312e-06,
"loss": 0.5075,
"step": 1050
},
{
"epoch": 0.16176105907712263,
"grad_norm": 4.958362579345703,
"learning_rate": 9.885206118772201e-06,
"loss": 0.5404,
"step": 1060
},
{
"epoch": 0.1632871068042653,
"grad_norm": 3.1011452674865723,
"learning_rate": 9.879461753878738e-06,
"loss": 0.5456,
"step": 1070
},
{
"epoch": 0.16481315453140796,
"grad_norm": 3.445542097091675,
"learning_rate": 9.873578902185034e-06,
"loss": 0.5299,
"step": 1080
},
{
"epoch": 0.16633920225855064,
"grad_norm": 3.2124245166778564,
"learning_rate": 9.867557730655676e-06,
"loss": 0.5555,
"step": 1090
},
{
"epoch": 0.1678652499856933,
"grad_norm": 3.3892674446105957,
"learning_rate": 9.861398410180985e-06,
"loss": 0.5429,
"step": 1100
},
{
"epoch": 0.16939129771283598,
"grad_norm": 3.534641981124878,
"learning_rate": 9.855101115572161e-06,
"loss": 0.5564,
"step": 1110
},
{
"epoch": 0.17091734543997864,
"grad_norm": 2.8586363792419434,
"learning_rate": 9.848666025556332e-06,
"loss": 0.5155,
"step": 1120
},
{
"epoch": 0.1724433931671213,
"grad_norm": 4.134357452392578,
"learning_rate": 9.842093322771479e-06,
"loss": 0.5292,
"step": 1130
},
{
"epoch": 0.17396944089426397,
"grad_norm": 2.917952060699463,
"learning_rate": 9.83538319376124e-06,
"loss": 0.5471,
"step": 1140
},
{
"epoch": 0.17549548862140663,
"grad_norm": 3.148503065109253,
"learning_rate": 9.82853582896964e-06,
"loss": 0.5084,
"step": 1150
},
{
"epoch": 0.1770215363485493,
"grad_norm": 2.9326202869415283,
"learning_rate": 9.821551422735655e-06,
"loss": 0.5303,
"step": 1160
},
{
"epoch": 0.17854758407569196,
"grad_norm": 2.8527936935424805,
"learning_rate": 9.814430173287724e-06,
"loss": 0.5265,
"step": 1170
},
{
"epoch": 0.18007363180283464,
"grad_norm": 3.373987913131714,
"learning_rate": 9.807172282738109e-06,
"loss": 0.5267,
"step": 1180
},
{
"epoch": 0.1815996795299773,
"grad_norm": 3.886758804321289,
"learning_rate": 9.799777957077161e-06,
"loss": 0.5449,
"step": 1190
},
{
"epoch": 0.18312572725711998,
"grad_norm": 2.8181774616241455,
"learning_rate": 9.792247406167471e-06,
"loss": 0.5477,
"step": 1200
},
{
"epoch": 0.18465177498426263,
"grad_norm": 3.1215786933898926,
"learning_rate": 9.784580843737924e-06,
"loss": 0.5317,
"step": 1210
},
{
"epoch": 0.1861778227114053,
"grad_norm": 3.4757418632507324,
"learning_rate": 9.776778487377622e-06,
"loss": 0.5312,
"step": 1220
},
{
"epoch": 0.18770387043854797,
"grad_norm": 2.941584587097168,
"learning_rate": 9.768840558529708e-06,
"loss": 0.5372,
"step": 1230
},
{
"epoch": 0.18922991816569062,
"grad_norm": 3.1221237182617188,
"learning_rate": 9.760767282485091e-06,
"loss": 0.5246,
"step": 1240
},
{
"epoch": 0.1907559658928333,
"grad_norm": 2.970177173614502,
"learning_rate": 9.752558888376045e-06,
"loss": 0.5269,
"step": 1250
},
{
"epoch": 0.19228201361997596,
"grad_norm": 3.686633348464966,
"learning_rate": 9.744215609169709e-06,
"loss": 0.5239,
"step": 1260
},
{
"epoch": 0.19380806134711864,
"grad_norm": 2.774200439453125,
"learning_rate": 9.735737681661467e-06,
"loss": 0.5175,
"step": 1270
},
{
"epoch": 0.1953341090742613,
"grad_norm": 3.612818717956543,
"learning_rate": 9.727125346468243e-06,
"loss": 0.5144,
"step": 1280
},
{
"epoch": 0.19686015680140395,
"grad_norm": 2.7817158699035645,
"learning_rate": 9.718378848021655e-06,
"loss": 0.5417,
"step": 1290
},
{
"epoch": 0.19838620452854663,
"grad_norm": 3.400517463684082,
"learning_rate": 9.709498434561086e-06,
"loss": 0.4989,
"step": 1300
},
{
"epoch": 0.1999122522556893,
"grad_norm": 2.9461493492126465,
"learning_rate": 9.70048435812664e-06,
"loss": 0.5235,
"step": 1310
},
{
"epoch": 0.20143829998283197,
"grad_norm": 3.0229151248931885,
"learning_rate": 9.69133687455199e-06,
"loss": 0.4988,
"step": 1320
},
{
"epoch": 0.20296434770997462,
"grad_norm": 4.051263332366943,
"learning_rate": 9.682056243457105e-06,
"loss": 0.5394,
"step": 1330
},
{
"epoch": 0.2044903954371173,
"grad_norm": 2.998913049697876,
"learning_rate": 9.672642728240896e-06,
"loss": 0.549,
"step": 1340
},
{
"epoch": 0.20601644316425996,
"grad_norm": 3.4110162258148193,
"learning_rate": 9.663096596073732e-06,
"loss": 0.4888,
"step": 1350
},
{
"epoch": 0.20754249089140264,
"grad_norm": 3.530796766281128,
"learning_rate": 9.653418117889862e-06,
"loss": 0.5272,
"step": 1360
},
{
"epoch": 0.2090685386185453,
"grad_norm": 3.0355026721954346,
"learning_rate": 9.64360756837972e-06,
"loss": 0.5322,
"step": 1370
},
{
"epoch": 0.21059458634568795,
"grad_norm": 2.8864426612854004,
"learning_rate": 9.633665225982134e-06,
"loss": 0.49,
"step": 1380
},
{
"epoch": 0.21212063407283063,
"grad_norm": 4.0944132804870605,
"learning_rate": 9.623591372876422e-06,
"loss": 0.5502,
"step": 1390
},
{
"epoch": 0.2136466817999733,
"grad_norm": 2.89972186088562,
"learning_rate": 9.61338629497438e-06,
"loss": 0.5178,
"step": 1400
},
{
"epoch": 0.21517272952711597,
"grad_norm": 2.904897689819336,
"learning_rate": 9.603050281912175e-06,
"loss": 0.5471,
"step": 1410
},
{
"epoch": 0.21669877725425862,
"grad_norm": 2.9118282794952393,
"learning_rate": 9.592583627042115e-06,
"loss": 0.5214,
"step": 1420
},
{
"epoch": 0.2182248249814013,
"grad_norm": 3.003340244293213,
"learning_rate": 9.581986627424329e-06,
"loss": 0.5266,
"step": 1430
},
{
"epoch": 0.21975087270854396,
"grad_norm": 2.8239073753356934,
"learning_rate": 9.571259583818337e-06,
"loss": 0.5305,
"step": 1440
},
{
"epoch": 0.2212769204356866,
"grad_norm": 3.4803032875061035,
"learning_rate": 9.56040280067451e-06,
"loss": 0.5336,
"step": 1450
},
{
"epoch": 0.2228029681628293,
"grad_norm": 2.8414413928985596,
"learning_rate": 9.549416586125435e-06,
"loss": 0.5341,
"step": 1460
},
{
"epoch": 0.22432901588997195,
"grad_norm": 3.3560791015625,
"learning_rate": 9.538301251977158e-06,
"loss": 0.5175,
"step": 1470
},
{
"epoch": 0.22585506361711463,
"grad_norm": 3.5832326412200928,
"learning_rate": 9.52705711370035e-06,
"loss": 0.5453,
"step": 1480
},
{
"epoch": 0.2273811113442573,
"grad_norm": 2.907194137573242,
"learning_rate": 9.515684490421342e-06,
"loss": 0.5256,
"step": 1490
},
{
"epoch": 0.22890715907139997,
"grad_norm": 3.446336269378662,
"learning_rate": 9.504183704913075e-06,
"loss": 0.5116,
"step": 1500
},
{
"epoch": 0.22890715907139997,
"eval_loss": 0.5038516521453857,
"eval_runtime": 99.9985,
"eval_samples_per_second": 5.3,
"eval_steps_per_second": 2.65,
"step": 1500
},
{
"epoch": 0.23043320679854262,
"grad_norm": 3.575700044631958,
"learning_rate": 9.492555083585928e-06,
"loss": 0.5215,
"step": 1510
},
{
"epoch": 0.2319592545256853,
"grad_norm": 2.7438437938690186,
"learning_rate": 9.480798956478473e-06,
"loss": 0.5381,
"step": 1520
},
{
"epoch": 0.23348530225282796,
"grad_norm": 3.586581230163574,
"learning_rate": 9.468915657248083e-06,
"loss": 0.5361,
"step": 1530
},
{
"epoch": 0.2350113499799706,
"grad_norm": 2.979966878890991,
"learning_rate": 9.45690552316149e-06,
"loss": 0.5385,
"step": 1540
},
{
"epoch": 0.2365373977071133,
"grad_norm": 3.4089224338531494,
"learning_rate": 9.44476889508519e-06,
"loss": 0.5122,
"step": 1550
},
{
"epoch": 0.23806344543425595,
"grad_norm": 3.400916337966919,
"learning_rate": 9.432506117475777e-06,
"loss": 0.4855,
"step": 1560
},
{
"epoch": 0.23958949316139863,
"grad_norm": 3.0944440364837646,
"learning_rate": 9.420117538370173e-06,
"loss": 0.5314,
"step": 1570
},
{
"epoch": 0.24111554088854129,
"grad_norm": 3.072086811065674,
"learning_rate": 9.407603509375737e-06,
"loss": 0.5487,
"step": 1580
},
{
"epoch": 0.24264158861568397,
"grad_norm": 2.866974353790283,
"learning_rate": 9.394964385660302e-06,
"loss": 0.5199,
"step": 1590
},
{
"epoch": 0.24416763634282662,
"grad_norm": 3.2548046112060547,
"learning_rate": 9.382200525942076e-06,
"loss": 0.536,
"step": 1600
},
{
"epoch": 0.24569368406996928,
"grad_norm": 3.636455535888672,
"learning_rate": 9.369312292479479e-06,
"loss": 0.5102,
"step": 1610
},
{
"epoch": 0.24721973179711196,
"grad_norm": 2.7913310527801514,
"learning_rate": 9.35630005106085e-06,
"loss": 0.5174,
"step": 1620
},
{
"epoch": 0.2487457795242546,
"grad_norm": 3.905932903289795,
"learning_rate": 9.34316417099407e-06,
"loss": 0.5399,
"step": 1630
},
{
"epoch": 0.2502718272513973,
"grad_norm": 2.9708142280578613,
"learning_rate": 9.329905025096079e-06,
"loss": 0.5139,
"step": 1640
},
{
"epoch": 0.25179787497853995,
"grad_norm": 2.849421977996826,
"learning_rate": 9.316522989682293e-06,
"loss": 0.4887,
"step": 1650
},
{
"epoch": 0.2533239227056826,
"grad_norm": 3.2548842430114746,
"learning_rate": 9.30301844455593e-06,
"loss": 0.5173,
"step": 1660
},
{
"epoch": 0.2548499704328253,
"grad_norm": 3.9787535667419434,
"learning_rate": 9.289391772997223e-06,
"loss": 0.5295,
"step": 1670
},
{
"epoch": 0.25637601815996797,
"grad_norm": 2.5555968284606934,
"learning_rate": 9.275643361752546e-06,
"loss": 0.5371,
"step": 1680
},
{
"epoch": 0.2579020658871106,
"grad_norm": 3.158561944961548,
"learning_rate": 9.261773601023439e-06,
"loss": 0.5305,
"step": 1690
},
{
"epoch": 0.2594281136142533,
"grad_norm": 3.1799142360687256,
"learning_rate": 9.247782884455526e-06,
"loss": 0.5293,
"step": 1700
},
{
"epoch": 0.26095416134139593,
"grad_norm": 2.7630088329315186,
"learning_rate": 9.233671609127352e-06,
"loss": 0.5102,
"step": 1710
},
{
"epoch": 0.26248020906853864,
"grad_norm": 3.3492443561553955,
"learning_rate": 9.219440175539113e-06,
"loss": 0.516,
"step": 1720
},
{
"epoch": 0.2640062567956813,
"grad_norm": 3.2532637119293213,
"learning_rate": 9.205088987601277e-06,
"loss": 0.5063,
"step": 1730
},
{
"epoch": 0.26553230452282395,
"grad_norm": 3.098576068878174,
"learning_rate": 9.190618452623135e-06,
"loss": 0.4973,
"step": 1740
},
{
"epoch": 0.2670583522499666,
"grad_norm": 3.852489709854126,
"learning_rate": 9.176028981301229e-06,
"loss": 0.4778,
"step": 1750
},
{
"epoch": 0.2685843999771093,
"grad_norm": 3.133566379547119,
"learning_rate": 9.16132098770771e-06,
"loss": 0.5031,
"step": 1760
},
{
"epoch": 0.27011044770425197,
"grad_norm": 3.0958361625671387,
"learning_rate": 9.146494889278568e-06,
"loss": 0.4784,
"step": 1770
},
{
"epoch": 0.2716364954313946,
"grad_norm": 3.499459743499756,
"learning_rate": 9.131551106801803e-06,
"loss": 0.5071,
"step": 1780
},
{
"epoch": 0.2731625431585373,
"grad_norm": 2.8909738063812256,
"learning_rate": 9.116490064405467e-06,
"loss": 0.5116,
"step": 1790
},
{
"epoch": 0.27468859088567993,
"grad_norm": 2.8877241611480713,
"learning_rate": 9.101312189545636e-06,
"loss": 0.4888,
"step": 1800
},
{
"epoch": 0.27621463861282264,
"grad_norm": 2.978130578994751,
"learning_rate": 9.086017912994272e-06,
"loss": 0.5325,
"step": 1810
},
{
"epoch": 0.2777406863399653,
"grad_norm": 3.5364253520965576,
"learning_rate": 9.070607668827003e-06,
"loss": 0.5285,
"step": 1820
},
{
"epoch": 0.27926673406710795,
"grad_norm": 2.8093996047973633,
"learning_rate": 9.055081894410802e-06,
"loss": 0.4959,
"step": 1830
},
{
"epoch": 0.2807927817942506,
"grad_norm": 2.98183274269104,
"learning_rate": 9.03944103039157e-06,
"loss": 0.501,
"step": 1840
},
{
"epoch": 0.28231882952139326,
"grad_norm": 3.1950182914733887,
"learning_rate": 9.023685520681626e-06,
"loss": 0.5204,
"step": 1850
},
{
"epoch": 0.28384487724853597,
"grad_norm": 2.9772353172302246,
"learning_rate": 9.007815812447126e-06,
"loss": 0.4989,
"step": 1860
},
{
"epoch": 0.2853709249756786,
"grad_norm": 3.50301194190979,
"learning_rate": 8.991832356095351e-06,
"loss": 0.5032,
"step": 1870
},
{
"epoch": 0.2868969727028213,
"grad_norm": 2.9427924156188965,
"learning_rate": 8.975735605261936e-06,
"loss": 0.5196,
"step": 1880
},
{
"epoch": 0.2884230204299639,
"grad_norm": 2.9805080890655518,
"learning_rate": 8.95952601679799e-06,
"loss": 0.5092,
"step": 1890
},
{
"epoch": 0.28994906815710664,
"grad_norm": 2.65608811378479,
"learning_rate": 8.943204050757133e-06,
"loss": 0.5106,
"step": 1900
},
{
"epoch": 0.2914751158842493,
"grad_norm": 2.7500367164611816,
"learning_rate": 8.926770170382434e-06,
"loss": 0.4999,
"step": 1910
},
{
"epoch": 0.29300116361139195,
"grad_norm": 3.063328266143799,
"learning_rate": 8.910224842093275e-06,
"loss": 0.5164,
"step": 1920
},
{
"epoch": 0.2945272113385346,
"grad_norm": 3.1675572395324707,
"learning_rate": 8.893568535472094e-06,
"loss": 0.4857,
"step": 1930
},
{
"epoch": 0.29605325906567725,
"grad_norm": 4.051036357879639,
"learning_rate": 8.876801723251077e-06,
"loss": 0.4995,
"step": 1940
},
{
"epoch": 0.29757930679281996,
"grad_norm": 3.0843279361724854,
"learning_rate": 8.859924881298729e-06,
"loss": 0.4869,
"step": 1950
},
{
"epoch": 0.2991053545199626,
"grad_norm": 3.1268937587738037,
"learning_rate": 8.842938488606373e-06,
"loss": 0.516,
"step": 1960
},
{
"epoch": 0.3006314022471053,
"grad_norm": 2.659601926803589,
"learning_rate": 8.825843027274554e-06,
"loss": 0.4658,
"step": 1970
},
{
"epoch": 0.3021574499742479,
"grad_norm": 2.756713628768921,
"learning_rate": 8.80863898249936e-06,
"loss": 0.5115,
"step": 1980
},
{
"epoch": 0.30368349770139064,
"grad_norm": 2.470733404159546,
"learning_rate": 8.79132684255864e-06,
"loss": 0.5134,
"step": 1990
},
{
"epoch": 0.3052095454285333,
"grad_norm": 3.007568597793579,
"learning_rate": 8.773907098798158e-06,
"loss": 0.5121,
"step": 2000
},
{
"epoch": 0.3052095454285333,
"eval_loss": 0.48503902554512024,
"eval_runtime": 99.9999,
"eval_samples_per_second": 5.3,
"eval_steps_per_second": 2.65,
"step": 2000
},
{
"epoch": 0.30673559315567595,
"grad_norm": 3.3363804817199707,
"learning_rate": 8.756380245617645e-06,
"loss": 0.4861,
"step": 2010
},
{
"epoch": 0.3082616408828186,
"grad_norm": 3.0314290523529053,
"learning_rate": 8.73874678045677e-06,
"loss": 0.4963,
"step": 2020
},
{
"epoch": 0.30978768860996125,
"grad_norm": 2.7498457431793213,
"learning_rate": 8.721007203781008e-06,
"loss": 0.5125,
"step": 2030
},
{
"epoch": 0.31131373633710396,
"grad_norm": 3.2180142402648926,
"learning_rate": 8.703162019067451e-06,
"loss": 0.5148,
"step": 2040
},
{
"epoch": 0.3128397840642466,
"grad_norm": 3.160686492919922,
"learning_rate": 8.685211732790513e-06,
"loss": 0.4915,
"step": 2050
},
{
"epoch": 0.31436583179138927,
"grad_norm": 2.880316972732544,
"learning_rate": 8.667156854407555e-06,
"loss": 0.5154,
"step": 2060
},
{
"epoch": 0.3158918795185319,
"grad_norm": 3.6688642501831055,
"learning_rate": 8.648997896344429e-06,
"loss": 0.5257,
"step": 2070
},
{
"epoch": 0.31741792724567464,
"grad_norm": 2.8660826683044434,
"learning_rate": 8.630735373980926e-06,
"loss": 0.5134,
"step": 2080
},
{
"epoch": 0.3189439749728173,
"grad_norm": 2.69706392288208,
"learning_rate": 8.612369805636161e-06,
"loss": 0.4847,
"step": 2090
},
{
"epoch": 0.32047002269995994,
"grad_norm": 2.907597064971924,
"learning_rate": 8.593901712553853e-06,
"loss": 0.5196,
"step": 2100
},
{
"epoch": 0.3219960704271026,
"grad_norm": 3.240574836730957,
"learning_rate": 8.575331618887537e-06,
"loss": 0.5104,
"step": 2110
},
{
"epoch": 0.32352211815424525,
"grad_norm": 3.215815782546997,
"learning_rate": 8.556660051685679e-06,
"loss": 0.4973,
"step": 2120
},
{
"epoch": 0.32504816588138796,
"grad_norm": 2.7997822761535645,
"learning_rate": 8.537887540876732e-06,
"loss": 0.5099,
"step": 2130
},
{
"epoch": 0.3265742136085306,
"grad_norm": 2.945455551147461,
"learning_rate": 8.519014619254081e-06,
"loss": 0.5112,
"step": 2140
},
{
"epoch": 0.32810026133567327,
"grad_norm": 3.219611644744873,
"learning_rate": 8.50004182246093e-06,
"loss": 0.5086,
"step": 2150
},
{
"epoch": 0.3296263090628159,
"grad_norm": 2.8295400142669678,
"learning_rate": 8.480969688975094e-06,
"loss": 0.4897,
"step": 2160
},
{
"epoch": 0.3311523567899586,
"grad_norm": 2.7116832733154297,
"learning_rate": 8.461798760093728e-06,
"loss": 0.4751,
"step": 2170
},
{
"epoch": 0.3326784045171013,
"grad_norm": 3.04294490814209,
"learning_rate": 8.44252957991795e-06,
"loss": 0.4784,
"step": 2180
},
{
"epoch": 0.33420445224424394,
"grad_norm": 2.6779112815856934,
"learning_rate": 8.423162695337402e-06,
"loss": 0.5155,
"step": 2190
},
{
"epoch": 0.3357304999713866,
"grad_norm": 3.5296261310577393,
"learning_rate": 8.403698656014734e-06,
"loss": 0.509,
"step": 2200
},
{
"epoch": 0.33725654769852925,
"grad_norm": 3.0861217975616455,
"learning_rate": 8.384138014370003e-06,
"loss": 0.4961,
"step": 2210
},
{
"epoch": 0.33878259542567196,
"grad_norm": 2.9720520973205566,
"learning_rate": 8.364481325564983e-06,
"loss": 0.4759,
"step": 2220
},
{
"epoch": 0.3403086431528146,
"grad_norm": 2.8393325805664062,
"learning_rate": 8.344729147487431e-06,
"loss": 0.502,
"step": 2230
},
{
"epoch": 0.34183469087995727,
"grad_norm": 2.930060386657715,
"learning_rate": 8.324882040735227e-06,
"loss": 0.4914,
"step": 2240
},
{
"epoch": 0.3433607386070999,
"grad_norm": 3.05932879447937,
"learning_rate": 8.304940568600482e-06,
"loss": 0.5184,
"step": 2250
},
{
"epoch": 0.3448867863342426,
"grad_norm": 3.455152750015259,
"learning_rate": 8.284905297053544e-06,
"loss": 0.4881,
"step": 2260
},
{
"epoch": 0.3464128340613853,
"grad_norm": 3.1637487411499023,
"learning_rate": 8.264776794726938e-06,
"loss": 0.5001,
"step": 2270
},
{
"epoch": 0.34793888178852794,
"grad_norm": 3.0358266830444336,
"learning_rate": 8.244555632899223e-06,
"loss": 0.4795,
"step": 2280
},
{
"epoch": 0.3494649295156706,
"grad_norm": 2.930438756942749,
"learning_rate": 8.224242385478778e-06,
"loss": 0.4814,
"step": 2290
},
{
"epoch": 0.35099097724281325,
"grad_norm": 2.8416764736175537,
"learning_rate": 8.203837628987523e-06,
"loss": 0.463,
"step": 2300
},
{
"epoch": 0.35251702496995596,
"grad_norm": 2.61423921585083,
"learning_rate": 8.18334194254454e-06,
"loss": 0.4984,
"step": 2310
},
{
"epoch": 0.3540430726970986,
"grad_norm": 2.8394827842712402,
"learning_rate": 8.162755907849653e-06,
"loss": 0.4884,
"step": 2320
},
{
"epoch": 0.35556912042424127,
"grad_norm": 2.678520917892456,
"learning_rate": 8.142080109166912e-06,
"loss": 0.493,
"step": 2330
},
{
"epoch": 0.3570951681513839,
"grad_norm": 3.0217721462249756,
"learning_rate": 8.121315133308008e-06,
"loss": 0.483,
"step": 2340
},
{
"epoch": 0.3586212158785266,
"grad_norm": 2.741018533706665,
"learning_rate": 8.100461569615615e-06,
"loss": 0.5032,
"step": 2350
},
{
"epoch": 0.3601472636056693,
"grad_norm": 3.2703857421875,
"learning_rate": 8.079520009946678e-06,
"loss": 0.4871,
"step": 2360
},
{
"epoch": 0.36167331133281194,
"grad_norm": 2.5858423709869385,
"learning_rate": 8.058491048655603e-06,
"loss": 0.5108,
"step": 2370
},
{
"epoch": 0.3631993590599546,
"grad_norm": 3.112304925918579,
"learning_rate": 8.037375282577391e-06,
"loss": 0.5019,
"step": 2380
},
{
"epoch": 0.36472540678709725,
"grad_norm": 2.5532495975494385,
"learning_rate": 8.016173311010697e-06,
"loss": 0.4929,
"step": 2390
},
{
"epoch": 0.36625145451423996,
"grad_norm": 2.342569589614868,
"learning_rate": 7.994885735700832e-06,
"loss": 0.4905,
"step": 2400
},
{
"epoch": 0.3677775022413826,
"grad_norm": 3.1640357971191406,
"learning_rate": 7.973513160822664e-06,
"loss": 0.4745,
"step": 2410
},
{
"epoch": 0.36930354996852527,
"grad_norm": 2.825571298599243,
"learning_rate": 7.952056192963488e-06,
"loss": 0.4843,
"step": 2420
},
{
"epoch": 0.3708295976956679,
"grad_norm": 2.888791084289551,
"learning_rate": 7.93051544110581e-06,
"loss": 0.488,
"step": 2430
},
{
"epoch": 0.3723556454228106,
"grad_norm": 2.8343687057495117,
"learning_rate": 7.908891516610048e-06,
"loss": 0.4568,
"step": 2440
},
{
"epoch": 0.3738816931499533,
"grad_norm": 3.1001429557800293,
"learning_rate": 7.887185033197198e-06,
"loss": 0.4695,
"step": 2450
},
{
"epoch": 0.37540774087709594,
"grad_norm": 2.368257761001587,
"learning_rate": 7.8653966069314e-06,
"loss": 0.5056,
"step": 2460
},
{
"epoch": 0.3769337886042386,
"grad_norm": 2.99737286567688,
"learning_rate": 7.843526856202472e-06,
"loss": 0.4855,
"step": 2470
},
{
"epoch": 0.37845983633138125,
"grad_norm": 3.0355498790740967,
"learning_rate": 7.821576401708339e-06,
"loss": 0.4784,
"step": 2480
},
{
"epoch": 0.3799858840585239,
"grad_norm": 2.5011422634124756,
"learning_rate": 7.799545866437429e-06,
"loss": 0.518,
"step": 2490
},
{
"epoch": 0.3815119317856666,
"grad_norm": 2.846738338470459,
"learning_rate": 7.777435875650986e-06,
"loss": 0.4778,
"step": 2500
},
{
"epoch": 0.3815119317856666,
"eval_loss": 0.4665885865688324,
"eval_runtime": 99.9829,
"eval_samples_per_second": 5.301,
"eval_steps_per_second": 2.65,
"step": 2500
},
{
"epoch": 0.38303797951280927,
"grad_norm": 2.755929708480835,
"learning_rate": 7.755247056865332e-06,
"loss": 0.4726,
"step": 2510
},
{
"epoch": 0.3845640272399519,
"grad_norm": 3.3097054958343506,
"learning_rate": 7.732980039834048e-06,
"loss": 0.4839,
"step": 2520
},
{
"epoch": 0.3860900749670946,
"grad_norm": 2.85306978225708,
"learning_rate": 7.710635456530102e-06,
"loss": 0.4888,
"step": 2530
},
{
"epoch": 0.3876161226942373,
"grad_norm": 3.0452582836151123,
"learning_rate": 7.68821394112792e-06,
"loss": 0.477,
"step": 2540
},
{
"epoch": 0.38914217042137994,
"grad_norm": 2.805910348892212,
"learning_rate": 7.665716129985379e-06,
"loss": 0.5016,
"step": 2550
},
{
"epoch": 0.3906682181485226,
"grad_norm": 3.086005687713623,
"learning_rate": 7.64314266162575e-06,
"loss": 0.4842,
"step": 2560
},
{
"epoch": 0.39219426587566525,
"grad_norm": 2.4592947959899902,
"learning_rate": 7.620494176719572e-06,
"loss": 0.4977,
"step": 2570
},
{
"epoch": 0.3937203136028079,
"grad_norm": 2.79803729057312,
"learning_rate": 7.597771318066476e-06,
"loss": 0.4974,
"step": 2580
},
{
"epoch": 0.3952463613299506,
"grad_norm": 3.157926559448242,
"learning_rate": 7.574974730576936e-06,
"loss": 0.4815,
"step": 2590
},
{
"epoch": 0.39677240905709327,
"grad_norm": 4.1300458908081055,
"learning_rate": 7.552105061253962e-06,
"loss": 0.4876,
"step": 2600
},
{
"epoch": 0.3982984567842359,
"grad_norm": 3.6879398822784424,
"learning_rate": 7.529162959174746e-06,
"loss": 0.4905,
"step": 2610
},
{
"epoch": 0.3998245045113786,
"grad_norm": 2.8170852661132812,
"learning_rate": 7.5061490754722355e-06,
"loss": 0.4849,
"step": 2620
},
{
"epoch": 0.4013505522385213,
"grad_norm": 2.4680588245391846,
"learning_rate": 7.4830640633166516e-06,
"loss": 0.4541,
"step": 2630
},
{
"epoch": 0.40287659996566394,
"grad_norm": 2.8768832683563232,
"learning_rate": 7.4599085778969524e-06,
"loss": 0.4773,
"step": 2640
},
{
"epoch": 0.4044026476928066,
"grad_norm": 2.7005279064178467,
"learning_rate": 7.436683276402239e-06,
"loss": 0.47,
"step": 2650
},
{
"epoch": 0.40592869541994925,
"grad_norm": 3.036959409713745,
"learning_rate": 7.413388818003104e-06,
"loss": 0.4992,
"step": 2660
},
{
"epoch": 0.4074547431470919,
"grad_norm": 3.3453052043914795,
"learning_rate": 7.3900258638329196e-06,
"loss": 0.4713,
"step": 2670
},
{
"epoch": 0.4089807908742346,
"grad_norm": 3.5486536026000977,
"learning_rate": 7.366595076969073e-06,
"loss": 0.4724,
"step": 2680
},
{
"epoch": 0.41050683860137727,
"grad_norm": 2.4689748287200928,
"learning_rate": 7.343097122414159e-06,
"loss": 0.4972,
"step": 2690
},
{
"epoch": 0.4120328863285199,
"grad_norm": 3.1132616996765137,
"learning_rate": 7.319532667077088e-06,
"loss": 0.4766,
"step": 2700
},
{
"epoch": 0.4135589340556626,
"grad_norm": 2.663201332092285,
"learning_rate": 7.295902379754172e-06,
"loss": 0.4833,
"step": 2710
},
{
"epoch": 0.4150849817828053,
"grad_norm": 3.298428773880005,
"learning_rate": 7.272206931110135e-06,
"loss": 0.4533,
"step": 2720
},
{
"epoch": 0.41661102950994794,
"grad_norm": 2.6843929290771484,
"learning_rate": 7.248446993659086e-06,
"loss": 0.4337,
"step": 2730
},
{
"epoch": 0.4181370772370906,
"grad_norm": 2.778803825378418,
"learning_rate": 7.224623241745423e-06,
"loss": 0.4992,
"step": 2740
},
{
"epoch": 0.41966312496423325,
"grad_norm": 3.245333194732666,
"learning_rate": 7.200736351524705e-06,
"loss": 0.483,
"step": 2750
},
{
"epoch": 0.4211891726913759,
"grad_norm": 3.0267553329467773,
"learning_rate": 7.176787000944449e-06,
"loss": 0.4927,
"step": 2760
},
{
"epoch": 0.4227152204185186,
"grad_norm": 2.559861183166504,
"learning_rate": 7.152775869724902e-06,
"loss": 0.4803,
"step": 2770
},
{
"epoch": 0.42424126814566127,
"grad_norm": 2.693279504776001,
"learning_rate": 7.128703639339732e-06,
"loss": 0.4622,
"step": 2780
},
{
"epoch": 0.4257673158728039,
"grad_norm": 2.9602789878845215,
"learning_rate": 7.104570992996711e-06,
"loss": 0.5023,
"step": 2790
},
{
"epoch": 0.4272933635999466,
"grad_norm": 3.0726730823516846,
"learning_rate": 7.080378615618299e-06,
"loss": 0.4737,
"step": 2800
},
{
"epoch": 0.42881941132708923,
"grad_norm": 2.7500321865081787,
"learning_rate": 7.0561271938222275e-06,
"loss": 0.4669,
"step": 2810
},
{
"epoch": 0.43034545905423194,
"grad_norm": 2.4754300117492676,
"learning_rate": 7.031817415901991e-06,
"loss": 0.4597,
"step": 2820
},
{
"epoch": 0.4318715067813746,
"grad_norm": 2.9996578693389893,
"learning_rate": 7.007449971807331e-06,
"loss": 0.4693,
"step": 2830
},
{
"epoch": 0.43339755450851725,
"grad_norm": 2.9249792098999023,
"learning_rate": 6.983025553124638e-06,
"loss": 0.4778,
"step": 2840
},
{
"epoch": 0.4349236022356599,
"grad_norm": 3.4900503158569336,
"learning_rate": 6.958544853057339e-06,
"loss": 0.4768,
"step": 2850
},
{
"epoch": 0.4364496499628026,
"grad_norm": 2.6894686222076416,
"learning_rate": 6.934008566406211e-06,
"loss": 0.4828,
"step": 2860
},
{
"epoch": 0.43797569768994526,
"grad_norm": 2.501004934310913,
"learning_rate": 6.909417389549664e-06,
"loss": 0.4719,
"step": 2870
},
{
"epoch": 0.4395017454170879,
"grad_norm": 2.8693606853485107,
"learning_rate": 6.8847720204239835e-06,
"loss": 0.4464,
"step": 2880
},
{
"epoch": 0.4410277931442306,
"grad_norm": 3.2556777000427246,
"learning_rate": 6.860073158503511e-06,
"loss": 0.4667,
"step": 2890
},
{
"epoch": 0.4425538408713732,
"grad_norm": 3.124281406402588,
"learning_rate": 6.8353215047808006e-06,
"loss": 0.4647,
"step": 2900
},
{
"epoch": 0.44407988859851594,
"grad_norm": 3.2909648418426514,
"learning_rate": 6.810517761746724e-06,
"loss": 0.4543,
"step": 2910
},
{
"epoch": 0.4456059363256586,
"grad_norm": 3.157102584838867,
"learning_rate": 6.785662633370521e-06,
"loss": 0.4802,
"step": 2920
},
{
"epoch": 0.44713198405280125,
"grad_norm": 3.0583925247192383,
"learning_rate": 6.760756825079838e-06,
"loss": 0.4788,
"step": 2930
},
{
"epoch": 0.4486580317799439,
"grad_norm": 3.167233943939209,
"learning_rate": 6.735801043740691e-06,
"loss": 0.4607,
"step": 2940
},
{
"epoch": 0.4501840795070866,
"grad_norm": 2.5458922386169434,
"learning_rate": 6.710795997637412e-06,
"loss": 0.4574,
"step": 2950
},
{
"epoch": 0.45171012723422926,
"grad_norm": 2.914799690246582,
"learning_rate": 6.68574239645254e-06,
"loss": 0.4686,
"step": 2960
},
{
"epoch": 0.4532361749613719,
"grad_norm": 2.5135247707366943,
"learning_rate": 6.660640951246691e-06,
"loss": 0.4771,
"step": 2970
},
{
"epoch": 0.4547622226885146,
"grad_norm": 2.9928719997406006,
"learning_rate": 6.635492374438366e-06,
"loss": 0.4602,
"step": 2980
},
{
"epoch": 0.4562882704156572,
"grad_norm": 2.775026321411133,
"learning_rate": 6.6102973797837324e-06,
"loss": 0.4585,
"step": 2990
},
{
"epoch": 0.45781431814279994,
"grad_norm": 2.5386760234832764,
"learning_rate": 6.585056682356375e-06,
"loss": 0.4823,
"step": 3000
},
{
"epoch": 0.45781431814279994,
"eval_loss": 0.45332393050193787,
"eval_runtime": 100.0205,
"eval_samples_per_second": 5.299,
"eval_steps_per_second": 2.649,
"step": 3000
},
{
"epoch": 0.4593403658699426,
"grad_norm": 2.6468212604522705,
"learning_rate": 6.5597709985269895e-06,
"loss": 0.4769,
"step": 3010
},
{
"epoch": 0.46086641359708524,
"grad_norm": 2.3556416034698486,
"learning_rate": 6.534441045943059e-06,
"loss": 0.4712,
"step": 3020
},
{
"epoch": 0.4623924613242279,
"grad_norm": 2.7787866592407227,
"learning_rate": 6.509067543508483e-06,
"loss": 0.4574,
"step": 3030
},
{
"epoch": 0.4639185090513706,
"grad_norm": 3.103032112121582,
"learning_rate": 6.483651211363175e-06,
"loss": 0.4505,
"step": 3040
},
{
"epoch": 0.46544455677851326,
"grad_norm": 2.5027055740356445,
"learning_rate": 6.4581927708626235e-06,
"loss": 0.4669,
"step": 3050
},
{
"epoch": 0.4669706045056559,
"grad_norm": 3.0449085235595703,
"learning_rate": 6.432692944557416e-06,
"loss": 0.4616,
"step": 3060
},
{
"epoch": 0.46849665223279857,
"grad_norm": 2.4839391708374023,
"learning_rate": 6.407152456172736e-06,
"loss": 0.4435,
"step": 3070
},
{
"epoch": 0.4700226999599412,
"grad_norm": 2.436234474182129,
"learning_rate": 6.381572030587822e-06,
"loss": 0.4635,
"step": 3080
},
{
"epoch": 0.47154874768708394,
"grad_norm": 2.8912863731384277,
"learning_rate": 6.355952393815388e-06,
"loss": 0.4652,
"step": 3090
},
{
"epoch": 0.4730747954142266,
"grad_norm": 2.5968050956726074,
"learning_rate": 6.33029427298103e-06,
"loss": 0.4729,
"step": 3100
},
{
"epoch": 0.47460084314136924,
"grad_norm": 3.2073683738708496,
"learning_rate": 6.304598396302578e-06,
"loss": 0.4953,
"step": 3110
},
{
"epoch": 0.4761268908685119,
"grad_norm": 3.0304925441741943,
"learning_rate": 6.278865493069434e-06,
"loss": 0.4474,
"step": 3120
},
{
"epoch": 0.47765293859565455,
"grad_norm": 2.594212293624878,
"learning_rate": 6.25309629362187e-06,
"loss": 0.4613,
"step": 3130
},
{
"epoch": 0.47917898632279726,
"grad_norm": 3.9076614379882812,
"learning_rate": 6.227291529330302e-06,
"loss": 0.4581,
"step": 3140
},
{
"epoch": 0.4807050340499399,
"grad_norm": 3.236703634262085,
"learning_rate": 6.201451932574533e-06,
"loss": 0.491,
"step": 3150
},
{
"epoch": 0.48223108177708257,
"grad_norm": 3.039473056793213,
"learning_rate": 6.175578236722968e-06,
"loss": 0.4632,
"step": 3160
},
{
"epoch": 0.4837571295042252,
"grad_norm": 2.9076642990112305,
"learning_rate": 6.149671176111793e-06,
"loss": 0.4587,
"step": 3170
},
{
"epoch": 0.48528317723136793,
"grad_norm": 2.884756565093994,
"learning_rate": 6.123731486024146e-06,
"loss": 0.4576,
"step": 3180
},
{
"epoch": 0.4868092249585106,
"grad_norm": 2.9610495567321777,
"learning_rate": 6.097759902669232e-06,
"loss": 0.4562,
"step": 3190
},
{
"epoch": 0.48833527268565324,
"grad_norm": 2.4013702869415283,
"learning_rate": 6.071757163161443e-06,
"loss": 0.4451,
"step": 3200
},
{
"epoch": 0.4898613204127959,
"grad_norm": 2.82401180267334,
"learning_rate": 6.045724005499429e-06,
"loss": 0.4658,
"step": 3210
},
{
"epoch": 0.49138736813993855,
"grad_norm": 3.196622133255005,
"learning_rate": 6.019661168545159e-06,
"loss": 0.4443,
"step": 3220
},
{
"epoch": 0.49291341586708126,
"grad_norm": 2.9927377700805664,
"learning_rate": 5.9935693920029405e-06,
"loss": 0.4538,
"step": 3230
},
{
"epoch": 0.4944394635942239,
"grad_norm": 2.5958001613616943,
"learning_rate": 5.967449416398438e-06,
"loss": 0.4481,
"step": 3240
},
{
"epoch": 0.49596551132136657,
"grad_norm": 3.0835108757019043,
"learning_rate": 5.941301983057646e-06,
"loss": 0.4629,
"step": 3250
},
{
"epoch": 0.4974915590485092,
"grad_norm": 2.6167795658111572,
"learning_rate": 5.915127834085853e-06,
"loss": 0.4626,
"step": 3260
},
{
"epoch": 0.49901760677565193,
"grad_norm": 2.769148826599121,
"learning_rate": 5.888927712346582e-06,
"loss": 0.4574,
"step": 3270
},
{
"epoch": 0.5005436545027946,
"grad_norm": 2.5669050216674805,
"learning_rate": 5.862702361440502e-06,
"loss": 0.4594,
"step": 3280
},
{
"epoch": 0.5020697022299372,
"grad_norm": 3.04659104347229,
"learning_rate": 5.836452525684326e-06,
"loss": 0.4495,
"step": 3290
},
{
"epoch": 0.5035957499570799,
"grad_norm": 2.9052836894989014,
"learning_rate": 5.8101789500896855e-06,
"loss": 0.4302,
"step": 3300
},
{
"epoch": 0.5051217976842226,
"grad_norm": 2.720864772796631,
"learning_rate": 5.783882380341985e-06,
"loss": 0.4522,
"step": 3310
},
{
"epoch": 0.5066478454113652,
"grad_norm": 2.5311403274536133,
"learning_rate": 5.7575635627792384e-06,
"loss": 0.4807,
"step": 3320
},
{
"epoch": 0.5081738931385079,
"grad_norm": 3.34335994720459,
"learning_rate": 5.731223244370891e-06,
"loss": 0.4576,
"step": 3330
},
{
"epoch": 0.5096999408656506,
"grad_norm": 3.448711633682251,
"learning_rate": 5.704862172696612e-06,
"loss": 0.4498,
"step": 3340
},
{
"epoch": 0.5112259885927932,
"grad_norm": 3.406877279281616,
"learning_rate": 5.678481095925087e-06,
"loss": 0.4633,
"step": 3350
},
{
"epoch": 0.5127520363199359,
"grad_norm": 2.611567974090576,
"learning_rate": 5.65208076279277e-06,
"loss": 0.4687,
"step": 3360
},
{
"epoch": 0.5142780840470785,
"grad_norm": 2.893051862716675,
"learning_rate": 5.625661922582646e-06,
"loss": 0.4572,
"step": 3370
},
{
"epoch": 0.5158041317742212,
"grad_norm": 2.572845935821533,
"learning_rate": 5.599225325102957e-06,
"loss": 0.461,
"step": 3380
},
{
"epoch": 0.517330179501364,
"grad_norm": 2.917742967605591,
"learning_rate": 5.572771720665928e-06,
"loss": 0.4679,
"step": 3390
},
{
"epoch": 0.5188562272285066,
"grad_norm": 2.3963794708251953,
"learning_rate": 5.546301860066463e-06,
"loss": 0.4423,
"step": 3400
},
{
"epoch": 0.5203822749556493,
"grad_norm": 3.034247636795044,
"learning_rate": 5.519816494560848e-06,
"loss": 0.4689,
"step": 3410
},
{
"epoch": 0.5219083226827919,
"grad_norm": 2.448772668838501,
"learning_rate": 5.493316375845417e-06,
"loss": 0.4295,
"step": 3420
},
{
"epoch": 0.5234343704099346,
"grad_norm": 2.448565721511841,
"learning_rate": 5.466802256035225e-06,
"loss": 0.4405,
"step": 3430
},
{
"epoch": 0.5249604181370773,
"grad_norm": 3.043869733810425,
"learning_rate": 5.4402748876427e-06,
"loss": 0.4114,
"step": 3440
},
{
"epoch": 0.5264864658642199,
"grad_norm": 2.8036019802093506,
"learning_rate": 5.413735023556288e-06,
"loss": 0.4556,
"step": 3450
},
{
"epoch": 0.5280125135913626,
"grad_norm": 2.72363018989563,
"learning_rate": 5.387183417019079e-06,
"loss": 0.458,
"step": 3460
},
{
"epoch": 0.5295385613185052,
"grad_norm": 2.6680619716644287,
"learning_rate": 5.360620821607433e-06,
"loss": 0.4522,
"step": 3470
},
{
"epoch": 0.5310646090456479,
"grad_norm": 3.0074567794799805,
"learning_rate": 5.334047991209594e-06,
"loss": 0.4683,
"step": 3480
},
{
"epoch": 0.5325906567727906,
"grad_norm": 2.9890754222869873,
"learning_rate": 5.307465680004289e-06,
"loss": 0.4417,
"step": 3490
},
{
"epoch": 0.5341167044999332,
"grad_norm": 2.6241977214813232,
"learning_rate": 5.280874642439323e-06,
"loss": 0.4292,
"step": 3500
},
{
"epoch": 0.5341167044999332,
"eval_loss": 0.4343341290950775,
"eval_runtime": 99.9521,
"eval_samples_per_second": 5.303,
"eval_steps_per_second": 2.651,
"step": 3500
},
{
"epoch": 0.5356427522270759,
"grad_norm": 2.369849920272827,
"learning_rate": 5.254275633210175e-06,
"loss": 0.4566,
"step": 3510
},
{
"epoch": 0.5371687999542186,
"grad_norm": 2.8073859214782715,
"learning_rate": 5.227669407238565e-06,
"loss": 0.4706,
"step": 3520
},
{
"epoch": 0.5386948476813612,
"grad_norm": 2.9969775676727295,
"learning_rate": 5.201056719651042e-06,
"loss": 0.4656,
"step": 3530
},
{
"epoch": 0.5402208954085039,
"grad_norm": 2.4478580951690674,
"learning_rate": 5.174438325757542e-06,
"loss": 0.4419,
"step": 3540
},
{
"epoch": 0.5417469431356465,
"grad_norm": 2.7153167724609375,
"learning_rate": 5.147814981029956e-06,
"loss": 0.4913,
"step": 3550
},
{
"epoch": 0.5432729908627892,
"grad_norm": 2.724480390548706,
"learning_rate": 5.121187441080687e-06,
"loss": 0.4523,
"step": 3560
},
{
"epoch": 0.544799038589932,
"grad_norm": 2.984091281890869,
"learning_rate": 5.094556461641205e-06,
"loss": 0.4696,
"step": 3570
},
{
"epoch": 0.5463250863170745,
"grad_norm": 2.665983200073242,
"learning_rate": 5.0679227985406e-06,
"loss": 0.4405,
"step": 3580
},
{
"epoch": 0.5478511340442173,
"grad_norm": 2.8179776668548584,
"learning_rate": 5.041287207684125e-06,
"loss": 0.4503,
"step": 3590
},
{
"epoch": 0.5493771817713599,
"grad_norm": 3.146925687789917,
"learning_rate": 5.014650445031749e-06,
"loss": 0.4908,
"step": 3600
},
{
"epoch": 0.5509032294985026,
"grad_norm": 3.112048864364624,
"learning_rate": 4.988013266576699e-06,
"loss": 0.4404,
"step": 3610
},
{
"epoch": 0.5524292772256453,
"grad_norm": 2.5898945331573486,
"learning_rate": 4.961376428323997e-06,
"loss": 0.4422,
"step": 3620
},
{
"epoch": 0.5539553249527879,
"grad_norm": 2.4043118953704834,
"learning_rate": 4.934740686269016e-06,
"loss": 0.4688,
"step": 3630
},
{
"epoch": 0.5554813726799306,
"grad_norm": 2.969442367553711,
"learning_rate": 4.908106796376015e-06,
"loss": 0.4497,
"step": 3640
},
{
"epoch": 0.5570074204070732,
"grad_norm": 2.687509298324585,
"learning_rate": 4.881475514556689e-06,
"loss": 0.4356,
"step": 3650
},
{
"epoch": 0.5585334681342159,
"grad_norm": 2.7803378105163574,
"learning_rate": 4.854847596648704e-06,
"loss": 0.468,
"step": 3660
},
{
"epoch": 0.5600595158613586,
"grad_norm": 2.924004316329956,
"learning_rate": 4.828223798394257e-06,
"loss": 0.4478,
"step": 3670
},
{
"epoch": 0.5615855635885012,
"grad_norm": 3.0467331409454346,
"learning_rate": 4.8016048754186265e-06,
"loss": 0.4515,
"step": 3680
},
{
"epoch": 0.5631116113156439,
"grad_norm": 2.7318050861358643,
"learning_rate": 4.77499158320872e-06,
"loss": 0.4441,
"step": 3690
},
{
"epoch": 0.5646376590427865,
"grad_norm": 2.8721094131469727,
"learning_rate": 4.748384677091631e-06,
"loss": 0.442,
"step": 3700
},
{
"epoch": 0.5661637067699292,
"grad_norm": 2.5854904651641846,
"learning_rate": 4.721784912213209e-06,
"loss": 0.4523,
"step": 3710
},
{
"epoch": 0.5676897544970719,
"grad_norm": 2.962846279144287,
"learning_rate": 4.695193043516629e-06,
"loss": 0.4208,
"step": 3720
},
{
"epoch": 0.5692158022242145,
"grad_norm": 3.385815382003784,
"learning_rate": 4.668609825720953e-06,
"loss": 0.4212,
"step": 3730
},
{
"epoch": 0.5707418499513572,
"grad_norm": 3.1869301795959473,
"learning_rate": 4.642036013299716e-06,
"loss": 0.4366,
"step": 3740
},
{
"epoch": 0.5722678976785,
"grad_norm": 2.8463032245635986,
"learning_rate": 4.615472360459519e-06,
"loss": 0.4318,
"step": 3750
},
{
"epoch": 0.5737939454056425,
"grad_norm": 2.9622936248779297,
"learning_rate": 4.588919621118615e-06,
"loss": 0.4352,
"step": 3760
},
{
"epoch": 0.5753199931327853,
"grad_norm": 2.8792457580566406,
"learning_rate": 4.562378548885519e-06,
"loss": 0.4473,
"step": 3770
},
{
"epoch": 0.5768460408599279,
"grad_norm": 2.265306234359741,
"learning_rate": 4.535849897037607e-06,
"loss": 0.4603,
"step": 3780
},
{
"epoch": 0.5783720885870706,
"grad_norm": 2.707305431365967,
"learning_rate": 4.50933441849975e-06,
"loss": 0.4211,
"step": 3790
},
{
"epoch": 0.5798981363142133,
"grad_norm": 2.334364891052246,
"learning_rate": 4.4828328658229406e-06,
"loss": 0.4478,
"step": 3800
},
{
"epoch": 0.5814241840413559,
"grad_norm": 2.8786513805389404,
"learning_rate": 4.456345991162933e-06,
"loss": 0.4451,
"step": 3810
},
{
"epoch": 0.5829502317684986,
"grad_norm": 2.4972634315490723,
"learning_rate": 4.429874546258893e-06,
"loss": 0.4675,
"step": 3820
},
{
"epoch": 0.5844762794956412,
"grad_norm": 3.0034310817718506,
"learning_rate": 4.40341928241207e-06,
"loss": 0.4294,
"step": 3830
},
{
"epoch": 0.5860023272227839,
"grad_norm": 3.1585464477539062,
"learning_rate": 4.376980950464467e-06,
"loss": 0.4143,
"step": 3840
},
{
"epoch": 0.5875283749499266,
"grad_norm": 2.7901790142059326,
"learning_rate": 4.350560300777533e-06,
"loss": 0.459,
"step": 3850
},
{
"epoch": 0.5890544226770692,
"grad_norm": 2.7392337322235107,
"learning_rate": 4.324158083210867e-06,
"loss": 0.4349,
"step": 3860
},
{
"epoch": 0.5905804704042119,
"grad_norm": 2.9482500553131104,
"learning_rate": 4.297775047100935e-06,
"loss": 0.4362,
"step": 3870
},
{
"epoch": 0.5921065181313545,
"grad_norm": 2.583388566970825,
"learning_rate": 4.271411941239809e-06,
"loss": 0.4702,
"step": 3880
},
{
"epoch": 0.5936325658584972,
"grad_norm": 2.743952512741089,
"learning_rate": 4.245069513853897e-06,
"loss": 0.4401,
"step": 3890
},
{
"epoch": 0.5951586135856399,
"grad_norm": 2.7953054904937744,
"learning_rate": 4.218748512582732e-06,
"loss": 0.4425,
"step": 3900
},
{
"epoch": 0.5966846613127825,
"grad_norm": 2.898608684539795,
"learning_rate": 4.19244968445773e-06,
"loss": 0.4488,
"step": 3910
},
{
"epoch": 0.5982107090399252,
"grad_norm": 2.4456114768981934,
"learning_rate": 4.166173775881007e-06,
"loss": 0.4226,
"step": 3920
},
{
"epoch": 0.5997367567670678,
"grad_norm": 3.178201913833618,
"learning_rate": 4.139921532604177e-06,
"loss": 0.4267,
"step": 3930
},
{
"epoch": 0.6012628044942105,
"grad_norm": 2.3235249519348145,
"learning_rate": 4.113693699707203e-06,
"loss": 0.4486,
"step": 3940
},
{
"epoch": 0.6027888522213533,
"grad_norm": 2.7228195667266846,
"learning_rate": 4.0874910215772426e-06,
"loss": 0.4271,
"step": 3950
},
{
"epoch": 0.6043148999484959,
"grad_norm": 2.8123674392700195,
"learning_rate": 4.061314241887516e-06,
"loss": 0.4111,
"step": 3960
},
{
"epoch": 0.6058409476756386,
"grad_norm": 2.610856533050537,
"learning_rate": 4.03516410357621e-06,
"loss": 0.4229,
"step": 3970
},
{
"epoch": 0.6073669954027813,
"grad_norm": 2.711467742919922,
"learning_rate": 4.009041348825383e-06,
"loss": 0.4265,
"step": 3980
},
{
"epoch": 0.6088930431299239,
"grad_norm": 3.1023402214050293,
"learning_rate": 3.982946719039911e-06,
"loss": 0.4249,
"step": 3990
},
{
"epoch": 0.6104190908570666,
"grad_norm": 2.8750693798065186,
"learning_rate": 3.956880954826435e-06,
"loss": 0.4333,
"step": 4000
},
{
"epoch": 0.6104190908570666,
"eval_loss": 0.4167136251926422,
"eval_runtime": 99.9809,
"eval_samples_per_second": 5.301,
"eval_steps_per_second": 2.651,
"step": 4000
},
{
"epoch": 0.6119451385842092,
"grad_norm": 2.5793118476867676,
"learning_rate": 3.930844795972344e-06,
"loss": 0.4512,
"step": 4010
},
{
"epoch": 0.6134711863113519,
"grad_norm": 3.219802141189575,
"learning_rate": 3.904838981424785e-06,
"loss": 0.4203,
"step": 4020
},
{
"epoch": 0.6149972340384946,
"grad_norm": 3.2610111236572266,
"learning_rate": 3.878864249269681e-06,
"loss": 0.439,
"step": 4030
},
{
"epoch": 0.6165232817656372,
"grad_norm": 2.9082117080688477,
"learning_rate": 3.852921336710794e-06,
"loss": 0.4365,
"step": 4040
},
{
"epoch": 0.6180493294927799,
"grad_norm": 3.1823573112487793,
"learning_rate": 3.827010980048787e-06,
"loss": 0.4377,
"step": 4050
},
{
"epoch": 0.6195753772199225,
"grad_norm": 2.5103094577789307,
"learning_rate": 3.801133914660344e-06,
"loss": 0.4389,
"step": 4060
},
{
"epoch": 0.6211014249470652,
"grad_norm": 2.895665168762207,
"learning_rate": 3.7752908749772865e-06,
"loss": 0.4417,
"step": 4070
},
{
"epoch": 0.6226274726742079,
"grad_norm": 3.2190115451812744,
"learning_rate": 3.749482594465733e-06,
"loss": 0.4232,
"step": 4080
},
{
"epoch": 0.6241535204013505,
"grad_norm": 2.946439504623413,
"learning_rate": 3.7237098056052816e-06,
"loss": 0.4205,
"step": 4090
},
{
"epoch": 0.6256795681284932,
"grad_norm": 2.475071907043457,
"learning_rate": 3.6979732398682223e-06,
"loss": 0.4627,
"step": 4100
},
{
"epoch": 0.6272056158556358,
"grad_norm": 3.475555181503296,
"learning_rate": 3.672273627698775e-06,
"loss": 0.4135,
"step": 4110
},
{
"epoch": 0.6287316635827785,
"grad_norm": 2.8138234615325928,
"learning_rate": 3.646611698492364e-06,
"loss": 0.4203,
"step": 4120
},
{
"epoch": 0.6302577113099213,
"grad_norm": 3.4674346446990967,
"learning_rate": 3.6209881805749025e-06,
"loss": 0.4225,
"step": 4130
},
{
"epoch": 0.6317837590370639,
"grad_norm": 2.8863394260406494,
"learning_rate": 3.59540380118214e-06,
"loss": 0.4256,
"step": 4140
},
{
"epoch": 0.6333098067642066,
"grad_norm": 2.5023999214172363,
"learning_rate": 3.5698592864390085e-06,
"loss": 0.4494,
"step": 4150
},
{
"epoch": 0.6348358544913493,
"grad_norm": 3.108616828918457,
"learning_rate": 3.544355361339017e-06,
"loss": 0.4278,
"step": 4160
},
{
"epoch": 0.6363619022184919,
"grad_norm": 3.5009264945983887,
"learning_rate": 3.51889274972368e-06,
"loss": 0.4145,
"step": 4170
},
{
"epoch": 0.6378879499456346,
"grad_norm": 4.06900691986084,
"learning_rate": 3.4934721742619714e-06,
"loss": 0.4327,
"step": 4180
},
{
"epoch": 0.6394139976727772,
"grad_norm": 3.3994603157043457,
"learning_rate": 3.46809435642981e-06,
"loss": 0.4123,
"step": 4190
},
{
"epoch": 0.6409400453999199,
"grad_norm": 2.9589715003967285,
"learning_rate": 3.442760016489586e-06,
"loss": 0.3977,
"step": 4200
},
{
"epoch": 0.6424660931270626,
"grad_norm": 2.398531436920166,
"learning_rate": 3.4174698734697177e-06,
"loss": 0.4197,
"step": 4210
},
{
"epoch": 0.6439921408542052,
"grad_norm": 3.2008326053619385,
"learning_rate": 3.3922246451442474e-06,
"loss": 0.4286,
"step": 4220
},
{
"epoch": 0.6455181885813479,
"grad_norm": 3.441654920578003,
"learning_rate": 3.3670250480124712e-06,
"loss": 0.4568,
"step": 4230
},
{
"epoch": 0.6470442363084905,
"grad_norm": 2.481886386871338,
"learning_rate": 3.3418717972785906e-06,
"loss": 0.403,
"step": 4240
},
{
"epoch": 0.6485702840356332,
"grad_norm": 2.639709949493408,
"learning_rate": 3.316765606831432e-06,
"loss": 0.4567,
"step": 4250
},
{
"epoch": 0.6500963317627759,
"grad_norm": 2.857813596725464,
"learning_rate": 3.2917071892241714e-06,
"loss": 0.4147,
"step": 4260
},
{
"epoch": 0.6516223794899185,
"grad_norm": 2.872697353363037,
"learning_rate": 3.266697255654123e-06,
"loss": 0.4138,
"step": 4270
},
{
"epoch": 0.6531484272170612,
"grad_norm": 3.090141773223877,
"learning_rate": 3.2417365159425383e-06,
"loss": 0.4238,
"step": 4280
},
{
"epoch": 0.6546744749442038,
"grad_norm": 3.7141196727752686,
"learning_rate": 3.216825678514478e-06,
"loss": 0.4198,
"step": 4290
},
{
"epoch": 0.6562005226713465,
"grad_norm": 2.859290838241577,
"learning_rate": 3.1919654503786935e-06,
"loss": 0.4159,
"step": 4300
},
{
"epoch": 0.6577265703984893,
"grad_norm": 3.016757011413574,
"learning_rate": 3.1671565371075687e-06,
"loss": 0.4218,
"step": 4310
},
{
"epoch": 0.6592526181256319,
"grad_norm": 2.7048025131225586,
"learning_rate": 3.142399642817084e-06,
"loss": 0.4327,
"step": 4320
},
{
"epoch": 0.6607786658527746,
"grad_norm": 2.9763782024383545,
"learning_rate": 3.117695470146844e-06,
"loss": 0.4071,
"step": 4330
},
{
"epoch": 0.6623047135799172,
"grad_norm": 2.885979652404785,
"learning_rate": 3.0930447202401303e-06,
"loss": 0.4279,
"step": 4340
},
{
"epoch": 0.6638307613070599,
"grad_norm": 2.6588382720947266,
"learning_rate": 3.0684480927240057e-06,
"loss": 0.4199,
"step": 4350
},
{
"epoch": 0.6653568090342026,
"grad_norm": 3.2519760131835938,
"learning_rate": 3.0439062856894463e-06,
"loss": 0.4067,
"step": 4360
},
{
"epoch": 0.6668828567613452,
"grad_norm": 5.050004482269287,
"learning_rate": 3.0194199956715443e-06,
"loss": 0.4256,
"step": 4370
},
{
"epoch": 0.6684089044884879,
"grad_norm": 2.7873661518096924,
"learning_rate": 2.994989917629726e-06,
"loss": 0.4203,
"step": 4380
},
{
"epoch": 0.6699349522156306,
"grad_norm": 2.685523509979248,
"learning_rate": 2.9706167449280404e-06,
"loss": 0.4546,
"step": 4390
},
{
"epoch": 0.6714609999427732,
"grad_norm": 2.6410155296325684,
"learning_rate": 2.9463011693154643e-06,
"loss": 0.4257,
"step": 4400
},
{
"epoch": 0.6729870476699159,
"grad_norm": 2.6280877590179443,
"learning_rate": 2.9220438809062855e-06,
"loss": 0.4154,
"step": 4410
},
{
"epoch": 0.6745130953970585,
"grad_norm": 2.6469454765319824,
"learning_rate": 2.897845568160508e-06,
"loss": 0.4245,
"step": 4420
},
{
"epoch": 0.6760391431242012,
"grad_norm": 3.425985097885132,
"learning_rate": 2.873706917864314e-06,
"loss": 0.4173,
"step": 4430
},
{
"epoch": 0.6775651908513439,
"grad_norm": 2.8293681144714355,
"learning_rate": 2.8496286151105644e-06,
"loss": 0.4337,
"step": 4440
},
{
"epoch": 0.6790912385784865,
"grad_norm": 2.9185335636138916,
"learning_rate": 2.825611343279374e-06,
"loss": 0.4132,
"step": 4450
},
{
"epoch": 0.6806172863056292,
"grad_norm": 2.4097468852996826,
"learning_rate": 2.801655784018696e-06,
"loss": 0.4206,
"step": 4460
},
{
"epoch": 0.6821433340327718,
"grad_norm": 2.4502408504486084,
"learning_rate": 2.777762617224985e-06,
"loss": 0.4192,
"step": 4470
},
{
"epoch": 0.6836693817599145,
"grad_norm": 2.847097873687744,
"learning_rate": 2.7539325210239e-06,
"loss": 0.4347,
"step": 4480
},
{
"epoch": 0.6851954294870573,
"grad_norm": 3.346667766571045,
"learning_rate": 2.730166171751056e-06,
"loss": 0.4328,
"step": 4490
},
{
"epoch": 0.6867214772141998,
"grad_norm": 2.8015189170837402,
"learning_rate": 2.706464243932836e-06,
"loss": 0.4153,
"step": 4500
},
{
"epoch": 0.6867214772141998,
"eval_loss": 0.40170425176620483,
"eval_runtime": 99.9566,
"eval_samples_per_second": 5.302,
"eval_steps_per_second": 2.651,
"step": 4500
},
{
"epoch": 0.6882475249413426,
"grad_norm": 3.115753173828125,
"learning_rate": 2.6828274102672292e-06,
"loss": 0.4192,
"step": 4510
},
{
"epoch": 0.6897735726684852,
"grad_norm": 3.2326269149780273,
"learning_rate": 2.6592563416047616e-06,
"loss": 0.4203,
"step": 4520
},
{
"epoch": 0.6912996203956279,
"grad_norm": 2.6536991596221924,
"learning_rate": 2.6357517069294397e-06,
"loss": 0.4023,
"step": 4530
},
{
"epoch": 0.6928256681227706,
"grad_norm": 3.132383346557617,
"learning_rate": 2.6123141733397695e-06,
"loss": 0.4251,
"step": 4540
},
{
"epoch": 0.6943517158499132,
"grad_norm": 2.5857174396514893,
"learning_rate": 2.5889444060298217e-06,
"loss": 0.3893,
"step": 4550
},
{
"epoch": 0.6958777635770559,
"grad_norm": 2.402454137802124,
"learning_rate": 2.5656430682703547e-06,
"loss": 0.3777,
"step": 4560
},
{
"epoch": 0.6974038113041985,
"grad_norm": 3.208631992340088,
"learning_rate": 2.5424108213899902e-06,
"loss": 0.4077,
"step": 4570
},
{
"epoch": 0.6989298590313412,
"grad_norm": 3.2448372840881348,
"learning_rate": 2.5192483247564393e-06,
"loss": 0.4218,
"step": 4580
},
{
"epoch": 0.7004559067584839,
"grad_norm": 2.6008667945861816,
"learning_rate": 2.496156235757792e-06,
"loss": 0.4466,
"step": 4590
},
{
"epoch": 0.7019819544856265,
"grad_norm": 2.548492431640625,
"learning_rate": 2.47313520978386e-06,
"loss": 0.3949,
"step": 4600
},
{
"epoch": 0.7035080022127692,
"grad_norm": 2.979745626449585,
"learning_rate": 2.4501859002075713e-06,
"loss": 0.4244,
"step": 4610
},
{
"epoch": 0.7050340499399119,
"grad_norm": 2.696276903152466,
"learning_rate": 2.4273089583664376e-06,
"loss": 0.4144,
"step": 4620
},
{
"epoch": 0.7065600976670545,
"grad_norm": 2.8598382472991943,
"learning_rate": 2.404505033544048e-06,
"loss": 0.391,
"step": 4630
},
{
"epoch": 0.7080861453941972,
"grad_norm": 3.1868958473205566,
"learning_rate": 2.381774772951666e-06,
"loss": 0.4227,
"step": 4640
},
{
"epoch": 0.7096121931213398,
"grad_norm": 3.3258581161499023,
"learning_rate": 2.359118821709842e-06,
"loss": 0.4306,
"step": 4650
},
{
"epoch": 0.7111382408484825,
"grad_norm": 2.390016794204712,
"learning_rate": 2.3365378228301107e-06,
"loss": 0.4245,
"step": 4660
},
{
"epoch": 0.7126642885756252,
"grad_norm": 2.966630220413208,
"learning_rate": 2.314032417196742e-06,
"loss": 0.4135,
"step": 4670
},
{
"epoch": 0.7141903363027678,
"grad_norm": 3.0981130599975586,
"learning_rate": 2.2916032435485477e-06,
"loss": 0.4195,
"step": 4680
},
{
"epoch": 0.7157163840299106,
"grad_norm": 2.610236644744873,
"learning_rate": 2.269250938460762e-06,
"loss": 0.4101,
"step": 4690
},
{
"epoch": 0.7172424317570532,
"grad_norm": 2.589944839477539,
"learning_rate": 2.246976136326963e-06,
"loss": 0.4167,
"step": 4700
},
{
"epoch": 0.7187684794841959,
"grad_norm": 2.9684040546417236,
"learning_rate": 2.2247794693410746e-06,
"loss": 0.3946,
"step": 4710
},
{
"epoch": 0.7202945272113386,
"grad_norm": 2.7830798625946045,
"learning_rate": 2.202661567479423e-06,
"loss": 0.4112,
"step": 4720
},
{
"epoch": 0.7218205749384812,
"grad_norm": 3.1135716438293457,
"learning_rate": 2.180623058482853e-06,
"loss": 0.4371,
"step": 4730
},
{
"epoch": 0.7233466226656239,
"grad_norm": 2.432995557785034,
"learning_rate": 2.158664567838924e-06,
"loss": 0.3921,
"step": 4740
},
{
"epoch": 0.7248726703927665,
"grad_norm": 2.704394817352295,
"learning_rate": 2.136786718764135e-06,
"loss": 0.4117,
"step": 4750
},
{
"epoch": 0.7263987181199092,
"grad_norm": 3.2139337062835693,
"learning_rate": 2.1149901321862624e-06,
"loss": 0.3888,
"step": 4760
},
{
"epoch": 0.7279247658470519,
"grad_norm": 2.8158254623413086,
"learning_rate": 2.093275426726722e-06,
"loss": 0.3953,
"step": 4770
},
{
"epoch": 0.7294508135741945,
"grad_norm": 2.6510894298553467,
"learning_rate": 2.0716432186830064e-06,
"loss": 0.4003,
"step": 4780
},
{
"epoch": 0.7309768613013372,
"grad_norm": 3.2726964950561523,
"learning_rate": 2.0500941220112153e-06,
"loss": 0.4097,
"step": 4790
},
{
"epoch": 0.7325029090284799,
"grad_norm": 2.590909242630005,
"learning_rate": 2.0286287483086046e-06,
"loss": 0.4016,
"step": 4800
},
{
"epoch": 0.7340289567556225,
"grad_norm": 2.8233532905578613,
"learning_rate": 2.007247706796254e-06,
"loss": 0.4012,
"step": 4810
},
{
"epoch": 0.7355550044827652,
"grad_norm": 2.646611452102661,
"learning_rate": 1.985951604301746e-06,
"loss": 0.402,
"step": 4820
},
{
"epoch": 0.7370810522099078,
"grad_norm": 2.817006826400757,
"learning_rate": 1.9647410452419763e-06,
"loss": 0.4361,
"step": 4830
},
{
"epoch": 0.7386070999370505,
"grad_norm": 2.7627429962158203,
"learning_rate": 1.943616631605973e-06,
"loss": 0.4032,
"step": 4840
},
{
"epoch": 0.7401331476641932,
"grad_norm": 2.847055673599243,
"learning_rate": 1.922578962937826e-06,
"loss": 0.3871,
"step": 4850
},
{
"epoch": 0.7416591953913358,
"grad_norm": 3.007472515106201,
"learning_rate": 1.9016286363196656e-06,
"loss": 0.3938,
"step": 4860
},
{
"epoch": 0.7431852431184786,
"grad_norm": 3.840334177017212,
"learning_rate": 1.8807662463547156e-06,
"loss": 0.416,
"step": 4870
},
{
"epoch": 0.7447112908456212,
"grad_norm": 2.870105504989624,
"learning_rate": 1.8599923851504237e-06,
"loss": 0.4244,
"step": 4880
},
{
"epoch": 0.7462373385727639,
"grad_norm": 2.780932664871216,
"learning_rate": 1.8393076423016493e-06,
"loss": 0.4187,
"step": 4890
},
{
"epoch": 0.7477633862999066,
"grad_norm": 3.142162561416626,
"learning_rate": 1.8187126048739284e-06,
"loss": 0.3896,
"step": 4900
},
{
"epoch": 0.7492894340270492,
"grad_norm": 2.7128045558929443,
"learning_rate": 1.7982078573868245e-06,
"loss": 0.4062,
"step": 4910
},
{
"epoch": 0.7508154817541919,
"grad_norm": 2.843827247619629,
"learning_rate": 1.7777939817973238e-06,
"loss": 0.4194,
"step": 4920
},
{
"epoch": 0.7523415294813345,
"grad_norm": 2.7338247299194336,
"learning_rate": 1.7574715574833324e-06,
"loss": 0.4018,
"step": 4930
},
{
"epoch": 0.7538675772084772,
"grad_norm": 2.4693098068237305,
"learning_rate": 1.7372411612272149e-06,
"loss": 0.4073,
"step": 4940
},
{
"epoch": 0.7553936249356199,
"grad_norm": 2.3850250244140625,
"learning_rate": 1.7171033671994418e-06,
"loss": 0.3987,
"step": 4950
},
{
"epoch": 0.7569196726627625,
"grad_norm": 2.474433183670044,
"learning_rate": 1.6970587469422889e-06,
"loss": 0.4142,
"step": 4960
},
{
"epoch": 0.7584457203899052,
"grad_norm": 2.722198247909546,
"learning_rate": 1.6771078693536075e-06,
"loss": 0.3817,
"step": 4970
},
{
"epoch": 0.7599717681170478,
"grad_norm": 2.9978530406951904,
"learning_rate": 1.657251300670688e-06,
"loss": 0.4322,
"step": 4980
},
{
"epoch": 0.7614978158441905,
"grad_norm": 2.862351179122925,
"learning_rate": 1.6374896044541854e-06,
"loss": 0.4025,
"step": 4990
},
{
"epoch": 0.7630238635713332,
"grad_norm": 3.3892769813537598,
"learning_rate": 1.6178233415721228e-06,
"loss": 0.3973,
"step": 5000
},
{
"epoch": 0.7630238635713332,
"eval_loss": 0.39075401425361633,
"eval_runtime": 99.9532,
"eval_samples_per_second": 5.302,
"eval_steps_per_second": 2.651,
"step": 5000
},
{
"epoch": 0.7645499112984758,
"grad_norm": 3.2107391357421875,
"learning_rate": 1.5982530701839815e-06,
"loss": 0.4204,
"step": 5010
},
{
"epoch": 0.7660759590256185,
"grad_norm": 2.611640691757202,
"learning_rate": 1.5787793457248425e-06,
"loss": 0.3878,
"step": 5020
},
{
"epoch": 0.7676020067527612,
"grad_norm": 3.0495364665985107,
"learning_rate": 1.5594027208896433e-06,
"loss": 0.4075,
"step": 5030
},
{
"epoch": 0.7691280544799038,
"grad_norm": 3.080049991607666,
"learning_rate": 1.5401237456174755e-06,
"loss": 0.4277,
"step": 5040
},
{
"epoch": 0.7706541022070466,
"grad_norm": 2.9949750900268555,
"learning_rate": 1.5209429670759874e-06,
"loss": 0.4008,
"step": 5050
},
{
"epoch": 0.7721801499341892,
"grad_norm": 2.866619348526001,
"learning_rate": 1.5018609296458425e-06,
"loss": 0.3935,
"step": 5060
},
{
"epoch": 0.7737061976613319,
"grad_norm": 2.5663259029388428,
"learning_rate": 1.4828781749052807e-06,
"loss": 0.3999,
"step": 5070
},
{
"epoch": 0.7752322453884746,
"grad_norm": 2.572970151901245,
"learning_rate": 1.4639952416147457e-06,
"loss": 0.4095,
"step": 5080
},
{
"epoch": 0.7767582931156172,
"grad_norm": 2.68562650680542,
"learning_rate": 1.445212665701583e-06,
"loss": 0.419,
"step": 5090
},
{
"epoch": 0.7782843408427599,
"grad_norm": 3.246164321899414,
"learning_rate": 1.426530980244848e-06,
"loss": 0.4138,
"step": 5100
},
{
"epoch": 0.7798103885699025,
"grad_norm": 3.2083702087402344,
"learning_rate": 1.407950715460159e-06,
"loss": 0.4208,
"step": 5110
},
{
"epoch": 0.7813364362970452,
"grad_norm": 3.3285470008850098,
"learning_rate": 1.389472398684658e-06,
"loss": 0.3787,
"step": 5120
},
{
"epoch": 0.7828624840241879,
"grad_norm": 2.917363405227661,
"learning_rate": 1.3710965543620442e-06,
"loss": 0.4057,
"step": 5130
},
{
"epoch": 0.7843885317513305,
"grad_norm": 4.440506935119629,
"learning_rate": 1.3528237040276825e-06,
"loss": 0.3901,
"step": 5140
},
{
"epoch": 0.7859145794784732,
"grad_norm": 4.1051788330078125,
"learning_rate": 1.3346543662938132e-06,
"loss": 0.4323,
"step": 5150
},
{
"epoch": 0.7874406272056158,
"grad_norm": 2.771667242050171,
"learning_rate": 1.316589056834821e-06,
"loss": 0.4127,
"step": 5160
},
{
"epoch": 0.7889666749327585,
"grad_norm": 2.9452033042907715,
"learning_rate": 1.2986282883726065e-06,
"loss": 0.4235,
"step": 5170
},
{
"epoch": 0.7904927226599012,
"grad_norm": 3.01763916015625,
"learning_rate": 1.2807725706620317e-06,
"loss": 0.4059,
"step": 5180
},
{
"epoch": 0.7920187703870438,
"grad_norm": 3.0742299556732178,
"learning_rate": 1.2630224104764516e-06,
"loss": 0.416,
"step": 5190
},
{
"epoch": 0.7935448181141865,
"grad_norm": 2.9382846355438232,
"learning_rate": 1.2453783115933387e-06,
"loss": 0.3976,
"step": 5200
},
{
"epoch": 0.7950708658413291,
"grad_norm": 2.8469552993774414,
"learning_rate": 1.2278407747799687e-06,
"loss": 0.407,
"step": 5210
},
{
"epoch": 0.7965969135684718,
"grad_norm": 2.8619937896728516,
"learning_rate": 1.2104102977792282e-06,
"loss": 0.3873,
"step": 5220
},
{
"epoch": 0.7981229612956146,
"grad_norm": 3.257063388824463,
"learning_rate": 1.1930873752954725e-06,
"loss": 0.3868,
"step": 5230
},
{
"epoch": 0.7996490090227572,
"grad_norm": 3.016730785369873,
"learning_rate": 1.1758724989804908e-06,
"loss": 0.4081,
"step": 5240
},
{
"epoch": 0.8011750567498999,
"grad_norm": 3.1905858516693115,
"learning_rate": 1.1587661574195536e-06,
"loss": 0.3895,
"step": 5250
},
{
"epoch": 0.8027011044770426,
"grad_norm": 2.977105140686035,
"learning_rate": 1.1417688361175422e-06,
"loss": 0.4028,
"step": 5260
},
{
"epoch": 0.8042271522041852,
"grad_norm": 2.6317853927612305,
"learning_rate": 1.1248810174851755e-06,
"loss": 0.377,
"step": 5270
},
{
"epoch": 0.8057531999313279,
"grad_norm": 3.0427422523498535,
"learning_rate": 1.1081031808253096e-06,
"loss": 0.3763,
"step": 5280
},
{
"epoch": 0.8072792476584705,
"grad_norm": 3.044510841369629,
"learning_rate": 1.0914358023193428e-06,
"loss": 0.382,
"step": 5290
},
{
"epoch": 0.8088052953856132,
"grad_norm": 2.3702402114868164,
"learning_rate": 1.0748793550136949e-06,
"loss": 0.4003,
"step": 5300
},
{
"epoch": 0.8103313431127559,
"grad_norm": 2.994016647338867,
"learning_rate": 1.0584343088063837e-06,
"loss": 0.3966,
"step": 5310
},
{
"epoch": 0.8118573908398985,
"grad_norm": 3.0523016452789307,
"learning_rate": 1.0421011304336932e-06,
"loss": 0.4129,
"step": 5320
},
{
"epoch": 0.8133834385670412,
"grad_norm": 3.0171058177948,
"learning_rate": 1.0258802834569137e-06,
"loss": 0.3687,
"step": 5330
},
{
"epoch": 0.8149094862941838,
"grad_norm": 2.748992681503296,
"learning_rate": 1.0097722282492023e-06,
"loss": 0.3936,
"step": 5340
},
{
"epoch": 0.8164355340213265,
"grad_norm": 2.96637225151062,
"learning_rate": 9.93777421982503e-07,
"loss": 0.4234,
"step": 5350
},
{
"epoch": 0.8179615817484692,
"grad_norm": 3.161268949508667,
"learning_rate": 9.778963186145796e-07,
"loss": 0.3938,
"step": 5360
},
{
"epoch": 0.8194876294756118,
"grad_norm": 2.630280017852783,
"learning_rate": 9.621293688761263e-07,
"loss": 0.3739,
"step": 5370
},
{
"epoch": 0.8210136772027545,
"grad_norm": 3.382284641265869,
"learning_rate": 9.464770202579787e-07,
"loss": 0.3918,
"step": 5380
},
{
"epoch": 0.8225397249298971,
"grad_norm": 3.014678716659546,
"learning_rate": 9.309397169984158e-07,
"loss": 0.4185,
"step": 5390
},
{
"epoch": 0.8240657726570398,
"grad_norm": 2.7957093715667725,
"learning_rate": 9.155179000705399e-07,
"loss": 0.3897,
"step": 5400
},
{
"epoch": 0.8255918203841826,
"grad_norm": 3.4030938148498535,
"learning_rate": 9.00212007169779e-07,
"loss": 0.3989,
"step": 5410
},
{
"epoch": 0.8271178681113251,
"grad_norm": 3.2532286643981934,
"learning_rate": 8.850224727014489e-07,
"loss": 0.4053,
"step": 5420
},
{
"epoch": 0.8286439158384679,
"grad_norm": 3.2262065410614014,
"learning_rate": 8.699497277684326e-07,
"loss": 0.413,
"step": 5430
},
{
"epoch": 0.8301699635656106,
"grad_norm": 3.0282540321350098,
"learning_rate": 8.549942001589406e-07,
"loss": 0.3965,
"step": 5440
},
{
"epoch": 0.8316960112927532,
"grad_norm": 2.6417813301086426,
"learning_rate": 8.401563143343721e-07,
"loss": 0.4071,
"step": 5450
},
{
"epoch": 0.8332220590198959,
"grad_norm": 3.082578182220459,
"learning_rate": 8.254364914172697e-07,
"loss": 0.3975,
"step": 5460
},
{
"epoch": 0.8347481067470385,
"grad_norm": 3.2389848232269287,
"learning_rate": 8.108351491793615e-07,
"loss": 0.404,
"step": 5470
},
{
"epoch": 0.8362741544741812,
"grad_norm": 3.0996053218841553,
"learning_rate": 7.963527020297085e-07,
"loss": 0.3817,
"step": 5480
},
{
"epoch": 0.8378002022013239,
"grad_norm": 2.964110851287842,
"learning_rate": 7.819895610029433e-07,
"loss": 0.3821,
"step": 5490
},
{
"epoch": 0.8393262499284665,
"grad_norm": 3.0734763145446777,
"learning_rate": 7.677461337476005e-07,
"loss": 0.4168,
"step": 5500
},
{
"epoch": 0.8393262499284665,
"eval_loss": 0.3822996914386749,
"eval_runtime": 99.942,
"eval_samples_per_second": 5.303,
"eval_steps_per_second": 2.652,
"step": 5500
},
{
"epoch": 0.8408522976556092,
"grad_norm": 3.2727982997894287,
"learning_rate": 7.536228245145554e-07,
"loss": 0.3812,
"step": 5510
},
{
"epoch": 0.8423783453827518,
"grad_norm": 2.7713711261749268,
"learning_rate": 7.396200341455356e-07,
"loss": 0.3969,
"step": 5520
},
{
"epoch": 0.8439043931098945,
"grad_norm": 3.2900307178497314,
"learning_rate": 7.25738160061763e-07,
"loss": 0.4094,
"step": 5530
},
{
"epoch": 0.8454304408370372,
"grad_norm": 2.305082321166992,
"learning_rate": 7.119775962526593e-07,
"loss": 0.4075,
"step": 5540
},
{
"epoch": 0.8469564885641798,
"grad_norm": 2.9151320457458496,
"learning_rate": 6.983387332646718e-07,
"loss": 0.3881,
"step": 5550
},
{
"epoch": 0.8484825362913225,
"grad_norm": 2.791234254837036,
"learning_rate": 6.848219581901866e-07,
"loss": 0.3946,
"step": 5560
},
{
"epoch": 0.8500085840184651,
"grad_norm": 3.1531620025634766,
"learning_rate": 6.714276546565423e-07,
"loss": 0.4137,
"step": 5570
},
{
"epoch": 0.8515346317456078,
"grad_norm": 2.58180570602417,
"learning_rate": 6.581562028151451e-07,
"loss": 0.4073,
"step": 5580
},
{
"epoch": 0.8530606794727505,
"grad_norm": 2.863215208053589,
"learning_rate": 6.450079793306735e-07,
"loss": 0.4321,
"step": 5590
},
{
"epoch": 0.8545867271998931,
"grad_norm": 2.909364700317383,
"learning_rate": 6.319833573703938e-07,
"loss": 0.3989,
"step": 5600
},
{
"epoch": 0.8561127749270359,
"grad_norm": 2.7926652431488037,
"learning_rate": 6.190827065935645e-07,
"loss": 0.404,
"step": 5610
},
{
"epoch": 0.8576388226541785,
"grad_norm": 2.7227327823638916,
"learning_rate": 6.06306393140948e-07,
"loss": 0.3815,
"step": 5620
},
{
"epoch": 0.8591648703813212,
"grad_norm": 2.705726385116577,
"learning_rate": 5.936547796244207e-07,
"loss": 0.3741,
"step": 5630
},
{
"epoch": 0.8606909181084639,
"grad_norm": 3.357189178466797,
"learning_rate": 5.811282251166716e-07,
"loss": 0.398,
"step": 5640
},
{
"epoch": 0.8622169658356065,
"grad_norm": 2.5761477947235107,
"learning_rate": 5.687270851410265e-07,
"loss": 0.396,
"step": 5650
},
{
"epoch": 0.8637430135627492,
"grad_norm": 4.029236316680908,
"learning_rate": 5.564517116613433e-07,
"loss": 0.4209,
"step": 5660
},
{
"epoch": 0.8652690612898919,
"grad_norm": 3.4346041679382324,
"learning_rate": 5.443024530720326e-07,
"loss": 0.3933,
"step": 5670
},
{
"epoch": 0.8667951090170345,
"grad_norm": 2.5683741569519043,
"learning_rate": 5.32279654188163e-07,
"loss": 0.3668,
"step": 5680
},
{
"epoch": 0.8683211567441772,
"grad_norm": 2.6289405822753906,
"learning_rate": 5.203836562356795e-07,
"loss": 0.3781,
"step": 5690
},
{
"epoch": 0.8698472044713198,
"grad_norm": 2.9439804553985596,
"learning_rate": 5.086147968417199e-07,
"loss": 0.4096,
"step": 5700
},
{
"epoch": 0.8713732521984625,
"grad_norm": 2.9020564556121826,
"learning_rate": 4.969734100250229e-07,
"loss": 0.413,
"step": 5710
},
{
"epoch": 0.8728992999256052,
"grad_norm": 3.4045605659484863,
"learning_rate": 4.854598261864618e-07,
"loss": 0.3765,
"step": 5720
},
{
"epoch": 0.8744253476527478,
"grad_norm": 3.001821517944336,
"learning_rate": 4.74074372099656e-07,
"loss": 0.3817,
"step": 5730
},
{
"epoch": 0.8759513953798905,
"grad_norm": 2.8464105129241943,
"learning_rate": 4.628173709017031e-07,
"loss": 0.3914,
"step": 5740
},
{
"epoch": 0.8774774431070331,
"grad_norm": 3.0081145763397217,
"learning_rate": 4.516891420840047e-07,
"loss": 0.4129,
"step": 5750
},
{
"epoch": 0.8790034908341758,
"grad_norm": 2.8356759548187256,
"learning_rate": 4.4069000148319885e-07,
"loss": 0.3986,
"step": 5760
},
{
"epoch": 0.8805295385613185,
"grad_norm": 2.759951114654541,
"learning_rate": 4.298202612722008e-07,
"loss": 0.3763,
"step": 5770
},
{
"epoch": 0.8820555862884611,
"grad_norm": 3.221383571624756,
"learning_rate": 4.1908022995133526e-07,
"loss": 0.3885,
"step": 5780
},
{
"epoch": 0.8835816340156039,
"grad_norm": 2.8623874187469482,
"learning_rate": 4.084702123395834e-07,
"loss": 0.3748,
"step": 5790
},
{
"epoch": 0.8851076817427465,
"grad_norm": 2.868194341659546,
"learning_rate": 3.979905095659381e-07,
"loss": 0.3913,
"step": 5800
},
{
"epoch": 0.8866337294698892,
"grad_norm": 3.0048179626464844,
"learning_rate": 3.8764141906084794e-07,
"loss": 0.3843,
"step": 5810
},
{
"epoch": 0.8881597771970319,
"grad_norm": 2.4534389972686768,
"learning_rate": 3.7742323454778296e-07,
"loss": 0.3878,
"step": 5820
},
{
"epoch": 0.8896858249241745,
"grad_norm": 2.534501552581787,
"learning_rate": 3.6733624603489e-07,
"loss": 0.3899,
"step": 5830
},
{
"epoch": 0.8912118726513172,
"grad_norm": 3.0121171474456787,
"learning_rate": 3.5738073980677355e-07,
"loss": 0.407,
"step": 5840
},
{
"epoch": 0.8927379203784598,
"grad_norm": 2.708857774734497,
"learning_rate": 3.475569984163596e-07,
"loss": 0.4173,
"step": 5850
},
{
"epoch": 0.8942639681056025,
"grad_norm": 3.2787721157073975,
"learning_rate": 3.378653006768823e-07,
"loss": 0.3978,
"step": 5860
},
{
"epoch": 0.8957900158327452,
"grad_norm": 2.9193058013916016,
"learning_rate": 3.2830592165396913e-07,
"loss": 0.3986,
"step": 5870
},
{
"epoch": 0.8973160635598878,
"grad_norm": 2.758446455001831,
"learning_rate": 3.188791326578339e-07,
"loss": 0.3574,
"step": 5880
},
{
"epoch": 0.8988421112870305,
"grad_norm": 2.8080217838287354,
"learning_rate": 3.0958520123557767e-07,
"loss": 0.3749,
"step": 5890
},
{
"epoch": 0.9003681590141732,
"grad_norm": 3.7036428451538086,
"learning_rate": 3.0042439116359455e-07,
"loss": 0.3887,
"step": 5900
},
{
"epoch": 0.9018942067413158,
"grad_norm": 3.0411696434020996,
"learning_rate": 2.9139696244008255e-07,
"loss": 0.3961,
"step": 5910
},
{
"epoch": 0.9034202544684585,
"grad_norm": 2.942746877670288,
"learning_rate": 2.8250317127767213e-07,
"loss": 0.3875,
"step": 5920
},
{
"epoch": 0.9049463021956011,
"grad_norm": 3.230517864227295,
"learning_rate": 2.7374327009614456e-07,
"loss": 0.3981,
"step": 5930
},
{
"epoch": 0.9064723499227438,
"grad_norm": 2.7047958374023438,
"learning_rate": 2.651175075152784e-07,
"loss": 0.3996,
"step": 5940
},
{
"epoch": 0.9079983976498865,
"grad_norm": 3.278867244720459,
"learning_rate": 2.5662612834778164e-07,
"loss": 0.3625,
"step": 5950
},
{
"epoch": 0.9095244453770291,
"grad_norm": 2.955040693283081,
"learning_rate": 2.4826937359235305e-07,
"loss": 0.3662,
"step": 5960
},
{
"epoch": 0.9110504931041719,
"grad_norm": 2.9156110286712646,
"learning_rate": 2.4004748042683933e-07,
"loss": 0.405,
"step": 5970
},
{
"epoch": 0.9125765408313145,
"grad_norm": 3.318800926208496,
"learning_rate": 2.3196068220150025e-07,
"loss": 0.4027,
"step": 5980
},
{
"epoch": 0.9141025885584572,
"grad_norm": 4.083282470703125,
"learning_rate": 2.2400920843239194e-07,
"loss": 0.3634,
"step": 5990
},
{
"epoch": 0.9156286362855999,
"grad_norm": 3.180896759033203,
"learning_rate": 2.161932847948478e-07,
"loss": 0.3918,
"step": 6000
},
{
"epoch": 0.9156286362855999,
"eval_loss": 0.37766218185424805,
"eval_runtime": 99.9689,
"eval_samples_per_second": 5.302,
"eval_steps_per_second": 2.651,
"step": 6000
},
{
"epoch": 0.9171546840127425,
"grad_norm": 3.35513973236084,
"learning_rate": 2.0851313311707532e-07,
"loss": 0.3857,
"step": 6010
},
{
"epoch": 0.9186807317398852,
"grad_norm": 2.6778345108032227,
"learning_rate": 2.0096897137386052e-07,
"loss": 0.3725,
"step": 6020
},
{
"epoch": 0.9202067794670278,
"grad_norm": 2.9898831844329834,
"learning_rate": 1.9356101368038005e-07,
"loss": 0.4051,
"step": 6030
},
{
"epoch": 0.9217328271941705,
"grad_norm": 2.924804449081421,
"learning_rate": 1.8628947028612788e-07,
"loss": 0.3742,
"step": 6040
},
{
"epoch": 0.9232588749213132,
"grad_norm": 2.871389389038086,
"learning_rate": 1.791545475689438e-07,
"loss": 0.3881,
"step": 6050
},
{
"epoch": 0.9247849226484558,
"grad_norm": 3.050419569015503,
"learning_rate": 1.721564480291571e-07,
"loss": 0.3972,
"step": 6060
},
{
"epoch": 0.9263109703755985,
"grad_norm": 3.090453863143921,
"learning_rate": 1.652953702838428e-07,
"loss": 0.3852,
"step": 6070
},
{
"epoch": 0.9278370181027412,
"grad_norm": 3.4830410480499268,
"learning_rate": 1.585715090611778e-07,
"loss": 0.3964,
"step": 6080
},
{
"epoch": 0.9293630658298838,
"grad_norm": 3.4521846771240234,
"learning_rate": 1.5198505519492368e-07,
"loss": 0.4143,
"step": 6090
},
{
"epoch": 0.9308891135570265,
"grad_norm": 3.032611131668091,
"learning_rate": 1.4553619561899935e-07,
"loss": 0.3881,
"step": 6100
},
{
"epoch": 0.9324151612841691,
"grad_norm": 3.2053749561309814,
"learning_rate": 1.3922511336218524e-07,
"loss": 0.397,
"step": 6110
},
{
"epoch": 0.9339412090113118,
"grad_norm": 3.1718320846557617,
"learning_rate": 1.330519875429237e-07,
"loss": 0.4042,
"step": 6120
},
{
"epoch": 0.9354672567384545,
"grad_norm": 2.4236302375793457,
"learning_rate": 1.2701699336423513e-07,
"loss": 0.3791,
"step": 6130
},
{
"epoch": 0.9369933044655971,
"grad_norm": 2.4723575115203857,
"learning_rate": 1.211203021087487e-07,
"loss": 0.4056,
"step": 6140
},
{
"epoch": 0.9385193521927399,
"grad_norm": 3.279250383377075,
"learning_rate": 1.1536208113383684e-07,
"loss": 0.4043,
"step": 6150
},
{
"epoch": 0.9400453999198825,
"grad_norm": 2.6275689601898193,
"learning_rate": 1.0974249386687064e-07,
"loss": 0.3986,
"step": 6160
},
{
"epoch": 0.9415714476470252,
"grad_norm": 2.8793821334838867,
"learning_rate": 1.042616998005752e-07,
"loss": 0.3891,
"step": 6170
},
{
"epoch": 0.9430974953741679,
"grad_norm": 3.2804644107818604,
"learning_rate": 9.891985448850839e-08,
"loss": 0.3963,
"step": 6180
},
{
"epoch": 0.9446235431013105,
"grad_norm": 2.7859580516815186,
"learning_rate": 9.37171095406425e-08,
"loss": 0.3832,
"step": 6190
},
{
"epoch": 0.9461495908284532,
"grad_norm": 2.705620288848877,
"learning_rate": 8.865361261906402e-08,
"loss": 0.3851,
"step": 6200
},
{
"epoch": 0.9476756385555958,
"grad_norm": 2.792065382003784,
"learning_rate": 8.372950743378128e-08,
"loss": 0.403,
"step": 6210
},
{
"epoch": 0.9492016862827385,
"grad_norm": 3.0607993602752686,
"learning_rate": 7.894493373864332e-08,
"loss": 0.4104,
"step": 6220
},
{
"epoch": 0.9507277340098812,
"grad_norm": 3.5412745475769043,
"learning_rate": 7.430002732737973e-08,
"loss": 0.3937,
"step": 6230
},
{
"epoch": 0.9522537817370238,
"grad_norm": 3.418365955352783,
"learning_rate": 6.979492002974098e-08,
"loss": 0.3954,
"step": 6240
},
{
"epoch": 0.9537798294641665,
"grad_norm": 2.9009461402893066,
"learning_rate": 6.542973970775912e-08,
"loss": 0.3904,
"step": 6250
},
{
"epoch": 0.9553058771913091,
"grad_norm": 2.7992494106292725,
"learning_rate": 6.120461025211744e-08,
"loss": 0.3897,
"step": 6260
},
{
"epoch": 0.9568319249184518,
"grad_norm": 2.860257625579834,
"learning_rate": 5.711965157863597e-08,
"loss": 0.3673,
"step": 6270
},
{
"epoch": 0.9583579726455945,
"grad_norm": 2.605058431625366,
"learning_rate": 5.317497962486984e-08,
"loss": 0.3961,
"step": 6280
},
{
"epoch": 0.9598840203727371,
"grad_norm": 3.152754545211792,
"learning_rate": 4.937070634681185e-08,
"loss": 0.3884,
"step": 6290
},
{
"epoch": 0.9614100680998798,
"grad_norm": 2.6811258792877197,
"learning_rate": 4.570693971572393e-08,
"loss": 0.3968,
"step": 6300
},
{
"epoch": 0.9629361158270225,
"grad_norm": 3.2074809074401855,
"learning_rate": 4.218378371506515e-08,
"loss": 0.4024,
"step": 6310
},
{
"epoch": 0.9644621635541651,
"grad_norm": 3.393841028213501,
"learning_rate": 3.880133833754518e-08,
"loss": 0.3852,
"step": 6320
},
{
"epoch": 0.9659882112813079,
"grad_norm": 3.0189993381500244,
"learning_rate": 3.555969958228489e-08,
"loss": 0.3804,
"step": 6330
},
{
"epoch": 0.9675142590084505,
"grad_norm": 2.8455655574798584,
"learning_rate": 3.245895945209132e-08,
"loss": 0.4228,
"step": 6340
},
{
"epoch": 0.9690403067355932,
"grad_norm": 3.668877601623535,
"learning_rate": 2.949920595084643e-08,
"loss": 0.369,
"step": 6350
},
{
"epoch": 0.9705663544627359,
"grad_norm": 3.0246024131774902,
"learning_rate": 2.6680523081011878e-08,
"loss": 0.3899,
"step": 6360
},
{
"epoch": 0.9720924021898785,
"grad_norm": 2.8767731189727783,
"learning_rate": 2.4002990841239804e-08,
"loss": 0.393,
"step": 6370
},
{
"epoch": 0.9736184499170212,
"grad_norm": 3.3473153114318848,
"learning_rate": 2.1466685224107995e-08,
"loss": 0.3722,
"step": 6380
},
{
"epoch": 0.9751444976441638,
"grad_norm": 2.7829878330230713,
"learning_rate": 1.9071678213959388e-08,
"loss": 0.3788,
"step": 6390
},
{
"epoch": 0.9766705453713065,
"grad_norm": 2.5925357341766357,
"learning_rate": 1.6818037784860908e-08,
"loss": 0.3862,
"step": 6400
},
{
"epoch": 0.9781965930984492,
"grad_norm": 3.0968399047851562,
"learning_rate": 1.4705827898672254e-08,
"loss": 0.3738,
"step": 6410
},
{
"epoch": 0.9797226408255918,
"grad_norm": 3.40238881111145,
"learning_rate": 1.2735108503232896e-08,
"loss": 0.4121,
"step": 6420
},
{
"epoch": 0.9812486885527345,
"grad_norm": 2.9040355682373047,
"learning_rate": 1.0905935530658996e-08,
"loss": 0.4003,
"step": 6430
},
{
"epoch": 0.9827747362798771,
"grad_norm": 2.6801180839538574,
"learning_rate": 9.218360895758006e-09,
"loss": 0.3973,
"step": 6440
},
{
"epoch": 0.9843007840070198,
"grad_norm": 2.591391086578369,
"learning_rate": 7.672432494551518e-09,
"loss": 0.3936,
"step": 6450
},
{
"epoch": 0.9858268317341625,
"grad_norm": 2.7946035861968994,
"learning_rate": 6.268194202920241e-09,
"loss": 0.3641,
"step": 6460
},
{
"epoch": 0.9873528794613051,
"grad_norm": 4.159729480743408,
"learning_rate": 5.005685875354993e-09,
"loss": 0.3685,
"step": 6470
},
{
"epoch": 0.9888789271884478,
"grad_norm": 2.7406532764434814,
"learning_rate": 3.884943343829273e-09,
"loss": 0.4149,
"step": 6480
},
{
"epoch": 0.9904049749155904,
"grad_norm": 3.1383161544799805,
"learning_rate": 2.9059984167778553e-09,
"loss": 0.3814,
"step": 6490
},
{
"epoch": 0.9919310226427331,
"grad_norm": 2.687572956085205,
"learning_rate": 2.0688788781980664e-09,
"loss": 0.3942,
"step": 6500
},
{
"epoch": 0.9919310226427331,
"eval_loss": 0.377034068107605,
"eval_runtime": 100.214,
"eval_samples_per_second": 5.289,
"eval_steps_per_second": 2.644,
"step": 6500
},
{
"epoch": 0.9934570703698758,
"grad_norm": 2.9962236881256104,
"learning_rate": 1.3736084868598564e-09,
"loss": 0.3747,
"step": 6510
},
{
"epoch": 0.9949831180970184,
"grad_norm": 2.946183204650879,
"learning_rate": 8.202069756302333e-10,
"loss": 0.3763,
"step": 6520
},
{
"epoch": 0.9965091658241612,
"grad_norm": 3.0049428939819336,
"learning_rate": 4.0869005091481727e-10,
"loss": 0.4033,
"step": 6530
},
{
"epoch": 0.9980352135513039,
"grad_norm": 3.199441432952881,
"learning_rate": 1.3906939221042247e-10,
"loss": 0.3847,
"step": 6540
},
{
"epoch": 0.9995612612784465,
"grad_norm": 2.7879321575164795,
"learning_rate": 1.1352651776985746e-11,
"loss": 0.4005,
"step": 6550
},
{
"epoch": 1.0,
"step": 6553,
"total_flos": 2.1597904813481902e+18,
"train_loss": 0.4742196609496197,
"train_runtime": 41833.5481,
"train_samples_per_second": 1.253,
"train_steps_per_second": 0.157
}
],
"logging_steps": 10,
"max_steps": 6553,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.1597904813481902e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}