qwen3-ft-test / trainer_state.json
saneowl's picture
Upload folder contents
512a213 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.483729966002914,
"eval_steps": 100,
"global_step": 17900,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0013876361617983764,
"grad_norm": 0.0,
"learning_rate": 1.3867488443759631e-06,
"loss": 1.5686,
"step": 10
},
{
"epoch": 0.0027752723235967528,
"grad_norm": 0.0,
"learning_rate": 2.9275808936825885e-06,
"loss": 1.5599,
"step": 20
},
{
"epoch": 0.004162908485395129,
"grad_norm": 0.0,
"learning_rate": 4.468412942989214e-06,
"loss": 1.6107,
"step": 30
},
{
"epoch": 0.0055505446471935055,
"grad_norm": 0.0,
"learning_rate": 6.0092449922958395e-06,
"loss": 1.5163,
"step": 40
},
{
"epoch": 0.0069381808089918826,
"grad_norm": 0.0,
"learning_rate": 7.550077041602466e-06,
"loss": 1.5544,
"step": 50
},
{
"epoch": 0.008325816970790259,
"grad_norm": 0.0,
"learning_rate": 9.090909090909091e-06,
"loss": 1.5103,
"step": 60
},
{
"epoch": 0.009713453132588635,
"grad_norm": 0.0,
"learning_rate": 1.0631741140215717e-05,
"loss": 1.5622,
"step": 70
},
{
"epoch": 0.011101089294387011,
"grad_norm": 0.0,
"learning_rate": 1.2172573189522343e-05,
"loss": 1.6018,
"step": 80
},
{
"epoch": 0.012488725456185389,
"grad_norm": 0.0,
"learning_rate": 1.371340523882897e-05,
"loss": 1.6178,
"step": 90
},
{
"epoch": 0.013876361617983765,
"grad_norm": 0.0,
"learning_rate": 1.5254237288135596e-05,
"loss": 1.4605,
"step": 100
},
{
"epoch": 0.013876361617983765,
"eval_loss": 1.5553832054138184,
"eval_runtime": 852.3731,
"eval_samples_per_second": 15.031,
"eval_steps_per_second": 1.879,
"step": 100
},
{
"epoch": 0.015263997779782141,
"grad_norm": 0.0,
"learning_rate": 1.6795069337442222e-05,
"loss": 1.6031,
"step": 110
},
{
"epoch": 0.016651633941580517,
"grad_norm": 0.0,
"learning_rate": 1.8335901386748848e-05,
"loss": 1.5871,
"step": 120
},
{
"epoch": 0.018039270103378895,
"grad_norm": 0.0,
"learning_rate": 1.987673343605547e-05,
"loss": 1.6041,
"step": 130
},
{
"epoch": 0.01942690626517727,
"grad_norm": 0.0,
"learning_rate": 2.1417565485362097e-05,
"loss": 1.5313,
"step": 140
},
{
"epoch": 0.020814542426975648,
"grad_norm": 0.0,
"learning_rate": 2.295839753466872e-05,
"loss": 1.5775,
"step": 150
},
{
"epoch": 0.022202178588774022,
"grad_norm": 0.0,
"learning_rate": 2.4499229583975346e-05,
"loss": 1.5831,
"step": 160
},
{
"epoch": 0.0235898147505724,
"grad_norm": 0.0,
"learning_rate": 2.6040061633281976e-05,
"loss": 1.5453,
"step": 170
},
{
"epoch": 0.024977450912370778,
"grad_norm": 0.0,
"learning_rate": 2.75808936825886e-05,
"loss": 1.5362,
"step": 180
},
{
"epoch": 0.026365087074169152,
"grad_norm": 0.0,
"learning_rate": 2.9121725731895228e-05,
"loss": 1.48,
"step": 190
},
{
"epoch": 0.02775272323596753,
"grad_norm": 0.0,
"learning_rate": 3.066255778120185e-05,
"loss": 1.5183,
"step": 200
},
{
"epoch": 0.02775272323596753,
"eval_loss": 1.5553832054138184,
"eval_runtime": 853.1246,
"eval_samples_per_second": 15.018,
"eval_steps_per_second": 1.878,
"step": 200
},
{
"epoch": 0.029140359397765905,
"grad_norm": 0.0,
"learning_rate": 3.2203389830508473e-05,
"loss": 1.4961,
"step": 210
},
{
"epoch": 0.030527995559564283,
"grad_norm": 0.0,
"learning_rate": 3.37442218798151e-05,
"loss": 1.5633,
"step": 220
},
{
"epoch": 0.03191563172136266,
"grad_norm": 0.0,
"learning_rate": 3.5285053929121726e-05,
"loss": 1.5646,
"step": 230
},
{
"epoch": 0.033303267883161035,
"grad_norm": 0.0,
"learning_rate": 3.682588597842835e-05,
"loss": 1.464,
"step": 240
},
{
"epoch": 0.03469090404495941,
"grad_norm": 0.0,
"learning_rate": 3.836671802773498e-05,
"loss": 1.5711,
"step": 250
},
{
"epoch": 0.03607854020675779,
"grad_norm": 0.0,
"learning_rate": 3.9907550077041604e-05,
"loss": 1.5541,
"step": 260
},
{
"epoch": 0.03746617636855616,
"grad_norm": 0.0,
"learning_rate": 4.144838212634823e-05,
"loss": 1.6075,
"step": 270
},
{
"epoch": 0.03885381253035454,
"grad_norm": 0.0,
"learning_rate": 4.298921417565486e-05,
"loss": 1.5286,
"step": 280
},
{
"epoch": 0.04024144869215292,
"grad_norm": 0.0,
"learning_rate": 4.453004622496148e-05,
"loss": 1.5678,
"step": 290
},
{
"epoch": 0.041629084853951295,
"grad_norm": 0.0,
"learning_rate": 4.607087827426811e-05,
"loss": 1.5208,
"step": 300
},
{
"epoch": 0.041629084853951295,
"eval_loss": 1.5553832054138184,
"eval_runtime": 856.7103,
"eval_samples_per_second": 14.955,
"eval_steps_per_second": 1.87,
"step": 300
},
{
"epoch": 0.04301672101574967,
"grad_norm": 0.0,
"learning_rate": 4.7611710323574735e-05,
"loss": 1.5492,
"step": 310
},
{
"epoch": 0.044404357177548044,
"grad_norm": 0.0,
"learning_rate": 4.915254237288136e-05,
"loss": 1.5481,
"step": 320
},
{
"epoch": 0.04579199333934642,
"grad_norm": 0.0,
"learning_rate": 5.069337442218799e-05,
"loss": 1.5389,
"step": 330
},
{
"epoch": 0.0471796295011448,
"grad_norm": 0.0,
"learning_rate": 5.223420647149461e-05,
"loss": 1.5258,
"step": 340
},
{
"epoch": 0.04856726566294318,
"grad_norm": 0.0,
"learning_rate": 5.377503852080123e-05,
"loss": 1.4987,
"step": 350
},
{
"epoch": 0.049954901824741556,
"grad_norm": 0.0,
"learning_rate": 5.5315870570107866e-05,
"loss": 1.5953,
"step": 360
},
{
"epoch": 0.05134253798653993,
"grad_norm": 0.0,
"learning_rate": 5.685670261941448e-05,
"loss": 1.5349,
"step": 370
},
{
"epoch": 0.052730174148338305,
"grad_norm": 0.0,
"learning_rate": 5.839753466872111e-05,
"loss": 1.5716,
"step": 380
},
{
"epoch": 0.05411781031013668,
"grad_norm": 0.0,
"learning_rate": 5.993836671802774e-05,
"loss": 1.4449,
"step": 390
},
{
"epoch": 0.05550544647193506,
"grad_norm": 0.0,
"learning_rate": 6.147919876733436e-05,
"loss": 1.5694,
"step": 400
},
{
"epoch": 0.05550544647193506,
"eval_loss": 1.5553832054138184,
"eval_runtime": 854.6963,
"eval_samples_per_second": 14.99,
"eval_steps_per_second": 1.874,
"step": 400
},
{
"epoch": 0.05689308263373344,
"grad_norm": 0.0,
"learning_rate": 6.302003081664099e-05,
"loss": 1.588,
"step": 410
},
{
"epoch": 0.05828071879553181,
"grad_norm": 0.0,
"learning_rate": 6.456086286594762e-05,
"loss": 1.5056,
"step": 420
},
{
"epoch": 0.05966835495733019,
"grad_norm": 0.0,
"learning_rate": 6.610169491525424e-05,
"loss": 1.6029,
"step": 430
},
{
"epoch": 0.061055991119128565,
"grad_norm": 0.0,
"learning_rate": 6.764252696456087e-05,
"loss": 1.54,
"step": 440
},
{
"epoch": 0.06244362728092694,
"grad_norm": 0.0,
"learning_rate": 6.918335901386748e-05,
"loss": 1.4744,
"step": 450
},
{
"epoch": 0.06383126344272531,
"grad_norm": 0.0,
"learning_rate": 7.072419106317412e-05,
"loss": 1.6046,
"step": 460
},
{
"epoch": 0.06521889960452369,
"grad_norm": 0.0,
"learning_rate": 7.226502311248075e-05,
"loss": 1.5482,
"step": 470
},
{
"epoch": 0.06660653576632207,
"grad_norm": 0.0,
"learning_rate": 7.380585516178737e-05,
"loss": 1.5734,
"step": 480
},
{
"epoch": 0.06799417192812045,
"grad_norm": 0.0,
"learning_rate": 7.534668721109399e-05,
"loss": 1.515,
"step": 490
},
{
"epoch": 0.06938180808991883,
"grad_norm": 0.0,
"learning_rate": 7.688751926040063e-05,
"loss": 1.515,
"step": 500
},
{
"epoch": 0.06938180808991883,
"eval_loss": 1.5553832054138184,
"eval_runtime": 860.2669,
"eval_samples_per_second": 14.893,
"eval_steps_per_second": 1.862,
"step": 500
},
{
"epoch": 0.0707694442517172,
"grad_norm": 0.0,
"learning_rate": 7.842835130970725e-05,
"loss": 1.5486,
"step": 510
},
{
"epoch": 0.07215708041351558,
"grad_norm": 0.0,
"learning_rate": 7.996918335901386e-05,
"loss": 1.5361,
"step": 520
},
{
"epoch": 0.07354471657531396,
"grad_norm": 0.0,
"learning_rate": 8.151001540832049e-05,
"loss": 1.7084,
"step": 530
},
{
"epoch": 0.07493235273711232,
"grad_norm": 0.0,
"learning_rate": 8.305084745762712e-05,
"loss": 1.5039,
"step": 540
},
{
"epoch": 0.0763199888989107,
"grad_norm": 0.0,
"learning_rate": 8.459167950693376e-05,
"loss": 1.5072,
"step": 550
},
{
"epoch": 0.07770762506070908,
"grad_norm": 0.0,
"learning_rate": 8.613251155624037e-05,
"loss": 1.5539,
"step": 560
},
{
"epoch": 0.07909526122250746,
"grad_norm": 0.0,
"learning_rate": 8.7673343605547e-05,
"loss": 1.5531,
"step": 570
},
{
"epoch": 0.08048289738430583,
"grad_norm": 0.0,
"learning_rate": 8.921417565485362e-05,
"loss": 1.5783,
"step": 580
},
{
"epoch": 0.08187053354610421,
"grad_norm": 0.0,
"learning_rate": 9.075500770416026e-05,
"loss": 1.584,
"step": 590
},
{
"epoch": 0.08325816970790259,
"grad_norm": 0.0,
"learning_rate": 9.229583975346687e-05,
"loss": 1.5598,
"step": 600
},
{
"epoch": 0.08325816970790259,
"eval_loss": 1.5553832054138184,
"eval_runtime": 859.4877,
"eval_samples_per_second": 14.907,
"eval_steps_per_second": 1.864,
"step": 600
},
{
"epoch": 0.08464580586970097,
"grad_norm": 0.0,
"learning_rate": 9.38366718027735e-05,
"loss": 1.5914,
"step": 610
},
{
"epoch": 0.08603344203149935,
"grad_norm": 0.0,
"learning_rate": 9.537750385208013e-05,
"loss": 1.509,
"step": 620
},
{
"epoch": 0.08742107819329771,
"grad_norm": 0.0,
"learning_rate": 9.691833590138675e-05,
"loss": 1.5504,
"step": 630
},
{
"epoch": 0.08880871435509609,
"grad_norm": 0.0,
"learning_rate": 9.845916795069338e-05,
"loss": 1.5355,
"step": 640
},
{
"epoch": 0.09019635051689447,
"grad_norm": 0.0,
"learning_rate": 0.0001,
"loss": 1.5586,
"step": 650
},
{
"epoch": 0.09158398667869284,
"grad_norm": 0.0,
"learning_rate": 0.00010154083204930662,
"loss": 1.5139,
"step": 660
},
{
"epoch": 0.09297162284049122,
"grad_norm": 0.0,
"learning_rate": 0.00010308166409861326,
"loss": 1.5743,
"step": 670
},
{
"epoch": 0.0943592590022896,
"grad_norm": 0.0,
"learning_rate": 0.00010462249614791988,
"loss": 1.5265,
"step": 680
},
{
"epoch": 0.09574689516408798,
"grad_norm": 0.0,
"learning_rate": 0.0001061633281972265,
"loss": 1.554,
"step": 690
},
{
"epoch": 0.09713453132588636,
"grad_norm": 0.0,
"learning_rate": 0.00010770416024653314,
"loss": 1.5693,
"step": 700
},
{
"epoch": 0.09713453132588636,
"eval_loss": 1.5553832054138184,
"eval_runtime": 853.3735,
"eval_samples_per_second": 15.013,
"eval_steps_per_second": 1.877,
"step": 700
},
{
"epoch": 0.09852216748768473,
"grad_norm": 0.0,
"learning_rate": 0.00010924499229583975,
"loss": 1.6233,
"step": 710
},
{
"epoch": 0.09990980364948311,
"grad_norm": 0.0,
"learning_rate": 0.00011078582434514639,
"loss": 1.5869,
"step": 720
},
{
"epoch": 0.10129743981128148,
"grad_norm": 0.0,
"learning_rate": 0.00011232665639445301,
"loss": 1.5084,
"step": 730
},
{
"epoch": 0.10268507597307985,
"grad_norm": 0.0,
"learning_rate": 0.00011386748844375963,
"loss": 1.448,
"step": 740
},
{
"epoch": 0.10407271213487823,
"grad_norm": 0.0,
"learning_rate": 0.00011540832049306627,
"loss": 1.5386,
"step": 750
},
{
"epoch": 0.10546034829667661,
"grad_norm": 0.0,
"learning_rate": 0.00011694915254237289,
"loss": 1.4706,
"step": 760
},
{
"epoch": 0.10684798445847499,
"grad_norm": 0.0,
"learning_rate": 0.0001184899845916795,
"loss": 1.6041,
"step": 770
},
{
"epoch": 0.10823562062027337,
"grad_norm": 0.0,
"learning_rate": 0.00012003081664098615,
"loss": 1.5857,
"step": 780
},
{
"epoch": 0.10962325678207174,
"grad_norm": 0.0,
"learning_rate": 0.00012157164869029276,
"loss": 1.5199,
"step": 790
},
{
"epoch": 0.11101089294387012,
"grad_norm": 0.0,
"learning_rate": 0.0001231124807395994,
"loss": 1.5045,
"step": 800
},
{
"epoch": 0.11101089294387012,
"eval_loss": 1.5553832054138184,
"eval_runtime": 852.1951,
"eval_samples_per_second": 15.034,
"eval_steps_per_second": 1.88,
"step": 800
},
{
"epoch": 0.1123985291056685,
"grad_norm": 0.0,
"learning_rate": 0.000124653312788906,
"loss": 1.5783,
"step": 810
},
{
"epoch": 0.11378616526746688,
"grad_norm": 0.0,
"learning_rate": 0.00012619414483821262,
"loss": 1.5467,
"step": 820
},
{
"epoch": 0.11517380142926524,
"grad_norm": 0.0,
"learning_rate": 0.00012773497688751926,
"loss": 1.5322,
"step": 830
},
{
"epoch": 0.11656143759106362,
"grad_norm": 0.0,
"learning_rate": 0.0001292758089368259,
"loss": 1.5253,
"step": 840
},
{
"epoch": 0.117949073752862,
"grad_norm": 0.0,
"learning_rate": 0.00013081664098613251,
"loss": 1.5313,
"step": 850
},
{
"epoch": 0.11933670991466037,
"grad_norm": 0.0,
"learning_rate": 0.00013235747303543915,
"loss": 1.5569,
"step": 860
},
{
"epoch": 0.12072434607645875,
"grad_norm": 0.0,
"learning_rate": 0.00013389830508474577,
"loss": 1.566,
"step": 870
},
{
"epoch": 0.12211198223825713,
"grad_norm": 0.0,
"learning_rate": 0.00013543913713405238,
"loss": 1.5446,
"step": 880
},
{
"epoch": 0.12349961840005551,
"grad_norm": 0.0,
"learning_rate": 0.00013697996918335902,
"loss": 1.5445,
"step": 890
},
{
"epoch": 0.12488725456185389,
"grad_norm": 0.0,
"learning_rate": 0.00013852080123266563,
"loss": 1.4359,
"step": 900
},
{
"epoch": 0.12488725456185389,
"eval_loss": 1.5553832054138184,
"eval_runtime": 855.2417,
"eval_samples_per_second": 14.981,
"eval_steps_per_second": 1.873,
"step": 900
},
{
"epoch": 0.12627489072365225,
"grad_norm": 0.0,
"learning_rate": 0.00014006163328197227,
"loss": 1.5106,
"step": 910
},
{
"epoch": 0.12766252688545063,
"grad_norm": 0.0,
"learning_rate": 0.0001416024653312789,
"loss": 1.4681,
"step": 920
},
{
"epoch": 0.129050163047249,
"grad_norm": 0.0,
"learning_rate": 0.00014314329738058552,
"loss": 1.4964,
"step": 930
},
{
"epoch": 0.13043779920904738,
"grad_norm": 0.0,
"learning_rate": 0.00014468412942989216,
"loss": 1.4468,
"step": 940
},
{
"epoch": 0.13182543537084576,
"grad_norm": 0.0,
"learning_rate": 0.00014622496147919878,
"loss": 1.4915,
"step": 950
},
{
"epoch": 0.13321307153264414,
"grad_norm": 0.0,
"learning_rate": 0.0001477657935285054,
"loss": 1.5481,
"step": 960
},
{
"epoch": 0.13460070769444252,
"grad_norm": 0.0,
"learning_rate": 0.00014930662557781203,
"loss": 1.5247,
"step": 970
},
{
"epoch": 0.1359883438562409,
"grad_norm": 0.0,
"learning_rate": 0.00015084745762711864,
"loss": 1.5824,
"step": 980
},
{
"epoch": 0.13737598001803927,
"grad_norm": 0.0,
"learning_rate": 0.00015238828967642525,
"loss": 1.5115,
"step": 990
},
{
"epoch": 0.13876361617983765,
"grad_norm": 0.0,
"learning_rate": 0.0001539291217257319,
"loss": 1.4536,
"step": 1000
},
{
"epoch": 0.13876361617983765,
"eval_loss": 1.5553832054138184,
"eval_runtime": 859.8289,
"eval_samples_per_second": 14.901,
"eval_steps_per_second": 1.863,
"step": 1000
},
{
"epoch": 0.14015125234163603,
"grad_norm": 0.0,
"learning_rate": 0.00015546995377503853,
"loss": 1.5573,
"step": 1010
},
{
"epoch": 0.1415388885034344,
"grad_norm": 0.0,
"learning_rate": 0.00015701078582434517,
"loss": 1.5338,
"step": 1020
},
{
"epoch": 0.14292652466523278,
"grad_norm": 0.0,
"learning_rate": 0.00015855161787365179,
"loss": 1.5633,
"step": 1030
},
{
"epoch": 0.14431416082703116,
"grad_norm": 0.0,
"learning_rate": 0.0001600924499229584,
"loss": 1.5012,
"step": 1040
},
{
"epoch": 0.14570179698882954,
"grad_norm": 0.0,
"learning_rate": 0.00016163328197226504,
"loss": 1.5334,
"step": 1050
},
{
"epoch": 0.14708943315062792,
"grad_norm": 0.0,
"learning_rate": 0.00016317411402157165,
"loss": 1.5824,
"step": 1060
},
{
"epoch": 0.14847706931242627,
"grad_norm": 0.0,
"learning_rate": 0.00016471494607087826,
"loss": 1.5572,
"step": 1070
},
{
"epoch": 0.14986470547422465,
"grad_norm": 0.0,
"learning_rate": 0.0001662557781201849,
"loss": 1.5407,
"step": 1080
},
{
"epoch": 0.15125234163602302,
"grad_norm": 0.0,
"learning_rate": 0.00016779661016949154,
"loss": 1.4918,
"step": 1090
},
{
"epoch": 0.1526399777978214,
"grad_norm": 0.0,
"learning_rate": 0.00016933744221879818,
"loss": 1.5496,
"step": 1100
},
{
"epoch": 0.1526399777978214,
"eval_loss": 1.5553832054138184,
"eval_runtime": 850.8719,
"eval_samples_per_second": 15.057,
"eval_steps_per_second": 1.883,
"step": 1100
},
{
"epoch": 0.15402761395961978,
"grad_norm": 0.0,
"learning_rate": 0.0001708782742681048,
"loss": 1.6318,
"step": 1110
},
{
"epoch": 0.15541525012141816,
"grad_norm": 0.0,
"learning_rate": 0.0001724191063174114,
"loss": 1.5595,
"step": 1120
},
{
"epoch": 0.15680288628321654,
"grad_norm": 0.0,
"learning_rate": 0.00017395993836671805,
"loss": 1.5493,
"step": 1130
},
{
"epoch": 0.15819052244501491,
"grad_norm": 0.0,
"learning_rate": 0.00017550077041602466,
"loss": 1.5708,
"step": 1140
},
{
"epoch": 0.1595781586068133,
"grad_norm": 0.0,
"learning_rate": 0.00017704160246533127,
"loss": 1.5479,
"step": 1150
},
{
"epoch": 0.16096579476861167,
"grad_norm": 0.0,
"learning_rate": 0.0001785824345146379,
"loss": 1.5459,
"step": 1160
},
{
"epoch": 0.16235343093041005,
"grad_norm": 0.0,
"learning_rate": 0.00018012326656394453,
"loss": 1.5629,
"step": 1170
},
{
"epoch": 0.16374106709220843,
"grad_norm": 0.0,
"learning_rate": 0.00018166409861325116,
"loss": 1.5617,
"step": 1180
},
{
"epoch": 0.1651287032540068,
"grad_norm": 0.0,
"learning_rate": 0.0001832049306625578,
"loss": 1.5623,
"step": 1190
},
{
"epoch": 0.16651633941580518,
"grad_norm": 0.0,
"learning_rate": 0.00018474576271186442,
"loss": 1.5399,
"step": 1200
},
{
"epoch": 0.16651633941580518,
"eval_loss": 1.5553832054138184,
"eval_runtime": 851.5599,
"eval_samples_per_second": 15.045,
"eval_steps_per_second": 1.881,
"step": 1200
},
{
"epoch": 0.16790397557760356,
"grad_norm": 0.0,
"learning_rate": 0.00018628659476117106,
"loss": 1.5636,
"step": 1210
},
{
"epoch": 0.16929161173940194,
"grad_norm": 0.0,
"learning_rate": 0.00018782742681047767,
"loss": 1.5902,
"step": 1220
},
{
"epoch": 0.17067924790120031,
"grad_norm": 0.0,
"learning_rate": 0.00018936825885978428,
"loss": 1.6026,
"step": 1230
},
{
"epoch": 0.1720668840629987,
"grad_norm": 0.0,
"learning_rate": 0.00019090909090909092,
"loss": 1.5568,
"step": 1240
},
{
"epoch": 0.17345452022479707,
"grad_norm": 0.0,
"learning_rate": 0.00019244992295839753,
"loss": 1.568,
"step": 1250
},
{
"epoch": 0.17484215638659542,
"grad_norm": 0.0,
"learning_rate": 0.00019399075500770417,
"loss": 1.5191,
"step": 1260
},
{
"epoch": 0.1762297925483938,
"grad_norm": 0.0,
"learning_rate": 0.00019553158705701081,
"loss": 1.5366,
"step": 1270
},
{
"epoch": 0.17761742871019218,
"grad_norm": 0.0,
"learning_rate": 0.00019707241910631743,
"loss": 1.553,
"step": 1280
},
{
"epoch": 0.17900506487199055,
"grad_norm": 0.0,
"learning_rate": 0.00019861325115562404,
"loss": 1.528,
"step": 1290
},
{
"epoch": 0.18039270103378893,
"grad_norm": 0.0,
"learning_rate": 0.00019999999880520304,
"loss": 1.5596,
"step": 1300
},
{
"epoch": 0.18039270103378893,
"eval_loss": 1.5553832054138184,
"eval_runtime": 846.8538,
"eval_samples_per_second": 15.129,
"eval_steps_per_second": 1.892,
"step": 1300
},
{
"epoch": 0.1817803371955873,
"grad_norm": 0.0,
"learning_rate": 0.00019999985542960055,
"loss": 1.5126,
"step": 1310
},
{
"epoch": 0.1831679733573857,
"grad_norm": 0.0,
"learning_rate": 0.0001999994730949955,
"loss": 1.5304,
"step": 1320
},
{
"epoch": 0.18455560951918407,
"grad_norm": 0.0,
"learning_rate": 0.0001999988518023016,
"loss": 1.523,
"step": 1330
},
{
"epoch": 0.18594324568098244,
"grad_norm": 0.0,
"learning_rate": 0.00019999799155300343,
"loss": 1.5223,
"step": 1340
},
{
"epoch": 0.18733088184278082,
"grad_norm": 0.0,
"learning_rate": 0.00019999689234915667,
"loss": 1.5163,
"step": 1350
},
{
"epoch": 0.1887185180045792,
"grad_norm": 0.0,
"learning_rate": 0.00019999555419338794,
"loss": 1.5422,
"step": 1360
},
{
"epoch": 0.19010615416637758,
"grad_norm": 0.0,
"learning_rate": 0.0001999939770888949,
"loss": 1.5268,
"step": 1370
},
{
"epoch": 0.19149379032817596,
"grad_norm": 0.0,
"learning_rate": 0.00019999216103944617,
"loss": 1.5667,
"step": 1380
},
{
"epoch": 0.19288142648997433,
"grad_norm": 0.0,
"learning_rate": 0.00019999010604938145,
"loss": 1.557,
"step": 1390
},
{
"epoch": 0.1942690626517727,
"grad_norm": 0.0,
"learning_rate": 0.00019998781212361122,
"loss": 1.52,
"step": 1400
},
{
"epoch": 0.1942690626517727,
"eval_loss": 1.5553832054138184,
"eval_runtime": 855.6828,
"eval_samples_per_second": 14.973,
"eval_steps_per_second": 1.872,
"step": 1400
},
{
"epoch": 0.1956566988135711,
"grad_norm": 0.0,
"learning_rate": 0.0001999852792676171,
"loss": 1.5547,
"step": 1410
},
{
"epoch": 0.19704433497536947,
"grad_norm": 0.0,
"learning_rate": 0.00019998250748745155,
"loss": 1.5801,
"step": 1420
},
{
"epoch": 0.19843197113716785,
"grad_norm": 0.0,
"learning_rate": 0.00019997949678973804,
"loss": 1.4726,
"step": 1430
},
{
"epoch": 0.19981960729896622,
"grad_norm": 0.0,
"learning_rate": 0.00019997624718167087,
"loss": 1.5853,
"step": 1440
},
{
"epoch": 0.2012072434607646,
"grad_norm": 0.0,
"learning_rate": 0.0001999727586710153,
"loss": 1.536,
"step": 1450
},
{
"epoch": 0.20259487962256295,
"grad_norm": 0.0,
"learning_rate": 0.00019996903126610747,
"loss": 1.5915,
"step": 1460
},
{
"epoch": 0.20398251578436133,
"grad_norm": 0.0,
"learning_rate": 0.00019996506497585434,
"loss": 1.5097,
"step": 1470
},
{
"epoch": 0.2053701519461597,
"grad_norm": 0.0,
"learning_rate": 0.00019996085980973375,
"loss": 1.5386,
"step": 1480
},
{
"epoch": 0.20675778810795808,
"grad_norm": 0.0,
"learning_rate": 0.0001999564157777943,
"loss": 1.5985,
"step": 1490
},
{
"epoch": 0.20814542426975646,
"grad_norm": 0.0,
"learning_rate": 0.00019995173289065544,
"loss": 1.58,
"step": 1500
},
{
"epoch": 0.20814542426975646,
"eval_loss": 1.5553832054138184,
"eval_runtime": 852.5849,
"eval_samples_per_second": 15.027,
"eval_steps_per_second": 1.879,
"step": 1500
},
{
"epoch": 0.20953306043155484,
"grad_norm": 0.0,
"learning_rate": 0.0001999468111595074,
"loss": 1.549,
"step": 1510
},
{
"epoch": 0.21092069659335322,
"grad_norm": 0.0,
"learning_rate": 0.00019994165059611106,
"loss": 1.6138,
"step": 1520
},
{
"epoch": 0.2123083327551516,
"grad_norm": 0.0,
"learning_rate": 0.0001999362512127981,
"loss": 1.5108,
"step": 1530
},
{
"epoch": 0.21369596891694997,
"grad_norm": 0.0,
"learning_rate": 0.00019993061302247086,
"loss": 1.5541,
"step": 1540
},
{
"epoch": 0.21508360507874835,
"grad_norm": 0.0,
"learning_rate": 0.00019992473603860228,
"loss": 1.5873,
"step": 1550
},
{
"epoch": 0.21647124124054673,
"grad_norm": 0.0,
"learning_rate": 0.00019991862027523603,
"loss": 1.5918,
"step": 1560
},
{
"epoch": 0.2178588774023451,
"grad_norm": 0.0,
"learning_rate": 0.00019991226574698624,
"loss": 1.5176,
"step": 1570
},
{
"epoch": 0.21924651356414349,
"grad_norm": 0.0,
"learning_rate": 0.0001999056724690377,
"loss": 1.5385,
"step": 1580
},
{
"epoch": 0.22063414972594186,
"grad_norm": 0.0,
"learning_rate": 0.0001998988404571456,
"loss": 1.565,
"step": 1590
},
{
"epoch": 0.22202178588774024,
"grad_norm": 0.0,
"learning_rate": 0.00019989176972763572,
"loss": 1.5304,
"step": 1600
},
{
"epoch": 0.22202178588774024,
"eval_loss": 1.5553832054138184,
"eval_runtime": 852.2663,
"eval_samples_per_second": 15.033,
"eval_steps_per_second": 1.88,
"step": 1600
},
{
"epoch": 0.22340942204953862,
"grad_norm": 0.0,
"learning_rate": 0.00019988446029740422,
"loss": 1.5396,
"step": 1610
},
{
"epoch": 0.224797058211337,
"grad_norm": 0.0,
"learning_rate": 0.00019987691218391763,
"loss": 1.5072,
"step": 1620
},
{
"epoch": 0.22618469437313538,
"grad_norm": 0.0,
"learning_rate": 0.00019986912540521294,
"loss": 1.4956,
"step": 1630
},
{
"epoch": 0.22757233053493375,
"grad_norm": 0.0,
"learning_rate": 0.00019986109997989732,
"loss": 1.5727,
"step": 1640
},
{
"epoch": 0.2289599666967321,
"grad_norm": 0.0,
"learning_rate": 0.00019985283592714834,
"loss": 1.6416,
"step": 1650
},
{
"epoch": 0.23034760285853048,
"grad_norm": 0.0,
"learning_rate": 0.00019984433326671364,
"loss": 1.5467,
"step": 1660
},
{
"epoch": 0.23173523902032886,
"grad_norm": 0.0,
"learning_rate": 0.00019983559201891123,
"loss": 1.5827,
"step": 1670
},
{
"epoch": 0.23312287518212724,
"grad_norm": 0.0,
"learning_rate": 0.00019982661220462907,
"loss": 1.5114,
"step": 1680
},
{
"epoch": 0.23451051134392562,
"grad_norm": 0.0,
"learning_rate": 0.00019981739384532525,
"loss": 1.52,
"step": 1690
},
{
"epoch": 0.235898147505724,
"grad_norm": 0.0,
"learning_rate": 0.0001998079369630279,
"loss": 1.572,
"step": 1700
},
{
"epoch": 0.235898147505724,
"eval_loss": 1.5553832054138184,
"eval_runtime": 850.3193,
"eval_samples_per_second": 15.067,
"eval_steps_per_second": 1.884,
"step": 1700
},
{
"epoch": 0.23728578366752237,
"grad_norm": 0.0,
"learning_rate": 0.00019979824158033522,
"loss": 1.5659,
"step": 1710
},
{
"epoch": 0.23867341982932075,
"grad_norm": 0.0,
"learning_rate": 0.00019978830772041513,
"loss": 1.5661,
"step": 1720
},
{
"epoch": 0.24006105599111913,
"grad_norm": 0.0,
"learning_rate": 0.00019977813540700555,
"loss": 1.4954,
"step": 1730
},
{
"epoch": 0.2414486921529175,
"grad_norm": 0.0,
"learning_rate": 0.00019976772466441417,
"loss": 1.5238,
"step": 1740
},
{
"epoch": 0.24283632831471588,
"grad_norm": 0.0,
"learning_rate": 0.00019975707551751846,
"loss": 1.4822,
"step": 1750
},
{
"epoch": 0.24422396447651426,
"grad_norm": 0.0,
"learning_rate": 0.00019974618799176551,
"loss": 1.5299,
"step": 1760
},
{
"epoch": 0.24561160063831264,
"grad_norm": 0.0,
"learning_rate": 0.00019973506211317212,
"loss": 1.5403,
"step": 1770
},
{
"epoch": 0.24699923680011102,
"grad_norm": 0.0,
"learning_rate": 0.0001997236979083246,
"loss": 1.606,
"step": 1780
},
{
"epoch": 0.2483868729619094,
"grad_norm": 0.0,
"learning_rate": 0.00019971209540437873,
"loss": 1.5796,
"step": 1790
},
{
"epoch": 0.24977450912370777,
"grad_norm": 0.0,
"learning_rate": 0.0001997002546290599,
"loss": 1.5143,
"step": 1800
},
{
"epoch": 0.24977450912370777,
"eval_loss": 1.5553832054138184,
"eval_runtime": 848.5406,
"eval_samples_per_second": 15.099,
"eval_steps_per_second": 1.888,
"step": 1800
},
{
"epoch": 0.2511621452855061,
"grad_norm": 0.0,
"learning_rate": 0.00019968817561066262,
"loss": 1.431,
"step": 1810
},
{
"epoch": 0.2525497814473045,
"grad_norm": 0.0,
"learning_rate": 0.00019967585837805092,
"loss": 1.5829,
"step": 1820
},
{
"epoch": 0.2539374176091029,
"grad_norm": 0.0,
"learning_rate": 0.0001996633029606579,
"loss": 1.6282,
"step": 1830
},
{
"epoch": 0.25532505377090126,
"grad_norm": 0.0,
"learning_rate": 0.00019965050938848598,
"loss": 1.5562,
"step": 1840
},
{
"epoch": 0.25671268993269963,
"grad_norm": 0.0,
"learning_rate": 0.00019963747769210661,
"loss": 1.6189,
"step": 1850
},
{
"epoch": 0.258100326094498,
"grad_norm": 0.0,
"learning_rate": 0.00019962420790266015,
"loss": 1.5318,
"step": 1860
},
{
"epoch": 0.2594879622562964,
"grad_norm": 0.0,
"learning_rate": 0.00019961070005185608,
"loss": 1.5493,
"step": 1870
},
{
"epoch": 0.26087559841809477,
"grad_norm": 0.0,
"learning_rate": 0.00019959695417197263,
"loss": 1.52,
"step": 1880
},
{
"epoch": 0.26226323457989315,
"grad_norm": 0.0,
"learning_rate": 0.00019958297029585691,
"loss": 1.54,
"step": 1890
},
{
"epoch": 0.2636508707416915,
"grad_norm": 0.0,
"learning_rate": 0.00019956874845692465,
"loss": 1.4546,
"step": 1900
},
{
"epoch": 0.2636508707416915,
"eval_loss": 1.5553832054138184,
"eval_runtime": 850.5993,
"eval_samples_per_second": 15.062,
"eval_steps_per_second": 1.883,
"step": 1900
},
{
"epoch": 0.2650385069034899,
"grad_norm": 0.0,
"learning_rate": 0.00019955428868916029,
"loss": 1.5184,
"step": 1910
},
{
"epoch": 0.2664261430652883,
"grad_norm": 0.0,
"learning_rate": 0.0001995395910271168,
"loss": 1.5142,
"step": 1920
},
{
"epoch": 0.26781377922708666,
"grad_norm": 0.0,
"learning_rate": 0.0001995246555059156,
"loss": 1.513,
"step": 1930
},
{
"epoch": 0.26920141538888503,
"grad_norm": 0.0,
"learning_rate": 0.00019950948216124656,
"loss": 1.5488,
"step": 1940
},
{
"epoch": 0.2705890515506834,
"grad_norm": 0.0,
"learning_rate": 0.00019949407102936776,
"loss": 1.584,
"step": 1950
},
{
"epoch": 0.2719766877124818,
"grad_norm": 0.0,
"learning_rate": 0.0001994784221471055,
"loss": 1.5061,
"step": 1960
},
{
"epoch": 0.27336432387428017,
"grad_norm": 0.0,
"learning_rate": 0.00019946253555185435,
"loss": 1.6085,
"step": 1970
},
{
"epoch": 0.27475196003607855,
"grad_norm": 0.0,
"learning_rate": 0.00019944641128157674,
"loss": 1.6,
"step": 1980
},
{
"epoch": 0.2761395961978769,
"grad_norm": 0.0,
"learning_rate": 0.00019943004937480314,
"loss": 1.5589,
"step": 1990
},
{
"epoch": 0.2775272323596753,
"grad_norm": 0.0,
"learning_rate": 0.00019941344987063186,
"loss": 1.5565,
"step": 2000
},
{
"epoch": 0.2775272323596753,
"eval_loss": 1.5553832054138184,
"eval_runtime": 858.9389,
"eval_samples_per_second": 14.916,
"eval_steps_per_second": 1.865,
"step": 2000
},
{
"epoch": 0.2789148685214737,
"grad_norm": 0.0,
"learning_rate": 0.00019939661280872896,
"loss": 1.5592,
"step": 2010
},
{
"epoch": 0.28030250468327206,
"grad_norm": 0.0,
"learning_rate": 0.0001993795382293282,
"loss": 1.5324,
"step": 2020
},
{
"epoch": 0.28169014084507044,
"grad_norm": 0.0,
"learning_rate": 0.00019936222617323085,
"loss": 1.512,
"step": 2030
},
{
"epoch": 0.2830777770068688,
"grad_norm": 0.0,
"learning_rate": 0.00019934467668180573,
"loss": 1.5857,
"step": 2040
},
{
"epoch": 0.2844654131686672,
"grad_norm": 0.0,
"learning_rate": 0.00019932688979698893,
"loss": 1.5779,
"step": 2050
},
{
"epoch": 0.28585304933046557,
"grad_norm": 0.0,
"learning_rate": 0.0001993088655612839,
"loss": 1.5252,
"step": 2060
},
{
"epoch": 0.28724068549226395,
"grad_norm": 0.0,
"learning_rate": 0.00019929060401776126,
"loss": 1.522,
"step": 2070
},
{
"epoch": 0.2886283216540623,
"grad_norm": 0.0,
"learning_rate": 0.00019927210521005861,
"loss": 1.5391,
"step": 2080
},
{
"epoch": 0.2900159578158607,
"grad_norm": 0.0,
"learning_rate": 0.00019925336918238067,
"loss": 1.5029,
"step": 2090
},
{
"epoch": 0.2914035939776591,
"grad_norm": 0.0,
"learning_rate": 0.00019923439597949888,
"loss": 1.5913,
"step": 2100
},
{
"epoch": 0.2914035939776591,
"eval_loss": 1.5553832054138184,
"eval_runtime": 846.883,
"eval_samples_per_second": 15.128,
"eval_steps_per_second": 1.892,
"step": 2100
},
{
"epoch": 0.29279123013945746,
"grad_norm": 0.0,
"learning_rate": 0.00019921518564675145,
"loss": 1.5026,
"step": 2110
},
{
"epoch": 0.29417886630125584,
"grad_norm": 0.0,
"learning_rate": 0.00019919573823004333,
"loss": 1.6166,
"step": 2120
},
{
"epoch": 0.2955665024630542,
"grad_norm": 0.0,
"learning_rate": 0.0001991760537758459,
"loss": 1.5524,
"step": 2130
},
{
"epoch": 0.29695413862485254,
"grad_norm": 0.0,
"learning_rate": 0.00019915613233119705,
"loss": 1.5988,
"step": 2140
},
{
"epoch": 0.2983417747866509,
"grad_norm": 0.0,
"learning_rate": 0.00019913597394370086,
"loss": 1.5131,
"step": 2150
},
{
"epoch": 0.2997294109484493,
"grad_norm": 0.0,
"learning_rate": 0.00019911557866152775,
"loss": 1.573,
"step": 2160
},
{
"epoch": 0.30111704711024767,
"grad_norm": 0.0,
"learning_rate": 0.00019909494653341412,
"loss": 1.526,
"step": 2170
},
{
"epoch": 0.30250468327204605,
"grad_norm": 0.0,
"learning_rate": 0.00019907407760866237,
"loss": 1.6097,
"step": 2180
},
{
"epoch": 0.3038923194338444,
"grad_norm": 0.0,
"learning_rate": 0.00019905297193714073,
"loss": 1.5376,
"step": 2190
},
{
"epoch": 0.3052799555956428,
"grad_norm": 0.0,
"learning_rate": 0.00019903162956928322,
"loss": 1.6057,
"step": 2200
},
{
"epoch": 0.3052799555956428,
"eval_loss": 1.5553832054138184,
"eval_runtime": 864.1262,
"eval_samples_per_second": 14.827,
"eval_steps_per_second": 1.854,
"step": 2200
},
{
"epoch": 0.3066675917574412,
"grad_norm": 0.0,
"learning_rate": 0.0001990100505560894,
"loss": 1.5049,
"step": 2210
},
{
"epoch": 0.30805522791923956,
"grad_norm": 0.0,
"learning_rate": 0.00019898823494912432,
"loss": 1.5858,
"step": 2220
},
{
"epoch": 0.30944286408103794,
"grad_norm": 0.0,
"learning_rate": 0.00019896618280051845,
"loss": 1.5559,
"step": 2230
},
{
"epoch": 0.3108305002428363,
"grad_norm": 0.0,
"learning_rate": 0.00019894389416296742,
"loss": 1.5449,
"step": 2240
},
{
"epoch": 0.3122181364046347,
"grad_norm": 0.0,
"learning_rate": 0.00019892136908973205,
"loss": 1.5155,
"step": 2250
},
{
"epoch": 0.31360577256643307,
"grad_norm": 0.0,
"learning_rate": 0.0001988986076346381,
"loss": 1.5085,
"step": 2260
},
{
"epoch": 0.31499340872823145,
"grad_norm": 0.0,
"learning_rate": 0.00019887560985207614,
"loss": 1.5528,
"step": 2270
},
{
"epoch": 0.31638104489002983,
"grad_norm": 0.0,
"learning_rate": 0.0001988523757970016,
"loss": 1.5497,
"step": 2280
},
{
"epoch": 0.3177686810518282,
"grad_norm": 0.0,
"learning_rate": 0.00019882890552493437,
"loss": 1.5805,
"step": 2290
},
{
"epoch": 0.3191563172136266,
"grad_norm": 0.0,
"learning_rate": 0.00019880519909195893,
"loss": 1.547,
"step": 2300
},
{
"epoch": 0.3191563172136266,
"eval_loss": 1.5553832054138184,
"eval_runtime": 850.4335,
"eval_samples_per_second": 15.065,
"eval_steps_per_second": 1.884,
"step": 2300
},
{
"epoch": 0.32054395337542496,
"grad_norm": 0.0,
"learning_rate": 0.00019878125655472395,
"loss": 1.5136,
"step": 2310
},
{
"epoch": 0.32193158953722334,
"grad_norm": 0.0,
"learning_rate": 0.00019875707797044237,
"loss": 1.4932,
"step": 2320
},
{
"epoch": 0.3233192256990217,
"grad_norm": 0.0,
"learning_rate": 0.0001987326633968912,
"loss": 1.5067,
"step": 2330
},
{
"epoch": 0.3247068618608201,
"grad_norm": 0.0,
"learning_rate": 0.0001987080128924113,
"loss": 1.5771,
"step": 2340
},
{
"epoch": 0.3260944980226185,
"grad_norm": 0.0,
"learning_rate": 0.00019868312651590743,
"loss": 1.5148,
"step": 2350
},
{
"epoch": 0.32748213418441685,
"grad_norm": 0.0,
"learning_rate": 0.00019865800432684787,
"loss": 1.5113,
"step": 2360
},
{
"epoch": 0.32886977034621523,
"grad_norm": 0.0,
"learning_rate": 0.00019863264638526445,
"loss": 1.6249,
"step": 2370
},
{
"epoch": 0.3302574065080136,
"grad_norm": 0.0,
"learning_rate": 0.0001986070527517523,
"loss": 1.51,
"step": 2380
},
{
"epoch": 0.331645042669812,
"grad_norm": 0.0,
"learning_rate": 0.00019858122348746985,
"loss": 1.5531,
"step": 2390
},
{
"epoch": 0.33303267883161036,
"grad_norm": 0.0,
"learning_rate": 0.00019855515865413855,
"loss": 1.4905,
"step": 2400
},
{
"epoch": 0.33303267883161036,
"eval_loss": 1.5553832054138184,
"eval_runtime": 855.2173,
"eval_samples_per_second": 14.981,
"eval_steps_per_second": 1.873,
"step": 2400
},
{
"epoch": 0.33442031499340874,
"grad_norm": 0.0,
"learning_rate": 0.00019852885831404268,
"loss": 1.5991,
"step": 2410
},
{
"epoch": 0.3358079511552071,
"grad_norm": 0.0,
"learning_rate": 0.00019850232253002946,
"loss": 1.5615,
"step": 2420
},
{
"epoch": 0.3371955873170055,
"grad_norm": 0.0,
"learning_rate": 0.00019847555136550852,
"loss": 1.5065,
"step": 2430
},
{
"epoch": 0.3385832234788039,
"grad_norm": 0.0,
"learning_rate": 0.00019844854488445214,
"loss": 1.538,
"step": 2440
},
{
"epoch": 0.33997085964060225,
"grad_norm": 0.0,
"learning_rate": 0.00019842130315139483,
"loss": 1.5472,
"step": 2450
},
{
"epoch": 0.34135849580240063,
"grad_norm": 0.0,
"learning_rate": 0.00019839382623143323,
"loss": 1.6322,
"step": 2460
},
{
"epoch": 0.342746131964199,
"grad_norm": 0.0,
"learning_rate": 0.00019836611419022605,
"loss": 1.5959,
"step": 2470
},
{
"epoch": 0.3441337681259974,
"grad_norm": 0.0,
"learning_rate": 0.00019833816709399372,
"loss": 1.6385,
"step": 2480
},
{
"epoch": 0.34552140428779576,
"grad_norm": 0.0,
"learning_rate": 0.0001983099850095185,
"loss": 1.5804,
"step": 2490
},
{
"epoch": 0.34690904044959414,
"grad_norm": 0.0,
"learning_rate": 0.0001982815680041441,
"loss": 1.6041,
"step": 2500
},
{
"epoch": 0.34690904044959414,
"eval_loss": 1.5553832054138184,
"eval_runtime": 858.6524,
"eval_samples_per_second": 14.921,
"eval_steps_per_second": 1.866,
"step": 2500
},
{
"epoch": 0.3482966766113925,
"grad_norm": 0.0,
"learning_rate": 0.00019825291614577561,
"loss": 1.602,
"step": 2510
},
{
"epoch": 0.34968431277319084,
"grad_norm": 0.0,
"learning_rate": 0.00019822402950287935,
"loss": 1.5554,
"step": 2520
},
{
"epoch": 0.3510719489349892,
"grad_norm": 0.0,
"learning_rate": 0.0001981949081444826,
"loss": 1.5689,
"step": 2530
},
{
"epoch": 0.3524595850967876,
"grad_norm": 0.0,
"learning_rate": 0.00019816555214017363,
"loss": 1.5012,
"step": 2540
},
{
"epoch": 0.353847221258586,
"grad_norm": 0.0,
"learning_rate": 0.0001981359615601013,
"loss": 1.5127,
"step": 2550
},
{
"epoch": 0.35523485742038435,
"grad_norm": 0.0,
"learning_rate": 0.00019810613647497512,
"loss": 1.5374,
"step": 2560
},
{
"epoch": 0.35662249358218273,
"grad_norm": 0.0,
"learning_rate": 0.00019807607695606486,
"loss": 1.4944,
"step": 2570
},
{
"epoch": 0.3580101297439811,
"grad_norm": 0.0,
"learning_rate": 0.0001980457830752006,
"loss": 1.5708,
"step": 2580
},
{
"epoch": 0.3593977659057795,
"grad_norm": 0.0,
"learning_rate": 0.00019801525490477237,
"loss": 1.5308,
"step": 2590
},
{
"epoch": 0.36078540206757787,
"grad_norm": 0.0,
"learning_rate": 0.00019798449251773014,
"loss": 1.6061,
"step": 2600
},
{
"epoch": 0.36078540206757787,
"eval_loss": 1.5553832054138184,
"eval_runtime": 855.3026,
"eval_samples_per_second": 14.979,
"eval_steps_per_second": 1.873,
"step": 2600
},
{
"epoch": 0.36217303822937624,
"grad_norm": 0.0,
"learning_rate": 0.00019795349598758345,
"loss": 1.4946,
"step": 2610
},
{
"epoch": 0.3635606743911746,
"grad_norm": 0.0,
"learning_rate": 0.00019792226538840142,
"loss": 1.5214,
"step": 2620
},
{
"epoch": 0.364948310552973,
"grad_norm": 0.0,
"learning_rate": 0.00019789080079481245,
"loss": 1.5454,
"step": 2630
},
{
"epoch": 0.3663359467147714,
"grad_norm": 0.0,
"learning_rate": 0.00019785910228200423,
"loss": 1.5109,
"step": 2640
},
{
"epoch": 0.36772358287656975,
"grad_norm": 0.0,
"learning_rate": 0.00019782716992572323,
"loss": 1.5324,
"step": 2650
},
{
"epoch": 0.36911121903836813,
"grad_norm": 0.0,
"learning_rate": 0.00019779500380227486,
"loss": 1.5535,
"step": 2660
},
{
"epoch": 0.3704988552001665,
"grad_norm": 0.0,
"learning_rate": 0.00019776260398852302,
"loss": 1.4757,
"step": 2670
},
{
"epoch": 0.3718864913619649,
"grad_norm": 0.0,
"learning_rate": 0.0001977299705618901,
"loss": 1.5326,
"step": 2680
},
{
"epoch": 0.37327412752376327,
"grad_norm": 0.0,
"learning_rate": 0.00019769710360035677,
"loss": 1.5597,
"step": 2690
},
{
"epoch": 0.37466176368556164,
"grad_norm": 0.0,
"learning_rate": 0.0001976640031824617,
"loss": 1.4312,
"step": 2700
},
{
"epoch": 0.37466176368556164,
"eval_loss": 1.5553832054138184,
"eval_runtime": 852.4079,
"eval_samples_per_second": 15.03,
"eval_steps_per_second": 1.879,
"step": 2700
},
{
"epoch": 0.37604939984736,
"grad_norm": 0.0,
"learning_rate": 0.00019763066938730138,
"loss": 1.4829,
"step": 2710
},
{
"epoch": 0.3774370360091584,
"grad_norm": 0.0,
"learning_rate": 0.0001975971022945301,
"loss": 1.6724,
"step": 2720
},
{
"epoch": 0.3788246721709568,
"grad_norm": 0.0,
"learning_rate": 0.0001975633019843595,
"loss": 1.5867,
"step": 2730
},
{
"epoch": 0.38021230833275516,
"grad_norm": 0.0,
"learning_rate": 0.00019752926853755864,
"loss": 1.592,
"step": 2740
},
{
"epoch": 0.38159994449455353,
"grad_norm": 0.0,
"learning_rate": 0.00019749500203545357,
"loss": 1.5938,
"step": 2750
},
{
"epoch": 0.3829875806563519,
"grad_norm": 0.0,
"learning_rate": 0.00019746050255992735,
"loss": 1.56,
"step": 2760
},
{
"epoch": 0.3843752168181503,
"grad_norm": 0.0,
"learning_rate": 0.00019742577019341966,
"loss": 1.5429,
"step": 2770
},
{
"epoch": 0.38576285297994867,
"grad_norm": 0.0,
"learning_rate": 0.00019739080501892674,
"loss": 1.5171,
"step": 2780
},
{
"epoch": 0.38715048914174705,
"grad_norm": 0.0,
"learning_rate": 0.00019735560712000116,
"loss": 1.6044,
"step": 2790
},
{
"epoch": 0.3885381253035454,
"grad_norm": 0.0,
"learning_rate": 0.0001973201765807516,
"loss": 1.5709,
"step": 2800
},
{
"epoch": 0.3885381253035454,
"eval_loss": 1.5553832054138184,
"eval_runtime": 850.963,
"eval_samples_per_second": 15.056,
"eval_steps_per_second": 1.883,
"step": 2800
},
{
"epoch": 0.3899257614653438,
"grad_norm": 0.0,
"learning_rate": 0.00019728451348584262,
"loss": 1.5263,
"step": 2810
},
{
"epoch": 0.3913133976271422,
"grad_norm": 0.0,
"learning_rate": 0.00019724861792049455,
"loss": 1.5946,
"step": 2820
},
{
"epoch": 0.39270103378894056,
"grad_norm": 0.0,
"learning_rate": 0.00019721248997048315,
"loss": 1.5997,
"step": 2830
},
{
"epoch": 0.39408866995073893,
"grad_norm": 0.0,
"learning_rate": 0.0001971761297221396,
"loss": 1.5798,
"step": 2840
},
{
"epoch": 0.3954763061125373,
"grad_norm": 0.0,
"learning_rate": 0.00019713953726235004,
"loss": 1.5016,
"step": 2850
},
{
"epoch": 0.3968639422743357,
"grad_norm": 0.0,
"learning_rate": 0.0001971027126785556,
"loss": 1.6087,
"step": 2860
},
{
"epoch": 0.39825157843613407,
"grad_norm": 0.0,
"learning_rate": 0.0001970656560587521,
"loss": 1.5384,
"step": 2870
},
{
"epoch": 0.39963921459793245,
"grad_norm": 0.0,
"learning_rate": 0.00019702836749148977,
"loss": 1.5269,
"step": 2880
},
{
"epoch": 0.4010268507597308,
"grad_norm": 0.0,
"learning_rate": 0.0001969908470658731,
"loss": 1.5338,
"step": 2890
},
{
"epoch": 0.4024144869215292,
"grad_norm": 0.0,
"learning_rate": 0.0001969530948715607,
"loss": 1.6045,
"step": 2900
},
{
"epoch": 0.4024144869215292,
"eval_loss": 1.5553832054138184,
"eval_runtime": 848.916,
"eval_samples_per_second": 15.092,
"eval_steps_per_second": 1.887,
"step": 2900
},
{
"epoch": 0.4038021230833275,
"grad_norm": 0.0,
"learning_rate": 0.00019691511099876493,
"loss": 1.5226,
"step": 2910
},
{
"epoch": 0.4051897592451259,
"grad_norm": 0.0,
"learning_rate": 0.0001968768955382519,
"loss": 1.5307,
"step": 2920
},
{
"epoch": 0.4065773954069243,
"grad_norm": 0.0,
"learning_rate": 0.00019683844858134087,
"loss": 1.5173,
"step": 2930
},
{
"epoch": 0.40796503156872266,
"grad_norm": 0.0,
"learning_rate": 0.0001967997702199045,
"loss": 1.5411,
"step": 2940
},
{
"epoch": 0.40935266773052104,
"grad_norm": 0.0,
"learning_rate": 0.00019676086054636844,
"loss": 1.5955,
"step": 2950
},
{
"epoch": 0.4107403038923194,
"grad_norm": 0.0,
"learning_rate": 0.00019672171965371088,
"loss": 1.5243,
"step": 2960
},
{
"epoch": 0.4121279400541178,
"grad_norm": 0.0,
"learning_rate": 0.0001966823476354627,
"loss": 1.6218,
"step": 2970
},
{
"epoch": 0.41351557621591617,
"grad_norm": 0.0,
"learning_rate": 0.000196642744585707,
"loss": 1.4745,
"step": 2980
},
{
"epoch": 0.41490321237771455,
"grad_norm": 0.0,
"learning_rate": 0.00019660291059907893,
"loss": 1.4971,
"step": 2990
},
{
"epoch": 0.4162908485395129,
"grad_norm": 0.0,
"learning_rate": 0.0001965628457707656,
"loss": 1.4997,
"step": 3000
},
{
"epoch": 0.4162908485395129,
"eval_loss": 1.5553832054138184,
"eval_runtime": 854.1371,
"eval_samples_per_second": 15.0,
"eval_steps_per_second": 1.876,
"step": 3000
},
{
"epoch": 0.4176784847013113,
"grad_norm": 0.0,
"learning_rate": 0.00019652255019650565,
"loss": 1.5693,
"step": 3010
},
{
"epoch": 0.4190661208631097,
"grad_norm": 0.0,
"learning_rate": 0.00019648202397258904,
"loss": 1.5664,
"step": 3020
},
{
"epoch": 0.42045375702490806,
"grad_norm": 0.0,
"learning_rate": 0.00019644126719585705,
"loss": 1.516,
"step": 3030
},
{
"epoch": 0.42184139318670644,
"grad_norm": 0.0,
"learning_rate": 0.0001964002799637018,
"loss": 1.5675,
"step": 3040
},
{
"epoch": 0.4232290293485048,
"grad_norm": 0.0,
"learning_rate": 0.0001963590623740661,
"loss": 1.4753,
"step": 3050
},
{
"epoch": 0.4246166655103032,
"grad_norm": 0.0,
"learning_rate": 0.00019631761452544323,
"loss": 1.6459,
"step": 3060
},
{
"epoch": 0.42600430167210157,
"grad_norm": 0.0,
"learning_rate": 0.00019627593651687668,
"loss": 1.5182,
"step": 3070
},
{
"epoch": 0.42739193783389995,
"grad_norm": 0.0,
"learning_rate": 0.00019623402844795996,
"loss": 1.5366,
"step": 3080
},
{
"epoch": 0.4287795739956983,
"grad_norm": 0.0,
"learning_rate": 0.00019619189041883637,
"loss": 1.5423,
"step": 3090
},
{
"epoch": 0.4301672101574967,
"grad_norm": 0.0,
"learning_rate": 0.00019614952253019862,
"loss": 1.6168,
"step": 3100
},
{
"epoch": 0.4301672101574967,
"eval_loss": 1.5553832054138184,
"eval_runtime": 862.9987,
"eval_samples_per_second": 14.846,
"eval_steps_per_second": 1.856,
"step": 3100
},
{
"epoch": 0.4315548463192951,
"grad_norm": 0.0,
"learning_rate": 0.00019610692488328873,
"loss": 1.6113,
"step": 3110
},
{
"epoch": 0.43294248248109346,
"grad_norm": 0.0,
"learning_rate": 0.00019606409757989777,
"loss": 1.5321,
"step": 3120
},
{
"epoch": 0.43433011864289184,
"grad_norm": 0.0,
"learning_rate": 0.0001960210407223656,
"loss": 1.5487,
"step": 3130
},
{
"epoch": 0.4357177548046902,
"grad_norm": 0.0,
"learning_rate": 0.0001959777544135806,
"loss": 1.4904,
"step": 3140
},
{
"epoch": 0.4371053909664886,
"grad_norm": 0.0,
"learning_rate": 0.00019593423875697945,
"loss": 1.5505,
"step": 3150
},
{
"epoch": 0.43849302712828697,
"grad_norm": 0.0,
"learning_rate": 0.00019589049385654685,
"loss": 1.5197,
"step": 3160
},
{
"epoch": 0.43988066329008535,
"grad_norm": 0.0,
"learning_rate": 0.0001958465198168154,
"loss": 1.5095,
"step": 3170
},
{
"epoch": 0.4412682994518837,
"grad_norm": 0.0,
"learning_rate": 0.0001958023167428651,
"loss": 1.4931,
"step": 3180
},
{
"epoch": 0.4426559356136821,
"grad_norm": 0.0,
"learning_rate": 0.00019575788474032336,
"loss": 1.5362,
"step": 3190
},
{
"epoch": 0.4440435717754805,
"grad_norm": 0.0,
"learning_rate": 0.00019571322391536463,
"loss": 1.5893,
"step": 3200
},
{
"epoch": 0.4440435717754805,
"eval_loss": 1.5553832054138184,
"eval_runtime": 856.8213,
"eval_samples_per_second": 14.953,
"eval_steps_per_second": 1.87,
"step": 3200
},
{
"epoch": 0.44543120793727886,
"grad_norm": 0.0,
"learning_rate": 0.0001956683343747101,
"loss": 1.5172,
"step": 3210
},
{
"epoch": 0.44681884409907724,
"grad_norm": 0.0,
"learning_rate": 0.00019562321622562754,
"loss": 1.6077,
"step": 3220
},
{
"epoch": 0.4482064802608756,
"grad_norm": 0.0,
"learning_rate": 0.00019557786957593093,
"loss": 1.5728,
"step": 3230
},
{
"epoch": 0.449594116422674,
"grad_norm": 0.0,
"learning_rate": 0.0001955322945339804,
"loss": 1.5058,
"step": 3240
},
{
"epoch": 0.4509817525844724,
"grad_norm": 0.0,
"learning_rate": 0.00019548649120868175,
"loss": 1.551,
"step": 3250
},
{
"epoch": 0.45236938874627075,
"grad_norm": 0.0,
"learning_rate": 0.00019544045970948628,
"loss": 1.5116,
"step": 3260
},
{
"epoch": 0.45375702490806913,
"grad_norm": 0.0,
"learning_rate": 0.00019539420014639058,
"loss": 1.4633,
"step": 3270
},
{
"epoch": 0.4551446610698675,
"grad_norm": 0.0,
"learning_rate": 0.00019534771262993622,
"loss": 1.5815,
"step": 3280
},
{
"epoch": 0.4565322972316659,
"grad_norm": 0.0,
"learning_rate": 0.00019530099727120944,
"loss": 1.5479,
"step": 3290
},
{
"epoch": 0.4579199333934642,
"grad_norm": 0.0,
"learning_rate": 0.00019525405418184094,
"loss": 1.5511,
"step": 3300
},
{
"epoch": 0.4579199333934642,
"eval_loss": 1.5553832054138184,
"eval_runtime": 854.7008,
"eval_samples_per_second": 14.99,
"eval_steps_per_second": 1.874,
"step": 3300
},
{
"epoch": 0.4593075695552626,
"grad_norm": 0.0,
"learning_rate": 0.00019520688347400564,
"loss": 1.5653,
"step": 3310
},
{
"epoch": 0.46069520571706096,
"grad_norm": 0.0,
"learning_rate": 0.00019515948526042237,
"loss": 1.5735,
"step": 3320
},
{
"epoch": 0.46208284187885934,
"grad_norm": 0.0,
"learning_rate": 0.00019511185965435363,
"loss": 1.4936,
"step": 3330
},
{
"epoch": 0.4634704780406577,
"grad_norm": 0.0,
"learning_rate": 0.00019506400676960514,
"loss": 1.5324,
"step": 3340
},
{
"epoch": 0.4648581142024561,
"grad_norm": 0.0,
"learning_rate": 0.00019501592672052596,
"loss": 1.5694,
"step": 3350
},
{
"epoch": 0.4662457503642545,
"grad_norm": 0.0,
"learning_rate": 0.00019496761962200777,
"loss": 1.5417,
"step": 3360
},
{
"epoch": 0.46763338652605285,
"grad_norm": 0.0,
"learning_rate": 0.00019491908558948498,
"loss": 1.5219,
"step": 3370
},
{
"epoch": 0.46902102268785123,
"grad_norm": 0.0,
"learning_rate": 0.00019487032473893413,
"loss": 1.5854,
"step": 3380
},
{
"epoch": 0.4704086588496496,
"grad_norm": 0.0,
"learning_rate": 0.00019482133718687388,
"loss": 1.5353,
"step": 3390
},
{
"epoch": 0.471796295011448,
"grad_norm": 0.0,
"learning_rate": 0.0001947721230503645,
"loss": 1.5644,
"step": 3400
},
{
"epoch": 0.471796295011448,
"eval_loss": 1.5553832054138184,
"eval_runtime": 849.4372,
"eval_samples_per_second": 15.083,
"eval_steps_per_second": 1.886,
"step": 3400
},
{
"epoch": 0.47318393117324636,
"grad_norm": 0.0,
"learning_rate": 0.00019472268244700788,
"loss": 1.5277,
"step": 3410
},
{
"epoch": 0.47457156733504474,
"grad_norm": 0.0,
"learning_rate": 0.00019467301549494685,
"loss": 1.4834,
"step": 3420
},
{
"epoch": 0.4759592034968431,
"grad_norm": 0.0,
"learning_rate": 0.0001946231223128653,
"loss": 1.5184,
"step": 3430
},
{
"epoch": 0.4773468396586415,
"grad_norm": 0.0,
"learning_rate": 0.00019457300301998763,
"loss": 1.557,
"step": 3440
},
{
"epoch": 0.4787344758204399,
"grad_norm": 0.0,
"learning_rate": 0.00019452265773607855,
"loss": 1.5981,
"step": 3450
},
{
"epoch": 0.48012211198223825,
"grad_norm": 0.0,
"learning_rate": 0.0001944720865814429,
"loss": 1.6308,
"step": 3460
},
{
"epoch": 0.48150974814403663,
"grad_norm": 0.0,
"learning_rate": 0.0001944212896769251,
"loss": 1.5711,
"step": 3470
},
{
"epoch": 0.482897384305835,
"grad_norm": 0.0,
"learning_rate": 0.00019437026714390915,
"loss": 1.5795,
"step": 3480
},
{
"epoch": 0.4842850204676334,
"grad_norm": 0.0,
"learning_rate": 0.00019431901910431812,
"loss": 1.4727,
"step": 3490
},
{
"epoch": 0.48567265662943176,
"grad_norm": 0.0,
"learning_rate": 0.00019426754568061406,
"loss": 1.5256,
"step": 3500
},
{
"epoch": 0.48567265662943176,
"eval_loss": 1.5553832054138184,
"eval_runtime": 856.6448,
"eval_samples_per_second": 14.956,
"eval_steps_per_second": 1.87,
"step": 3500
},
{
"epoch": 0.48706029279123014,
"grad_norm": 0.0,
"learning_rate": 0.00019421584699579747,
"loss": 1.5841,
"step": 3510
},
{
"epoch": 0.4884479289530285,
"grad_norm": 0.0,
"learning_rate": 0.0001941639231734072,
"loss": 1.5792,
"step": 3520
},
{
"epoch": 0.4898355651148269,
"grad_norm": 0.0,
"learning_rate": 0.0001941117743375201,
"loss": 1.5351,
"step": 3530
},
{
"epoch": 0.4912232012766253,
"grad_norm": 0.0,
"learning_rate": 0.00019405940061275066,
"loss": 1.6002,
"step": 3540
},
{
"epoch": 0.49261083743842365,
"grad_norm": 0.0,
"learning_rate": 0.00019400680212425077,
"loss": 1.5695,
"step": 3550
},
{
"epoch": 0.49399847360022203,
"grad_norm": 0.0,
"learning_rate": 0.0001939539789977095,
"loss": 1.6573,
"step": 3560
},
{
"epoch": 0.4953861097620204,
"grad_norm": 0.0,
"learning_rate": 0.00019390093135935262,
"loss": 1.6026,
"step": 3570
},
{
"epoch": 0.4967737459238188,
"grad_norm": 0.0,
"learning_rate": 0.0001938476593359424,
"loss": 1.5125,
"step": 3580
},
{
"epoch": 0.49816138208561717,
"grad_norm": 0.0,
"learning_rate": 0.00019379416305477734,
"loss": 1.4628,
"step": 3590
},
{
"epoch": 0.49954901824741554,
"grad_norm": 0.0,
"learning_rate": 0.00019374044264369183,
"loss": 1.4841,
"step": 3600
},
{
"epoch": 0.49954901824741554,
"eval_loss": 1.5553832054138184,
"eval_runtime": 853.8457,
"eval_samples_per_second": 15.005,
"eval_steps_per_second": 1.876,
"step": 3600
},
{
"epoch": 0.5009366544092139,
"grad_norm": 0.0,
"learning_rate": 0.0001936864982310558,
"loss": 1.4635,
"step": 3610
},
{
"epoch": 0.5023242905710122,
"grad_norm": 0.0,
"learning_rate": 0.00019363232994577438,
"loss": 1.5569,
"step": 3620
},
{
"epoch": 0.5037119267328106,
"grad_norm": 0.0,
"learning_rate": 0.00019357793791728787,
"loss": 1.5724,
"step": 3630
},
{
"epoch": 0.505099562894609,
"grad_norm": 0.0,
"learning_rate": 0.00019352332227557105,
"loss": 1.593,
"step": 3640
},
{
"epoch": 0.5064871990564074,
"grad_norm": 0.0,
"learning_rate": 0.00019346848315113314,
"loss": 1.516,
"step": 3650
},
{
"epoch": 0.5078748352182058,
"grad_norm": 0.0,
"learning_rate": 0.00019341342067501728,
"loss": 1.5541,
"step": 3660
},
{
"epoch": 0.5092624713800041,
"grad_norm": 0.0,
"learning_rate": 0.0001933581349788005,
"loss": 1.5348,
"step": 3670
},
{
"epoch": 0.5106501075418025,
"grad_norm": 0.0,
"learning_rate": 0.00019330262619459305,
"loss": 1.4744,
"step": 3680
},
{
"epoch": 0.5120377437036009,
"grad_norm": 0.0,
"learning_rate": 0.0001932468944550384,
"loss": 1.5221,
"step": 3690
},
{
"epoch": 0.5134253798653993,
"grad_norm": 0.0,
"learning_rate": 0.00019319093989331277,
"loss": 1.597,
"step": 3700
},
{
"epoch": 0.5134253798653993,
"eval_loss": 1.5553832054138184,
"eval_runtime": 850.2547,
"eval_samples_per_second": 15.068,
"eval_steps_per_second": 1.884,
"step": 3700
},
{
"epoch": 0.5148130160271976,
"grad_norm": 0.0,
"learning_rate": 0.0001931347626431248,
"loss": 1.5977,
"step": 3710
},
{
"epoch": 0.516200652188996,
"grad_norm": 0.0,
"learning_rate": 0.00019307836283871525,
"loss": 1.5051,
"step": 3720
},
{
"epoch": 0.5175882883507944,
"grad_norm": 0.0,
"learning_rate": 0.00019302174061485675,
"loss": 1.4407,
"step": 3730
},
{
"epoch": 0.5189759245125928,
"grad_norm": 0.0,
"learning_rate": 0.00019296489610685344,
"loss": 1.4938,
"step": 3740
},
{
"epoch": 0.5203635606743912,
"grad_norm": 0.0,
"learning_rate": 0.0001929078294505405,
"loss": 1.5282,
"step": 3750
},
{
"epoch": 0.5217511968361895,
"grad_norm": 0.0,
"learning_rate": 0.0001928505407822841,
"loss": 1.546,
"step": 3760
},
{
"epoch": 0.5231388329979879,
"grad_norm": 0.0,
"learning_rate": 0.00019279303023898086,
"loss": 1.568,
"step": 3770
},
{
"epoch": 0.5245264691597863,
"grad_norm": 0.0,
"learning_rate": 0.0001927352979580576,
"loss": 1.5893,
"step": 3780
},
{
"epoch": 0.5259141053215847,
"grad_norm": 0.0,
"learning_rate": 0.00019267734407747095,
"loss": 1.5501,
"step": 3790
},
{
"epoch": 0.527301741483383,
"grad_norm": 0.0,
"learning_rate": 0.0001926191687357072,
"loss": 1.5555,
"step": 3800
},
{
"epoch": 0.527301741483383,
"eval_loss": 1.5553832054138184,
"eval_runtime": 854.0196,
"eval_samples_per_second": 15.002,
"eval_steps_per_second": 1.876,
"step": 3800
},
{
"epoch": 0.5286893776451814,
"grad_norm": 0.0,
"learning_rate": 0.00019256077207178174,
"loss": 1.6144,
"step": 3810
},
{
"epoch": 0.5300770138069798,
"grad_norm": 0.0,
"learning_rate": 0.00019250215422523883,
"loss": 1.6102,
"step": 3820
},
{
"epoch": 0.5314646499687782,
"grad_norm": 0.0,
"learning_rate": 0.00019244331533615133,
"loss": 1.5673,
"step": 3830
},
{
"epoch": 0.5328522861305766,
"grad_norm": 0.0,
"learning_rate": 0.00019238425554512025,
"loss": 1.4782,
"step": 3840
},
{
"epoch": 0.5342399222923749,
"grad_norm": 0.0,
"learning_rate": 0.0001923249749932745,
"loss": 1.559,
"step": 3850
},
{
"epoch": 0.5356275584541733,
"grad_norm": 0.0,
"learning_rate": 0.00019226547382227046,
"loss": 1.5148,
"step": 3860
},
{
"epoch": 0.5370151946159717,
"grad_norm": 0.0,
"learning_rate": 0.00019220575217429174,
"loss": 1.5362,
"step": 3870
},
{
"epoch": 0.5384028307777701,
"grad_norm": 0.0,
"learning_rate": 0.0001921458101920489,
"loss": 1.5881,
"step": 3880
},
{
"epoch": 0.5397904669395684,
"grad_norm": 0.0,
"learning_rate": 0.0001920856480187788,
"loss": 1.6019,
"step": 3890
},
{
"epoch": 0.5411781031013668,
"grad_norm": 0.0,
"learning_rate": 0.0001920252657982446,
"loss": 1.5708,
"step": 3900
},
{
"epoch": 0.5411781031013668,
"eval_loss": 1.5553832054138184,
"eval_runtime": 851.0746,
"eval_samples_per_second": 15.054,
"eval_steps_per_second": 1.882,
"step": 3900
},
{
"epoch": 0.5425657392631652,
"grad_norm": 0.0,
"learning_rate": 0.0001919646636747353,
"loss": 1.5765,
"step": 3910
},
{
"epoch": 0.5439533754249636,
"grad_norm": 0.0,
"learning_rate": 0.00019190384179306526,
"loss": 1.5805,
"step": 3920
},
{
"epoch": 0.545341011586762,
"grad_norm": 0.0,
"learning_rate": 0.00019184280029857417,
"loss": 1.5299,
"step": 3930
},
{
"epoch": 0.5467286477485603,
"grad_norm": 0.0,
"learning_rate": 0.00019178153933712626,
"loss": 1.5059,
"step": 3940
},
{
"epoch": 0.5481162839103587,
"grad_norm": 0.0,
"learning_rate": 0.00019172005905511043,
"loss": 1.5024,
"step": 3950
},
{
"epoch": 0.5495039200721571,
"grad_norm": 0.0,
"learning_rate": 0.0001916583595994395,
"loss": 1.5994,
"step": 3960
},
{
"epoch": 0.5508915562339555,
"grad_norm": 0.0,
"learning_rate": 0.00019159644111755005,
"loss": 1.5222,
"step": 3970
},
{
"epoch": 0.5522791923957538,
"grad_norm": 0.0,
"learning_rate": 0.00019153430375740222,
"loss": 1.5622,
"step": 3980
},
{
"epoch": 0.5536668285575522,
"grad_norm": 0.0,
"learning_rate": 0.0001914719476674789,
"loss": 1.5364,
"step": 3990
},
{
"epoch": 0.5550544647193506,
"grad_norm": 0.0,
"learning_rate": 0.0001914093729967859,
"loss": 1.5421,
"step": 4000
},
{
"epoch": 0.5550544647193506,
"eval_loss": 1.5553832054138184,
"eval_runtime": 851.1421,
"eval_samples_per_second": 15.053,
"eval_steps_per_second": 1.882,
"step": 4000
},
{
"epoch": 0.556442100881149,
"grad_norm": 0.0,
"learning_rate": 0.00019134657989485114,
"loss": 1.5533,
"step": 4010
},
{
"epoch": 0.5578297370429474,
"grad_norm": 0.0,
"learning_rate": 0.0001912835685117247,
"loss": 1.5798,
"step": 4020
},
{
"epoch": 0.5592173732047457,
"grad_norm": 0.0,
"learning_rate": 0.0001912203389979781,
"loss": 1.5972,
"step": 4030
},
{
"epoch": 0.5606050093665441,
"grad_norm": 0.0,
"learning_rate": 0.00019115689150470423,
"loss": 1.5845,
"step": 4040
},
{
"epoch": 0.5619926455283425,
"grad_norm": 0.0,
"learning_rate": 0.00019109322618351678,
"loss": 1.5489,
"step": 4050
},
{
"epoch": 0.5633802816901409,
"grad_norm": 0.0,
"learning_rate": 0.00019102934318654998,
"loss": 1.5561,
"step": 4060
},
{
"epoch": 0.5647679178519392,
"grad_norm": 0.0,
"learning_rate": 0.00019096524266645824,
"loss": 1.5311,
"step": 4070
},
{
"epoch": 0.5661555540137376,
"grad_norm": 0.0,
"learning_rate": 0.00019090092477641574,
"loss": 1.491,
"step": 4080
},
{
"epoch": 0.567543190175536,
"grad_norm": 0.0,
"learning_rate": 0.0001908363896701161,
"loss": 1.5083,
"step": 4090
},
{
"epoch": 0.5689308263373344,
"grad_norm": 0.0,
"learning_rate": 0.00019077163750177195,
"loss": 1.4945,
"step": 4100
},
{
"epoch": 0.5689308263373344,
"eval_loss": 1.5553832054138184,
"eval_runtime": 853.7869,
"eval_samples_per_second": 15.006,
"eval_steps_per_second": 1.876,
"step": 4100
},
{
"epoch": 0.5703184624991328,
"grad_norm": 0.0,
"learning_rate": 0.00019070666842611468,
"loss": 1.5692,
"step": 4110
},
{
"epoch": 0.5717060986609311,
"grad_norm": 0.0,
"learning_rate": 0.000190641482598394,
"loss": 1.4878,
"step": 4120
},
{
"epoch": 0.5730937348227295,
"grad_norm": 0.0,
"learning_rate": 0.00019057608017437744,
"loss": 1.4801,
"step": 4130
},
{
"epoch": 0.5744813709845279,
"grad_norm": 0.0,
"learning_rate": 0.00019051046131035032,
"loss": 1.4968,
"step": 4140
},
{
"epoch": 0.5758690071463263,
"grad_norm": 0.0,
"learning_rate": 0.000190444626163115,
"loss": 1.5686,
"step": 4150
},
{
"epoch": 0.5772566433081247,
"grad_norm": 0.0,
"learning_rate": 0.00019037857488999067,
"loss": 1.5768,
"step": 4160
},
{
"epoch": 0.578644279469923,
"grad_norm": 0.0,
"learning_rate": 0.00019031230764881313,
"loss": 1.5261,
"step": 4170
},
{
"epoch": 0.5800319156317214,
"grad_norm": 0.0,
"learning_rate": 0.00019024582459793406,
"loss": 1.6063,
"step": 4180
},
{
"epoch": 0.5814195517935198,
"grad_norm": 0.0,
"learning_rate": 0.00019017912589622092,
"loss": 1.5674,
"step": 4190
},
{
"epoch": 0.5828071879553182,
"grad_norm": 0.0,
"learning_rate": 0.00019011221170305657,
"loss": 1.6424,
"step": 4200
},
{
"epoch": 0.5828071879553182,
"eval_loss": 1.5553832054138184,
"eval_runtime": 856.6655,
"eval_samples_per_second": 14.956,
"eval_steps_per_second": 1.87,
"step": 4200
},
{
"epoch": 0.5841948241171165,
"grad_norm": 0.0,
"learning_rate": 0.0001900450821783387,
"loss": 1.5284,
"step": 4210
},
{
"epoch": 0.5855824602789149,
"grad_norm": 0.0,
"learning_rate": 0.00018997773748247955,
"loss": 1.5112,
"step": 4220
},
{
"epoch": 0.5869700964407133,
"grad_norm": 0.0,
"learning_rate": 0.00018991017777640555,
"loss": 1.5168,
"step": 4230
},
{
"epoch": 0.5883577326025117,
"grad_norm": 0.0,
"learning_rate": 0.00018984240322155702,
"loss": 1.5474,
"step": 4240
},
{
"epoch": 0.58974536876431,
"grad_norm": 0.0,
"learning_rate": 0.0001897744139798875,
"loss": 1.5246,
"step": 4250
},
{
"epoch": 0.5911330049261084,
"grad_norm": 0.0,
"learning_rate": 0.00018970621021386372,
"loss": 1.5666,
"step": 4260
},
{
"epoch": 0.5925206410879067,
"grad_norm": 0.0,
"learning_rate": 0.0001896377920864649,
"loss": 1.5671,
"step": 4270
},
{
"epoch": 0.5939082772497051,
"grad_norm": 0.0,
"learning_rate": 0.00018956915976118256,
"loss": 1.5047,
"step": 4280
},
{
"epoch": 0.5952959134115035,
"grad_norm": 0.0,
"learning_rate": 0.00018950031340202007,
"loss": 1.5048,
"step": 4290
},
{
"epoch": 0.5966835495733018,
"grad_norm": 0.0,
"learning_rate": 0.00018943125317349226,
"loss": 1.4808,
"step": 4300
},
{
"epoch": 0.5966835495733018,
"eval_loss": 1.5553832054138184,
"eval_runtime": 848.2106,
"eval_samples_per_second": 15.105,
"eval_steps_per_second": 1.889,
"step": 4300
},
{
"epoch": 0.5980711857351002,
"grad_norm": 0.0,
"learning_rate": 0.00018936197924062493,
"loss": 1.6047,
"step": 4310
},
{
"epoch": 0.5994588218968986,
"grad_norm": 0.0,
"learning_rate": 0.0001892924917689547,
"loss": 1.5637,
"step": 4320
},
{
"epoch": 0.600846458058697,
"grad_norm": 0.0,
"learning_rate": 0.00018922279092452836,
"loss": 1.5631,
"step": 4330
},
{
"epoch": 0.6022340942204953,
"grad_norm": 0.0,
"learning_rate": 0.00018915287687390256,
"loss": 1.5161,
"step": 4340
},
{
"epoch": 0.6036217303822937,
"grad_norm": 0.0,
"learning_rate": 0.00018908274978414344,
"loss": 1.5428,
"step": 4350
},
{
"epoch": 0.6050093665440921,
"grad_norm": 0.0,
"learning_rate": 0.0001890124098228263,
"loss": 1.4677,
"step": 4360
},
{
"epoch": 0.6063970027058905,
"grad_norm": 0.0,
"learning_rate": 0.00018894185715803504,
"loss": 1.558,
"step": 4370
},
{
"epoch": 0.6077846388676889,
"grad_norm": 0.0,
"learning_rate": 0.00018887109195836184,
"loss": 1.5269,
"step": 4380
},
{
"epoch": 0.6091722750294872,
"grad_norm": 0.0,
"learning_rate": 0.00018880011439290675,
"loss": 1.5649,
"step": 4390
},
{
"epoch": 0.6105599111912856,
"grad_norm": 0.0,
"learning_rate": 0.00018872892463127726,
"loss": 1.6088,
"step": 4400
},
{
"epoch": 0.6105599111912856,
"eval_loss": 1.5553832054138184,
"eval_runtime": 854.5624,
"eval_samples_per_second": 14.992,
"eval_steps_per_second": 1.875,
"step": 4400
},
{
"epoch": 0.611947547353084,
"grad_norm": 0.0,
"learning_rate": 0.00018865752284358807,
"loss": 1.5703,
"step": 4410
},
{
"epoch": 0.6133351835148824,
"grad_norm": 0.0,
"learning_rate": 0.00018858590920046032,
"loss": 1.627,
"step": 4420
},
{
"epoch": 0.6147228196766807,
"grad_norm": 0.0,
"learning_rate": 0.00018851408387302154,
"loss": 1.5132,
"step": 4430
},
{
"epoch": 0.6161104558384791,
"grad_norm": 0.0,
"learning_rate": 0.0001884420470329051,
"loss": 1.5088,
"step": 4440
},
{
"epoch": 0.6174980920002775,
"grad_norm": 0.0,
"learning_rate": 0.00018836979885224968,
"loss": 1.5003,
"step": 4450
},
{
"epoch": 0.6188857281620759,
"grad_norm": 0.0,
"learning_rate": 0.00018829733950369914,
"loss": 1.5189,
"step": 4460
},
{
"epoch": 0.6202733643238743,
"grad_norm": 0.0,
"learning_rate": 0.00018822466916040183,
"loss": 1.4951,
"step": 4470
},
{
"epoch": 0.6216610004856726,
"grad_norm": 0.0,
"learning_rate": 0.00018815178799601036,
"loss": 1.5847,
"step": 4480
},
{
"epoch": 0.623048636647471,
"grad_norm": 0.0,
"learning_rate": 0.00018807869618468103,
"loss": 1.5797,
"step": 4490
},
{
"epoch": 0.6244362728092694,
"grad_norm": 0.0,
"learning_rate": 0.0001880053939010736,
"loss": 1.5155,
"step": 4500
},
{
"epoch": 0.6244362728092694,
"eval_loss": 1.5553832054138184,
"eval_runtime": 849.8599,
"eval_samples_per_second": 15.075,
"eval_steps_per_second": 1.885,
"step": 4500
},
{
"epoch": 0.6258239089710678,
"grad_norm": 0.0,
"learning_rate": 0.00018793188132035072,
"loss": 1.4795,
"step": 4510
},
{
"epoch": 0.6272115451328661,
"grad_norm": 0.0,
"learning_rate": 0.00018785815861817762,
"loss": 1.5742,
"step": 4520
},
{
"epoch": 0.6285991812946645,
"grad_norm": 0.0,
"learning_rate": 0.00018778422597072147,
"loss": 1.6734,
"step": 4530
},
{
"epoch": 0.6299868174564629,
"grad_norm": 0.0,
"learning_rate": 0.00018771008355465135,
"loss": 1.5631,
"step": 4540
},
{
"epoch": 0.6313744536182613,
"grad_norm": 0.0,
"learning_rate": 0.00018763573154713744,
"loss": 1.5465,
"step": 4550
},
{
"epoch": 0.6327620897800597,
"grad_norm": 0.0,
"learning_rate": 0.00018756117012585084,
"loss": 1.5793,
"step": 4560
},
{
"epoch": 0.634149725941858,
"grad_norm": 0.0,
"learning_rate": 0.00018748639946896304,
"loss": 1.5554,
"step": 4570
},
{
"epoch": 0.6355373621036564,
"grad_norm": 0.0,
"learning_rate": 0.00018741141975514545,
"loss": 1.5383,
"step": 4580
},
{
"epoch": 0.6369249982654548,
"grad_norm": 0.0,
"learning_rate": 0.00018733623116356919,
"loss": 1.5009,
"step": 4590
},
{
"epoch": 0.6383126344272532,
"grad_norm": 0.0,
"learning_rate": 0.00018726083387390435,
"loss": 1.5424,
"step": 4600
},
{
"epoch": 0.6383126344272532,
"eval_loss": 1.5553832054138184,
"eval_runtime": 858.6756,
"eval_samples_per_second": 14.921,
"eval_steps_per_second": 1.866,
"step": 4600
},
{
"epoch": 0.6397002705890515,
"grad_norm": 0.0,
"learning_rate": 0.0001871852280663199,
"loss": 1.5264,
"step": 4610
},
{
"epoch": 0.6410879067508499,
"grad_norm": 0.0,
"learning_rate": 0.0001871094139214829,
"loss": 1.5278,
"step": 4620
},
{
"epoch": 0.6424755429126483,
"grad_norm": 0.0,
"learning_rate": 0.00018703339162055838,
"loss": 1.5058,
"step": 4630
},
{
"epoch": 0.6438631790744467,
"grad_norm": 0.0,
"learning_rate": 0.0001869571613452087,
"loss": 1.5311,
"step": 4640
},
{
"epoch": 0.6452508152362451,
"grad_norm": 0.0,
"learning_rate": 0.00018688072327759328,
"loss": 1.4861,
"step": 4650
},
{
"epoch": 0.6466384513980434,
"grad_norm": 0.0,
"learning_rate": 0.00018680407760036797,
"loss": 1.5559,
"step": 4660
},
{
"epoch": 0.6480260875598418,
"grad_norm": 0.0,
"learning_rate": 0.0001867272244966848,
"loss": 1.5407,
"step": 4670
},
{
"epoch": 0.6494137237216402,
"grad_norm": 0.0,
"learning_rate": 0.00018665016415019147,
"loss": 1.5442,
"step": 4680
},
{
"epoch": 0.6508013598834386,
"grad_norm": 0.0,
"learning_rate": 0.00018657289674503085,
"loss": 1.458,
"step": 4690
},
{
"epoch": 0.652188996045237,
"grad_norm": 0.0,
"learning_rate": 0.00018649542246584067,
"loss": 1.5634,
"step": 4700
},
{
"epoch": 0.652188996045237,
"eval_loss": 1.5553832054138184,
"eval_runtime": 854.275,
"eval_samples_per_second": 14.998,
"eval_steps_per_second": 1.875,
"step": 4700
},
{
"epoch": 0.6535766322070353,
"grad_norm": 0.0,
"learning_rate": 0.0001864177414977529,
"loss": 1.5132,
"step": 4710
},
{
"epoch": 0.6549642683688337,
"grad_norm": 0.0,
"learning_rate": 0.00018633985402639351,
"loss": 1.5802,
"step": 4720
},
{
"epoch": 0.6563519045306321,
"grad_norm": 0.0,
"learning_rate": 0.0001862617602378819,
"loss": 1.5437,
"step": 4730
},
{
"epoch": 0.6577395406924305,
"grad_norm": 0.0,
"learning_rate": 0.0001861834603188305,
"loss": 1.4459,
"step": 4740
},
{
"epoch": 0.6591271768542288,
"grad_norm": 0.0,
"learning_rate": 0.00018610495445634423,
"loss": 1.5517,
"step": 4750
},
{
"epoch": 0.6605148130160272,
"grad_norm": 0.0,
"learning_rate": 0.00018602624283802022,
"loss": 1.5395,
"step": 4760
},
{
"epoch": 0.6619024491778256,
"grad_norm": 0.0,
"learning_rate": 0.00018594732565194722,
"loss": 1.5535,
"step": 4770
},
{
"epoch": 0.663290085339624,
"grad_norm": 0.0,
"learning_rate": 0.00018586820308670525,
"loss": 1.518,
"step": 4780
},
{
"epoch": 0.6646777215014223,
"grad_norm": 0.0,
"learning_rate": 0.00018578887533136505,
"loss": 1.5607,
"step": 4790
},
{
"epoch": 0.6660653576632207,
"grad_norm": 0.0,
"learning_rate": 0.00018570934257548772,
"loss": 1.5749,
"step": 4800
},
{
"epoch": 0.6660653576632207,
"eval_loss": 1.5553832054138184,
"eval_runtime": 851.7238,
"eval_samples_per_second": 15.042,
"eval_steps_per_second": 1.881,
"step": 4800
},
{
"epoch": 0.6674529938250191,
"grad_norm": 0.0,
"learning_rate": 0.00018562960500912424,
"loss": 1.5956,
"step": 4810
},
{
"epoch": 0.6688406299868175,
"grad_norm": 0.0,
"learning_rate": 0.0001855496628228149,
"loss": 1.6314,
"step": 4820
},
{
"epoch": 0.6702282661486159,
"grad_norm": 0.0,
"learning_rate": 0.00018546951620758913,
"loss": 1.5407,
"step": 4830
},
{
"epoch": 0.6716159023104142,
"grad_norm": 0.0,
"learning_rate": 0.00018538916535496472,
"loss": 1.4737,
"step": 4840
},
{
"epoch": 0.6730035384722126,
"grad_norm": 0.0,
"learning_rate": 0.00018530861045694752,
"loss": 1.5509,
"step": 4850
},
{
"epoch": 0.674391174634011,
"grad_norm": 0.0,
"learning_rate": 0.00018522785170603097,
"loss": 1.5355,
"step": 4860
},
{
"epoch": 0.6757788107958094,
"grad_norm": 0.0,
"learning_rate": 0.0001851468892951957,
"loss": 1.5318,
"step": 4870
},
{
"epoch": 0.6771664469576077,
"grad_norm": 0.0,
"learning_rate": 0.00018506572341790898,
"loss": 1.5004,
"step": 4880
},
{
"epoch": 0.6785540831194061,
"grad_norm": 0.0,
"learning_rate": 0.00018498435426812418,
"loss": 1.4296,
"step": 4890
},
{
"epoch": 0.6799417192812045,
"grad_norm": 0.0,
"learning_rate": 0.00018490278204028057,
"loss": 1.5449,
"step": 4900
},
{
"epoch": 0.6799417192812045,
"eval_loss": 1.5553832054138184,
"eval_runtime": 851.692,
"eval_samples_per_second": 15.043,
"eval_steps_per_second": 1.881,
"step": 4900
},
{
"epoch": 0.6813293554430029,
"grad_norm": 0.0,
"learning_rate": 0.0001848210069293026,
"loss": 1.5653,
"step": 4910
},
{
"epoch": 0.6827169916048013,
"grad_norm": 0.0,
"learning_rate": 0.00018473902913059947,
"loss": 1.5689,
"step": 4920
},
{
"epoch": 0.6841046277665996,
"grad_norm": 0.0,
"learning_rate": 0.00018465684884006484,
"loss": 1.602,
"step": 4930
},
{
"epoch": 0.685492263928398,
"grad_norm": 0.0,
"learning_rate": 0.00018457446625407627,
"loss": 1.5003,
"step": 4940
},
{
"epoch": 0.6868799000901964,
"grad_norm": 0.0,
"learning_rate": 0.00018449188156949452,
"loss": 1.5839,
"step": 4950
},
{
"epoch": 0.6882675362519948,
"grad_norm": 0.0,
"learning_rate": 0.0001844090949836635,
"loss": 1.5864,
"step": 4960
},
{
"epoch": 0.6896551724137931,
"grad_norm": 0.0,
"learning_rate": 0.00018432610669440948,
"loss": 1.5096,
"step": 4970
},
{
"epoch": 0.6910428085755915,
"grad_norm": 0.0,
"learning_rate": 0.00018424291690004072,
"loss": 1.5729,
"step": 4980
},
{
"epoch": 0.6924304447373899,
"grad_norm": 0.0,
"learning_rate": 0.000184159525799347,
"loss": 1.5004,
"step": 4990
},
{
"epoch": 0.6938180808991883,
"grad_norm": 0.0,
"learning_rate": 0.00018407593359159917,
"loss": 1.5744,
"step": 5000
},
{
"epoch": 0.6938180808991883,
"eval_loss": 1.5553832054138184,
"eval_runtime": 852.3835,
"eval_samples_per_second": 15.031,
"eval_steps_per_second": 1.879,
"step": 5000
},
{
"epoch": 0.6952057170609867,
"grad_norm": 0.0,
"learning_rate": 0.00018399214047654864,
"loss": 1.5931,
"step": 5010
},
{
"epoch": 0.696593353222785,
"grad_norm": 0.0,
"learning_rate": 0.00018390814665442687,
"loss": 1.5866,
"step": 5020
},
{
"epoch": 0.6979809893845834,
"grad_norm": 0.0,
"learning_rate": 0.00018382395232594497,
"loss": 1.5144,
"step": 5030
},
{
"epoch": 0.6993686255463817,
"grad_norm": 0.0,
"learning_rate": 0.00018373955769229313,
"loss": 1.5864,
"step": 5040
},
{
"epoch": 0.7007562617081801,
"grad_norm": 0.0,
"learning_rate": 0.0001836549629551402,
"loss": 1.5325,
"step": 5050
},
{
"epoch": 0.7021438978699784,
"grad_norm": 0.0,
"learning_rate": 0.00018357016831663326,
"loss": 1.5071,
"step": 5060
},
{
"epoch": 0.7035315340317768,
"grad_norm": 0.0,
"learning_rate": 0.00018348517397939702,
"loss": 1.5151,
"step": 5070
},
{
"epoch": 0.7049191701935752,
"grad_norm": 0.0,
"learning_rate": 0.00018339998014653338,
"loss": 1.5249,
"step": 5080
},
{
"epoch": 0.7063068063553736,
"grad_norm": 0.0,
"learning_rate": 0.00018331458702162094,
"loss": 1.5084,
"step": 5090
},
{
"epoch": 0.707694442517172,
"grad_norm": 0.0,
"learning_rate": 0.00018322899480871461,
"loss": 1.4721,
"step": 5100
},
{
"epoch": 0.707694442517172,
"eval_loss": 1.5553832054138184,
"eval_runtime": 853.7906,
"eval_samples_per_second": 15.006,
"eval_steps_per_second": 1.876,
"step": 5100
},
{
"epoch": 0.7090820786789703,
"grad_norm": 0.0,
"learning_rate": 0.00018314320371234493,
"loss": 1.5513,
"step": 5110
},
{
"epoch": 0.7104697148407687,
"grad_norm": 0.0,
"learning_rate": 0.00018305721393751777,
"loss": 1.5281,
"step": 5120
},
{
"epoch": 0.7118573510025671,
"grad_norm": 0.0,
"learning_rate": 0.00018297102568971376,
"loss": 1.5382,
"step": 5130
},
{
"epoch": 0.7132449871643655,
"grad_norm": 0.0,
"learning_rate": 0.00018288463917488773,
"loss": 1.5517,
"step": 5140
},
{
"epoch": 0.7146326233261638,
"grad_norm": 0.0,
"learning_rate": 0.00018279805459946836,
"loss": 1.536,
"step": 5150
},
{
"epoch": 0.7160202594879622,
"grad_norm": 0.0,
"learning_rate": 0.0001827112721703576,
"loss": 1.5671,
"step": 5160
},
{
"epoch": 0.7174078956497606,
"grad_norm": 0.0,
"learning_rate": 0.00018262429209493013,
"loss": 1.4578,
"step": 5170
},
{
"epoch": 0.718795531811559,
"grad_norm": 0.0,
"learning_rate": 0.000182537114581033,
"loss": 1.5988,
"step": 5180
},
{
"epoch": 0.7201831679733574,
"grad_norm": 0.0,
"learning_rate": 0.00018244973983698505,
"loss": 1.5489,
"step": 5190
},
{
"epoch": 0.7215708041351557,
"grad_norm": 0.0,
"learning_rate": 0.00018236216807157636,
"loss": 1.4934,
"step": 5200
},
{
"epoch": 0.7215708041351557,
"eval_loss": 1.5553832054138184,
"eval_runtime": 853.1134,
"eval_samples_per_second": 15.018,
"eval_steps_per_second": 1.878,
"step": 5200
},
{
"epoch": 0.7229584402969541,
"grad_norm": 0.0,
"learning_rate": 0.00018227439949406786,
"loss": 1.5401,
"step": 5210
},
{
"epoch": 0.7243460764587525,
"grad_norm": 0.0,
"learning_rate": 0.00018218643431419077,
"loss": 1.6144,
"step": 5220
},
{
"epoch": 0.7257337126205509,
"grad_norm": 0.0,
"learning_rate": 0.00018209827274214612,
"loss": 1.5523,
"step": 5230
},
{
"epoch": 0.7271213487823492,
"grad_norm": 0.0,
"learning_rate": 0.00018200991498860424,
"loss": 1.5574,
"step": 5240
},
{
"epoch": 0.7285089849441476,
"grad_norm": 0.0,
"learning_rate": 0.0001819213612647042,
"loss": 1.5118,
"step": 5250
},
{
"epoch": 0.729896621105946,
"grad_norm": 0.0,
"learning_rate": 0.00018183261178205345,
"loss": 1.5509,
"step": 5260
},
{
"epoch": 0.7312842572677444,
"grad_norm": 0.0,
"learning_rate": 0.0001817436667527271,
"loss": 1.5251,
"step": 5270
},
{
"epoch": 0.7326718934295428,
"grad_norm": 0.0,
"learning_rate": 0.0001816545263892677,
"loss": 1.6438,
"step": 5280
},
{
"epoch": 0.7340595295913411,
"grad_norm": 0.0,
"learning_rate": 0.0001815651909046845,
"loss": 1.5911,
"step": 5290
},
{
"epoch": 0.7354471657531395,
"grad_norm": 0.0,
"learning_rate": 0.00018147566051245287,
"loss": 1.5005,
"step": 5300
},
{
"epoch": 0.7354471657531395,
"eval_loss": 1.5553832054138184,
"eval_runtime": 860.5659,
"eval_samples_per_second": 14.888,
"eval_steps_per_second": 1.862,
"step": 5300
},
{
"epoch": 0.7368348019149379,
"grad_norm": 0.0,
"learning_rate": 0.0001813859354265141,
"loss": 1.482,
"step": 5310
},
{
"epoch": 0.7382224380767363,
"grad_norm": 0.0,
"learning_rate": 0.00018129601586127476,
"loss": 1.4996,
"step": 5320
},
{
"epoch": 0.7396100742385346,
"grad_norm": 0.0,
"learning_rate": 0.00018120590203160594,
"loss": 1.6339,
"step": 5330
},
{
"epoch": 0.740997710400333,
"grad_norm": 0.0,
"learning_rate": 0.0001811155941528431,
"loss": 1.4808,
"step": 5340
},
{
"epoch": 0.7423853465621314,
"grad_norm": 0.0,
"learning_rate": 0.00018102509244078538,
"loss": 1.4793,
"step": 5350
},
{
"epoch": 0.7437729827239298,
"grad_norm": 0.0,
"learning_rate": 0.00018093439711169506,
"loss": 1.571,
"step": 5360
},
{
"epoch": 0.7451606188857282,
"grad_norm": 0.0,
"learning_rate": 0.00018084350838229712,
"loss": 1.5976,
"step": 5370
},
{
"epoch": 0.7465482550475265,
"grad_norm": 0.0,
"learning_rate": 0.00018075242646977863,
"loss": 1.5838,
"step": 5380
},
{
"epoch": 0.7479358912093249,
"grad_norm": 0.0,
"learning_rate": 0.00018066115159178842,
"loss": 1.5638,
"step": 5390
},
{
"epoch": 0.7493235273711233,
"grad_norm": 0.0,
"learning_rate": 0.00018056968396643628,
"loss": 1.5642,
"step": 5400
},
{
"epoch": 0.7493235273711233,
"eval_loss": 1.5553832054138184,
"eval_runtime": 851.0477,
"eval_samples_per_second": 15.054,
"eval_steps_per_second": 1.882,
"step": 5400
},
{
"epoch": 0.7507111635329217,
"grad_norm": 0.0,
"learning_rate": 0.00018047802381229263,
"loss": 1.6306,
"step": 5410
},
{
"epoch": 0.75209879969472,
"grad_norm": 0.0,
"learning_rate": 0.00018038617134838805,
"loss": 1.5442,
"step": 5420
},
{
"epoch": 0.7534864358565184,
"grad_norm": 0.0,
"learning_rate": 0.00018029412679421253,
"loss": 1.5067,
"step": 5430
},
{
"epoch": 0.7548740720183168,
"grad_norm": 0.0,
"learning_rate": 0.00018020189036971516,
"loss": 1.5358,
"step": 5440
},
{
"epoch": 0.7562617081801152,
"grad_norm": 0.0,
"learning_rate": 0.0001801094622953035,
"loss": 1.4645,
"step": 5450
},
{
"epoch": 0.7576493443419136,
"grad_norm": 0.0,
"learning_rate": 0.0001800168427918431,
"loss": 1.4765,
"step": 5460
},
{
"epoch": 0.7590369805037119,
"grad_norm": 0.0,
"learning_rate": 0.00017992403208065685,
"loss": 1.5688,
"step": 5470
},
{
"epoch": 0.7604246166655103,
"grad_norm": 0.0,
"learning_rate": 0.00017983103038352467,
"loss": 1.5129,
"step": 5480
},
{
"epoch": 0.7618122528273087,
"grad_norm": 0.0,
"learning_rate": 0.00017973783792268278,
"loss": 1.5572,
"step": 5490
},
{
"epoch": 0.7631998889891071,
"grad_norm": 0.0,
"learning_rate": 0.00017964445492082333,
"loss": 1.572,
"step": 5500
},
{
"epoch": 0.7631998889891071,
"eval_loss": 1.5553832054138184,
"eval_runtime": 852.5786,
"eval_samples_per_second": 15.027,
"eval_steps_per_second": 1.879,
"step": 5500
},
{
"epoch": 0.7645875251509054,
"grad_norm": 0.0,
"learning_rate": 0.0001795508816010937,
"loss": 1.4957,
"step": 5510
},
{
"epoch": 0.7659751613127038,
"grad_norm": 0.0,
"learning_rate": 0.0001794571181870961,
"loss": 1.5166,
"step": 5520
},
{
"epoch": 0.7673627974745022,
"grad_norm": 0.0,
"learning_rate": 0.00017936316490288693,
"loss": 1.5996,
"step": 5530
},
{
"epoch": 0.7687504336363006,
"grad_norm": 0.0,
"learning_rate": 0.00017926902197297636,
"loss": 1.5391,
"step": 5540
},
{
"epoch": 0.770138069798099,
"grad_norm": 0.0,
"learning_rate": 0.00017917468962232777,
"loss": 1.5454,
"step": 5550
},
{
"epoch": 0.7715257059598973,
"grad_norm": 0.0,
"learning_rate": 0.00017908016807635706,
"loss": 1.5633,
"step": 5560
},
{
"epoch": 0.7729133421216957,
"grad_norm": 0.0,
"learning_rate": 0.00017898545756093233,
"loss": 1.5768,
"step": 5570
},
{
"epoch": 0.7743009782834941,
"grad_norm": 0.0,
"learning_rate": 0.00017889055830237326,
"loss": 1.5428,
"step": 5580
},
{
"epoch": 0.7756886144452925,
"grad_norm": 0.0,
"learning_rate": 0.00017879547052745043,
"loss": 1.5687,
"step": 5590
},
{
"epoch": 0.7770762506070908,
"grad_norm": 0.0,
"learning_rate": 0.00017870019446338496,
"loss": 1.5115,
"step": 5600
},
{
"epoch": 0.7770762506070908,
"eval_loss": 1.5553832054138184,
"eval_runtime": 850.6758,
"eval_samples_per_second": 15.061,
"eval_steps_per_second": 1.883,
"step": 5600
},
{
"epoch": 0.7784638867688892,
"grad_norm": 0.0,
"learning_rate": 0.00017860473033784796,
"loss": 1.5663,
"step": 5610
},
{
"epoch": 0.7798515229306876,
"grad_norm": 0.0,
"learning_rate": 0.0001785090783789599,
"loss": 1.4973,
"step": 5620
},
{
"epoch": 0.781239159092486,
"grad_norm": 0.0,
"learning_rate": 0.00017841323881529,
"loss": 1.5756,
"step": 5630
},
{
"epoch": 0.7826267952542844,
"grad_norm": 0.0,
"learning_rate": 0.00017831721187585594,
"loss": 1.4844,
"step": 5640
},
{
"epoch": 0.7840144314160827,
"grad_norm": 0.0,
"learning_rate": 0.00017822099779012297,
"loss": 1.5505,
"step": 5650
},
{
"epoch": 0.7854020675778811,
"grad_norm": 0.0,
"learning_rate": 0.00017812459678800374,
"loss": 1.593,
"step": 5660
},
{
"epoch": 0.7867897037396795,
"grad_norm": 0.0,
"learning_rate": 0.0001780280090998574,
"loss": 1.5968,
"step": 5670
},
{
"epoch": 0.7881773399014779,
"grad_norm": 0.0,
"learning_rate": 0.00017793123495648926,
"loss": 1.5763,
"step": 5680
},
{
"epoch": 0.7895649760632762,
"grad_norm": 0.0,
"learning_rate": 0.00017783427458915022,
"loss": 1.5819,
"step": 5690
},
{
"epoch": 0.7909526122250746,
"grad_norm": 0.0,
"learning_rate": 0.00017773712822953612,
"loss": 1.5361,
"step": 5700
},
{
"epoch": 0.7909526122250746,
"eval_loss": 1.5553832054138184,
"eval_runtime": 855.1786,
"eval_samples_per_second": 14.982,
"eval_steps_per_second": 1.873,
"step": 5700
},
{
"epoch": 0.792340248386873,
"grad_norm": 0.0,
"learning_rate": 0.00017763979610978728,
"loss": 1.5174,
"step": 5710
},
{
"epoch": 0.7937278845486714,
"grad_norm": 0.0,
"learning_rate": 0.00017754227846248784,
"loss": 1.4874,
"step": 5720
},
{
"epoch": 0.7951155207104698,
"grad_norm": 0.0,
"learning_rate": 0.00017744457552066538,
"loss": 1.5176,
"step": 5730
},
{
"epoch": 0.7965031568722681,
"grad_norm": 0.0,
"learning_rate": 0.00017734668751779025,
"loss": 1.5795,
"step": 5740
},
{
"epoch": 0.7978907930340665,
"grad_norm": 0.0,
"learning_rate": 0.0001772486146877749,
"loss": 1.505,
"step": 5750
},
{
"epoch": 0.7992784291958649,
"grad_norm": 0.0,
"learning_rate": 0.0001771503572649736,
"loss": 1.4841,
"step": 5760
},
{
"epoch": 0.8006660653576633,
"grad_norm": 0.0,
"learning_rate": 0.0001770519154841816,
"loss": 1.444,
"step": 5770
},
{
"epoch": 0.8020537015194616,
"grad_norm": 0.0,
"learning_rate": 0.00017695328958063477,
"loss": 1.585,
"step": 5780
},
{
"epoch": 0.80344133768126,
"grad_norm": 0.0,
"learning_rate": 0.0001768544797900089,
"loss": 1.5706,
"step": 5790
},
{
"epoch": 0.8048289738430584,
"grad_norm": 0.0,
"learning_rate": 0.00017675548634841923,
"loss": 1.6169,
"step": 5800
},
{
"epoch": 0.8048289738430584,
"eval_loss": 1.5553832054138184,
"eval_runtime": 854.03,
"eval_samples_per_second": 15.002,
"eval_steps_per_second": 1.876,
"step": 5800
},
{
"epoch": 0.8062166100048568,
"grad_norm": 0.0,
"learning_rate": 0.00017665630949241984,
"loss": 1.6402,
"step": 5810
},
{
"epoch": 0.807604246166655,
"grad_norm": 0.0,
"learning_rate": 0.0001765569494590031,
"loss": 1.5333,
"step": 5820
},
{
"epoch": 0.8089918823284534,
"grad_norm": 0.0,
"learning_rate": 0.00017645740648559912,
"loss": 1.4236,
"step": 5830
},
{
"epoch": 0.8103795184902518,
"grad_norm": 0.0,
"learning_rate": 0.0001763576808100751,
"loss": 1.5217,
"step": 5840
},
{
"epoch": 0.8117671546520502,
"grad_norm": 0.0,
"learning_rate": 0.00017625777267073488,
"loss": 1.521,
"step": 5850
},
{
"epoch": 0.8131547908138486,
"grad_norm": 0.0,
"learning_rate": 0.00017615768230631832,
"loss": 1.5287,
"step": 5860
},
{
"epoch": 0.8145424269756469,
"grad_norm": 0.0,
"learning_rate": 0.0001760574099560007,
"loss": 1.6059,
"step": 5870
},
{
"epoch": 0.8159300631374453,
"grad_norm": 0.0,
"learning_rate": 0.00017595695585939213,
"loss": 1.5461,
"step": 5880
},
{
"epoch": 0.8173176992992437,
"grad_norm": 0.0,
"learning_rate": 0.00017585632025653711,
"loss": 1.5032,
"step": 5890
},
{
"epoch": 0.8187053354610421,
"grad_norm": 0.0,
"learning_rate": 0.0001757555033879138,
"loss": 1.627,
"step": 5900
},
{
"epoch": 0.8187053354610421,
"eval_loss": 1.5553832054138184,
"eval_runtime": 851.9761,
"eval_samples_per_second": 15.038,
"eval_steps_per_second": 1.88,
"step": 5900
},
{
"epoch": 0.8200929716228404,
"grad_norm": 0.0,
"learning_rate": 0.00017565450549443355,
"loss": 1.5245,
"step": 5910
},
{
"epoch": 0.8214806077846388,
"grad_norm": 0.0,
"learning_rate": 0.00017555332681744022,
"loss": 1.5135,
"step": 5920
},
{
"epoch": 0.8228682439464372,
"grad_norm": 0.0,
"learning_rate": 0.00017545196759870976,
"loss": 1.6013,
"step": 5930
},
{
"epoch": 0.8242558801082356,
"grad_norm": 0.0,
"learning_rate": 0.0001753504280804495,
"loss": 1.5353,
"step": 5940
},
{
"epoch": 0.825643516270034,
"grad_norm": 0.0,
"learning_rate": 0.0001752487085052976,
"loss": 1.5625,
"step": 5950
},
{
"epoch": 0.8270311524318323,
"grad_norm": 0.0,
"learning_rate": 0.0001751468091163225,
"loss": 1.5264,
"step": 5960
},
{
"epoch": 0.8284187885936307,
"grad_norm": 0.0,
"learning_rate": 0.0001750447301570223,
"loss": 1.5596,
"step": 5970
},
{
"epoch": 0.8298064247554291,
"grad_norm": 0.0,
"learning_rate": 0.00017494247187132422,
"loss": 1.5619,
"step": 5980
},
{
"epoch": 0.8311940609172275,
"grad_norm": 0.0,
"learning_rate": 0.000174840034503584,
"loss": 1.5473,
"step": 5990
},
{
"epoch": 0.8325816970790259,
"grad_norm": 0.0,
"learning_rate": 0.0001747374182985853,
"loss": 1.5031,
"step": 6000
},
{
"epoch": 0.8325816970790259,
"eval_loss": 1.5553832054138184,
"eval_runtime": 852.2783,
"eval_samples_per_second": 15.033,
"eval_steps_per_second": 1.88,
"step": 6000
},
{
"epoch": 0.8339693332408242,
"grad_norm": 0.0,
"learning_rate": 0.00017463462350153912,
"loss": 1.5695,
"step": 6010
},
{
"epoch": 0.8353569694026226,
"grad_norm": 0.0,
"learning_rate": 0.0001745316503580833,
"loss": 1.4705,
"step": 6020
},
{
"epoch": 0.836744605564421,
"grad_norm": 0.0,
"learning_rate": 0.00017442849911428172,
"loss": 1.5375,
"step": 6030
},
{
"epoch": 0.8381322417262194,
"grad_norm": 0.0,
"learning_rate": 0.00017432517001662393,
"loss": 1.5085,
"step": 6040
},
{
"epoch": 0.8395198778880177,
"grad_norm": 0.0,
"learning_rate": 0.0001742216633120245,
"loss": 1.6365,
"step": 6050
},
{
"epoch": 0.8409075140498161,
"grad_norm": 0.0,
"learning_rate": 0.00017411797924782237,
"loss": 1.5646,
"step": 6060
},
{
"epoch": 0.8422951502116145,
"grad_norm": 0.0,
"learning_rate": 0.00017401411807178028,
"loss": 1.5703,
"step": 6070
},
{
"epoch": 0.8436827863734129,
"grad_norm": 0.0,
"learning_rate": 0.00017391008003208422,
"loss": 1.5321,
"step": 6080
},
{
"epoch": 0.8450704225352113,
"grad_norm": 0.0,
"learning_rate": 0.00017380586537734286,
"loss": 1.5316,
"step": 6090
},
{
"epoch": 0.8464580586970096,
"grad_norm": 0.0,
"learning_rate": 0.00017370147435658682,
"loss": 1.5733,
"step": 6100
},
{
"epoch": 0.8464580586970096,
"eval_loss": 1.5553832054138184,
"eval_runtime": 849.0242,
"eval_samples_per_second": 15.09,
"eval_steps_per_second": 1.887,
"step": 6100
},
{
"epoch": 0.847845694858808,
"grad_norm": 0.0,
"learning_rate": 0.0001735969072192682,
"loss": 1.5344,
"step": 6110
},
{
"epoch": 0.8492333310206064,
"grad_norm": 0.0,
"learning_rate": 0.00017349216421525993,
"loss": 1.5371,
"step": 6120
},
{
"epoch": 0.8506209671824048,
"grad_norm": 0.0,
"learning_rate": 0.00017338724559485527,
"loss": 1.5302,
"step": 6130
},
{
"epoch": 0.8520086033442031,
"grad_norm": 0.0,
"learning_rate": 0.00017328215160876702,
"loss": 1.5218,
"step": 6140
},
{
"epoch": 0.8533962395060015,
"grad_norm": 0.0,
"learning_rate": 0.00017317688250812708,
"loss": 1.5614,
"step": 6150
},
{
"epoch": 0.8547838756677999,
"grad_norm": 0.0,
"learning_rate": 0.00017307143854448587,
"loss": 1.527,
"step": 6160
},
{
"epoch": 0.8561715118295983,
"grad_norm": 0.0,
"learning_rate": 0.00017296581996981153,
"loss": 1.531,
"step": 6170
},
{
"epoch": 0.8575591479913967,
"grad_norm": 0.0,
"learning_rate": 0.00017286002703648955,
"loss": 1.5872,
"step": 6180
},
{
"epoch": 0.858946784153195,
"grad_norm": 0.0,
"learning_rate": 0.00017275405999732203,
"loss": 1.5583,
"step": 6190
},
{
"epoch": 0.8603344203149934,
"grad_norm": 0.0,
"learning_rate": 0.00017264791910552712,
"loss": 1.5403,
"step": 6200
},
{
"epoch": 0.8603344203149934,
"eval_loss": 1.5553832054138184,
"eval_runtime": 863.4297,
"eval_samples_per_second": 14.838,
"eval_steps_per_second": 1.855,
"step": 6200
},
{
"epoch": 0.8617220564767918,
"grad_norm": 0.0,
"learning_rate": 0.0001725416046147384,
"loss": 1.5541,
"step": 6210
},
{
"epoch": 0.8631096926385902,
"grad_norm": 0.0,
"learning_rate": 0.00017243511677900424,
"loss": 1.5928,
"step": 6220
},
{
"epoch": 0.8644973288003885,
"grad_norm": 0.0,
"learning_rate": 0.00017232845585278734,
"loss": 1.551,
"step": 6230
},
{
"epoch": 0.8658849649621869,
"grad_norm": 0.0,
"learning_rate": 0.0001722216220909639,
"loss": 1.4659,
"step": 6240
},
{
"epoch": 0.8672726011239853,
"grad_norm": 0.0,
"learning_rate": 0.00017211461574882325,
"loss": 1.5259,
"step": 6250
},
{
"epoch": 0.8686602372857837,
"grad_norm": 0.0,
"learning_rate": 0.000172007437082067,
"loss": 1.4713,
"step": 6260
},
{
"epoch": 0.870047873447582,
"grad_norm": 0.0,
"learning_rate": 0.00017190008634680861,
"loss": 1.5781,
"step": 6270
},
{
"epoch": 0.8714355096093804,
"grad_norm": 0.0,
"learning_rate": 0.00017179256379957266,
"loss": 1.5383,
"step": 6280
},
{
"epoch": 0.8728231457711788,
"grad_norm": 0.0,
"learning_rate": 0.0001716848696972944,
"loss": 1.5393,
"step": 6290
},
{
"epoch": 0.8742107819329772,
"grad_norm": 0.0,
"learning_rate": 0.0001715770042973189,
"loss": 1.5745,
"step": 6300
},
{
"epoch": 0.8742107819329772,
"eval_loss": 1.5553832054138184,
"eval_runtime": 856.7944,
"eval_samples_per_second": 14.953,
"eval_steps_per_second": 1.87,
"step": 6300
},
{
"epoch": 0.8755984180947756,
"grad_norm": 0.0,
"learning_rate": 0.00017146896785740062,
"loss": 1.5361,
"step": 6310
},
{
"epoch": 0.8769860542565739,
"grad_norm": 0.0,
"learning_rate": 0.00017136076063570274,
"loss": 1.514,
"step": 6320
},
{
"epoch": 0.8783736904183723,
"grad_norm": 0.0,
"learning_rate": 0.00017125238289079654,
"loss": 1.5195,
"step": 6330
},
{
"epoch": 0.8797613265801707,
"grad_norm": 0.0,
"learning_rate": 0.00017114383488166075,
"loss": 1.5063,
"step": 6340
},
{
"epoch": 0.8811489627419691,
"grad_norm": 0.0,
"learning_rate": 0.00017103511686768102,
"loss": 1.5504,
"step": 6350
},
{
"epoch": 0.8825365989037675,
"grad_norm": 0.0,
"learning_rate": 0.00017092622910864916,
"loss": 1.477,
"step": 6360
},
{
"epoch": 0.8839242350655658,
"grad_norm": 0.0,
"learning_rate": 0.00017081717186476268,
"loss": 1.5396,
"step": 6370
},
{
"epoch": 0.8853118712273642,
"grad_norm": 0.0,
"learning_rate": 0.00017070794539662408,
"loss": 1.6266,
"step": 6380
},
{
"epoch": 0.8866995073891626,
"grad_norm": 0.0,
"learning_rate": 0.00017059854996524017,
"loss": 1.4967,
"step": 6390
},
{
"epoch": 0.888087143550961,
"grad_norm": 0.0,
"learning_rate": 0.0001704889858320216,
"loss": 1.522,
"step": 6400
},
{
"epoch": 0.888087143550961,
"eval_loss": 1.5553832054138184,
"eval_runtime": 862.0033,
"eval_samples_per_second": 14.863,
"eval_steps_per_second": 1.858,
"step": 6400
},
{
"epoch": 0.8894747797127593,
"grad_norm": 0.0,
"learning_rate": 0.00017037925325878205,
"loss": 1.5414,
"step": 6410
},
{
"epoch": 0.8908624158745577,
"grad_norm": 0.0,
"learning_rate": 0.00017026935250773784,
"loss": 1.5846,
"step": 6420
},
{
"epoch": 0.8922500520363561,
"grad_norm": 0.0,
"learning_rate": 0.00017015928384150702,
"loss": 1.5889,
"step": 6430
},
{
"epoch": 0.8936376881981545,
"grad_norm": 0.0,
"learning_rate": 0.000170049047523109,
"loss": 1.555,
"step": 6440
},
{
"epoch": 0.8950253243599529,
"grad_norm": 0.0,
"learning_rate": 0.00016993864381596374,
"loss": 1.5805,
"step": 6450
},
{
"epoch": 0.8964129605217512,
"grad_norm": 0.0,
"learning_rate": 0.00016982807298389124,
"loss": 1.5322,
"step": 6460
},
{
"epoch": 0.8978005966835496,
"grad_norm": 0.0,
"learning_rate": 0.00016971733529111088,
"loss": 1.5423,
"step": 6470
},
{
"epoch": 0.899188232845348,
"grad_norm": 0.0,
"learning_rate": 0.0001696064310022406,
"loss": 1.5465,
"step": 6480
},
{
"epoch": 0.9005758690071464,
"grad_norm": 0.0,
"learning_rate": 0.0001694953603822967,
"loss": 1.5719,
"step": 6490
},
{
"epoch": 0.9019635051689447,
"grad_norm": 0.0,
"learning_rate": 0.00016938412369669272,
"loss": 1.5242,
"step": 6500
},
{
"epoch": 0.9019635051689447,
"eval_loss": 1.5553832054138184,
"eval_runtime": 853.5172,
"eval_samples_per_second": 15.011,
"eval_steps_per_second": 1.877,
"step": 6500
},
{
"epoch": 0.9033511413307431,
"grad_norm": 0.0,
"learning_rate": 0.00016927272121123918,
"loss": 1.5607,
"step": 6510
},
{
"epoch": 0.9047387774925415,
"grad_norm": 0.0,
"learning_rate": 0.0001691611531921427,
"loss": 1.6793,
"step": 6520
},
{
"epoch": 0.9061264136543399,
"grad_norm": 0.0,
"learning_rate": 0.0001690494199060055,
"loss": 1.5096,
"step": 6530
},
{
"epoch": 0.9075140498161383,
"grad_norm": 0.0,
"learning_rate": 0.00016893752161982467,
"loss": 1.5335,
"step": 6540
},
{
"epoch": 0.9089016859779366,
"grad_norm": 0.0,
"learning_rate": 0.00016882545860099173,
"loss": 1.5678,
"step": 6550
},
{
"epoch": 0.910289322139735,
"grad_norm": 0.0,
"learning_rate": 0.00016871323111729161,
"loss": 1.5634,
"step": 6560
},
{
"epoch": 0.9116769583015334,
"grad_norm": 0.0,
"learning_rate": 0.00016860083943690248,
"loss": 1.5648,
"step": 6570
},
{
"epoch": 0.9130645944633318,
"grad_norm": 0.0,
"learning_rate": 0.0001684882838283947,
"loss": 1.4455,
"step": 6580
},
{
"epoch": 0.91445223062513,
"grad_norm": 0.0,
"learning_rate": 0.00016837556456073048,
"loss": 1.5331,
"step": 6590
},
{
"epoch": 0.9158398667869284,
"grad_norm": 0.0,
"learning_rate": 0.00016826268190326296,
"loss": 1.5532,
"step": 6600
},
{
"epoch": 0.9158398667869284,
"eval_loss": 1.5553832054138184,
"eval_runtime": 858.7079,
"eval_samples_per_second": 14.92,
"eval_steps_per_second": 1.866,
"step": 6600
},
{
"epoch": 0.9172275029487268,
"grad_norm": 0.0,
"learning_rate": 0.0001681496361257359,
"loss": 1.5508,
"step": 6610
},
{
"epoch": 0.9186151391105252,
"grad_norm": 0.0,
"learning_rate": 0.00016803642749828267,
"loss": 1.5596,
"step": 6620
},
{
"epoch": 0.9200027752723235,
"grad_norm": 0.0,
"learning_rate": 0.0001679230562914259,
"loss": 1.5676,
"step": 6630
},
{
"epoch": 0.9213904114341219,
"grad_norm": 0.0,
"learning_rate": 0.0001678095227760767,
"loss": 1.5076,
"step": 6640
},
{
"epoch": 0.9227780475959203,
"grad_norm": 0.0,
"learning_rate": 0.00016769582722353402,
"loss": 1.5471,
"step": 6650
},
{
"epoch": 0.9241656837577187,
"grad_norm": 0.0,
"learning_rate": 0.00016758196990548395,
"loss": 1.5696,
"step": 6660
},
{
"epoch": 0.9255533199195171,
"grad_norm": 0.0,
"learning_rate": 0.0001674679510939993,
"loss": 1.5948,
"step": 6670
},
{
"epoch": 0.9269409560813154,
"grad_norm": 0.0,
"learning_rate": 0.0001673537710615386,
"loss": 1.507,
"step": 6680
},
{
"epoch": 0.9283285922431138,
"grad_norm": 0.0,
"learning_rate": 0.00016723943008094573,
"loss": 1.549,
"step": 6690
},
{
"epoch": 0.9297162284049122,
"grad_norm": 0.0,
"learning_rate": 0.00016712492842544914,
"loss": 1.5729,
"step": 6700
},
{
"epoch": 0.9297162284049122,
"eval_loss": 1.5553832054138184,
"eval_runtime": 855.2661,
"eval_samples_per_second": 14.98,
"eval_steps_per_second": 1.873,
"step": 6700
},
{
"epoch": 0.9311038645667106,
"grad_norm": 0.0,
"learning_rate": 0.00016701026636866127,
"loss": 1.5305,
"step": 6710
},
{
"epoch": 0.932491500728509,
"grad_norm": 0.0,
"learning_rate": 0.00016689544418457778,
"loss": 1.5285,
"step": 6720
},
{
"epoch": 0.9338791368903073,
"grad_norm": 0.0,
"learning_rate": 0.00016678046214757707,
"loss": 1.546,
"step": 6730
},
{
"epoch": 0.9352667730521057,
"grad_norm": 0.0,
"learning_rate": 0.00016666532053241943,
"loss": 1.5479,
"step": 6740
},
{
"epoch": 0.9366544092139041,
"grad_norm": 0.0,
"learning_rate": 0.00016655001961424652,
"loss": 1.5483,
"step": 6750
},
{
"epoch": 0.9380420453757025,
"grad_norm": 0.0,
"learning_rate": 0.00016643455966858063,
"loss": 1.5553,
"step": 6760
},
{
"epoch": 0.9394296815375008,
"grad_norm": 0.0,
"learning_rate": 0.00016631894097132418,
"loss": 1.5295,
"step": 6770
},
{
"epoch": 0.9408173176992992,
"grad_norm": 0.0,
"learning_rate": 0.00016620316379875876,
"loss": 1.5047,
"step": 6780
},
{
"epoch": 0.9422049538610976,
"grad_norm": 0.0,
"learning_rate": 0.00016608722842754477,
"loss": 1.5155,
"step": 6790
},
{
"epoch": 0.943592590022896,
"grad_norm": 0.0,
"learning_rate": 0.00016597113513472066,
"loss": 1.575,
"step": 6800
},
{
"epoch": 0.943592590022896,
"eval_loss": 1.5553832054138184,
"eval_runtime": 854.8633,
"eval_samples_per_second": 14.987,
"eval_steps_per_second": 1.874,
"step": 6800
},
{
"epoch": 0.9449802261846943,
"grad_norm": 0.0,
"learning_rate": 0.00016585488419770217,
"loss": 1.613,
"step": 6810
},
{
"epoch": 0.9463678623464927,
"grad_norm": 0.0,
"learning_rate": 0.0001657384758942818,
"loss": 1.5741,
"step": 6820
},
{
"epoch": 0.9477554985082911,
"grad_norm": 0.0,
"learning_rate": 0.00016562191050262804,
"loss": 1.5771,
"step": 6830
},
{
"epoch": 0.9491431346700895,
"grad_norm": 0.0,
"learning_rate": 0.0001655051883012848,
"loss": 1.5545,
"step": 6840
},
{
"epoch": 0.9505307708318879,
"grad_norm": 0.0,
"learning_rate": 0.00016538830956917074,
"loss": 1.4783,
"step": 6850
},
{
"epoch": 0.9519184069936862,
"grad_norm": 0.0,
"learning_rate": 0.00016527127458557846,
"loss": 1.5369,
"step": 6860
},
{
"epoch": 0.9533060431554846,
"grad_norm": 0.0,
"learning_rate": 0.000165154083630174,
"loss": 1.5902,
"step": 6870
},
{
"epoch": 0.954693679317283,
"grad_norm": 0.0,
"learning_rate": 0.00016503673698299617,
"loss": 1.5219,
"step": 6880
},
{
"epoch": 0.9560813154790814,
"grad_norm": 0.0,
"learning_rate": 0.00016491923492445562,
"loss": 1.5369,
"step": 6890
},
{
"epoch": 0.9574689516408798,
"grad_norm": 0.0,
"learning_rate": 0.00016480157773533463,
"loss": 1.4893,
"step": 6900
},
{
"epoch": 0.9574689516408798,
"eval_loss": 1.5553832054138184,
"eval_runtime": 851.7875,
"eval_samples_per_second": 15.041,
"eval_steps_per_second": 1.881,
"step": 6900
},
{
"epoch": 0.9588565878026781,
"grad_norm": 0.0,
"learning_rate": 0.000164683765696786,
"loss": 1.567,
"step": 6910
},
{
"epoch": 0.9602442239644765,
"grad_norm": 0.0,
"learning_rate": 0.0001645657990903326,
"loss": 1.5598,
"step": 6920
},
{
"epoch": 0.9616318601262749,
"grad_norm": 0.0,
"learning_rate": 0.00016444767819786667,
"loss": 1.5641,
"step": 6930
},
{
"epoch": 0.9630194962880733,
"grad_norm": 0.0,
"learning_rate": 0.0001643294033016492,
"loss": 1.5821,
"step": 6940
},
{
"epoch": 0.9644071324498716,
"grad_norm": 0.0,
"learning_rate": 0.00016421097468430896,
"loss": 1.5396,
"step": 6950
},
{
"epoch": 0.96579476861167,
"grad_norm": 0.0,
"learning_rate": 0.0001640923926288423,
"loss": 1.5183,
"step": 6960
},
{
"epoch": 0.9671824047734684,
"grad_norm": 0.0,
"learning_rate": 0.00016397365741861215,
"loss": 1.5675,
"step": 6970
},
{
"epoch": 0.9685700409352668,
"grad_norm": 0.0,
"learning_rate": 0.00016385476933734728,
"loss": 1.5301,
"step": 6980
},
{
"epoch": 0.9699576770970652,
"grad_norm": 0.0,
"learning_rate": 0.000163735728669142,
"loss": 1.5445,
"step": 6990
},
{
"epoch": 0.9713453132588635,
"grad_norm": 0.0,
"learning_rate": 0.00016361653569845508,
"loss": 1.565,
"step": 7000
},
{
"epoch": 0.9713453132588635,
"eval_loss": 1.5553832054138184,
"eval_runtime": 859.3889,
"eval_samples_per_second": 14.908,
"eval_steps_per_second": 1.864,
"step": 7000
},
{
"epoch": 0.9727329494206619,
"grad_norm": 0.0,
"learning_rate": 0.00016349719071010921,
"loss": 1.5189,
"step": 7010
},
{
"epoch": 0.9741205855824603,
"grad_norm": 0.0,
"learning_rate": 0.00016337769398929046,
"loss": 1.4959,
"step": 7020
},
{
"epoch": 0.9755082217442587,
"grad_norm": 0.0,
"learning_rate": 0.00016325804582154738,
"loss": 1.5076,
"step": 7030
},
{
"epoch": 0.976895857906057,
"grad_norm": 0.0,
"learning_rate": 0.00016313824649279046,
"loss": 1.5328,
"step": 7040
},
{
"epoch": 0.9782834940678554,
"grad_norm": 0.0,
"learning_rate": 0.0001630182962892914,
"loss": 1.6521,
"step": 7050
},
{
"epoch": 0.9796711302296538,
"grad_norm": 0.0,
"learning_rate": 0.00016289819549768239,
"loss": 1.588,
"step": 7060
},
{
"epoch": 0.9810587663914522,
"grad_norm": 0.0,
"learning_rate": 0.00016277794440495557,
"loss": 1.5902,
"step": 7070
},
{
"epoch": 0.9824464025532506,
"grad_norm": 0.0,
"learning_rate": 0.0001626575432984621,
"loss": 1.5927,
"step": 7080
},
{
"epoch": 0.9838340387150489,
"grad_norm": 0.0,
"learning_rate": 0.0001625369924659117,
"loss": 1.6044,
"step": 7090
},
{
"epoch": 0.9852216748768473,
"grad_norm": 0.0,
"learning_rate": 0.0001624162921953719,
"loss": 1.531,
"step": 7100
},
{
"epoch": 0.9852216748768473,
"eval_loss": 1.5553832054138184,
"eval_runtime": 854.8233,
"eval_samples_per_second": 14.988,
"eval_steps_per_second": 1.874,
"step": 7100
},
{
"epoch": 0.9866093110386457,
"grad_norm": 0.0,
"learning_rate": 0.00016229544277526718,
"loss": 1.6236,
"step": 7110
},
{
"epoch": 0.9879969472004441,
"grad_norm": 0.0,
"learning_rate": 0.00016217444449437862,
"loss": 1.5169,
"step": 7120
},
{
"epoch": 0.9893845833622424,
"grad_norm": 0.0,
"learning_rate": 0.00016205329764184287,
"loss": 1.5986,
"step": 7130
},
{
"epoch": 0.9907722195240408,
"grad_norm": 0.0,
"learning_rate": 0.00016193200250715168,
"loss": 1.5644,
"step": 7140
},
{
"epoch": 0.9921598556858392,
"grad_norm": 0.0,
"learning_rate": 0.0001618105593801511,
"loss": 1.5578,
"step": 7150
},
{
"epoch": 0.9935474918476376,
"grad_norm": 0.0,
"learning_rate": 0.00016168896855104086,
"loss": 1.627,
"step": 7160
},
{
"epoch": 0.994935128009436,
"grad_norm": 0.0,
"learning_rate": 0.0001615672303103736,
"loss": 1.58,
"step": 7170
},
{
"epoch": 0.9963227641712343,
"grad_norm": 0.0,
"learning_rate": 0.00016144534494905418,
"loss": 1.5433,
"step": 7180
},
{
"epoch": 0.9977104003330327,
"grad_norm": 0.0,
"learning_rate": 0.00016132331275833917,
"loss": 1.5081,
"step": 7190
},
{
"epoch": 0.9990980364948311,
"grad_norm": 0.0,
"learning_rate": 0.0001612011340298358,
"loss": 1.5054,
"step": 7200
},
{
"epoch": 0.9990980364948311,
"eval_loss": 1.5553832054138184,
"eval_runtime": 857.3623,
"eval_samples_per_second": 14.944,
"eval_steps_per_second": 1.869,
"step": 7200
},
{
"epoch": 1.0004162908485394,
"grad_norm": 0.0,
"learning_rate": 0.0001610788090555016,
"loss": 1.5776,
"step": 7210
},
{
"epoch": 1.0018039270103378,
"grad_norm": 0.0,
"learning_rate": 0.0001609563381276435,
"loss": 1.6051,
"step": 7220
},
{
"epoch": 1.0031915631721362,
"grad_norm": 0.0,
"learning_rate": 0.0001608337215389173,
"loss": 1.6271,
"step": 7230
},
{
"epoch": 1.0045791993339346,
"grad_norm": 0.0,
"learning_rate": 0.00016071095958232676,
"loss": 1.5657,
"step": 7240
},
{
"epoch": 1.005966835495733,
"grad_norm": 0.0,
"learning_rate": 0.00016058805255122303,
"loss": 1.5736,
"step": 7250
},
{
"epoch": 1.0073544716575313,
"grad_norm": 0.0,
"learning_rate": 0.00016046500073930398,
"loss": 1.512,
"step": 7260
},
{
"epoch": 1.0087421078193297,
"grad_norm": 0.0,
"learning_rate": 0.0001603418044406134,
"loss": 1.4812,
"step": 7270
},
{
"epoch": 1.010129743981128,
"grad_norm": 0.0,
"learning_rate": 0.00016021846394954036,
"loss": 1.5701,
"step": 7280
},
{
"epoch": 1.0115173801429265,
"grad_norm": 0.0,
"learning_rate": 0.0001600949795608185,
"loss": 1.5056,
"step": 7290
},
{
"epoch": 1.0129050163047248,
"grad_norm": 0.0,
"learning_rate": 0.00015997135156952535,
"loss": 1.4953,
"step": 7300
},
{
"epoch": 1.0129050163047248,
"eval_loss": 1.5553832054138184,
"eval_runtime": 862.1027,
"eval_samples_per_second": 14.861,
"eval_steps_per_second": 1.858,
"step": 7300
},
{
"epoch": 1.0142926524665232,
"grad_norm": 0.0,
"learning_rate": 0.0001598475802710815,
"loss": 1.504,
"step": 7310
},
{
"epoch": 1.0156802886283216,
"grad_norm": 0.0,
"learning_rate": 0.00015972366596125003,
"loss": 1.568,
"step": 7320
},
{
"epoch": 1.01706792479012,
"grad_norm": 0.0,
"learning_rate": 0.0001595996089361358,
"loss": 1.5443,
"step": 7330
},
{
"epoch": 1.0184555609519184,
"grad_norm": 0.0,
"learning_rate": 0.00015947540949218467,
"loss": 1.5165,
"step": 7340
},
{
"epoch": 1.0198431971137167,
"grad_norm": 0.0,
"learning_rate": 0.0001593510679261828,
"loss": 1.5269,
"step": 7350
},
{
"epoch": 1.0212308332755151,
"grad_norm": 0.0,
"learning_rate": 0.000159226584535256,
"loss": 1.5448,
"step": 7360
},
{
"epoch": 1.0226184694373135,
"grad_norm": 0.0,
"learning_rate": 0.00015910195961686897,
"loss": 1.5586,
"step": 7370
},
{
"epoch": 1.0240061055991119,
"grad_norm": 0.0,
"learning_rate": 0.00015897719346882457,
"loss": 1.5269,
"step": 7380
},
{
"epoch": 1.0253937417609102,
"grad_norm": 0.0,
"learning_rate": 0.00015885228638926323,
"loss": 1.5522,
"step": 7390
},
{
"epoch": 1.0267813779227086,
"grad_norm": 0.0,
"learning_rate": 0.00015872723867666207,
"loss": 1.5639,
"step": 7400
},
{
"epoch": 1.0267813779227086,
"eval_loss": 1.5553832054138184,
"eval_runtime": 856.4609,
"eval_samples_per_second": 14.959,
"eval_steps_per_second": 1.87,
"step": 7400
},
{
"epoch": 1.028169014084507,
"grad_norm": 0.0,
"learning_rate": 0.00015860205062983427,
"loss": 1.5362,
"step": 7410
},
{
"epoch": 1.0295566502463054,
"grad_norm": 0.0,
"learning_rate": 0.00015847672254792837,
"loss": 1.6302,
"step": 7420
},
{
"epoch": 1.0309442864081038,
"grad_norm": 0.0,
"learning_rate": 0.00015835125473042755,
"loss": 1.5936,
"step": 7430
},
{
"epoch": 1.0323319225699021,
"grad_norm": 0.0,
"learning_rate": 0.0001582256474771489,
"loss": 1.5836,
"step": 7440
},
{
"epoch": 1.0337195587317005,
"grad_norm": 0.0,
"learning_rate": 0.00015809990108824268,
"loss": 1.499,
"step": 7450
},
{
"epoch": 1.035107194893499,
"grad_norm": 0.0,
"learning_rate": 0.00015797401586419168,
"loss": 1.5871,
"step": 7460
},
{
"epoch": 1.0364948310552973,
"grad_norm": 0.0,
"learning_rate": 0.0001578479921058103,
"loss": 1.5045,
"step": 7470
},
{
"epoch": 1.0378824672170957,
"grad_norm": 0.0,
"learning_rate": 0.0001577218301142442,
"loss": 1.6008,
"step": 7480
},
{
"epoch": 1.039270103378894,
"grad_norm": 0.0,
"learning_rate": 0.00015759553019096924,
"loss": 1.5677,
"step": 7490
},
{
"epoch": 1.0406577395406924,
"grad_norm": 0.0,
"learning_rate": 0.00015746909263779086,
"loss": 1.4854,
"step": 7500
},
{
"epoch": 1.0406577395406924,
"eval_loss": 1.5553832054138184,
"eval_runtime": 854.7095,
"eval_samples_per_second": 14.99,
"eval_steps_per_second": 1.874,
"step": 7500
},
{
"epoch": 1.0420453757024908,
"grad_norm": 0.0,
"learning_rate": 0.00015734251775684338,
"loss": 1.5282,
"step": 7510
},
{
"epoch": 1.0434330118642892,
"grad_norm": 0.0,
"learning_rate": 0.0001572158058505894,
"loss": 1.5321,
"step": 7520
},
{
"epoch": 1.0448206480260875,
"grad_norm": 0.0,
"learning_rate": 0.0001570889572218188,
"loss": 1.46,
"step": 7530
},
{
"epoch": 1.046208284187886,
"grad_norm": 0.0,
"learning_rate": 0.00015696197217364826,
"loss": 1.5914,
"step": 7540
},
{
"epoch": 1.0475959203496843,
"grad_norm": 0.0,
"learning_rate": 0.00015683485100952043,
"loss": 1.503,
"step": 7550
},
{
"epoch": 1.0489835565114827,
"grad_norm": 0.0,
"learning_rate": 0.00015670759403320318,
"loss": 1.539,
"step": 7560
},
{
"epoch": 1.050371192673281,
"grad_norm": 0.0,
"learning_rate": 0.000156580201548789,
"loss": 1.4814,
"step": 7570
},
{
"epoch": 1.0517588288350794,
"grad_norm": 0.0,
"learning_rate": 0.0001564526738606941,
"loss": 1.5348,
"step": 7580
},
{
"epoch": 1.0531464649968778,
"grad_norm": 0.0,
"learning_rate": 0.0001563250112736578,
"loss": 1.5778,
"step": 7590
},
{
"epoch": 1.0545341011586762,
"grad_norm": 0.0,
"learning_rate": 0.00015619721409274186,
"loss": 1.5437,
"step": 7600
},
{
"epoch": 1.0545341011586762,
"eval_loss": 1.5553832054138184,
"eval_runtime": 853.4619,
"eval_samples_per_second": 15.012,
"eval_steps_per_second": 1.877,
"step": 7600
},
{
"epoch": 1.0559217373204746,
"grad_norm": 0.0,
"learning_rate": 0.00015606928262332952,
"loss": 1.5839,
"step": 7610
},
{
"epoch": 1.057309373482273,
"grad_norm": 0.0,
"learning_rate": 0.00015594121717112499,
"loss": 1.6073,
"step": 7620
},
{
"epoch": 1.0586970096440713,
"grad_norm": 0.0,
"learning_rate": 0.00015581301804215269,
"loss": 1.5089,
"step": 7630
},
{
"epoch": 1.0600846458058697,
"grad_norm": 0.0,
"learning_rate": 0.00015568468554275636,
"loss": 1.5612,
"step": 7640
},
{
"epoch": 1.061472281967668,
"grad_norm": 0.0,
"learning_rate": 0.00015555621997959853,
"loss": 1.5754,
"step": 7650
},
{
"epoch": 1.0628599181294665,
"grad_norm": 0.0,
"learning_rate": 0.0001554276216596597,
"loss": 1.5358,
"step": 7660
},
{
"epoch": 1.0642475542912648,
"grad_norm": 0.0,
"learning_rate": 0.00015529889089023753,
"loss": 1.5362,
"step": 7670
},
{
"epoch": 1.0656351904530632,
"grad_norm": 0.0,
"learning_rate": 0.00015517002797894627,
"loss": 1.5111,
"step": 7680
},
{
"epoch": 1.0670228266148616,
"grad_norm": 0.0,
"learning_rate": 0.00015504103323371585,
"loss": 1.4885,
"step": 7690
},
{
"epoch": 1.06841046277666,
"grad_norm": 0.0,
"learning_rate": 0.0001549119069627913,
"loss": 1.5007,
"step": 7700
},
{
"epoch": 1.06841046277666,
"eval_loss": 1.5553832054138184,
"eval_runtime": 859.3958,
"eval_samples_per_second": 14.908,
"eval_steps_per_second": 1.864,
"step": 7700
},
{
"epoch": 1.0697980989384583,
"grad_norm": 0.0,
"learning_rate": 0.00015478264947473193,
"loss": 1.6073,
"step": 7710
},
{
"epoch": 1.0711857351002567,
"grad_norm": 0.0,
"learning_rate": 0.00015465326107841056,
"loss": 1.5889,
"step": 7720
},
{
"epoch": 1.072573371262055,
"grad_norm": 0.0,
"learning_rate": 0.00015452374208301285,
"loss": 1.5805,
"step": 7730
},
{
"epoch": 1.0739610074238535,
"grad_norm": 0.0,
"learning_rate": 0.0001543940927980366,
"loss": 1.5473,
"step": 7740
},
{
"epoch": 1.0753486435856519,
"grad_norm": 0.0,
"learning_rate": 0.00015426431353329081,
"loss": 1.5056,
"step": 7750
},
{
"epoch": 1.0767362797474502,
"grad_norm": 0.0,
"learning_rate": 0.00015413440459889524,
"loss": 1.5471,
"step": 7760
},
{
"epoch": 1.0781239159092486,
"grad_norm": 0.0,
"learning_rate": 0.00015400436630527941,
"loss": 1.5527,
"step": 7770
},
{
"epoch": 1.079511552071047,
"grad_norm": 0.0,
"learning_rate": 0.000153874198963182,
"loss": 1.5679,
"step": 7780
},
{
"epoch": 1.0808991882328454,
"grad_norm": 0.0,
"learning_rate": 0.00015374390288364997,
"loss": 1.652,
"step": 7790
},
{
"epoch": 1.0822868243946437,
"grad_norm": 0.0,
"learning_rate": 0.0001536134783780381,
"loss": 1.5538,
"step": 7800
},
{
"epoch": 1.0822868243946437,
"eval_loss": 1.5553832054138184,
"eval_runtime": 860.0182,
"eval_samples_per_second": 14.897,
"eval_steps_per_second": 1.863,
"step": 7800
},
{
"epoch": 1.0836744605564421,
"grad_norm": 0.0,
"learning_rate": 0.0001534829257580078,
"loss": 1.5828,
"step": 7810
},
{
"epoch": 1.0850620967182405,
"grad_norm": 0.0,
"learning_rate": 0.00015335224533552687,
"loss": 1.4658,
"step": 7820
},
{
"epoch": 1.0864497328800389,
"grad_norm": 0.0,
"learning_rate": 0.00015322143742286831,
"loss": 1.5252,
"step": 7830
},
{
"epoch": 1.0878373690418373,
"grad_norm": 0.0,
"learning_rate": 0.00015309050233260993,
"loss": 1.5765,
"step": 7840
},
{
"epoch": 1.0892250052036356,
"grad_norm": 0.0,
"learning_rate": 0.00015295944037763335,
"loss": 1.5868,
"step": 7850
},
{
"epoch": 1.090612641365434,
"grad_norm": 0.0,
"learning_rate": 0.0001528282518711233,
"loss": 1.6112,
"step": 7860
},
{
"epoch": 1.0920002775272324,
"grad_norm": 0.0,
"learning_rate": 0.0001526969371265671,
"loss": 1.5436,
"step": 7870
},
{
"epoch": 1.0933879136890308,
"grad_norm": 0.0,
"learning_rate": 0.00015256549645775347,
"loss": 1.553,
"step": 7880
},
{
"epoch": 1.0947755498508291,
"grad_norm": 0.0,
"learning_rate": 0.0001524339301787723,
"loss": 1.5098,
"step": 7890
},
{
"epoch": 1.0961631860126275,
"grad_norm": 0.0,
"learning_rate": 0.0001523022386040134,
"loss": 1.5736,
"step": 7900
},
{
"epoch": 1.0961631860126275,
"eval_loss": 1.5553832054138184,
"eval_runtime": 859.1865,
"eval_samples_per_second": 14.912,
"eval_steps_per_second": 1.865,
"step": 7900
},
{
"epoch": 1.097550822174426,
"grad_norm": 0.0,
"learning_rate": 0.0001521704220481662,
"loss": 1.622,
"step": 7910
},
{
"epoch": 1.0989384583362243,
"grad_norm": 0.0,
"learning_rate": 0.00015203848082621867,
"loss": 1.5086,
"step": 7920
},
{
"epoch": 1.1003260944980227,
"grad_norm": 0.0,
"learning_rate": 0.00015190641525345656,
"loss": 1.5271,
"step": 7930
},
{
"epoch": 1.101713730659821,
"grad_norm": 0.0,
"learning_rate": 0.00015177422564546306,
"loss": 1.4512,
"step": 7940
},
{
"epoch": 1.1031013668216194,
"grad_norm": 0.0,
"learning_rate": 0.00015164191231811753,
"loss": 1.4596,
"step": 7950
},
{
"epoch": 1.1044890029834178,
"grad_norm": 0.0,
"learning_rate": 0.00015150947558759502,
"loss": 1.5792,
"step": 7960
},
{
"epoch": 1.1058766391452162,
"grad_norm": 0.0,
"learning_rate": 0.0001513769157703655,
"loss": 1.5391,
"step": 7970
},
{
"epoch": 1.1072642753070145,
"grad_norm": 0.0,
"learning_rate": 0.0001512442331831931,
"loss": 1.5552,
"step": 7980
},
{
"epoch": 1.108651911468813,
"grad_norm": 0.0,
"learning_rate": 0.00015111142814313517,
"loss": 1.4679,
"step": 7990
},
{
"epoch": 1.1100395476306113,
"grad_norm": 0.0,
"learning_rate": 0.00015097850096754177,
"loss": 1.5157,
"step": 8000
},
{
"epoch": 1.1100395476306113,
"eval_loss": 1.5553832054138184,
"eval_runtime": 858.8743,
"eval_samples_per_second": 14.917,
"eval_steps_per_second": 1.865,
"step": 8000
},
{
"epoch": 1.1114271837924097,
"grad_norm": 0.0,
"learning_rate": 0.00015084545197405493,
"loss": 1.527,
"step": 8010
},
{
"epoch": 1.112814819954208,
"grad_norm": 0.0,
"learning_rate": 0.0001507122814806075,
"loss": 1.5263,
"step": 8020
},
{
"epoch": 1.1142024561160064,
"grad_norm": 0.0,
"learning_rate": 0.00015057898980542293,
"loss": 1.5607,
"step": 8030
},
{
"epoch": 1.1155900922778048,
"grad_norm": 0.0,
"learning_rate": 0.00015044557726701408,
"loss": 1.5861,
"step": 8040
},
{
"epoch": 1.1169777284396032,
"grad_norm": 0.0,
"learning_rate": 0.00015031204418418275,
"loss": 1.547,
"step": 8050
},
{
"epoch": 1.1183653646014016,
"grad_norm": 0.0,
"learning_rate": 0.00015017839087601867,
"loss": 1.6436,
"step": 8060
},
{
"epoch": 1.1197530007632,
"grad_norm": 0.0,
"learning_rate": 0.00015004461766189895,
"loss": 1.5417,
"step": 8070
},
{
"epoch": 1.1211406369249983,
"grad_norm": 0.0,
"learning_rate": 0.0001499107248614871,
"loss": 1.5294,
"step": 8080
},
{
"epoch": 1.1225282730867967,
"grad_norm": 0.0,
"learning_rate": 0.00014977671279473262,
"loss": 1.4875,
"step": 8090
},
{
"epoch": 1.123915909248595,
"grad_norm": 0.0,
"learning_rate": 0.00014964258178186976,
"loss": 1.5403,
"step": 8100
},
{
"epoch": 1.123915909248595,
"eval_loss": 1.5553832054138184,
"eval_runtime": 853.3464,
"eval_samples_per_second": 15.014,
"eval_steps_per_second": 1.877,
"step": 8100
},
{
"epoch": 1.1253035454103935,
"grad_norm": 0.0,
"learning_rate": 0.00014950833214341715,
"loss": 1.6199,
"step": 8110
},
{
"epoch": 1.1266911815721918,
"grad_norm": 0.0,
"learning_rate": 0.00014937396420017688,
"loss": 1.5757,
"step": 8120
},
{
"epoch": 1.1280788177339902,
"grad_norm": 0.0,
"learning_rate": 0.0001492394782732337,
"loss": 1.5729,
"step": 8130
},
{
"epoch": 1.1294664538957886,
"grad_norm": 0.0,
"learning_rate": 0.00014910487468395425,
"loss": 1.5811,
"step": 8140
},
{
"epoch": 1.130854090057587,
"grad_norm": 0.0,
"learning_rate": 0.0001489701537539864,
"loss": 1.5804,
"step": 8150
},
{
"epoch": 1.1322417262193853,
"grad_norm": 0.0,
"learning_rate": 0.0001488353158052585,
"loss": 1.4874,
"step": 8160
},
{
"epoch": 1.1336293623811837,
"grad_norm": 0.0,
"learning_rate": 0.00014870036115997834,
"loss": 1.5255,
"step": 8170
},
{
"epoch": 1.135016998542982,
"grad_norm": 0.0,
"learning_rate": 0.0001485652901406327,
"loss": 1.5675,
"step": 8180
},
{
"epoch": 1.1364046347047805,
"grad_norm": 0.0,
"learning_rate": 0.0001484301030699864,
"loss": 1.6013,
"step": 8190
},
{
"epoch": 1.1377922708665789,
"grad_norm": 0.0,
"learning_rate": 0.00014829480027108157,
"loss": 1.511,
"step": 8200
},
{
"epoch": 1.1377922708665789,
"eval_loss": 1.5553832054138184,
"eval_runtime": 859.6956,
"eval_samples_per_second": 14.903,
"eval_steps_per_second": 1.863,
"step": 8200
},
{
"epoch": 1.1391799070283772,
"grad_norm": 0.0,
"learning_rate": 0.0001481593820672369,
"loss": 1.5034,
"step": 8210
},
{
"epoch": 1.1405675431901756,
"grad_norm": 0.0,
"learning_rate": 0.00014802384878204685,
"loss": 1.4766,
"step": 8220
},
{
"epoch": 1.141955179351974,
"grad_norm": 0.0,
"learning_rate": 0.0001478882007393809,
"loss": 1.5747,
"step": 8230
},
{
"epoch": 1.1433428155137724,
"grad_norm": 0.0,
"learning_rate": 0.0001477524382633827,
"loss": 1.5423,
"step": 8240
},
{
"epoch": 1.1447304516755707,
"grad_norm": 0.0,
"learning_rate": 0.00014761656167846935,
"loss": 1.5247,
"step": 8250
},
{
"epoch": 1.1461180878373691,
"grad_norm": 0.0,
"learning_rate": 0.00014748057130933067,
"loss": 1.547,
"step": 8260
},
{
"epoch": 1.1475057239991675,
"grad_norm": 0.0,
"learning_rate": 0.00014734446748092842,
"loss": 1.5177,
"step": 8270
},
{
"epoch": 1.1488933601609659,
"grad_norm": 0.0,
"learning_rate": 0.00014720825051849534,
"loss": 1.5871,
"step": 8280
},
{
"epoch": 1.1502809963227643,
"grad_norm": 0.0,
"learning_rate": 0.00014707192074753467,
"loss": 1.5082,
"step": 8290
},
{
"epoch": 1.1516686324845626,
"grad_norm": 0.0,
"learning_rate": 0.00014693547849381906,
"loss": 1.5675,
"step": 8300
},
{
"epoch": 1.1516686324845626,
"eval_loss": 1.5553832054138184,
"eval_runtime": 858.871,
"eval_samples_per_second": 14.917,
"eval_steps_per_second": 1.865,
"step": 8300
},
{
"epoch": 1.153056268646361,
"grad_norm": 0.0,
"learning_rate": 0.00014679892408339012,
"loss": 1.5986,
"step": 8310
},
{
"epoch": 1.1544439048081594,
"grad_norm": 0.0,
"learning_rate": 0.00014666225784255724,
"loss": 1.5578,
"step": 8320
},
{
"epoch": 1.1558315409699578,
"grad_norm": 0.0,
"learning_rate": 0.00014652548009789736,
"loss": 1.5968,
"step": 8330
},
{
"epoch": 1.1572191771317561,
"grad_norm": 0.0,
"learning_rate": 0.00014638859117625358,
"loss": 1.5689,
"step": 8340
},
{
"epoch": 1.1586068132935545,
"grad_norm": 0.0,
"learning_rate": 0.00014625159140473475,
"loss": 1.4963,
"step": 8350
},
{
"epoch": 1.1599944494553527,
"grad_norm": 0.0,
"learning_rate": 0.0001461144811107147,
"loss": 1.4889,
"step": 8360
},
{
"epoch": 1.1613820856171513,
"grad_norm": 0.0,
"learning_rate": 0.0001459772606218312,
"loss": 1.5006,
"step": 8370
},
{
"epoch": 1.1627697217789494,
"grad_norm": 0.0,
"learning_rate": 0.00014583993026598557,
"loss": 1.5204,
"step": 8380
},
{
"epoch": 1.164157357940748,
"grad_norm": 0.0,
"learning_rate": 0.00014570249037134143,
"loss": 1.499,
"step": 8390
},
{
"epoch": 1.1655449941025462,
"grad_norm": 0.0,
"learning_rate": 0.0001455649412663243,
"loss": 1.5346,
"step": 8400
},
{
"epoch": 1.1655449941025462,
"eval_loss": 1.5553832054138184,
"eval_runtime": 864.8064,
"eval_samples_per_second": 14.815,
"eval_steps_per_second": 1.852,
"step": 8400
},
{
"epoch": 1.1669326302643448,
"grad_norm": 0.0,
"learning_rate": 0.0001454272832796206,
"loss": 1.5715,
"step": 8410
},
{
"epoch": 1.168320266426143,
"grad_norm": 0.0,
"learning_rate": 0.000145289516740177,
"loss": 1.4989,
"step": 8420
},
{
"epoch": 1.1697079025879416,
"grad_norm": 0.0,
"learning_rate": 0.00014515164197719945,
"loss": 1.5598,
"step": 8430
},
{
"epoch": 1.1710955387497397,
"grad_norm": 0.0,
"learning_rate": 0.00014501365932015266,
"loss": 1.6222,
"step": 8440
},
{
"epoch": 1.1724831749115383,
"grad_norm": 0.0,
"learning_rate": 0.00014487556909875907,
"loss": 1.5452,
"step": 8450
},
{
"epoch": 1.1738708110733365,
"grad_norm": 0.0,
"learning_rate": 0.00014473737164299815,
"loss": 1.5969,
"step": 8460
},
{
"epoch": 1.175258447235135,
"grad_norm": 0.0,
"learning_rate": 0.00014459906728310563,
"loss": 1.5379,
"step": 8470
},
{
"epoch": 1.1766460833969332,
"grad_norm": 0.0,
"learning_rate": 0.00014446065634957276,
"loss": 1.4996,
"step": 8480
},
{
"epoch": 1.1780337195587318,
"grad_norm": 0.0,
"learning_rate": 0.00014432213917314534,
"loss": 1.5666,
"step": 8490
},
{
"epoch": 1.17942135572053,
"grad_norm": 0.0,
"learning_rate": 0.00014418351608482314,
"loss": 1.489,
"step": 8500
},
{
"epoch": 1.17942135572053,
"eval_loss": 1.5553832054138184,
"eval_runtime": 870.4535,
"eval_samples_per_second": 14.719,
"eval_steps_per_second": 1.84,
"step": 8500
},
{
"epoch": 1.1808089918823286,
"grad_norm": 0.0,
"learning_rate": 0.00014404478741585902,
"loss": 1.4908,
"step": 8510
},
{
"epoch": 1.1821966280441267,
"grad_norm": 0.0,
"learning_rate": 0.00014390595349775804,
"loss": 1.5845,
"step": 8520
},
{
"epoch": 1.1835842642059253,
"grad_norm": 0.0,
"learning_rate": 0.00014376701466227687,
"loss": 1.5,
"step": 8530
},
{
"epoch": 1.1849719003677235,
"grad_norm": 0.0,
"learning_rate": 0.00014362797124142283,
"loss": 1.5638,
"step": 8540
},
{
"epoch": 1.1863595365295219,
"grad_norm": 0.0,
"learning_rate": 0.00014348882356745319,
"loss": 1.5205,
"step": 8550
},
{
"epoch": 1.1877471726913202,
"grad_norm": 0.0,
"learning_rate": 0.0001433495719728743,
"loss": 1.5674,
"step": 8560
},
{
"epoch": 1.1891348088531186,
"grad_norm": 0.0,
"learning_rate": 0.00014321021679044086,
"loss": 1.6119,
"step": 8570
},
{
"epoch": 1.190522445014917,
"grad_norm": 0.0,
"learning_rate": 0.00014307075835315515,
"loss": 1.5422,
"step": 8580
},
{
"epoch": 1.1919100811767154,
"grad_norm": 0.0,
"learning_rate": 0.00014293119699426604,
"loss": 1.6046,
"step": 8590
},
{
"epoch": 1.1932977173385138,
"grad_norm": 0.0,
"learning_rate": 0.00014279153304726857,
"loss": 1.4951,
"step": 8600
},
{
"epoch": 1.1932977173385138,
"eval_loss": 1.5553832054138184,
"eval_runtime": 864.037,
"eval_samples_per_second": 14.828,
"eval_steps_per_second": 1.854,
"step": 8600
},
{
"epoch": 1.1946853535003121,
"grad_norm": 0.0,
"learning_rate": 0.00014265176684590274,
"loss": 1.5689,
"step": 8610
},
{
"epoch": 1.1960729896621105,
"grad_norm": 0.0,
"learning_rate": 0.00014251189872415294,
"loss": 1.5362,
"step": 8620
},
{
"epoch": 1.1974606258239089,
"grad_norm": 0.0,
"learning_rate": 0.00014237192901624712,
"loss": 1.5422,
"step": 8630
},
{
"epoch": 1.1988482619857073,
"grad_norm": 0.0,
"learning_rate": 0.00014223185805665604,
"loss": 1.5692,
"step": 8640
},
{
"epoch": 1.2002358981475056,
"grad_norm": 0.0,
"learning_rate": 0.00014209168618009227,
"loss": 1.5128,
"step": 8650
},
{
"epoch": 1.201623534309304,
"grad_norm": 0.0,
"learning_rate": 0.00014195141372150966,
"loss": 1.5975,
"step": 8660
},
{
"epoch": 1.2030111704711024,
"grad_norm": 0.0,
"learning_rate": 0.0001418110410161024,
"loss": 1.5617,
"step": 8670
},
{
"epoch": 1.2043988066329008,
"grad_norm": 0.0,
"learning_rate": 0.00014167056839930407,
"loss": 1.5745,
"step": 8680
},
{
"epoch": 1.2057864427946992,
"grad_norm": 0.0,
"learning_rate": 0.00014152999620678722,
"loss": 1.579,
"step": 8690
},
{
"epoch": 1.2071740789564975,
"grad_norm": 0.0,
"learning_rate": 0.00014138932477446222,
"loss": 1.5969,
"step": 8700
},
{
"epoch": 1.2071740789564975,
"eval_loss": 1.5553832054138184,
"eval_runtime": 861.7557,
"eval_samples_per_second": 14.867,
"eval_steps_per_second": 1.859,
"step": 8700
},
{
"epoch": 1.208561715118296,
"grad_norm": 0.0,
"learning_rate": 0.00014124855443847662,
"loss": 1.5638,
"step": 8710
},
{
"epoch": 1.2099493512800943,
"grad_norm": 0.0,
"learning_rate": 0.00014110768553521425,
"loss": 1.5384,
"step": 8720
},
{
"epoch": 1.2113369874418927,
"grad_norm": 0.0,
"learning_rate": 0.0001409667184012946,
"loss": 1.5118,
"step": 8730
},
{
"epoch": 1.212724623603691,
"grad_norm": 0.0,
"learning_rate": 0.00014082565337357174,
"loss": 1.5268,
"step": 8740
},
{
"epoch": 1.2141122597654894,
"grad_norm": 0.0,
"learning_rate": 0.0001406844907891338,
"loss": 1.565,
"step": 8750
},
{
"epoch": 1.2154998959272878,
"grad_norm": 0.0,
"learning_rate": 0.0001405432309853019,
"loss": 1.6024,
"step": 8760
},
{
"epoch": 1.2168875320890862,
"grad_norm": 0.0,
"learning_rate": 0.00014040187429962964,
"loss": 1.5062,
"step": 8770
},
{
"epoch": 1.2182751682508846,
"grad_norm": 0.0,
"learning_rate": 0.00014026042106990194,
"loss": 1.6006,
"step": 8780
},
{
"epoch": 1.219662804412683,
"grad_norm": 0.0,
"learning_rate": 0.00014011887163413456,
"loss": 1.5469,
"step": 8790
},
{
"epoch": 1.2210504405744813,
"grad_norm": 0.0,
"learning_rate": 0.00013997722633057313,
"loss": 1.6013,
"step": 8800
},
{
"epoch": 1.2210504405744813,
"eval_loss": 1.5553832054138184,
"eval_runtime": 864.0925,
"eval_samples_per_second": 14.827,
"eval_steps_per_second": 1.854,
"step": 8800
},
{
"epoch": 1.2224380767362797,
"grad_norm": 0.0,
"learning_rate": 0.0001398354854976923,
"loss": 1.5087,
"step": 8810
},
{
"epoch": 1.223825712898078,
"grad_norm": 0.0,
"learning_rate": 0.00013969364947419508,
"loss": 1.6569,
"step": 8820
},
{
"epoch": 1.2252133490598764,
"grad_norm": 0.0,
"learning_rate": 0.00013955171859901183,
"loss": 1.5641,
"step": 8830
},
{
"epoch": 1.2266009852216748,
"grad_norm": 0.0,
"learning_rate": 0.00013940969321129978,
"loss": 1.5895,
"step": 8840
},
{
"epoch": 1.2279886213834732,
"grad_norm": 0.0,
"learning_rate": 0.00013926757365044173,
"loss": 1.5392,
"step": 8850
},
{
"epoch": 1.2293762575452716,
"grad_norm": 0.0,
"learning_rate": 0.00013912536025604576,
"loss": 1.5429,
"step": 8860
},
{
"epoch": 1.23076389370707,
"grad_norm": 0.0,
"learning_rate": 0.000138983053367944,
"loss": 1.5436,
"step": 8870
},
{
"epoch": 1.2321515298688683,
"grad_norm": 0.0,
"learning_rate": 0.00013884065332619213,
"loss": 1.5116,
"step": 8880
},
{
"epoch": 1.2335391660306667,
"grad_norm": 0.0,
"learning_rate": 0.0001386981604710683,
"loss": 1.5252,
"step": 8890
},
{
"epoch": 1.234926802192465,
"grad_norm": 0.0,
"learning_rate": 0.0001385555751430725,
"loss": 1.5872,
"step": 8900
},
{
"epoch": 1.234926802192465,
"eval_loss": 1.5553832054138184,
"eval_runtime": 863.0711,
"eval_samples_per_second": 14.845,
"eval_steps_per_second": 1.856,
"step": 8900
},
{
"epoch": 1.2363144383542635,
"grad_norm": 0.0,
"learning_rate": 0.00013841289768292574,
"loss": 1.5248,
"step": 8910
},
{
"epoch": 1.2377020745160618,
"grad_norm": 0.0,
"learning_rate": 0.00013827012843156913,
"loss": 1.5388,
"step": 8920
},
{
"epoch": 1.2390897106778602,
"grad_norm": 0.0,
"learning_rate": 0.0001381272677301632,
"loss": 1.5608,
"step": 8930
},
{
"epoch": 1.2404773468396586,
"grad_norm": 0.0,
"learning_rate": 0.00013798431592008684,
"loss": 1.494,
"step": 8940
},
{
"epoch": 1.241864983001457,
"grad_norm": 0.0,
"learning_rate": 0.0001378412733429369,
"loss": 1.5242,
"step": 8950
},
{
"epoch": 1.2432526191632554,
"grad_norm": 0.0,
"learning_rate": 0.0001376981403405268,
"loss": 1.5133,
"step": 8960
},
{
"epoch": 1.2446402553250537,
"grad_norm": 0.0,
"learning_rate": 0.00013755491725488646,
"loss": 1.5281,
"step": 8970
},
{
"epoch": 1.2460278914868521,
"grad_norm": 0.0,
"learning_rate": 0.00013741160442826063,
"loss": 1.5155,
"step": 8980
},
{
"epoch": 1.2474155276486505,
"grad_norm": 0.0,
"learning_rate": 0.00013726820220310882,
"loss": 1.641,
"step": 8990
},
{
"epoch": 1.2488031638104489,
"grad_norm": 0.0,
"learning_rate": 0.00013712471092210403,
"loss": 1.5412,
"step": 9000
},
{
"epoch": 1.2488031638104489,
"eval_loss": 1.5553832054138184,
"eval_runtime": 879.3561,
"eval_samples_per_second": 14.57,
"eval_steps_per_second": 1.822,
"step": 9000
},
{
"epoch": 1.2501907999722472,
"grad_norm": 0.0,
"learning_rate": 0.00013698113092813205,
"loss": 1.4897,
"step": 9010
},
{
"epoch": 1.2515784361340456,
"grad_norm": 0.0,
"learning_rate": 0.00013683746256429078,
"loss": 1.4696,
"step": 9020
},
{
"epoch": 1.252966072295844,
"grad_norm": 0.0,
"learning_rate": 0.0001366937061738891,
"loss": 1.5857,
"step": 9030
},
{
"epoch": 1.2543537084576424,
"grad_norm": 0.0,
"learning_rate": 0.00013654986210044645,
"loss": 1.6397,
"step": 9040
},
{
"epoch": 1.2557413446194408,
"grad_norm": 0.0,
"learning_rate": 0.00013640593068769158,
"loss": 1.5742,
"step": 9050
},
{
"epoch": 1.2571289807812391,
"grad_norm": 0.0,
"learning_rate": 0.00013626191227956216,
"loss": 1.5915,
"step": 9060
},
{
"epoch": 1.2585166169430375,
"grad_norm": 0.0,
"learning_rate": 0.00013611780722020357,
"loss": 1.4973,
"step": 9070
},
{
"epoch": 1.259904253104836,
"grad_norm": 0.0,
"learning_rate": 0.00013597361585396836,
"loss": 1.5331,
"step": 9080
},
{
"epoch": 1.2612918892666343,
"grad_norm": 0.0,
"learning_rate": 0.00013582933852541524,
"loss": 1.6072,
"step": 9090
},
{
"epoch": 1.2626795254284326,
"grad_norm": 0.0,
"learning_rate": 0.0001356849755793084,
"loss": 1.4745,
"step": 9100
},
{
"epoch": 1.2626795254284326,
"eval_loss": 1.5553832054138184,
"eval_runtime": 872.9402,
"eval_samples_per_second": 14.677,
"eval_steps_per_second": 1.835,
"step": 9100
},
{
"epoch": 1.264067161590231,
"grad_norm": 0.0,
"learning_rate": 0.0001355405273606166,
"loss": 1.4407,
"step": 9110
},
{
"epoch": 1.2654547977520294,
"grad_norm": 0.0,
"learning_rate": 0.00013539599421451235,
"loss": 1.5321,
"step": 9120
},
{
"epoch": 1.2668424339138278,
"grad_norm": 0.0,
"learning_rate": 0.0001352513764863711,
"loss": 1.5258,
"step": 9130
},
{
"epoch": 1.2682300700756262,
"grad_norm": 0.0,
"learning_rate": 0.00013510667452177043,
"loss": 1.5099,
"step": 9140
},
{
"epoch": 1.2696177062374245,
"grad_norm": 0.0,
"learning_rate": 0.00013496188866648926,
"loss": 1.5893,
"step": 9150
},
{
"epoch": 1.271005342399223,
"grad_norm": 0.0,
"learning_rate": 0.00013481701926650687,
"loss": 1.5482,
"step": 9160
},
{
"epoch": 1.2723929785610213,
"grad_norm": 0.0,
"learning_rate": 0.00013467206666800227,
"loss": 1.5634,
"step": 9170
},
{
"epoch": 1.2737806147228197,
"grad_norm": 0.0,
"learning_rate": 0.00013452703121735323,
"loss": 1.6012,
"step": 9180
},
{
"epoch": 1.275168250884618,
"grad_norm": 0.0,
"learning_rate": 0.0001343819132611355,
"loss": 1.5565,
"step": 9190
},
{
"epoch": 1.2765558870464164,
"grad_norm": 0.0,
"learning_rate": 0.00013423671314612202,
"loss": 1.5516,
"step": 9200
},
{
"epoch": 1.2765558870464164,
"eval_loss": 1.5553832054138184,
"eval_runtime": 870.4397,
"eval_samples_per_second": 14.719,
"eval_steps_per_second": 1.84,
"step": 9200
},
{
"epoch": 1.2779435232082148,
"grad_norm": 0.0,
"learning_rate": 0.00013409143121928207,
"loss": 1.5608,
"step": 9210
},
{
"epoch": 1.2793311593700132,
"grad_norm": 0.0,
"learning_rate": 0.00013394606782778033,
"loss": 1.5208,
"step": 9220
},
{
"epoch": 1.2807187955318116,
"grad_norm": 0.0,
"learning_rate": 0.00013380062331897624,
"loss": 1.561,
"step": 9230
},
{
"epoch": 1.28210643169361,
"grad_norm": 0.0,
"learning_rate": 0.00013365509804042308,
"loss": 1.5254,
"step": 9240
},
{
"epoch": 1.2834940678554083,
"grad_norm": 0.0,
"learning_rate": 0.00013350949233986706,
"loss": 1.5598,
"step": 9250
},
{
"epoch": 1.2848817040172067,
"grad_norm": 0.0,
"learning_rate": 0.00013336380656524664,
"loss": 1.587,
"step": 9260
},
{
"epoch": 1.286269340179005,
"grad_norm": 0.0,
"learning_rate": 0.00013321804106469158,
"loss": 1.5162,
"step": 9270
},
{
"epoch": 1.2876569763408034,
"grad_norm": 0.0,
"learning_rate": 0.0001330721961865222,
"loss": 1.5323,
"step": 9280
},
{
"epoch": 1.2890446125026018,
"grad_norm": 0.0,
"learning_rate": 0.00013292627227924844,
"loss": 1.5477,
"step": 9290
},
{
"epoch": 1.2904322486644002,
"grad_norm": 0.0,
"learning_rate": 0.00013278026969156914,
"loss": 1.5326,
"step": 9300
},
{
"epoch": 1.2904322486644002,
"eval_loss": 1.5553832054138184,
"eval_runtime": 887.9291,
"eval_samples_per_second": 14.429,
"eval_steps_per_second": 1.804,
"step": 9300
},
{
"epoch": 1.2918198848261986,
"grad_norm": 0.0,
"learning_rate": 0.00013263418877237107,
"loss": 1.5434,
"step": 9310
},
{
"epoch": 1.293207520987997,
"grad_norm": 0.0,
"learning_rate": 0.00013248802987072836,
"loss": 1.5252,
"step": 9320
},
{
"epoch": 1.2945951571497953,
"grad_norm": 0.0,
"learning_rate": 0.0001323417933359013,
"loss": 1.5959,
"step": 9330
},
{
"epoch": 1.2959827933115937,
"grad_norm": 0.0,
"learning_rate": 0.0001321954795173357,
"loss": 1.5702,
"step": 9340
},
{
"epoch": 1.297370429473392,
"grad_norm": 0.0,
"learning_rate": 0.0001320490887646622,
"loss": 1.5861,
"step": 9350
},
{
"epoch": 1.2987580656351905,
"grad_norm": 0.0,
"learning_rate": 0.00013190262142769515,
"loss": 1.5369,
"step": 9360
},
{
"epoch": 1.3001457017969889,
"grad_norm": 0.0,
"learning_rate": 0.00013175607785643196,
"loss": 1.597,
"step": 9370
},
{
"epoch": 1.3015333379587872,
"grad_norm": 0.0,
"learning_rate": 0.00013160945840105215,
"loss": 1.5199,
"step": 9380
},
{
"epoch": 1.3029209741205856,
"grad_norm": 0.0,
"learning_rate": 0.00013146276341191669,
"loss": 1.5969,
"step": 9390
},
{
"epoch": 1.304308610282384,
"grad_norm": 0.0,
"learning_rate": 0.00013131599323956686,
"loss": 1.5627,
"step": 9400
},
{
"epoch": 1.304308610282384,
"eval_loss": 1.5553832054138184,
"eval_runtime": 883.1047,
"eval_samples_per_second": 14.508,
"eval_steps_per_second": 1.814,
"step": 9400
},
{
"epoch": 1.3056962464441824,
"grad_norm": 0.0,
"learning_rate": 0.00013116914823472383,
"loss": 1.5583,
"step": 9410
},
{
"epoch": 1.3070838826059807,
"grad_norm": 0.0,
"learning_rate": 0.00013102222874828738,
"loss": 1.5026,
"step": 9420
},
{
"epoch": 1.3084715187677791,
"grad_norm": 0.0,
"learning_rate": 0.0001308752351313354,
"loss": 1.5962,
"step": 9430
},
{
"epoch": 1.3098591549295775,
"grad_norm": 0.0,
"learning_rate": 0.00013072816773512287,
"loss": 1.5441,
"step": 9440
},
{
"epoch": 1.3112467910913759,
"grad_norm": 0.0,
"learning_rate": 0.00013058102691108106,
"loss": 1.5238,
"step": 9450
},
{
"epoch": 1.3126344272531743,
"grad_norm": 0.0,
"learning_rate": 0.00013043381301081674,
"loss": 1.5385,
"step": 9460
},
{
"epoch": 1.3140220634149726,
"grad_norm": 0.0,
"learning_rate": 0.0001302865263861113,
"loss": 1.5158,
"step": 9470
},
{
"epoch": 1.315409699576771,
"grad_norm": 0.0,
"learning_rate": 0.00013013916738891985,
"loss": 1.5261,
"step": 9480
},
{
"epoch": 1.3167973357385694,
"grad_norm": 0.0,
"learning_rate": 0.00012999173637137052,
"loss": 1.579,
"step": 9490
},
{
"epoch": 1.3181849719003678,
"grad_norm": 0.0,
"learning_rate": 0.00012984423368576353,
"loss": 1.6033,
"step": 9500
},
{
"epoch": 1.3181849719003678,
"eval_loss": 1.5553832054138184,
"eval_runtime": 874.2116,
"eval_samples_per_second": 14.655,
"eval_steps_per_second": 1.833,
"step": 9500
},
{
"epoch": 1.3195726080621661,
"grad_norm": 0.0,
"learning_rate": 0.0001296966596845703,
"loss": 1.537,
"step": 9510
},
{
"epoch": 1.3209602442239645,
"grad_norm": 0.0,
"learning_rate": 0.00012954901472043273,
"loss": 1.4632,
"step": 9520
},
{
"epoch": 1.322347880385763,
"grad_norm": 0.0,
"learning_rate": 0.00012940129914616223,
"loss": 1.5292,
"step": 9530
},
{
"epoch": 1.3237355165475613,
"grad_norm": 0.0,
"learning_rate": 0.00012925351331473904,
"loss": 1.5498,
"step": 9540
},
{
"epoch": 1.3251231527093597,
"grad_norm": 0.0,
"learning_rate": 0.00012910565757931114,
"loss": 1.5441,
"step": 9550
},
{
"epoch": 1.326510788871158,
"grad_norm": 0.0,
"learning_rate": 0.00012895773229319366,
"loss": 1.511,
"step": 9560
},
{
"epoch": 1.3278984250329564,
"grad_norm": 0.0,
"learning_rate": 0.0001288097378098679,
"loss": 1.5437,
"step": 9570
},
{
"epoch": 1.3292860611947548,
"grad_norm": 0.0,
"learning_rate": 0.00012866167448298053,
"loss": 1.5918,
"step": 9580
},
{
"epoch": 1.3306736973565532,
"grad_norm": 0.0,
"learning_rate": 0.0001285135426663427,
"loss": 1.5121,
"step": 9590
},
{
"epoch": 1.3320613335183515,
"grad_norm": 0.0,
"learning_rate": 0.0001283653427139292,
"loss": 1.4988,
"step": 9600
},
{
"epoch": 1.3320613335183515,
"eval_loss": 1.5553832054138184,
"eval_runtime": 873.8623,
"eval_samples_per_second": 14.661,
"eval_steps_per_second": 1.833,
"step": 9600
},
{
"epoch": 1.33344896968015,
"grad_norm": 0.0,
"learning_rate": 0.00012821707497987772,
"loss": 1.544,
"step": 9610
},
{
"epoch": 1.3348366058419483,
"grad_norm": 0.0,
"learning_rate": 0.00012806873981848784,
"loss": 1.5734,
"step": 9620
},
{
"epoch": 1.3362242420037467,
"grad_norm": 0.0,
"learning_rate": 0.00012792033758422032,
"loss": 1.5773,
"step": 9630
},
{
"epoch": 1.337611878165545,
"grad_norm": 0.0,
"learning_rate": 0.00012777186863169613,
"loss": 1.5198,
"step": 9640
},
{
"epoch": 1.3389995143273434,
"grad_norm": 0.0,
"learning_rate": 0.00012762333331569576,
"loss": 1.5376,
"step": 9650
},
{
"epoch": 1.3403871504891418,
"grad_norm": 0.0,
"learning_rate": 0.00012747473199115823,
"loss": 1.587,
"step": 9660
},
{
"epoch": 1.3417747866509402,
"grad_norm": 0.0,
"learning_rate": 0.00012732606501318023,
"loss": 1.5984,
"step": 9670
},
{
"epoch": 1.3431624228127386,
"grad_norm": 0.0,
"learning_rate": 0.00012717733273701548,
"loss": 1.6267,
"step": 9680
},
{
"epoch": 1.344550058974537,
"grad_norm": 0.0,
"learning_rate": 0.00012702853551807357,
"loss": 1.6223,
"step": 9690
},
{
"epoch": 1.3459376951363353,
"grad_norm": 0.0,
"learning_rate": 0.00012687967371191947,
"loss": 1.5781,
"step": 9700
},
{
"epoch": 1.3459376951363353,
"eval_loss": 1.5553832054138184,
"eval_runtime": 873.0946,
"eval_samples_per_second": 14.674,
"eval_steps_per_second": 1.835,
"step": 9700
},
{
"epoch": 1.3473253312981337,
"grad_norm": 0.0,
"learning_rate": 0.0001267307476742723,
"loss": 1.6014,
"step": 9710
},
{
"epoch": 1.348712967459932,
"grad_norm": 0.0,
"learning_rate": 0.0001265817577610048,
"loss": 1.5637,
"step": 9720
},
{
"epoch": 1.3501006036217305,
"grad_norm": 0.0,
"learning_rate": 0.00012643270432814225,
"loss": 1.5874,
"step": 9730
},
{
"epoch": 1.3514882397835288,
"grad_norm": 0.0,
"learning_rate": 0.0001262835877318618,
"loss": 1.5469,
"step": 9740
},
{
"epoch": 1.3528758759453272,
"grad_norm": 0.0,
"learning_rate": 0.00012613440832849146,
"loss": 1.5123,
"step": 9750
},
{
"epoch": 1.3542635121071256,
"grad_norm": 0.0,
"learning_rate": 0.00012598516647450942,
"loss": 1.5328,
"step": 9760
},
{
"epoch": 1.355651148268924,
"grad_norm": 0.0,
"learning_rate": 0.00012583586252654298,
"loss": 1.5016,
"step": 9770
},
{
"epoch": 1.3570387844307223,
"grad_norm": 0.0,
"learning_rate": 0.0001256864968413679,
"loss": 1.5429,
"step": 9780
},
{
"epoch": 1.3584264205925207,
"grad_norm": 0.0,
"learning_rate": 0.00012553706977590744,
"loss": 1.5475,
"step": 9790
},
{
"epoch": 1.359814056754319,
"grad_norm": 0.0,
"learning_rate": 0.00012538758168723156,
"loss": 1.5815,
"step": 9800
},
{
"epoch": 1.359814056754319,
"eval_loss": 1.5553832054138184,
"eval_runtime": 873.1306,
"eval_samples_per_second": 14.674,
"eval_steps_per_second": 1.835,
"step": 9800
},
{
"epoch": 1.3612016929161173,
"grad_norm": 0.0,
"learning_rate": 0.00012523803293255602,
"loss": 1.5421,
"step": 9810
},
{
"epoch": 1.3625893290779159,
"grad_norm": 0.0,
"learning_rate": 0.00012508842386924152,
"loss": 1.4994,
"step": 9820
},
{
"epoch": 1.363976965239714,
"grad_norm": 0.0,
"learning_rate": 0.00012493875485479296,
"loss": 1.5126,
"step": 9830
},
{
"epoch": 1.3653646014015126,
"grad_norm": 0.0,
"learning_rate": 0.00012478902624685838,
"loss": 1.4954,
"step": 9840
},
{
"epoch": 1.3667522375633108,
"grad_norm": 0.0,
"learning_rate": 0.00012463923840322832,
"loss": 1.5676,
"step": 9850
},
{
"epoch": 1.3681398737251094,
"grad_norm": 0.0,
"learning_rate": 0.0001244893916818348,
"loss": 1.5358,
"step": 9860
},
{
"epoch": 1.3695275098869075,
"grad_norm": 0.0,
"learning_rate": 0.0001243394864407506,
"loss": 1.5445,
"step": 9870
},
{
"epoch": 1.3709151460487061,
"grad_norm": 0.0,
"learning_rate": 0.00012418952303818834,
"loss": 1.4529,
"step": 9880
},
{
"epoch": 1.3723027822105043,
"grad_norm": 0.0,
"learning_rate": 0.00012403950183249952,
"loss": 1.555,
"step": 9890
},
{
"epoch": 1.3736904183723029,
"grad_norm": 0.0,
"learning_rate": 0.00012388942318217384,
"loss": 1.5182,
"step": 9900
},
{
"epoch": 1.3736904183723029,
"eval_loss": 1.5553832054138184,
"eval_runtime": 880.2398,
"eval_samples_per_second": 14.555,
"eval_steps_per_second": 1.82,
"step": 9900
},
{
"epoch": 1.375078054534101,
"grad_norm": 0.0,
"learning_rate": 0.0001237392874458383,
"loss": 1.4459,
"step": 9910
},
{
"epoch": 1.3764656906958996,
"grad_norm": 0.0,
"learning_rate": 0.0001235890949822563,
"loss": 1.5648,
"step": 9920
},
{
"epoch": 1.3778533268576978,
"grad_norm": 0.0,
"learning_rate": 0.0001234388461503267,
"loss": 1.6218,
"step": 9930
},
{
"epoch": 1.3792409630194964,
"grad_norm": 0.0,
"learning_rate": 0.00012328854130908319,
"loss": 1.6005,
"step": 9940
},
{
"epoch": 1.3806285991812945,
"grad_norm": 0.0,
"learning_rate": 0.0001231381808176932,
"loss": 1.5869,
"step": 9950
},
{
"epoch": 1.3820162353430931,
"grad_norm": 0.0,
"learning_rate": 0.0001229877650354572,
"loss": 1.6159,
"step": 9960
},
{
"epoch": 1.3834038715048913,
"grad_norm": 0.0,
"learning_rate": 0.00012283729432180775,
"loss": 1.5293,
"step": 9970
},
{
"epoch": 1.38479150766669,
"grad_norm": 0.0,
"learning_rate": 0.0001226867690363087,
"loss": 1.5321,
"step": 9980
},
{
"epoch": 1.386179143828488,
"grad_norm": 0.0,
"learning_rate": 0.00012253618953865427,
"loss": 1.5327,
"step": 9990
},
{
"epoch": 1.3875667799902867,
"grad_norm": 0.0,
"learning_rate": 0.00012238555618866826,
"loss": 1.6345,
"step": 10000
},
{
"epoch": 1.3875667799902867,
"eval_loss": 1.5553832054138184,
"eval_runtime": 875.5953,
"eval_samples_per_second": 14.632,
"eval_steps_per_second": 1.83,
"step": 10000
},
{
"epoch": 1.3889544161520848,
"grad_norm": 0.0,
"learning_rate": 0.00012223486934630315,
"loss": 1.5467,
"step": 10010
},
{
"epoch": 1.3903420523138834,
"grad_norm": 0.0,
"learning_rate": 0.00012208412937163922,
"loss": 1.5573,
"step": 10020
},
{
"epoch": 1.3917296884756816,
"grad_norm": 0.0,
"learning_rate": 0.00012193333662488376,
"loss": 1.6015,
"step": 10030
},
{
"epoch": 1.3931173246374802,
"grad_norm": 0.0,
"learning_rate": 0.00012178249146637009,
"loss": 1.5711,
"step": 10040
},
{
"epoch": 1.3945049607992783,
"grad_norm": 0.0,
"learning_rate": 0.00012163159425655682,
"loss": 1.5398,
"step": 10050
},
{
"epoch": 1.395892596961077,
"grad_norm": 0.0,
"learning_rate": 0.000121480645356027,
"loss": 1.518,
"step": 10060
},
{
"epoch": 1.397280233122875,
"grad_norm": 0.0,
"learning_rate": 0.00012132964512548705,
"loss": 1.5997,
"step": 10070
},
{
"epoch": 1.3986678692846737,
"grad_norm": 0.0,
"learning_rate": 0.00012117859392576623,
"loss": 1.5428,
"step": 10080
},
{
"epoch": 1.4000555054464718,
"grad_norm": 0.0,
"learning_rate": 0.00012102749211781539,
"loss": 1.5063,
"step": 10090
},
{
"epoch": 1.4014431416082704,
"grad_norm": 0.0,
"learning_rate": 0.00012087634006270654,
"loss": 1.5924,
"step": 10100
},
{
"epoch": 1.4014431416082704,
"eval_loss": 1.5553832054138184,
"eval_runtime": 866.8356,
"eval_samples_per_second": 14.78,
"eval_steps_per_second": 1.848,
"step": 10100
},
{
"epoch": 1.4028307777700686,
"grad_norm": 0.0,
"learning_rate": 0.00012072513812163157,
"loss": 1.5603,
"step": 10110
},
{
"epoch": 1.4042184139318672,
"grad_norm": 0.0,
"learning_rate": 0.00012057388665590171,
"loss": 1.5096,
"step": 10120
},
{
"epoch": 1.4056060500936653,
"grad_norm": 0.0,
"learning_rate": 0.00012042258602694643,
"loss": 1.5635,
"step": 10130
},
{
"epoch": 1.406993686255464,
"grad_norm": 0.0,
"learning_rate": 0.00012027123659631272,
"loss": 1.508,
"step": 10140
},
{
"epoch": 1.408381322417262,
"grad_norm": 0.0,
"learning_rate": 0.00012011983872566422,
"loss": 1.5646,
"step": 10150
},
{
"epoch": 1.4097689585790607,
"grad_norm": 0.0,
"learning_rate": 0.0001199683927767803,
"loss": 1.5866,
"step": 10160
},
{
"epoch": 1.4111565947408589,
"grad_norm": 0.0,
"learning_rate": 0.00011981689911155518,
"loss": 1.5451,
"step": 10170
},
{
"epoch": 1.4125442309026575,
"grad_norm": 0.0,
"learning_rate": 0.00011966535809199715,
"loss": 1.5747,
"step": 10180
},
{
"epoch": 1.4139318670644556,
"grad_norm": 0.0,
"learning_rate": 0.00011951377008022766,
"loss": 1.5125,
"step": 10190
},
{
"epoch": 1.415319503226254,
"grad_norm": 0.0,
"learning_rate": 0.00011936213543848043,
"loss": 1.4432,
"step": 10200
},
{
"epoch": 1.415319503226254,
"eval_loss": 1.5553832054138184,
"eval_runtime": 877.7234,
"eval_samples_per_second": 14.597,
"eval_steps_per_second": 1.825,
"step": 10200
},
{
"epoch": 1.4167071393880524,
"grad_norm": 0.0,
"learning_rate": 0.00011921045452910057,
"loss": 1.5242,
"step": 10210
},
{
"epoch": 1.4180947755498508,
"grad_norm": 0.0,
"learning_rate": 0.00011905872771454387,
"loss": 1.5433,
"step": 10220
},
{
"epoch": 1.4194824117116491,
"grad_norm": 0.0,
"learning_rate": 0.00011890695535737564,
"loss": 1.5994,
"step": 10230
},
{
"epoch": 1.4208700478734475,
"grad_norm": 0.0,
"learning_rate": 0.00011875513782027021,
"loss": 1.5355,
"step": 10240
},
{
"epoch": 1.4222576840352459,
"grad_norm": 0.0,
"learning_rate": 0.00011860327546600969,
"loss": 1.5373,
"step": 10250
},
{
"epoch": 1.4236453201970443,
"grad_norm": 0.0,
"learning_rate": 0.00011845136865748345,
"loss": 1.5215,
"step": 10260
},
{
"epoch": 1.4250329563588426,
"grad_norm": 0.0,
"learning_rate": 0.00011829941775768694,
"loss": 1.6019,
"step": 10270
},
{
"epoch": 1.426420592520641,
"grad_norm": 0.0,
"learning_rate": 0.00011814742312972109,
"loss": 1.5299,
"step": 10280
},
{
"epoch": 1.4278082286824394,
"grad_norm": 0.0,
"learning_rate": 0.00011799538513679127,
"loss": 1.5122,
"step": 10290
},
{
"epoch": 1.4291958648442378,
"grad_norm": 0.0,
"learning_rate": 0.00011784330414220643,
"loss": 1.5991,
"step": 10300
},
{
"epoch": 1.4291958648442378,
"eval_loss": 1.5553832054138184,
"eval_runtime": 872.4699,
"eval_samples_per_second": 14.685,
"eval_steps_per_second": 1.836,
"step": 10300
},
{
"epoch": 1.4305835010060362,
"grad_norm": 0.0,
"learning_rate": 0.00011769118050937837,
"loss": 1.6097,
"step": 10310
},
{
"epoch": 1.4319711371678345,
"grad_norm": 0.0,
"learning_rate": 0.0001175390146018207,
"loss": 1.5588,
"step": 10320
},
{
"epoch": 1.433358773329633,
"grad_norm": 0.0,
"learning_rate": 0.00011738680678314813,
"loss": 1.5589,
"step": 10330
},
{
"epoch": 1.4347464094914313,
"grad_norm": 0.0,
"learning_rate": 0.00011723455741707541,
"loss": 1.5128,
"step": 10340
},
{
"epoch": 1.4361340456532297,
"grad_norm": 0.0,
"learning_rate": 0.00011708226686741666,
"loss": 1.5162,
"step": 10350
},
{
"epoch": 1.437521681815028,
"grad_norm": 0.0,
"learning_rate": 0.0001169299354980844,
"loss": 1.5494,
"step": 10360
},
{
"epoch": 1.4389093179768264,
"grad_norm": 0.0,
"learning_rate": 0.00011677756367308866,
"loss": 1.5198,
"step": 10370
},
{
"epoch": 1.4402969541386248,
"grad_norm": 0.0,
"learning_rate": 0.00011662515175653615,
"loss": 1.5008,
"step": 10380
},
{
"epoch": 1.4416845903004232,
"grad_norm": 0.0,
"learning_rate": 0.00011647270011262939,
"loss": 1.4987,
"step": 10390
},
{
"epoch": 1.4430722264622216,
"grad_norm": 0.0,
"learning_rate": 0.00011632020910566586,
"loss": 1.5658,
"step": 10400
},
{
"epoch": 1.4430722264622216,
"eval_loss": 1.5553832054138184,
"eval_runtime": 872.8447,
"eval_samples_per_second": 14.678,
"eval_steps_per_second": 1.835,
"step": 10400
},
{
"epoch": 1.44445986262402,
"grad_norm": 0.0,
"learning_rate": 0.00011616767910003706,
"loss": 1.5572,
"step": 10410
},
{
"epoch": 1.4458474987858183,
"grad_norm": 0.0,
"learning_rate": 0.00011601511046022768,
"loss": 1.582,
"step": 10420
},
{
"epoch": 1.4472351349476167,
"grad_norm": 0.0,
"learning_rate": 0.00011586250355081481,
"loss": 1.535,
"step": 10430
},
{
"epoch": 1.448622771109415,
"grad_norm": 0.0,
"learning_rate": 0.00011570985873646685,
"loss": 1.5764,
"step": 10440
},
{
"epoch": 1.4500104072712134,
"grad_norm": 0.0,
"learning_rate": 0.00011555717638194288,
"loss": 1.5277,
"step": 10450
},
{
"epoch": 1.4513980434330118,
"grad_norm": 0.0,
"learning_rate": 0.00011540445685209167,
"loss": 1.5242,
"step": 10460
},
{
"epoch": 1.4527856795948102,
"grad_norm": 0.0,
"learning_rate": 0.00011525170051185081,
"loss": 1.5387,
"step": 10470
},
{
"epoch": 1.4541733157566086,
"grad_norm": 0.0,
"learning_rate": 0.00011509890772624586,
"loss": 1.5014,
"step": 10480
},
{
"epoch": 1.455560951918407,
"grad_norm": 0.0,
"learning_rate": 0.00011494607886038946,
"loss": 1.5327,
"step": 10490
},
{
"epoch": 1.4569485880802053,
"grad_norm": 0.0,
"learning_rate": 0.00011479321427948044,
"loss": 1.574,
"step": 10500
},
{
"epoch": 1.4569485880802053,
"eval_loss": 1.5553832054138184,
"eval_runtime": 877.3378,
"eval_samples_per_second": 14.603,
"eval_steps_per_second": 1.826,
"step": 10500
},
{
"epoch": 1.4583362242420037,
"grad_norm": 0.0,
"learning_rate": 0.00011464031434880305,
"loss": 1.4931,
"step": 10510
},
{
"epoch": 1.459723860403802,
"grad_norm": 0.0,
"learning_rate": 0.00011448737943372594,
"loss": 1.6437,
"step": 10520
},
{
"epoch": 1.4611114965656005,
"grad_norm": 0.0,
"learning_rate": 0.00011433440989970141,
"loss": 1.5004,
"step": 10530
},
{
"epoch": 1.4624991327273988,
"grad_norm": 0.0,
"learning_rate": 0.00011418140611226445,
"loss": 1.5678,
"step": 10540
},
{
"epoch": 1.4638867688891972,
"grad_norm": 0.0,
"learning_rate": 0.00011402836843703189,
"loss": 1.5217,
"step": 10550
},
{
"epoch": 1.4652744050509956,
"grad_norm": 0.0,
"learning_rate": 0.0001138752972397016,
"loss": 1.5304,
"step": 10560
},
{
"epoch": 1.466662041212794,
"grad_norm": 0.0,
"learning_rate": 0.00011372219288605148,
"loss": 1.567,
"step": 10570
},
{
"epoch": 1.4680496773745924,
"grad_norm": 0.0,
"learning_rate": 0.00011356905574193872,
"loss": 1.5239,
"step": 10580
},
{
"epoch": 1.4694373135363907,
"grad_norm": 0.0,
"learning_rate": 0.00011341588617329882,
"loss": 1.5732,
"step": 10590
},
{
"epoch": 1.470824949698189,
"grad_norm": 0.0,
"learning_rate": 0.00011326268454614481,
"loss": 1.4986,
"step": 10600
},
{
"epoch": 1.470824949698189,
"eval_loss": 1.5553832054138184,
"eval_runtime": 869.8756,
"eval_samples_per_second": 14.729,
"eval_steps_per_second": 1.842,
"step": 10600
},
{
"epoch": 1.4722125858599875,
"grad_norm": 0.0,
"learning_rate": 0.00011310945122656626,
"loss": 1.6011,
"step": 10610
},
{
"epoch": 1.4736002220217859,
"grad_norm": 0.0,
"learning_rate": 0.00011295618658072858,
"loss": 1.4677,
"step": 10620
},
{
"epoch": 1.4749878581835842,
"grad_norm": 0.0,
"learning_rate": 0.00011280289097487189,
"loss": 1.5537,
"step": 10630
},
{
"epoch": 1.4763754943453826,
"grad_norm": 0.0,
"learning_rate": 0.0001126495647753104,
"loss": 1.5312,
"step": 10640
},
{
"epoch": 1.477763130507181,
"grad_norm": 0.0,
"learning_rate": 0.00011249620834843145,
"loss": 1.5691,
"step": 10650
},
{
"epoch": 1.4791507666689794,
"grad_norm": 0.0,
"learning_rate": 0.00011234282206069444,
"loss": 1.5955,
"step": 10660
},
{
"epoch": 1.4805384028307778,
"grad_norm": 0.0,
"learning_rate": 0.00011218940627863037,
"loss": 1.6063,
"step": 10670
},
{
"epoch": 1.4819260389925761,
"grad_norm": 0.0,
"learning_rate": 0.00011203596136884049,
"loss": 1.5566,
"step": 10680
},
{
"epoch": 1.4833136751543745,
"grad_norm": 0.0,
"learning_rate": 0.00011188248769799584,
"loss": 1.563,
"step": 10690
},
{
"epoch": 1.484701311316173,
"grad_norm": 0.0,
"learning_rate": 0.00011172898563283601,
"loss": 1.5032,
"step": 10700
},
{
"epoch": 1.484701311316173,
"eval_loss": 1.5553832054138184,
"eval_runtime": 875.308,
"eval_samples_per_second": 14.637,
"eval_steps_per_second": 1.83,
"step": 10700
},
{
"epoch": 1.4860889474779713,
"grad_norm": 0.0,
"learning_rate": 0.00011157545554016864,
"loss": 1.5129,
"step": 10710
},
{
"epoch": 1.4874765836397696,
"grad_norm": 0.0,
"learning_rate": 0.00011142189778686814,
"loss": 1.5841,
"step": 10720
},
{
"epoch": 1.488864219801568,
"grad_norm": 0.0,
"learning_rate": 0.0001112683127398752,
"loss": 1.5418,
"step": 10730
},
{
"epoch": 1.4902518559633664,
"grad_norm": 0.0,
"learning_rate": 0.00011111470076619557,
"loss": 1.5277,
"step": 10740
},
{
"epoch": 1.4916394921251648,
"grad_norm": 0.0,
"learning_rate": 0.00011096106223289951,
"loss": 1.666,
"step": 10750
},
{
"epoch": 1.4930271282869632,
"grad_norm": 0.0,
"learning_rate": 0.00011080739750712057,
"loss": 1.594,
"step": 10760
},
{
"epoch": 1.4944147644487615,
"grad_norm": 0.0,
"learning_rate": 0.00011065370695605502,
"loss": 1.6252,
"step": 10770
},
{
"epoch": 1.49580240061056,
"grad_norm": 0.0,
"learning_rate": 0.0001104999909469608,
"loss": 1.568,
"step": 10780
},
{
"epoch": 1.4971900367723583,
"grad_norm": 0.0,
"learning_rate": 0.00011034624984715667,
"loss": 1.4767,
"step": 10790
},
{
"epoch": 1.4985776729341567,
"grad_norm": 0.0,
"learning_rate": 0.00011019248402402136,
"loss": 1.4872,
"step": 10800
},
{
"epoch": 1.4985776729341567,
"eval_loss": 1.5553832054138184,
"eval_runtime": 874.6368,
"eval_samples_per_second": 14.648,
"eval_steps_per_second": 1.832,
"step": 10800
},
{
"epoch": 1.499965309095955,
"grad_norm": 0.0,
"learning_rate": 0.00011003869384499268,
"loss": 1.4777,
"step": 10810
},
{
"epoch": 1.5013529452577534,
"grad_norm": 0.0,
"learning_rate": 0.00010988487967756666,
"loss": 1.5252,
"step": 10820
},
{
"epoch": 1.5027405814195518,
"grad_norm": 0.0,
"learning_rate": 0.0001097310418892966,
"loss": 1.5309,
"step": 10830
},
{
"epoch": 1.5041282175813502,
"grad_norm": 0.0,
"learning_rate": 0.00010957718084779229,
"loss": 1.5586,
"step": 10840
},
{
"epoch": 1.5055158537431486,
"grad_norm": 0.0,
"learning_rate": 0.00010942329692071909,
"loss": 1.6205,
"step": 10850
},
{
"epoch": 1.506903489904947,
"grad_norm": 0.0,
"learning_rate": 0.00010926939047579699,
"loss": 1.5137,
"step": 10860
},
{
"epoch": 1.5082911260667453,
"grad_norm": 0.0,
"learning_rate": 0.00010911546188079986,
"loss": 1.5804,
"step": 10870
},
{
"epoch": 1.5096787622285437,
"grad_norm": 0.0,
"learning_rate": 0.00010896151150355445,
"loss": 1.4934,
"step": 10880
},
{
"epoch": 1.511066398390342,
"grad_norm": 0.0,
"learning_rate": 0.00010880753971193957,
"loss": 1.4905,
"step": 10890
},
{
"epoch": 1.5124540345521404,
"grad_norm": 0.0,
"learning_rate": 0.00010865354687388522,
"loss": 1.5298,
"step": 10900
},
{
"epoch": 1.5124540345521404,
"eval_loss": 1.5553832054138184,
"eval_runtime": 880.7605,
"eval_samples_per_second": 14.547,
"eval_steps_per_second": 1.819,
"step": 10900
},
{
"epoch": 1.5138416707139388,
"grad_norm": 0.0,
"learning_rate": 0.00010849953335737173,
"loss": 1.5989,
"step": 10910
},
{
"epoch": 1.5152293068757372,
"grad_norm": 0.0,
"learning_rate": 0.00010834549953042869,
"loss": 1.5482,
"step": 10920
},
{
"epoch": 1.5166169430375356,
"grad_norm": 0.0,
"learning_rate": 0.00010819144576113442,
"loss": 1.5114,
"step": 10930
},
{
"epoch": 1.518004579199334,
"grad_norm": 0.0,
"learning_rate": 0.00010803737241761474,
"loss": 1.4501,
"step": 10940
},
{
"epoch": 1.5193922153611323,
"grad_norm": 0.0,
"learning_rate": 0.00010788327986804237,
"loss": 1.4869,
"step": 10950
},
{
"epoch": 1.5207798515229307,
"grad_norm": 0.0,
"learning_rate": 0.00010772916848063581,
"loss": 1.4995,
"step": 10960
},
{
"epoch": 1.522167487684729,
"grad_norm": 0.0,
"learning_rate": 0.00010757503862365865,
"loss": 1.6415,
"step": 10970
},
{
"epoch": 1.5235551238465275,
"grad_norm": 0.0,
"learning_rate": 0.00010742089066541859,
"loss": 1.513,
"step": 10980
},
{
"epoch": 1.5249427600083258,
"grad_norm": 0.0,
"learning_rate": 0.00010726672497426656,
"loss": 1.6119,
"step": 10990
},
{
"epoch": 1.5263303961701242,
"grad_norm": 0.0,
"learning_rate": 0.00010711254191859595,
"loss": 1.5535,
"step": 11000
},
{
"epoch": 1.5263303961701242,
"eval_loss": 1.5553832054138184,
"eval_runtime": 866.7615,
"eval_samples_per_second": 14.781,
"eval_steps_per_second": 1.848,
"step": 11000
},
{
"epoch": 1.5277180323319226,
"grad_norm": 0.0,
"learning_rate": 0.0001069583418668415,
"loss": 1.5341,
"step": 11010
},
{
"epoch": 1.529105668493721,
"grad_norm": 0.0,
"learning_rate": 0.00010680412518747873,
"loss": 1.6164,
"step": 11020
},
{
"epoch": 1.5304933046555194,
"grad_norm": 0.0,
"learning_rate": 0.00010664989224902276,
"loss": 1.6643,
"step": 11030
},
{
"epoch": 1.5318809408173177,
"grad_norm": 0.0,
"learning_rate": 0.00010649564342002763,
"loss": 1.515,
"step": 11040
},
{
"epoch": 1.5332685769791161,
"grad_norm": 0.0,
"learning_rate": 0.00010634137906908534,
"loss": 1.476,
"step": 11050
},
{
"epoch": 1.5346562131409145,
"grad_norm": 0.0,
"learning_rate": 0.00010618709956482498,
"loss": 1.5775,
"step": 11060
},
{
"epoch": 1.5360438493027129,
"grad_norm": 0.0,
"learning_rate": 0.00010603280527591182,
"loss": 1.4662,
"step": 11070
},
{
"epoch": 1.5374314854645112,
"grad_norm": 0.0,
"learning_rate": 0.00010587849657104653,
"loss": 1.5799,
"step": 11080
},
{
"epoch": 1.5388191216263096,
"grad_norm": 0.0,
"learning_rate": 0.00010572417381896418,
"loss": 1.5966,
"step": 11090
},
{
"epoch": 1.540206757788108,
"grad_norm": 0.0,
"learning_rate": 0.00010556983738843335,
"loss": 1.5935,
"step": 11100
},
{
"epoch": 1.540206757788108,
"eval_loss": 1.5553832054138184,
"eval_runtime": 875.7742,
"eval_samples_per_second": 14.629,
"eval_steps_per_second": 1.829,
"step": 11100
},
{
"epoch": 1.5415943939499064,
"grad_norm": 0.0,
"learning_rate": 0.00010541548764825544,
"loss": 1.5677,
"step": 11110
},
{
"epoch": 1.5429820301117048,
"grad_norm": 0.0,
"learning_rate": 0.00010526112496726354,
"loss": 1.5742,
"step": 11120
},
{
"epoch": 1.5443696662735031,
"grad_norm": 0.0,
"learning_rate": 0.00010510674971432174,
"loss": 1.5705,
"step": 11130
},
{
"epoch": 1.5457573024353015,
"grad_norm": 0.0,
"learning_rate": 0.0001049523622583241,
"loss": 1.5183,
"step": 11140
},
{
"epoch": 1.5471449385971,
"grad_norm": 0.0,
"learning_rate": 0.00010479796296819393,
"loss": 1.48,
"step": 11150
},
{
"epoch": 1.548532574758898,
"grad_norm": 0.0,
"learning_rate": 0.00010464355221288271,
"loss": 1.5553,
"step": 11160
},
{
"epoch": 1.5499202109206967,
"grad_norm": 0.0,
"learning_rate": 0.0001044891303613694,
"loss": 1.5698,
"step": 11170
},
{
"epoch": 1.5513078470824948,
"grad_norm": 0.0,
"learning_rate": 0.00010433469778265945,
"loss": 1.5182,
"step": 11180
},
{
"epoch": 1.5526954832442934,
"grad_norm": 0.0,
"learning_rate": 0.00010418025484578396,
"loss": 1.5826,
"step": 11190
},
{
"epoch": 1.5540831194060916,
"grad_norm": 0.0,
"learning_rate": 0.00010402580191979873,
"loss": 1.5437,
"step": 11200
},
{
"epoch": 1.5540831194060916,
"eval_loss": 1.5553832054138184,
"eval_runtime": 877.2662,
"eval_samples_per_second": 14.604,
"eval_steps_per_second": 1.826,
"step": 11200
},
{
"epoch": 1.5554707555678902,
"grad_norm": 0.0,
"learning_rate": 0.00010387133937378348,
"loss": 1.5353,
"step": 11210
},
{
"epoch": 1.5568583917296883,
"grad_norm": 0.0,
"learning_rate": 0.00010371686757684092,
"loss": 1.5471,
"step": 11220
},
{
"epoch": 1.558246027891487,
"grad_norm": 0.0,
"learning_rate": 0.00010356238689809579,
"loss": 1.6254,
"step": 11230
},
{
"epoch": 1.559633664053285,
"grad_norm": 0.0,
"learning_rate": 0.00010340789770669421,
"loss": 1.5478,
"step": 11240
},
{
"epoch": 1.5610213002150837,
"grad_norm": 0.0,
"learning_rate": 0.00010325340037180244,
"loss": 1.5716,
"step": 11250
},
{
"epoch": 1.5624089363768818,
"grad_norm": 0.0,
"learning_rate": 0.00010309889526260639,
"loss": 1.59,
"step": 11260
},
{
"epoch": 1.5637965725386804,
"grad_norm": 0.0,
"learning_rate": 0.00010294438274831042,
"loss": 1.53,
"step": 11270
},
{
"epoch": 1.5651842087004786,
"grad_norm": 0.0,
"learning_rate": 0.00010278986319813663,
"loss": 1.4877,
"step": 11280
},
{
"epoch": 1.5665718448622772,
"grad_norm": 0.0,
"learning_rate": 0.00010263533698132393,
"loss": 1.5286,
"step": 11290
},
{
"epoch": 1.5679594810240753,
"grad_norm": 0.0,
"learning_rate": 0.00010248080446712716,
"loss": 1.4785,
"step": 11300
},
{
"epoch": 1.5679594810240753,
"eval_loss": 1.5553832054138184,
"eval_runtime": 869.2963,
"eval_samples_per_second": 14.738,
"eval_steps_per_second": 1.843,
"step": 11300
},
{
"epoch": 1.569347117185874,
"grad_norm": 0.0,
"learning_rate": 0.00010232626602481623,
"loss": 1.5692,
"step": 11310
},
{
"epoch": 1.570734753347672,
"grad_norm": 0.0,
"learning_rate": 0.00010217172202367515,
"loss": 1.4877,
"step": 11320
},
{
"epoch": 1.5721223895094707,
"grad_norm": 0.0,
"learning_rate": 0.0001020171728330013,
"loss": 1.5381,
"step": 11330
},
{
"epoch": 1.5735100256712689,
"grad_norm": 0.0,
"learning_rate": 0.00010186261882210437,
"loss": 1.4638,
"step": 11340
},
{
"epoch": 1.5748976618330675,
"grad_norm": 0.0,
"learning_rate": 0.00010170806036030568,
"loss": 1.5052,
"step": 11350
},
{
"epoch": 1.5762852979948656,
"grad_norm": 0.0,
"learning_rate": 0.00010155349781693708,
"loss": 1.5806,
"step": 11360
},
{
"epoch": 1.5776729341566642,
"grad_norm": 0.0,
"learning_rate": 0.00010139893156134024,
"loss": 1.5457,
"step": 11370
},
{
"epoch": 1.5790605703184624,
"grad_norm": 0.0,
"learning_rate": 0.00010124436196286561,
"loss": 1.5623,
"step": 11380
},
{
"epoch": 1.580448206480261,
"grad_norm": 0.0,
"learning_rate": 0.00010108978939087181,
"loss": 1.6168,
"step": 11390
},
{
"epoch": 1.5818358426420591,
"grad_norm": 0.0,
"learning_rate": 0.00010093521421472436,
"loss": 1.5384,
"step": 11400
},
{
"epoch": 1.5818358426420591,
"eval_loss": 1.5553832054138184,
"eval_runtime": 870.7754,
"eval_samples_per_second": 14.713,
"eval_steps_per_second": 1.84,
"step": 11400
},
{
"epoch": 1.5832234788038577,
"grad_norm": 0.0,
"learning_rate": 0.00010078063680379513,
"loss": 1.6739,
"step": 11410
},
{
"epoch": 1.5846111149656559,
"grad_norm": 0.0,
"learning_rate": 0.00010062605752746128,
"loss": 1.4739,
"step": 11420
},
{
"epoch": 1.5859987511274545,
"grad_norm": 0.0,
"learning_rate": 0.00010047147675510442,
"loss": 1.5422,
"step": 11430
},
{
"epoch": 1.5873863872892526,
"grad_norm": 0.0,
"learning_rate": 0.00010031689485610982,
"loss": 1.5205,
"step": 11440
},
{
"epoch": 1.5887740234510512,
"grad_norm": 0.0,
"learning_rate": 0.00010016231219986529,
"loss": 1.5092,
"step": 11450
},
{
"epoch": 1.5901616596128494,
"grad_norm": 0.0,
"learning_rate": 0.00010000772915576064,
"loss": 1.5224,
"step": 11460
},
{
"epoch": 1.591549295774648,
"grad_norm": 0.0,
"learning_rate": 9.985314609318644e-05,
"loss": 1.6409,
"step": 11470
},
{
"epoch": 1.5929369319364461,
"grad_norm": 0.0,
"learning_rate": 9.969856338153334e-05,
"loss": 1.5212,
"step": 11480
},
{
"epoch": 1.5943245680982447,
"grad_norm": 0.0,
"learning_rate": 9.954398139019123e-05,
"loss": 1.5068,
"step": 11490
},
{
"epoch": 1.595712204260043,
"grad_norm": 0.0,
"learning_rate": 9.938940048854822e-05,
"loss": 1.5086,
"step": 11500
},
{
"epoch": 1.595712204260043,
"eval_loss": 1.5553832054138184,
"eval_runtime": 863.1857,
"eval_samples_per_second": 14.843,
"eval_steps_per_second": 1.856,
"step": 11500
},
{
"epoch": 1.5970998404218415,
"grad_norm": 0.0,
"learning_rate": 9.923482104598986e-05,
"loss": 1.4987,
"step": 11510
},
{
"epoch": 1.5984874765836397,
"grad_norm": 0.0,
"learning_rate": 9.908024343189809e-05,
"loss": 1.5908,
"step": 11520
},
{
"epoch": 1.5998751127454383,
"grad_norm": 0.0,
"learning_rate": 9.892566801565061e-05,
"loss": 1.5734,
"step": 11530
},
{
"epoch": 1.6012627489072364,
"grad_norm": 0.0,
"learning_rate": 9.877109516661991e-05,
"loss": 1.5493,
"step": 11540
},
{
"epoch": 1.602650385069035,
"grad_norm": 0.0,
"learning_rate": 9.861652525417213e-05,
"loss": 1.5222,
"step": 11550
},
{
"epoch": 1.6040380212308332,
"grad_norm": 0.0,
"learning_rate": 9.846195864766656e-05,
"loss": 1.4728,
"step": 11560
},
{
"epoch": 1.6054256573926318,
"grad_norm": 0.0,
"learning_rate": 9.830739571645459e-05,
"loss": 1.5139,
"step": 11570
},
{
"epoch": 1.60681329355443,
"grad_norm": 0.0,
"learning_rate": 9.815283682987883e-05,
"loss": 1.5696,
"step": 11580
},
{
"epoch": 1.6082009297162285,
"grad_norm": 0.0,
"learning_rate": 9.799828235727209e-05,
"loss": 1.5483,
"step": 11590
},
{
"epoch": 1.6095885658780267,
"grad_norm": 0.0,
"learning_rate": 9.784373266795679e-05,
"loss": 1.5584,
"step": 11600
},
{
"epoch": 1.6095885658780267,
"eval_loss": 1.5553832054138184,
"eval_runtime": 866.2587,
"eval_samples_per_second": 14.79,
"eval_steps_per_second": 1.849,
"step": 11600
},
{
"epoch": 1.6109762020398253,
"grad_norm": 0.0,
"learning_rate": 9.768918813124384e-05,
"loss": 1.6297,
"step": 11610
},
{
"epoch": 1.6123638382016234,
"grad_norm": 0.0,
"learning_rate": 9.75346491164319e-05,
"loss": 1.5784,
"step": 11620
},
{
"epoch": 1.613751474363422,
"grad_norm": 0.0,
"learning_rate": 9.738011599280632e-05,
"loss": 1.553,
"step": 11630
},
{
"epoch": 1.6151391105252202,
"grad_norm": 0.0,
"learning_rate": 9.722558912963848e-05,
"loss": 1.5311,
"step": 11640
},
{
"epoch": 1.6165267466870188,
"grad_norm": 0.0,
"learning_rate": 9.707106889618481e-05,
"loss": 1.5268,
"step": 11650
},
{
"epoch": 1.617914382848817,
"grad_norm": 0.0,
"learning_rate": 9.691655566168576e-05,
"loss": 1.433,
"step": 11660
},
{
"epoch": 1.6193020190106155,
"grad_norm": 0.0,
"learning_rate": 9.676204979536521e-05,
"loss": 1.5892,
"step": 11670
},
{
"epoch": 1.6206896551724137,
"grad_norm": 0.0,
"learning_rate": 9.660755166642934e-05,
"loss": 1.4965,
"step": 11680
},
{
"epoch": 1.6220772913342123,
"grad_norm": 0.0,
"learning_rate": 9.645306164406594e-05,
"loss": 1.6108,
"step": 11690
},
{
"epoch": 1.6234649274960105,
"grad_norm": 0.0,
"learning_rate": 9.629858009744327e-05,
"loss": 1.5417,
"step": 11700
},
{
"epoch": 1.6234649274960105,
"eval_loss": 1.5553832054138184,
"eval_runtime": 868.366,
"eval_samples_per_second": 14.754,
"eval_steps_per_second": 1.845,
"step": 11700
},
{
"epoch": 1.624852563657809,
"grad_norm": 0.0,
"learning_rate": 9.614410739570947e-05,
"loss": 1.5061,
"step": 11710
},
{
"epoch": 1.6262401998196072,
"grad_norm": 0.0,
"learning_rate": 9.598964390799147e-05,
"loss": 1.5252,
"step": 11720
},
{
"epoch": 1.6276278359814058,
"grad_norm": 0.0,
"learning_rate": 9.583519000339429e-05,
"loss": 1.592,
"step": 11730
},
{
"epoch": 1.629015472143204,
"grad_norm": 0.0,
"learning_rate": 9.568074605099989e-05,
"loss": 1.6131,
"step": 11740
},
{
"epoch": 1.6304031083050026,
"grad_norm": 0.0,
"learning_rate": 9.552631241986657e-05,
"loss": 1.6038,
"step": 11750
},
{
"epoch": 1.6317907444668007,
"grad_norm": 0.0,
"learning_rate": 9.537188947902795e-05,
"loss": 1.5402,
"step": 11760
},
{
"epoch": 1.6331783806285993,
"grad_norm": 0.0,
"learning_rate": 9.521747759749202e-05,
"loss": 1.5562,
"step": 11770
},
{
"epoch": 1.6345660167903975,
"grad_norm": 0.0,
"learning_rate": 9.506307714424044e-05,
"loss": 1.56,
"step": 11780
},
{
"epoch": 1.635953652952196,
"grad_norm": 0.0,
"learning_rate": 9.490868848822752e-05,
"loss": 1.5394,
"step": 11790
},
{
"epoch": 1.6373412891139942,
"grad_norm": 0.0,
"learning_rate": 9.475431199837944e-05,
"loss": 1.5144,
"step": 11800
},
{
"epoch": 1.6373412891139942,
"eval_loss": 1.5553832054138184,
"eval_runtime": 878.0467,
"eval_samples_per_second": 14.591,
"eval_steps_per_second": 1.825,
"step": 11800
},
{
"epoch": 1.6387289252757928,
"grad_norm": 0.0,
"learning_rate": 9.459994804359317e-05,
"loss": 1.5351,
"step": 11810
},
{
"epoch": 1.640116561437591,
"grad_norm": 0.0,
"learning_rate": 9.444559699273583e-05,
"loss": 1.4772,
"step": 11820
},
{
"epoch": 1.6415041975993896,
"grad_norm": 0.0,
"learning_rate": 9.429125921464371e-05,
"loss": 1.555,
"step": 11830
},
{
"epoch": 1.6428918337611877,
"grad_norm": 0.0,
"learning_rate": 9.413693507812139e-05,
"loss": 1.5026,
"step": 11840
},
{
"epoch": 1.6442794699229863,
"grad_norm": 0.0,
"learning_rate": 9.398262495194074e-05,
"loss": 1.5238,
"step": 11850
},
{
"epoch": 1.6456671060847845,
"grad_norm": 0.0,
"learning_rate": 9.382832920484026e-05,
"loss": 1.5083,
"step": 11860
},
{
"epoch": 1.647054742246583,
"grad_norm": 0.0,
"learning_rate": 9.367404820552412e-05,
"loss": 1.5808,
"step": 11870
},
{
"epoch": 1.6484423784083813,
"grad_norm": 0.0,
"learning_rate": 9.35197823226611e-05,
"loss": 1.49,
"step": 11880
},
{
"epoch": 1.6498300145701799,
"grad_norm": 0.0,
"learning_rate": 9.336553192488398e-05,
"loss": 1.5472,
"step": 11890
},
{
"epoch": 1.651217650731978,
"grad_norm": 0.0,
"learning_rate": 9.321129738078853e-05,
"loss": 1.5131,
"step": 11900
},
{
"epoch": 1.651217650731978,
"eval_loss": 1.5553832054138184,
"eval_runtime": 867.5789,
"eval_samples_per_second": 14.768,
"eval_steps_per_second": 1.847,
"step": 11900
},
{
"epoch": 1.6526052868937766,
"grad_norm": 0.0,
"learning_rate": 9.305707905893263e-05,
"loss": 1.5538,
"step": 11910
},
{
"epoch": 1.6539929230555748,
"grad_norm": 0.0,
"learning_rate": 9.29028773278353e-05,
"loss": 1.4998,
"step": 11920
},
{
"epoch": 1.6553805592173734,
"grad_norm": 0.0,
"learning_rate": 9.274869255597603e-05,
"loss": 1.5737,
"step": 11930
},
{
"epoch": 1.6567681953791715,
"grad_norm": 0.0,
"learning_rate": 9.259452511179374e-05,
"loss": 1.5355,
"step": 11940
},
{
"epoch": 1.65815583154097,
"grad_norm": 0.0,
"learning_rate": 9.244037536368602e-05,
"loss": 1.508,
"step": 11950
},
{
"epoch": 1.6595434677027683,
"grad_norm": 0.0,
"learning_rate": 9.228624368000798e-05,
"loss": 1.4927,
"step": 11960
},
{
"epoch": 1.6609311038645667,
"grad_norm": 0.0,
"learning_rate": 9.213213042907176e-05,
"loss": 1.5265,
"step": 11970
},
{
"epoch": 1.662318740026365,
"grad_norm": 0.0,
"learning_rate": 9.197803597914541e-05,
"loss": 1.6099,
"step": 11980
},
{
"epoch": 1.6637063761881634,
"grad_norm": 0.0,
"learning_rate": 9.182396069845192e-05,
"loss": 1.4851,
"step": 11990
},
{
"epoch": 1.6650940123499618,
"grad_norm": 0.0,
"learning_rate": 9.166990495516866e-05,
"loss": 1.5752,
"step": 12000
},
{
"epoch": 1.6650940123499618,
"eval_loss": 1.5553832054138184,
"eval_runtime": 875.5831,
"eval_samples_per_second": 14.633,
"eval_steps_per_second": 1.83,
"step": 12000
},
{
"epoch": 1.6664816485117602,
"grad_norm": 0.0,
"learning_rate": 9.151586911742617e-05,
"loss": 1.5579,
"step": 12010
},
{
"epoch": 1.6678692846735585,
"grad_norm": 0.0,
"learning_rate": 9.136185355330759e-05,
"loss": 1.5998,
"step": 12020
},
{
"epoch": 1.669256920835357,
"grad_norm": 0.0,
"learning_rate": 9.120785863084738e-05,
"loss": 1.6114,
"step": 12030
},
{
"epoch": 1.6706445569971553,
"grad_norm": 0.0,
"learning_rate": 9.105388471803087e-05,
"loss": 1.5483,
"step": 12040
},
{
"epoch": 1.6720321931589537,
"grad_norm": 0.0,
"learning_rate": 9.08999321827931e-05,
"loss": 1.4476,
"step": 12050
},
{
"epoch": 1.673419829320752,
"grad_norm": 0.0,
"learning_rate": 9.07460013930181e-05,
"loss": 1.5994,
"step": 12060
},
{
"epoch": 1.6748074654825504,
"grad_norm": 0.0,
"learning_rate": 9.059209271653779e-05,
"loss": 1.5157,
"step": 12070
},
{
"epoch": 1.6761951016443488,
"grad_norm": 0.0,
"learning_rate": 9.043820652113138e-05,
"loss": 1.5291,
"step": 12080
},
{
"epoch": 1.6775827378061472,
"grad_norm": 0.0,
"learning_rate": 9.028434317452437e-05,
"loss": 1.5008,
"step": 12090
},
{
"epoch": 1.6789703739679456,
"grad_norm": 0.0,
"learning_rate": 9.013050304438751e-05,
"loss": 1.4394,
"step": 12100
},
{
"epoch": 1.6789703739679456,
"eval_loss": 1.5553832054138184,
"eval_runtime": 864.7788,
"eval_samples_per_second": 14.815,
"eval_steps_per_second": 1.852,
"step": 12100
},
{
"epoch": 1.680358010129744,
"grad_norm": 0.0,
"learning_rate": 8.997668649833623e-05,
"loss": 1.5515,
"step": 12110
},
{
"epoch": 1.6817456462915423,
"grad_norm": 0.0,
"learning_rate": 8.982289390392954e-05,
"loss": 1.5852,
"step": 12120
},
{
"epoch": 1.6831332824533407,
"grad_norm": 0.0,
"learning_rate": 8.966912562866926e-05,
"loss": 1.5582,
"step": 12130
},
{
"epoch": 1.684520918615139,
"grad_norm": 0.0,
"learning_rate": 8.951538203999897e-05,
"loss": 1.5402,
"step": 12140
},
{
"epoch": 1.6859085547769375,
"grad_norm": 0.0,
"learning_rate": 8.936166350530341e-05,
"loss": 1.5596,
"step": 12150
},
{
"epoch": 1.6872961909387358,
"grad_norm": 0.0,
"learning_rate": 8.920797039190736e-05,
"loss": 1.5621,
"step": 12160
},
{
"epoch": 1.6886838271005342,
"grad_norm": 0.0,
"learning_rate": 8.905430306707495e-05,
"loss": 1.5881,
"step": 12170
},
{
"epoch": 1.6900714632623326,
"grad_norm": 0.0,
"learning_rate": 8.890066189800851e-05,
"loss": 1.5164,
"step": 12180
},
{
"epoch": 1.691459099424131,
"grad_norm": 0.0,
"learning_rate": 8.874704725184803e-05,
"loss": 1.5392,
"step": 12190
},
{
"epoch": 1.6928467355859294,
"grad_norm": 0.0,
"learning_rate": 8.859345949567012e-05,
"loss": 1.5654,
"step": 12200
},
{
"epoch": 1.6928467355859294,
"eval_loss": 1.5553832054138184,
"eval_runtime": 880.6667,
"eval_samples_per_second": 14.548,
"eval_steps_per_second": 1.819,
"step": 12200
},
{
"epoch": 1.6942343717477277,
"grad_norm": 0.0,
"learning_rate": 8.843989899648697e-05,
"loss": 1.5485,
"step": 12210
},
{
"epoch": 1.695622007909526,
"grad_norm": 0.0,
"learning_rate": 8.82863661212458e-05,
"loss": 1.6134,
"step": 12220
},
{
"epoch": 1.6970096440713245,
"grad_norm": 0.0,
"learning_rate": 8.813286123682777e-05,
"loss": 1.5393,
"step": 12230
},
{
"epoch": 1.6983972802331229,
"grad_norm": 0.0,
"learning_rate": 8.797938471004722e-05,
"loss": 1.544,
"step": 12240
},
{
"epoch": 1.6997849163949212,
"grad_norm": 0.0,
"learning_rate": 8.782593690765054e-05,
"loss": 1.5545,
"step": 12250
},
{
"epoch": 1.7011725525567196,
"grad_norm": 0.0,
"learning_rate": 8.767251819631562e-05,
"loss": 1.5748,
"step": 12260
},
{
"epoch": 1.702560188718518,
"grad_norm": 0.0,
"learning_rate": 8.751912894265088e-05,
"loss": 1.4681,
"step": 12270
},
{
"epoch": 1.7039478248803164,
"grad_norm": 0.0,
"learning_rate": 8.736576951319424e-05,
"loss": 1.537,
"step": 12280
},
{
"epoch": 1.7053354610421148,
"grad_norm": 0.0,
"learning_rate": 8.721244027441238e-05,
"loss": 1.4676,
"step": 12290
},
{
"epoch": 1.7067230972039131,
"grad_norm": 0.0,
"learning_rate": 8.705914159269985e-05,
"loss": 1.5405,
"step": 12300
},
{
"epoch": 1.7067230972039131,
"eval_loss": 1.5553832054138184,
"eval_runtime": 872.2109,
"eval_samples_per_second": 14.689,
"eval_steps_per_second": 1.837,
"step": 12300
},
{
"epoch": 1.7081107333657115,
"grad_norm": 0.0,
"learning_rate": 8.690587383437825e-05,
"loss": 1.5012,
"step": 12310
},
{
"epoch": 1.7094983695275099,
"grad_norm": 0.0,
"learning_rate": 8.67526373656951e-05,
"loss": 1.5688,
"step": 12320
},
{
"epoch": 1.7108860056893083,
"grad_norm": 0.0,
"learning_rate": 8.659943255282333e-05,
"loss": 1.4852,
"step": 12330
},
{
"epoch": 1.7122736418511066,
"grad_norm": 0.0,
"learning_rate": 8.644625976186018e-05,
"loss": 1.5695,
"step": 12340
},
{
"epoch": 1.713661278012905,
"grad_norm": 0.0,
"learning_rate": 8.629311935882634e-05,
"loss": 1.5267,
"step": 12350
},
{
"epoch": 1.7150489141747034,
"grad_norm": 0.0,
"learning_rate": 8.614001170966508e-05,
"loss": 1.5443,
"step": 12360
},
{
"epoch": 1.7164365503365018,
"grad_norm": 0.0,
"learning_rate": 8.598693718024147e-05,
"loss": 1.5144,
"step": 12370
},
{
"epoch": 1.7178241864983002,
"grad_norm": 0.0,
"learning_rate": 8.583389613634142e-05,
"loss": 1.532,
"step": 12380
},
{
"epoch": 1.7192118226600985,
"grad_norm": 0.0,
"learning_rate": 8.56808889436708e-05,
"loss": 1.5548,
"step": 12390
},
{
"epoch": 1.720599458821897,
"grad_norm": 0.0,
"learning_rate": 8.552791596785458e-05,
"loss": 1.5301,
"step": 12400
},
{
"epoch": 1.720599458821897,
"eval_loss": 1.5553832054138184,
"eval_runtime": 874.1704,
"eval_samples_per_second": 14.656,
"eval_steps_per_second": 1.833,
"step": 12400
},
{
"epoch": 1.7219870949836953,
"grad_norm": 0.0,
"learning_rate": 8.5374977574436e-05,
"loss": 1.5205,
"step": 12410
},
{
"epoch": 1.7233747311454937,
"grad_norm": 0.0,
"learning_rate": 8.522207412887568e-05,
"loss": 1.564,
"step": 12420
},
{
"epoch": 1.724762367307292,
"grad_norm": 0.0,
"learning_rate": 8.506920599655064e-05,
"loss": 1.6104,
"step": 12430
},
{
"epoch": 1.7261500034690904,
"grad_norm": 0.0,
"learning_rate": 8.491637354275358e-05,
"loss": 1.5709,
"step": 12440
},
{
"epoch": 1.7275376396308888,
"grad_norm": 0.0,
"learning_rate": 8.4763577132692e-05,
"loss": 1.4994,
"step": 12450
},
{
"epoch": 1.7289252757926872,
"grad_norm": 0.0,
"learning_rate": 8.461081713148715e-05,
"loss": 1.5472,
"step": 12460
},
{
"epoch": 1.7303129119544856,
"grad_norm": 0.0,
"learning_rate": 8.445809390417332e-05,
"loss": 1.5493,
"step": 12470
},
{
"epoch": 1.731700548116284,
"grad_norm": 0.0,
"learning_rate": 8.430540781569696e-05,
"loss": 1.5698,
"step": 12480
},
{
"epoch": 1.7330881842780823,
"grad_norm": 0.0,
"learning_rate": 8.415275923091577e-05,
"loss": 1.6143,
"step": 12490
},
{
"epoch": 1.7344758204398807,
"grad_norm": 0.0,
"learning_rate": 8.400014851459779e-05,
"loss": 1.5465,
"step": 12500
},
{
"epoch": 1.7344758204398807,
"eval_loss": 1.5553832054138184,
"eval_runtime": 885.8145,
"eval_samples_per_second": 14.464,
"eval_steps_per_second": 1.809,
"step": 12500
},
{
"epoch": 1.735863456601679,
"grad_norm": 0.0,
"learning_rate": 8.384757603142059e-05,
"loss": 1.4982,
"step": 12510
},
{
"epoch": 1.7372510927634774,
"grad_norm": 0.0,
"learning_rate": 8.369504214597039e-05,
"loss": 1.5044,
"step": 12520
},
{
"epoch": 1.7386387289252758,
"grad_norm": 0.0,
"learning_rate": 8.354254722274117e-05,
"loss": 1.5543,
"step": 12530
},
{
"epoch": 1.7400263650870742,
"grad_norm": 0.0,
"learning_rate": 8.339009162613379e-05,
"loss": 1.5783,
"step": 12540
},
{
"epoch": 1.7414140012488726,
"grad_norm": 0.0,
"learning_rate": 8.323767572045515e-05,
"loss": 1.4921,
"step": 12550
},
{
"epoch": 1.742801637410671,
"grad_norm": 0.0,
"learning_rate": 8.308529986991736e-05,
"loss": 1.4773,
"step": 12560
},
{
"epoch": 1.7441892735724693,
"grad_norm": 0.0,
"learning_rate": 8.293296443863668e-05,
"loss": 1.5926,
"step": 12570
},
{
"epoch": 1.7455769097342677,
"grad_norm": 0.0,
"learning_rate": 8.27806697906329e-05,
"loss": 1.5717,
"step": 12580
},
{
"epoch": 1.746964545896066,
"grad_norm": 0.0,
"learning_rate": 8.262841628982833e-05,
"loss": 1.6031,
"step": 12590
},
{
"epoch": 1.7483521820578645,
"grad_norm": 0.0,
"learning_rate": 8.24762043000469e-05,
"loss": 1.5612,
"step": 12600
},
{
"epoch": 1.7483521820578645,
"eval_loss": 1.5553832054138184,
"eval_runtime": 882.2732,
"eval_samples_per_second": 14.522,
"eval_steps_per_second": 1.816,
"step": 12600
},
{
"epoch": 1.7497398182196628,
"grad_norm": 0.0,
"learning_rate": 8.232403418501344e-05,
"loss": 1.592,
"step": 12610
},
{
"epoch": 1.7511274543814612,
"grad_norm": 0.0,
"learning_rate": 8.217190630835262e-05,
"loss": 1.585,
"step": 12620
},
{
"epoch": 1.7525150905432596,
"grad_norm": 0.0,
"learning_rate": 8.201982103358826e-05,
"loss": 1.5278,
"step": 12630
},
{
"epoch": 1.753902726705058,
"grad_norm": 0.0,
"learning_rate": 8.186777872414233e-05,
"loss": 1.5234,
"step": 12640
},
{
"epoch": 1.7552903628668564,
"grad_norm": 0.0,
"learning_rate": 8.171577974333411e-05,
"loss": 1.51,
"step": 12650
},
{
"epoch": 1.7566779990286547,
"grad_norm": 0.0,
"learning_rate": 8.156382445437942e-05,
"loss": 1.5068,
"step": 12660
},
{
"epoch": 1.7580656351904531,
"grad_norm": 0.0,
"learning_rate": 8.141191322038958e-05,
"loss": 1.4468,
"step": 12670
},
{
"epoch": 1.7594532713522515,
"grad_norm": 0.0,
"learning_rate": 8.126004640437073e-05,
"loss": 1.5458,
"step": 12680
},
{
"epoch": 1.7608409075140499,
"grad_norm": 0.0,
"learning_rate": 8.11082243692228e-05,
"loss": 1.5736,
"step": 12690
},
{
"epoch": 1.7622285436758482,
"grad_norm": 0.0,
"learning_rate": 8.095644747773874e-05,
"loss": 1.554,
"step": 12700
},
{
"epoch": 1.7622285436758482,
"eval_loss": 1.5553832054138184,
"eval_runtime": 874.4978,
"eval_samples_per_second": 14.651,
"eval_steps_per_second": 1.832,
"step": 12700
},
{
"epoch": 1.7636161798376464,
"grad_norm": 0.0,
"learning_rate": 8.080471609260361e-05,
"loss": 1.5562,
"step": 12710
},
{
"epoch": 1.765003815999445,
"grad_norm": 0.0,
"learning_rate": 8.065303057639377e-05,
"loss": 1.5142,
"step": 12720
},
{
"epoch": 1.7663914521612432,
"grad_norm": 0.0,
"learning_rate": 8.050139129157592e-05,
"loss": 1.5234,
"step": 12730
},
{
"epoch": 1.7677790883230418,
"grad_norm": 0.0,
"learning_rate": 8.034979860050627e-05,
"loss": 1.5836,
"step": 12740
},
{
"epoch": 1.76916672448484,
"grad_norm": 0.0,
"learning_rate": 8.019825286542979e-05,
"loss": 1.5147,
"step": 12750
},
{
"epoch": 1.7705543606466385,
"grad_norm": 0.0,
"learning_rate": 8.004675444847914e-05,
"loss": 1.5478,
"step": 12760
},
{
"epoch": 1.7719419968084367,
"grad_norm": 0.0,
"learning_rate": 7.989530371167397e-05,
"loss": 1.5685,
"step": 12770
},
{
"epoch": 1.7733296329702353,
"grad_norm": 0.0,
"learning_rate": 7.974390101691993e-05,
"loss": 1.5897,
"step": 12780
},
{
"epoch": 1.7747172691320334,
"grad_norm": 0.0,
"learning_rate": 7.959254672600799e-05,
"loss": 1.559,
"step": 12790
},
{
"epoch": 1.776104905293832,
"grad_norm": 0.0,
"learning_rate": 7.944124120061332e-05,
"loss": 1.5369,
"step": 12800
},
{
"epoch": 1.776104905293832,
"eval_loss": 1.5553832054138184,
"eval_runtime": 875.101,
"eval_samples_per_second": 14.641,
"eval_steps_per_second": 1.831,
"step": 12800
},
{
"epoch": 1.7774925414556302,
"grad_norm": 0.0,
"learning_rate": 7.928998480229461e-05,
"loss": 1.5482,
"step": 12810
},
{
"epoch": 1.7788801776174288,
"grad_norm": 0.0,
"learning_rate": 7.913877789249319e-05,
"loss": 1.5226,
"step": 12820
},
{
"epoch": 1.780267813779227,
"grad_norm": 0.0,
"learning_rate": 7.89876208325321e-05,
"loss": 1.5313,
"step": 12830
},
{
"epoch": 1.7816554499410255,
"grad_norm": 0.0,
"learning_rate": 7.883651398361529e-05,
"loss": 1.542,
"step": 12840
},
{
"epoch": 1.7830430861028237,
"grad_norm": 0.0,
"learning_rate": 7.868545770682663e-05,
"loss": 1.5335,
"step": 12850
},
{
"epoch": 1.7844307222646223,
"grad_norm": 0.0,
"learning_rate": 7.853445236312931e-05,
"loss": 1.56,
"step": 12860
},
{
"epoch": 1.7858183584264204,
"grad_norm": 0.0,
"learning_rate": 7.838349831336461e-05,
"loss": 1.5989,
"step": 12870
},
{
"epoch": 1.787205994588219,
"grad_norm": 0.0,
"learning_rate": 7.823259591825144e-05,
"loss": 1.5369,
"step": 12880
},
{
"epoch": 1.7885936307500172,
"grad_norm": 0.0,
"learning_rate": 7.80817455383851e-05,
"loss": 1.6173,
"step": 12890
},
{
"epoch": 1.7899812669118158,
"grad_norm": 0.0,
"learning_rate": 7.793094753423674e-05,
"loss": 1.5219,
"step": 12900
},
{
"epoch": 1.7899812669118158,
"eval_loss": 1.5553832054138184,
"eval_runtime": 880.1196,
"eval_samples_per_second": 14.557,
"eval_steps_per_second": 1.82,
"step": 12900
},
{
"epoch": 1.791368903073614,
"grad_norm": 0.0,
"learning_rate": 7.778020226615225e-05,
"loss": 1.565,
"step": 12910
},
{
"epoch": 1.7927565392354126,
"grad_norm": 0.0,
"learning_rate": 7.762951009435154e-05,
"loss": 1.4944,
"step": 12920
},
{
"epoch": 1.7941441753972107,
"grad_norm": 0.0,
"learning_rate": 7.747887137892762e-05,
"loss": 1.5121,
"step": 12930
},
{
"epoch": 1.7955318115590093,
"grad_norm": 0.0,
"learning_rate": 7.732828647984586e-05,
"loss": 1.5269,
"step": 12940
},
{
"epoch": 1.7969194477208075,
"grad_norm": 0.0,
"learning_rate": 7.717775575694288e-05,
"loss": 1.5544,
"step": 12950
},
{
"epoch": 1.798307083882606,
"grad_norm": 0.0,
"learning_rate": 7.702727956992593e-05,
"loss": 1.475,
"step": 12960
},
{
"epoch": 1.7996947200444042,
"grad_norm": 0.0,
"learning_rate": 7.687685827837196e-05,
"loss": 1.5259,
"step": 12970
},
{
"epoch": 1.8010823562062028,
"grad_norm": 0.0,
"learning_rate": 7.672649224172667e-05,
"loss": 1.4519,
"step": 12980
},
{
"epoch": 1.802469992368001,
"grad_norm": 0.0,
"learning_rate": 7.657618181930375e-05,
"loss": 1.5886,
"step": 12990
},
{
"epoch": 1.8038576285297996,
"grad_norm": 0.0,
"learning_rate": 7.642592737028403e-05,
"loss": 1.6084,
"step": 13000
},
{
"epoch": 1.8038576285297996,
"eval_loss": 1.5553832054138184,
"eval_runtime": 877.6719,
"eval_samples_per_second": 14.598,
"eval_steps_per_second": 1.825,
"step": 13000
},
{
"epoch": 1.8052452646915977,
"grad_norm": 0.0,
"learning_rate": 7.627572925371458e-05,
"loss": 1.5702,
"step": 13010
},
{
"epoch": 1.8066329008533963,
"grad_norm": 0.0,
"learning_rate": 7.612558782850781e-05,
"loss": 1.6286,
"step": 13020
},
{
"epoch": 1.8080205370151945,
"grad_norm": 0.0,
"learning_rate": 7.597550345344068e-05,
"loss": 1.5058,
"step": 13030
},
{
"epoch": 1.809408173176993,
"grad_norm": 0.0,
"learning_rate": 7.582547648715385e-05,
"loss": 1.4792,
"step": 13040
},
{
"epoch": 1.8107958093387913,
"grad_norm": 0.0,
"learning_rate": 7.567550728815085e-05,
"loss": 1.5139,
"step": 13050
},
{
"epoch": 1.8121834455005899,
"grad_norm": 0.0,
"learning_rate": 7.552559621479697e-05,
"loss": 1.5075,
"step": 13060
},
{
"epoch": 1.813571081662388,
"grad_norm": 0.0,
"learning_rate": 7.53757436253188e-05,
"loss": 1.5765,
"step": 13070
},
{
"epoch": 1.8149587178241866,
"grad_norm": 0.0,
"learning_rate": 7.522594987780312e-05,
"loss": 1.5871,
"step": 13080
},
{
"epoch": 1.8163463539859848,
"grad_norm": 0.0,
"learning_rate": 7.50762153301961e-05,
"loss": 1.5039,
"step": 13090
},
{
"epoch": 1.8177339901477834,
"grad_norm": 0.0,
"learning_rate": 7.492654034030238e-05,
"loss": 1.5577,
"step": 13100
},
{
"epoch": 1.8177339901477834,
"eval_loss": 1.5553832054138184,
"eval_runtime": 874.1669,
"eval_samples_per_second": 14.656,
"eval_steps_per_second": 1.833,
"step": 13100
},
{
"epoch": 1.8191216263095815,
"grad_norm": 0.0,
"learning_rate": 7.477692526578439e-05,
"loss": 1.5594,
"step": 13110
},
{
"epoch": 1.8205092624713801,
"grad_norm": 0.0,
"learning_rate": 7.462737046416136e-05,
"loss": 1.5738,
"step": 13120
},
{
"epoch": 1.8218968986331783,
"grad_norm": 0.0,
"learning_rate": 7.44778762928084e-05,
"loss": 1.5344,
"step": 13130
},
{
"epoch": 1.8232845347949769,
"grad_norm": 0.0,
"learning_rate": 7.432844310895584e-05,
"loss": 1.5409,
"step": 13140
},
{
"epoch": 1.824672170956775,
"grad_norm": 0.0,
"learning_rate": 7.417907126968823e-05,
"loss": 1.5455,
"step": 13150
},
{
"epoch": 1.8260598071185736,
"grad_norm": 0.0,
"learning_rate": 7.40297611319436e-05,
"loss": 1.5883,
"step": 13160
},
{
"epoch": 1.8274474432803718,
"grad_norm": 0.0,
"learning_rate": 7.38805130525124e-05,
"loss": 1.5222,
"step": 13170
},
{
"epoch": 1.8288350794421704,
"grad_norm": 0.0,
"learning_rate": 7.373132738803692e-05,
"loss": 1.5567,
"step": 13180
},
{
"epoch": 1.8302227156039685,
"grad_norm": 0.0,
"learning_rate": 7.358220449501025e-05,
"loss": 1.5775,
"step": 13190
},
{
"epoch": 1.8316103517657671,
"grad_norm": 0.0,
"learning_rate": 7.343314472977545e-05,
"loss": 1.5057,
"step": 13200
},
{
"epoch": 1.8316103517657671,
"eval_loss": 1.5553832054138184,
"eval_runtime": 872.9713,
"eval_samples_per_second": 14.676,
"eval_steps_per_second": 1.835,
"step": 13200
},
{
"epoch": 1.8329979879275653,
"grad_norm": 0.0,
"learning_rate": 7.328414844852478e-05,
"loss": 1.5522,
"step": 13210
},
{
"epoch": 1.834385624089364,
"grad_norm": 0.0,
"learning_rate": 7.313521600729878e-05,
"loss": 1.5101,
"step": 13220
},
{
"epoch": 1.835773260251162,
"grad_norm": 0.0,
"learning_rate": 7.298634776198548e-05,
"loss": 1.5026,
"step": 13230
},
{
"epoch": 1.8371608964129607,
"grad_norm": 0.0,
"learning_rate": 7.28375440683194e-05,
"loss": 1.5102,
"step": 13240
},
{
"epoch": 1.8385485325747588,
"grad_norm": 0.0,
"learning_rate": 7.26888052818809e-05,
"loss": 1.5971,
"step": 13250
},
{
"epoch": 1.8399361687365574,
"grad_norm": 0.0,
"learning_rate": 7.254013175809523e-05,
"loss": 1.5755,
"step": 13260
},
{
"epoch": 1.8413238048983556,
"grad_norm": 0.0,
"learning_rate": 7.239152385223171e-05,
"loss": 1.6045,
"step": 13270
},
{
"epoch": 1.8427114410601542,
"grad_norm": 0.0,
"learning_rate": 7.224298191940272e-05,
"loss": 1.5351,
"step": 13280
},
{
"epoch": 1.8440990772219523,
"grad_norm": 0.0,
"learning_rate": 7.209450631456318e-05,
"loss": 1.552,
"step": 13290
},
{
"epoch": 1.845486713383751,
"grad_norm": 0.0,
"learning_rate": 7.194609739250944e-05,
"loss": 1.508,
"step": 13300
},
{
"epoch": 1.845486713383751,
"eval_loss": 1.5553832054138184,
"eval_runtime": 875.2085,
"eval_samples_per_second": 14.639,
"eval_steps_per_second": 1.83,
"step": 13300
},
{
"epoch": 1.846874349545549,
"grad_norm": 0.0,
"learning_rate": 7.17977555078784e-05,
"loss": 1.5756,
"step": 13310
},
{
"epoch": 1.8482619857073477,
"grad_norm": 0.0,
"learning_rate": 7.164948101514692e-05,
"loss": 1.5041,
"step": 13320
},
{
"epoch": 1.8496496218691458,
"grad_norm": 0.0,
"learning_rate": 7.150127426863076e-05,
"loss": 1.5462,
"step": 13330
},
{
"epoch": 1.8510372580309444,
"grad_norm": 0.0,
"learning_rate": 7.135313562248383e-05,
"loss": 1.5198,
"step": 13340
},
{
"epoch": 1.8524248941927426,
"grad_norm": 0.0,
"learning_rate": 7.120506543069718e-05,
"loss": 1.5542,
"step": 13350
},
{
"epoch": 1.8538125303545412,
"grad_norm": 0.0,
"learning_rate": 7.105706404709843e-05,
"loss": 1.5656,
"step": 13360
},
{
"epoch": 1.8552001665163393,
"grad_norm": 0.0,
"learning_rate": 7.090913182535073e-05,
"loss": 1.5231,
"step": 13370
},
{
"epoch": 1.856587802678138,
"grad_norm": 0.0,
"learning_rate": 7.076126911895197e-05,
"loss": 1.4914,
"step": 13380
},
{
"epoch": 1.857975438839936,
"grad_norm": 0.0,
"learning_rate": 7.061347628123385e-05,
"loss": 1.6405,
"step": 13390
},
{
"epoch": 1.8593630750017347,
"grad_norm": 0.0,
"learning_rate": 7.04657536653612e-05,
"loss": 1.5911,
"step": 13400
},
{
"epoch": 1.8593630750017347,
"eval_loss": 1.5553832054138184,
"eval_runtime": 876.3422,
"eval_samples_per_second": 14.62,
"eval_steps_per_second": 1.828,
"step": 13400
},
{
"epoch": 1.8607507111635329,
"grad_norm": 0.0,
"learning_rate": 7.031810162433106e-05,
"loss": 1.4791,
"step": 13410
},
{
"epoch": 1.8621383473253315,
"grad_norm": 0.0,
"learning_rate": 7.01705205109717e-05,
"loss": 1.5813,
"step": 13420
},
{
"epoch": 1.8635259834871296,
"grad_norm": 0.0,
"learning_rate": 7.002301067794204e-05,
"loss": 1.5423,
"step": 13430
},
{
"epoch": 1.8649136196489282,
"grad_norm": 0.0,
"learning_rate": 6.98755724777306e-05,
"loss": 1.5634,
"step": 13440
},
{
"epoch": 1.8663012558107264,
"grad_norm": 0.0,
"learning_rate": 6.97282062626548e-05,
"loss": 1.4679,
"step": 13450
},
{
"epoch": 1.867688891972525,
"grad_norm": 0.0,
"learning_rate": 6.958091238485989e-05,
"loss": 1.5489,
"step": 13460
},
{
"epoch": 1.8690765281343231,
"grad_norm": 0.0,
"learning_rate": 6.943369119631841e-05,
"loss": 1.4743,
"step": 13470
},
{
"epoch": 1.8704641642961217,
"grad_norm": 0.0,
"learning_rate": 6.928654304882916e-05,
"loss": 1.5471,
"step": 13480
},
{
"epoch": 1.8718518004579199,
"grad_norm": 0.0,
"learning_rate": 6.91394682940164e-05,
"loss": 1.5452,
"step": 13490
},
{
"epoch": 1.8732394366197183,
"grad_norm": 0.0,
"learning_rate": 6.899246728332895e-05,
"loss": 1.5628,
"step": 13500
},
{
"epoch": 1.8732394366197183,
"eval_loss": 1.5553832054138184,
"eval_runtime": 878.2498,
"eval_samples_per_second": 14.588,
"eval_steps_per_second": 1.824,
"step": 13500
},
{
"epoch": 1.8746270727815166,
"grad_norm": 0.0,
"learning_rate": 6.884554036803952e-05,
"loss": 1.5329,
"step": 13510
},
{
"epoch": 1.876014708943315,
"grad_norm": 0.0,
"learning_rate": 6.869868789924372e-05,
"loss": 1.5281,
"step": 13520
},
{
"epoch": 1.8774023451051134,
"grad_norm": 0.0,
"learning_rate": 6.855191022785918e-05,
"loss": 1.5502,
"step": 13530
},
{
"epoch": 1.8787899812669118,
"grad_norm": 0.0,
"learning_rate": 6.840520770462494e-05,
"loss": 1.4811,
"step": 13540
},
{
"epoch": 1.8801776174287101,
"grad_norm": 0.0,
"learning_rate": 6.825858068010034e-05,
"loss": 1.5324,
"step": 13550
},
{
"epoch": 1.8815652535905085,
"grad_norm": 0.0,
"learning_rate": 6.811202950466442e-05,
"loss": 1.5537,
"step": 13560
},
{
"epoch": 1.882952889752307,
"grad_norm": 0.0,
"learning_rate": 6.796555452851485e-05,
"loss": 1.4715,
"step": 13570
},
{
"epoch": 1.8843405259141053,
"grad_norm": 0.0,
"learning_rate": 6.781915610166731e-05,
"loss": 1.5999,
"step": 13580
},
{
"epoch": 1.8857281620759037,
"grad_norm": 0.0,
"learning_rate": 6.767283457395453e-05,
"loss": 1.5633,
"step": 13590
},
{
"epoch": 1.887115798237702,
"grad_norm": 0.0,
"learning_rate": 6.752659029502548e-05,
"loss": 1.4866,
"step": 13600
},
{
"epoch": 1.887115798237702,
"eval_loss": 1.5553832054138184,
"eval_runtime": 884.2082,
"eval_samples_per_second": 14.49,
"eval_steps_per_second": 1.812,
"step": 13600
},
{
"epoch": 1.8885034343995004,
"grad_norm": 0.0,
"learning_rate": 6.738042361434451e-05,
"loss": 1.5527,
"step": 13610
},
{
"epoch": 1.8898910705612988,
"grad_norm": 0.0,
"learning_rate": 6.72343348811906e-05,
"loss": 1.5674,
"step": 13620
},
{
"epoch": 1.8912787067230972,
"grad_norm": 0.0,
"learning_rate": 6.708832444465644e-05,
"loss": 1.573,
"step": 13630
},
{
"epoch": 1.8926663428848955,
"grad_norm": 0.0,
"learning_rate": 6.694239265364756e-05,
"loss": 1.5562,
"step": 13640
},
{
"epoch": 1.894053979046694,
"grad_norm": 0.0,
"learning_rate": 6.679653985688165e-05,
"loss": 1.6307,
"step": 13650
},
{
"epoch": 1.8954416152084923,
"grad_norm": 0.0,
"learning_rate": 6.665076640288761e-05,
"loss": 1.5187,
"step": 13660
},
{
"epoch": 1.8968292513702907,
"grad_norm": 0.0,
"learning_rate": 6.650507264000476e-05,
"loss": 1.5494,
"step": 13670
},
{
"epoch": 1.898216887532089,
"grad_norm": 0.0,
"learning_rate": 6.63594589163819e-05,
"loss": 1.506,
"step": 13680
},
{
"epoch": 1.8996045236938874,
"grad_norm": 0.0,
"learning_rate": 6.621392557997667e-05,
"loss": 1.5752,
"step": 13690
},
{
"epoch": 1.9009921598556858,
"grad_norm": 0.0,
"learning_rate": 6.606847297855459e-05,
"loss": 1.5399,
"step": 13700
},
{
"epoch": 1.9009921598556858,
"eval_loss": 1.5553832054138184,
"eval_runtime": 887.0642,
"eval_samples_per_second": 14.443,
"eval_steps_per_second": 1.806,
"step": 13700
},
{
"epoch": 1.9023797960174842,
"grad_norm": 0.0,
"learning_rate": 6.592310145968828e-05,
"loss": 1.5826,
"step": 13710
},
{
"epoch": 1.9037674321792826,
"grad_norm": 0.0,
"learning_rate": 6.577781137075647e-05,
"loss": 1.5534,
"step": 13720
},
{
"epoch": 1.905155068341081,
"grad_norm": 0.0,
"learning_rate": 6.563260305894349e-05,
"loss": 1.665,
"step": 13730
},
{
"epoch": 1.9065427045028793,
"grad_norm": 0.0,
"learning_rate": 6.54874768712382e-05,
"loss": 1.4964,
"step": 13740
},
{
"epoch": 1.9079303406646777,
"grad_norm": 0.0,
"learning_rate": 6.534243315443311e-05,
"loss": 1.5646,
"step": 13750
},
{
"epoch": 1.909317976826476,
"grad_norm": 0.0,
"learning_rate": 6.519747225512377e-05,
"loss": 1.5572,
"step": 13760
},
{
"epoch": 1.9107056129882745,
"grad_norm": 0.0,
"learning_rate": 6.505259451970782e-05,
"loss": 1.5625,
"step": 13770
},
{
"epoch": 1.9120932491500728,
"grad_norm": 0.0,
"learning_rate": 6.490780029438417e-05,
"loss": 1.4955,
"step": 13780
},
{
"epoch": 1.9134808853118712,
"grad_norm": 0.0,
"learning_rate": 6.47630899251521e-05,
"loss": 1.5368,
"step": 13790
},
{
"epoch": 1.9148685214736696,
"grad_norm": 0.0,
"learning_rate": 6.461846375781058e-05,
"loss": 1.494,
"step": 13800
},
{
"epoch": 1.9148685214736696,
"eval_loss": 1.5553832054138184,
"eval_runtime": 922.4284,
"eval_samples_per_second": 13.889,
"eval_steps_per_second": 1.737,
"step": 13800
},
{
"epoch": 1.916256157635468,
"grad_norm": 0.0,
"learning_rate": 6.447392213795737e-05,
"loss": 1.5269,
"step": 13810
},
{
"epoch": 1.9176437937972663,
"grad_norm": 0.0,
"learning_rate": 6.432946541098823e-05,
"loss": 1.5747,
"step": 13820
},
{
"epoch": 1.9190314299590647,
"grad_norm": 0.0,
"learning_rate": 6.418509392209593e-05,
"loss": 1.5638,
"step": 13830
},
{
"epoch": 1.920419066120863,
"grad_norm": 0.0,
"learning_rate": 6.404080801626966e-05,
"loss": 1.556,
"step": 13840
},
{
"epoch": 1.9218067022826615,
"grad_norm": 0.0,
"learning_rate": 6.389660803829414e-05,
"loss": 1.5173,
"step": 13850
},
{
"epoch": 1.9231943384444599,
"grad_norm": 0.0,
"learning_rate": 6.375249433274861e-05,
"loss": 1.5467,
"step": 13860
},
{
"epoch": 1.9245819746062582,
"grad_norm": 0.0,
"learning_rate": 6.360846724400628e-05,
"loss": 1.6175,
"step": 13870
},
{
"epoch": 1.9259696107680566,
"grad_norm": 0.0,
"learning_rate": 6.346452711623334e-05,
"loss": 1.5365,
"step": 13880
},
{
"epoch": 1.927357246929855,
"grad_norm": 0.0,
"learning_rate": 6.332067429338824e-05,
"loss": 1.5158,
"step": 13890
},
{
"epoch": 1.9287448830916534,
"grad_norm": 0.0,
"learning_rate": 6.317690911922063e-05,
"loss": 1.5795,
"step": 13900
},
{
"epoch": 1.9287448830916534,
"eval_loss": 1.5553832054138184,
"eval_runtime": 925.1585,
"eval_samples_per_second": 13.848,
"eval_steps_per_second": 1.732,
"step": 13900
},
{
"epoch": 1.9301325192534518,
"grad_norm": 0.0,
"learning_rate": 6.30332319372709e-05,
"loss": 1.5182,
"step": 13910
},
{
"epoch": 1.9315201554152501,
"grad_norm": 0.0,
"learning_rate": 6.28896430908691e-05,
"loss": 1.5741,
"step": 13920
},
{
"epoch": 1.9329077915770485,
"grad_norm": 0.0,
"learning_rate": 6.274614292313425e-05,
"loss": 1.5026,
"step": 13930
},
{
"epoch": 1.9342954277388469,
"grad_norm": 0.0,
"learning_rate": 6.260273177697334e-05,
"loss": 1.5389,
"step": 13940
},
{
"epoch": 1.9356830639006453,
"grad_norm": 0.0,
"learning_rate": 6.245940999508071e-05,
"loss": 1.5421,
"step": 13950
},
{
"epoch": 1.9370707000624436,
"grad_norm": 0.0,
"learning_rate": 6.231617791993724e-05,
"loss": 1.5632,
"step": 13960
},
{
"epoch": 1.938458336224242,
"grad_norm": 0.0,
"learning_rate": 6.217303589380925e-05,
"loss": 1.5596,
"step": 13970
},
{
"epoch": 1.9398459723860404,
"grad_norm": 0.0,
"learning_rate": 6.202998425874806e-05,
"loss": 1.5452,
"step": 13980
},
{
"epoch": 1.9412336085478388,
"grad_norm": 0.0,
"learning_rate": 6.188702335658892e-05,
"loss": 1.4568,
"step": 13990
},
{
"epoch": 1.9426212447096372,
"grad_norm": 0.0,
"learning_rate": 6.174415352895029e-05,
"loss": 1.5651,
"step": 14000
},
{
"epoch": 1.9426212447096372,
"eval_loss": 1.5553832054138184,
"eval_runtime": 935.3872,
"eval_samples_per_second": 13.697,
"eval_steps_per_second": 1.713,
"step": 14000
},
{
"epoch": 1.9440088808714355,
"grad_norm": 0.0,
"learning_rate": 6.160137511723291e-05,
"loss": 1.541,
"step": 14010
},
{
"epoch": 1.945396517033234,
"grad_norm": 0.0,
"learning_rate": 6.14586884626192e-05,
"loss": 1.6313,
"step": 14020
},
{
"epoch": 1.9467841531950323,
"grad_norm": 0.0,
"learning_rate": 6.131609390607223e-05,
"loss": 1.5859,
"step": 14030
},
{
"epoch": 1.9481717893568307,
"grad_norm": 0.0,
"learning_rate": 6.117359178833508e-05,
"loss": 1.5797,
"step": 14040
},
{
"epoch": 1.949559425518629,
"grad_norm": 0.0,
"learning_rate": 6.103118244992978e-05,
"loss": 1.5444,
"step": 14050
},
{
"epoch": 1.9509470616804274,
"grad_norm": 0.0,
"learning_rate": 6.0888866231156836e-05,
"loss": 1.4658,
"step": 14060
},
{
"epoch": 1.9523346978422258,
"grad_norm": 0.0,
"learning_rate": 6.0746643472094155e-05,
"loss": 1.606,
"step": 14070
},
{
"epoch": 1.9537223340040242,
"grad_norm": 0.0,
"learning_rate": 6.060451451259627e-05,
"loss": 1.4978,
"step": 14080
},
{
"epoch": 1.9551099701658226,
"grad_norm": 0.0,
"learning_rate": 6.0462479692293616e-05,
"loss": 1.5408,
"step": 14090
},
{
"epoch": 1.956497606327621,
"grad_norm": 0.0,
"learning_rate": 6.032053935059172e-05,
"loss": 1.5434,
"step": 14100
},
{
"epoch": 1.956497606327621,
"eval_loss": 1.5553832054138184,
"eval_runtime": 871.9972,
"eval_samples_per_second": 14.693,
"eval_steps_per_second": 1.837,
"step": 14100
},
{
"epoch": 1.9578852424894193,
"grad_norm": 0.0,
"learning_rate": 6.01786938266703e-05,
"loss": 1.4944,
"step": 14110
},
{
"epoch": 1.9592728786512177,
"grad_norm": 0.0,
"learning_rate": 6.003694345948243e-05,
"loss": 1.61,
"step": 14120
},
{
"epoch": 1.960660514813016,
"grad_norm": 0.0,
"learning_rate": 5.989528858775391e-05,
"loss": 1.5269,
"step": 14130
},
{
"epoch": 1.9620481509748144,
"grad_norm": 0.0,
"learning_rate": 5.975372954998228e-05,
"loss": 1.5771,
"step": 14140
},
{
"epoch": 1.9634357871366128,
"grad_norm": 0.0,
"learning_rate": 5.9612266684436136e-05,
"loss": 1.5687,
"step": 14150
},
{
"epoch": 1.9648234232984112,
"grad_norm": 0.0,
"learning_rate": 5.9470900329154164e-05,
"loss": 1.533,
"step": 14160
},
{
"epoch": 1.9662110594602096,
"grad_norm": 0.0,
"learning_rate": 5.932963082194449e-05,
"loss": 1.5323,
"step": 14170
},
{
"epoch": 1.967598695622008,
"grad_norm": 0.0,
"learning_rate": 5.918845850038388e-05,
"loss": 1.556,
"step": 14180
},
{
"epoch": 1.9689863317838063,
"grad_norm": 0.0,
"learning_rate": 5.9047383701816684e-05,
"loss": 1.5338,
"step": 14190
},
{
"epoch": 1.9703739679456047,
"grad_norm": 0.0,
"learning_rate": 5.890640676335439e-05,
"loss": 1.557,
"step": 14200
},
{
"epoch": 1.9703739679456047,
"eval_loss": 1.5553832054138184,
"eval_runtime": 872.3008,
"eval_samples_per_second": 14.688,
"eval_steps_per_second": 1.837,
"step": 14200
},
{
"epoch": 1.971761604107403,
"grad_norm": 0.0,
"learning_rate": 5.876552802187454e-05,
"loss": 1.551,
"step": 14210
},
{
"epoch": 1.9731492402692015,
"grad_norm": 0.0,
"learning_rate": 5.862474781402012e-05,
"loss": 1.4758,
"step": 14220
},
{
"epoch": 1.9745368764309998,
"grad_norm": 0.0,
"learning_rate": 5.8484066476198506e-05,
"loss": 1.5223,
"step": 14230
},
{
"epoch": 1.9759245125927982,
"grad_norm": 0.0,
"learning_rate": 5.834348434458097e-05,
"loss": 1.5176,
"step": 14240
},
{
"epoch": 1.9773121487545966,
"grad_norm": 0.0,
"learning_rate": 5.8203001755101616e-05,
"loss": 1.5878,
"step": 14250
},
{
"epoch": 1.9786997849163948,
"grad_norm": 0.0,
"learning_rate": 5.8062619043456775e-05,
"loss": 1.5916,
"step": 14260
},
{
"epoch": 1.9800874210781934,
"grad_norm": 0.0,
"learning_rate": 5.792233654510399e-05,
"loss": 1.6007,
"step": 14270
},
{
"epoch": 1.9814750572399915,
"grad_norm": 0.0,
"learning_rate": 5.778215459526145e-05,
"loss": 1.6015,
"step": 14280
},
{
"epoch": 1.98286269340179,
"grad_norm": 0.0,
"learning_rate": 5.764207352890702e-05,
"loss": 1.624,
"step": 14290
},
{
"epoch": 1.9842503295635883,
"grad_norm": 0.0,
"learning_rate": 5.750209368077754e-05,
"loss": 1.5387,
"step": 14300
},
{
"epoch": 1.9842503295635883,
"eval_loss": 1.5553832054138184,
"eval_runtime": 896.0816,
"eval_samples_per_second": 14.298,
"eval_steps_per_second": 1.788,
"step": 14300
},
{
"epoch": 1.9856379657253869,
"grad_norm": 0.0,
"learning_rate": 5.736221538536786e-05,
"loss": 1.5593,
"step": 14310
},
{
"epoch": 1.987025601887185,
"grad_norm": 0.0,
"learning_rate": 5.7222438976930295e-05,
"loss": 1.5958,
"step": 14320
},
{
"epoch": 1.9884132380489836,
"grad_norm": 0.0,
"learning_rate": 5.708276478947362e-05,
"loss": 1.5773,
"step": 14330
},
{
"epoch": 1.9898008742107818,
"grad_norm": 0.0,
"learning_rate": 5.694319315676242e-05,
"loss": 1.5789,
"step": 14340
},
{
"epoch": 1.9911885103725804,
"grad_norm": 0.0,
"learning_rate": 5.6803724412316074e-05,
"loss": 1.5386,
"step": 14350
},
{
"epoch": 1.9925761465343785,
"grad_norm": 0.0,
"learning_rate": 5.666435888940822e-05,
"loss": 1.6013,
"step": 14360
},
{
"epoch": 1.9939637826961771,
"grad_norm": 0.0,
"learning_rate": 5.6525096921065844e-05,
"loss": 1.5647,
"step": 14370
},
{
"epoch": 1.9953514188579753,
"grad_norm": 0.0,
"learning_rate": 5.6385938840068374e-05,
"loss": 1.6151,
"step": 14380
},
{
"epoch": 1.996739055019774,
"grad_norm": 0.0,
"learning_rate": 5.624688497894708e-05,
"loss": 1.5482,
"step": 14390
},
{
"epoch": 1.998126691181572,
"grad_norm": 0.0,
"learning_rate": 5.610793566998414e-05,
"loss": 1.5079,
"step": 14400
},
{
"epoch": 1.998126691181572,
"eval_loss": 1.5553832054138184,
"eval_runtime": 887.1687,
"eval_samples_per_second": 14.441,
"eval_steps_per_second": 1.806,
"step": 14400
},
{
"epoch": 1.9995143273433706,
"grad_norm": 0.0,
"learning_rate": 5.5969091245211994e-05,
"loss": 1.474,
"step": 14410
},
{
"epoch": 2.0009713453132587,
"grad_norm": 0.0,
"learning_rate": 5.583035203641227e-05,
"loss": 1.6786,
"step": 14420
},
{
"epoch": 2.0023589814750573,
"grad_norm": 0.0,
"learning_rate": 5.5691718375115334e-05,
"loss": 1.632,
"step": 14430
},
{
"epoch": 2.0037466176368555,
"grad_norm": 0.0,
"learning_rate": 5.5553190592599295e-05,
"loss": 1.5874,
"step": 14440
},
{
"epoch": 2.005134253798654,
"grad_norm": 0.0,
"learning_rate": 5.541476901988918e-05,
"loss": 1.5561,
"step": 14450
},
{
"epoch": 2.006521889960452,
"grad_norm": 0.0,
"learning_rate": 5.5276453987756314e-05,
"loss": 1.575,
"step": 14460
},
{
"epoch": 2.007909526122251,
"grad_norm": 0.0,
"learning_rate": 5.5138245826717394e-05,
"loss": 1.493,
"step": 14470
},
{
"epoch": 2.009297162284049,
"grad_norm": 0.0,
"learning_rate": 5.5000144867033776e-05,
"loss": 1.6124,
"step": 14480
},
{
"epoch": 2.0106847984458476,
"grad_norm": 0.0,
"learning_rate": 5.486215143871053e-05,
"loss": 1.5919,
"step": 14490
},
{
"epoch": 2.0120724346076457,
"grad_norm": 0.0,
"learning_rate": 5.472426587149591e-05,
"loss": 1.5504,
"step": 14500
},
{
"epoch": 2.0120724346076457,
"eval_loss": 1.5553832054138184,
"eval_runtime": 881.8543,
"eval_samples_per_second": 14.528,
"eval_steps_per_second": 1.817,
"step": 14500
},
{
"epoch": 2.0134600707694443,
"grad_norm": 0.0,
"learning_rate": 5.458648849488037e-05,
"loss": 1.4898,
"step": 14510
},
{
"epoch": 2.0148477069312425,
"grad_norm": 0.0,
"learning_rate": 5.44488196380958e-05,
"loss": 1.515,
"step": 14520
},
{
"epoch": 2.016235343093041,
"grad_norm": 0.0,
"learning_rate": 5.431125963011481e-05,
"loss": 1.5875,
"step": 14530
},
{
"epoch": 2.0176229792548392,
"grad_norm": 0.0,
"learning_rate": 5.4173808799649905e-05,
"loss": 1.6191,
"step": 14540
},
{
"epoch": 2.019010615416638,
"grad_norm": 0.0,
"learning_rate": 5.403646747515274e-05,
"loss": 1.5824,
"step": 14550
},
{
"epoch": 2.020398251578436,
"grad_norm": 0.0,
"learning_rate": 5.3899235984813166e-05,
"loss": 1.5268,
"step": 14560
},
{
"epoch": 2.0217858877402346,
"grad_norm": 0.0,
"learning_rate": 5.376211465655871e-05,
"loss": 1.4775,
"step": 14570
},
{
"epoch": 2.0231735239020328,
"grad_norm": 0.0,
"learning_rate": 5.362510381805357e-05,
"loss": 1.5118,
"step": 14580
},
{
"epoch": 2.0245611600638314,
"grad_norm": 0.0,
"learning_rate": 5.348820379669801e-05,
"loss": 1.58,
"step": 14590
},
{
"epoch": 2.0259487962256295,
"grad_norm": 0.0,
"learning_rate": 5.335141491962736e-05,
"loss": 1.5622,
"step": 14600
},
{
"epoch": 2.0259487962256295,
"eval_loss": 1.5553832054138184,
"eval_runtime": 886.1254,
"eval_samples_per_second": 14.458,
"eval_steps_per_second": 1.808,
"step": 14600
},
{
"epoch": 2.027336432387428,
"grad_norm": 0.0,
"learning_rate": 5.321473751371147e-05,
"loss": 1.4896,
"step": 14610
},
{
"epoch": 2.0287240685492263,
"grad_norm": 0.0,
"learning_rate": 5.30781719055538e-05,
"loss": 1.554,
"step": 14620
},
{
"epoch": 2.030111704711025,
"grad_norm": 0.0,
"learning_rate": 5.294171842149056e-05,
"loss": 1.4881,
"step": 14630
},
{
"epoch": 2.031499340872823,
"grad_norm": 0.0,
"learning_rate": 5.280537738759015e-05,
"loss": 1.5954,
"step": 14640
},
{
"epoch": 2.0328869770346216,
"grad_norm": 0.0,
"learning_rate": 5.266914912965222e-05,
"loss": 1.5027,
"step": 14650
},
{
"epoch": 2.03427461319642,
"grad_norm": 0.0,
"learning_rate": 5.2533033973206945e-05,
"loss": 1.5843,
"step": 14660
},
{
"epoch": 2.0356622493582184,
"grad_norm": 0.0,
"learning_rate": 5.2397032243514174e-05,
"loss": 1.5423,
"step": 14670
},
{
"epoch": 2.0370498855200165,
"grad_norm": 0.0,
"learning_rate": 5.2261144265562766e-05,
"loss": 1.5753,
"step": 14680
},
{
"epoch": 2.038437521681815,
"grad_norm": 0.0,
"learning_rate": 5.212537036406975e-05,
"loss": 1.5654,
"step": 14690
},
{
"epoch": 2.0398251578436133,
"grad_norm": 0.0,
"learning_rate": 5.19897108634796e-05,
"loss": 1.5264,
"step": 14700
},
{
"epoch": 2.0398251578436133,
"eval_loss": 1.5553832054138184,
"eval_runtime": 934.2204,
"eval_samples_per_second": 13.714,
"eval_steps_per_second": 1.715,
"step": 14700
},
{
"epoch": 2.041212794005412,
"grad_norm": 0.0,
"learning_rate": 5.18541660879633e-05,
"loss": 1.5304,
"step": 14710
},
{
"epoch": 2.04260043016721,
"grad_norm": 0.0,
"learning_rate": 5.1718736361417786e-05,
"loss": 1.5799,
"step": 14720
},
{
"epoch": 2.0439880663290086,
"grad_norm": 0.0,
"learning_rate": 5.158342200746511e-05,
"loss": 1.507,
"step": 14730
},
{
"epoch": 2.045375702490807,
"grad_norm": 0.0,
"learning_rate": 5.144822334945146e-05,
"loss": 1.6232,
"step": 14740
},
{
"epoch": 2.0467633386526054,
"grad_norm": 0.0,
"learning_rate": 5.131314071044675e-05,
"loss": 1.62,
"step": 14750
},
{
"epoch": 2.0481509748144036,
"grad_norm": 0.0,
"learning_rate": 5.117817441324353e-05,
"loss": 1.4876,
"step": 14760
},
{
"epoch": 2.049538610976202,
"grad_norm": 0.0,
"learning_rate": 5.104332478035645e-05,
"loss": 1.6082,
"step": 14770
},
{
"epoch": 2.0509262471380003,
"grad_norm": 0.0,
"learning_rate": 5.090859213402124e-05,
"loss": 1.4934,
"step": 14780
},
{
"epoch": 2.052313883299799,
"grad_norm": 0.0,
"learning_rate": 5.077397679619416e-05,
"loss": 1.6092,
"step": 14790
},
{
"epoch": 2.053701519461597,
"grad_norm": 0.0,
"learning_rate": 5.063947908855118e-05,
"loss": 1.5959,
"step": 14800
},
{
"epoch": 2.053701519461597,
"eval_loss": 1.5553832054138184,
"eval_runtime": 935.4436,
"eval_samples_per_second": 13.696,
"eval_steps_per_second": 1.713,
"step": 14800
},
{
"epoch": 2.0550891556233957,
"grad_norm": 0.0,
"learning_rate": 5.0505099332487146e-05,
"loss": 1.5665,
"step": 14810
},
{
"epoch": 2.056476791785194,
"grad_norm": 0.0,
"learning_rate": 5.037083784911502e-05,
"loss": 1.58,
"step": 14820
},
{
"epoch": 2.0578644279469924,
"grad_norm": 0.0,
"learning_rate": 5.023669495926516e-05,
"loss": 1.5067,
"step": 14830
},
{
"epoch": 2.0592520641087906,
"grad_norm": 0.0,
"learning_rate": 5.0102670983484604e-05,
"loss": 1.5376,
"step": 14840
},
{
"epoch": 2.060639700270589,
"grad_norm": 0.0,
"learning_rate": 4.996876624203608e-05,
"loss": 1.5834,
"step": 14850
},
{
"epoch": 2.0620273364323873,
"grad_norm": 0.0,
"learning_rate": 4.9834981054897535e-05,
"loss": 1.5117,
"step": 14860
},
{
"epoch": 2.063414972594186,
"grad_norm": 0.0,
"learning_rate": 4.970131574176117e-05,
"loss": 1.5067,
"step": 14870
},
{
"epoch": 2.064802608755984,
"grad_norm": 0.0,
"learning_rate": 4.956777062203278e-05,
"loss": 1.5413,
"step": 14880
},
{
"epoch": 2.0661902449177827,
"grad_norm": 0.0,
"learning_rate": 4.943434601483087e-05,
"loss": 1.5114,
"step": 14890
},
{
"epoch": 2.067577881079581,
"grad_norm": 0.0,
"learning_rate": 4.9301042238986005e-05,
"loss": 1.5587,
"step": 14900
},
{
"epoch": 2.067577881079581,
"eval_loss": 1.5553832054138184,
"eval_runtime": 877.1569,
"eval_samples_per_second": 14.606,
"eval_steps_per_second": 1.826,
"step": 14900
},
{
"epoch": 2.0689655172413794,
"grad_norm": 0.0,
"learning_rate": 4.916785961304008e-05,
"loss": 1.5112,
"step": 14910
},
{
"epoch": 2.0703531534031776,
"grad_norm": 0.0,
"learning_rate": 4.903479845524535e-05,
"loss": 1.5256,
"step": 14920
},
{
"epoch": 2.071740789564976,
"grad_norm": 0.0,
"learning_rate": 4.890185908356393e-05,
"loss": 1.5042,
"step": 14930
},
{
"epoch": 2.0731284257267744,
"grad_norm": 0.0,
"learning_rate": 4.876904181566686e-05,
"loss": 1.5899,
"step": 14940
},
{
"epoch": 2.074516061888573,
"grad_norm": 0.0,
"learning_rate": 4.863634696893349e-05,
"loss": 1.5495,
"step": 14950
},
{
"epoch": 2.075903698050371,
"grad_norm": 0.0,
"learning_rate": 4.850377486045045e-05,
"loss": 1.5809,
"step": 14960
},
{
"epoch": 2.0772913342121697,
"grad_norm": 0.0,
"learning_rate": 4.8371325807011234e-05,
"loss": 1.6356,
"step": 14970
},
{
"epoch": 2.078678970373968,
"grad_norm": 0.0,
"learning_rate": 4.823900012511524e-05,
"loss": 1.5579,
"step": 14980
},
{
"epoch": 2.0800666065357665,
"grad_norm": 0.0,
"learning_rate": 4.81067981309671e-05,
"loss": 1.5337,
"step": 14990
},
{
"epoch": 2.0814542426975646,
"grad_norm": 0.0,
"learning_rate": 4.797472014047576e-05,
"loss": 1.5839,
"step": 15000
},
{
"epoch": 2.0814542426975646,
"eval_loss": 1.5553832054138184,
"eval_runtime": 879.4065,
"eval_samples_per_second": 14.569,
"eval_steps_per_second": 1.822,
"step": 15000
},
{
"epoch": 2.0827031152431834,
"grad_norm": 0.0,
"learning_rate": 4.7842766469253945e-05,
"loss": 1.5855,
"step": 15010
},
{
"epoch": 2.0840907514049816,
"grad_norm": 0.0,
"learning_rate": 4.771093743261734e-05,
"loss": 1.6072,
"step": 15020
},
{
"epoch": 2.08547838756678,
"grad_norm": 0.0,
"learning_rate": 4.757923334558367e-05,
"loss": 1.5583,
"step": 15030
},
{
"epoch": 2.0868660237285783,
"grad_norm": 0.0,
"learning_rate": 4.744765452287221e-05,
"loss": 1.5175,
"step": 15040
},
{
"epoch": 2.088253659890377,
"grad_norm": 0.0,
"learning_rate": 4.731620127890284e-05,
"loss": 1.5471,
"step": 15050
},
{
"epoch": 2.089641296052175,
"grad_norm": 0.0,
"learning_rate": 4.718487392779543e-05,
"loss": 1.529,
"step": 15060
},
{
"epoch": 2.0910289322139737,
"grad_norm": 0.0,
"learning_rate": 4.705367278336888e-05,
"loss": 1.531,
"step": 15070
},
{
"epoch": 2.092416568375772,
"grad_norm": 0.0,
"learning_rate": 4.6922598159140616e-05,
"loss": 1.5668,
"step": 15080
},
{
"epoch": 2.0938042045375704,
"grad_norm": 0.0,
"learning_rate": 4.6791650368325715e-05,
"loss": 1.5357,
"step": 15090
},
{
"epoch": 2.0951918406993686,
"grad_norm": 0.0,
"learning_rate": 4.666082972383621e-05,
"loss": 1.5436,
"step": 15100
},
{
"epoch": 2.0951918406993686,
"eval_loss": 1.5553832054138184,
"eval_runtime": 916.3967,
"eval_samples_per_second": 13.981,
"eval_steps_per_second": 1.748,
"step": 15100
},
{
"epoch": 2.096579476861167,
"grad_norm": 0.0,
"learning_rate": 4.653013653828018e-05,
"loss": 1.5593,
"step": 15110
},
{
"epoch": 2.0979671130229653,
"grad_norm": 0.0,
"learning_rate": 4.639957112396123e-05,
"loss": 1.6224,
"step": 15120
},
{
"epoch": 2.099354749184764,
"grad_norm": 0.0,
"learning_rate": 4.626913379287768e-05,
"loss": 1.5973,
"step": 15130
},
{
"epoch": 2.100742385346562,
"grad_norm": 0.0,
"learning_rate": 4.6138824856721654e-05,
"loss": 1.5058,
"step": 15140
},
{
"epoch": 2.1021300215083607,
"grad_norm": 0.0,
"learning_rate": 4.600864462687855e-05,
"loss": 1.488,
"step": 15150
},
{
"epoch": 2.103517657670159,
"grad_norm": 0.0,
"learning_rate": 4.587859341442622e-05,
"loss": 1.4917,
"step": 15160
},
{
"epoch": 2.1049052938319575,
"grad_norm": 0.0,
"learning_rate": 4.574867153013421e-05,
"loss": 1.532,
"step": 15170
},
{
"epoch": 2.1062929299937556,
"grad_norm": 0.0,
"learning_rate": 4.561887928446296e-05,
"loss": 1.5549,
"step": 15180
},
{
"epoch": 2.107680566155554,
"grad_norm": 0.0,
"learning_rate": 4.5489216987563176e-05,
"loss": 1.5537,
"step": 15190
},
{
"epoch": 2.1090682023173524,
"grad_norm": 0.0,
"learning_rate": 4.535968494927507e-05,
"loss": 1.6042,
"step": 15200
},
{
"epoch": 2.1090682023173524,
"eval_loss": 1.5553832054138184,
"eval_runtime": 953.681,
"eval_samples_per_second": 13.434,
"eval_steps_per_second": 1.68,
"step": 15200
},
{
"epoch": 2.110455838479151,
"grad_norm": 0.0,
"learning_rate": 4.5230283479127575e-05,
"loss": 1.4479,
"step": 15210
},
{
"epoch": 2.111843474640949,
"grad_norm": 0.0,
"learning_rate": 4.510101288633753e-05,
"loss": 1.5663,
"step": 15220
},
{
"epoch": 2.1132311108027477,
"grad_norm": 0.0,
"learning_rate": 4.4971873479809147e-05,
"loss": 1.5425,
"step": 15230
},
{
"epoch": 2.114618746964546,
"grad_norm": 0.0,
"learning_rate": 4.484286556813314e-05,
"loss": 1.5906,
"step": 15240
},
{
"epoch": 2.1160063831263445,
"grad_norm": 0.0,
"learning_rate": 4.471398945958589e-05,
"loss": 1.5221,
"step": 15250
},
{
"epoch": 2.1173940192881426,
"grad_norm": 0.0,
"learning_rate": 4.4585245462128956e-05,
"loss": 1.5014,
"step": 15260
},
{
"epoch": 2.1187816554499412,
"grad_norm": 0.0,
"learning_rate": 4.445663388340815e-05,
"loss": 1.5336,
"step": 15270
},
{
"epoch": 2.1201692916117394,
"grad_norm": 0.0,
"learning_rate": 4.43281550307529e-05,
"loss": 1.6044,
"step": 15280
},
{
"epoch": 2.121556927773538,
"grad_norm": 0.0,
"learning_rate": 4.4199809211175344e-05,
"loss": 1.5086,
"step": 15290
},
{
"epoch": 2.122944563935336,
"grad_norm": 0.0,
"learning_rate": 4.407159673136988e-05,
"loss": 1.5354,
"step": 15300
},
{
"epoch": 2.122944563935336,
"eval_loss": 1.5553832054138184,
"eval_runtime": 926.3099,
"eval_samples_per_second": 13.831,
"eval_steps_per_second": 1.729,
"step": 15300
},
{
"epoch": 2.1243322000971343,
"grad_norm": 0.0,
"learning_rate": 4.3943517897712206e-05,
"loss": 1.495,
"step": 15310
},
{
"epoch": 2.125719836258933,
"grad_norm": 0.0,
"learning_rate": 4.3815573016258696e-05,
"loss": 1.4791,
"step": 15320
},
{
"epoch": 2.1271074724207315,
"grad_norm": 0.0,
"learning_rate": 4.368776239274554e-05,
"loss": 1.4759,
"step": 15330
},
{
"epoch": 2.1284951085825297,
"grad_norm": 0.0,
"learning_rate": 4.356008633258819e-05,
"loss": 1.5037,
"step": 15340
},
{
"epoch": 2.129882744744328,
"grad_norm": 0.0,
"learning_rate": 4.3432545140880584e-05,
"loss": 1.4478,
"step": 15350
},
{
"epoch": 2.1312703809061264,
"grad_norm": 0.0,
"learning_rate": 4.3305139122394235e-05,
"loss": 1.4855,
"step": 15360
},
{
"epoch": 2.132658017067925,
"grad_norm": 0.0,
"learning_rate": 4.3177868581577786e-05,
"loss": 1.5878,
"step": 15370
},
{
"epoch": 2.134045653229723,
"grad_norm": 0.0,
"learning_rate": 4.3050733822556075e-05,
"loss": 1.4945,
"step": 15380
},
{
"epoch": 2.1354332893915213,
"grad_norm": 0.0,
"learning_rate": 4.292373514912954e-05,
"loss": 1.54,
"step": 15390
},
{
"epoch": 2.13682092555332,
"grad_norm": 0.0,
"learning_rate": 4.279687286477331e-05,
"loss": 1.5607,
"step": 15400
},
{
"epoch": 2.13682092555332,
"eval_loss": 1.5553832054138184,
"eval_runtime": 938.5818,
"eval_samples_per_second": 13.65,
"eval_steps_per_second": 1.707,
"step": 15400
},
{
"epoch": 2.1382085617151185,
"grad_norm": 0.0,
"learning_rate": 4.267014727263671e-05,
"loss": 1.4647,
"step": 15410
},
{
"epoch": 2.1395961978769167,
"grad_norm": 0.0,
"learning_rate": 4.2543558675542374e-05,
"loss": 1.55,
"step": 15420
},
{
"epoch": 2.140983834038715,
"grad_norm": 0.0,
"learning_rate": 4.241710737598564e-05,
"loss": 1.4891,
"step": 15430
},
{
"epoch": 2.1423714702005134,
"grad_norm": 0.0,
"learning_rate": 4.2290793676133634e-05,
"loss": 1.5599,
"step": 15440
},
{
"epoch": 2.1437591063623116,
"grad_norm": 0.0,
"learning_rate": 4.2164617877824776e-05,
"loss": 1.5297,
"step": 15450
},
{
"epoch": 2.14514674252411,
"grad_norm": 0.0,
"learning_rate": 4.2038580282567975e-05,
"loss": 1.5349,
"step": 15460
},
{
"epoch": 2.1465343786859084,
"grad_norm": 0.0,
"learning_rate": 4.191268119154178e-05,
"loss": 1.5351,
"step": 15470
},
{
"epoch": 2.147922014847707,
"grad_norm": 0.0,
"learning_rate": 4.1786920905593864e-05,
"loss": 1.6173,
"step": 15480
},
{
"epoch": 2.149309651009505,
"grad_norm": 0.0,
"learning_rate": 4.166129972524019e-05,
"loss": 1.5345,
"step": 15490
},
{
"epoch": 2.1506972871713037,
"grad_norm": 0.0,
"learning_rate": 4.153581795066435e-05,
"loss": 1.4677,
"step": 15500
},
{
"epoch": 2.1506972871713037,
"eval_loss": 1.5553832054138184,
"eval_runtime": 929.9599,
"eval_samples_per_second": 13.777,
"eval_steps_per_second": 1.723,
"step": 15500
},
{
"epoch": 2.152084923333102,
"grad_norm": 0.0,
"learning_rate": 4.14104758817167e-05,
"loss": 1.5162,
"step": 15510
},
{
"epoch": 2.1534725594949005,
"grad_norm": 0.0,
"learning_rate": 4.1285273817913874e-05,
"loss": 1.6178,
"step": 15520
},
{
"epoch": 2.1548601956566986,
"grad_norm": 0.0,
"learning_rate": 4.1160212058437886e-05,
"loss": 1.6055,
"step": 15530
},
{
"epoch": 2.156247831818497,
"grad_norm": 0.0,
"learning_rate": 4.103529090213556e-05,
"loss": 1.5298,
"step": 15540
},
{
"epoch": 2.1576354679802954,
"grad_norm": 0.0,
"learning_rate": 4.09105106475176e-05,
"loss": 1.6078,
"step": 15550
},
{
"epoch": 2.159023104142094,
"grad_norm": 0.0,
"learning_rate": 4.078587159275811e-05,
"loss": 1.5427,
"step": 15560
},
{
"epoch": 2.160410740303892,
"grad_norm": 0.0,
"learning_rate": 4.066137403569381e-05,
"loss": 1.5159,
"step": 15570
},
{
"epoch": 2.1617983764656907,
"grad_norm": 0.0,
"learning_rate": 4.053701827382319e-05,
"loss": 1.5719,
"step": 15580
},
{
"epoch": 2.163186012627489,
"grad_norm": 0.0,
"learning_rate": 4.041280460430598e-05,
"loss": 1.5559,
"step": 15590
},
{
"epoch": 2.1645736487892875,
"grad_norm": 0.0,
"learning_rate": 4.028873332396237e-05,
"loss": 1.5674,
"step": 15600
},
{
"epoch": 2.1645736487892875,
"eval_loss": 1.5553832054138184,
"eval_runtime": 944.6173,
"eval_samples_per_second": 13.563,
"eval_steps_per_second": 1.696,
"step": 15600
},
{
"epoch": 2.1659612849510856,
"grad_norm": 0.0,
"learning_rate": 4.016480472927232e-05,
"loss": 1.5766,
"step": 15610
},
{
"epoch": 2.1673489211128842,
"grad_norm": 0.0,
"learning_rate": 4.0041019116374714e-05,
"loss": 1.5224,
"step": 15620
},
{
"epoch": 2.1687365572746824,
"grad_norm": 0.0,
"learning_rate": 3.9917376781066874e-05,
"loss": 1.6476,
"step": 15630
},
{
"epoch": 2.170124193436481,
"grad_norm": 0.0,
"learning_rate": 3.979387801880373e-05,
"loss": 1.5762,
"step": 15640
},
{
"epoch": 2.171511829598279,
"grad_norm": 0.0,
"learning_rate": 3.967052312469716e-05,
"loss": 1.5226,
"step": 15650
},
{
"epoch": 2.1728994657600778,
"grad_norm": 0.0,
"learning_rate": 3.954731239351512e-05,
"loss": 1.5913,
"step": 15660
},
{
"epoch": 2.174287101921876,
"grad_norm": 0.0,
"learning_rate": 3.942424611968123e-05,
"loss": 1.5287,
"step": 15670
},
{
"epoch": 2.1756747380836745,
"grad_norm": 0.0,
"learning_rate": 3.930132459727388e-05,
"loss": 1.5339,
"step": 15680
},
{
"epoch": 2.1770623742454727,
"grad_norm": 0.0,
"learning_rate": 3.917854812002547e-05,
"loss": 1.5704,
"step": 15690
},
{
"epoch": 2.1784500104072713,
"grad_norm": 0.0,
"learning_rate": 3.905591698132189e-05,
"loss": 1.4806,
"step": 15700
},
{
"epoch": 2.1784500104072713,
"eval_loss": 1.5553832054138184,
"eval_runtime": 927.5263,
"eval_samples_per_second": 13.813,
"eval_steps_per_second": 1.727,
"step": 15700
},
{
"epoch": 2.1798376465690694,
"grad_norm": 0.0,
"learning_rate": 3.893343147420174e-05,
"loss": 1.591,
"step": 15710
},
{
"epoch": 2.181225282730868,
"grad_norm": 0.0,
"learning_rate": 3.8811091891355614e-05,
"loss": 1.537,
"step": 15720
},
{
"epoch": 2.182612918892666,
"grad_norm": 0.0,
"learning_rate": 3.868889852512528e-05,
"loss": 1.5321,
"step": 15730
},
{
"epoch": 2.1840005550544648,
"grad_norm": 0.0,
"learning_rate": 3.856685166750329e-05,
"loss": 1.4817,
"step": 15740
},
{
"epoch": 2.185388191216263,
"grad_norm": 0.0,
"learning_rate": 3.844495161013198e-05,
"loss": 1.5511,
"step": 15750
},
{
"epoch": 2.1867758273780615,
"grad_norm": 0.0,
"learning_rate": 3.8323198644303005e-05,
"loss": 1.5125,
"step": 15760
},
{
"epoch": 2.1881634635398597,
"grad_norm": 0.0,
"learning_rate": 3.820159306095635e-05,
"loss": 1.5082,
"step": 15770
},
{
"epoch": 2.1895510997016583,
"grad_norm": 0.0,
"learning_rate": 3.8080135150679996e-05,
"loss": 1.5499,
"step": 15780
},
{
"epoch": 2.1909387358634564,
"grad_norm": 0.0,
"learning_rate": 3.795882520370898e-05,
"loss": 1.5333,
"step": 15790
},
{
"epoch": 2.192326372025255,
"grad_norm": 0.0,
"learning_rate": 3.78376635099247e-05,
"loss": 1.5596,
"step": 15800
},
{
"epoch": 2.192326372025255,
"eval_loss": 1.5553832054138184,
"eval_runtime": 933.9213,
"eval_samples_per_second": 13.719,
"eval_steps_per_second": 1.715,
"step": 15800
},
{
"epoch": 2.193714008187053,
"grad_norm": 0.0,
"learning_rate": 3.7716650358854386e-05,
"loss": 1.5425,
"step": 15810
},
{
"epoch": 2.195101644348852,
"grad_norm": 0.0,
"learning_rate": 3.759578603967028e-05,
"loss": 1.5336,
"step": 15820
},
{
"epoch": 2.19648928051065,
"grad_norm": 0.0,
"learning_rate": 3.7475070841189e-05,
"loss": 1.5973,
"step": 15830
},
{
"epoch": 2.1978769166724486,
"grad_norm": 0.0,
"learning_rate": 3.735450505187072e-05,
"loss": 1.524,
"step": 15840
},
{
"epoch": 2.1992645528342467,
"grad_norm": 0.0,
"learning_rate": 3.7234088959818715e-05,
"loss": 1.5217,
"step": 15850
},
{
"epoch": 2.2006521889960453,
"grad_norm": 0.0,
"learning_rate": 3.711382285277847e-05,
"loss": 1.5523,
"step": 15860
},
{
"epoch": 2.2020398251578435,
"grad_norm": 0.0,
"learning_rate": 3.699370701813715e-05,
"loss": 1.5474,
"step": 15870
},
{
"epoch": 2.203427461319642,
"grad_norm": 0.0,
"learning_rate": 3.687374174292268e-05,
"loss": 1.6118,
"step": 15880
},
{
"epoch": 2.2048150974814402,
"grad_norm": 0.0,
"learning_rate": 3.675392731380336e-05,
"loss": 1.4827,
"step": 15890
},
{
"epoch": 2.206202733643239,
"grad_norm": 0.0,
"learning_rate": 3.663426401708698e-05,
"loss": 1.5971,
"step": 15900
},
{
"epoch": 2.206202733643239,
"eval_loss": 1.5553832054138184,
"eval_runtime": 932.2836,
"eval_samples_per_second": 13.743,
"eval_steps_per_second": 1.718,
"step": 15900
},
{
"epoch": 2.207590369805037,
"grad_norm": 0.0,
"learning_rate": 3.6514752138720124e-05,
"loss": 1.5733,
"step": 15910
},
{
"epoch": 2.2089780059668356,
"grad_norm": 0.0,
"learning_rate": 3.6395391964287606e-05,
"loss": 1.5794,
"step": 15920
},
{
"epoch": 2.2103656421286337,
"grad_norm": 0.0,
"learning_rate": 3.627618377901174e-05,
"loss": 1.5671,
"step": 15930
},
{
"epoch": 2.2117532782904323,
"grad_norm": 0.0,
"learning_rate": 3.615712786775165e-05,
"loss": 1.5647,
"step": 15940
},
{
"epoch": 2.2131409144522305,
"grad_norm": 0.0,
"learning_rate": 3.603822451500252e-05,
"loss": 1.5265,
"step": 15950
},
{
"epoch": 2.214528550614029,
"grad_norm": 0.0,
"learning_rate": 3.5919474004895027e-05,
"loss": 1.5581,
"step": 15960
},
{
"epoch": 2.2159161867758272,
"grad_norm": 0.0,
"learning_rate": 3.580087662119464e-05,
"loss": 1.6019,
"step": 15970
},
{
"epoch": 2.217303822937626,
"grad_norm": 0.0,
"learning_rate": 3.568243264730092e-05,
"loss": 1.5216,
"step": 15980
},
{
"epoch": 2.218691459099424,
"grad_norm": 0.0,
"learning_rate": 3.5564142366246755e-05,
"loss": 1.5791,
"step": 15990
},
{
"epoch": 2.2200790952612226,
"grad_norm": 0.0,
"learning_rate": 3.544600606069785e-05,
"loss": 1.5498,
"step": 16000
},
{
"epoch": 2.2200790952612226,
"eval_loss": 1.5553832054138184,
"eval_runtime": 973.7526,
"eval_samples_per_second": 13.157,
"eval_steps_per_second": 1.645,
"step": 16000
},
{
"epoch": 2.2214667314230208,
"grad_norm": 0.0,
"learning_rate": 3.532802401295199e-05,
"loss": 1.5221,
"step": 16010
},
{
"epoch": 2.2228543675848194,
"grad_norm": 0.0,
"learning_rate": 3.521019650493824e-05,
"loss": 1.5488,
"step": 16020
},
{
"epoch": 2.2242420037466175,
"grad_norm": 0.0,
"learning_rate": 3.5092523818216486e-05,
"loss": 1.4977,
"step": 16030
},
{
"epoch": 2.225629639908416,
"grad_norm": 0.0,
"learning_rate": 3.4975006233976595e-05,
"loss": 1.5192,
"step": 16040
},
{
"epoch": 2.2270172760702143,
"grad_norm": 0.0,
"learning_rate": 3.485764403303787e-05,
"loss": 1.5094,
"step": 16050
},
{
"epoch": 2.228404912232013,
"grad_norm": 0.0,
"learning_rate": 3.4740437495848186e-05,
"loss": 1.6359,
"step": 16060
},
{
"epoch": 2.229792548393811,
"grad_norm": 0.0,
"learning_rate": 3.462338690248356e-05,
"loss": 1.5899,
"step": 16070
},
{
"epoch": 2.2311801845556096,
"grad_norm": 0.0,
"learning_rate": 3.4506492532647315e-05,
"loss": 1.5689,
"step": 16080
},
{
"epoch": 2.232567820717408,
"grad_norm": 0.0,
"learning_rate": 3.438975466566953e-05,
"loss": 1.5154,
"step": 16090
},
{
"epoch": 2.2339554568792064,
"grad_norm": 0.0,
"learning_rate": 3.427317358050616e-05,
"loss": 1.5396,
"step": 16100
},
{
"epoch": 2.2339554568792064,
"eval_loss": 1.5553832054138184,
"eval_runtime": 996.8854,
"eval_samples_per_second": 12.852,
"eval_steps_per_second": 1.607,
"step": 16100
},
{
"epoch": 2.2353430930410045,
"grad_norm": 0.0,
"learning_rate": 3.415674955573864e-05,
"loss": 1.5667,
"step": 16110
},
{
"epoch": 2.236730729202803,
"grad_norm": 0.0,
"learning_rate": 3.404048286957312e-05,
"loss": 1.5438,
"step": 16120
},
{
"epoch": 2.2381183653646013,
"grad_norm": 0.0,
"learning_rate": 3.3924373799839615e-05,
"loss": 1.5788,
"step": 16130
},
{
"epoch": 2.2395060015264,
"grad_norm": 0.0,
"learning_rate": 3.380842262399166e-05,
"loss": 1.5115,
"step": 16140
},
{
"epoch": 2.240893637688198,
"grad_norm": 0.0,
"learning_rate": 3.369262961910542e-05,
"loss": 1.5161,
"step": 16150
},
{
"epoch": 2.2422812738499966,
"grad_norm": 0.0,
"learning_rate": 3.3576995061879145e-05,
"loss": 1.5062,
"step": 16160
},
{
"epoch": 2.243668910011795,
"grad_norm": 0.0,
"learning_rate": 3.346151922863233e-05,
"loss": 1.504,
"step": 16170
},
{
"epoch": 2.2450565461735934,
"grad_norm": 0.0,
"learning_rate": 3.334620239530534e-05,
"loss": 1.5609,
"step": 16180
},
{
"epoch": 2.2464441823353916,
"grad_norm": 0.0,
"learning_rate": 3.3231044837458495e-05,
"loss": 1.5546,
"step": 16190
},
{
"epoch": 2.24783181849719,
"grad_norm": 0.0,
"learning_rate": 3.3116046830271594e-05,
"loss": 1.5937,
"step": 16200
},
{
"epoch": 2.24783181849719,
"eval_loss": 1.5553832054138184,
"eval_runtime": 1004.3974,
"eval_samples_per_second": 12.756,
"eval_steps_per_second": 1.595,
"step": 16200
},
{
"epoch": 2.2492194546589883,
"grad_norm": 0.0,
"learning_rate": 3.3001208648543055e-05,
"loss": 1.552,
"step": 16210
},
{
"epoch": 2.250607090820787,
"grad_norm": 0.0,
"learning_rate": 3.288653056668949e-05,
"loss": 1.4559,
"step": 16220
},
{
"epoch": 2.251994726982585,
"grad_norm": 0.0,
"learning_rate": 3.2772012858744916e-05,
"loss": 1.5379,
"step": 16230
},
{
"epoch": 2.2533823631443837,
"grad_norm": 0.0,
"learning_rate": 3.265765579836004e-05,
"loss": 1.5939,
"step": 16240
},
{
"epoch": 2.254769999306182,
"grad_norm": 0.0,
"learning_rate": 3.254345965880179e-05,
"loss": 1.5655,
"step": 16250
},
{
"epoch": 2.2561576354679804,
"grad_norm": 0.0,
"learning_rate": 3.2429424712952494e-05,
"loss": 1.5769,
"step": 16260
},
{
"epoch": 2.2575452716297786,
"grad_norm": 0.0,
"learning_rate": 3.231555123330937e-05,
"loss": 1.6075,
"step": 16270
},
{
"epoch": 2.258932907791577,
"grad_norm": 0.0,
"learning_rate": 3.220183949198368e-05,
"loss": 1.5309,
"step": 16280
},
{
"epoch": 2.2603205439533753,
"grad_norm": 0.0,
"learning_rate": 3.208828976070027e-05,
"loss": 1.5075,
"step": 16290
},
{
"epoch": 2.261708180115174,
"grad_norm": 0.0,
"learning_rate": 3.197490231079685e-05,
"loss": 1.5901,
"step": 16300
},
{
"epoch": 2.261708180115174,
"eval_loss": 1.5553832054138184,
"eval_runtime": 991.4676,
"eval_samples_per_second": 12.922,
"eval_steps_per_second": 1.616,
"step": 16300
},
{
"epoch": 2.263095816276972,
"grad_norm": 0.0,
"learning_rate": 3.186167741322337e-05,
"loss": 1.4599,
"step": 16310
},
{
"epoch": 2.2644834524387707,
"grad_norm": 0.0,
"learning_rate": 3.1748615338541224e-05,
"loss": 1.4669,
"step": 16320
},
{
"epoch": 2.265871088600569,
"grad_norm": 0.0,
"learning_rate": 3.163571635692286e-05,
"loss": 1.5273,
"step": 16330
},
{
"epoch": 2.2672587247623675,
"grad_norm": 0.0,
"learning_rate": 3.152298073815096e-05,
"loss": 1.5162,
"step": 16340
},
{
"epoch": 2.2686463609241656,
"grad_norm": 0.0,
"learning_rate": 3.141040875161779e-05,
"loss": 1.5584,
"step": 16350
},
{
"epoch": 2.270033997085964,
"grad_norm": 0.0,
"learning_rate": 3.129800066632463e-05,
"loss": 1.5418,
"step": 16360
},
{
"epoch": 2.2714216332477624,
"grad_norm": 0.0,
"learning_rate": 3.1185756750881143e-05,
"loss": 1.5496,
"step": 16370
},
{
"epoch": 2.272809269409561,
"grad_norm": 0.0,
"learning_rate": 3.1073677273504666e-05,
"loss": 1.5761,
"step": 16380
},
{
"epoch": 2.274196905571359,
"grad_norm": 0.0,
"learning_rate": 3.096176250201953e-05,
"loss": 1.6242,
"step": 16390
},
{
"epoch": 2.2755845417331577,
"grad_norm": 0.0,
"learning_rate": 3.0850012703856567e-05,
"loss": 1.5603,
"step": 16400
},
{
"epoch": 2.2755845417331577,
"eval_loss": 1.5553832054138184,
"eval_runtime": 887.9758,
"eval_samples_per_second": 14.428,
"eval_steps_per_second": 1.804,
"step": 16400
},
{
"epoch": 2.276972177894956,
"grad_norm": 0.0,
"learning_rate": 3.073842814605239e-05,
"loss": 1.5251,
"step": 16410
},
{
"epoch": 2.2783598140567545,
"grad_norm": 0.0,
"learning_rate": 3.0627009095248734e-05,
"loss": 1.6049,
"step": 16420
},
{
"epoch": 2.2797474502185526,
"grad_norm": 0.0,
"learning_rate": 3.0515755817691794e-05,
"loss": 1.5205,
"step": 16430
},
{
"epoch": 2.2811350863803512,
"grad_norm": 0.0,
"learning_rate": 3.0404668579231686e-05,
"loss": 1.4944,
"step": 16440
},
{
"epoch": 2.2825227225421494,
"grad_norm": 0.0,
"learning_rate": 3.029374764532181e-05,
"loss": 1.5663,
"step": 16450
},
{
"epoch": 2.283910358703948,
"grad_norm": 0.0,
"learning_rate": 3.0182993281018034e-05,
"loss": 1.5988,
"step": 16460
},
{
"epoch": 2.285297994865746,
"grad_norm": 0.0,
"learning_rate": 3.0072405750978283e-05,
"loss": 1.5206,
"step": 16470
},
{
"epoch": 2.2866856310275447,
"grad_norm": 0.0,
"learning_rate": 2.9961985319461804e-05,
"loss": 1.5273,
"step": 16480
},
{
"epoch": 2.288073267189343,
"grad_norm": 0.0,
"learning_rate": 2.985173225032858e-05,
"loss": 1.4942,
"step": 16490
},
{
"epoch": 2.2894609033511415,
"grad_norm": 0.0,
"learning_rate": 2.9741646807038558e-05,
"loss": 1.5831,
"step": 16500
},
{
"epoch": 2.2894609033511415,
"eval_loss": 1.5553832054138184,
"eval_runtime": 897.0246,
"eval_samples_per_second": 14.283,
"eval_steps_per_second": 1.786,
"step": 16500
},
{
"epoch": 2.2908485395129397,
"grad_norm": 0.0,
"learning_rate": 2.963172925265123e-05,
"loss": 1.5637,
"step": 16510
},
{
"epoch": 2.2922361756747383,
"grad_norm": 0.0,
"learning_rate": 2.9521979849824855e-05,
"loss": 1.5101,
"step": 16520
},
{
"epoch": 2.2936238118365364,
"grad_norm": 0.0,
"learning_rate": 2.9412398860815936e-05,
"loss": 1.6035,
"step": 16530
},
{
"epoch": 2.295011447998335,
"grad_norm": 0.0,
"learning_rate": 2.9302986547478416e-05,
"loss": 1.5706,
"step": 16540
},
{
"epoch": 2.296399084160133,
"grad_norm": 0.0,
"learning_rate": 2.9193743171263288e-05,
"loss": 1.5611,
"step": 16550
},
{
"epoch": 2.2977867203219318,
"grad_norm": 0.0,
"learning_rate": 2.9084668993217832e-05,
"loss": 1.5604,
"step": 16560
},
{
"epoch": 2.29917435648373,
"grad_norm": 0.0,
"learning_rate": 2.8975764273984953e-05,
"loss": 1.5665,
"step": 16570
},
{
"epoch": 2.3005619926455285,
"grad_norm": 0.0,
"learning_rate": 2.886702927380266e-05,
"loss": 1.5259,
"step": 16580
},
{
"epoch": 2.3019496288073267,
"grad_norm": 0.0,
"learning_rate": 2.8758464252503402e-05,
"loss": 1.5501,
"step": 16590
},
{
"epoch": 2.3033372649691253,
"grad_norm": 0.0,
"learning_rate": 2.8650069469513497e-05,
"loss": 1.5792,
"step": 16600
},
{
"epoch": 2.3033372649691253,
"eval_loss": 1.5553832054138184,
"eval_runtime": 889.8596,
"eval_samples_per_second": 14.398,
"eval_steps_per_second": 1.8,
"step": 16600
},
{
"epoch": 2.3047249011309234,
"grad_norm": 0.0,
"learning_rate": 2.8541845183852345e-05,
"loss": 1.5979,
"step": 16610
},
{
"epoch": 2.306112537292722,
"grad_norm": 0.0,
"learning_rate": 2.8433791654132013e-05,
"loss": 1.5241,
"step": 16620
},
{
"epoch": 2.30750017345452,
"grad_norm": 0.0,
"learning_rate": 2.8325909138556515e-05,
"loss": 1.5629,
"step": 16630
},
{
"epoch": 2.308887809616319,
"grad_norm": 0.0,
"learning_rate": 2.821819789492125e-05,
"loss": 1.563,
"step": 16640
},
{
"epoch": 2.310275445778117,
"grad_norm": 0.0,
"learning_rate": 2.8110658180612226e-05,
"loss": 1.5398,
"step": 16650
},
{
"epoch": 2.3116630819399155,
"grad_norm": 0.0,
"learning_rate": 2.8003290252605685e-05,
"loss": 1.5296,
"step": 16660
},
{
"epoch": 2.3130507181017137,
"grad_norm": 0.0,
"learning_rate": 2.789609436746734e-05,
"loss": 1.5364,
"step": 16670
},
{
"epoch": 2.3144383542635123,
"grad_norm": 0.0,
"learning_rate": 2.7789070781351745e-05,
"loss": 1.5231,
"step": 16680
},
{
"epoch": 2.3158259904253105,
"grad_norm": 0.0,
"learning_rate": 2.768221975000177e-05,
"loss": 1.503,
"step": 16690
},
{
"epoch": 2.317213626587109,
"grad_norm": 0.0,
"learning_rate": 2.757554152874796e-05,
"loss": 1.6102,
"step": 16700
},
{
"epoch": 2.317213626587109,
"eval_loss": 1.5553832054138184,
"eval_runtime": 893.7238,
"eval_samples_per_second": 14.336,
"eval_steps_per_second": 1.793,
"step": 16700
},
{
"epoch": 2.318601262748907,
"grad_norm": 0.0,
"learning_rate": 2.746903637250793e-05,
"loss": 1.5784,
"step": 16710
},
{
"epoch": 2.3199888989107054,
"grad_norm": 0.0,
"learning_rate": 2.736270453578562e-05,
"loss": 1.5198,
"step": 16720
},
{
"epoch": 2.321376535072504,
"grad_norm": 0.0,
"learning_rate": 2.7256546272670946e-05,
"loss": 1.4816,
"step": 16730
},
{
"epoch": 2.3227641712343026,
"grad_norm": 0.0,
"learning_rate": 2.7150561836838994e-05,
"loss": 1.5082,
"step": 16740
},
{
"epoch": 2.3241518073961007,
"grad_norm": 0.0,
"learning_rate": 2.7044751481549525e-05,
"loss": 1.5617,
"step": 16750
},
{
"epoch": 2.325539443557899,
"grad_norm": 0.0,
"learning_rate": 2.693911545964619e-05,
"loss": 1.5301,
"step": 16760
},
{
"epoch": 2.3269270797196975,
"grad_norm": 0.0,
"learning_rate": 2.6833654023556177e-05,
"loss": 1.5309,
"step": 16770
},
{
"epoch": 2.328314715881496,
"grad_norm": 0.0,
"learning_rate": 2.6728367425289493e-05,
"loss": 1.5755,
"step": 16780
},
{
"epoch": 2.3297023520432942,
"grad_norm": 0.0,
"learning_rate": 2.6623255916438217e-05,
"loss": 1.5467,
"step": 16790
},
{
"epoch": 2.3310899882050924,
"grad_norm": 0.0,
"learning_rate": 2.651831974817619e-05,
"loss": 1.5429,
"step": 16800
},
{
"epoch": 2.3310899882050924,
"eval_loss": 1.5553832054138184,
"eval_runtime": 897.8065,
"eval_samples_per_second": 14.27,
"eval_steps_per_second": 1.784,
"step": 16800
},
{
"epoch": 2.332477624366891,
"grad_norm": 0.0,
"learning_rate": 2.641355917125816e-05,
"loss": 1.4949,
"step": 16810
},
{
"epoch": 2.3338652605286896,
"grad_norm": 0.0,
"learning_rate": 2.6308974436019375e-05,
"loss": 1.5429,
"step": 16820
},
{
"epoch": 2.3352528966904877,
"grad_norm": 0.0,
"learning_rate": 2.620456579237476e-05,
"loss": 1.5654,
"step": 16830
},
{
"epoch": 2.336640532852286,
"grad_norm": 0.0,
"learning_rate": 2.6100333489818572e-05,
"loss": 1.5314,
"step": 16840
},
{
"epoch": 2.3380281690140845,
"grad_norm": 0.0,
"learning_rate": 2.5996277777423628e-05,
"loss": 1.5607,
"step": 16850
},
{
"epoch": 2.339415805175883,
"grad_norm": 0.0,
"learning_rate": 2.5892398903840832e-05,
"loss": 1.5424,
"step": 16860
},
{
"epoch": 2.3408034413376813,
"grad_norm": 0.0,
"learning_rate": 2.5788697117298377e-05,
"loss": 1.5879,
"step": 16870
},
{
"epoch": 2.3421910774994794,
"grad_norm": 0.0,
"learning_rate": 2.568517266560141e-05,
"loss": 1.6442,
"step": 16880
},
{
"epoch": 2.343578713661278,
"grad_norm": 0.0,
"learning_rate": 2.558182579613133e-05,
"loss": 1.604,
"step": 16890
},
{
"epoch": 2.3449663498230766,
"grad_norm": 0.0,
"learning_rate": 2.5478656755845077e-05,
"loss": 1.5769,
"step": 16900
},
{
"epoch": 2.3449663498230766,
"eval_loss": 1.5553832054138184,
"eval_runtime": 892.889,
"eval_samples_per_second": 14.349,
"eval_steps_per_second": 1.794,
"step": 16900
},
{
"epoch": 2.3463539859848748,
"grad_norm": 0.0,
"learning_rate": 2.537566579127475e-05,
"loss": 1.6231,
"step": 16910
},
{
"epoch": 2.347741622146673,
"grad_norm": 0.0,
"learning_rate": 2.5272853148526876e-05,
"loss": 1.5962,
"step": 16920
},
{
"epoch": 2.3491292583084715,
"grad_norm": 0.0,
"learning_rate": 2.517021907328191e-05,
"loss": 1.5489,
"step": 16930
},
{
"epoch": 2.35051689447027,
"grad_norm": 0.0,
"learning_rate": 2.506776381079351e-05,
"loss": 1.607,
"step": 16940
},
{
"epoch": 2.3519045306320683,
"grad_norm": 0.0,
"learning_rate": 2.4965487605888137e-05,
"loss": 1.5333,
"step": 16950
},
{
"epoch": 2.3532921667938664,
"grad_norm": 0.0,
"learning_rate": 2.486339070296434e-05,
"loss": 1.5035,
"step": 16960
},
{
"epoch": 2.354679802955665,
"grad_norm": 0.0,
"learning_rate": 2.4761473345992247e-05,
"loss": 1.5349,
"step": 16970
},
{
"epoch": 2.3560674391174636,
"grad_norm": 0.0,
"learning_rate": 2.4659735778512836e-05,
"loss": 1.4888,
"step": 16980
},
{
"epoch": 2.357455075279262,
"grad_norm": 0.0,
"learning_rate": 2.4558178243637587e-05,
"loss": 1.5775,
"step": 16990
},
{
"epoch": 2.35884271144106,
"grad_norm": 0.0,
"learning_rate": 2.4456800984047736e-05,
"loss": 1.4904,
"step": 17000
},
{
"epoch": 2.35884271144106,
"eval_loss": 1.5553832054138184,
"eval_runtime": 891.2441,
"eval_samples_per_second": 14.375,
"eval_steps_per_second": 1.797,
"step": 17000
},
{
"epoch": 2.3602303476028585,
"grad_norm": 0.0,
"learning_rate": 2.4355604241993656e-05,
"loss": 1.6108,
"step": 17010
},
{
"epoch": 2.361617983764657,
"grad_norm": 0.0,
"learning_rate": 2.4254588259294465e-05,
"loss": 1.4995,
"step": 17020
},
{
"epoch": 2.3630056199264553,
"grad_norm": 0.0,
"learning_rate": 2.41537532773373e-05,
"loss": 1.5408,
"step": 17030
},
{
"epoch": 2.3643932560882535,
"grad_norm": 0.0,
"learning_rate": 2.4053099537076794e-05,
"loss": 1.5079,
"step": 17040
},
{
"epoch": 2.365780892250052,
"grad_norm": 0.0,
"learning_rate": 2.3952627279034424e-05,
"loss": 1.4928,
"step": 17050
},
{
"epoch": 2.3671685284118507,
"grad_norm": 0.0,
"learning_rate": 2.3852336743298053e-05,
"loss": 1.5503,
"step": 17060
},
{
"epoch": 2.368556164573649,
"grad_norm": 0.0,
"learning_rate": 2.3752228169521307e-05,
"loss": 1.5749,
"step": 17070
},
{
"epoch": 2.369943800735447,
"grad_norm": 0.0,
"learning_rate": 2.3652301796923003e-05,
"loss": 1.5013,
"step": 17080
},
{
"epoch": 2.3713314368972456,
"grad_norm": 0.0,
"learning_rate": 2.3552557864286495e-05,
"loss": 1.516,
"step": 17090
},
{
"epoch": 2.3727190730590437,
"grad_norm": 0.0,
"learning_rate": 2.3452996609959253e-05,
"loss": 1.533,
"step": 17100
},
{
"epoch": 2.3727190730590437,
"eval_loss": 1.5553832054138184,
"eval_runtime": 890.5528,
"eval_samples_per_second": 14.387,
"eval_steps_per_second": 1.799,
"step": 17100
},
{
"epoch": 2.3741067092208423,
"grad_norm": 0.0,
"learning_rate": 2.335361827185224e-05,
"loss": 1.4783,
"step": 17110
},
{
"epoch": 2.3754943453826405,
"grad_norm": 0.0,
"learning_rate": 2.3254423087439237e-05,
"loss": 1.4655,
"step": 17120
},
{
"epoch": 2.376881981544439,
"grad_norm": 0.0,
"learning_rate": 2.3155411293756412e-05,
"loss": 1.5946,
"step": 17130
},
{
"epoch": 2.3782696177062372,
"grad_norm": 0.0,
"learning_rate": 2.3056583127401733e-05,
"loss": 1.6442,
"step": 17140
},
{
"epoch": 2.379657253868036,
"grad_norm": 0.0,
"learning_rate": 2.2957938824534343e-05,
"loss": 1.5603,
"step": 17150
},
{
"epoch": 2.381044890029834,
"grad_norm": 0.0,
"learning_rate": 2.2859478620873975e-05,
"loss": 1.6026,
"step": 17160
},
{
"epoch": 2.3824325261916326,
"grad_norm": 0.0,
"learning_rate": 2.2761202751700528e-05,
"loss": 1.6088,
"step": 17170
},
{
"epoch": 2.3838201623534307,
"grad_norm": 0.0,
"learning_rate": 2.2663111451853368e-05,
"loss": 1.5083,
"step": 17180
},
{
"epoch": 2.3852077985152293,
"grad_norm": 0.0,
"learning_rate": 2.256520495573087e-05,
"loss": 1.5413,
"step": 17190
},
{
"epoch": 2.3865954346770275,
"grad_norm": 0.0,
"learning_rate": 2.2467483497289677e-05,
"loss": 1.5837,
"step": 17200
},
{
"epoch": 2.3865954346770275,
"eval_loss": 1.5553832054138184,
"eval_runtime": 889.6194,
"eval_samples_per_second": 14.402,
"eval_steps_per_second": 1.801,
"step": 17200
},
{
"epoch": 2.387983070838826,
"grad_norm": 0.0,
"learning_rate": 2.2369947310044392e-05,
"loss": 1.5767,
"step": 17210
},
{
"epoch": 2.3893707070006243,
"grad_norm": 0.0,
"learning_rate": 2.227259662706689e-05,
"loss": 1.5455,
"step": 17220
},
{
"epoch": 2.390758343162423,
"grad_norm": 0.0,
"learning_rate": 2.217543168098565e-05,
"loss": 1.5668,
"step": 17230
},
{
"epoch": 2.392145979324221,
"grad_norm": 0.0,
"learning_rate": 2.207845270398544e-05,
"loss": 1.6349,
"step": 17240
},
{
"epoch": 2.3935336154860196,
"grad_norm": 0.0,
"learning_rate": 2.1981659927806576e-05,
"loss": 1.5506,
"step": 17250
},
{
"epoch": 2.3949212516478178,
"grad_norm": 0.0,
"learning_rate": 2.1885053583744485e-05,
"loss": 1.5319,
"step": 17260
},
{
"epoch": 2.3963088878096164,
"grad_norm": 0.0,
"learning_rate": 2.1788633902648992e-05,
"loss": 1.5557,
"step": 17270
},
{
"epoch": 2.3976965239714145,
"grad_norm": 0.0,
"learning_rate": 2.1692401114923975e-05,
"loss": 1.5497,
"step": 17280
},
{
"epoch": 2.399084160133213,
"grad_norm": 0.0,
"learning_rate": 2.1596355450526673e-05,
"loss": 1.5477,
"step": 17290
},
{
"epoch": 2.4004717962950113,
"grad_norm": 0.0,
"learning_rate": 2.15004971389672e-05,
"loss": 1.5048,
"step": 17300
},
{
"epoch": 2.4004717962950113,
"eval_loss": 1.5553832054138184,
"eval_runtime": 884.422,
"eval_samples_per_second": 14.486,
"eval_steps_per_second": 1.811,
"step": 17300
},
{
"epoch": 2.40185943245681,
"grad_norm": 0.0,
"learning_rate": 2.140482640930791e-05,
"loss": 1.6127,
"step": 17310
},
{
"epoch": 2.403247068618608,
"grad_norm": 0.0,
"learning_rate": 2.130934349016297e-05,
"loss": 1.5357,
"step": 17320
},
{
"epoch": 2.4046347047804066,
"grad_norm": 0.0,
"learning_rate": 2.121404860969778e-05,
"loss": 1.5203,
"step": 17330
},
{
"epoch": 2.406022340942205,
"grad_norm": 0.0,
"learning_rate": 2.1118941995628294e-05,
"loss": 1.5295,
"step": 17340
},
{
"epoch": 2.4074099771040034,
"grad_norm": 0.0,
"learning_rate": 2.10240238752207e-05,
"loss": 1.52,
"step": 17350
},
{
"epoch": 2.4087976132658016,
"grad_norm": 0.0,
"learning_rate": 2.092929447529072e-05,
"loss": 1.6179,
"step": 17360
},
{
"epoch": 2.4101852494276,
"grad_norm": 0.0,
"learning_rate": 2.083475402220312e-05,
"loss": 1.5312,
"step": 17370
},
{
"epoch": 2.4115728855893983,
"grad_norm": 0.0,
"learning_rate": 2.074040274187111e-05,
"loss": 1.5649,
"step": 17380
},
{
"epoch": 2.412960521751197,
"grad_norm": 0.0,
"learning_rate": 2.0646240859755926e-05,
"loss": 1.5582,
"step": 17390
},
{
"epoch": 2.414348157912995,
"grad_norm": 0.0,
"learning_rate": 2.0552268600866174e-05,
"loss": 1.4689,
"step": 17400
},
{
"epoch": 2.414348157912995,
"eval_loss": 1.5553832054138184,
"eval_runtime": 881.4697,
"eval_samples_per_second": 14.535,
"eval_steps_per_second": 1.817,
"step": 17400
},
{
"epoch": 2.4157357940747937,
"grad_norm": 0.0,
"learning_rate": 2.0458486189757385e-05,
"loss": 1.5304,
"step": 17410
},
{
"epoch": 2.417123430236592,
"grad_norm": 0.0,
"learning_rate": 2.0364893850531342e-05,
"loss": 1.5111,
"step": 17420
},
{
"epoch": 2.4185110663983904,
"grad_norm": 0.0,
"learning_rate": 2.0271491806835717e-05,
"loss": 1.5422,
"step": 17430
},
{
"epoch": 2.4198987025601886,
"grad_norm": 0.0,
"learning_rate": 2.017828028186345e-05,
"loss": 1.5842,
"step": 17440
},
{
"epoch": 2.421286338721987,
"grad_norm": 0.0,
"learning_rate": 2.008525949835214e-05,
"loss": 1.5357,
"step": 17450
},
{
"epoch": 2.4226739748837853,
"grad_norm": 0.0,
"learning_rate": 1.999242967858367e-05,
"loss": 1.5144,
"step": 17460
},
{
"epoch": 2.424061611045584,
"grad_norm": 0.0,
"learning_rate": 1.9899791044383575e-05,
"loss": 1.553,
"step": 17470
},
{
"epoch": 2.425449247207382,
"grad_norm": 0.0,
"learning_rate": 1.980734381712056e-05,
"loss": 1.5922,
"step": 17480
},
{
"epoch": 2.4268368833691807,
"grad_norm": 0.0,
"learning_rate": 1.9715088217705856e-05,
"loss": 1.5451,
"step": 17490
},
{
"epoch": 2.428224519530979,
"grad_norm": 0.0,
"learning_rate": 1.9623024466592877e-05,
"loss": 1.5234,
"step": 17500
},
{
"epoch": 2.428224519530979,
"eval_loss": 1.5553832054138184,
"eval_runtime": 873.5127,
"eval_samples_per_second": 14.667,
"eval_steps_per_second": 1.834,
"step": 17500
},
{
"epoch": 2.4296121556927774,
"grad_norm": 0.0,
"learning_rate": 1.9531152783776553e-05,
"loss": 1.5719,
"step": 17510
},
{
"epoch": 2.4309997918545756,
"grad_norm": 0.0,
"learning_rate": 1.9439473388792895e-05,
"loss": 1.6198,
"step": 17520
},
{
"epoch": 2.432387428016374,
"grad_norm": 0.0,
"learning_rate": 1.934798650071833e-05,
"loss": 1.5601,
"step": 17530
},
{
"epoch": 2.4337750641781724,
"grad_norm": 0.0,
"learning_rate": 1.9256692338169345e-05,
"loss": 1.5462,
"step": 17540
},
{
"epoch": 2.435162700339971,
"grad_norm": 0.0,
"learning_rate": 1.9165591119301918e-05,
"loss": 1.4857,
"step": 17550
},
{
"epoch": 2.436550336501769,
"grad_norm": 0.0,
"learning_rate": 1.9074683061810873e-05,
"loss": 1.5428,
"step": 17560
},
{
"epoch": 2.4379379726635677,
"grad_norm": 0.0,
"learning_rate": 1.898396838292953e-05,
"loss": 1.5551,
"step": 17570
},
{
"epoch": 2.439325608825366,
"grad_norm": 0.0,
"learning_rate": 1.889344729942909e-05,
"loss": 1.5344,
"step": 17580
},
{
"epoch": 2.4407132449871645,
"grad_norm": 0.0,
"learning_rate": 1.880312002761818e-05,
"loss": 1.4525,
"step": 17590
},
{
"epoch": 2.4421008811489626,
"grad_norm": 0.0,
"learning_rate": 1.8712986783342213e-05,
"loss": 1.5635,
"step": 17600
},
{
"epoch": 2.4421008811489626,
"eval_loss": 1.5553832054138184,
"eval_runtime": 874.238,
"eval_samples_per_second": 14.655,
"eval_steps_per_second": 1.832,
"step": 17600
},
{
"epoch": 2.443488517310761,
"grad_norm": 0.0,
"learning_rate": 1.8623047781983015e-05,
"loss": 1.5471,
"step": 17610
},
{
"epoch": 2.4448761534725594,
"grad_norm": 0.0,
"learning_rate": 1.8533303238458242e-05,
"loss": 1.5249,
"step": 17620
},
{
"epoch": 2.446263789634358,
"grad_norm": 0.0,
"learning_rate": 1.8443753367220895e-05,
"loss": 1.6026,
"step": 17630
},
{
"epoch": 2.447651425796156,
"grad_norm": 0.0,
"learning_rate": 1.8354398382258718e-05,
"loss": 1.5442,
"step": 17640
},
{
"epoch": 2.4490390619579547,
"grad_norm": 0.0,
"learning_rate": 1.8265238497093818e-05,
"loss": 1.5441,
"step": 17650
},
{
"epoch": 2.450426698119753,
"grad_norm": 0.0,
"learning_rate": 1.8176273924782105e-05,
"loss": 1.5742,
"step": 17660
},
{
"epoch": 2.4518143342815515,
"grad_norm": 0.0,
"learning_rate": 1.8087504877912685e-05,
"loss": 1.517,
"step": 17670
},
{
"epoch": 2.4532019704433496,
"grad_norm": 0.0,
"learning_rate": 1.7998931568607515e-05,
"loss": 1.4875,
"step": 17680
},
{
"epoch": 2.4545896066051482,
"grad_norm": 0.0,
"learning_rate": 1.7910554208520814e-05,
"loss": 1.5491,
"step": 17690
},
{
"epoch": 2.4559772427669464,
"grad_norm": 0.0,
"learning_rate": 1.7822373008838555e-05,
"loss": 1.5338,
"step": 17700
},
{
"epoch": 2.4559772427669464,
"eval_loss": 1.5553832054138184,
"eval_runtime": 877.3156,
"eval_samples_per_second": 14.604,
"eval_steps_per_second": 1.826,
"step": 17700
},
{
"epoch": 2.457364878928745,
"grad_norm": 0.0,
"learning_rate": 1.7734388180277916e-05,
"loss": 1.5268,
"step": 17710
},
{
"epoch": 2.458752515090543,
"grad_norm": 0.0,
"learning_rate": 1.7646599933086892e-05,
"loss": 1.5664,
"step": 17720
},
{
"epoch": 2.4601401512523418,
"grad_norm": 0.0,
"learning_rate": 1.7559008477043715e-05,
"loss": 1.5986,
"step": 17730
},
{
"epoch": 2.46152778741414,
"grad_norm": 0.0,
"learning_rate": 1.7471614021456363e-05,
"loss": 1.494,
"step": 17740
},
{
"epoch": 2.4629154235759385,
"grad_norm": 0.0,
"learning_rate": 1.7384416775162015e-05,
"loss": 1.5648,
"step": 17750
},
{
"epoch": 2.4643030597377367,
"grad_norm": 0.0,
"learning_rate": 1.729741694652668e-05,
"loss": 1.54,
"step": 17760
},
{
"epoch": 2.4656906958995353,
"grad_norm": 0.0,
"learning_rate": 1.7210614743444588e-05,
"loss": 1.5335,
"step": 17770
},
{
"epoch": 2.4670783320613334,
"grad_norm": 0.0,
"learning_rate": 1.7124010373337684e-05,
"loss": 1.5415,
"step": 17780
},
{
"epoch": 2.468465968223132,
"grad_norm": 0.0,
"learning_rate": 1.703760404315522e-05,
"loss": 1.5377,
"step": 17790
},
{
"epoch": 2.46985360438493,
"grad_norm": 0.0,
"learning_rate": 1.6951395959373194e-05,
"loss": 1.5515,
"step": 17800
},
{
"epoch": 2.46985360438493,
"eval_loss": 1.5553832054138184,
"eval_runtime": 878.8065,
"eval_samples_per_second": 14.579,
"eval_steps_per_second": 1.823,
"step": 17800
},
{
"epoch": 2.471241240546729,
"grad_norm": 0.0,
"learning_rate": 1.6865386327993927e-05,
"loss": 1.5407,
"step": 17810
},
{
"epoch": 2.472628876708527,
"grad_norm": 0.0,
"learning_rate": 1.6779575354545406e-05,
"loss": 1.5614,
"step": 17820
},
{
"epoch": 2.4740165128703255,
"grad_norm": 0.0,
"learning_rate": 1.6693963244081002e-05,
"loss": 1.4807,
"step": 17830
},
{
"epoch": 2.4754041490321237,
"grad_norm": 0.0,
"learning_rate": 1.660855020117885e-05,
"loss": 1.5384,
"step": 17840
},
{
"epoch": 2.4767917851939223,
"grad_norm": 0.0,
"learning_rate": 1.652333642994144e-05,
"loss": 1.5561,
"step": 17850
},
{
"epoch": 2.4781794213557204,
"grad_norm": 0.0,
"learning_rate": 1.6438322133994986e-05,
"loss": 1.5725,
"step": 17860
},
{
"epoch": 2.479567057517519,
"grad_norm": 0.0,
"learning_rate": 1.6353507516489118e-05,
"loss": 1.5946,
"step": 17870
},
{
"epoch": 2.480954693679317,
"grad_norm": 0.0,
"learning_rate": 1.6268892780096322e-05,
"loss": 1.5852,
"step": 17880
},
{
"epoch": 2.482342329841116,
"grad_norm": 0.0,
"learning_rate": 1.618447812701137e-05,
"loss": 1.6317,
"step": 17890
},
{
"epoch": 2.483729966002914,
"grad_norm": 0.0,
"learning_rate": 1.6100263758950995e-05,
"loss": 1.5009,
"step": 17900
},
{
"epoch": 2.483729966002914,
"eval_loss": 1.5553832054138184,
"eval_runtime": 901.7145,
"eval_samples_per_second": 14.208,
"eval_steps_per_second": 1.777,
"step": 17900
}
],
"logging_steps": 10,
"max_steps": 21621,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 6.391892638486782e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}