{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 9.817204301075268,
"eval_steps": 500,
"global_step": 1600,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.030721966205837174,
"grad_norm": 5.377892017364502,
"learning_rate": 1.6460905349794242e-07,
"loss": 0.6829,
"step": 5
},
{
"epoch": 0.06144393241167435,
"grad_norm": 4.802157878875732,
"learning_rate": 3.7037037037037036e-07,
"loss": 0.6782,
"step": 10
},
{
"epoch": 0.09216589861751152,
"grad_norm": 4.367337226867676,
"learning_rate": 5.761316872427984e-07,
"loss": 0.6663,
"step": 15
},
{
"epoch": 0.1228878648233487,
"grad_norm": 3.747973918914795,
"learning_rate": 7.818930041152265e-07,
"loss": 0.655,
"step": 20
},
{
"epoch": 0.15360983102918588,
"grad_norm": 2.209118127822876,
"learning_rate": 9.876543209876544e-07,
"loss": 0.6173,
"step": 25
},
{
"epoch": 0.18433179723502305,
"grad_norm": 1.1678818464279175,
"learning_rate": 1.1934156378600823e-06,
"loss": 0.6048,
"step": 30
},
{
"epoch": 0.21505376344086022,
"grad_norm": 0.8436072468757629,
"learning_rate": 1.3991769547325104e-06,
"loss": 0.5856,
"step": 35
},
{
"epoch": 0.2457757296466974,
"grad_norm": 0.9035472273826599,
"learning_rate": 1.6049382716049383e-06,
"loss": 0.5731,
"step": 40
},
{
"epoch": 0.2764976958525346,
"grad_norm": 0.6879045963287354,
"learning_rate": 1.8106995884773665e-06,
"loss": 0.5672,
"step": 45
},
{
"epoch": 0.30721966205837176,
"grad_norm": 0.5572217106819153,
"learning_rate": 2.0164609053497946e-06,
"loss": 0.5652,
"step": 50
},
{
"epoch": 0.3379416282642089,
"grad_norm": 0.475868821144104,
"learning_rate": 2.222222222222222e-06,
"loss": 0.5569,
"step": 55
},
{
"epoch": 0.3686635944700461,
"grad_norm": 0.4037950932979584,
"learning_rate": 2.4279835390946504e-06,
"loss": 0.552,
"step": 60
},
{
"epoch": 0.39938556067588327,
"grad_norm": 0.3646543323993683,
"learning_rate": 2.6337448559670788e-06,
"loss": 0.5486,
"step": 65
},
{
"epoch": 0.43010752688172044,
"grad_norm": 0.3199547231197357,
"learning_rate": 2.8395061728395062e-06,
"loss": 0.548,
"step": 70
},
{
"epoch": 0.4608294930875576,
"grad_norm": 0.29529860615730286,
"learning_rate": 3.0452674897119346e-06,
"loss": 0.5412,
"step": 75
},
{
"epoch": 0.4915514592933948,
"grad_norm": 0.2877354025840759,
"learning_rate": 3.2510288065843625e-06,
"loss": 0.5384,
"step": 80
},
{
"epoch": 0.522273425499232,
"grad_norm": 0.27960506081581116,
"learning_rate": 3.4567901234567904e-06,
"loss": 0.5408,
"step": 85
},
{
"epoch": 0.5529953917050692,
"grad_norm": 0.3203478455543518,
"learning_rate": 3.6625514403292183e-06,
"loss": 0.5385,
"step": 90
},
{
"epoch": 0.5837173579109063,
"grad_norm": 0.293573260307312,
"learning_rate": 3.868312757201647e-06,
"loss": 0.5367,
"step": 95
},
{
"epoch": 0.6144393241167435,
"grad_norm": 0.2991442084312439,
"learning_rate": 4.074074074074074e-06,
"loss": 0.5318,
"step": 100
},
{
"epoch": 0.6451612903225806,
"grad_norm": 0.28697723150253296,
"learning_rate": 4.2798353909465025e-06,
"loss": 0.5325,
"step": 105
},
{
"epoch": 0.6758832565284179,
"grad_norm": 0.29979708790779114,
"learning_rate": 4.485596707818931e-06,
"loss": 0.5329,
"step": 110
},
{
"epoch": 0.706605222734255,
"grad_norm": 0.28516969084739685,
"learning_rate": 4.691358024691358e-06,
"loss": 0.5303,
"step": 115
},
{
"epoch": 0.7373271889400922,
"grad_norm": 0.2979312837123871,
"learning_rate": 4.897119341563787e-06,
"loss": 0.532,
"step": 120
},
{
"epoch": 0.7680491551459293,
"grad_norm": 0.29658472537994385,
"learning_rate": 5.102880658436214e-06,
"loss": 0.5361,
"step": 125
},
{
"epoch": 0.7987711213517665,
"grad_norm": 0.3170669972896576,
"learning_rate": 5.3086419753086425e-06,
"loss": 0.5307,
"step": 130
},
{
"epoch": 0.8294930875576036,
"grad_norm": 0.3079938590526581,
"learning_rate": 5.514403292181071e-06,
"loss": 0.5277,
"step": 135
},
{
"epoch": 0.8602150537634409,
"grad_norm": 0.33612361550331116,
"learning_rate": 5.720164609053498e-06,
"loss": 0.5298,
"step": 140
},
{
"epoch": 0.890937019969278,
"grad_norm": 0.3119032382965088,
"learning_rate": 5.925925925925926e-06,
"loss": 0.5248,
"step": 145
},
{
"epoch": 0.9216589861751152,
"grad_norm": 0.33814293146133423,
"learning_rate": 6.131687242798354e-06,
"loss": 0.5284,
"step": 150
},
{
"epoch": 0.9523809523809523,
"grad_norm": 0.322230726480484,
"learning_rate": 6.3374485596707825e-06,
"loss": 0.5281,
"step": 155
},
{
"epoch": 0.9831029185867896,
"grad_norm": 0.3144535720348358,
"learning_rate": 6.543209876543211e-06,
"loss": 0.5242,
"step": 160
},
{
"epoch": 1.012288786482335,
"grad_norm": 0.29515042901039124,
"learning_rate": 6.748971193415639e-06,
"loss": 0.5163,
"step": 165
},
{
"epoch": 1.043010752688172,
"grad_norm": 0.3272690176963806,
"learning_rate": 6.954732510288067e-06,
"loss": 0.5168,
"step": 170
},
{
"epoch": 1.0737327188940091,
"grad_norm": 0.38112202286720276,
"learning_rate": 7.160493827160494e-06,
"loss": 0.5175,
"step": 175
},
{
"epoch": 1.1044546850998465,
"grad_norm": 0.3627144992351532,
"learning_rate": 7.3662551440329225e-06,
"loss": 0.5146,
"step": 180
},
{
"epoch": 1.1351766513056836,
"grad_norm": 0.34759828448295593,
"learning_rate": 7.57201646090535e-06,
"loss": 0.5091,
"step": 185
},
{
"epoch": 1.1658986175115207,
"grad_norm": 0.32023346424102783,
"learning_rate": 7.77777777777778e-06,
"loss": 0.5154,
"step": 190
},
{
"epoch": 1.1966205837173578,
"grad_norm": 0.34595441818237305,
"learning_rate": 7.983539094650207e-06,
"loss": 0.514,
"step": 195
},
{
"epoch": 1.2273425499231951,
"grad_norm": 0.35786375403404236,
"learning_rate": 8.189300411522634e-06,
"loss": 0.515,
"step": 200
},
{
"epoch": 1.2580645161290323,
"grad_norm": 0.3521522283554077,
"learning_rate": 8.395061728395062e-06,
"loss": 0.5096,
"step": 205
},
{
"epoch": 1.2887864823348694,
"grad_norm": 0.3382728397846222,
"learning_rate": 8.60082304526749e-06,
"loss": 0.5092,
"step": 210
},
{
"epoch": 1.3195084485407067,
"grad_norm": 0.3584599494934082,
"learning_rate": 8.806584362139918e-06,
"loss": 0.5088,
"step": 215
},
{
"epoch": 1.3502304147465438,
"grad_norm": 0.3886154294013977,
"learning_rate": 9.012345679012346e-06,
"loss": 0.5103,
"step": 220
},
{
"epoch": 1.380952380952381,
"grad_norm": 0.35392460227012634,
"learning_rate": 9.218106995884775e-06,
"loss": 0.5122,
"step": 225
},
{
"epoch": 1.411674347158218,
"grad_norm": 0.3449483811855316,
"learning_rate": 9.423868312757202e-06,
"loss": 0.5101,
"step": 230
},
{
"epoch": 1.4423963133640554,
"grad_norm": 0.40504640340805054,
"learning_rate": 9.62962962962963e-06,
"loss": 0.5079,
"step": 235
},
{
"epoch": 1.4731182795698925,
"grad_norm": 0.3839814066886902,
"learning_rate": 9.835390946502057e-06,
"loss": 0.5075,
"step": 240
},
{
"epoch": 1.5038402457757296,
"grad_norm": 0.3998360335826874,
"learning_rate": 9.999994841278135e-06,
"loss": 0.5117,
"step": 245
},
{
"epoch": 1.5345622119815667,
"grad_norm": 0.3241407573223114,
"learning_rate": 9.99981428713058e-06,
"loss": 0.5116,
"step": 250
},
{
"epoch": 1.565284178187404,
"grad_norm": 0.3408064544200897,
"learning_rate": 9.999375807534642e-06,
"loss": 0.5086,
"step": 255
},
{
"epoch": 1.5960061443932412,
"grad_norm": 0.3956799805164337,
"learning_rate": 9.998679425110168e-06,
"loss": 0.5057,
"step": 260
},
{
"epoch": 1.6267281105990783,
"grad_norm": 0.34674304723739624,
"learning_rate": 9.997725175781445e-06,
"loss": 0.5042,
"step": 265
},
{
"epoch": 1.6574500768049156,
"grad_norm": 0.33803871273994446,
"learning_rate": 9.996513108775338e-06,
"loss": 0.5094,
"step": 270
},
{
"epoch": 1.6881720430107527,
"grad_norm": 0.3286557197570801,
"learning_rate": 9.995043286618752e-06,
"loss": 0.5082,
"step": 275
},
{
"epoch": 1.7188940092165899,
"grad_norm": 0.4859721064567566,
"learning_rate": 9.993315785135417e-06,
"loss": 0.5062,
"step": 280
},
{
"epoch": 1.7496159754224272,
"grad_norm": 0.39187705516815186,
"learning_rate": 9.991330693441956e-06,
"loss": 0.5004,
"step": 285
},
{
"epoch": 1.780337941628264,
"grad_norm": 0.3706142008304596,
"learning_rate": 9.989088113943309e-06,
"loss": 0.5074,
"step": 290
},
{
"epoch": 1.8110599078341014,
"grad_norm": 0.36376601457595825,
"learning_rate": 9.986588162327436e-06,
"loss": 0.5043,
"step": 295
},
{
"epoch": 1.8417818740399385,
"grad_norm": 0.3372829854488373,
"learning_rate": 9.983830967559355e-06,
"loss": 0.505,
"step": 300
},
{
"epoch": 1.8725038402457757,
"grad_norm": 0.3605220913887024,
"learning_rate": 9.98081667187449e-06,
"loss": 0.506,
"step": 305
},
{
"epoch": 1.903225806451613,
"grad_norm": 0.37473252415657043,
"learning_rate": 9.977545430771332e-06,
"loss": 0.5065,
"step": 310
},
{
"epoch": 1.93394777265745,
"grad_norm": 0.3622889816761017,
"learning_rate": 9.974017413003407e-06,
"loss": 0.5049,
"step": 315
},
{
"epoch": 1.9646697388632872,
"grad_norm": 0.36003556847572327,
"learning_rate": 9.970232800570594e-06,
"loss": 0.5042,
"step": 320
},
{
"epoch": 1.9953917050691246,
"grad_norm": 0.35878923535346985,
"learning_rate": 9.966191788709716e-06,
"loss": 0.498,
"step": 325
},
{
"epoch": 2.02457757296467,
"grad_norm": 0.3277081847190857,
"learning_rate": 9.961894585884472e-06,
"loss": 0.4833,
"step": 330
},
{
"epoch": 2.055299539170507,
"grad_norm": 0.35245636105537415,
"learning_rate": 9.957341413774693e-06,
"loss": 0.4823,
"step": 335
},
{
"epoch": 2.086021505376344,
"grad_norm": 0.3628138601779938,
"learning_rate": 9.952532507264892e-06,
"loss": 0.4789,
"step": 340
},
{
"epoch": 2.1167434715821813,
"grad_norm": 0.36662936210632324,
"learning_rate": 9.947468114432156e-06,
"loss": 0.4876,
"step": 345
},
{
"epoch": 2.1474654377880182,
"grad_norm": 0.3806234896183014,
"learning_rate": 9.942148496533348e-06,
"loss": 0.4797,
"step": 350
},
{
"epoch": 2.1781874039938556,
"grad_norm": 0.3836243152618408,
"learning_rate": 9.936573927991631e-06,
"loss": 0.4823,
"step": 355
},
{
"epoch": 2.208909370199693,
"grad_norm": 0.3716926872730255,
"learning_rate": 9.930744696382298e-06,
"loss": 0.4846,
"step": 360
},
{
"epoch": 2.23963133640553,
"grad_norm": 0.3589572608470917,
"learning_rate": 9.924661102417959e-06,
"loss": 0.4794,
"step": 365
},
{
"epoch": 2.270353302611367,
"grad_norm": 0.44799497723579407,
"learning_rate": 9.918323459933006e-06,
"loss": 0.4849,
"step": 370
},
{
"epoch": 2.3010752688172045,
"grad_norm": 0.35237064957618713,
"learning_rate": 9.911732095867443e-06,
"loss": 0.4819,
"step": 375
},
{
"epoch": 2.3317972350230414,
"grad_norm": 0.3844442665576935,
"learning_rate": 9.904887350250002e-06,
"loss": 0.4828,
"step": 380
},
{
"epoch": 2.3625192012288787,
"grad_norm": 0.34357205033302307,
"learning_rate": 9.897789576180617e-06,
"loss": 0.4795,
"step": 385
},
{
"epoch": 2.3932411674347156,
"grad_norm": 0.34739232063293457,
"learning_rate": 9.8904391398122e-06,
"loss": 0.4817,
"step": 390
},
{
"epoch": 2.423963133640553,
"grad_norm": 0.3262459337711334,
"learning_rate": 9.882836420331753e-06,
"loss": 0.4807,
"step": 395
},
{
"epoch": 2.4546850998463903,
"grad_norm": 0.32715994119644165,
"learning_rate": 9.87498180994081e-06,
"loss": 0.4825,
"step": 400
},
{
"epoch": 2.485407066052227,
"grad_norm": 0.3524874150753021,
"learning_rate": 9.8668757138352e-06,
"loss": 0.4832,
"step": 405
},
{
"epoch": 2.5161290322580645,
"grad_norm": 0.3556855618953705,
"learning_rate": 9.858518550184154e-06,
"loss": 0.4784,
"step": 410
},
{
"epoch": 2.546850998463902,
"grad_norm": 0.350763201713562,
"learning_rate": 9.849910750108718e-06,
"loss": 0.4796,
"step": 415
},
{
"epoch": 2.5775729646697387,
"grad_norm": 0.40554359555244446,
"learning_rate": 9.841052757659525e-06,
"loss": 0.4795,
"step": 420
},
{
"epoch": 2.608294930875576,
"grad_norm": 0.38155123591423035,
"learning_rate": 9.831945029793884e-06,
"loss": 0.4824,
"step": 425
},
{
"epoch": 2.6390168970814134,
"grad_norm": 0.34588319063186646,
"learning_rate": 9.822588036352201e-06,
"loss": 0.4812,
"step": 430
},
{
"epoch": 2.6697388632872503,
"grad_norm": 0.3738536536693573,
"learning_rate": 9.812982260033753e-06,
"loss": 0.4776,
"step": 435
},
{
"epoch": 2.7004608294930876,
"grad_norm": 0.34988853335380554,
"learning_rate": 9.803128196371778e-06,
"loss": 0.4827,
"step": 440
},
{
"epoch": 2.731182795698925,
"grad_norm": 0.3567947447299957,
"learning_rate": 9.793026353707915e-06,
"loss": 0.4824,
"step": 445
},
{
"epoch": 2.761904761904762,
"grad_norm": 0.3680736720561981,
"learning_rate": 9.782677253165979e-06,
"loss": 0.4817,
"step": 450
},
{
"epoch": 2.792626728110599,
"grad_norm": 0.3302510380744934,
"learning_rate": 9.77208142862508e-06,
"loss": 0.4799,
"step": 455
},
{
"epoch": 2.823348694316436,
"grad_norm": 0.3362921178340912,
"learning_rate": 9.761239426692077e-06,
"loss": 0.4792,
"step": 460
},
{
"epoch": 2.8540706605222734,
"grad_norm": 0.3084135949611664,
"learning_rate": 9.750151806673389e-06,
"loss": 0.4798,
"step": 465
},
{
"epoch": 2.8847926267281108,
"grad_norm": 0.3991258442401886,
"learning_rate": 9.738819140546135e-06,
"loss": 0.4776,
"step": 470
},
{
"epoch": 2.9155145929339477,
"grad_norm": 0.3840397298336029,
"learning_rate": 9.727242012928622e-06,
"loss": 0.4827,
"step": 475
},
{
"epoch": 2.946236559139785,
"grad_norm": 0.3366018235683441,
"learning_rate": 9.715421021050205e-06,
"loss": 0.478,
"step": 480
},
{
"epoch": 2.976958525345622,
"grad_norm": 0.3289054036140442,
"learning_rate": 9.703356774720454e-06,
"loss": 0.4806,
"step": 485
},
{
"epoch": 3.0061443932411676,
"grad_norm": 0.41949138045310974,
"learning_rate": 9.69104989629772e-06,
"loss": 0.474,
"step": 490
},
{
"epoch": 3.0368663594470044,
"grad_norm": 0.3739219605922699,
"learning_rate": 9.678501020657008e-06,
"loss": 0.4555,
"step": 495
},
{
"epoch": 3.067588325652842,
"grad_norm": 0.3918289244174957,
"learning_rate": 9.665710795157236e-06,
"loss": 0.4559,
"step": 500
},
{
"epoch": 3.098310291858679,
"grad_norm": 0.37078753113746643,
"learning_rate": 9.652679879607843e-06,
"loss": 0.4523,
"step": 505
},
{
"epoch": 3.129032258064516,
"grad_norm": 0.39428192377090454,
"learning_rate": 9.639408946234745e-06,
"loss": 0.455,
"step": 510
},
{
"epoch": 3.1597542242703534,
"grad_norm": 0.36103686690330505,
"learning_rate": 9.625898679645656e-06,
"loss": 0.4539,
"step": 515
},
{
"epoch": 3.1904761904761907,
"grad_norm": 0.3871241807937622,
"learning_rate": 9.612149776794776e-06,
"loss": 0.4585,
"step": 520
},
{
"epoch": 3.2211981566820276,
"grad_norm": 0.3608538508415222,
"learning_rate": 9.59816294694684e-06,
"loss": 0.4545,
"step": 525
},
{
"epoch": 3.251920122887865,
"grad_norm": 0.33820873498916626,
"learning_rate": 9.583938911640513e-06,
"loss": 0.4581,
"step": 530
},
{
"epoch": 3.282642089093702,
"grad_norm": 0.3311152160167694,
"learning_rate": 9.569478404651192e-06,
"loss": 0.4572,
"step": 535
},
{
"epoch": 3.313364055299539,
"grad_norm": 0.3974754512310028,
"learning_rate": 9.55478217195313e-06,
"loss": 0.4579,
"step": 540
},
{
"epoch": 3.3440860215053765,
"grad_norm": 0.36764049530029297,
"learning_rate": 9.53985097168097e-06,
"loss": 0.4548,
"step": 545
},
{
"epoch": 3.3748079877112134,
"grad_norm": 0.3310830295085907,
"learning_rate": 9.524685574090627e-06,
"loss": 0.4596,
"step": 550
},
{
"epoch": 3.4055299539170507,
"grad_norm": 0.35807356238365173,
"learning_rate": 9.50928676151955e-06,
"loss": 0.4561,
"step": 555
},
{
"epoch": 3.436251920122888,
"grad_norm": 0.3509482741355896,
"learning_rate": 9.493655328346378e-06,
"loss": 0.4601,
"step": 560
},
{
"epoch": 3.466973886328725,
"grad_norm": 0.32899123430252075,
"learning_rate": 9.477792080949938e-06,
"loss": 0.458,
"step": 565
},
{
"epoch": 3.4976958525345623,
"grad_norm": 0.3493783473968506,
"learning_rate": 9.461697837667668e-06,
"loss": 0.4578,
"step": 570
},
{
"epoch": 3.528417818740399,
"grad_norm": 0.42410966753959656,
"learning_rate": 9.445373428753386e-06,
"loss": 0.457,
"step": 575
},
{
"epoch": 3.5591397849462365,
"grad_norm": 0.39236894249916077,
"learning_rate": 9.42881969633447e-06,
"loss": 0.4621,
"step": 580
},
{
"epoch": 3.589861751152074,
"grad_norm": 0.3428690433502197,
"learning_rate": 9.412037494368412e-06,
"loss": 0.4613,
"step": 585
},
{
"epoch": 3.6205837173579107,
"grad_norm": 0.3500923216342926,
"learning_rate": 9.395027688598756e-06,
"loss": 0.4553,
"step": 590
},
{
"epoch": 3.651305683563748,
"grad_norm": 0.3521360456943512,
"learning_rate": 9.377791156510456e-06,
"loss": 0.4609,
"step": 595
},
{
"epoch": 3.6820276497695854,
"grad_norm": 0.3520371615886688,
"learning_rate": 9.360328787284587e-06,
"loss": 0.4561,
"step": 600
},
{
"epoch": 3.7127496159754223,
"grad_norm": 0.37317851185798645,
"learning_rate": 9.342641481752492e-06,
"loss": 0.4543,
"step": 605
},
{
"epoch": 3.7434715821812596,
"grad_norm": 0.3881990313529968,
"learning_rate": 9.324730152349305e-06,
"loss": 0.4573,
"step": 610
},
{
"epoch": 3.774193548387097,
"grad_norm": 0.34541720151901245,
"learning_rate": 9.306595723066878e-06,
"loss": 0.4588,
"step": 615
},
{
"epoch": 3.804915514592934,
"grad_norm": 0.6174806356430054,
"learning_rate": 9.28823912940612e-06,
"loss": 0.4615,
"step": 620
},
{
"epoch": 3.835637480798771,
"grad_norm": 0.37580618262290955,
"learning_rate": 9.26966131832873e-06,
"loss": 0.4603,
"step": 625
},
{
"epoch": 3.8663594470046085,
"grad_norm": 0.3373568058013916,
"learning_rate": 9.250863248208357e-06,
"loss": 0.4575,
"step": 630
},
{
"epoch": 3.8970814132104454,
"grad_norm": 0.3492389917373657,
"learning_rate": 9.231845888781153e-06,
"loss": 0.457,
"step": 635
},
{
"epoch": 3.9278033794162828,
"grad_norm": 0.353481262922287,
"learning_rate": 9.212610221095748e-06,
"loss": 0.4593,
"step": 640
},
{
"epoch": 3.9585253456221197,
"grad_norm": 0.339603066444397,
"learning_rate": 9.193157237462642e-06,
"loss": 0.4583,
"step": 645
},
{
"epoch": 3.989247311827957,
"grad_norm": 0.35986068844795227,
"learning_rate": 9.173487941403011e-06,
"loss": 0.4575,
"step": 650
},
{
"epoch": 4.018433179723503,
"grad_norm": 0.39629873633384705,
"learning_rate": 9.153603347596946e-06,
"loss": 0.4437,
"step": 655
},
{
"epoch": 4.04915514592934,
"grad_norm": 0.38085299730300903,
"learning_rate": 9.133504481831103e-06,
"loss": 0.4315,
"step": 660
},
{
"epoch": 4.0798771121351765,
"grad_norm": 0.375144898891449,
"learning_rate": 9.113192380945783e-06,
"loss": 0.4332,
"step": 665
},
{
"epoch": 4.110599078341014,
"grad_norm": 0.3690689206123352,
"learning_rate": 9.092668092781454e-06,
"loss": 0.4286,
"step": 670
},
{
"epoch": 4.141321044546851,
"grad_norm": 0.3713686764240265,
"learning_rate": 9.071932676124686e-06,
"loss": 0.4321,
"step": 675
},
{
"epoch": 4.172043010752688,
"grad_norm": 0.37255361676216125,
"learning_rate": 9.050987200653538e-06,
"loss": 0.4308,
"step": 680
},
{
"epoch": 4.202764976958525,
"grad_norm": 0.4153440296649933,
"learning_rate": 9.029832746882372e-06,
"loss": 0.434,
"step": 685
},
{
"epoch": 4.233486943164363,
"grad_norm": 0.3848015367984772,
"learning_rate": 9.008470406106118e-06,
"loss": 0.4321,
"step": 690
},
{
"epoch": 4.2642089093702,
"grad_norm": 0.38491949439048767,
"learning_rate": 8.986901280343973e-06,
"loss": 0.437,
"step": 695
},
{
"epoch": 4.2949308755760365,
"grad_norm": 0.40272125601768494,
"learning_rate": 8.96512648228255e-06,
"loss": 0.4327,
"step": 700
},
{
"epoch": 4.325652841781874,
"grad_norm": 0.40901532769203186,
"learning_rate": 8.943147135218482e-06,
"loss": 0.4355,
"step": 705
},
{
"epoch": 4.356374807987711,
"grad_norm": 0.37816864252090454,
"learning_rate": 8.920964373000474e-06,
"loss": 0.4309,
"step": 710
},
{
"epoch": 4.387096774193548,
"grad_norm": 0.3686360716819763,
"learning_rate": 8.898579339970806e-06,
"loss": 0.4333,
"step": 715
},
{
"epoch": 4.417818740399386,
"grad_norm": 0.3904341161251068,
"learning_rate": 8.875993190906309e-06,
"loss": 0.436,
"step": 720
},
{
"epoch": 4.448540706605223,
"grad_norm": 0.369642049074173,
"learning_rate": 8.85320709095878e-06,
"loss": 0.4393,
"step": 725
},
{
"epoch": 4.47926267281106,
"grad_norm": 0.39105841517448425,
"learning_rate": 8.83022221559489e-06,
"loss": 0.4335,
"step": 730
},
{
"epoch": 4.509984639016897,
"grad_norm": 0.35647451877593994,
"learning_rate": 8.80703975053554e-06,
"loss": 0.4365,
"step": 735
},
{
"epoch": 4.540706605222734,
"grad_norm": 0.37886905670166016,
"learning_rate": 8.783660891694683e-06,
"loss": 0.4358,
"step": 740
},
{
"epoch": 4.571428571428571,
"grad_norm": 0.33613675832748413,
"learning_rate": 8.760086845117648e-06,
"loss": 0.4339,
"step": 745
},
{
"epoch": 4.602150537634409,
"grad_norm": 0.3609409034252167,
"learning_rate": 8.736318826918909e-06,
"loss": 0.4367,
"step": 750
},
{
"epoch": 4.632872503840246,
"grad_norm": 0.3324005603790283,
"learning_rate": 8.71235806321936e-06,
"loss": 0.4368,
"step": 755
},
{
"epoch": 4.663594470046083,
"grad_norm": 0.34170496463775635,
"learning_rate": 8.688205790083053e-06,
"loss": 0.4364,
"step": 760
},
{
"epoch": 4.6943164362519205,
"grad_norm": 0.3765306770801544,
"learning_rate": 8.663863253453444e-06,
"loss": 0.4381,
"step": 765
},
{
"epoch": 4.725038402457757,
"grad_norm": 0.3638916611671448,
"learning_rate": 8.639331709089107e-06,
"loss": 0.438,
"step": 770
},
{
"epoch": 4.755760368663594,
"grad_norm": 0.3378274738788605,
"learning_rate": 8.614612422498965e-06,
"loss": 0.4396,
"step": 775
},
{
"epoch": 4.786482334869431,
"grad_norm": 0.3760294020175934,
"learning_rate": 8.589706668876995e-06,
"loss": 0.4387,
"step": 780
},
{
"epoch": 4.817204301075269,
"grad_norm": 0.3364088535308838,
"learning_rate": 8.564615733036457e-06,
"loss": 0.4388,
"step": 785
},
{
"epoch": 4.847926267281106,
"grad_norm": 0.3584051728248596,
"learning_rate": 8.539340909343597e-06,
"loss": 0.4355,
"step": 790
},
{
"epoch": 4.878648233486944,
"grad_norm": 0.3589382469654083,
"learning_rate": 8.513883501650892e-06,
"loss": 0.4393,
"step": 795
},
{
"epoch": 4.9093701996927805,
"grad_norm": 0.362913578748703,
"learning_rate": 8.488244823229781e-06,
"loss": 0.4391,
"step": 800
},
{
"epoch": 4.940092165898617,
"grad_norm": 0.38569971919059753,
"learning_rate": 8.462426196702912e-06,
"loss": 0.44,
"step": 805
},
{
"epoch": 4.970814132104454,
"grad_norm": 0.7799672484397888,
"learning_rate": 8.436428953975921e-06,
"loss": 0.4402,
"step": 810
},
{
"epoch": 5.0,
"grad_norm": 0.35950392484664917,
"learning_rate": 8.41025443616872e-06,
"loss": 0.4385,
"step": 815
},
{
"epoch": 5.030721966205837,
"grad_norm": 0.434950053691864,
"learning_rate": 8.38390399354631e-06,
"loss": 0.4124,
"step": 820
},
{
"epoch": 5.061443932411675,
"grad_norm": 0.38890355825424194,
"learning_rate": 8.357378985449124e-06,
"loss": 0.4077,
"step": 825
},
{
"epoch": 5.092165898617512,
"grad_norm": 0.3747502267360687,
"learning_rate": 8.330680780222907e-06,
"loss": 0.4116,
"step": 830
},
{
"epoch": 5.1228878648233485,
"grad_norm": 0.4041999578475952,
"learning_rate": 8.303810755148127e-06,
"loss": 0.4125,
"step": 835
},
{
"epoch": 5.153609831029186,
"grad_norm": 0.8506478667259216,
"learning_rate": 8.276770296368922e-06,
"loss": 0.4086,
"step": 840
},
{
"epoch": 5.184331797235023,
"grad_norm": 0.43535116314888,
"learning_rate": 8.249560798821592e-06,
"loss": 0.4118,
"step": 845
},
{
"epoch": 5.21505376344086,
"grad_norm": 0.4166457951068878,
"learning_rate": 8.222183666162647e-06,
"loss": 0.41,
"step": 850
},
{
"epoch": 5.245775729646698,
"grad_norm": 0.3790026009082794,
"learning_rate": 8.194640310696383e-06,
"loss": 0.4131,
"step": 855
},
{
"epoch": 5.276497695852535,
"grad_norm": 0.4068205654621124,
"learning_rate": 8.16693215330204e-06,
"loss": 0.4149,
"step": 860
},
{
"epoch": 5.307219662058372,
"grad_norm": 0.40233853459358215,
"learning_rate": 8.139060623360494e-06,
"loss": 0.414,
"step": 865
},
{
"epoch": 5.337941628264209,
"grad_norm": 0.4058436155319214,
"learning_rate": 8.111027158680516e-06,
"loss": 0.4128,
"step": 870
},
{
"epoch": 5.368663594470046,
"grad_norm": 0.35581091046333313,
"learning_rate": 8.082833205424614e-06,
"loss": 0.412,
"step": 875
},
{
"epoch": 5.399385560675883,
"grad_norm": 0.39174729585647583,
"learning_rate": 8.054480218034415e-06,
"loss": 0.4127,
"step": 880
},
{
"epoch": 5.43010752688172,
"grad_norm": 0.4122447371482849,
"learning_rate": 8.02596965915564e-06,
"loss": 0.4143,
"step": 885
},
{
"epoch": 5.460829493087558,
"grad_norm": 0.37394076585769653,
"learning_rate": 7.997302999562657e-06,
"loss": 0.4165,
"step": 890
},
{
"epoch": 5.491551459293395,
"grad_norm": 0.38974493741989136,
"learning_rate": 7.968481718082601e-06,
"loss": 0.4158,
"step": 895
},
{
"epoch": 5.522273425499232,
"grad_norm": 0.3667392134666443,
"learning_rate": 7.93950730151908e-06,
"loss": 0.4186,
"step": 900
},
{
"epoch": 5.552995391705069,
"grad_norm": 0.3641802668571472,
"learning_rate": 7.910381244575491e-06,
"loss": 0.4146,
"step": 905
},
{
"epoch": 5.583717357910906,
"grad_norm": 0.37418097257614136,
"learning_rate": 7.881105049777902e-06,
"loss": 0.4146,
"step": 910
},
{
"epoch": 5.614439324116743,
"grad_norm": 0.3662942051887512,
"learning_rate": 7.851680227397541e-06,
"loss": 0.4181,
"step": 915
},
{
"epoch": 5.645161290322581,
"grad_norm": 0.3564474284648895,
"learning_rate": 7.82210829537289e-06,
"loss": 0.4122,
"step": 920
},
{
"epoch": 5.675883256528418,
"grad_norm": 0.3735935091972351,
"learning_rate": 7.792390779231374e-06,
"loss": 0.4152,
"step": 925
},
{
"epoch": 5.706605222734255,
"grad_norm": 0.3896511197090149,
"learning_rate": 7.762529212010675e-06,
"loss": 0.4125,
"step": 930
},
{
"epoch": 5.7373271889400925,
"grad_norm": 0.42632153630256653,
"learning_rate": 7.732525134179625e-06,
"loss": 0.4138,
"step": 935
},
{
"epoch": 5.768049155145929,
"grad_norm": 0.3700067698955536,
"learning_rate": 7.702380093558766e-06,
"loss": 0.4128,
"step": 940
},
{
"epoch": 5.798771121351766,
"grad_norm": 0.3713553547859192,
"learning_rate": 7.672095645240479e-06,
"loss": 0.4153,
"step": 945
},
{
"epoch": 5.829493087557603,
"grad_norm": 0.49530503153800964,
"learning_rate": 7.641673351508774e-06,
"loss": 0.4159,
"step": 950
},
{
"epoch": 5.860215053763441,
"grad_norm": 0.3351239562034607,
"learning_rate": 7.6111147817586925e-06,
"loss": 0.4181,
"step": 955
},
{
"epoch": 5.890937019969278,
"grad_norm": 0.3583086133003235,
"learning_rate": 7.580421512415349e-06,
"loss": 0.4148,
"step": 960
},
{
"epoch": 5.921658986175116,
"grad_norm": 0.3566780388355255,
"learning_rate": 7.549595126852605e-06,
"loss": 0.4133,
"step": 965
},
{
"epoch": 5.9523809523809526,
"grad_norm": 0.3630661964416504,
"learning_rate": 7.518637215311388e-06,
"loss": 0.4151,
"step": 970
},
{
"epoch": 5.983102918586789,
"grad_norm": 0.35128363966941833,
"learning_rate": 7.487549374817662e-06,
"loss": 0.4159,
"step": 975
},
{
"epoch": 6.012288786482335,
"grad_norm": 0.442436158657074,
"learning_rate": 7.456333209100032e-06,
"loss": 0.4034,
"step": 980
},
{
"epoch": 6.043010752688172,
"grad_norm": 0.41370487213134766,
"learning_rate": 7.424990328507017e-06,
"loss": 0.3851,
"step": 985
},
{
"epoch": 6.073732718894009,
"grad_norm": 0.4333699941635132,
"learning_rate": 7.393522349923981e-06,
"loss": 0.3869,
"step": 990
},
{
"epoch": 6.104454685099847,
"grad_norm": 0.4446549415588379,
"learning_rate": 7.361930896689713e-06,
"loss": 0.3836,
"step": 995
},
{
"epoch": 6.135176651305684,
"grad_norm": 0.40873849391937256,
"learning_rate": 7.330217598512696e-06,
"loss": 0.3857,
"step": 1000
},
{
"epoch": 6.1658986175115205,
"grad_norm": 0.4244365990161896,
"learning_rate": 7.2983840913870215e-06,
"loss": 0.3863,
"step": 1005
},
{
"epoch": 6.196620583717358,
"grad_norm": 0.3845650851726532,
"learning_rate": 7.266432017508008e-06,
"loss": 0.3901,
"step": 1010
},
{
"epoch": 6.227342549923195,
"grad_norm": 0.39868220686912537,
"learning_rate": 7.234363025187474e-06,
"loss": 0.3855,
"step": 1015
},
{
"epoch": 6.258064516129032,
"grad_norm": 0.37892380356788635,
"learning_rate": 7.202178768768711e-06,
"loss": 0.3928,
"step": 1020
},
{
"epoch": 6.28878648233487,
"grad_norm": 0.3923156261444092,
"learning_rate": 7.169880908541136e-06,
"loss": 0.3921,
"step": 1025
},
{
"epoch": 6.319508448540707,
"grad_norm": 0.39880916476249695,
"learning_rate": 7.137471110654656e-06,
"loss": 0.3938,
"step": 1030
},
{
"epoch": 6.350230414746544,
"grad_norm": 0.4066980481147766,
"learning_rate": 7.104951047033697e-06,
"loss": 0.3906,
"step": 1035
},
{
"epoch": 6.380952380952381,
"grad_norm": 0.3751300573348999,
"learning_rate": 7.0723223952909694e-06,
"loss": 0.3909,
"step": 1040
},
{
"epoch": 6.411674347158218,
"grad_norm": 0.3525156080722809,
"learning_rate": 7.039586838640918e-06,
"loss": 0.3894,
"step": 1045
},
{
"epoch": 6.442396313364055,
"grad_norm": 0.36944258213043213,
"learning_rate": 7.006746065812895e-06,
"loss": 0.3909,
"step": 1050
},
{
"epoch": 6.473118279569892,
"grad_norm": 0.3836762011051178,
"learning_rate": 6.973801770964031e-06,
"loss": 0.3896,
"step": 1055
},
{
"epoch": 6.50384024577573,
"grad_norm": 0.41395968198776245,
"learning_rate": 6.940755653591859e-06,
"loss": 0.3889,
"step": 1060
},
{
"epoch": 6.534562211981567,
"grad_norm": 0.42266151309013367,
"learning_rate": 6.907609418446623e-06,
"loss": 0.3924,
"step": 1065
},
{
"epoch": 6.565284178187404,
"grad_norm": 0.38352274894714355,
"learning_rate": 6.8743647754433485e-06,
"loss": 0.3934,
"step": 1070
},
{
"epoch": 6.596006144393241,
"grad_norm": 0.3761062026023865,
"learning_rate": 6.841023439573623e-06,
"loss": 0.3915,
"step": 1075
},
{
"epoch": 6.626728110599078,
"grad_norm": 0.38670945167541504,
"learning_rate": 6.807587130817134e-06,
"loss": 0.3925,
"step": 1080
},
{
"epoch": 6.657450076804915,
"grad_norm": 0.36626312136650085,
"learning_rate": 6.774057574052932e-06,
"loss": 0.3944,
"step": 1085
},
{
"epoch": 6.688172043010753,
"grad_norm": 0.4045194685459137,
"learning_rate": 6.740436498970453e-06,
"loss": 0.3955,
"step": 1090
},
{
"epoch": 6.71889400921659,
"grad_norm": 0.4138599932193756,
"learning_rate": 6.706725639980294e-06,
"loss": 0.3929,
"step": 1095
},
{
"epoch": 6.749615975422427,
"grad_norm": 0.39506402611732483,
"learning_rate": 6.6729267361247295e-06,
"loss": 0.3883,
"step": 1100
},
{
"epoch": 6.7803379416282645,
"grad_norm": 0.3903568387031555,
"learning_rate": 6.639041530988009e-06,
"loss": 0.3939,
"step": 1105
},
{
"epoch": 6.811059907834101,
"grad_norm": 0.3678980767726898,
"learning_rate": 6.605071772606404e-06,
"loss": 0.394,
"step": 1110
},
{
"epoch": 6.841781874039938,
"grad_norm": 0.35300132632255554,
"learning_rate": 6.571019213378034e-06,
"loss": 0.391,
"step": 1115
},
{
"epoch": 6.872503840245776,
"grad_norm": 0.3788436949253082,
"learning_rate": 6.536885609972467e-06,
"loss": 0.397,
"step": 1120
},
{
"epoch": 6.903225806451613,
"grad_norm": 0.38878560066223145,
"learning_rate": 6.502672723240103e-06,
"loss": 0.3969,
"step": 1125
},
{
"epoch": 6.93394777265745,
"grad_norm": 0.4072780907154083,
"learning_rate": 6.4683823181213224e-06,
"loss": 0.3969,
"step": 1130
},
{
"epoch": 6.964669738863288,
"grad_norm": 0.40496107935905457,
"learning_rate": 6.434016163555452e-06,
"loss": 0.3957,
"step": 1135
},
{
"epoch": 6.9953917050691246,
"grad_norm": 0.3747064173221588,
"learning_rate": 6.399576032389505e-06,
"loss": 0.3984,
"step": 1140
},
{
"epoch": 7.024577572964669,
"grad_norm": 0.5090351104736328,
"learning_rate": 6.365063701286728e-06,
"loss": 0.3714,
"step": 1145
},
{
"epoch": 7.055299539170507,
"grad_norm": 0.42551228404045105,
"learning_rate": 6.330480950634942e-06,
"loss": 0.3673,
"step": 1150
},
{
"epoch": 7.086021505376344,
"grad_norm": 0.4707318842411041,
"learning_rate": 6.2958295644547026e-06,
"loss": 0.3641,
"step": 1155
},
{
"epoch": 7.116743471582181,
"grad_norm": 0.40848663449287415,
"learning_rate": 6.261111330307272e-06,
"loss": 0.3628,
"step": 1160
},
{
"epoch": 7.147465437788019,
"grad_norm": 0.4382622539997101,
"learning_rate": 6.22632803920239e-06,
"loss": 0.3691,
"step": 1165
},
{
"epoch": 7.178187403993856,
"grad_norm": 0.3866026699542999,
"learning_rate": 6.191481485505898e-06,
"loss": 0.3639,
"step": 1170
},
{
"epoch": 7.2089093701996925,
"grad_norm": 0.4263141453266144,
"learning_rate": 6.1565734668471614e-06,
"loss": 0.3634,
"step": 1175
},
{
"epoch": 7.23963133640553,
"grad_norm": 0.4050372242927551,
"learning_rate": 6.121605784026339e-06,
"loss": 0.3648,
"step": 1180
},
{
"epoch": 7.270353302611367,
"grad_norm": 0.3879098892211914,
"learning_rate": 6.086580240921486e-06,
"loss": 0.3667,
"step": 1185
},
{
"epoch": 7.301075268817204,
"grad_norm": 0.4055810868740082,
"learning_rate": 6.051498644395496e-06,
"loss": 0.3656,
"step": 1190
},
{
"epoch": 7.331797235023042,
"grad_norm": 0.42201170325279236,
"learning_rate": 6.01636280420289e-06,
"loss": 0.3679,
"step": 1195
},
{
"epoch": 7.362519201228879,
"grad_norm": 0.4164835214614868,
"learning_rate": 5.981174532896459e-06,
"loss": 0.367,
"step": 1200
},
{
"epoch": 7.393241167434716,
"grad_norm": 0.39605438709259033,
"learning_rate": 5.9459356457337556e-06,
"loss": 0.3647,
"step": 1205
},
{
"epoch": 7.423963133640553,
"grad_norm": 0.4393250644207001,
"learning_rate": 5.910647960583458e-06,
"loss": 0.3733,
"step": 1210
},
{
"epoch": 7.45468509984639,
"grad_norm": 0.37553438544273376,
"learning_rate": 5.875313297831579e-06,
"loss": 0.37,
"step": 1215
},
{
"epoch": 7.485407066052227,
"grad_norm": 0.3898600935935974,
"learning_rate": 5.839933480287572e-06,
"loss": 0.3678,
"step": 1220
},
{
"epoch": 7.516129032258064,
"grad_norm": 0.4083476662635803,
"learning_rate": 5.804510333090287e-06,
"loss": 0.3665,
"step": 1225
},
{
"epoch": 7.546850998463902,
"grad_norm": 0.40433645248413086,
"learning_rate": 5.769045683613822e-06,
"loss": 0.3715,
"step": 1230
},
{
"epoch": 7.577572964669739,
"grad_norm": 0.4303235709667206,
"learning_rate": 5.733541361373253e-06,
"loss": 0.3711,
"step": 1235
},
{
"epoch": 7.6082949308755765,
"grad_norm": 0.40306177735328674,
"learning_rate": 5.697999197930259e-06,
"loss": 0.3659,
"step": 1240
},
{
"epoch": 7.639016897081413,
"grad_norm": 0.39787065982818604,
"learning_rate": 5.662421026798624e-06,
"loss": 0.3722,
"step": 1245
},
{
"epoch": 7.66973886328725,
"grad_norm": 0.401962012052536,
"learning_rate": 5.626808683349672e-06,
"loss": 0.3691,
"step": 1250
},
{
"epoch": 7.700460829493087,
"grad_norm": 0.38256722688674927,
"learning_rate": 5.591164004717567e-06,
"loss": 0.3694,
"step": 1255
},
{
"epoch": 7.731182795698925,
"grad_norm": 0.4020300507545471,
"learning_rate": 5.55548882970455e-06,
"loss": 0.3728,
"step": 1260
},
{
"epoch": 7.761904761904762,
"grad_norm": 0.41450026631355286,
"learning_rate": 5.519784998686081e-06,
"loss": 0.3673,
"step": 1265
},
{
"epoch": 7.792626728110599,
"grad_norm": 0.36544522643089294,
"learning_rate": 5.484054353515896e-06,
"loss": 0.3729,
"step": 1270
},
{
"epoch": 7.8233486943164365,
"grad_norm": 0.38962146639823914,
"learning_rate": 5.448298737430992e-06,
"loss": 0.3697,
"step": 1275
},
{
"epoch": 7.854070660522273,
"grad_norm": 0.425886869430542,
"learning_rate": 5.412519994956543e-06,
"loss": 0.3733,
"step": 1280
},
{
"epoch": 7.88479262672811,
"grad_norm": 0.3979520797729492,
"learning_rate": 5.376719971810741e-06,
"loss": 0.3734,
"step": 1285
},
{
"epoch": 7.915514592933948,
"grad_norm": 0.38723668456077576,
"learning_rate": 5.340900514809587e-06,
"loss": 0.3726,
"step": 1290
},
{
"epoch": 7.946236559139785,
"grad_norm": 0.37770572304725647,
"learning_rate": 5.305063471771614e-06,
"loss": 0.3699,
"step": 1295
},
{
"epoch": 7.976958525345622,
"grad_norm": 0.398049533367157,
"learning_rate": 5.26921069142257e-06,
"loss": 0.3717,
"step": 1300
},
{
"epoch": 8.006144393241167,
"grad_norm": 0.5838120579719543,
"learning_rate": 5.233344023300037e-06,
"loss": 0.3649,
"step": 1305
},
{
"epoch": 8.036866359447005,
"grad_norm": 0.4888751208782196,
"learning_rate": 5.197465317658036e-06,
"loss": 0.3417,
"step": 1310
},
{
"epoch": 8.067588325652842,
"grad_norm": 0.4426686465740204,
"learning_rate": 5.161576425371554e-06,
"loss": 0.3448,
"step": 1315
},
{
"epoch": 8.09831029185868,
"grad_norm": 0.4328514635562897,
"learning_rate": 5.125679197841088e-06,
"loss": 0.3427,
"step": 1320
},
{
"epoch": 8.129032258064516,
"grad_norm": 0.461224764585495,
"learning_rate": 5.089775486897121e-06,
"loss": 0.3411,
"step": 1325
},
{
"epoch": 8.159754224270353,
"grad_norm": 0.41059058904647827,
"learning_rate": 5.053867144704594e-06,
"loss": 0.3432,
"step": 1330
},
{
"epoch": 8.19047619047619,
"grad_norm": 0.4233262538909912,
"learning_rate": 5.017956023667363e-06,
"loss": 0.3428,
"step": 1335
},
{
"epoch": 8.221198156682028,
"grad_norm": 0.44398781657218933,
"learning_rate": 4.982043976332638e-06,
"loss": 0.3396,
"step": 1340
},
{
"epoch": 8.251920122887865,
"grad_norm": 0.43628108501434326,
"learning_rate": 4.946132855295407e-06,
"loss": 0.3432,
"step": 1345
},
{
"epoch": 8.282642089093702,
"grad_norm": 0.45262426137924194,
"learning_rate": 4.910224513102881e-06,
"loss": 0.34,
"step": 1350
},
{
"epoch": 8.31336405529954,
"grad_norm": 0.46370255947113037,
"learning_rate": 4.8743208021589135e-06,
"loss": 0.3404,
"step": 1355
},
{
"epoch": 8.344086021505376,
"grad_norm": 0.40948814153671265,
"learning_rate": 4.838423574628447e-06,
"loss": 0.3431,
"step": 1360
},
{
"epoch": 8.374807987711213,
"grad_norm": 0.4436282813549042,
"learning_rate": 4.802534682341966e-06,
"loss": 0.3446,
"step": 1365
},
{
"epoch": 8.40552995391705,
"grad_norm": 0.4203520119190216,
"learning_rate": 4.7666559766999635e-06,
"loss": 0.3478,
"step": 1370
},
{
"epoch": 8.436251920122888,
"grad_norm": 0.4091641306877136,
"learning_rate": 4.730789308577432e-06,
"loss": 0.3461,
"step": 1375
},
{
"epoch": 8.466973886328725,
"grad_norm": 0.4320433735847473,
"learning_rate": 4.694936528228387e-06,
"loss": 0.3504,
"step": 1380
},
{
"epoch": 8.497695852534562,
"grad_norm": 0.4243397116661072,
"learning_rate": 4.659099485190414e-06,
"loss": 0.3444,
"step": 1385
},
{
"epoch": 8.5284178187404,
"grad_norm": 0.42783576250076294,
"learning_rate": 4.6232800281892604e-06,
"loss": 0.3398,
"step": 1390
},
{
"epoch": 8.559139784946236,
"grad_norm": 0.395312637090683,
"learning_rate": 4.587480005043458e-06,
"loss": 0.3472,
"step": 1395
},
{
"epoch": 8.589861751152073,
"grad_norm": 0.41875869035720825,
"learning_rate": 4.551701262569009e-06,
"loss": 0.3475,
"step": 1400
},
{
"epoch": 8.620583717357912,
"grad_norm": 0.4307910203933716,
"learning_rate": 4.515945646484105e-06,
"loss": 0.3465,
"step": 1405
},
{
"epoch": 8.651305683563749,
"grad_norm": 0.40852200984954834,
"learning_rate": 4.480215001313919e-06,
"loss": 0.3497,
"step": 1410
},
{
"epoch": 8.682027649769585,
"grad_norm": 0.4345207214355469,
"learning_rate": 4.444511170295451e-06,
"loss": 0.3474,
"step": 1415
},
{
"epoch": 8.712749615975422,
"grad_norm": 0.4096705913543701,
"learning_rate": 4.408835995282434e-06,
"loss": 0.3472,
"step": 1420
},
{
"epoch": 8.74347158218126,
"grad_norm": 0.4314156770706177,
"learning_rate": 4.373191316650328e-06,
"loss": 0.3518,
"step": 1425
},
{
"epoch": 8.774193548387096,
"grad_norm": 0.41832414269447327,
"learning_rate": 4.3375789732013775e-06,
"loss": 0.3498,
"step": 1430
},
{
"epoch": 8.804915514592935,
"grad_norm": 0.42618289589881897,
"learning_rate": 4.302000802069744e-06,
"loss": 0.3486,
"step": 1435
},
{
"epoch": 8.835637480798772,
"grad_norm": 0.43849977850914,
"learning_rate": 4.2664586386267474e-06,
"loss": 0.346,
"step": 1440
},
{
"epoch": 8.866359447004609,
"grad_norm": 0.42157772183418274,
"learning_rate": 4.230954316386179e-06,
"loss": 0.3475,
"step": 1445
},
{
"epoch": 8.897081413210445,
"grad_norm": 0.39600861072540283,
"learning_rate": 4.195489666909714e-06,
"loss": 0.3455,
"step": 1450
},
{
"epoch": 8.927803379416282,
"grad_norm": 0.3980286419391632,
"learning_rate": 4.160066519712428e-06,
"loss": 0.3488,
"step": 1455
},
{
"epoch": 8.95852534562212,
"grad_norm": 0.41449347138404846,
"learning_rate": 4.1246867021684206e-06,
"loss": 0.345,
"step": 1460
},
{
"epoch": 8.989247311827956,
"grad_norm": 0.43595919013023376,
"learning_rate": 4.089352039416543e-06,
"loss": 0.3476,
"step": 1465
},
{
"epoch": 9.018433179723502,
"grad_norm": 0.5251989364624023,
"learning_rate": 4.054064354266244e-06,
"loss": 0.3327,
"step": 1470
},
{
"epoch": 9.049155145929339,
"grad_norm": 0.4793786108493805,
"learning_rate": 4.018825467103542e-06,
"loss": 0.318,
"step": 1475
},
{
"epoch": 9.079877112135177,
"grad_norm": 0.4722115993499756,
"learning_rate": 3.983637195797111e-06,
"loss": 0.3217,
"step": 1480
},
{
"epoch": 9.110599078341014,
"grad_norm": 0.4697054326534271,
"learning_rate": 3.948501355604507e-06,
"loss": 0.3184,
"step": 1485
},
{
"epoch": 9.141321044546851,
"grad_norm": 0.4698534607887268,
"learning_rate": 3.9134197590785164e-06,
"loss": 0.3193,
"step": 1490
},
{
"epoch": 9.172043010752688,
"grad_norm": 0.4833962917327881,
"learning_rate": 3.878394215973663e-06,
"loss": 0.3243,
"step": 1495
},
{
"epoch": 9.202764976958525,
"grad_norm": 0.47097915410995483,
"learning_rate": 3.843426533152841e-06,
"loss": 0.3218,
"step": 1500
},
{
"epoch": 9.233486943164362,
"grad_norm": 0.4613553285598755,
"learning_rate": 3.808518514494105e-06,
"loss": 0.3191,
"step": 1505
},
{
"epoch": 9.2642089093702,
"grad_norm": 0.4618718922138214,
"learning_rate": 3.773671960797613e-06,
"loss": 0.3219,
"step": 1510
},
{
"epoch": 9.294930875576037,
"grad_norm": 0.45279550552368164,
"learning_rate": 3.7388886696927317e-06,
"loss": 0.3235,
"step": 1515
},
{
"epoch": 9.325652841781874,
"grad_norm": 0.4596066176891327,
"learning_rate": 3.704170435545299e-06,
"loss": 0.3183,
"step": 1520
},
{
"epoch": 9.356374807987711,
"grad_norm": 0.4353365898132324,
"learning_rate": 3.6695190493650608e-06,
"loss": 0.3204,
"step": 1525
},
{
"epoch": 9.387096774193548,
"grad_norm": 0.444594144821167,
"learning_rate": 3.634936298713274e-06,
"loss": 0.3225,
"step": 1530
},
{
"epoch": 9.417818740399385,
"grad_norm": 0.43966248631477356,
"learning_rate": 3.6004239676104957e-06,
"loss": 0.3236,
"step": 1535
},
{
"epoch": 9.448540706605222,
"grad_norm": 0.4758555591106415,
"learning_rate": 3.5659838364445505e-06,
"loss": 0.3219,
"step": 1540
},
{
"epoch": 9.47926267281106,
"grad_norm": 0.4469148814678192,
"learning_rate": 3.5316176818786797e-06,
"loss": 0.324,
"step": 1545
},
{
"epoch": 9.509984639016897,
"grad_norm": 0.4485386908054352,
"learning_rate": 3.497327276759899e-06,
"loss": 0.3238,
"step": 1550
},
{
"epoch": 9.540706605222734,
"grad_norm": 0.44222620129585266,
"learning_rate": 3.463114390027533e-06,
"loss": 0.3205,
"step": 1555
},
{
"epoch": 9.571428571428571,
"grad_norm": 0.45242762565612793,
"learning_rate": 3.4289807866219683e-06,
"loss": 0.3222,
"step": 1560
},
{
"epoch": 9.602150537634408,
"grad_norm": 0.44047296047210693,
"learning_rate": 3.394928227393598e-06,
"loss": 0.3277,
"step": 1565
},
{
"epoch": 9.632872503840245,
"grad_norm": 0.44352057576179504,
"learning_rate": 3.3609584690119924e-06,
"loss": 0.3231,
"step": 1570
},
{
"epoch": 9.663594470046084,
"grad_norm": 0.44703468680381775,
"learning_rate": 3.3270732638752713e-06,
"loss": 0.3242,
"step": 1575
},
{
"epoch": 9.69431643625192,
"grad_norm": 0.46220463514328003,
"learning_rate": 3.293274360019707e-06,
"loss": 0.3256,
"step": 1580
},
{
"epoch": 9.725038402457757,
"grad_norm": 0.4843495786190033,
"learning_rate": 3.259563501029548e-06,
"loss": 0.3279,
"step": 1585
},
{
"epoch": 9.755760368663594,
"grad_norm": 0.4653911292552948,
"learning_rate": 3.2259424259470705e-06,
"loss": 0.3233,
"step": 1590
},
{
"epoch": 9.786482334869431,
"grad_norm": 0.45650723576545715,
"learning_rate": 3.1924128691828678e-06,
"loss": 0.324,
"step": 1595
},
{
"epoch": 9.817204301075268,
"grad_norm": 0.5246464014053345,
"learning_rate": 3.158976560426379e-06,
"loss": 0.3286,
"step": 1600
}
],
"logging_steps": 5,
"max_steps": 2430,
"num_input_tokens_seen": 0,
"num_train_epochs": 15,
"save_steps": 400,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.4548687991848042e+19,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}