{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.0,
"eval_steps": 500,
"global_step": 790,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.03164556962025317,
"grad_norm": 5.788066387176514,
"learning_rate": 0.0002,
"loss": 2.6104,
"step": 5
},
{
"epoch": 0.06329113924050633,
"grad_norm": 1.5542352199554443,
"learning_rate": 0.00019872773536895675,
"loss": 0.7864,
"step": 10
},
{
"epoch": 0.0949367088607595,
"grad_norm": 1.996996283531189,
"learning_rate": 0.00019745547073791352,
"loss": 0.5532,
"step": 15
},
{
"epoch": 0.12658227848101267,
"grad_norm": 0.9193770885467529,
"learning_rate": 0.00019618320610687023,
"loss": 0.522,
"step": 20
},
{
"epoch": 0.15822784810126583,
"grad_norm": 1.024322271347046,
"learning_rate": 0.00019491094147582698,
"loss": 0.5621,
"step": 25
},
{
"epoch": 0.189873417721519,
"grad_norm": 0.873715341091156,
"learning_rate": 0.00019363867684478372,
"loss": 0.4996,
"step": 30
},
{
"epoch": 0.22151898734177214,
"grad_norm": 0.8645951151847839,
"learning_rate": 0.00019236641221374049,
"loss": 0.502,
"step": 35
},
{
"epoch": 0.25316455696202533,
"grad_norm": 0.7674330472946167,
"learning_rate": 0.00019109414758269723,
"loss": 0.433,
"step": 40
},
{
"epoch": 0.2848101265822785,
"grad_norm": 0.7591924667358398,
"learning_rate": 0.00018982188295165394,
"loss": 0.4406,
"step": 45
},
{
"epoch": 0.31645569620253167,
"grad_norm": 0.7595440745353699,
"learning_rate": 0.00018854961832061068,
"loss": 0.499,
"step": 50
},
{
"epoch": 0.34810126582278483,
"grad_norm": 0.7064336538314819,
"learning_rate": 0.00018727735368956745,
"loss": 0.4756,
"step": 55
},
{
"epoch": 0.379746835443038,
"grad_norm": 0.7232657670974731,
"learning_rate": 0.0001860050890585242,
"loss": 0.5227,
"step": 60
},
{
"epoch": 0.41139240506329117,
"grad_norm": 0.7500166296958923,
"learning_rate": 0.00018473282442748093,
"loss": 0.4481,
"step": 65
},
{
"epoch": 0.4430379746835443,
"grad_norm": 0.6161800026893616,
"learning_rate": 0.00018346055979643765,
"loss": 0.4868,
"step": 70
},
{
"epoch": 0.47468354430379744,
"grad_norm": 0.7168012857437134,
"learning_rate": 0.00018218829516539442,
"loss": 0.4833,
"step": 75
},
{
"epoch": 0.5063291139240507,
"grad_norm": 0.7813606262207031,
"learning_rate": 0.00018091603053435116,
"loss": 0.4777,
"step": 80
},
{
"epoch": 0.5379746835443038,
"grad_norm": 0.7056337594985962,
"learning_rate": 0.0001796437659033079,
"loss": 0.4521,
"step": 85
},
{
"epoch": 0.569620253164557,
"grad_norm": 0.570087730884552,
"learning_rate": 0.00017837150127226464,
"loss": 0.4456,
"step": 90
},
{
"epoch": 0.6012658227848101,
"grad_norm": 0.642938494682312,
"learning_rate": 0.00017709923664122138,
"loss": 0.4169,
"step": 95
},
{
"epoch": 0.6329113924050633,
"grad_norm": 0.7025493383407593,
"learning_rate": 0.00017582697201017812,
"loss": 0.4987,
"step": 100
},
{
"epoch": 0.6645569620253164,
"grad_norm": 0.7466819882392883,
"learning_rate": 0.00017455470737913486,
"loss": 0.4644,
"step": 105
},
{
"epoch": 0.6962025316455697,
"grad_norm": 0.7106885313987732,
"learning_rate": 0.00017328244274809163,
"loss": 0.4653,
"step": 110
},
{
"epoch": 0.7278481012658228,
"grad_norm": 0.6158185601234436,
"learning_rate": 0.00017201017811704835,
"loss": 0.4556,
"step": 115
},
{
"epoch": 0.759493670886076,
"grad_norm": 0.678554117679596,
"learning_rate": 0.0001707379134860051,
"loss": 0.4587,
"step": 120
},
{
"epoch": 0.7911392405063291,
"grad_norm": 0.8016729354858398,
"learning_rate": 0.00016946564885496183,
"loss": 0.4228,
"step": 125
},
{
"epoch": 0.8227848101265823,
"grad_norm": 0.7110231518745422,
"learning_rate": 0.0001681933842239186,
"loss": 0.4303,
"step": 130
},
{
"epoch": 0.8544303797468354,
"grad_norm": 0.6997452974319458,
"learning_rate": 0.00016692111959287534,
"loss": 0.4342,
"step": 135
},
{
"epoch": 0.8860759493670886,
"grad_norm": 0.6250122785568237,
"learning_rate": 0.00016564885496183205,
"loss": 0.427,
"step": 140
},
{
"epoch": 0.9177215189873418,
"grad_norm": 0.6947687864303589,
"learning_rate": 0.0001643765903307888,
"loss": 0.4763,
"step": 145
},
{
"epoch": 0.9493670886075949,
"grad_norm": 0.680385947227478,
"learning_rate": 0.00016310432569974556,
"loss": 0.4554,
"step": 150
},
{
"epoch": 0.9810126582278481,
"grad_norm": 0.5412645936012268,
"learning_rate": 0.0001618320610687023,
"loss": 0.4664,
"step": 155
},
{
"epoch": 1.0126582278481013,
"grad_norm": 0.5828943848609924,
"learning_rate": 0.00016055979643765905,
"loss": 0.4573,
"step": 160
},
{
"epoch": 1.0443037974683544,
"grad_norm": 0.6317119002342224,
"learning_rate": 0.0001592875318066158,
"loss": 0.325,
"step": 165
},
{
"epoch": 1.0759493670886076,
"grad_norm": 0.6031287312507629,
"learning_rate": 0.00015801526717557253,
"loss": 0.3658,
"step": 170
},
{
"epoch": 1.1075949367088607,
"grad_norm": 0.6438406109809875,
"learning_rate": 0.00015674300254452927,
"loss": 0.3645,
"step": 175
},
{
"epoch": 1.139240506329114,
"grad_norm": 0.6503311395645142,
"learning_rate": 0.000155470737913486,
"loss": 0.323,
"step": 180
},
{
"epoch": 1.1708860759493671,
"grad_norm": 0.8476307392120361,
"learning_rate": 0.00015419847328244275,
"loss": 0.347,
"step": 185
},
{
"epoch": 1.2025316455696202,
"grad_norm": 0.7285150289535522,
"learning_rate": 0.0001529262086513995,
"loss": 0.3831,
"step": 190
},
{
"epoch": 1.2341772151898733,
"grad_norm": 0.6327723860740662,
"learning_rate": 0.00015165394402035624,
"loss": 0.3577,
"step": 195
},
{
"epoch": 1.2658227848101267,
"grad_norm": 0.6771088242530823,
"learning_rate": 0.00015038167938931298,
"loss": 0.3154,
"step": 200
},
{
"epoch": 1.2974683544303798,
"grad_norm": 0.7355062365531921,
"learning_rate": 0.00014910941475826972,
"loss": 0.3706,
"step": 205
},
{
"epoch": 1.3291139240506329,
"grad_norm": 0.7912581562995911,
"learning_rate": 0.0001478371501272265,
"loss": 0.3456,
"step": 210
},
{
"epoch": 1.360759493670886,
"grad_norm": 0.6501379609107971,
"learning_rate": 0.0001465648854961832,
"loss": 0.3243,
"step": 215
},
{
"epoch": 1.3924050632911391,
"grad_norm": 0.6570438146591187,
"learning_rate": 0.00014529262086513994,
"loss": 0.3595,
"step": 220
},
{
"epoch": 1.4240506329113924,
"grad_norm": 0.6073997020721436,
"learning_rate": 0.0001440203562340967,
"loss": 0.3397,
"step": 225
},
{
"epoch": 1.4556962025316456,
"grad_norm": 0.7310261130332947,
"learning_rate": 0.00014274809160305345,
"loss": 0.3641,
"step": 230
},
{
"epoch": 1.4873417721518987,
"grad_norm": 0.8089779019355774,
"learning_rate": 0.0001414758269720102,
"loss": 0.3374,
"step": 235
},
{
"epoch": 1.518987341772152,
"grad_norm": 0.8505273461341858,
"learning_rate": 0.0001402035623409669,
"loss": 0.3695,
"step": 240
},
{
"epoch": 1.5506329113924051,
"grad_norm": 0.6972491145133972,
"learning_rate": 0.00013893129770992368,
"loss": 0.3556,
"step": 245
},
{
"epoch": 1.5822784810126582,
"grad_norm": 0.740247368812561,
"learning_rate": 0.00013765903307888042,
"loss": 0.3604,
"step": 250
},
{
"epoch": 1.6139240506329116,
"grad_norm": 0.818209707736969,
"learning_rate": 0.00013638676844783716,
"loss": 0.3538,
"step": 255
},
{
"epoch": 1.6455696202531644,
"grad_norm": 0.822881817817688,
"learning_rate": 0.0001351145038167939,
"loss": 0.3494,
"step": 260
},
{
"epoch": 1.6772151898734178,
"grad_norm": 0.7193669080734253,
"learning_rate": 0.00013384223918575064,
"loss": 0.3676,
"step": 265
},
{
"epoch": 1.7088607594936709,
"grad_norm": 0.6926146149635315,
"learning_rate": 0.00013256997455470738,
"loss": 0.313,
"step": 270
},
{
"epoch": 1.740506329113924,
"grad_norm": 1.0953829288482666,
"learning_rate": 0.00013129770992366413,
"loss": 0.3202,
"step": 275
},
{
"epoch": 1.7721518987341773,
"grad_norm": 0.8663277626037598,
"learning_rate": 0.00013002544529262087,
"loss": 0.3488,
"step": 280
},
{
"epoch": 1.8037974683544302,
"grad_norm": 1.1026146411895752,
"learning_rate": 0.0001287531806615776,
"loss": 0.3654,
"step": 285
},
{
"epoch": 1.8354430379746836,
"grad_norm": 0.7661195993423462,
"learning_rate": 0.00012748091603053435,
"loss": 0.3693,
"step": 290
},
{
"epoch": 1.8670886075949367,
"grad_norm": 0.6808319687843323,
"learning_rate": 0.0001262086513994911,
"loss": 0.3449,
"step": 295
},
{
"epoch": 1.8987341772151898,
"grad_norm": 0.7904935479164124,
"learning_rate": 0.00012493638676844783,
"loss": 0.3219,
"step": 300
},
{
"epoch": 1.9303797468354431,
"grad_norm": 0.7428227066993713,
"learning_rate": 0.0001236641221374046,
"loss": 0.3452,
"step": 305
},
{
"epoch": 1.9620253164556962,
"grad_norm": 0.8595893383026123,
"learning_rate": 0.00012239185750636134,
"loss": 0.3629,
"step": 310
},
{
"epoch": 1.9936708860759493,
"grad_norm": 0.7588908672332764,
"learning_rate": 0.00012111959287531807,
"loss": 0.333,
"step": 315
},
{
"epoch": 2.0253164556962027,
"grad_norm": 0.6112965941429138,
"learning_rate": 0.00011984732824427483,
"loss": 0.2809,
"step": 320
},
{
"epoch": 2.0569620253164556,
"grad_norm": 1.506441593170166,
"learning_rate": 0.00011857506361323157,
"loss": 0.2251,
"step": 325
},
{
"epoch": 2.088607594936709,
"grad_norm": 0.8897147178649902,
"learning_rate": 0.0001173027989821883,
"loss": 0.2258,
"step": 330
},
{
"epoch": 2.1202531645569622,
"grad_norm": 0.6773934960365295,
"learning_rate": 0.00011603053435114504,
"loss": 0.2115,
"step": 335
},
{
"epoch": 2.151898734177215,
"grad_norm": 0.832305908203125,
"learning_rate": 0.00011475826972010179,
"loss": 0.2317,
"step": 340
},
{
"epoch": 2.1835443037974684,
"grad_norm": 0.9453684687614441,
"learning_rate": 0.00011348600508905853,
"loss": 0.2325,
"step": 345
},
{
"epoch": 2.2151898734177213,
"grad_norm": 0.9567768573760986,
"learning_rate": 0.00011221374045801527,
"loss": 0.2273,
"step": 350
},
{
"epoch": 2.2468354430379747,
"grad_norm": 0.8501098155975342,
"learning_rate": 0.000110941475826972,
"loss": 0.2184,
"step": 355
},
{
"epoch": 2.278481012658228,
"grad_norm": 1.046438217163086,
"learning_rate": 0.00010966921119592877,
"loss": 0.2181,
"step": 360
},
{
"epoch": 2.310126582278481,
"grad_norm": 0.8483916521072388,
"learning_rate": 0.0001083969465648855,
"loss": 0.2176,
"step": 365
},
{
"epoch": 2.3417721518987342,
"grad_norm": 0.805766224861145,
"learning_rate": 0.00010712468193384224,
"loss": 0.2303,
"step": 370
},
{
"epoch": 2.3734177215189876,
"grad_norm": 0.8078694343566895,
"learning_rate": 0.00010585241730279898,
"loss": 0.2199,
"step": 375
},
{
"epoch": 2.4050632911392404,
"grad_norm": 1.255946397781372,
"learning_rate": 0.00010458015267175574,
"loss": 0.234,
"step": 380
},
{
"epoch": 2.4367088607594938,
"grad_norm": 0.8427215814590454,
"learning_rate": 0.00010330788804071248,
"loss": 0.2299,
"step": 385
},
{
"epoch": 2.4683544303797467,
"grad_norm": 0.6670682430267334,
"learning_rate": 0.00010203562340966922,
"loss": 0.2237,
"step": 390
},
{
"epoch": 2.5,
"grad_norm": 0.7526563405990601,
"learning_rate": 0.00010076335877862595,
"loss": 0.2276,
"step": 395
},
{
"epoch": 2.5316455696202533,
"grad_norm": 0.8490801453590393,
"learning_rate": 9.94910941475827e-05,
"loss": 0.223,
"step": 400
},
{
"epoch": 2.5632911392405062,
"grad_norm": 0.8474723100662231,
"learning_rate": 9.821882951653944e-05,
"loss": 0.23,
"step": 405
},
{
"epoch": 2.5949367088607596,
"grad_norm": 0.9560302495956421,
"learning_rate": 9.694656488549618e-05,
"loss": 0.2254,
"step": 410
},
{
"epoch": 2.6265822784810124,
"grad_norm": 0.8522864580154419,
"learning_rate": 9.567430025445293e-05,
"loss": 0.2304,
"step": 415
},
{
"epoch": 2.6582278481012658,
"grad_norm": 0.7552340030670166,
"learning_rate": 9.440203562340968e-05,
"loss": 0.2223,
"step": 420
},
{
"epoch": 2.689873417721519,
"grad_norm": 0.9277474284172058,
"learning_rate": 9.312977099236642e-05,
"loss": 0.232,
"step": 425
},
{
"epoch": 2.721518987341772,
"grad_norm": 0.9359229803085327,
"learning_rate": 9.185750636132316e-05,
"loss": 0.2278,
"step": 430
},
{
"epoch": 2.7531645569620253,
"grad_norm": 0.8999188542366028,
"learning_rate": 9.05852417302799e-05,
"loss": 0.2101,
"step": 435
},
{
"epoch": 2.7848101265822782,
"grad_norm": 1.0271995067596436,
"learning_rate": 8.931297709923665e-05,
"loss": 0.2355,
"step": 440
},
{
"epoch": 2.8164556962025316,
"grad_norm": 0.8051729798316956,
"learning_rate": 8.804071246819339e-05,
"loss": 0.2321,
"step": 445
},
{
"epoch": 2.848101265822785,
"grad_norm": 0.9024167656898499,
"learning_rate": 8.676844783715013e-05,
"loss": 0.2401,
"step": 450
},
{
"epoch": 2.879746835443038,
"grad_norm": 0.9658450484275818,
"learning_rate": 8.549618320610687e-05,
"loss": 0.2454,
"step": 455
},
{
"epoch": 2.911392405063291,
"grad_norm": 0.766612708568573,
"learning_rate": 8.422391857506363e-05,
"loss": 0.2391,
"step": 460
},
{
"epoch": 2.9430379746835444,
"grad_norm": 0.7812886238098145,
"learning_rate": 8.295165394402035e-05,
"loss": 0.2286,
"step": 465
},
{
"epoch": 2.9746835443037973,
"grad_norm": 0.8621058464050293,
"learning_rate": 8.167938931297711e-05,
"loss": 0.2444,
"step": 470
},
{
"epoch": 3.0063291139240507,
"grad_norm": 0.6189867854118347,
"learning_rate": 8.040712468193385e-05,
"loss": 0.2114,
"step": 475
},
{
"epoch": 3.037974683544304,
"grad_norm": 0.8174425959587097,
"learning_rate": 7.913486005089059e-05,
"loss": 0.1325,
"step": 480
},
{
"epoch": 3.069620253164557,
"grad_norm": 1.3215669393539429,
"learning_rate": 7.786259541984733e-05,
"loss": 0.1431,
"step": 485
},
{
"epoch": 3.1012658227848102,
"grad_norm": 0.944607138633728,
"learning_rate": 7.659033078880407e-05,
"loss": 0.1334,
"step": 490
},
{
"epoch": 3.132911392405063,
"grad_norm": 0.8664195537567139,
"learning_rate": 7.531806615776081e-05,
"loss": 0.1358,
"step": 495
},
{
"epoch": 3.1645569620253164,
"grad_norm": 0.7892764806747437,
"learning_rate": 7.404580152671756e-05,
"loss": 0.124,
"step": 500
},
{
"epoch": 3.1962025316455698,
"grad_norm": 0.9069737195968628,
"learning_rate": 7.27735368956743e-05,
"loss": 0.1388,
"step": 505
},
{
"epoch": 3.2278481012658227,
"grad_norm": 0.9595538973808289,
"learning_rate": 7.150127226463105e-05,
"loss": 0.1305,
"step": 510
},
{
"epoch": 3.259493670886076,
"grad_norm": 0.9580609798431396,
"learning_rate": 7.022900763358778e-05,
"loss": 0.139,
"step": 515
},
{
"epoch": 3.291139240506329,
"grad_norm": 0.8919333219528198,
"learning_rate": 6.895674300254454e-05,
"loss": 0.1292,
"step": 520
},
{
"epoch": 3.3227848101265822,
"grad_norm": 0.9798533320426941,
"learning_rate": 6.768447837150128e-05,
"loss": 0.1348,
"step": 525
},
{
"epoch": 3.3544303797468356,
"grad_norm": 0.737899661064148,
"learning_rate": 6.641221374045802e-05,
"loss": 0.1418,
"step": 530
},
{
"epoch": 3.3860759493670884,
"grad_norm": 0.8077306151390076,
"learning_rate": 6.513994910941476e-05,
"loss": 0.1434,
"step": 535
},
{
"epoch": 3.4177215189873418,
"grad_norm": 0.6728256940841675,
"learning_rate": 6.38676844783715e-05,
"loss": 0.1347,
"step": 540
},
{
"epoch": 3.449367088607595,
"grad_norm": 0.8441898822784424,
"learning_rate": 6.259541984732826e-05,
"loss": 0.1294,
"step": 545
},
{
"epoch": 3.481012658227848,
"grad_norm": 0.7539904713630676,
"learning_rate": 6.132315521628498e-05,
"loss": 0.1337,
"step": 550
},
{
"epoch": 3.5126582278481013,
"grad_norm": 0.874884843826294,
"learning_rate": 6.005089058524174e-05,
"loss": 0.1318,
"step": 555
},
{
"epoch": 3.5443037974683547,
"grad_norm": 0.8220652937889099,
"learning_rate": 5.877862595419847e-05,
"loss": 0.1372,
"step": 560
},
{
"epoch": 3.5759493670886076,
"grad_norm": 0.8709121942520142,
"learning_rate": 5.750636132315522e-05,
"loss": 0.1329,
"step": 565
},
{
"epoch": 3.607594936708861,
"grad_norm": 1.0847886800765991,
"learning_rate": 5.6234096692111956e-05,
"loss": 0.1366,
"step": 570
},
{
"epoch": 3.6392405063291138,
"grad_norm": 1.150924563407898,
"learning_rate": 5.496183206106871e-05,
"loss": 0.1401,
"step": 575
},
{
"epoch": 3.670886075949367,
"grad_norm": 1.2749351263046265,
"learning_rate": 5.3689567430025446e-05,
"loss": 0.1388,
"step": 580
},
{
"epoch": 3.7025316455696204,
"grad_norm": 0.8032536506652832,
"learning_rate": 5.2417302798982194e-05,
"loss": 0.1312,
"step": 585
},
{
"epoch": 3.7341772151898733,
"grad_norm": 1.0450551509857178,
"learning_rate": 5.114503816793893e-05,
"loss": 0.1444,
"step": 590
},
{
"epoch": 3.7658227848101267,
"grad_norm": 0.8416706919670105,
"learning_rate": 4.9872773536895677e-05,
"loss": 0.1401,
"step": 595
},
{
"epoch": 3.7974683544303796,
"grad_norm": 0.9472242593765259,
"learning_rate": 4.860050890585242e-05,
"loss": 0.1414,
"step": 600
},
{
"epoch": 3.829113924050633,
"grad_norm": 1.0049540996551514,
"learning_rate": 4.7328244274809166e-05,
"loss": 0.142,
"step": 605
},
{
"epoch": 3.8607594936708862,
"grad_norm": 0.9480180144309998,
"learning_rate": 4.605597964376591e-05,
"loss": 0.1416,
"step": 610
},
{
"epoch": 3.892405063291139,
"grad_norm": 1.5082101821899414,
"learning_rate": 4.478371501272265e-05,
"loss": 0.1327,
"step": 615
},
{
"epoch": 3.9240506329113924,
"grad_norm": 0.7728102207183838,
"learning_rate": 4.351145038167939e-05,
"loss": 0.1303,
"step": 620
},
{
"epoch": 3.9556962025316453,
"grad_norm": 0.8425063490867615,
"learning_rate": 4.223918575063613e-05,
"loss": 0.1324,
"step": 625
},
{
"epoch": 3.9873417721518987,
"grad_norm": 0.9874700307846069,
"learning_rate": 4.096692111959288e-05,
"loss": 0.1405,
"step": 630
},
{
"epoch": 4.018987341772152,
"grad_norm": 0.47495508193969727,
"learning_rate": 3.969465648854962e-05,
"loss": 0.1071,
"step": 635
},
{
"epoch": 4.050632911392405,
"grad_norm": 0.3872312009334564,
"learning_rate": 3.842239185750636e-05,
"loss": 0.0901,
"step": 640
},
{
"epoch": 4.082278481012658,
"grad_norm": 0.5231612920761108,
"learning_rate": 3.7150127226463104e-05,
"loss": 0.0969,
"step": 645
},
{
"epoch": 4.113924050632911,
"grad_norm": 1.3609975576400757,
"learning_rate": 3.5877862595419845e-05,
"loss": 0.0893,
"step": 650
},
{
"epoch": 4.1455696202531644,
"grad_norm": 0.673308253288269,
"learning_rate": 3.4605597964376594e-05,
"loss": 0.0928,
"step": 655
},
{
"epoch": 4.177215189873418,
"grad_norm": 0.482837438583374,
"learning_rate": 3.3333333333333335e-05,
"loss": 0.0912,
"step": 660
},
{
"epoch": 4.208860759493671,
"grad_norm": 0.7408327460289001,
"learning_rate": 3.2061068702290076e-05,
"loss": 0.0912,
"step": 665
},
{
"epoch": 4.2405063291139244,
"grad_norm": 0.5170139074325562,
"learning_rate": 3.078880407124682e-05,
"loss": 0.0958,
"step": 670
},
{
"epoch": 4.272151898734177,
"grad_norm": 0.6944636106491089,
"learning_rate": 2.9516539440203562e-05,
"loss": 0.0883,
"step": 675
},
{
"epoch": 4.30379746835443,
"grad_norm": 0.6294474601745605,
"learning_rate": 2.824427480916031e-05,
"loss": 0.0915,
"step": 680
},
{
"epoch": 4.3354430379746836,
"grad_norm": 0.6630133390426636,
"learning_rate": 2.6972010178117052e-05,
"loss": 0.0905,
"step": 685
},
{
"epoch": 4.367088607594937,
"grad_norm": 0.6174125075340271,
"learning_rate": 2.5699745547073793e-05,
"loss": 0.0958,
"step": 690
},
{
"epoch": 4.39873417721519,
"grad_norm": 0.7214713096618652,
"learning_rate": 2.4427480916030535e-05,
"loss": 0.0905,
"step": 695
},
{
"epoch": 4.430379746835443,
"grad_norm": 0.7950146794319153,
"learning_rate": 2.3155216284987276e-05,
"loss": 0.0925,
"step": 700
},
{
"epoch": 4.462025316455696,
"grad_norm": 0.6609070301055908,
"learning_rate": 2.1882951653944024e-05,
"loss": 0.0929,
"step": 705
},
{
"epoch": 4.493670886075949,
"grad_norm": 0.6648293733596802,
"learning_rate": 2.0610687022900766e-05,
"loss": 0.0908,
"step": 710
},
{
"epoch": 4.525316455696203,
"grad_norm": 0.5198394656181335,
"learning_rate": 1.9338422391857507e-05,
"loss": 0.0907,
"step": 715
},
{
"epoch": 4.556962025316456,
"grad_norm": 0.8868843913078308,
"learning_rate": 1.8066157760814252e-05,
"loss": 0.0957,
"step": 720
},
{
"epoch": 4.588607594936709,
"grad_norm": 0.6488995552062988,
"learning_rate": 1.6793893129770993e-05,
"loss": 0.0898,
"step": 725
},
{
"epoch": 4.620253164556962,
"grad_norm": 0.5500432252883911,
"learning_rate": 1.5521628498727735e-05,
"loss": 0.0946,
"step": 730
},
{
"epoch": 4.651898734177215,
"grad_norm": 0.8371357321739197,
"learning_rate": 1.424936386768448e-05,
"loss": 0.0905,
"step": 735
},
{
"epoch": 4.6835443037974684,
"grad_norm": 0.5861048102378845,
"learning_rate": 1.2977099236641221e-05,
"loss": 0.0941,
"step": 740
},
{
"epoch": 4.715189873417722,
"grad_norm": 0.7422693371772766,
"learning_rate": 1.1704834605597966e-05,
"loss": 0.0956,
"step": 745
},
{
"epoch": 4.746835443037975,
"grad_norm": 0.5376149415969849,
"learning_rate": 1.0432569974554709e-05,
"loss": 0.0937,
"step": 750
},
{
"epoch": 4.7784810126582276,
"grad_norm": 0.46256035566329956,
"learning_rate": 9.16030534351145e-06,
"loss": 0.0878,
"step": 755
},
{
"epoch": 4.810126582278481,
"grad_norm": 0.4410872459411621,
"learning_rate": 7.888040712468193e-06,
"loss": 0.0929,
"step": 760
},
{
"epoch": 4.841772151898734,
"grad_norm": 0.44353851675987244,
"learning_rate": 6.615776081424936e-06,
"loss": 0.0909,
"step": 765
},
{
"epoch": 4.8734177215189876,
"grad_norm": 0.5728248953819275,
"learning_rate": 5.343511450381679e-06,
"loss": 0.0946,
"step": 770
},
{
"epoch": 4.905063291139241,
"grad_norm": 0.4497832655906677,
"learning_rate": 4.0712468193384225e-06,
"loss": 0.089,
"step": 775
},
{
"epoch": 4.936708860759493,
"grad_norm": 0.650806725025177,
"learning_rate": 2.7989821882951656e-06,
"loss": 0.0939,
"step": 780
},
{
"epoch": 4.968354430379747,
"grad_norm": 0.5624284148216248,
"learning_rate": 1.5267175572519084e-06,
"loss": 0.0918,
"step": 785
},
{
"epoch": 5.0,
"grad_norm": 0.5814446210861206,
"learning_rate": 2.544529262086514e-07,
"loss": 0.0872,
"step": 790
}
],
"logging_steps": 5,
"max_steps": 790,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9.503292302504755e+16,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}