{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.0,
"eval_steps": 500,
"global_step": 790,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.03164556962025317,
"grad_norm": 5.988588333129883,
"learning_rate": 0.0002,
"loss": 2.7545,
"step": 5
},
{
"epoch": 0.06329113924050633,
"grad_norm": 1.671600580215454,
"learning_rate": 0.00019872773536895675,
"loss": 0.7945,
"step": 10
},
{
"epoch": 0.0949367088607595,
"grad_norm": 1.5613313913345337,
"learning_rate": 0.00019745547073791352,
"loss": 0.5902,
"step": 15
},
{
"epoch": 0.12658227848101267,
"grad_norm": 0.9209323525428772,
"learning_rate": 0.00019618320610687023,
"loss": 0.4814,
"step": 20
},
{
"epoch": 0.15822784810126583,
"grad_norm": 0.7866256237030029,
"learning_rate": 0.00019491094147582698,
"loss": 0.4366,
"step": 25
},
{
"epoch": 0.189873417721519,
"grad_norm": 0.7535956501960754,
"learning_rate": 0.00019363867684478372,
"loss": 0.4238,
"step": 30
},
{
"epoch": 0.22151898734177214,
"grad_norm": 0.6957400441169739,
"learning_rate": 0.00019236641221374049,
"loss": 0.5012,
"step": 35
},
{
"epoch": 0.25316455696202533,
"grad_norm": 0.7030977010726929,
"learning_rate": 0.00019109414758269723,
"loss": 0.4676,
"step": 40
},
{
"epoch": 0.2848101265822785,
"grad_norm": 0.741550624370575,
"learning_rate": 0.00018982188295165394,
"loss": 0.4893,
"step": 45
},
{
"epoch": 0.31645569620253167,
"grad_norm": 0.5580260753631592,
"learning_rate": 0.00018854961832061068,
"loss": 0.4825,
"step": 50
},
{
"epoch": 0.34810126582278483,
"grad_norm": 0.5945926308631897,
"learning_rate": 0.00018727735368956745,
"loss": 0.495,
"step": 55
},
{
"epoch": 0.379746835443038,
"grad_norm": 0.570940375328064,
"learning_rate": 0.0001860050890585242,
"loss": 0.4557,
"step": 60
},
{
"epoch": 0.41139240506329117,
"grad_norm": 0.6694577932357788,
"learning_rate": 0.00018473282442748093,
"loss": 0.4674,
"step": 65
},
{
"epoch": 0.4430379746835443,
"grad_norm": 0.6412336826324463,
"learning_rate": 0.00018346055979643765,
"loss": 0.4305,
"step": 70
},
{
"epoch": 0.47468354430379744,
"grad_norm": 0.6980250477790833,
"learning_rate": 0.00018218829516539442,
"loss": 0.4357,
"step": 75
},
{
"epoch": 0.5063291139240507,
"grad_norm": 0.48515501618385315,
"learning_rate": 0.00018091603053435116,
"loss": 0.4285,
"step": 80
},
{
"epoch": 0.5379746835443038,
"grad_norm": 0.6025907397270203,
"learning_rate": 0.0001796437659033079,
"loss": 0.45,
"step": 85
},
{
"epoch": 0.569620253164557,
"grad_norm": 0.5797450542449951,
"learning_rate": 0.00017837150127226464,
"loss": 0.5033,
"step": 90
},
{
"epoch": 0.6012658227848101,
"grad_norm": 0.6783467531204224,
"learning_rate": 0.00017709923664122138,
"loss": 0.4174,
"step": 95
},
{
"epoch": 0.6329113924050633,
"grad_norm": 0.5603845119476318,
"learning_rate": 0.00017582697201017812,
"loss": 0.4505,
"step": 100
},
{
"epoch": 0.6645569620253164,
"grad_norm": 0.7022290229797363,
"learning_rate": 0.00017455470737913486,
"loss": 0.5161,
"step": 105
},
{
"epoch": 0.6962025316455697,
"grad_norm": 0.6286556124687195,
"learning_rate": 0.00017328244274809163,
"loss": 0.4525,
"step": 110
},
{
"epoch": 0.7278481012658228,
"grad_norm": 0.7144973874092102,
"learning_rate": 0.00017201017811704835,
"loss": 0.5068,
"step": 115
},
{
"epoch": 0.759493670886076,
"grad_norm": 0.55781090259552,
"learning_rate": 0.0001707379134860051,
"loss": 0.4385,
"step": 120
},
{
"epoch": 0.7911392405063291,
"grad_norm": 0.5584812760353088,
"learning_rate": 0.00016946564885496183,
"loss": 0.4206,
"step": 125
},
{
"epoch": 0.8227848101265823,
"grad_norm": 0.7030683755874634,
"learning_rate": 0.0001681933842239186,
"loss": 0.4833,
"step": 130
},
{
"epoch": 0.8544303797468354,
"grad_norm": 0.6400471329689026,
"learning_rate": 0.00016692111959287534,
"loss": 0.4646,
"step": 135
},
{
"epoch": 0.8860759493670886,
"grad_norm": 0.5747826099395752,
"learning_rate": 0.00016564885496183205,
"loss": 0.4334,
"step": 140
},
{
"epoch": 0.9177215189873418,
"grad_norm": 0.519247829914093,
"learning_rate": 0.0001643765903307888,
"loss": 0.4365,
"step": 145
},
{
"epoch": 0.9493670886075949,
"grad_norm": 0.6712743639945984,
"learning_rate": 0.00016310432569974556,
"loss": 0.4196,
"step": 150
},
{
"epoch": 0.9810126582278481,
"grad_norm": 0.8766248226165771,
"learning_rate": 0.0001618320610687023,
"loss": 0.44,
"step": 155
},
{
"epoch": 1.0126582278481013,
"grad_norm": 0.432377427816391,
"learning_rate": 0.00016055979643765905,
"loss": 0.3616,
"step": 160
},
{
"epoch": 1.0443037974683544,
"grad_norm": 0.5202181339263916,
"learning_rate": 0.0001592875318066158,
"loss": 0.3763,
"step": 165
},
{
"epoch": 1.0759493670886076,
"grad_norm": 0.5511195659637451,
"learning_rate": 0.00015801526717557253,
"loss": 0.3291,
"step": 170
},
{
"epoch": 1.1075949367088607,
"grad_norm": 0.6027284264564514,
"learning_rate": 0.00015674300254452927,
"loss": 0.3141,
"step": 175
},
{
"epoch": 1.139240506329114,
"grad_norm": 0.6925147175788879,
"learning_rate": 0.000155470737913486,
"loss": 0.3651,
"step": 180
},
{
"epoch": 1.1708860759493671,
"grad_norm": 0.6030688285827637,
"learning_rate": 0.00015419847328244275,
"loss": 0.3411,
"step": 185
},
{
"epoch": 1.2025316455696202,
"grad_norm": 0.5992720127105713,
"learning_rate": 0.0001529262086513995,
"loss": 0.3508,
"step": 190
},
{
"epoch": 1.2341772151898733,
"grad_norm": 0.7508724331855774,
"learning_rate": 0.00015165394402035624,
"loss": 0.3284,
"step": 195
},
{
"epoch": 1.2658227848101267,
"grad_norm": 0.7126018404960632,
"learning_rate": 0.00015038167938931298,
"loss": 0.3466,
"step": 200
},
{
"epoch": 1.2974683544303798,
"grad_norm": 0.8017547130584717,
"learning_rate": 0.00014910941475826972,
"loss": 0.3485,
"step": 205
},
{
"epoch": 1.3291139240506329,
"grad_norm": 0.7311880588531494,
"learning_rate": 0.0001478371501272265,
"loss": 0.3566,
"step": 210
},
{
"epoch": 1.360759493670886,
"grad_norm": 0.7723591327667236,
"learning_rate": 0.0001465648854961832,
"loss": 0.3329,
"step": 215
},
{
"epoch": 1.3924050632911391,
"grad_norm": 0.8075547814369202,
"learning_rate": 0.00014529262086513994,
"loss": 0.3584,
"step": 220
},
{
"epoch": 1.4240506329113924,
"grad_norm": 0.5989384055137634,
"learning_rate": 0.0001440203562340967,
"loss": 0.371,
"step": 225
},
{
"epoch": 1.4556962025316456,
"grad_norm": 0.678035318851471,
"learning_rate": 0.00014274809160305345,
"loss": 0.3448,
"step": 230
},
{
"epoch": 1.4873417721518987,
"grad_norm": 0.8693724274635315,
"learning_rate": 0.0001414758269720102,
"loss": 0.3644,
"step": 235
},
{
"epoch": 1.518987341772152,
"grad_norm": 0.6025015115737915,
"learning_rate": 0.0001402035623409669,
"loss": 0.3233,
"step": 240
},
{
"epoch": 1.5506329113924051,
"grad_norm": 0.679233729839325,
"learning_rate": 0.00013893129770992368,
"loss": 0.3247,
"step": 245
},
{
"epoch": 1.5822784810126582,
"grad_norm": 0.7034026980400085,
"learning_rate": 0.00013765903307888042,
"loss": 0.3527,
"step": 250
},
{
"epoch": 1.6139240506329116,
"grad_norm": 0.7514588236808777,
"learning_rate": 0.00013638676844783716,
"loss": 0.3487,
"step": 255
},
{
"epoch": 1.6455696202531644,
"grad_norm": 0.7183879017829895,
"learning_rate": 0.0001351145038167939,
"loss": 0.3407,
"step": 260
},
{
"epoch": 1.6772151898734178,
"grad_norm": 0.6752856969833374,
"learning_rate": 0.00013384223918575064,
"loss": 0.3088,
"step": 265
},
{
"epoch": 1.7088607594936709,
"grad_norm": 0.8107082843780518,
"learning_rate": 0.00013256997455470738,
"loss": 0.3841,
"step": 270
},
{
"epoch": 1.740506329113924,
"grad_norm": 0.5849813222885132,
"learning_rate": 0.00013129770992366413,
"loss": 0.3325,
"step": 275
},
{
"epoch": 1.7721518987341773,
"grad_norm": 0.8018965125083923,
"learning_rate": 0.00013002544529262087,
"loss": 0.3649,
"step": 280
},
{
"epoch": 1.8037974683544302,
"grad_norm": 0.8379972577095032,
"learning_rate": 0.0001287531806615776,
"loss": 0.3668,
"step": 285
},
{
"epoch": 1.8354430379746836,
"grad_norm": 0.6462769508361816,
"learning_rate": 0.00012748091603053435,
"loss": 0.3363,
"step": 290
},
{
"epoch": 1.8670886075949367,
"grad_norm": 0.8890714645385742,
"learning_rate": 0.0001262086513994911,
"loss": 0.3265,
"step": 295
},
{
"epoch": 1.8987341772151898,
"grad_norm": 0.797147274017334,
"learning_rate": 0.00012493638676844783,
"loss": 0.3636,
"step": 300
},
{
"epoch": 1.9303797468354431,
"grad_norm": 0.6804778575897217,
"learning_rate": 0.0001236641221374046,
"loss": 0.3442,
"step": 305
},
{
"epoch": 1.9620253164556962,
"grad_norm": 0.6891390681266785,
"learning_rate": 0.00012239185750636134,
"loss": 0.3145,
"step": 310
},
{
"epoch": 1.9936708860759493,
"grad_norm": 0.9055079817771912,
"learning_rate": 0.00012111959287531807,
"loss": 0.342,
"step": 315
},
{
"epoch": 2.0253164556962027,
"grad_norm": 0.609603762626648,
"learning_rate": 0.00011984732824427483,
"loss": 0.2504,
"step": 320
},
{
"epoch": 2.0569620253164556,
"grad_norm": 1.3054362535476685,
"learning_rate": 0.00011857506361323157,
"loss": 0.2211,
"step": 325
},
{
"epoch": 2.088607594936709,
"grad_norm": 0.8065559267997742,
"learning_rate": 0.0001173027989821883,
"loss": 0.2173,
"step": 330
},
{
"epoch": 2.1202531645569622,
"grad_norm": 0.8054972887039185,
"learning_rate": 0.00011603053435114504,
"loss": 0.2126,
"step": 335
},
{
"epoch": 2.151898734177215,
"grad_norm": 0.9218589663505554,
"learning_rate": 0.00011475826972010179,
"loss": 0.2042,
"step": 340
},
{
"epoch": 2.1835443037974684,
"grad_norm": 0.9257758259773254,
"learning_rate": 0.00011348600508905853,
"loss": 0.2102,
"step": 345
},
{
"epoch": 2.2151898734177213,
"grad_norm": 0.9863210320472717,
"learning_rate": 0.00011221374045801527,
"loss": 0.219,
"step": 350
},
{
"epoch": 2.2468354430379747,
"grad_norm": 0.8986596465110779,
"learning_rate": 0.000110941475826972,
"loss": 0.2145,
"step": 355
},
{
"epoch": 2.278481012658228,
"grad_norm": 0.869886040687561,
"learning_rate": 0.00010966921119592877,
"loss": 0.1967,
"step": 360
},
{
"epoch": 2.310126582278481,
"grad_norm": 1.0244789123535156,
"learning_rate": 0.0001083969465648855,
"loss": 0.2095,
"step": 365
},
{
"epoch": 2.3417721518987342,
"grad_norm": 0.7236781120300293,
"learning_rate": 0.00010712468193384224,
"loss": 0.2123,
"step": 370
},
{
"epoch": 2.3734177215189876,
"grad_norm": 0.7103443145751953,
"learning_rate": 0.00010585241730279898,
"loss": 0.2205,
"step": 375
},
{
"epoch": 2.4050632911392404,
"grad_norm": 0.9352710247039795,
"learning_rate": 0.00010458015267175574,
"loss": 0.2259,
"step": 380
},
{
"epoch": 2.4367088607594938,
"grad_norm": 0.8048036098480225,
"learning_rate": 0.00010330788804071248,
"loss": 0.2138,
"step": 385
},
{
"epoch": 2.4683544303797467,
"grad_norm": 0.814346194267273,
"learning_rate": 0.00010203562340966922,
"loss": 0.2311,
"step": 390
},
{
"epoch": 2.5,
"grad_norm": 0.9042171835899353,
"learning_rate": 0.00010076335877862595,
"loss": 0.2278,
"step": 395
},
{
"epoch": 2.5316455696202533,
"grad_norm": 0.7023847103118896,
"learning_rate": 9.94910941475827e-05,
"loss": 0.2357,
"step": 400
},
{
"epoch": 2.5632911392405062,
"grad_norm": 0.9368842840194702,
"learning_rate": 9.821882951653944e-05,
"loss": 0.2182,
"step": 405
},
{
"epoch": 2.5949367088607596,
"grad_norm": 0.9225996136665344,
"learning_rate": 9.694656488549618e-05,
"loss": 0.2228,
"step": 410
},
{
"epoch": 2.6265822784810124,
"grad_norm": 0.7295313477516174,
"learning_rate": 9.567430025445293e-05,
"loss": 0.2143,
"step": 415
},
{
"epoch": 2.6582278481012658,
"grad_norm": 0.9664236903190613,
"learning_rate": 9.440203562340968e-05,
"loss": 0.2152,
"step": 420
},
{
"epoch": 2.689873417721519,
"grad_norm": 0.8742689490318298,
"learning_rate": 9.312977099236642e-05,
"loss": 0.2182,
"step": 425
},
{
"epoch": 2.721518987341772,
"grad_norm": 0.8087453842163086,
"learning_rate": 9.185750636132316e-05,
"loss": 0.2184,
"step": 430
},
{
"epoch": 2.7531645569620253,
"grad_norm": 1.062659502029419,
"learning_rate": 9.05852417302799e-05,
"loss": 0.2156,
"step": 435
},
{
"epoch": 2.7848101265822782,
"grad_norm": 0.9411716461181641,
"learning_rate": 8.931297709923665e-05,
"loss": 0.2213,
"step": 440
},
{
"epoch": 2.8164556962025316,
"grad_norm": 0.993147075176239,
"learning_rate": 8.804071246819339e-05,
"loss": 0.2127,
"step": 445
},
{
"epoch": 2.848101265822785,
"grad_norm": 0.8353611826896667,
"learning_rate": 8.676844783715013e-05,
"loss": 0.2116,
"step": 450
},
{
"epoch": 2.879746835443038,
"grad_norm": 0.9915521740913391,
"learning_rate": 8.549618320610687e-05,
"loss": 0.2299,
"step": 455
},
{
"epoch": 2.911392405063291,
"grad_norm": 0.9111132621765137,
"learning_rate": 8.422391857506363e-05,
"loss": 0.2113,
"step": 460
},
{
"epoch": 2.9430379746835444,
"grad_norm": 0.8857221007347107,
"learning_rate": 8.295165394402035e-05,
"loss": 0.2285,
"step": 465
},
{
"epoch": 2.9746835443037973,
"grad_norm": 0.8553436398506165,
"learning_rate": 8.167938931297711e-05,
"loss": 0.233,
"step": 470
},
{
"epoch": 3.0063291139240507,
"grad_norm": 0.5474989414215088,
"learning_rate": 8.040712468193385e-05,
"loss": 0.1938,
"step": 475
},
{
"epoch": 3.037974683544304,
"grad_norm": 0.703250527381897,
"learning_rate": 7.913486005089059e-05,
"loss": 0.131,
"step": 480
},
{
"epoch": 3.069620253164557,
"grad_norm": 1.2964314222335815,
"learning_rate": 7.786259541984733e-05,
"loss": 0.1256,
"step": 485
},
{
"epoch": 3.1012658227848102,
"grad_norm": 0.7699221968650818,
"learning_rate": 7.659033078880407e-05,
"loss": 0.1247,
"step": 490
},
{
"epoch": 3.132911392405063,
"grad_norm": 0.6273168325424194,
"learning_rate": 7.531806615776081e-05,
"loss": 0.1173,
"step": 495
},
{
"epoch": 3.1645569620253164,
"grad_norm": 0.7778182029724121,
"learning_rate": 7.404580152671756e-05,
"loss": 0.1263,
"step": 500
},
{
"epoch": 3.1962025316455698,
"grad_norm": 1.197022795677185,
"learning_rate": 7.27735368956743e-05,
"loss": 0.1278,
"step": 505
},
{
"epoch": 3.2278481012658227,
"grad_norm": 0.7795239090919495,
"learning_rate": 7.150127226463105e-05,
"loss": 0.1253,
"step": 510
},
{
"epoch": 3.259493670886076,
"grad_norm": 0.8459110856056213,
"learning_rate": 7.022900763358778e-05,
"loss": 0.1245,
"step": 515
},
{
"epoch": 3.291139240506329,
"grad_norm": 0.6801343560218811,
"learning_rate": 6.895674300254454e-05,
"loss": 0.1284,
"step": 520
},
{
"epoch": 3.3227848101265822,
"grad_norm": 1.0283461809158325,
"learning_rate": 6.768447837150128e-05,
"loss": 0.1289,
"step": 525
},
{
"epoch": 3.3544303797468356,
"grad_norm": 1.1402161121368408,
"learning_rate": 6.641221374045802e-05,
"loss": 0.1335,
"step": 530
},
{
"epoch": 3.3860759493670884,
"grad_norm": 0.8805460333824158,
"learning_rate": 6.513994910941476e-05,
"loss": 0.127,
"step": 535
},
{
"epoch": 3.4177215189873418,
"grad_norm": 0.8641778230667114,
"learning_rate": 6.38676844783715e-05,
"loss": 0.1253,
"step": 540
},
{
"epoch": 3.449367088607595,
"grad_norm": 0.9324259161949158,
"learning_rate": 6.259541984732826e-05,
"loss": 0.1244,
"step": 545
},
{
"epoch": 3.481012658227848,
"grad_norm": 0.8999868035316467,
"learning_rate": 6.132315521628498e-05,
"loss": 0.1294,
"step": 550
},
{
"epoch": 3.5126582278481013,
"grad_norm": 0.8019403219223022,
"learning_rate": 6.005089058524174e-05,
"loss": 0.1243,
"step": 555
},
{
"epoch": 3.5443037974683547,
"grad_norm": 0.9356296062469482,
"learning_rate": 5.877862595419847e-05,
"loss": 0.1296,
"step": 560
},
{
"epoch": 3.5759493670886076,
"grad_norm": 0.8532143831253052,
"learning_rate": 5.750636132315522e-05,
"loss": 0.124,
"step": 565
},
{
"epoch": 3.607594936708861,
"grad_norm": 1.1260122060775757,
"learning_rate": 5.6234096692111956e-05,
"loss": 0.1209,
"step": 570
},
{
"epoch": 3.6392405063291138,
"grad_norm": 0.7926989793777466,
"learning_rate": 5.496183206106871e-05,
"loss": 0.1265,
"step": 575
},
{
"epoch": 3.670886075949367,
"grad_norm": 0.8992180824279785,
"learning_rate": 5.3689567430025446e-05,
"loss": 0.1311,
"step": 580
},
{
"epoch": 3.7025316455696204,
"grad_norm": 0.7314108610153198,
"learning_rate": 5.2417302798982194e-05,
"loss": 0.1254,
"step": 585
},
{
"epoch": 3.7341772151898733,
"grad_norm": 0.9207622408866882,
"learning_rate": 5.114503816793893e-05,
"loss": 0.1289,
"step": 590
},
{
"epoch": 3.7658227848101267,
"grad_norm": 0.622431218624115,
"learning_rate": 4.9872773536895677e-05,
"loss": 0.1251,
"step": 595
},
{
"epoch": 3.7974683544303796,
"grad_norm": 1.0110617876052856,
"learning_rate": 4.860050890585242e-05,
"loss": 0.1312,
"step": 600
},
{
"epoch": 3.829113924050633,
"grad_norm": 0.699611246585846,
"learning_rate": 4.7328244274809166e-05,
"loss": 0.1263,
"step": 605
},
{
"epoch": 3.8607594936708862,
"grad_norm": 0.7877194881439209,
"learning_rate": 4.605597964376591e-05,
"loss": 0.1304,
"step": 610
},
{
"epoch": 3.892405063291139,
"grad_norm": 0.8100650906562805,
"learning_rate": 4.478371501272265e-05,
"loss": 0.1311,
"step": 615
},
{
"epoch": 3.9240506329113924,
"grad_norm": 0.6674991250038147,
"learning_rate": 4.351145038167939e-05,
"loss": 0.1303,
"step": 620
},
{
"epoch": 3.9556962025316453,
"grad_norm": 0.8028637170791626,
"learning_rate": 4.223918575063613e-05,
"loss": 0.1304,
"step": 625
},
{
"epoch": 3.9873417721518987,
"grad_norm": 1.6102169752120972,
"learning_rate": 4.096692111959288e-05,
"loss": 0.125,
"step": 630
},
{
"epoch": 4.018987341772152,
"grad_norm": 0.4470888376235962,
"learning_rate": 3.969465648854962e-05,
"loss": 0.106,
"step": 635
},
{
"epoch": 4.050632911392405,
"grad_norm": 0.40415889024734497,
"learning_rate": 3.842239185750636e-05,
"loss": 0.0871,
"step": 640
},
{
"epoch": 4.082278481012658,
"grad_norm": 0.48266398906707764,
"learning_rate": 3.7150127226463104e-05,
"loss": 0.0859,
"step": 645
},
{
"epoch": 4.113924050632911,
"grad_norm": 0.836426854133606,
"learning_rate": 3.5877862595419845e-05,
"loss": 0.0883,
"step": 650
},
{
"epoch": 4.1455696202531644,
"grad_norm": 0.6731426119804382,
"learning_rate": 3.4605597964376594e-05,
"loss": 0.0876,
"step": 655
},
{
"epoch": 4.177215189873418,
"grad_norm": 0.5741623640060425,
"learning_rate": 3.3333333333333335e-05,
"loss": 0.0911,
"step": 660
},
{
"epoch": 4.208860759493671,
"grad_norm": 0.5007706880569458,
"learning_rate": 3.2061068702290076e-05,
"loss": 0.0882,
"step": 665
},
{
"epoch": 4.2405063291139244,
"grad_norm": 0.5870316028594971,
"learning_rate": 3.078880407124682e-05,
"loss": 0.0891,
"step": 670
},
{
"epoch": 4.272151898734177,
"grad_norm": 0.698828935623169,
"learning_rate": 2.9516539440203562e-05,
"loss": 0.0912,
"step": 675
},
{
"epoch": 4.30379746835443,
"grad_norm": 0.5611512064933777,
"learning_rate": 2.824427480916031e-05,
"loss": 0.0881,
"step": 680
},
{
"epoch": 4.3354430379746836,
"grad_norm": 0.9599896669387817,
"learning_rate": 2.6972010178117052e-05,
"loss": 0.0875,
"step": 685
},
{
"epoch": 4.367088607594937,
"grad_norm": 0.6073245406150818,
"learning_rate": 2.5699745547073793e-05,
"loss": 0.0887,
"step": 690
},
{
"epoch": 4.39873417721519,
"grad_norm": 0.6183071136474609,
"learning_rate": 2.4427480916030535e-05,
"loss": 0.0902,
"step": 695
},
{
"epoch": 4.430379746835443,
"grad_norm": 0.4458979368209839,
"learning_rate": 2.3155216284987276e-05,
"loss": 0.088,
"step": 700
},
{
"epoch": 4.462025316455696,
"grad_norm": 0.6202102303504944,
"learning_rate": 2.1882951653944024e-05,
"loss": 0.0905,
"step": 705
},
{
"epoch": 4.493670886075949,
"grad_norm": 0.46292412281036377,
"learning_rate": 2.0610687022900766e-05,
"loss": 0.0895,
"step": 710
},
{
"epoch": 4.525316455696203,
"grad_norm": 0.6506438255310059,
"learning_rate": 1.9338422391857507e-05,
"loss": 0.0931,
"step": 715
},
{
"epoch": 4.556962025316456,
"grad_norm": 0.5219342112541199,
"learning_rate": 1.8066157760814252e-05,
"loss": 0.0916,
"step": 720
},
{
"epoch": 4.588607594936709,
"grad_norm": 0.47599899768829346,
"learning_rate": 1.6793893129770993e-05,
"loss": 0.0867,
"step": 725
},
{
"epoch": 4.620253164556962,
"grad_norm": 0.5680922865867615,
"learning_rate": 1.5521628498727735e-05,
"loss": 0.0878,
"step": 730
},
{
"epoch": 4.651898734177215,
"grad_norm": 0.5268383622169495,
"learning_rate": 1.424936386768448e-05,
"loss": 0.0881,
"step": 735
},
{
"epoch": 4.6835443037974684,
"grad_norm": 0.6063334345817566,
"learning_rate": 1.2977099236641221e-05,
"loss": 0.0904,
"step": 740
},
{
"epoch": 4.715189873417722,
"grad_norm": 0.5388665795326233,
"learning_rate": 1.1704834605597966e-05,
"loss": 0.0877,
"step": 745
},
{
"epoch": 4.746835443037975,
"grad_norm": 0.5125636458396912,
"learning_rate": 1.0432569974554709e-05,
"loss": 0.0927,
"step": 750
},
{
"epoch": 4.7784810126582276,
"grad_norm": 0.5058565139770508,
"learning_rate": 9.16030534351145e-06,
"loss": 0.0885,
"step": 755
},
{
"epoch": 4.810126582278481,
"grad_norm": 0.39005881547927856,
"learning_rate": 7.888040712468193e-06,
"loss": 0.0892,
"step": 760
},
{
"epoch": 4.841772151898734,
"grad_norm": 0.45494306087493896,
"learning_rate": 6.615776081424936e-06,
"loss": 0.0926,
"step": 765
},
{
"epoch": 4.8734177215189876,
"grad_norm": 0.5130964517593384,
"learning_rate": 5.343511450381679e-06,
"loss": 0.0902,
"step": 770
},
{
"epoch": 4.905063291139241,
"grad_norm": 0.6438283324241638,
"learning_rate": 4.0712468193384225e-06,
"loss": 0.092,
"step": 775
},
{
"epoch": 4.936708860759493,
"grad_norm": 0.4781509041786194,
"learning_rate": 2.7989821882951656e-06,
"loss": 0.0912,
"step": 780
},
{
"epoch": 4.968354430379747,
"grad_norm": 0.42383071780204773,
"learning_rate": 1.5267175572519084e-06,
"loss": 0.0866,
"step": 785
},
{
"epoch": 5.0,
"grad_norm": 0.7937325835227966,
"learning_rate": 2.544529262086514e-07,
"loss": 0.0851,
"step": 790
}
],
"logging_steps": 5,
"max_steps": 790,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 6.804508938412032e+16,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}