{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.0,
"eval_steps": 500,
"global_step": 790,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.03164556962025317,
"grad_norm": 5.788066387176514,
"learning_rate": 0.0002,
"loss": 2.6104,
"step": 5
},
{
"epoch": 0.06329113924050633,
"grad_norm": 1.5542352199554443,
"learning_rate": 0.00019872773536895675,
"loss": 0.7864,
"step": 10
},
{
"epoch": 0.0949367088607595,
"grad_norm": 1.996996283531189,
"learning_rate": 0.00019745547073791352,
"loss": 0.5532,
"step": 15
},
{
"epoch": 0.12658227848101267,
"grad_norm": 0.9193770885467529,
"learning_rate": 0.00019618320610687023,
"loss": 0.522,
"step": 20
},
{
"epoch": 0.15822784810126583,
"grad_norm": 1.024322271347046,
"learning_rate": 0.00019491094147582698,
"loss": 0.5621,
"step": 25
},
{
"epoch": 0.189873417721519,
"grad_norm": 0.873715341091156,
"learning_rate": 0.00019363867684478372,
"loss": 0.4996,
"step": 30
},
{
"epoch": 0.22151898734177214,
"grad_norm": 0.8645951151847839,
"learning_rate": 0.00019236641221374049,
"loss": 0.502,
"step": 35
},
{
"epoch": 0.25316455696202533,
"grad_norm": 0.7674330472946167,
"learning_rate": 0.00019109414758269723,
"loss": 0.433,
"step": 40
},
{
"epoch": 0.2848101265822785,
"grad_norm": 0.7591924667358398,
"learning_rate": 0.00018982188295165394,
"loss": 0.4406,
"step": 45
},
{
"epoch": 0.31645569620253167,
"grad_norm": 0.7595440745353699,
"learning_rate": 0.00018854961832061068,
"loss": 0.499,
"step": 50
},
{
"epoch": 0.34810126582278483,
"grad_norm": 0.7064336538314819,
"learning_rate": 0.00018727735368956745,
"loss": 0.4756,
"step": 55
},
{
"epoch": 0.379746835443038,
"grad_norm": 0.7232657670974731,
"learning_rate": 0.0001860050890585242,
"loss": 0.5227,
"step": 60
},
{
"epoch": 0.41139240506329117,
"grad_norm": 0.7500166296958923,
"learning_rate": 0.00018473282442748093,
"loss": 0.4481,
"step": 65
},
{
"epoch": 0.4430379746835443,
"grad_norm": 0.6161800026893616,
"learning_rate": 0.00018346055979643765,
"loss": 0.4868,
"step": 70
},
{
"epoch": 0.47468354430379744,
"grad_norm": 0.7168012857437134,
"learning_rate": 0.00018218829516539442,
"loss": 0.4833,
"step": 75
},
{
"epoch": 0.5063291139240507,
"grad_norm": 0.7813606262207031,
"learning_rate": 0.00018091603053435116,
"loss": 0.4777,
"step": 80
},
{
"epoch": 0.5379746835443038,
"grad_norm": 0.7056337594985962,
"learning_rate": 0.0001796437659033079,
"loss": 0.4521,
"step": 85
},
{
"epoch": 0.569620253164557,
"grad_norm": 0.570087730884552,
"learning_rate": 0.00017837150127226464,
"loss": 0.4456,
"step": 90
},
{
"epoch": 0.6012658227848101,
"grad_norm": 0.642938494682312,
"learning_rate": 0.00017709923664122138,
"loss": 0.4169,
"step": 95
},
{
"epoch": 0.6329113924050633,
"grad_norm": 0.7025493383407593,
"learning_rate": 0.00017582697201017812,
"loss": 0.4987,
"step": 100
},
{
"epoch": 0.6645569620253164,
"grad_norm": 0.7466819882392883,
"learning_rate": 0.00017455470737913486,
"loss": 0.4644,
"step": 105
},
{
"epoch": 0.6962025316455697,
"grad_norm": 0.7106885313987732,
"learning_rate": 0.00017328244274809163,
"loss": 0.4653,
"step": 110
},
{
"epoch": 0.7278481012658228,
"grad_norm": 0.6158185601234436,
"learning_rate": 0.00017201017811704835,
"loss": 0.4556,
"step": 115
},
{
"epoch": 0.759493670886076,
"grad_norm": 0.678554117679596,
"learning_rate": 0.0001707379134860051,
"loss": 0.4587,
"step": 120
},
{
"epoch": 0.7911392405063291,
"grad_norm": 0.8016729354858398,
"learning_rate": 0.00016946564885496183,
"loss": 0.4228,
"step": 125
},
{
"epoch": 0.8227848101265823,
"grad_norm": 0.7110231518745422,
"learning_rate": 0.0001681933842239186,
"loss": 0.4303,
"step": 130
},
{
"epoch": 0.8544303797468354,
"grad_norm": 0.6997452974319458,
"learning_rate": 0.00016692111959287534,
"loss": 0.4342,
"step": 135
},
{
"epoch": 0.8860759493670886,
"grad_norm": 0.6250122785568237,
"learning_rate": 0.00016564885496183205,
"loss": 0.427,
"step": 140
},
{
"epoch": 0.9177215189873418,
"grad_norm": 0.6947687864303589,
"learning_rate": 0.0001643765903307888,
"loss": 0.4763,
"step": 145
},
{
"epoch": 0.9493670886075949,
"grad_norm": 0.680385947227478,
"learning_rate": 0.00016310432569974556,
"loss": 0.4554,
"step": 150
},
{
"epoch": 0.9810126582278481,
"grad_norm": 0.5412645936012268,
"learning_rate": 0.0001618320610687023,
"loss": 0.4664,
"step": 155
},
{
"epoch": 1.0126582278481013,
"grad_norm": 0.5828943848609924,
"learning_rate": 0.00016055979643765905,
"loss": 0.4573,
"step": 160
},
{
"epoch": 1.0443037974683544,
"grad_norm": 0.6317119002342224,
"learning_rate": 0.0001592875318066158,
"loss": 0.325,
"step": 165
},
{
"epoch": 1.0759493670886076,
"grad_norm": 0.6031287312507629,
"learning_rate": 0.00015801526717557253,
"loss": 0.3658,
"step": 170
},
{
"epoch": 1.1075949367088607,
"grad_norm": 0.6438406109809875,
"learning_rate": 0.00015674300254452927,
"loss": 0.3645,
"step": 175
},
{
"epoch": 1.139240506329114,
"grad_norm": 0.6503311395645142,
"learning_rate": 0.000155470737913486,
"loss": 0.323,
"step": 180
},
{
"epoch": 1.1708860759493671,
"grad_norm": 0.8476307392120361,
"learning_rate": 0.00015419847328244275,
"loss": 0.347,
"step": 185
},
{
"epoch": 1.2025316455696202,
"grad_norm": 0.7285150289535522,
"learning_rate": 0.0001529262086513995,
"loss": 0.3831,
"step": 190
},
{
"epoch": 1.2341772151898733,
"grad_norm": 0.6327723860740662,
"learning_rate": 0.00015165394402035624,
"loss": 0.3577,
"step": 195
},
{
"epoch": 1.2658227848101267,
"grad_norm": 0.6771088242530823,
"learning_rate": 0.00015038167938931298,
"loss": 0.3154,
"step": 200
},
{
"epoch": 1.2974683544303798,
"grad_norm": 0.7355062365531921,
"learning_rate": 0.00014910941475826972,
"loss": 0.3706,
"step": 205
},
{
"epoch": 1.3291139240506329,
"grad_norm": 0.7912581562995911,
"learning_rate": 0.0001478371501272265,
"loss": 0.3456,
"step": 210
},
{
"epoch": 1.360759493670886,
"grad_norm": 0.6501379609107971,
"learning_rate": 0.0001465648854961832,
"loss": 0.3243,
"step": 215
},
{
"epoch": 1.3924050632911391,
"grad_norm": 0.6570438146591187,
"learning_rate": 0.00014529262086513994,
"loss": 0.3595,
"step": 220
},
{
"epoch": 1.4240506329113924,
"grad_norm": 0.6073997020721436,
"learning_rate": 0.0001440203562340967,
"loss": 0.3397,
"step": 225
},
{
"epoch": 1.4556962025316456,
"grad_norm": 0.7310261130332947,
"learning_rate": 0.00014274809160305345,
"loss": 0.3641,
"step": 230
},
{
"epoch": 1.4873417721518987,
"grad_norm": 0.8089779019355774,
"learning_rate": 0.0001414758269720102,
"loss": 0.3374,
"step": 235
},
{
"epoch": 1.518987341772152,
"grad_norm": 0.8505273461341858,
"learning_rate": 0.0001402035623409669,
"loss": 0.3695,
"step": 240
},
{
"epoch": 1.5506329113924051,
"grad_norm": 0.6972491145133972,
"learning_rate": 0.00013893129770992368,
"loss": 0.3556,
"step": 245
},
{
"epoch": 1.5822784810126582,
"grad_norm": 0.740247368812561,
"learning_rate": 0.00013765903307888042,
"loss": 0.3604,
"step": 250
},
{
"epoch": 1.6139240506329116,
"grad_norm": 0.818209707736969,
"learning_rate": 0.00013638676844783716,
"loss": 0.3538,
"step": 255
},
{
"epoch": 1.6455696202531644,
"grad_norm": 0.822881817817688,
"learning_rate": 0.0001351145038167939,
"loss": 0.3494,
"step": 260
},
{
"epoch": 1.6772151898734178,
"grad_norm": 0.7193669080734253,
"learning_rate": 0.00013384223918575064,
"loss": 0.3676,
"step": 265
},
{
"epoch": 1.7088607594936709,
"grad_norm": 0.6926146149635315,
"learning_rate": 0.00013256997455470738,
"loss": 0.313,
"step": 270
},
{
"epoch": 1.740506329113924,
"grad_norm": 1.0953829288482666,
"learning_rate": 0.00013129770992366413,
"loss": 0.3202,
"step": 275
},
{
"epoch": 1.7721518987341773,
"grad_norm": 0.8663277626037598,
"learning_rate": 0.00013002544529262087,
"loss": 0.3488,
"step": 280
},
{
"epoch": 1.8037974683544302,
"grad_norm": 1.1026146411895752,
"learning_rate": 0.0001287531806615776,
"loss": 0.3654,
"step": 285
},
{
"epoch": 1.8354430379746836,
"grad_norm": 0.7661195993423462,
"learning_rate": 0.00012748091603053435,
"loss": 0.3693,
"step": 290
},
{
"epoch": 1.8670886075949367,
"grad_norm": 0.6808319687843323,
"learning_rate": 0.0001262086513994911,
"loss": 0.3449,
"step": 295
},
{
"epoch": 1.8987341772151898,
"grad_norm": 0.7904935479164124,
"learning_rate": 0.00012493638676844783,
"loss": 0.3219,
"step": 300
},
{
"epoch": 1.9303797468354431,
"grad_norm": 0.7428227066993713,
"learning_rate": 0.0001236641221374046,
"loss": 0.3452,
"step": 305
},
{
"epoch": 1.9620253164556962,
"grad_norm": 0.8595893383026123,
"learning_rate": 0.00012239185750636134,
"loss": 0.3629,
"step": 310
},
{
"epoch": 1.9936708860759493,
"grad_norm": 0.7588908672332764,
"learning_rate": 0.00012111959287531807,
"loss": 0.333,
"step": 315
},
{
"epoch": 2.0253164556962027,
"grad_norm": 0.6112965941429138,
"learning_rate": 0.00011984732824427483,
"loss": 0.2809,
"step": 320
},
{
"epoch": 2.0569620253164556,
"grad_norm": 1.506441593170166,
"learning_rate": 0.00011857506361323157,
"loss": 0.2251,
"step": 325
},
{
"epoch": 2.088607594936709,
"grad_norm": 0.8897147178649902,
"learning_rate": 0.0001173027989821883,
"loss": 0.2258,
"step": 330
},
{
"epoch": 2.1202531645569622,
"grad_norm": 0.6773934960365295,
"learning_rate": 0.00011603053435114504,
"loss": 0.2115,
"step": 335
},
{
"epoch": 2.151898734177215,
"grad_norm": 0.832305908203125,
"learning_rate": 0.00011475826972010179,
"loss": 0.2317,
"step": 340
},
{
"epoch": 2.1835443037974684,
"grad_norm": 0.9453684687614441,
"learning_rate": 0.00011348600508905853,
"loss": 0.2325,
"step": 345
},
{
"epoch": 2.2151898734177213,
"grad_norm": 0.9567768573760986,
"learning_rate": 0.00011221374045801527,
"loss": 0.2273,
"step": 350
},
{
"epoch": 2.2468354430379747,
"grad_norm": 0.8501098155975342,
"learning_rate": 0.000110941475826972,
"loss": 0.2184,
"step": 355
},
{
"epoch": 2.278481012658228,
"grad_norm": 1.046438217163086,
"learning_rate": 0.00010966921119592877,
"loss": 0.2181,
"step": 360
},
{
"epoch": 2.310126582278481,
"grad_norm": 0.8483916521072388,
"learning_rate": 0.0001083969465648855,
"loss": 0.2176,
"step": 365
},
{
"epoch": 2.3417721518987342,
"grad_norm": 0.805766224861145,
"learning_rate": 0.00010712468193384224,
"loss": 0.2303,
"step": 370
},
{
"epoch": 2.3734177215189876,
"grad_norm": 0.8078694343566895,
"learning_rate": 0.00010585241730279898,
"loss": 0.2199,
"step": 375
},
{
"epoch": 2.4050632911392404,
"grad_norm": 1.255946397781372,
"learning_rate": 0.00010458015267175574,
"loss": 0.234,
"step": 380
},
{
"epoch": 2.4367088607594938,
"grad_norm": 0.8427215814590454,
"learning_rate": 0.00010330788804071248,
"loss": 0.2299,
"step": 385
},
{
"epoch": 2.4683544303797467,
"grad_norm": 0.6670682430267334,
"learning_rate": 0.00010203562340966922,
"loss": 0.2237,
"step": 390
},
{
"epoch": 2.5,
"grad_norm": 0.7526563405990601,
"learning_rate": 0.00010076335877862595,
"loss": 0.2276,
"step": 395
},
{
"epoch": 2.5316455696202533,
"grad_norm": 0.8490801453590393,
"learning_rate": 9.94910941475827e-05,
"loss": 0.223,
"step": 400
},
{
"epoch": 2.5632911392405062,
"grad_norm": 0.8474723100662231,
"learning_rate": 9.821882951653944e-05,
"loss": 0.23,
"step": 405
},
{
"epoch": 2.5949367088607596,
"grad_norm": 0.9560302495956421,
"learning_rate": 9.694656488549618e-05,
"loss": 0.2254,
"step": 410
},
{
"epoch": 2.6265822784810124,
"grad_norm": 0.8522864580154419,
"learning_rate": 9.567430025445293e-05,
"loss": 0.2304,
"step": 415
},
{
"epoch": 2.6582278481012658,
"grad_norm": 0.7552340030670166,
"learning_rate": 9.440203562340968e-05,
"loss": 0.2223,
"step": 420
},
{
"epoch": 2.689873417721519,
"grad_norm": 0.9277474284172058,
"learning_rate": 9.312977099236642e-05,
"loss": 0.232,
"step": 425
},
{
"epoch": 2.721518987341772,
"grad_norm": 0.9359229803085327,
"learning_rate": 9.185750636132316e-05,
"loss": 0.2278,
"step": 430
},
{
"epoch": 2.7531645569620253,
"grad_norm": 0.8999188542366028,
"learning_rate": 9.05852417302799e-05,
"loss": 0.2101,
"step": 435
},
{
"epoch": 2.7848101265822782,
"grad_norm": 1.0271995067596436,
"learning_rate": 8.931297709923665e-05,
"loss": 0.2355,
"step": 440
},
{
"epoch": 2.8164556962025316,
"grad_norm": 0.8051729798316956,
"learning_rate": 8.804071246819339e-05,
"loss": 0.2321,
"step": 445
},
{
"epoch": 2.848101265822785,
"grad_norm": 0.9024167656898499,
"learning_rate": 8.676844783715013e-05,
"loss": 0.2401,
"step": 450
},
{
"epoch": 2.879746835443038,
"grad_norm": 0.9658450484275818,
"learning_rate": 8.549618320610687e-05,
"loss": 0.2454,
"step": 455
},
{
"epoch": 2.911392405063291,
"grad_norm": 0.766612708568573,
"learning_rate": 8.422391857506363e-05,
"loss": 0.2391,
"step": 460
},
{
"epoch": 2.9430379746835444,
"grad_norm": 0.7812886238098145,
"learning_rate": 8.295165394402035e-05,
"loss": 0.2286,
"step": 465
},
{
"epoch": 2.9746835443037973,
"grad_norm": 0.8621058464050293,
"learning_rate": 8.167938931297711e-05,
"loss": 0.2444,
"step": 470
},
{
"epoch": 3.0063291139240507,
"grad_norm": 0.6189867854118347,
"learning_rate": 8.040712468193385e-05,
"loss": 0.2114,
"step": 475
},
{
"epoch": 3.037974683544304,
"grad_norm": 0.8174425959587097,
"learning_rate": 7.913486005089059e-05,
"loss": 0.1325,
"step": 480
},
{
"epoch": 3.069620253164557,
"grad_norm": 1.3215669393539429,
"learning_rate": 7.786259541984733e-05,
"loss": 0.1431,
"step": 485
},
{
"epoch": 3.1012658227848102,
"grad_norm": 0.944607138633728,
"learning_rate": 7.659033078880407e-05,
"loss": 0.1334,
"step": 490
},
{
"epoch": 3.132911392405063,
"grad_norm": 0.8664195537567139,
"learning_rate": 7.531806615776081e-05,
"loss": 0.1358,
"step": 495
},
{
"epoch": 3.1645569620253164,
"grad_norm": 0.7892764806747437,
"learning_rate": 7.404580152671756e-05,
"loss": 0.124,
"step": 500
},
{
"epoch": 3.1962025316455698,
"grad_norm": 0.9069737195968628,
"learning_rate": 7.27735368956743e-05,
"loss": 0.1388,
"step": 505
},
{
"epoch": 3.2278481012658227,
"grad_norm": 0.9595538973808289,
"learning_rate": 7.150127226463105e-05,
"loss": 0.1305,
"step": 510
},
{
"epoch": 3.259493670886076,
"grad_norm": 0.9580609798431396,
"learning_rate": 7.022900763358778e-05,
"loss": 0.139,
"step": 515
},
{
"epoch": 3.291139240506329,
"grad_norm": 0.8919333219528198,
"learning_rate": 6.895674300254454e-05,
"loss": 0.1292,
"step": 520
},
{
"epoch": 3.3227848101265822,
"grad_norm": 0.9798533320426941,
"learning_rate": 6.768447837150128e-05,
"loss": 0.1348,
"step": 525
},
{
"epoch": 3.3544303797468356,
"grad_norm": 0.737899661064148,
"learning_rate": 6.641221374045802e-05,
"loss": 0.1418,
"step": 530
},
{
"epoch": 3.3860759493670884,
"grad_norm": 0.8077306151390076,
"learning_rate": 6.513994910941476e-05,
"loss": 0.1434,
"step": 535
},
{
"epoch": 3.4177215189873418,
"grad_norm": 0.6728256940841675,
"learning_rate": 6.38676844783715e-05,
"loss": 0.1347,
"step": 540
},
{
"epoch": 3.449367088607595,
"grad_norm": 0.8441898822784424,
"learning_rate": 6.259541984732826e-05,
"loss": 0.1294,
"step": 545
},
{
"epoch": 3.481012658227848,
"grad_norm": 0.7539904713630676,
"learning_rate": 6.132315521628498e-05,
"loss": 0.1337,
"step": 550
},
{
"epoch": 3.5126582278481013,
"grad_norm": 0.874884843826294,
"learning_rate": 6.005089058524174e-05,
"loss": 0.1318,
"step": 555
},
{
"epoch": 3.5443037974683547,
"grad_norm": 0.8220652937889099,
"learning_rate": 5.877862595419847e-05,
"loss": 0.1372,
"step": 560
},
{
"epoch": 3.5759493670886076,
"grad_norm": 0.8709121942520142,
"learning_rate": 5.750636132315522e-05,
"loss": 0.1329,
"step": 565
},
{
"epoch": 3.607594936708861,
"grad_norm": 1.0847886800765991,
"learning_rate": 5.6234096692111956e-05,
"loss": 0.1366,
"step": 570
},
{
"epoch": 3.6392405063291138,
"grad_norm": 1.150924563407898,
"learning_rate": 5.496183206106871e-05,
"loss": 0.1401,
"step": 575
},
{
"epoch": 3.670886075949367,
"grad_norm": 1.2749351263046265,
"learning_rate": 5.3689567430025446e-05,
"loss": 0.1388,
"step": 580
},
{
"epoch": 3.7025316455696204,
"grad_norm": 0.8032536506652832,
"learning_rate": 5.2417302798982194e-05,
"loss": 0.1312,
"step": 585
},
{
"epoch": 3.7341772151898733,
"grad_norm": 1.0450551509857178,
"learning_rate": 5.114503816793893e-05,
"loss": 0.1444,
"step": 590
},
{
"epoch": 3.7658227848101267,
"grad_norm": 0.8416706919670105,
"learning_rate": 4.9872773536895677e-05,
"loss": 0.1401,
"step": 595
},
{
"epoch": 3.7974683544303796,
"grad_norm": 0.9472242593765259,
"learning_rate": 4.860050890585242e-05,
"loss": 0.1414,
"step": 600
},
{
"epoch": 3.829113924050633,
"grad_norm": 1.0049540996551514,
"learning_rate": 4.7328244274809166e-05,
"loss": 0.142,
"step": 605
},
{
"epoch": 3.8607594936708862,
"grad_norm": 0.9480180144309998,
"learning_rate": 4.605597964376591e-05,
"loss": 0.1416,
"step": 610
},
{
"epoch": 3.892405063291139,
"grad_norm": 1.5082101821899414,
"learning_rate": 4.478371501272265e-05,
"loss": 0.1327,
"step": 615
},
{
"epoch": 3.9240506329113924,
"grad_norm": 0.7728102207183838,
"learning_rate": 4.351145038167939e-05,
"loss": 0.1303,
"step": 620
},
{
"epoch": 3.9556962025316453,
"grad_norm": 0.8425063490867615,
"learning_rate": 4.223918575063613e-05,
"loss": 0.1324,
"step": 625
},
{
"epoch": 3.9873417721518987,
"grad_norm": 0.9874700307846069,
"learning_rate": 4.096692111959288e-05,
"loss": 0.1405,
"step": 630
},
{
"epoch": 4.018987341772152,
"grad_norm": 0.47495508193969727,
"learning_rate": 3.969465648854962e-05,
"loss": 0.1071,
"step": 635
},
{
"epoch": 4.050632911392405,
"grad_norm": 0.3872312009334564,
"learning_rate": 3.842239185750636e-05,
"loss": 0.0901,
"step": 640
},
{
"epoch": 4.082278481012658,
"grad_norm": 0.5231612920761108,
"learning_rate": 3.7150127226463104e-05,
"loss": 0.0969,
"step": 645
},
{
"epoch": 4.113924050632911,
"grad_norm": 1.3609975576400757,
"learning_rate": 3.5877862595419845e-05,
"loss": 0.0893,
"step": 650
},
{
"epoch": 4.1455696202531644,
"grad_norm": 0.673308253288269,
"learning_rate": 3.4605597964376594e-05,
"loss": 0.0928,
"step": 655
},
{
"epoch": 4.177215189873418,
"grad_norm": 0.482837438583374,
"learning_rate": 3.3333333333333335e-05,
"loss": 0.0912,
"step": 660
},
{
"epoch": 4.208860759493671,
"grad_norm": 0.7408327460289001,
"learning_rate": 3.2061068702290076e-05,
"loss": 0.0912,
"step": 665
},
{
"epoch": 4.2405063291139244,
"grad_norm": 0.5170139074325562,
"learning_rate": 3.078880407124682e-05,
"loss": 0.0958,
"step": 670
},
{
"epoch": 4.272151898734177,
"grad_norm": 0.6944636106491089,
"learning_rate": 2.9516539440203562e-05,
"loss": 0.0883,
"step": 675
},
{
"epoch": 4.30379746835443,
"grad_norm": 0.6294474601745605,
"learning_rate": 2.824427480916031e-05,
"loss": 0.0915,
"step": 680
},
{
"epoch": 4.3354430379746836,
"grad_norm": 0.6630133390426636,
"learning_rate": 2.6972010178117052e-05,
"loss": 0.0905,
"step": 685
},
{
"epoch": 4.367088607594937,
"grad_norm": 0.6174125075340271,
"learning_rate": 2.5699745547073793e-05,
"loss": 0.0958,
"step": 690
},
{
"epoch": 4.39873417721519,
"grad_norm": 0.7214713096618652,
"learning_rate": 2.4427480916030535e-05,
"loss": 0.0905,
"step": 695
},
{
"epoch": 4.430379746835443,
"grad_norm": 0.7950146794319153,
"learning_rate": 2.3155216284987276e-05,
"loss": 0.0925,
"step": 700
},
{
"epoch": 4.462025316455696,
"grad_norm": 0.6609070301055908,
"learning_rate": 2.1882951653944024e-05,
"loss": 0.0929,
"step": 705
},
{
"epoch": 4.493670886075949,
"grad_norm": 0.6648293733596802,
"learning_rate": 2.0610687022900766e-05,
"loss": 0.0908,
"step": 710
},
{
"epoch": 4.525316455696203,
"grad_norm": 0.5198394656181335,
"learning_rate": 1.9338422391857507e-05,
"loss": 0.0907,
"step": 715
},
{
"epoch": 4.556962025316456,
"grad_norm": 0.8868843913078308,
"learning_rate": 1.8066157760814252e-05,
"loss": 0.0957,
"step": 720
},
{
"epoch": 4.588607594936709,
"grad_norm": 0.6488995552062988,
"learning_rate": 1.6793893129770993e-05,
"loss": 0.0898,
"step": 725
},
{
"epoch": 4.620253164556962,
"grad_norm": 0.5500432252883911,
"learning_rate": 1.5521628498727735e-05,
"loss": 0.0946,
"step": 730
},
{
"epoch": 4.651898734177215,
"grad_norm": 0.8371357321739197,
"learning_rate": 1.424936386768448e-05,
"loss": 0.0905,
"step": 735
},
{
"epoch": 4.6835443037974684,
"grad_norm": 0.5861048102378845,
"learning_rate": 1.2977099236641221e-05,
"loss": 0.0941,
"step": 740
},
{
"epoch": 4.715189873417722,
"grad_norm": 0.7422693371772766,
"learning_rate": 1.1704834605597966e-05,
"loss": 0.0956,
"step": 745
},
{
"epoch": 4.746835443037975,
"grad_norm": 0.5376149415969849,
"learning_rate": 1.0432569974554709e-05,
"loss": 0.0937,
"step": 750
},
{
"epoch": 4.7784810126582276,
"grad_norm": 0.46256035566329956,
"learning_rate": 9.16030534351145e-06,
"loss": 0.0878,
"step": 755
},
{
"epoch": 4.810126582278481,
"grad_norm": 0.4410872459411621,
"learning_rate": 7.888040712468193e-06,
"loss": 0.0929,
"step": 760
},
{
"epoch": 4.841772151898734,
"grad_norm": 0.44353851675987244,
"learning_rate": 6.615776081424936e-06,
"loss": 0.0909,
"step": 765
},
{
"epoch": 4.8734177215189876,
"grad_norm": 0.5728248953819275,
"learning_rate": 5.343511450381679e-06,
"loss": 0.0946,
"step": 770
},
{
"epoch": 4.905063291139241,
"grad_norm": 0.4497832655906677,
"learning_rate": 4.0712468193384225e-06,
"loss": 0.089,
"step": 775
},
{
"epoch": 4.936708860759493,
"grad_norm": 0.650806725025177,
"learning_rate": 2.7989821882951656e-06,
"loss": 0.0939,
"step": 780
},
{
"epoch": 4.968354430379747,
"grad_norm": 0.5624284148216248,
"learning_rate": 1.5267175572519084e-06,
"loss": 0.0918,
"step": 785
},
{
"epoch": 5.0,
"grad_norm": 0.5814446210861206,
"learning_rate": 2.544529262086514e-07,
"loss": 0.0872,
"step": 790
}
],
"logging_steps": 5,
"max_steps": 790,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9.503292302504755e+16,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}