{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 640,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0078125,
"grad_norm": 4487.662174517055,
"learning_rate": 1.0000000000000002e-06,
"loss": 14.0537,
"step": 5
},
{
"epoch": 0.015625,
"grad_norm": 3265.096963107133,
"learning_rate": 2.25e-06,
"loss": 13.5061,
"step": 10
},
{
"epoch": 0.0234375,
"grad_norm": 4019.1529344819937,
"learning_rate": 3.5e-06,
"loss": 10.2522,
"step": 15
},
{
"epoch": 0.03125,
"grad_norm": 338.18628553133266,
"learning_rate": 4.75e-06,
"loss": 7.6669,
"step": 20
},
{
"epoch": 0.0390625,
"grad_norm": 543.0311725735058,
"learning_rate": 4.999486510586282e-06,
"loss": 5.5188,
"step": 25
},
{
"epoch": 0.046875,
"grad_norm": 246.23088738753083,
"learning_rate": 4.9974008213559725e-06,
"loss": 3.8606,
"step": 30
},
{
"epoch": 0.0546875,
"grad_norm": 119.09382078616744,
"learning_rate": 4.993712176889086e-06,
"loss": 3.9223,
"step": 35
},
{
"epoch": 0.0625,
"grad_norm": 369.14487711524765,
"learning_rate": 4.988422944739889e-06,
"loss": 3.7582,
"step": 40
},
{
"epoch": 0.0703125,
"grad_norm": 73.68641210467196,
"learning_rate": 4.981536519798899e-06,
"loss": 4.3581,
"step": 45
},
{
"epoch": 0.078125,
"grad_norm": 143.98864514567893,
"learning_rate": 4.973057322113883e-06,
"loss": 4.1023,
"step": 50
},
{
"epoch": 0.0859375,
"grad_norm": 129.86219911163266,
"learning_rate": 4.962990794052847e-06,
"loss": 3.3248,
"step": 55
},
{
"epoch": 0.09375,
"grad_norm": 213.93081161843782,
"learning_rate": 4.95134339681086e-06,
"loss": 3.5744,
"step": 60
},
{
"epoch": 0.1015625,
"grad_norm": 43.57647693525295,
"learning_rate": 4.938122606262935e-06,
"loss": 3.0606,
"step": 65
},
{
"epoch": 0.109375,
"grad_norm": 24.136487456562335,
"learning_rate": 4.923336908165649e-06,
"loss": 3.2941,
"step": 70
},
{
"epoch": 0.1171875,
"grad_norm": 79.3928477087145,
"learning_rate": 4.906995792710559e-06,
"loss": 3.466,
"step": 75
},
{
"epoch": 0.125,
"grad_norm": 20.979020201181374,
"learning_rate": 4.889109748432932e-06,
"loss": 3.0447,
"step": 80
},
{
"epoch": 0.1328125,
"grad_norm": 33.10678605914481,
"learning_rate": 4.8696902554796824e-06,
"loss": 2.9834,
"step": 85
},
{
"epoch": 0.140625,
"grad_norm": 37.842992430043786,
"learning_rate": 4.84874977824085e-06,
"loss": 3.3414,
"step": 90
},
{
"epoch": 0.1484375,
"grad_norm": 15.354073559162247,
"learning_rate": 4.826301757349337e-06,
"loss": 3.4018,
"step": 95
},
{
"epoch": 0.15625,
"grad_norm": 41.07210772695517,
"learning_rate": 4.802360601054042e-06,
"loss": 3.3653,
"step": 100
},
{
"epoch": 0.1640625,
"grad_norm": 34.46675403978081,
"learning_rate": 4.776941675971941e-06,
"loss": 2.8566,
"step": 105
},
{
"epoch": 0.171875,
"grad_norm": 30.438715655532075,
"learning_rate": 4.750061297225028e-06,
"loss": 2.9337,
"step": 110
},
{
"epoch": 0.1796875,
"grad_norm": 47.52097975604328,
"learning_rate": 4.721736717968465e-06,
"loss": 3.0085,
"step": 115
},
{
"epoch": 0.1875,
"grad_norm": 32.91467336279922,
"learning_rate": 4.691986118316654e-06,
"loss": 2.7027,
"step": 120
},
{
"epoch": 0.1953125,
"grad_norm": 31.817561483162603,
"learning_rate": 4.660828593674344e-06,
"loss": 3.0477,
"step": 125
},
{
"epoch": 0.203125,
"grad_norm": 68.38396573292292,
"learning_rate": 4.628284142480256e-06,
"loss": 3.1287,
"step": 130
},
{
"epoch": 0.2109375,
"grad_norm": 48.99060766773029,
"learning_rate": 4.594373653371095e-06,
"loss": 3.0499,
"step": 135
},
{
"epoch": 0.21875,
"grad_norm": 29.811323317057937,
"learning_rate": 4.559118891774188e-06,
"loss": 2.6658,
"step": 140
},
{
"epoch": 0.2265625,
"grad_norm": 16.757192674214547,
"learning_rate": 4.522542485937369e-06,
"loss": 2.7111,
"step": 145
},
{
"epoch": 0.234375,
"grad_norm": 854.2596861814334,
"learning_rate": 4.484667912405038e-06,
"loss": 2.7731,
"step": 150
},
{
"epoch": 0.2421875,
"grad_norm": 41.03420716766998,
"learning_rate": 4.445519480949761e-06,
"loss": 3.0335,
"step": 155
},
{
"epoch": 0.25,
"grad_norm": 37.20268617216472,
"learning_rate": 4.4051223189690585e-06,
"loss": 2.6551,
"step": 160
},
{
"epoch": 0.2578125,
"grad_norm": 35.47333065284907,
"learning_rate": 4.3635023553574e-06,
"loss": 2.5314,
"step": 165
},
{
"epoch": 0.265625,
"grad_norm": 34.11952544419604,
"learning_rate": 4.320686303863752e-06,
"loss": 2.4063,
"step": 170
},
{
"epoch": 0.2734375,
"grad_norm": 31.337300080344246,
"learning_rate": 4.276701645945384e-06,
"loss": 2.8953,
"step": 175
},
{
"epoch": 0.28125,
"grad_norm": 71.0076530183938,
"learning_rate": 4.231576613128902e-06,
"loss": 2.7765,
"step": 180
},
{
"epoch": 0.2890625,
"grad_norm": 35.89471866430099,
"learning_rate": 4.185340168889869e-06,
"loss": 2.6225,
"step": 185
},
{
"epoch": 0.296875,
"grad_norm": 34.718088237273705,
"learning_rate": 4.138021990062606e-06,
"loss": 2.6321,
"step": 190
},
{
"epoch": 0.3046875,
"grad_norm": 21.190370232623685,
"learning_rate": 4.089652447792141e-06,
"loss": 2.3654,
"step": 195
},
{
"epoch": 0.3125,
"grad_norm": 24.28207185282827,
"learning_rate": 4.040262588040503e-06,
"loss": 2.439,
"step": 200
},
{
"epoch": 0.3203125,
"grad_norm": 30.270563671656443,
"learning_rate": 3.989884111659893e-06,
"loss": 2.6155,
"step": 205
},
{
"epoch": 0.328125,
"grad_norm": 30.374998546903175,
"learning_rate": 3.938549354045508e-06,
"loss": 2.5646,
"step": 210
},
{
"epoch": 0.3359375,
"grad_norm": 28.01955269110465,
"learning_rate": 3.8862912643810895e-06,
"loss": 2.1882,
"step": 215
},
{
"epoch": 0.34375,
"grad_norm": 43.01312536445347,
"learning_rate": 3.833143384490506e-06,
"loss": 2.6895,
"step": 220
},
{
"epoch": 0.3515625,
"grad_norm": 34.41155110572982,
"learning_rate": 3.7791398273089562e-06,
"loss": 2.5118,
"step": 225
},
{
"epoch": 0.359375,
"grad_norm": 112.57991856629606,
"learning_rate": 3.7243152549875995e-06,
"loss": 2.223,
"step": 230
},
{
"epoch": 0.3671875,
"grad_norm": 76.22091458973803,
"learning_rate": 3.6687048566456783e-06,
"loss": 2.5385,
"step": 235
},
{
"epoch": 0.375,
"grad_norm": 47.102411590390844,
"learning_rate": 3.6123443257843985e-06,
"loss": 2.0943,
"step": 240
},
{
"epoch": 0.3828125,
"grad_norm": 73.15900084042869,
"learning_rate": 3.55526983737708e-06,
"loss": 2.4384,
"step": 245
},
{
"epoch": 0.390625,
"grad_norm": 54.222406085796,
"learning_rate": 3.4975180246502694e-06,
"loss": 2.5384,
"step": 250
},
{
"epoch": 0.3984375,
"grad_norm": 81.39267333279535,
"learning_rate": 3.4391259555707258e-06,
"loss": 2.4972,
"step": 255
},
{
"epoch": 0.40625,
"grad_norm": 67.5323257615502,
"learning_rate": 3.3801311090533713e-06,
"loss": 1.8014,
"step": 260
},
{
"epoch": 0.4140625,
"grad_norm": 26.26948979836602,
"learning_rate": 3.320571350905466e-06,
"loss": 1.773,
"step": 265
},
{
"epoch": 0.421875,
"grad_norm": 44.03138506176169,
"learning_rate": 3.2604849095224666e-06,
"loss": 2.0221,
"step": 270
},
{
"epoch": 0.4296875,
"grad_norm": 43.260012330071866,
"learning_rate": 3.1999103513511528e-06,
"loss": 2.2129,
"step": 275
},
{
"epoch": 0.4375,
"grad_norm": 26.951615470416954,
"learning_rate": 3.1388865561357727e-06,
"loss": 2.2301,
"step": 280
},
{
"epoch": 0.4453125,
"grad_norm": 46.519629142166174,
"learning_rate": 3.077452691963109e-06,
"loss": 2.3289,
"step": 285
},
{
"epoch": 0.453125,
"grad_norm": 48.39622358551033,
"learning_rate": 3.0156481901224573e-06,
"loss": 1.795,
"step": 290
},
{
"epoch": 0.4609375,
"grad_norm": 52.06374998250507,
"learning_rate": 2.953512719796683e-06,
"loss": 2.2433,
"step": 295
},
{
"epoch": 0.46875,
"grad_norm": 474.92706894375743,
"learning_rate": 2.8910861626005774e-06,
"loss": 1.7952,
"step": 300
},
{
"epoch": 0.4765625,
"grad_norm": 74.9957849628404,
"learning_rate": 2.8284085869828664e-06,
"loss": 2.3712,
"step": 305
},
{
"epoch": 0.484375,
"grad_norm": 51.21339523637916,
"learning_rate": 2.765520222508302e-06,
"loss": 1.9892,
"step": 310
},
{
"epoch": 0.4921875,
"grad_norm": 131.64456956757485,
"learning_rate": 2.7024614340363365e-06,
"loss": 1.9972,
"step": 315
},
{
"epoch": 0.5,
"grad_norm": 31.52894208660832,
"learning_rate": 2.6392726958129653e-06,
"loss": 1.5076,
"step": 320
},
{
"epoch": 0.5078125,
"grad_norm": 106.27601023204515,
"learning_rate": 2.5759945654923575e-06,
"loss": 2.0369,
"step": 325
},
{
"epoch": 0.515625,
"grad_norm": 47.324062545245894,
"learning_rate": 2.5126676581049413e-06,
"loss": 1.8094,
"step": 330
},
{
"epoch": 0.5234375,
"grad_norm": 76.05234123339977,
"learning_rate": 2.4493326199886813e-06,
"loss": 1.9059,
"step": 335
},
{
"epoch": 0.53125,
"grad_norm": 115.507138279873,
"learning_rate": 2.3860301027002432e-06,
"loss": 1.9663,
"step": 340
},
{
"epoch": 0.5390625,
"grad_norm": 81.81607613481519,
"learning_rate": 2.322800736922818e-06,
"loss": 2.1141,
"step": 345
},
{
"epoch": 0.546875,
"grad_norm": 97.43878259958254,
"learning_rate": 2.259685106387345e-06,
"loss": 2.0336,
"step": 350
},
{
"epoch": 0.5546875,
"grad_norm": 42.91394725301739,
"learning_rate": 2.196723721823863e-06,
"loss": 2.038,
"step": 355
},
{
"epoch": 0.5625,
"grad_norm": 37.32603199657443,
"learning_rate": 2.1339569949597284e-06,
"loss": 1.7698,
"step": 360
},
{
"epoch": 0.5703125,
"grad_norm": 200.45931632492386,
"learning_rate": 2.0714252125813667e-06,
"loss": 1.9531,
"step": 365
},
{
"epoch": 0.578125,
"grad_norm": 31.963910575975373,
"learning_rate": 2.0091685106762233e-06,
"loss": 1.8749,
"step": 370
},
{
"epoch": 0.5859375,
"grad_norm": 89.03310789583136,
"learning_rate": 1.947226848671508e-06,
"loss": 2.0674,
"step": 375
},
{
"epoch": 0.59375,
"grad_norm": 31.937502854280478,
"learning_rate": 1.8856399837862552e-06,
"loss": 1.6382,
"step": 380
},
{
"epoch": 0.6015625,
"grad_norm": 39.761140346439326,
"learning_rate": 1.824447445513179e-06,
"loss": 1.6946,
"step": 385
},
{
"epoch": 0.609375,
"grad_norm": 46.64848349398602,
"learning_rate": 1.7636885102466907e-06,
"loss": 1.5179,
"step": 390
},
{
"epoch": 0.6171875,
"grad_norm": 20.613416865713134,
"learning_rate": 1.7034021760733712e-06,
"loss": 1.437,
"step": 395
},
{
"epoch": 0.625,
"grad_norm": 70.94522606308973,
"learning_rate": 1.6436271377410667e-06,
"loss": 2.2482,
"step": 400
},
{
"epoch": 0.6328125,
"grad_norm": 43.35603987077311,
"learning_rate": 1.5844017618226934e-06,
"loss": 1.6283,
"step": 405
},
{
"epoch": 0.640625,
"grad_norm": 117.64561093792364,
"learning_rate": 1.525764062090671e-06,
"loss": 1.5861,
"step": 410
},
{
"epoch": 0.6484375,
"grad_norm": 51.13049033408944,
"learning_rate": 1.46775167511781e-06,
"loss": 1.7587,
"step": 415
},
{
"epoch": 0.65625,
"grad_norm": 23.67179139062178,
"learning_rate": 1.4104018361202947e-06,
"loss": 1.6897,
"step": 420
},
{
"epoch": 0.6640625,
"grad_norm": 29.890157693144154,
"learning_rate": 1.3537513550582853e-06,
"loss": 1.8858,
"step": 425
},
{
"epoch": 0.671875,
"grad_norm": 120.12084561754483,
"learning_rate": 1.2978365930094645e-06,
"loss": 1.6889,
"step": 430
},
{
"epoch": 0.6796875,
"grad_norm": 29.265397717275942,
"learning_rate": 1.2426934388307059e-06,
"loss": 1.6589,
"step": 435
},
{
"epoch": 0.6875,
"grad_norm": 147.8296220293303,
"learning_rate": 1.1883572861228255e-06,
"loss": 2.3553,
"step": 440
},
{
"epoch": 0.6953125,
"grad_norm": 75.42228430231394,
"learning_rate": 1.1348630105132253e-06,
"loss": 1.7002,
"step": 445
},
{
"epoch": 0.703125,
"grad_norm": 50.39104505021309,
"learning_rate": 1.0822449472709907e-06,
"loss": 1.9122,
"step": 450
},
{
"epoch": 0.7109375,
"grad_norm": 61.50703160486361,
"learning_rate": 1.0305368692688175e-06,
"loss": 1.536,
"step": 455
},
{
"epoch": 0.71875,
"grad_norm": 73.8503299397507,
"learning_rate": 9.797719653059176e-07,
"loss": 1.7232,
"step": 460
},
{
"epoch": 0.7265625,
"grad_norm": 67.88315535620379,
"learning_rate": 9.299828188058013e-07,
"loss": 1.7666,
"step": 465
},
{
"epoch": 0.734375,
"grad_norm": 83.76797348985389,
"learning_rate": 8.812013869026334e-07,
"loss": 1.6567,
"step": 470
},
{
"epoch": 0.7421875,
"grad_norm": 136.16919333944256,
"learning_rate": 8.334589799295592e-07,
"loss": 1.6399,
"step": 475
},
{
"epoch": 0.75,
"grad_norm": 50.54545692566125,
"learning_rate": 7.867862413221894e-07,
"loss": 1.5422,
"step": 480
},
{
"epoch": 0.7578125,
"grad_norm": 243.07205145310937,
"learning_rate": 7.412131279501297e-07,
"loss": 1.4785,
"step": 485
},
{
"epoch": 0.765625,
"grad_norm": 45.010451945401705,
"learning_rate": 6.967688908891793e-07,
"loss": 1.884,
"step": 490
},
{
"epoch": 0.7734375,
"grad_norm": 48.23699940972905,
"learning_rate": 6.534820566465464e-07,
"loss": 1.7596,
"step": 495
},
{
"epoch": 0.78125,
"grad_norm": 90.74839489958582,
"learning_rate": 6.113804088511261e-07,
"loss": 1.6985,
"step": 500
},
{
"epoch": 0.7890625,
"grad_norm": 26.31053939307125,
"learning_rate": 5.704909704205949e-07,
"loss": 1.7385,
"step": 505
},
{
"epoch": 0.796875,
"grad_norm": 58.47989097161682,
"learning_rate": 5.308399862167693e-07,
"loss": 1.9481,
"step": 510
},
{
"epoch": 0.8046875,
"grad_norm": 796.9462628456254,
"learning_rate": 4.924529062003522e-07,
"loss": 1.762,
"step": 515
},
{
"epoch": 0.8125,
"grad_norm": 39.897170110050865,
"learning_rate": 4.553543690958939e-07,
"loss": 1.2992,
"step": 520
},
{
"epoch": 0.8203125,
"grad_norm": 53.31172241007692,
"learning_rate": 4.1956818657744065e-07,
"loss": 1.8682,
"step": 525
},
{
"epoch": 0.828125,
"grad_norm": 35.918165546768364,
"learning_rate": 3.851173279850251e-07,
"loss": 1.6464,
"step": 530
},
{
"epoch": 0.8359375,
"grad_norm": 47.70548947871064,
"learning_rate": 3.5202390558181145e-07,
"loss": 2.157,
"step": 535
},
{
"epoch": 0.84375,
"grad_norm": 81.91605468065885,
"learning_rate": 3.2030916036134866e-07,
"loss": 1.6421,
"step": 540
},
{
"epoch": 0.8515625,
"grad_norm": 53.06973167494431,
"learning_rate": 2.8999344841405377e-07,
"loss": 1.6688,
"step": 545
},
{
"epoch": 0.859375,
"grad_norm": 88.49158154638214,
"learning_rate": 2.61096227861668e-07,
"loss": 1.702,
"step": 550
},
{
"epoch": 0.8671875,
"grad_norm": 120.59801382869998,
"learning_rate": 2.3363604636807065e-07,
"loss": 1.3829,
"step": 555
},
{
"epoch": 0.875,
"grad_norm": 139.01600571943374,
"learning_rate": 2.0763052923447214e-07,
"loss": 1.6871,
"step": 560
},
{
"epoch": 0.8828125,
"grad_norm": 45.50934842650282,
"learning_rate": 1.830963680866285e-07,
"loss": 1.4887,
"step": 565
},
{
"epoch": 0.890625,
"grad_norm": 35.76953954334381,
"learning_rate": 1.600493101613268e-07,
"loss": 1.6375,
"step": 570
},
{
"epoch": 0.8984375,
"grad_norm": 37.27719885211302,
"learning_rate": 1.3850414819903235e-07,
"loss": 1.7941,
"step": 575
},
{
"epoch": 0.90625,
"grad_norm": 39.534848671404404,
"learning_rate": 1.1847471094917711e-07,
"loss": 1.7665,
"step": 580
},
{
"epoch": 0.9140625,
"grad_norm": 50.50325344773253,
"learning_rate": 9.997385429418555e-08,
"loss": 1.8857,
"step": 585
},
{
"epoch": 0.921875,
"grad_norm": 31.729549388411133,
"learning_rate": 8.301345299793374e-08,
"loss": 1.5837,
"step": 590
},
{
"epoch": 0.9296875,
"grad_norm": 29.796478369791654,
"learning_rate": 6.760439308393763e-08,
"loss": 1.963,
"step": 595
},
{
"epoch": 0.9375,
"grad_norm": 37.3072954627531,
"learning_rate": 5.3756564848168325e-08,
"loss": 1.507,
"step": 600
},
{
"epoch": 0.9453125,
"grad_norm": 32.75001709716989,
"learning_rate": 4.147885651096861e-08,
"loss": 1.6399,
"step": 605
},
{
"epoch": 0.953125,
"grad_norm": 53.42892228692126,
"learning_rate": 3.077914851215585e-08,
"loss": 1.7654,
"step": 610
},
{
"epoch": 0.9609375,
"grad_norm": 91.78268048604703,
"learning_rate": 2.1664308452965798e-08,
"loss": 1.4882,
"step": 615
},
{
"epoch": 0.96875,
"grad_norm": 52.95012166005466,
"learning_rate": 1.4140186688086365e-08,
"loss": 1.5379,
"step": 620
},
{
"epoch": 0.9765625,
"grad_norm": 114.57655882499603,
"learning_rate": 8.211612570611927e-09,
"loss": 1.5018,
"step": 625
},
{
"epoch": 0.984375,
"grad_norm": 96.85859485649613,
"learning_rate": 3.882391352324766e-09,
"loss": 1.4683,
"step": 630
},
{
"epoch": 0.9921875,
"grad_norm": 37.14188481057337,
"learning_rate": 1.1553017412971323e-09,
"loss": 1.8316,
"step": 635
},
{
"epoch": 1.0,
"grad_norm": 34.77706091063119,
"learning_rate": 3.2094118379288885e-11,
"loss": 1.6444,
"step": 640
},
{
"epoch": 1.0,
"step": 640,
"total_flos": 61131755814912.0,
"train_loss": 2.5290436543524266,
"train_runtime": 1576.5021,
"train_samples_per_second": 3.245,
"train_steps_per_second": 0.406
}
],
"logging_steps": 5,
"max_steps": 640,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 640.0,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 61131755814912.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}