Qwen3-4B-Sorted-3Task / trainer_state.json
TabibitoQZP's picture
Upload folder using huggingface_hub
8bbc2e9 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.000629326620516,
"eval_steps": 500,
"global_step": 1589,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0062932662051604785,
"grad_norm": 39.40880584716797,
"learning_rate": 5.660377358490567e-07,
"loss": 1.2507,
"step": 10
},
{
"epoch": 0.012586532410320957,
"grad_norm": 12.689669609069824,
"learning_rate": 1.1949685534591195e-06,
"loss": 0.9818,
"step": 20
},
{
"epoch": 0.018879798615481436,
"grad_norm": 1.60954749584198,
"learning_rate": 1.8238993710691824e-06,
"loss": 0.6639,
"step": 30
},
{
"epoch": 0.025173064820641914,
"grad_norm": 0.9736618995666504,
"learning_rate": 2.4528301886792453e-06,
"loss": 0.603,
"step": 40
},
{
"epoch": 0.03146633102580239,
"grad_norm": 0.9699676632881165,
"learning_rate": 3.0817610062893084e-06,
"loss": 0.5679,
"step": 50
},
{
"epoch": 0.03775959723096287,
"grad_norm": 0.8372435569763184,
"learning_rate": 3.710691823899371e-06,
"loss": 0.5549,
"step": 60
},
{
"epoch": 0.04405286343612335,
"grad_norm": 0.8186138272285461,
"learning_rate": 4.339622641509435e-06,
"loss": 0.5552,
"step": 70
},
{
"epoch": 0.05034612964128383,
"grad_norm": 0.7362136840820312,
"learning_rate": 4.968553459119497e-06,
"loss": 0.558,
"step": 80
},
{
"epoch": 0.056639395846444306,
"grad_norm": 0.8293086290359497,
"learning_rate": 5.59748427672956e-06,
"loss": 0.5432,
"step": 90
},
{
"epoch": 0.06293266205160478,
"grad_norm": 0.7764604091644287,
"learning_rate": 6.226415094339623e-06,
"loss": 0.541,
"step": 100
},
{
"epoch": 0.06922592825676527,
"grad_norm": 0.8436954021453857,
"learning_rate": 6.855345911949685e-06,
"loss": 0.5457,
"step": 110
},
{
"epoch": 0.07551919446192575,
"grad_norm": 0.7573267817497253,
"learning_rate": 7.484276729559748e-06,
"loss": 0.5285,
"step": 120
},
{
"epoch": 0.08181246066708622,
"grad_norm": 0.8208069801330566,
"learning_rate": 8.113207547169812e-06,
"loss": 0.5352,
"step": 130
},
{
"epoch": 0.0881057268722467,
"grad_norm": 0.759560227394104,
"learning_rate": 8.742138364779875e-06,
"loss": 0.5333,
"step": 140
},
{
"epoch": 0.09439899307740718,
"grad_norm": 0.8434644341468811,
"learning_rate": 9.371069182389939e-06,
"loss": 0.5372,
"step": 150
},
{
"epoch": 0.10069225928256766,
"grad_norm": 0.8114253878593445,
"learning_rate": 1e-05,
"loss": 0.5339,
"step": 160
},
{
"epoch": 0.10698552548772813,
"grad_norm": 0.8041621446609497,
"learning_rate": 9.998793436421342e-06,
"loss": 0.5371,
"step": 170
},
{
"epoch": 0.11327879169288861,
"grad_norm": 0.782455563545227,
"learning_rate": 9.99517432800363e-06,
"loss": 0.5224,
"step": 180
},
{
"epoch": 0.11957205789804909,
"grad_norm": 0.802542507648468,
"learning_rate": 9.98914442142063e-06,
"loss": 0.5209,
"step": 190
},
{
"epoch": 0.12586532410320955,
"grad_norm": 0.8419063091278076,
"learning_rate": 9.980706626858607e-06,
"loss": 0.5261,
"step": 200
},
{
"epoch": 0.13215859030837004,
"grad_norm": 0.7716870307922363,
"learning_rate": 9.9698650166118e-06,
"loss": 0.522,
"step": 210
},
{
"epoch": 0.13845185651353054,
"grad_norm": 0.774776816368103,
"learning_rate": 9.956624823117036e-06,
"loss": 0.5305,
"step": 220
},
{
"epoch": 0.144745122718691,
"grad_norm": 0.7823233008384705,
"learning_rate": 9.94099243642841e-06,
"loss": 0.5247,
"step": 230
},
{
"epoch": 0.1510383889238515,
"grad_norm": 0.7220829725265503,
"learning_rate": 9.922975401133292e-06,
"loss": 0.5286,
"step": 240
},
{
"epoch": 0.15733165512901195,
"grad_norm": 0.7797294855117798,
"learning_rate": 9.90258241271112e-06,
"loss": 0.5299,
"step": 250
},
{
"epoch": 0.16362492133417245,
"grad_norm": 0.7687580585479736,
"learning_rate": 9.879823313336723e-06,
"loss": 0.5262,
"step": 260
},
{
"epoch": 0.1699181875393329,
"grad_norm": 0.7156737446784973,
"learning_rate": 9.854709087130261e-06,
"loss": 0.5227,
"step": 270
},
{
"epoch": 0.1762114537444934,
"grad_norm": 0.747580885887146,
"learning_rate": 9.827251854855992e-06,
"loss": 0.5186,
"step": 280
},
{
"epoch": 0.18250471994965387,
"grad_norm": 0.7566559910774231,
"learning_rate": 9.797464868072489e-06,
"loss": 0.5127,
"step": 290
},
{
"epoch": 0.18879798615481436,
"grad_norm": 0.747591495513916,
"learning_rate": 9.765362502737098e-06,
"loss": 0.5167,
"step": 300
},
{
"epoch": 0.19509125235997482,
"grad_norm": 0.732440173625946,
"learning_rate": 9.730960252267744e-06,
"loss": 0.5225,
"step": 310
},
{
"epoch": 0.2013845185651353,
"grad_norm": 0.7387551069259644,
"learning_rate": 9.6942747200654e-06,
"loss": 0.5149,
"step": 320
},
{
"epoch": 0.20767778477029578,
"grad_norm": 0.7358985543251038,
"learning_rate": 9.655323611500876e-06,
"loss": 0.518,
"step": 330
},
{
"epoch": 0.21397105097545627,
"grad_norm": 0.7722839117050171,
"learning_rate": 9.614125725369748e-06,
"loss": 0.5095,
"step": 340
},
{
"epoch": 0.22026431718061673,
"grad_norm": 0.677197277545929,
"learning_rate": 9.570700944819584e-06,
"loss": 0.5233,
"step": 350
},
{
"epoch": 0.22655758338577722,
"grad_norm": 0.6825560331344604,
"learning_rate": 9.525070227753835e-06,
"loss": 0.5125,
"step": 360
},
{
"epoch": 0.2328508495909377,
"grad_norm": 0.6920183300971985,
"learning_rate": 9.477255596717012e-06,
"loss": 0.5191,
"step": 370
},
{
"epoch": 0.23914411579609818,
"grad_norm": 0.7336747646331787,
"learning_rate": 9.427280128266049e-06,
"loss": 0.5163,
"step": 380
},
{
"epoch": 0.24543738200125864,
"grad_norm": 0.7665858268737793,
"learning_rate": 9.375167941832974e-06,
"loss": 0.5062,
"step": 390
},
{
"epoch": 0.2517306482064191,
"grad_norm": 0.6906554102897644,
"learning_rate": 9.320944188084241e-06,
"loss": 0.518,
"step": 400
},
{
"epoch": 0.2580239144115796,
"grad_norm": 0.7612572312355042,
"learning_rate": 9.264635036782406e-06,
"loss": 0.5042,
"step": 410
},
{
"epoch": 0.2643171806167401,
"grad_norm": 0.7517194747924805,
"learning_rate": 9.206267664155906e-06,
"loss": 0.5221,
"step": 420
},
{
"epoch": 0.27061044682190055,
"grad_norm": 0.7678345441818237,
"learning_rate": 9.145870239783143e-06,
"loss": 0.5172,
"step": 430
},
{
"epoch": 0.27690371302706107,
"grad_norm": 0.7215328812599182,
"learning_rate": 9.08347191299711e-06,
"loss": 0.5143,
"step": 440
},
{
"epoch": 0.28319697923222154,
"grad_norm": 0.6326926350593567,
"learning_rate": 9.019102798817196e-06,
"loss": 0.5164,
"step": 450
},
{
"epoch": 0.289490245437382,
"grad_norm": 0.689453661441803,
"learning_rate": 8.952793963414908e-06,
"loss": 0.5179,
"step": 460
},
{
"epoch": 0.29578351164254246,
"grad_norm": 0.7151985168457031,
"learning_rate": 8.884577409120535e-06,
"loss": 0.5073,
"step": 470
},
{
"epoch": 0.302076777847703,
"grad_norm": 0.7656172513961792,
"learning_rate": 8.814486058978035e-06,
"loss": 0.5042,
"step": 480
},
{
"epoch": 0.30837004405286345,
"grad_norm": 0.680549144744873,
"learning_rate": 8.742553740855507e-06,
"loss": 0.5191,
"step": 490
},
{
"epoch": 0.3146633102580239,
"grad_norm": 0.7366177439689636,
"learning_rate": 8.66881517111902e-06,
"loss": 0.5163,
"step": 500
},
{
"epoch": 0.3209565764631844,
"grad_norm": 0.8044482469558716,
"learning_rate": 8.593305937877614e-06,
"loss": 0.5152,
"step": 510
},
{
"epoch": 0.3272498426683449,
"grad_norm": 0.6989796161651611,
"learning_rate": 8.516062483807556e-06,
"loss": 0.5192,
"step": 520
},
{
"epoch": 0.33354310887350536,
"grad_norm": 0.705839216709137,
"learning_rate": 8.437122088564197e-06,
"loss": 0.5054,
"step": 530
},
{
"epoch": 0.3398363750786658,
"grad_norm": 0.6799296736717224,
"learning_rate": 8.356522850789852e-06,
"loss": 0.5032,
"step": 540
},
{
"epoch": 0.3461296412838263,
"grad_norm": 0.8019563555717468,
"learning_rate": 8.274303669726427e-06,
"loss": 0.5113,
"step": 550
},
{
"epoch": 0.3524229074889868,
"grad_norm": 0.7109248638153076,
"learning_rate": 8.190504226441654e-06,
"loss": 0.5029,
"step": 560
},
{
"epoch": 0.35871617369414727,
"grad_norm": 0.7193357944488525,
"learning_rate": 8.105164964678009e-06,
"loss": 0.5127,
"step": 570
},
{
"epoch": 0.36500943989930773,
"grad_norm": 0.6586730480194092,
"learning_rate": 8.018327071333521e-06,
"loss": 0.5178,
"step": 580
},
{
"epoch": 0.3713027061044682,
"grad_norm": 0.7992932796478271,
"learning_rate": 7.930032456583931e-06,
"loss": 0.5064,
"step": 590
},
{
"epoch": 0.3775959723096287,
"grad_norm": 0.6866645812988281,
"learning_rate": 7.84032373365578e-06,
"loss": 0.5025,
"step": 600
},
{
"epoch": 0.3838892385147892,
"grad_norm": 0.662344217300415,
"learning_rate": 7.749244198260175e-06,
"loss": 0.5103,
"step": 610
},
{
"epoch": 0.39018250471994964,
"grad_norm": 0.6587361693382263,
"learning_rate": 7.656837807697187e-06,
"loss": 0.5129,
"step": 620
},
{
"epoch": 0.3964757709251101,
"grad_norm": 0.6918533444404602,
"learning_rate": 7.563149159640929e-06,
"loss": 0.5053,
"step": 630
},
{
"epoch": 0.4027690371302706,
"grad_norm": 0.6420175433158875,
"learning_rate": 7.468223470615593e-06,
"loss": 0.5223,
"step": 640
},
{
"epoch": 0.4090623033354311,
"grad_norm": 0.7031328678131104,
"learning_rate": 7.372106554172802e-06,
"loss": 0.5024,
"step": 650
},
{
"epoch": 0.41535556954059155,
"grad_norm": 0.7460775375366211,
"learning_rate": 7.274844798780826e-06,
"loss": 0.5123,
"step": 660
},
{
"epoch": 0.42164883574575207,
"grad_norm": 0.6937898397445679,
"learning_rate": 7.176485145436325e-06,
"loss": 0.5051,
"step": 670
},
{
"epoch": 0.42794210195091253,
"grad_norm": 0.623894453048706,
"learning_rate": 7.0770750650094335e-06,
"loss": 0.5059,
"step": 680
},
{
"epoch": 0.434235368156073,
"grad_norm": 0.6496269106864929,
"learning_rate": 6.976662535333107e-06,
"loss": 0.4999,
"step": 690
},
{
"epoch": 0.44052863436123346,
"grad_norm": 0.6723958253860474,
"learning_rate": 6.87529601804781e-06,
"loss": 0.5054,
"step": 700
},
{
"epoch": 0.446821900566394,
"grad_norm": 0.6890814900398254,
"learning_rate": 6.773024435212678e-06,
"loss": 0.5066,
"step": 710
},
{
"epoch": 0.45311516677155445,
"grad_norm": 0.6805148720741272,
"learning_rate": 6.669897145694507e-06,
"loss": 0.5086,
"step": 720
},
{
"epoch": 0.4594084329767149,
"grad_norm": 0.6633646488189697,
"learning_rate": 6.565963921345896e-06,
"loss": 0.4939,
"step": 730
},
{
"epoch": 0.4657016991818754,
"grad_norm": 0.6664919257164001,
"learning_rate": 6.461274922984087e-06,
"loss": 0.4995,
"step": 740
},
{
"epoch": 0.4719949653870359,
"grad_norm": 0.6816923022270203,
"learning_rate": 6.355880676182086e-06,
"loss": 0.5038,
"step": 750
},
{
"epoch": 0.47828823159219636,
"grad_norm": 0.6514876484870911,
"learning_rate": 6.249832046883729e-06,
"loss": 0.5011,
"step": 760
},
{
"epoch": 0.4845814977973568,
"grad_norm": 0.6344130039215088,
"learning_rate": 6.143180216854488e-06,
"loss": 0.5034,
"step": 770
},
{
"epoch": 0.4908747640025173,
"grad_norm": 0.6643583178520203,
"learning_rate": 6.035976658979846e-06,
"loss": 0.4956,
"step": 780
},
{
"epoch": 0.4971680302076778,
"grad_norm": 0.7020254731178284,
"learning_rate": 5.928273112423177e-06,
"loss": 0.497,
"step": 790
},
{
"epoch": 1.0037759597230962,
"grad_norm": 0.6873272657394409,
"learning_rate": 5.820121557655109e-06,
"loss": 0.5445,
"step": 800
},
{
"epoch": 1.0100692259282569,
"grad_norm": 0.6778249144554138,
"learning_rate": 5.711574191366427e-06,
"loss": 0.4808,
"step": 810
},
{
"epoch": 1.0163624921334173,
"grad_norm": 0.6917534470558167,
"learning_rate": 5.6026834012766155e-06,
"loss": 0.4871,
"step": 820
},
{
"epoch": 1.0226557583385778,
"grad_norm": 0.6982170343399048,
"learning_rate": 5.493501740850228e-06,
"loss": 0.4768,
"step": 830
},
{
"epoch": 1.0289490245437383,
"grad_norm": 0.622083842754364,
"learning_rate": 5.384081903933235e-06,
"loss": 0.4874,
"step": 840
},
{
"epoch": 1.0352422907488987,
"grad_norm": 0.682299792766571,
"learning_rate": 5.274476699321638e-06,
"loss": 0.4787,
"step": 850
},
{
"epoch": 1.0415355569540592,
"grad_norm": 0.719980776309967,
"learning_rate": 5.164739025274604e-06,
"loss": 0.4731,
"step": 860
},
{
"epoch": 1.0478288231592197,
"grad_norm": 0.7684125304222107,
"learning_rate": 5.0549218439844185e-06,
"loss": 0.4858,
"step": 870
},
{
"epoch": 1.0541220893643801,
"grad_norm": 0.6164060831069946,
"learning_rate": 4.945078156015582e-06,
"loss": 0.4803,
"step": 880
},
{
"epoch": 1.0604153555695406,
"grad_norm": 0.7356188297271729,
"learning_rate": 4.835260974725397e-06,
"loss": 0.4756,
"step": 890
},
{
"epoch": 1.066708621774701,
"grad_norm": 0.6549850106239319,
"learning_rate": 4.7255233006783626e-06,
"loss": 0.4665,
"step": 900
},
{
"epoch": 1.0730018879798615,
"grad_norm": 0.6597391366958618,
"learning_rate": 4.615918096066766e-06,
"loss": 0.4669,
"step": 910
},
{
"epoch": 1.079295154185022,
"grad_norm": 0.6646954417228699,
"learning_rate": 4.506498259149774e-06,
"loss": 0.4717,
"step": 920
},
{
"epoch": 1.0855884203901824,
"grad_norm": 0.6768482327461243,
"learning_rate": 4.397316598723385e-06,
"loss": 0.4702,
"step": 930
},
{
"epoch": 1.091881686595343,
"grad_norm": 0.7066707611083984,
"learning_rate": 4.2884258086335755e-06,
"loss": 0.4502,
"step": 940
},
{
"epoch": 1.0981749528005034,
"grad_norm": 0.6753413677215576,
"learning_rate": 4.179878442344892e-06,
"loss": 0.4721,
"step": 950
},
{
"epoch": 1.104468219005664,
"grad_norm": 0.6423931121826172,
"learning_rate": 4.071726887576823e-06,
"loss": 0.4661,
"step": 960
},
{
"epoch": 1.1107614852108245,
"grad_norm": 0.686931848526001,
"learning_rate": 3.9640233410201555e-06,
"loss": 0.4684,
"step": 970
},
{
"epoch": 1.117054751415985,
"grad_norm": 0.6282669901847839,
"learning_rate": 3.856819783145514e-06,
"loss": 0.4621,
"step": 980
},
{
"epoch": 1.1233480176211454,
"grad_norm": 0.7436355352401733,
"learning_rate": 3.750167953116272e-06,
"loss": 0.4575,
"step": 990
},
{
"epoch": 1.129641283826306,
"grad_norm": 0.6427481174468994,
"learning_rate": 3.6441193238179152e-06,
"loss": 0.4591,
"step": 1000
},
{
"epoch": 1.1359345500314664,
"grad_norm": 0.6406493782997131,
"learning_rate": 3.5387250770159152e-06,
"loss": 0.4503,
"step": 1010
},
{
"epoch": 1.1422278162366268,
"grad_norm": 0.6536152958869934,
"learning_rate": 3.4340360786541067e-06,
"loss": 0.454,
"step": 1020
},
{
"epoch": 1.1485210824417873,
"grad_norm": 0.7142040729522705,
"learning_rate": 3.3301028543054935e-06,
"loss": 0.4527,
"step": 1030
},
{
"epoch": 1.1548143486469478,
"grad_norm": 0.6241782903671265,
"learning_rate": 3.226975564787322e-06,
"loss": 0.4472,
"step": 1040
},
{
"epoch": 1.1611076148521082,
"grad_norm": 0.6668282747268677,
"learning_rate": 3.1247039819521907e-06,
"loss": 0.4509,
"step": 1050
},
{
"epoch": 1.1674008810572687,
"grad_norm": 0.6013820171356201,
"learning_rate": 3.0233374646668935e-06,
"loss": 0.443,
"step": 1060
},
{
"epoch": 1.1736941472624292,
"grad_norm": 0.6819528341293335,
"learning_rate": 2.9229249349905686e-06,
"loss": 0.4598,
"step": 1070
},
{
"epoch": 1.1799874134675896,
"grad_norm": 0.6443890929222107,
"learning_rate": 2.8235148545636776e-06,
"loss": 0.447,
"step": 1080
},
{
"epoch": 1.18628067967275,
"grad_norm": 0.6288453340530396,
"learning_rate": 2.7251552012191763e-06,
"loss": 0.4563,
"step": 1090
},
{
"epoch": 1.1925739458779105,
"grad_norm": 0.661491334438324,
"learning_rate": 2.6278934458271998e-06,
"loss": 0.4443,
"step": 1100
},
{
"epoch": 1.198867212083071,
"grad_norm": 0.6220525503158569,
"learning_rate": 2.531776529384407e-06,
"loss": 0.446,
"step": 1110
},
{
"epoch": 1.2051604782882315,
"grad_norm": 0.6786297559738159,
"learning_rate": 2.436850840359073e-06,
"loss": 0.4464,
"step": 1120
},
{
"epoch": 1.2114537444933922,
"grad_norm": 0.7809886336326599,
"learning_rate": 2.3431621923028146e-06,
"loss": 0.4554,
"step": 1130
},
{
"epoch": 1.2177470106985526,
"grad_norm": 0.7114007472991943,
"learning_rate": 2.2507558017398263e-06,
"loss": 0.4405,
"step": 1140
},
{
"epoch": 1.224040276903713,
"grad_norm": 0.726741373538971,
"learning_rate": 2.159676266344222e-06,
"loss": 0.4463,
"step": 1150
},
{
"epoch": 1.2303335431088736,
"grad_norm": 0.6209679841995239,
"learning_rate": 2.06996754341607e-06,
"loss": 0.4601,
"step": 1160
},
{
"epoch": 1.236626809314034,
"grad_norm": 0.7238234877586365,
"learning_rate": 1.98167292866648e-06,
"loss": 0.4498,
"step": 1170
},
{
"epoch": 1.2429200755191945,
"grad_norm": 0.663245439529419,
"learning_rate": 1.8948350353219913e-06,
"loss": 0.4507,
"step": 1180
},
{
"epoch": 1.249213341724355,
"grad_norm": 0.6694537997245789,
"learning_rate": 1.8094957735583463e-06,
"loss": 0.4616,
"step": 1190
},
{
"epoch": 1.2555066079295154,
"grad_norm": 0.6908589005470276,
"learning_rate": 1.7256963302735752e-06,
"loss": 0.439,
"step": 1200
},
{
"epoch": 1.2617998741346759,
"grad_norm": 0.6885989308357239,
"learning_rate": 1.6434771492101487e-06,
"loss": 0.4444,
"step": 1210
},
{
"epoch": 1.2680931403398363,
"grad_norm": 0.6936271786689758,
"learning_rate": 1.5628779114358034e-06,
"loss": 0.4535,
"step": 1220
},
{
"epoch": 1.2743864065449968,
"grad_norm": 0.6543680429458618,
"learning_rate": 1.4839375161924446e-06,
"loss": 0.4584,
"step": 1230
},
{
"epoch": 1.2806796727501573,
"grad_norm": 0.6488659381866455,
"learning_rate": 1.406694062122389e-06,
"loss": 0.4532,
"step": 1240
},
{
"epoch": 1.286972938955318,
"grad_norm": 0.668251097202301,
"learning_rate": 1.3311848288809815e-06,
"loss": 0.4432,
"step": 1250
},
{
"epoch": 1.2932662051604784,
"grad_norm": 0.6807104349136353,
"learning_rate": 1.257446259144494e-06,
"loss": 0.4324,
"step": 1260
},
{
"epoch": 1.2995594713656389,
"grad_norm": 0.6319746971130371,
"learning_rate": 1.1855139410219657e-06,
"loss": 0.4493,
"step": 1270
},
{
"epoch": 1.3058527375707993,
"grad_norm": 0.6163645386695862,
"learning_rate": 1.115422590879464e-06,
"loss": 0.4501,
"step": 1280
},
{
"epoch": 1.3121460037759598,
"grad_norm": 0.6366046071052551,
"learning_rate": 1.047206036585095e-06,
"loss": 0.4423,
"step": 1290
},
{
"epoch": 1.3184392699811203,
"grad_norm": 0.5919917821884155,
"learning_rate": 9.808972011828055e-07,
"loss": 0.4379,
"step": 1300
},
{
"epoch": 1.3247325361862807,
"grad_norm": 0.6659424304962158,
"learning_rate": 9.165280870028919e-07,
"loss": 0.4548,
"step": 1310
},
{
"epoch": 1.3310258023914412,
"grad_norm": 0.6823071837425232,
"learning_rate": 8.541297602168591e-07,
"loss": 0.464,
"step": 1320
},
{
"epoch": 1.3373190685966017,
"grad_norm": 0.6970245838165283,
"learning_rate": 7.937323358440935e-07,
"loss": 0.4598,
"step": 1330
},
{
"epoch": 1.3436123348017621,
"grad_norm": 0.6151891946792603,
"learning_rate": 7.353649632175957e-07,
"loss": 0.4569,
"step": 1340
},
{
"epoch": 1.3499056010069226,
"grad_norm": 0.6397150754928589,
"learning_rate": 6.790558119157597e-07,
"loss": 0.4598,
"step": 1350
},
{
"epoch": 1.356198867212083,
"grad_norm": 0.6420609951019287,
"learning_rate": 6.248320581670281e-07,
"loss": 0.4576,
"step": 1360
},
{
"epoch": 1.3624921334172435,
"grad_norm": 0.6434526443481445,
"learning_rate": 5.727198717339511e-07,
"loss": 0.4544,
"step": 1370
},
{
"epoch": 1.368785399622404,
"grad_norm": 0.6515443921089172,
"learning_rate": 5.227444032829887e-07,
"loss": 0.4462,
"step": 1380
},
{
"epoch": 1.3750786658275644,
"grad_norm": 0.59568852186203,
"learning_rate": 4.74929772246166e-07,
"loss": 0.4697,
"step": 1390
},
{
"epoch": 1.381371932032725,
"grad_norm": 0.6043298244476318,
"learning_rate": 4.2929905518041714e-07,
"loss": 0.4506,
"step": 1400
},
{
"epoch": 1.3876651982378854,
"grad_norm": 0.5912688970565796,
"learning_rate": 3.858742746302535e-07,
"loss": 0.4358,
"step": 1410
},
{
"epoch": 1.3939584644430458,
"grad_norm": 0.6092295050621033,
"learning_rate": 3.44676388499125e-07,
"loss": 0.4545,
"step": 1420
},
{
"epoch": 1.4002517306482063,
"grad_norm": 0.6405302882194519,
"learning_rate": 3.0572527993460054e-07,
"loss": 0.4584,
"step": 1430
},
{
"epoch": 1.406544996853367,
"grad_norm": 0.6120900511741638,
"learning_rate": 2.6903974773225703e-07,
"loss": 0.4461,
"step": 1440
},
{
"epoch": 1.4128382630585274,
"grad_norm": 0.594711184501648,
"learning_rate": 2.3463749726290287e-07,
"loss": 0.4519,
"step": 1450
},
{
"epoch": 1.419131529263688,
"grad_norm": 0.6510460376739502,
"learning_rate": 2.0253513192751374e-07,
"loss": 0.4509,
"step": 1460
},
{
"epoch": 1.4254247954688484,
"grad_norm": 0.574932873249054,
"learning_rate": 1.7274814514400995e-07,
"loss": 0.4511,
"step": 1470
},
{
"epoch": 1.4317180616740088,
"grad_norm": 0.6897699236869812,
"learning_rate": 1.4529091286973994e-07,
"loss": 0.4496,
"step": 1480
},
{
"epoch": 1.4380113278791693,
"grad_norm": 0.6335061192512512,
"learning_rate": 1.2017668666327752e-07,
"loss": 0.4481,
"step": 1490
},
{
"epoch": 1.4443045940843298,
"grad_norm": 0.6672943830490112,
"learning_rate": 9.741758728888218e-08,
"loss": 0.4518,
"step": 1500
},
{
"epoch": 1.4505978602894902,
"grad_norm": 0.673734188079834,
"learning_rate": 7.702459886670788e-08,
"loss": 0.4495,
"step": 1510
},
{
"epoch": 1.4568911264946507,
"grad_norm": 0.6775258183479309,
"learning_rate": 5.900756357159143e-08,
"loss": 0.458,
"step": 1520
},
{
"epoch": 1.4631843926998112,
"grad_norm": 0.588603138923645,
"learning_rate": 4.337517688296544e-08,
"loss": 0.4543,
"step": 1530
},
{
"epoch": 1.4694776589049716,
"grad_norm": 0.642877459526062,
"learning_rate": 3.013498338820031e-08,
"loss": 0.4522,
"step": 1540
},
{
"epoch": 1.475770925110132,
"grad_norm": 0.6829048991203308,
"learning_rate": 1.9293373141394124e-08,
"loss": 0.4583,
"step": 1550
},
{
"epoch": 1.4820641913152928,
"grad_norm": 0.6072986721992493,
"learning_rate": 1.0855578579370696e-08,
"loss": 0.4504,
"step": 1560
},
{
"epoch": 1.4883574575204532,
"grad_norm": 0.6607363820075989,
"learning_rate": 4.825671996370184e-09,
"loss": 0.4618,
"step": 1570
},
{
"epoch": 1.4946507237256137,
"grad_norm": 0.6756806969642639,
"learning_rate": 1.2065635786595586e-09,
"loss": 0.4539,
"step": 1580
},
{
"epoch": 2.000629326620516,
"step": 1589,
"total_flos": 208043401019392.0,
"train_loss": 0.4968605734703299,
"train_runtime": 50248.0079,
"train_samples_per_second": 1.012,
"train_steps_per_second": 0.032
}
],
"logging_steps": 10,
"max_steps": 1589,
"num_input_tokens_seen": 0,
"num_train_epochs": 9223372036854775807,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 208043401019392.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}