{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 869,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0011507479861910242,
"grad_norm": 0.4465322789934592,
"learning_rate": 2.2988505747126437e-06,
"loss": 1.3558,
"step": 1
},
{
"epoch": 0.005753739930955121,
"grad_norm": 0.4729686939623816,
"learning_rate": 1.1494252873563218e-05,
"loss": 1.3725,
"step": 5
},
{
"epoch": 0.011507479861910242,
"grad_norm": 0.5386929459384674,
"learning_rate": 2.2988505747126437e-05,
"loss": 1.3751,
"step": 10
},
{
"epoch": 0.01726121979286536,
"grad_norm": 0.1777783055450745,
"learning_rate": 3.4482758620689657e-05,
"loss": 1.3698,
"step": 15
},
{
"epoch": 0.023014959723820484,
"grad_norm": 0.16074111506146127,
"learning_rate": 4.597701149425287e-05,
"loss": 1.308,
"step": 20
},
{
"epoch": 0.028768699654775604,
"grad_norm": 0.13109040303886016,
"learning_rate": 5.747126436781609e-05,
"loss": 1.3108,
"step": 25
},
{
"epoch": 0.03452243958573072,
"grad_norm": 0.1399833196718492,
"learning_rate": 6.896551724137931e-05,
"loss": 1.2826,
"step": 30
},
{
"epoch": 0.04027617951668585,
"grad_norm": 0.1280676761539199,
"learning_rate": 8.045977011494253e-05,
"loss": 1.2773,
"step": 35
},
{
"epoch": 0.04602991944764097,
"grad_norm": 0.12005806159718177,
"learning_rate": 9.195402298850575e-05,
"loss": 1.2377,
"step": 40
},
{
"epoch": 0.05178365937859609,
"grad_norm": 0.10506239475224881,
"learning_rate": 0.00010344827586206898,
"loss": 1.2296,
"step": 45
},
{
"epoch": 0.05753739930955121,
"grad_norm": 0.0819749468856537,
"learning_rate": 0.00011494252873563218,
"loss": 1.197,
"step": 50
},
{
"epoch": 0.06329113924050633,
"grad_norm": 0.08509500503388337,
"learning_rate": 0.0001264367816091954,
"loss": 1.1709,
"step": 55
},
{
"epoch": 0.06904487917146145,
"grad_norm": 0.07393378633841856,
"learning_rate": 0.00013793103448275863,
"loss": 1.1692,
"step": 60
},
{
"epoch": 0.07479861910241657,
"grad_norm": 0.06625962718107734,
"learning_rate": 0.00014942528735632183,
"loss": 1.2001,
"step": 65
},
{
"epoch": 0.0805523590333717,
"grad_norm": 0.07029239664213276,
"learning_rate": 0.00016091954022988506,
"loss": 1.1746,
"step": 70
},
{
"epoch": 0.08630609896432681,
"grad_norm": 0.0734118145247187,
"learning_rate": 0.00017241379310344826,
"loss": 1.1477,
"step": 75
},
{
"epoch": 0.09205983889528194,
"grad_norm": 0.07794825571468995,
"learning_rate": 0.0001839080459770115,
"loss": 1.1657,
"step": 80
},
{
"epoch": 0.09781357882623705,
"grad_norm": 0.08512834878649493,
"learning_rate": 0.00019540229885057472,
"loss": 1.1775,
"step": 85
},
{
"epoch": 0.10356731875719218,
"grad_norm": 0.081601122042688,
"learning_rate": 0.00019999273737707646,
"loss": 1.1584,
"step": 90
},
{
"epoch": 0.1093210586881473,
"grad_norm": 0.09077503220224638,
"learning_rate": 0.00019994835850163924,
"loss": 1.1332,
"step": 95
},
{
"epoch": 0.11507479861910241,
"grad_norm": 0.08933595254839481,
"learning_rate": 0.00019986365342513265,
"loss": 1.1341,
"step": 100
},
{
"epoch": 0.12082853855005754,
"grad_norm": 0.0927003880602526,
"learning_rate": 0.00019973865632354516,
"loss": 1.1355,
"step": 105
},
{
"epoch": 0.12658227848101267,
"grad_norm": 0.07791130311867683,
"learning_rate": 0.00019957341762950344,
"loss": 1.124,
"step": 110
},
{
"epoch": 0.1323360184119678,
"grad_norm": 0.0790112418591026,
"learning_rate": 0.0001993680040119244,
"loss": 1.1307,
"step": 115
},
{
"epoch": 0.1380897583429229,
"grad_norm": 0.09227634318705202,
"learning_rate": 0.000199122498349116,
"loss": 1.098,
"step": 120
},
{
"epoch": 0.14384349827387802,
"grad_norm": 0.08290995925084253,
"learning_rate": 0.0001988369996953386,
"loss": 1.1347,
"step": 125
},
{
"epoch": 0.14959723820483314,
"grad_norm": 0.08548470364290049,
"learning_rate": 0.00019851162324083932,
"loss": 1.1418,
"step": 130
},
{
"epoch": 0.15535097813578827,
"grad_norm": 0.08440345287709118,
"learning_rate": 0.0001981465002653763,
"loss": 1.135,
"step": 135
},
{
"epoch": 0.1611047180667434,
"grad_norm": 0.08444559508248843,
"learning_rate": 0.00019774177808525113,
"loss": 1.1273,
"step": 140
},
{
"epoch": 0.1668584579976985,
"grad_norm": 0.0840175874952237,
"learning_rate": 0.00019729761999387103,
"loss": 1.1129,
"step": 145
},
{
"epoch": 0.17261219792865362,
"grad_norm": 0.07480610551808778,
"learning_rate": 0.000196814205195865,
"loss": 1.1182,
"step": 150
},
{
"epoch": 0.17836593785960875,
"grad_norm": 0.07608916685245905,
"learning_rate": 0.00019629172873477995,
"loss": 1.1017,
"step": 155
},
{
"epoch": 0.18411967779056387,
"grad_norm": 0.09515413227881493,
"learning_rate": 0.00019573040141438624,
"loss": 1.1281,
"step": 160
},
{
"epoch": 0.189873417721519,
"grad_norm": 0.07493301926746701,
"learning_rate": 0.00019513044971362494,
"loss": 1.1213,
"step": 165
},
{
"epoch": 0.1956271576524741,
"grad_norm": 0.07980883594622108,
"learning_rate": 0.00019449211569523,
"loss": 1.1289,
"step": 170
},
{
"epoch": 0.20138089758342922,
"grad_norm": 0.08091237279913081,
"learning_rate": 0.00019381565690806328,
"loss": 1.1222,
"step": 175
},
{
"epoch": 0.20713463751438435,
"grad_norm": 0.07635483082524008,
"learning_rate": 0.00019310134628320114,
"loss": 1.1173,
"step": 180
},
{
"epoch": 0.21288837744533948,
"grad_norm": 0.07974746871493653,
"learning_rate": 0.00019234947202381486,
"loss": 1.1074,
"step": 185
},
{
"epoch": 0.2186421173762946,
"grad_norm": 0.07924995122930302,
"learning_rate": 0.00019156033748888917,
"loss": 1.1193,
"step": 190
},
{
"epoch": 0.2243958573072497,
"grad_norm": 0.08344491509033138,
"learning_rate": 0.000190734261070826,
"loss": 1.1162,
"step": 195
},
{
"epoch": 0.23014959723820483,
"grad_norm": 0.0809591767505205,
"learning_rate": 0.00018987157606698235,
"loss": 1.1085,
"step": 200
},
{
"epoch": 0.23590333716915995,
"grad_norm": 0.08094087212789886,
"learning_rate": 0.00018897263054519498,
"loss": 1.0996,
"step": 205
},
{
"epoch": 0.24165707710011508,
"grad_norm": 0.07353474154724883,
"learning_rate": 0.0001880377872033451,
"loss": 1.1301,
"step": 210
},
{
"epoch": 0.2474108170310702,
"grad_norm": 0.0784057120949034,
"learning_rate": 0.00018706742322302064,
"loss": 1.1149,
"step": 215
},
{
"epoch": 0.25316455696202533,
"grad_norm": 0.07563014633070095,
"learning_rate": 0.0001860619301173347,
"loss": 1.1084,
"step": 220
},
{
"epoch": 0.25891829689298046,
"grad_norm": 0.0732650161077316,
"learning_rate": 0.00018502171357296144,
"loss": 1.0934,
"step": 225
},
{
"epoch": 0.2646720368239356,
"grad_norm": 0.07642524798153359,
"learning_rate": 0.0001839471932864537,
"loss": 1.1279,
"step": 230
},
{
"epoch": 0.27042577675489066,
"grad_norm": 0.07924123360978817,
"learning_rate": 0.0001828388027949078,
"loss": 1.1164,
"step": 235
},
{
"epoch": 0.2761795166858458,
"grad_norm": 0.08291524400368479,
"learning_rate": 0.0001816969893010442,
"loss": 1.1426,
"step": 240
},
{
"epoch": 0.2819332566168009,
"grad_norm": 0.09503646536490909,
"learning_rate": 0.00018052221349277442,
"loss": 1.1007,
"step": 245
},
{
"epoch": 0.28768699654775604,
"grad_norm": 0.08546247272232582,
"learning_rate": 0.0001793149493573271,
"loss": 1.0996,
"step": 250
},
{
"epoch": 0.29344073647871116,
"grad_norm": 0.07292435172406726,
"learning_rate": 0.00017807568399000822,
"loss": 1.1297,
"step": 255
},
{
"epoch": 0.2991944764096663,
"grad_norm": 0.08793050679899453,
"learning_rate": 0.0001768049173976727,
"loss": 1.128,
"step": 260
},
{
"epoch": 0.3049482163406214,
"grad_norm": 0.08228996895895493,
"learning_rate": 0.0001755031622969862,
"loss": 1.1067,
"step": 265
},
{
"epoch": 0.31070195627157654,
"grad_norm": 0.08034368274055034,
"learning_rate": 0.00017417094390755934,
"loss": 1.1399,
"step": 270
},
{
"epoch": 0.31645569620253167,
"grad_norm": 0.07355220396446509,
"learning_rate": 0.00017280879974003707,
"loss": 1.105,
"step": 275
},
{
"epoch": 0.3222094361334868,
"grad_norm": 0.07695547066904129,
"learning_rate": 0.0001714172793792291,
"loss": 1.1282,
"step": 280
},
{
"epoch": 0.32796317606444186,
"grad_norm": 0.07242274233224258,
"learning_rate": 0.0001699969442623686,
"loss": 1.1138,
"step": 285
},
{
"epoch": 0.333716915995397,
"grad_norm": 0.08486742993293918,
"learning_rate": 0.0001685483674525891,
"loss": 1.1,
"step": 290
},
{
"epoch": 0.3394706559263521,
"grad_norm": 0.0747389855159797,
"learning_rate": 0.0001670721334077103,
"loss": 1.1016,
"step": 295
},
{
"epoch": 0.34522439585730724,
"grad_norm": 0.07533809685344381,
"learning_rate": 0.00016556883774442675,
"loss": 1.1181,
"step": 300
},
{
"epoch": 0.35097813578826237,
"grad_norm": 0.07699490502726904,
"learning_rate": 0.00016403908699799425,
"loss": 1.0986,
"step": 305
},
{
"epoch": 0.3567318757192175,
"grad_norm": 0.07297020155234164,
"learning_rate": 0.00016248349837751062,
"loss": 1.0951,
"step": 310
},
{
"epoch": 0.3624856156501726,
"grad_norm": 0.08519497936629768,
"learning_rate": 0.0001609026995168904,
"loss": 1.1148,
"step": 315
},
{
"epoch": 0.36823935558112775,
"grad_norm": 0.07458638521829272,
"learning_rate": 0.00015929732822163287,
"loss": 1.1165,
"step": 320
},
{
"epoch": 0.3739930955120829,
"grad_norm": 0.08000996094690015,
"learning_rate": 0.00015766803221148673,
"loss": 1.1072,
"step": 325
},
{
"epoch": 0.379746835443038,
"grad_norm": 0.08213267404173417,
"learning_rate": 0.00015601546885911404,
"loss": 1.1078,
"step": 330
},
{
"epoch": 0.38550057537399307,
"grad_norm": 0.08374970807354172,
"learning_rate": 0.00015434030492486023,
"loss": 1.0842,
"step": 335
},
{
"epoch": 0.3912543153049482,
"grad_norm": 0.07669296380369849,
"learning_rate": 0.0001526432162877356,
"loss": 1.1124,
"step": 340
},
{
"epoch": 0.3970080552359033,
"grad_norm": 0.07376471868371856,
"learning_rate": 0.00015092488767271857,
"loss": 1.0882,
"step": 345
},
{
"epoch": 0.40276179516685845,
"grad_norm": 0.08896375142853388,
"learning_rate": 0.00014918601237448923,
"loss": 1.1217,
"step": 350
},
{
"epoch": 0.4085155350978136,
"grad_norm": 0.07333518083670185,
"learning_rate": 0.00014742729197770552,
"loss": 1.0853,
"step": 355
},
{
"epoch": 0.4142692750287687,
"grad_norm": 0.07541021900185346,
"learning_rate": 0.00014564943607393459,
"loss": 1.1078,
"step": 360
},
{
"epoch": 0.42002301495972383,
"grad_norm": 0.07240408031237026,
"learning_rate": 0.00014385316197535372,
"loss": 1.0963,
"step": 365
},
{
"epoch": 0.42577675489067895,
"grad_norm": 0.07726369320922719,
"learning_rate": 0.00014203919442533597,
"loss": 1.082,
"step": 370
},
{
"epoch": 0.4315304948216341,
"grad_norm": 0.07947926565287182,
"learning_rate": 0.00014020826530603776,
"loss": 1.0775,
"step": 375
},
{
"epoch": 0.4372842347525892,
"grad_norm": 0.0835226760212179,
"learning_rate": 0.0001383611133431062,
"loss": 1.1005,
"step": 380
},
{
"epoch": 0.4430379746835443,
"grad_norm": 0.07628153739619549,
"learning_rate": 0.00013649848380762513,
"loss": 1.1139,
"step": 385
},
{
"epoch": 0.4487917146144994,
"grad_norm": 0.07650525696735216,
"learning_rate": 0.00013462112821542016,
"loss": 1.1171,
"step": 390
},
{
"epoch": 0.45454545454545453,
"grad_norm": 0.07569647253340571,
"learning_rate": 0.0001327298040238446,
"loss": 1.0918,
"step": 395
},
{
"epoch": 0.46029919447640966,
"grad_norm": 0.07572977420046635,
"learning_rate": 0.0001308252743261675,
"loss": 1.1018,
"step": 400
},
{
"epoch": 0.4660529344073648,
"grad_norm": 0.084906098343464,
"learning_rate": 0.00012890830754368855,
"loss": 1.1145,
"step": 405
},
{
"epoch": 0.4718066743383199,
"grad_norm": 0.08856218479901995,
"learning_rate": 0.00012697967711570242,
"loss": 1.1049,
"step": 410
},
{
"epoch": 0.47756041426927504,
"grad_norm": 0.07498278458233507,
"learning_rate": 0.00012504016118743935,
"loss": 1.1061,
"step": 415
},
{
"epoch": 0.48331415420023016,
"grad_norm": 0.07551499247429329,
"learning_rate": 0.00012309054229610623,
"loss": 1.0858,
"step": 420
},
{
"epoch": 0.4890678941311853,
"grad_norm": 0.07573682259745337,
"learning_rate": 0.00012113160705515625,
"loss": 1.0917,
"step": 425
},
{
"epoch": 0.4948216340621404,
"grad_norm": 0.09077558081676396,
"learning_rate": 0.0001191641458369136,
"loss": 1.118,
"step": 430
},
{
"epoch": 0.5005753739930955,
"grad_norm": 0.07771540883960386,
"learning_rate": 0.00011718895245368167,
"loss": 1.1056,
"step": 435
},
{
"epoch": 0.5063291139240507,
"grad_norm": 0.08583872028129896,
"learning_rate": 0.00011520682383746333,
"loss": 1.0792,
"step": 440
},
{
"epoch": 0.5120828538550057,
"grad_norm": 0.07563432127302055,
"learning_rate": 0.00011321855971842243,
"loss": 1.1024,
"step": 445
},
{
"epoch": 0.5178365937859609,
"grad_norm": 0.07968899236090986,
"learning_rate": 0.00011122496230221645,
"loss": 1.1037,
"step": 450
},
{
"epoch": 0.523590333716916,
"grad_norm": 0.08595525245941885,
"learning_rate": 0.00010922683594633021,
"loss": 1.0991,
"step": 455
},
{
"epoch": 0.5293440736478712,
"grad_norm": 0.0870194667650804,
"learning_rate": 0.0001072249868355415,
"loss": 1.1193,
"step": 460
},
{
"epoch": 0.5350978135788262,
"grad_norm": 0.07553504230608732,
"learning_rate": 0.0001052202226566494,
"loss": 1.0876,
"step": 465
},
{
"epoch": 0.5408515535097813,
"grad_norm": 0.08165627721996481,
"learning_rate": 0.00010321335227259661,
"loss": 1.0855,
"step": 470
},
{
"epoch": 0.5466052934407365,
"grad_norm": 0.07548535900683774,
"learning_rate": 0.0001012051853961172,
"loss": 1.1067,
"step": 475
},
{
"epoch": 0.5523590333716916,
"grad_norm": 0.08254614481275399,
"learning_rate": 9.919653226304148e-05,
"loss": 1.1185,
"step": 480
},
{
"epoch": 0.5581127733026467,
"grad_norm": 0.07261119139795047,
"learning_rate": 9.718820330538998e-05,
"loss": 1.0913,
"step": 485
},
{
"epoch": 0.5638665132336018,
"grad_norm": 0.07666759909446064,
"learning_rate": 9.51810088243879e-05,
"loss": 1.1058,
"step": 490
},
{
"epoch": 0.569620253164557,
"grad_norm": 0.0795402304145169,
"learning_rate": 9.317575866353292e-05,
"loss": 1.1249,
"step": 495
},
{
"epoch": 0.5753739930955121,
"grad_norm": 0.07900824948192435,
"learning_rate": 9.117326188184695e-05,
"loss": 1.1043,
"step": 500
},
{
"epoch": 0.5811277330264673,
"grad_norm": 0.07403897844355141,
"learning_rate": 8.917432642744518e-05,
"loss": 1.0858,
"step": 505
},
{
"epoch": 0.5868814729574223,
"grad_norm": 0.07943002355447462,
"learning_rate": 8.717975881155261e-05,
"loss": 1.0842,
"step": 510
},
{
"epoch": 0.5926352128883774,
"grad_norm": 0.07687109951447574,
"learning_rate": 8.519036378310096e-05,
"loss": 1.1046,
"step": 515
},
{
"epoch": 0.5983889528193326,
"grad_norm": 0.08051861745508501,
"learning_rate": 8.320694400403606e-05,
"loss": 1.1176,
"step": 520
},
{
"epoch": 0.6041426927502876,
"grad_norm": 0.07385921126253402,
"learning_rate": 8.123029972546781e-05,
"loss": 1.1237,
"step": 525
},
{
"epoch": 0.6098964326812428,
"grad_norm": 0.0758570951173778,
"learning_rate": 7.926122846479224e-05,
"loss": 1.102,
"step": 530
},
{
"epoch": 0.6156501726121979,
"grad_norm": 0.07600528797089706,
"learning_rate": 7.730052468391725e-05,
"loss": 1.0911,
"step": 535
},
{
"epoch": 0.6214039125431531,
"grad_norm": 0.07938766187013545,
"learning_rate": 7.534897946872042e-05,
"loss": 1.0867,
"step": 540
},
{
"epoch": 0.6271576524741082,
"grad_norm": 0.07930610991929318,
"learning_rate": 7.340738020986961e-05,
"loss": 1.1205,
"step": 545
},
{
"epoch": 0.6329113924050633,
"grad_norm": 0.08212529428864859,
"learning_rate": 7.147651028513383e-05,
"loss": 1.0931,
"step": 550
},
{
"epoch": 0.6386651323360184,
"grad_norm": 0.07731536951924715,
"learning_rate": 6.955714874331387e-05,
"loss": 1.0976,
"step": 555
},
{
"epoch": 0.6444188722669736,
"grad_norm": 0.07561598587821595,
"learning_rate": 6.765006998991888e-05,
"loss": 1.0842,
"step": 560
},
{
"epoch": 0.6501726121979287,
"grad_norm": 0.07405869311005314,
"learning_rate": 6.575604347471695e-05,
"loss": 1.0637,
"step": 565
},
{
"epoch": 0.6559263521288837,
"grad_norm": 0.07183365685581813,
"learning_rate": 6.387583338128471e-05,
"loss": 1.114,
"step": 570
},
{
"epoch": 0.6616800920598389,
"grad_norm": 0.07244649469955952,
"learning_rate": 6.201019831868208e-05,
"loss": 1.09,
"step": 575
},
{
"epoch": 0.667433831990794,
"grad_norm": 0.07512222335465936,
"learning_rate": 6.015989101537586e-05,
"loss": 1.1117,
"step": 580
},
{
"epoch": 0.6731875719217492,
"grad_norm": 0.0773183295000779,
"learning_rate": 5.83256580155362e-05,
"loss": 1.0846,
"step": 585
},
{
"epoch": 0.6789413118527042,
"grad_norm": 0.0746381622325705,
"learning_rate": 5.6508239377828034e-05,
"loss": 1.0942,
"step": 590
},
{
"epoch": 0.6846950517836594,
"grad_norm": 0.07207675237237808,
"learning_rate": 5.470836837681954e-05,
"loss": 1.0728,
"step": 595
},
{
"epoch": 0.6904487917146145,
"grad_norm": 0.07598696177120805,
"learning_rate": 5.2926771207127254e-05,
"loss": 1.0954,
"step": 600
},
{
"epoch": 0.6962025316455697,
"grad_norm": 0.07327699224620024,
"learning_rate": 5.116416669041843e-05,
"loss": 1.1158,
"step": 605
},
{
"epoch": 0.7019562715765247,
"grad_norm": 0.07840719027532041,
"learning_rate": 4.9421265985387476e-05,
"loss": 1.0738,
"step": 610
},
{
"epoch": 0.7077100115074798,
"grad_norm": 0.0769085876990078,
"learning_rate": 4.7698772300824756e-05,
"loss": 1.0958,
"step": 615
},
{
"epoch": 0.713463751438435,
"grad_norm": 0.07132843178750269,
"learning_rate": 4.599738061189244e-05,
"loss": 1.1058,
"step": 620
},
{
"epoch": 0.7192174913693901,
"grad_norm": 0.07713712307040285,
"learning_rate": 4.4317777379722866e-05,
"loss": 1.1028,
"step": 625
},
{
"epoch": 0.7249712313003452,
"grad_norm": 0.07758438004748038,
"learning_rate": 4.266064027445155e-05,
"loss": 1.0899,
"step": 630
},
{
"epoch": 0.7307249712313003,
"grad_norm": 0.07854544913701697,
"learning_rate": 4.102663790179764e-05,
"loss": 1.0761,
"step": 635
},
{
"epoch": 0.7364787111622555,
"grad_norm": 0.0751537421287673,
"learning_rate": 3.941642953330103e-05,
"loss": 1.0935,
"step": 640
},
{
"epoch": 0.7422324510932106,
"grad_norm": 0.07536112057903382,
"learning_rate": 3.7830664840326145e-05,
"loss": 1.1316,
"step": 645
},
{
"epoch": 0.7479861910241657,
"grad_norm": 0.0757154673994382,
"learning_rate": 3.6269983631938475e-05,
"loss": 1.1105,
"step": 650
},
{
"epoch": 0.7537399309551208,
"grad_norm": 0.07517902293608703,
"learning_rate": 3.473501559676088e-05,
"loss": 1.1183,
"step": 655
},
{
"epoch": 0.759493670886076,
"grad_norm": 0.07512890867407872,
"learning_rate": 3.3226380048912585e-05,
"loss": 1.095,
"step": 660
},
{
"epoch": 0.7652474108170311,
"grad_norm": 0.0747389401617661,
"learning_rate": 3.174468567813461e-05,
"loss": 1.0924,
"step": 665
},
{
"epoch": 0.7710011507479861,
"grad_norm": 0.07269553748906823,
"learning_rate": 3.029053030420115e-05,
"loss": 1.1079,
"step": 670
},
{
"epoch": 0.7767548906789413,
"grad_norm": 0.0752874034050374,
"learning_rate": 2.886450063571735e-05,
"loss": 1.102,
"step": 675
},
{
"epoch": 0.7825086306098964,
"grad_norm": 0.0735724332478834,
"learning_rate": 2.7467172033399458e-05,
"loss": 1.1114,
"step": 680
},
{
"epoch": 0.7882623705408516,
"grad_norm": 0.08104817717496945,
"learning_rate": 2.6099108277934103e-05,
"loss": 1.1077,
"step": 685
},
{
"epoch": 0.7940161104718066,
"grad_norm": 0.07305813837737488,
"learning_rate": 2.4760861342509233e-05,
"loss": 1.0906,
"step": 690
},
{
"epoch": 0.7997698504027618,
"grad_norm": 0.07242520365927102,
"learning_rate": 2.345297117010954e-05,
"loss": 1.0893,
"step": 695
},
{
"epoch": 0.8055235903337169,
"grad_norm": 0.07698458847076538,
"learning_rate": 2.2175965455665226e-05,
"loss": 1.0898,
"step": 700
},
{
"epoch": 0.8112773302646721,
"grad_norm": 0.076651036057528,
"learning_rate": 2.0930359433142932e-05,
"loss": 1.0877,
"step": 705
},
{
"epoch": 0.8170310701956272,
"grad_norm": 0.0721820626725428,
"learning_rate": 1.9716655667664008e-05,
"loss": 1.0794,
"step": 710
},
{
"epoch": 0.8227848101265823,
"grad_norm": 0.07461008979903963,
"learning_rate": 1.8535343852734332e-05,
"loss": 1.1101,
"step": 715
},
{
"epoch": 0.8285385500575374,
"grad_norm": 0.07528450837009106,
"learning_rate": 1.7386900612667633e-05,
"loss": 1.0949,
"step": 720
},
{
"epoch": 0.8342922899884925,
"grad_norm": 0.07330788015551219,
"learning_rate": 1.6271789310281517e-05,
"loss": 1.0942,
"step": 725
},
{
"epoch": 0.8400460299194477,
"grad_norm": 0.07387736383215655,
"learning_rate": 1.5190459859944505e-05,
"loss": 1.0785,
"step": 730
},
{
"epoch": 0.8457997698504027,
"grad_norm": 0.07362643024392242,
"learning_rate": 1.4143348546048707e-05,
"loss": 1.0834,
"step": 735
},
{
"epoch": 0.8515535097813579,
"grad_norm": 0.07308957142384619,
"learning_rate": 1.3130877846982204e-05,
"loss": 1.0833,
"step": 740
},
{
"epoch": 0.857307249712313,
"grad_norm": 0.07884770650029872,
"learning_rate": 1.2153456264671337e-05,
"loss": 1.1047,
"step": 745
},
{
"epoch": 0.8630609896432682,
"grad_norm": 0.07410132122132759,
"learning_rate": 1.1211478159762478e-05,
"loss": 1.1042,
"step": 750
},
{
"epoch": 0.8688147295742232,
"grad_norm": 0.07315305451840597,
"learning_rate": 1.0305323592509009e-05,
"loss": 1.1127,
"step": 755
},
{
"epoch": 0.8745684695051784,
"grad_norm": 0.07241715378225073,
"learning_rate": 9.435358169428442e-06,
"loss": 1.0552,
"step": 760
},
{
"epoch": 0.8803222094361335,
"grad_norm": 0.07310674530244925,
"learning_rate": 8.601932895790877e-06,
"loss": 1.1175,
"step": 765
},
{
"epoch": 0.8860759493670886,
"grad_norm": 0.08154925564133862,
"learning_rate": 7.805384033998875e-06,
"loss": 1.093,
"step": 770
},
{
"epoch": 0.8918296892980437,
"grad_norm": 0.07285132059208473,
"learning_rate": 7.046032967915483e-06,
"loss": 1.0869,
"step": 775
},
{
"epoch": 0.8975834292289988,
"grad_norm": 0.07660142337736248,
"learning_rate": 6.32418607319546e-06,
"loss": 1.0914,
"step": 780
},
{
"epoch": 0.903337169159954,
"grad_norm": 0.07846326221484771,
"learning_rate": 5.640134593671598e-06,
"loss": 1.0879,
"step": 785
},
{
"epoch": 0.9090909090909091,
"grad_norm": 0.0744126532665454,
"learning_rate": 4.994154523846695e-06,
"loss": 1.0948,
"step": 790
},
{
"epoch": 0.9148446490218642,
"grad_norm": 0.07568510267678755,
"learning_rate": 4.386506497537757e-06,
"loss": 1.1054,
"step": 795
},
{
"epoch": 0.9205983889528193,
"grad_norm": 0.07378727253077548,
"learning_rate": 3.817435682718096e-06,
"loss": 1.0787,
"step": 800
},
{
"epoch": 0.9263521288837745,
"grad_norm": 0.0747897379397166,
"learning_rate": 3.287171682599255e-06,
"loss": 1.0911,
"step": 805
},
{
"epoch": 0.9321058688147296,
"grad_norm": 0.07818706538185882,
"learning_rate": 2.7959284429929456e-06,
"loss": 1.0926,
"step": 810
},
{
"epoch": 0.9378596087456847,
"grad_norm": 0.07201156955643878,
"learning_rate": 2.3439041659902407e-06,
"loss": 1.106,
"step": 815
},
{
"epoch": 0.9436133486766398,
"grad_norm": 0.07293591627119424,
"learning_rate": 1.9312812299929094e-06,
"loss": 1.1041,
"step": 820
},
{
"epoch": 0.9493670886075949,
"grad_norm": 0.07620976319405062,
"learning_rate": 1.5582261161291245e-06,
"loss": 1.0835,
"step": 825
},
{
"epoch": 0.9551208285385501,
"grad_norm": 0.07403997053394712,
"learning_rate": 1.2248893410832685e-06,
"loss": 1.1048,
"step": 830
},
{
"epoch": 0.9608745684695051,
"grad_norm": 0.07432396016348998,
"learning_rate": 9.314053963669245e-07,
"loss": 1.0915,
"step": 835
},
{
"epoch": 0.9666283084004603,
"grad_norm": 0.07180128894783522,
"learning_rate": 6.778926940555152e-07,
"loss": 1.0981,
"step": 840
},
{
"epoch": 0.9723820483314154,
"grad_norm": 0.07204281549304838,
"learning_rate": 4.644535190125421e-07,
"loss": 1.0962,
"step": 845
},
{
"epoch": 0.9781357882623706,
"grad_norm": 0.07295868011398168,
"learning_rate": 2.9117398762069647e-07,
"loss": 1.1083,
"step": 850
},
{
"epoch": 0.9838895281933256,
"grad_norm": 0.07600348765786609,
"learning_rate": 1.5812401303639813e-07,
"loss": 1.1138,
"step": 855
},
{
"epoch": 0.9896432681242808,
"grad_norm": 0.07208207285243298,
"learning_rate": 6.535727698199213e-08,
"loss": 1.0787,
"step": 860
},
{
"epoch": 0.9953970080552359,
"grad_norm": 0.07662851683615209,
"learning_rate": 1.2911208086663351e-08,
"loss": 1.087,
"step": 865
},
{
"epoch": 1.0,
"eval_loss": 1.038455605506897,
"eval_runtime": 2.7622,
"eval_samples_per_second": 2.534,
"eval_steps_per_second": 0.724,
"step": 869
},
{
"epoch": 1.0,
"step": 869,
"total_flos": 1.0927921245978624e+16,
"train_loss": 1.118139435424629,
"train_runtime": 17754.8873,
"train_samples_per_second": 3.132,
"train_steps_per_second": 0.049
}
],
"logging_steps": 5,
"max_steps": 869,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.0927921245978624e+16,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}