{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 869,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0011507479861910242,
"grad_norm": 0.4465322789934592,
"learning_rate": 2.2988505747126437e-06,
"loss": 1.3558,
"step": 1
},
{
"epoch": 0.005753739930955121,
"grad_norm": 0.4729686939623816,
"learning_rate": 1.1494252873563218e-05,
"loss": 1.3725,
"step": 5
},
{
"epoch": 0.011507479861910242,
"grad_norm": 0.5386929459384674,
"learning_rate": 2.2988505747126437e-05,
"loss": 1.3751,
"step": 10
},
{
"epoch": 0.01726121979286536,
"grad_norm": 0.1777783055450745,
"learning_rate": 3.4482758620689657e-05,
"loss": 1.3698,
"step": 15
},
{
"epoch": 0.023014959723820484,
"grad_norm": 0.16074111506146127,
"learning_rate": 4.597701149425287e-05,
"loss": 1.308,
"step": 20
},
{
"epoch": 0.028768699654775604,
"grad_norm": 0.13109040303886016,
"learning_rate": 5.747126436781609e-05,
"loss": 1.3108,
"step": 25
},
{
"epoch": 0.03452243958573072,
"grad_norm": 0.1399833196718492,
"learning_rate": 6.896551724137931e-05,
"loss": 1.2826,
"step": 30
},
{
"epoch": 0.04027617951668585,
"grad_norm": 0.1280676761539199,
"learning_rate": 8.045977011494253e-05,
"loss": 1.2773,
"step": 35
},
{
"epoch": 0.04602991944764097,
"grad_norm": 0.12005806159718177,
"learning_rate": 9.195402298850575e-05,
"loss": 1.2377,
"step": 40
},
{
"epoch": 0.05178365937859609,
"grad_norm": 0.10506239475224881,
"learning_rate": 0.00010344827586206898,
"loss": 1.2296,
"step": 45
},
{
"epoch": 0.05753739930955121,
"grad_norm": 0.0819749468856537,
"learning_rate": 0.00011494252873563218,
"loss": 1.197,
"step": 50
},
{
"epoch": 0.06329113924050633,
"grad_norm": 0.08509500503388337,
"learning_rate": 0.0001264367816091954,
"loss": 1.1709,
"step": 55
},
{
"epoch": 0.06904487917146145,
"grad_norm": 0.07393378633841856,
"learning_rate": 0.00013793103448275863,
"loss": 1.1692,
"step": 60
},
{
"epoch": 0.07479861910241657,
"grad_norm": 0.06625962718107734,
"learning_rate": 0.00014942528735632183,
"loss": 1.2001,
"step": 65
},
{
"epoch": 0.0805523590333717,
"grad_norm": 0.07029239664213276,
"learning_rate": 0.00016091954022988506,
"loss": 1.1746,
"step": 70
},
{
"epoch": 0.08630609896432681,
"grad_norm": 0.0734118145247187,
"learning_rate": 0.00017241379310344826,
"loss": 1.1477,
"step": 75
},
{
"epoch": 0.09205983889528194,
"grad_norm": 0.07794825571468995,
"learning_rate": 0.0001839080459770115,
"loss": 1.1657,
"step": 80
},
{
"epoch": 0.09781357882623705,
"grad_norm": 0.08512834878649493,
"learning_rate": 0.00019540229885057472,
"loss": 1.1775,
"step": 85
},
{
"epoch": 0.10356731875719218,
"grad_norm": 0.081601122042688,
"learning_rate": 0.00019999273737707646,
"loss": 1.1584,
"step": 90
},
{
"epoch": 0.1093210586881473,
"grad_norm": 0.09077503220224638,
"learning_rate": 0.00019994835850163924,
"loss": 1.1332,
"step": 95
},
{
"epoch": 0.11507479861910241,
"grad_norm": 0.08933595254839481,
"learning_rate": 0.00019986365342513265,
"loss": 1.1341,
"step": 100
},
{
"epoch": 0.12082853855005754,
"grad_norm": 0.0927003880602526,
"learning_rate": 0.00019973865632354516,
"loss": 1.1355,
"step": 105
},
{
"epoch": 0.12658227848101267,
"grad_norm": 0.07791130311867683,
"learning_rate": 0.00019957341762950344,
"loss": 1.124,
"step": 110
},
{
"epoch": 0.1323360184119678,
"grad_norm": 0.0790112418591026,
"learning_rate": 0.0001993680040119244,
"loss": 1.1307,
"step": 115
},
{
"epoch": 0.1380897583429229,
"grad_norm": 0.09227634318705202,
"learning_rate": 0.000199122498349116,
"loss": 1.098,
"step": 120
},
{
"epoch": 0.14384349827387802,
"grad_norm": 0.08290995925084253,
"learning_rate": 0.0001988369996953386,
"loss": 1.1347,
"step": 125
},
{
"epoch": 0.14959723820483314,
"grad_norm": 0.08548470364290049,
"learning_rate": 0.00019851162324083932,
"loss": 1.1418,
"step": 130
},
{
"epoch": 0.15535097813578827,
"grad_norm": 0.08440345287709118,
"learning_rate": 0.0001981465002653763,
"loss": 1.135,
"step": 135
},
{
"epoch": 0.1611047180667434,
"grad_norm": 0.08444559508248843,
"learning_rate": 0.00019774177808525113,
"loss": 1.1273,
"step": 140
},
{
"epoch": 0.1668584579976985,
"grad_norm": 0.0840175874952237,
"learning_rate": 0.00019729761999387103,
"loss": 1.1129,
"step": 145
},
{
"epoch": 0.17261219792865362,
"grad_norm": 0.07480610551808778,
"learning_rate": 0.000196814205195865,
"loss": 1.1182,
"step": 150
},
{
"epoch": 0.17836593785960875,
"grad_norm": 0.07608916685245905,
"learning_rate": 0.00019629172873477995,
"loss": 1.1017,
"step": 155
},
{
"epoch": 0.18411967779056387,
"grad_norm": 0.09515413227881493,
"learning_rate": 0.00019573040141438624,
"loss": 1.1281,
"step": 160
},
{
"epoch": 0.189873417721519,
"grad_norm": 0.07493301926746701,
"learning_rate": 0.00019513044971362494,
"loss": 1.1213,
"step": 165
},
{
"epoch": 0.1956271576524741,
"grad_norm": 0.07980883594622108,
"learning_rate": 0.00019449211569523,
"loss": 1.1289,
"step": 170
},
{
"epoch": 0.20138089758342922,
"grad_norm": 0.08091237279913081,
"learning_rate": 0.00019381565690806328,
"loss": 1.1222,
"step": 175
},
{
"epoch": 0.20713463751438435,
"grad_norm": 0.07635483082524008,
"learning_rate": 0.00019310134628320114,
"loss": 1.1173,
"step": 180
},
{
"epoch": 0.21288837744533948,
"grad_norm": 0.07974746871493653,
"learning_rate": 0.00019234947202381486,
"loss": 1.1074,
"step": 185
},
{
"epoch": 0.2186421173762946,
"grad_norm": 0.07924995122930302,
"learning_rate": 0.00019156033748888917,
"loss": 1.1193,
"step": 190
},
{
"epoch": 0.2243958573072497,
"grad_norm": 0.08344491509033138,
"learning_rate": 0.000190734261070826,
"loss": 1.1162,
"step": 195
},
{
"epoch": 0.23014959723820483,
"grad_norm": 0.0809591767505205,
"learning_rate": 0.00018987157606698235,
"loss": 1.1085,
"step": 200
},
{
"epoch": 0.23590333716915995,
"grad_norm": 0.08094087212789886,
"learning_rate": 0.00018897263054519498,
"loss": 1.0996,
"step": 205
},
{
"epoch": 0.24165707710011508,
"grad_norm": 0.07353474154724883,
"learning_rate": 0.0001880377872033451,
"loss": 1.1301,
"step": 210
},
{
"epoch": 0.2474108170310702,
"grad_norm": 0.0784057120949034,
"learning_rate": 0.00018706742322302064,
"loss": 1.1149,
"step": 215
},
{
"epoch": 0.25316455696202533,
"grad_norm": 0.07563014633070095,
"learning_rate": 0.0001860619301173347,
"loss": 1.1084,
"step": 220
},
{
"epoch": 0.25891829689298046,
"grad_norm": 0.0732650161077316,
"learning_rate": 0.00018502171357296144,
"loss": 1.0934,
"step": 225
},
{
"epoch": 0.2646720368239356,
"grad_norm": 0.07642524798153359,
"learning_rate": 0.0001839471932864537,
"loss": 1.1279,
"step": 230
},
{
"epoch": 0.27042577675489066,
"grad_norm": 0.07924123360978817,
"learning_rate": 0.0001828388027949078,
"loss": 1.1164,
"step": 235
},
{
"epoch": 0.2761795166858458,
"grad_norm": 0.08291524400368479,
"learning_rate": 0.0001816969893010442,
"loss": 1.1426,
"step": 240
},
{
"epoch": 0.2819332566168009,
"grad_norm": 0.09503646536490909,
"learning_rate": 0.00018052221349277442,
"loss": 1.1007,
"step": 245
},
{
"epoch": 0.28768699654775604,
"grad_norm": 0.08546247272232582,
"learning_rate": 0.0001793149493573271,
"loss": 1.0996,
"step": 250
},
{
"epoch": 0.29344073647871116,
"grad_norm": 0.07292435172406726,
"learning_rate": 0.00017807568399000822,
"loss": 1.1297,
"step": 255
},
{
"epoch": 0.2991944764096663,
"grad_norm": 0.08793050679899453,
"learning_rate": 0.0001768049173976727,
"loss": 1.128,
"step": 260
},
{
"epoch": 0.3049482163406214,
"grad_norm": 0.08228996895895493,
"learning_rate": 0.0001755031622969862,
"loss": 1.1067,
"step": 265
},
{
"epoch": 0.31070195627157654,
"grad_norm": 0.08034368274055034,
"learning_rate": 0.00017417094390755934,
"loss": 1.1399,
"step": 270
},
{
"epoch": 0.31645569620253167,
"grad_norm": 0.07355220396446509,
"learning_rate": 0.00017280879974003707,
"loss": 1.105,
"step": 275
},
{
"epoch": 0.3222094361334868,
"grad_norm": 0.07695547066904129,
"learning_rate": 0.0001714172793792291,
"loss": 1.1282,
"step": 280
},
{
"epoch": 0.32796317606444186,
"grad_norm": 0.07242274233224258,
"learning_rate": 0.0001699969442623686,
"loss": 1.1138,
"step": 285
},
{
"epoch": 0.333716915995397,
"grad_norm": 0.08486742993293918,
"learning_rate": 0.0001685483674525891,
"loss": 1.1,
"step": 290
},
{
"epoch": 0.3394706559263521,
"grad_norm": 0.0747389855159797,
"learning_rate": 0.0001670721334077103,
"loss": 1.1016,
"step": 295
},
{
"epoch": 0.34522439585730724,
"grad_norm": 0.07533809685344381,
"learning_rate": 0.00016556883774442675,
"loss": 1.1181,
"step": 300
},
{
"epoch": 0.35097813578826237,
"grad_norm": 0.07699490502726904,
"learning_rate": 0.00016403908699799425,
"loss": 1.0986,
"step": 305
},
{
"epoch": 0.3567318757192175,
"grad_norm": 0.07297020155234164,
"learning_rate": 0.00016248349837751062,
"loss": 1.0951,
"step": 310
},
{
"epoch": 0.3624856156501726,
"grad_norm": 0.08519497936629768,
"learning_rate": 0.0001609026995168904,
"loss": 1.1148,
"step": 315
},
{
"epoch": 0.36823935558112775,
"grad_norm": 0.07458638521829272,
"learning_rate": 0.00015929732822163287,
"loss": 1.1165,
"step": 320
},
{
"epoch": 0.3739930955120829,
"grad_norm": 0.08000996094690015,
"learning_rate": 0.00015766803221148673,
"loss": 1.1072,
"step": 325
},
{
"epoch": 0.379746835443038,
"grad_norm": 0.08213267404173417,
"learning_rate": 0.00015601546885911404,
"loss": 1.1078,
"step": 330
},
{
"epoch": 0.38550057537399307,
"grad_norm": 0.08374970807354172,
"learning_rate": 0.00015434030492486023,
"loss": 1.0842,
"step": 335
},
{
"epoch": 0.3912543153049482,
"grad_norm": 0.07669296380369849,
"learning_rate": 0.0001526432162877356,
"loss": 1.1124,
"step": 340
},
{
"epoch": 0.3970080552359033,
"grad_norm": 0.07376471868371856,
"learning_rate": 0.00015092488767271857,
"loss": 1.0882,
"step": 345
},
{
"epoch": 0.40276179516685845,
"grad_norm": 0.08896375142853388,
"learning_rate": 0.00014918601237448923,
"loss": 1.1217,
"step": 350
},
{
"epoch": 0.4085155350978136,
"grad_norm": 0.07333518083670185,
"learning_rate": 0.00014742729197770552,
"loss": 1.0853,
"step": 355
},
{
"epoch": 0.4142692750287687,
"grad_norm": 0.07541021900185346,
"learning_rate": 0.00014564943607393459,
"loss": 1.1078,
"step": 360
},
{
"epoch": 0.42002301495972383,
"grad_norm": 0.07240408031237026,
"learning_rate": 0.00014385316197535372,
"loss": 1.0963,
"step": 365
},
{
"epoch": 0.42577675489067895,
"grad_norm": 0.07726369320922719,
"learning_rate": 0.00014203919442533597,
"loss": 1.082,
"step": 370
},
{
"epoch": 0.4315304948216341,
"grad_norm": 0.07947926565287182,
"learning_rate": 0.00014020826530603776,
"loss": 1.0775,
"step": 375
},
{
"epoch": 0.4372842347525892,
"grad_norm": 0.0835226760212179,
"learning_rate": 0.0001383611133431062,
"loss": 1.1005,
"step": 380
},
{
"epoch": 0.4430379746835443,
"grad_norm": 0.07628153739619549,
"learning_rate": 0.00013649848380762513,
"loss": 1.1139,
"step": 385
},
{
"epoch": 0.4487917146144994,
"grad_norm": 0.07650525696735216,
"learning_rate": 0.00013462112821542016,
"loss": 1.1171,
"step": 390
},
{
"epoch": 0.45454545454545453,
"grad_norm": 0.07569647253340571,
"learning_rate": 0.0001327298040238446,
"loss": 1.0918,
"step": 395
},
{
"epoch": 0.46029919447640966,
"grad_norm": 0.07572977420046635,
"learning_rate": 0.0001308252743261675,
"loss": 1.1018,
"step": 400
},
{
"epoch": 0.4660529344073648,
"grad_norm": 0.084906098343464,
"learning_rate": 0.00012890830754368855,
"loss": 1.1145,
"step": 405
},
{
"epoch": 0.4718066743383199,
"grad_norm": 0.08856218479901995,
"learning_rate": 0.00012697967711570242,
"loss": 1.1049,
"step": 410
},
{
"epoch": 0.47756041426927504,
"grad_norm": 0.07498278458233507,
"learning_rate": 0.00012504016118743935,
"loss": 1.1061,
"step": 415
},
{
"epoch": 0.48331415420023016,
"grad_norm": 0.07551499247429329,
"learning_rate": 0.00012309054229610623,
"loss": 1.0858,
"step": 420
},
{
"epoch": 0.4890678941311853,
"grad_norm": 0.07573682259745337,
"learning_rate": 0.00012113160705515625,
"loss": 1.0917,
"step": 425
},
{
"epoch": 0.4948216340621404,
"grad_norm": 0.09077558081676396,
"learning_rate": 0.0001191641458369136,
"loss": 1.118,
"step": 430
},
{
"epoch": 0.5005753739930955,
"grad_norm": 0.07771540883960386,
"learning_rate": 0.00011718895245368167,
"loss": 1.1056,
"step": 435
},
{
"epoch": 0.5063291139240507,
"grad_norm": 0.08583872028129896,
"learning_rate": 0.00011520682383746333,
"loss": 1.0792,
"step": 440
},
{
"epoch": 0.5120828538550057,
"grad_norm": 0.07563432127302055,
"learning_rate": 0.00011321855971842243,
"loss": 1.1024,
"step": 445
},
{
"epoch": 0.5178365937859609,
"grad_norm": 0.07968899236090986,
"learning_rate": 0.00011122496230221645,
"loss": 1.1037,
"step": 450
},
{
"epoch": 0.523590333716916,
"grad_norm": 0.08595525245941885,
"learning_rate": 0.00010922683594633021,
"loss": 1.0991,
"step": 455
},
{
"epoch": 0.5293440736478712,
"grad_norm": 0.0870194667650804,
"learning_rate": 0.0001072249868355415,
"loss": 1.1193,
"step": 460
},
{
"epoch": 0.5350978135788262,
"grad_norm": 0.07553504230608732,
"learning_rate": 0.0001052202226566494,
"loss": 1.0876,
"step": 465
},
{
"epoch": 0.5408515535097813,
"grad_norm": 0.08165627721996481,
"learning_rate": 0.00010321335227259661,
"loss": 1.0855,
"step": 470
},
{
"epoch": 0.5466052934407365,
"grad_norm": 0.07548535900683774,
"learning_rate": 0.0001012051853961172,
"loss": 1.1067,
"step": 475
},
{
"epoch": 0.5523590333716916,
"grad_norm": 0.08254614481275399,
"learning_rate": 9.919653226304148e-05,
"loss": 1.1185,
"step": 480
},
{
"epoch": 0.5581127733026467,
"grad_norm": 0.07261119139795047,
"learning_rate": 9.718820330538998e-05,
"loss": 1.0913,
"step": 485
},
{
"epoch": 0.5638665132336018,
"grad_norm": 0.07666759909446064,
"learning_rate": 9.51810088243879e-05,
"loss": 1.1058,
"step": 490
},
{
"epoch": 0.569620253164557,
"grad_norm": 0.0795402304145169,
"learning_rate": 9.317575866353292e-05,
"loss": 1.1249,
"step": 495
},
{
"epoch": 0.5753739930955121,
"grad_norm": 0.07900824948192435,
"learning_rate": 9.117326188184695e-05,
"loss": 1.1043,
"step": 500
},
{
"epoch": 0.5811277330264673,
"grad_norm": 0.07403897844355141,
"learning_rate": 8.917432642744518e-05,
"loss": 1.0858,
"step": 505
},
{
"epoch": 0.5868814729574223,
"grad_norm": 0.07943002355447462,
"learning_rate": 8.717975881155261e-05,
"loss": 1.0842,
"step": 510
},
{
"epoch": 0.5926352128883774,
"grad_norm": 0.07687109951447574,
"learning_rate": 8.519036378310096e-05,
"loss": 1.1046,
"step": 515
},
{
"epoch": 0.5983889528193326,
"grad_norm": 0.08051861745508501,
"learning_rate": 8.320694400403606e-05,
"loss": 1.1176,
"step": 520
},
{
"epoch": 0.6041426927502876,
"grad_norm": 0.07385921126253402,
"learning_rate": 8.123029972546781e-05,
"loss": 1.1237,
"step": 525
},
{
"epoch": 0.6098964326812428,
"grad_norm": 0.0758570951173778,
"learning_rate": 7.926122846479224e-05,
"loss": 1.102,
"step": 530
},
{
"epoch": 0.6156501726121979,
"grad_norm": 0.07600528797089706,
"learning_rate": 7.730052468391725e-05,
"loss": 1.0911,
"step": 535
},
{
"epoch": 0.6214039125431531,
"grad_norm": 0.07938766187013545,
"learning_rate": 7.534897946872042e-05,
"loss": 1.0867,
"step": 540
},
{
"epoch": 0.6271576524741082,
"grad_norm": 0.07930610991929318,
"learning_rate": 7.340738020986961e-05,
"loss": 1.1205,
"step": 545
},
{
"epoch": 0.6329113924050633,
"grad_norm": 0.08212529428864859,
"learning_rate": 7.147651028513383e-05,
"loss": 1.0931,
"step": 550
},
{
"epoch": 0.6386651323360184,
"grad_norm": 0.07731536951924715,
"learning_rate": 6.955714874331387e-05,
"loss": 1.0976,
"step": 555
},
{
"epoch": 0.6444188722669736,
"grad_norm": 0.07561598587821595,
"learning_rate": 6.765006998991888e-05,
"loss": 1.0842,
"step": 560
},
{
"epoch": 0.6501726121979287,
"grad_norm": 0.07405869311005314,
"learning_rate": 6.575604347471695e-05,
"loss": 1.0637,
"step": 565
},
{
"epoch": 0.6559263521288837,
"grad_norm": 0.07183365685581813,
"learning_rate": 6.387583338128471e-05,
"loss": 1.114,
"step": 570
},
{
"epoch": 0.6616800920598389,
"grad_norm": 0.07244649469955952,
"learning_rate": 6.201019831868208e-05,
"loss": 1.09,
"step": 575
},
{
"epoch": 0.667433831990794,
"grad_norm": 0.07512222335465936,
"learning_rate": 6.015989101537586e-05,
"loss": 1.1117,
"step": 580
},
{
"epoch": 0.6731875719217492,
"grad_norm": 0.0773183295000779,
"learning_rate": 5.83256580155362e-05,
"loss": 1.0846,
"step": 585
},
{
"epoch": 0.6789413118527042,
"grad_norm": 0.0746381622325705,
"learning_rate": 5.6508239377828034e-05,
"loss": 1.0942,
"step": 590
},
{
"epoch": 0.6846950517836594,
"grad_norm": 0.07207675237237808,
"learning_rate": 5.470836837681954e-05,
"loss": 1.0728,
"step": 595
},
{
"epoch": 0.6904487917146145,
"grad_norm": 0.07598696177120805,
"learning_rate": 5.2926771207127254e-05,
"loss": 1.0954,
"step": 600
},
{
"epoch": 0.6962025316455697,
"grad_norm": 0.07327699224620024,
"learning_rate": 5.116416669041843e-05,
"loss": 1.1158,
"step": 605
},
{
"epoch": 0.7019562715765247,
"grad_norm": 0.07840719027532041,
"learning_rate": 4.9421265985387476e-05,
"loss": 1.0738,
"step": 610
},
{
"epoch": 0.7077100115074798,
"grad_norm": 0.0769085876990078,
"learning_rate": 4.7698772300824756e-05,
"loss": 1.0958,
"step": 615
},
{
"epoch": 0.713463751438435,
"grad_norm": 0.07132843178750269,
"learning_rate": 4.599738061189244e-05,
"loss": 1.1058,
"step": 620
},
{
"epoch": 0.7192174913693901,
"grad_norm": 0.07713712307040285,
"learning_rate": 4.4317777379722866e-05,
"loss": 1.1028,
"step": 625
},
{
"epoch": 0.7249712313003452,
"grad_norm": 0.07758438004748038,
"learning_rate": 4.266064027445155e-05,
"loss": 1.0899,
"step": 630
},
{
"epoch": 0.7307249712313003,
"grad_norm": 0.07854544913701697,
"learning_rate": 4.102663790179764e-05,
"loss": 1.0761,
"step": 635
},
{
"epoch": 0.7364787111622555,
"grad_norm": 0.0751537421287673,
"learning_rate": 3.941642953330103e-05,
"loss": 1.0935,
"step": 640
},
{
"epoch": 0.7422324510932106,
"grad_norm": 0.07536112057903382,
"learning_rate": 3.7830664840326145e-05,
"loss": 1.1316,
"step": 645
},
{
"epoch": 0.7479861910241657,
"grad_norm": 0.0757154673994382,
"learning_rate": 3.6269983631938475e-05,
"loss": 1.1105,
"step": 650
},
{
"epoch": 0.7537399309551208,
"grad_norm": 0.07517902293608703,
"learning_rate": 3.473501559676088e-05,
"loss": 1.1183,
"step": 655
},
{
"epoch": 0.759493670886076,
"grad_norm": 0.07512890867407872,
"learning_rate": 3.3226380048912585e-05,
"loss": 1.095,
"step": 660
},
{
"epoch": 0.7652474108170311,
"grad_norm": 0.0747389401617661,
"learning_rate": 3.174468567813461e-05,
"loss": 1.0924,
"step": 665
},
{
"epoch": 0.7710011507479861,
"grad_norm": 0.07269553748906823,
"learning_rate": 3.029053030420115e-05,
"loss": 1.1079,
"step": 670
},
{
"epoch": 0.7767548906789413,
"grad_norm": 0.0752874034050374,
"learning_rate": 2.886450063571735e-05,
"loss": 1.102,
"step": 675
},
{
"epoch": 0.7825086306098964,
"grad_norm": 0.0735724332478834,
"learning_rate": 2.7467172033399458e-05,
"loss": 1.1114,
"step": 680
},
{
"epoch": 0.7882623705408516,
"grad_norm": 0.08104817717496945,
"learning_rate": 2.6099108277934103e-05,
"loss": 1.1077,
"step": 685
},
{
"epoch": 0.7940161104718066,
"grad_norm": 0.07305813837737488,
"learning_rate": 2.4760861342509233e-05,
"loss": 1.0906,
"step": 690
},
{
"epoch": 0.7997698504027618,
"grad_norm": 0.07242520365927102,
"learning_rate": 2.345297117010954e-05,
"loss": 1.0893,
"step": 695
},
{
"epoch": 0.8055235903337169,
"grad_norm": 0.07698458847076538,
"learning_rate": 2.2175965455665226e-05,
"loss": 1.0898,
"step": 700
},
{
"epoch": 0.8112773302646721,
"grad_norm": 0.076651036057528,
"learning_rate": 2.0930359433142932e-05,
"loss": 1.0877,
"step": 705
},
{
"epoch": 0.8170310701956272,
"grad_norm": 0.0721820626725428,
"learning_rate": 1.9716655667664008e-05,
"loss": 1.0794,
"step": 710
},
{
"epoch": 0.8227848101265823,
"grad_norm": 0.07461008979903963,
"learning_rate": 1.8535343852734332e-05,
"loss": 1.1101,
"step": 715
},
{
"epoch": 0.8285385500575374,
"grad_norm": 0.07528450837009106,
"learning_rate": 1.7386900612667633e-05,
"loss": 1.0949,
"step": 720
},
{
"epoch": 0.8342922899884925,
"grad_norm": 0.07330788015551219,
"learning_rate": 1.6271789310281517e-05,
"loss": 1.0942,
"step": 725
},
{
"epoch": 0.8400460299194477,
"grad_norm": 0.07387736383215655,
"learning_rate": 1.5190459859944505e-05,
"loss": 1.0785,
"step": 730
},
{
"epoch": 0.8457997698504027,
"grad_norm": 0.07362643024392242,
"learning_rate": 1.4143348546048707e-05,
"loss": 1.0834,
"step": 735
},
{
"epoch": 0.8515535097813579,
"grad_norm": 0.07308957142384619,
"learning_rate": 1.3130877846982204e-05,
"loss": 1.0833,
"step": 740
},
{
"epoch": 0.857307249712313,
"grad_norm": 0.07884770650029872,
"learning_rate": 1.2153456264671337e-05,
"loss": 1.1047,
"step": 745
},
{
"epoch": 0.8630609896432682,
"grad_norm": 0.07410132122132759,
"learning_rate": 1.1211478159762478e-05,
"loss": 1.1042,
"step": 750
},
{
"epoch": 0.8688147295742232,
"grad_norm": 0.07315305451840597,
"learning_rate": 1.0305323592509009e-05,
"loss": 1.1127,
"step": 755
},
{
"epoch": 0.8745684695051784,
"grad_norm": 0.07241715378225073,
"learning_rate": 9.435358169428442e-06,
"loss": 1.0552,
"step": 760
},
{
"epoch": 0.8803222094361335,
"grad_norm": 0.07310674530244925,
"learning_rate": 8.601932895790877e-06,
"loss": 1.1175,
"step": 765
},
{
"epoch": 0.8860759493670886,
"grad_norm": 0.08154925564133862,
"learning_rate": 7.805384033998875e-06,
"loss": 1.093,
"step": 770
},
{
"epoch": 0.8918296892980437,
"grad_norm": 0.07285132059208473,
"learning_rate": 7.046032967915483e-06,
"loss": 1.0869,
"step": 775
},
{
"epoch": 0.8975834292289988,
"grad_norm": 0.07660142337736248,
"learning_rate": 6.32418607319546e-06,
"loss": 1.0914,
"step": 780
},
{
"epoch": 0.903337169159954,
"grad_norm": 0.07846326221484771,
"learning_rate": 5.640134593671598e-06,
"loss": 1.0879,
"step": 785
},
{
"epoch": 0.9090909090909091,
"grad_norm": 0.0744126532665454,
"learning_rate": 4.994154523846695e-06,
"loss": 1.0948,
"step": 790
},
{
"epoch": 0.9148446490218642,
"grad_norm": 0.07568510267678755,
"learning_rate": 4.386506497537757e-06,
"loss": 1.1054,
"step": 795
},
{
"epoch": 0.9205983889528193,
"grad_norm": 0.07378727253077548,
"learning_rate": 3.817435682718096e-06,
"loss": 1.0787,
"step": 800
},
{
"epoch": 0.9263521288837745,
"grad_norm": 0.0747897379397166,
"learning_rate": 3.287171682599255e-06,
"loss": 1.0911,
"step": 805
},
{
"epoch": 0.9321058688147296,
"grad_norm": 0.07818706538185882,
"learning_rate": 2.7959284429929456e-06,
"loss": 1.0926,
"step": 810
},
{
"epoch": 0.9378596087456847,
"grad_norm": 0.07201156955643878,
"learning_rate": 2.3439041659902407e-06,
"loss": 1.106,
"step": 815
},
{
"epoch": 0.9436133486766398,
"grad_norm": 0.07293591627119424,
"learning_rate": 1.9312812299929094e-06,
"loss": 1.1041,
"step": 820
},
{
"epoch": 0.9493670886075949,
"grad_norm": 0.07620976319405062,
"learning_rate": 1.5582261161291245e-06,
"loss": 1.0835,
"step": 825
},
{
"epoch": 0.9551208285385501,
"grad_norm": 0.07403997053394712,
"learning_rate": 1.2248893410832685e-06,
"loss": 1.1048,
"step": 830
},
{
"epoch": 0.9608745684695051,
"grad_norm": 0.07432396016348998,
"learning_rate": 9.314053963669245e-07,
"loss": 1.0915,
"step": 835
},
{
"epoch": 0.9666283084004603,
"grad_norm": 0.07180128894783522,
"learning_rate": 6.778926940555152e-07,
"loss": 1.0981,
"step": 840
},
{
"epoch": 0.9723820483314154,
"grad_norm": 0.07204281549304838,
"learning_rate": 4.644535190125421e-07,
"loss": 1.0962,
"step": 845
},
{
"epoch": 0.9781357882623706,
"grad_norm": 0.07295868011398168,
"learning_rate": 2.9117398762069647e-07,
"loss": 1.1083,
"step": 850
},
{
"epoch": 0.9838895281933256,
"grad_norm": 0.07600348765786609,
"learning_rate": 1.5812401303639813e-07,
"loss": 1.1138,
"step": 855
},
{
"epoch": 0.9896432681242808,
"grad_norm": 0.07208207285243298,
"learning_rate": 6.535727698199213e-08,
"loss": 1.0787,
"step": 860
},
{
"epoch": 0.9953970080552359,
"grad_norm": 0.07662851683615209,
"learning_rate": 1.2911208086663351e-08,
"loss": 1.087,
"step": 865
},
{
"epoch": 1.0,
"eval_loss": 1.038455605506897,
"eval_runtime": 2.7622,
"eval_samples_per_second": 2.534,
"eval_steps_per_second": 0.724,
"step": 869
},
{
"epoch": 1.0,
"step": 869,
"total_flos": 1.0927921245978624e+16,
"train_loss": 1.118139435424629,
"train_runtime": 17754.8873,
"train_samples_per_second": 3.132,
"train_steps_per_second": 0.049
}
],
"logging_steps": 5,
"max_steps": 869,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.0927921245978624e+16,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}