PrefixTuning_FlanT5_v1 / trainer_state.json

pretrain_model and pretrain_model_token

ab7ab0b verified over 1 year ago

60 kB

	{
	"best_metric": 3.9158332347869873,
	"best_model_checkpoint": "/content/drive/MyDrive/checkpoints/checkpoint-3342",
	"epoch": 3.0,
	"eval_steps": 500,
	"global_step": 3342,
	"is_hyper_param_search": false,
	"is_local_process_zero": true,
	"is_world_process_zero": true,
	"log_history": [
	{
	"epoch": 0.008976660682226212,
	"grad_norm": 0.31800901889801025,
	"learning_rate": 0.0004997755834829443,
	"loss": 45.5673,
	"step": 10
	},
	{
	"epoch": 0.017953321364452424,
	"grad_norm": 0.35396715998649597,
	"learning_rate": 0.0004995511669658887,
	"loss": 45.3249,
	"step": 20
	},
	{
	"epoch": 0.026929982046678635,
	"grad_norm": 0.37757596373558044,
	"learning_rate": 0.000499326750448833,
	"loss": 44.7853,
	"step": 30
	},
	{
	"epoch": 0.03590664272890485,
	"grad_norm": 0.38259994983673096,
	"learning_rate": 0.0004991023339317774,
	"loss": 44.8996,
	"step": 40
	},
	{
	"epoch": 0.04488330341113106,
	"grad_norm": 0.38321229815483093,
	"learning_rate": 0.0004988779174147217,
	"loss": 44.1242,
	"step": 50
	},
	{
	"epoch": 0.05385996409335727,
	"grad_norm": 0.42167848348617554,
	"learning_rate": 0.0004986535008976661,
	"loss": 44.4739,
	"step": 60
	},
	{
	"epoch": 0.06283662477558348,
	"grad_norm": 0.40018683671951294,
	"learning_rate": 0.0004984290843806105,
	"loss": 44.4548,
	"step": 70
	},
	{
	"epoch": 0.0718132854578097,
	"grad_norm": 0.39394208788871765,
	"learning_rate": 0.0004982046678635547,
	"loss": 43.4622,
	"step": 80
	},
	{
	"epoch": 0.0807899461400359,
	"grad_norm": 0.3660307228565216,
	"learning_rate": 0.0004979802513464991,
	"loss": 43.0475,
	"step": 90
	},
	{
	"epoch": 0.08976660682226212,
	"grad_norm": 0.36663416028022766,
	"learning_rate": 0.0004977558348294434,
	"loss": 42.8994,
	"step": 100
	},
	{
	"epoch": 0.09874326750448834,
	"grad_norm": 0.40418022871017456,
	"learning_rate": 0.0004975314183123878,
	"loss": 43.0499,
	"step": 110
	},
	{
	"epoch": 0.10771992818671454,
	"grad_norm": 0.32946863770484924,
	"learning_rate": 0.0004973070017953322,
	"loss": 43.0516,
	"step": 120
	},
	{
	"epoch": 0.11669658886894076,
	"grad_norm": 0.36427420377731323,
	"learning_rate": 0.0004970825852782765,
	"loss": 43.0005,
	"step": 130
	},
	{
	"epoch": 0.12567324955116696,
	"grad_norm": 0.3754049837589264,
	"learning_rate": 0.0004968581687612209,
	"loss": 42.3461,
	"step": 140
	},
	{
	"epoch": 0.13464991023339318,
	"grad_norm": 0.3867158889770508,
	"learning_rate": 0.0004966337522441652,
	"loss": 42.5113,
	"step": 150
	},
	{
	"epoch": 0.1436265709156194,
	"grad_norm": 0.35019099712371826,
	"learning_rate": 0.0004964093357271095,
	"loss": 41.8364,
	"step": 160
	},
	{
	"epoch": 0.1526032315978456,
	"grad_norm": 0.37168896198272705,
	"learning_rate": 0.0004961849192100539,
	"loss": 42.4202,
	"step": 170
	},
	{
	"epoch": 0.1615798922800718,
	"grad_norm": 0.36585116386413574,
	"learning_rate": 0.0004959605026929982,
	"loss": 41.5454,
	"step": 180
	},
	{
	"epoch": 0.17055655296229802,
	"grad_norm": 0.4089430272579193,
	"learning_rate": 0.0004957360861759426,
	"loss": 41.0863,
	"step": 190
	},
	{
	"epoch": 0.17953321364452424,
	"grad_norm": 0.3681723475456238,
	"learning_rate": 0.0004955116696588868,
	"loss": 41.0869,
	"step": 200
	},
	{
	"epoch": 0.18850987432675045,
	"grad_norm": 0.3823374807834625,
	"learning_rate": 0.0004952872531418312,
	"loss": 41.0254,
	"step": 210
	},
	{
	"epoch": 0.19748653500897667,
	"grad_norm": 0.3909670412540436,
	"learning_rate": 0.0004950628366247755,
	"loss": 41.0026,
	"step": 220
	},
	{
	"epoch": 0.20646319569120286,
	"grad_norm": 0.39831164479255676,
	"learning_rate": 0.0004948384201077199,
	"loss": 40.3224,
	"step": 230
	},
	{
	"epoch": 0.21543985637342908,
	"grad_norm": 0.3801274597644806,
	"learning_rate": 0.0004946140035906643,
	"loss": 40.4711,
	"step": 240
	},
	{
	"epoch": 0.2244165170556553,
	"grad_norm": 0.39255771040916443,
	"learning_rate": 0.0004943895870736086,
	"loss": 39.9713,
	"step": 250
	},
	{
	"epoch": 0.2333931777378815,
	"grad_norm": 0.400642067193985,
	"learning_rate": 0.000494165170556553,
	"loss": 39.3574,
	"step": 260
	},
	{
	"epoch": 0.24236983842010773,
	"grad_norm": 0.44542375206947327,
	"learning_rate": 0.0004939407540394973,
	"loss": 39.4756,
	"step": 270
	},
	{
	"epoch": 0.2513464991023339,
	"grad_norm": 0.41471394896507263,
	"learning_rate": 0.0004937163375224417,
	"loss": 39.4551,
	"step": 280
	},
	{
	"epoch": 0.26032315978456017,
	"grad_norm": 0.3956909775733948,
	"learning_rate": 0.000493491921005386,
	"loss": 39.0815,
	"step": 290
	},
	{
	"epoch": 0.26929982046678635,
	"grad_norm": 0.5405673384666443,
	"learning_rate": 0.0004932675044883304,
	"loss": 38.7405,
	"step": 300
	},
	{
	"epoch": 0.27827648114901254,
	"grad_norm": 0.4720427691936493,
	"learning_rate": 0.0004930430879712747,
	"loss": 38.1905,
	"step": 310
	},
	{
	"epoch": 0.2872531418312388,
	"grad_norm": 0.4677943289279938,
	"learning_rate": 0.0004928186714542191,
	"loss": 38.023,
	"step": 320
	},
	{
	"epoch": 0.296229802513465,
	"grad_norm": 0.4742816090583801,
	"learning_rate": 0.0004925942549371633,
	"loss": 37.6844,
	"step": 330
	},
	{
	"epoch": 0.3052064631956912,
	"grad_norm": 0.463733047246933,
	"learning_rate": 0.0004923698384201077,
	"loss": 37.4262,
	"step": 340
	},
	{
	"epoch": 0.3141831238779174,
	"grad_norm": 0.48447635769844055,
	"learning_rate": 0.000492145421903052,
	"loss": 37.1974,
	"step": 350
	},
	{
	"epoch": 0.3231597845601436,
	"grad_norm": 0.5126340389251709,
	"learning_rate": 0.0004919210053859964,
	"loss": 36.4875,
	"step": 360
	},
	{
	"epoch": 0.33213644524236985,
	"grad_norm": 0.5128099322319031,
	"learning_rate": 0.0004916965888689407,
	"loss": 36.8894,
	"step": 370
	},
	{
	"epoch": 0.34111310592459604,
	"grad_norm": 0.5677986741065979,
	"learning_rate": 0.0004914721723518851,
	"loss": 36.0053,
	"step": 380
	},
	{
	"epoch": 0.3500897666068223,
	"grad_norm": 0.6088815927505493,
	"learning_rate": 0.0004912477558348294,
	"loss": 36.2308,
	"step": 390
	},
	{
	"epoch": 0.3590664272890485,
	"grad_norm": 0.5765969157218933,
	"learning_rate": 0.0004910233393177738,
	"loss": 35.4461,
	"step": 400
	},
	{
	"epoch": 0.36804308797127466,
	"grad_norm": 0.5858592391014099,
	"learning_rate": 0.0004907989228007182,
	"loss": 35.0571,
	"step": 410
	},
	{
	"epoch": 0.3770197486535009,
	"grad_norm": 0.6825990080833435,
	"learning_rate": 0.0004905745062836625,
	"loss": 34.717,
	"step": 420
	},
	{
	"epoch": 0.3859964093357271,
	"grad_norm": 0.7166014313697815,
	"learning_rate": 0.0004903500897666069,
	"loss": 33.8366,
	"step": 430
	},
	{
	"epoch": 0.39497307001795334,
	"grad_norm": 0.6887209415435791,
	"learning_rate": 0.0004901256732495512,
	"loss": 33.7563,
	"step": 440
	},
	{
	"epoch": 0.40394973070017953,
	"grad_norm": 0.7413772344589233,
	"learning_rate": 0.0004899012567324956,
	"loss": 33.1205,
	"step": 450
	},
	{
	"epoch": 0.4129263913824057,
	"grad_norm": 0.7537035942077637,
	"learning_rate": 0.0004896768402154398,
	"loss": 32.9826,
	"step": 460
	},
	{
	"epoch": 0.42190305206463197,
	"grad_norm": 0.730989396572113,
	"learning_rate": 0.0004894524236983842,
	"loss": 31.9265,
	"step": 470
	},
	{
	"epoch": 0.43087971274685816,
	"grad_norm": 0.9165148735046387,
	"learning_rate": 0.0004892280071813285,
	"loss": 31.3014,
	"step": 480
	},
	{
	"epoch": 0.4398563734290844,
	"grad_norm": 0.8587144613265991,
	"learning_rate": 0.0004890035906642729,
	"loss": 30.889,
	"step": 490
	},
	{
	"epoch": 0.4488330341113106,
	"grad_norm": 0.9183847904205322,
	"learning_rate": 0.0004887791741472172,
	"loss": 30.1653,
	"step": 500
	},
	{
	"epoch": 0.4578096947935368,
	"grad_norm": 0.9044579863548279,
	"learning_rate": 0.0004885547576301616,
	"loss": 29.9274,
	"step": 510
	},
	{
	"epoch": 0.466786355475763,
	"grad_norm": 0.8621285557746887,
	"learning_rate": 0.0004883303411131059,
	"loss": 29.5479,
	"step": 520
	},
	{
	"epoch": 0.4757630161579892,
	"grad_norm": 1.1030315160751343,
	"learning_rate": 0.0004881059245960503,
	"loss": 29.1674,
	"step": 530
	},
	{
	"epoch": 0.48473967684021546,
	"grad_norm": 1.071616768836975,
	"learning_rate": 0.00048788150807899463,
	"loss": 28.5656,
	"step": 540
	},
	{
	"epoch": 0.49371633752244165,
	"grad_norm": 0.9452396035194397,
	"learning_rate": 0.000487657091561939,
	"loss": 28.1162,
	"step": 550
	},
	{
	"epoch": 0.5026929982046678,
	"grad_norm": 0.9999839067459106,
	"learning_rate": 0.0004874326750448833,
	"loss": 27.1627,
	"step": 560
	},
	{
	"epoch": 0.5116696588868941,
	"grad_norm": 1.5522288084030151,
	"learning_rate": 0.00048720825852782766,
	"loss": 26.8812,
	"step": 570
	},
	{
	"epoch": 0.5206463195691203,
	"grad_norm": 1.1541786193847656,
	"learning_rate": 0.000486983842010772,
	"loss": 26.2589,
	"step": 580
	},
	{
	"epoch": 0.5296229802513465,
	"grad_norm": 0.9977880120277405,
	"learning_rate": 0.00048675942549371634,
	"loss": 26.168,
	"step": 590
	},
	{
	"epoch": 0.5385996409335727,
	"grad_norm": 0.9028811454772949,
	"learning_rate": 0.0004865350089766607,
	"loss": 24.9378,
	"step": 600
	},
	{
	"epoch": 0.547576301615799,
	"grad_norm": 1.0026092529296875,
	"learning_rate": 0.00048631059245960503,
	"loss": 25.0135,
	"step": 610
	},
	{
	"epoch": 0.5565529622980251,
	"grad_norm": 1.0198203325271606,
	"learning_rate": 0.00048608617594254937,
	"loss": 24.6053,
	"step": 620
	},
	{
	"epoch": 0.5655296229802513,
	"grad_norm": 1.1564388275146484,
	"learning_rate": 0.0004858617594254937,
	"loss": 24.0813,
	"step": 630
	},
	{
	"epoch": 0.5745062836624776,
	"grad_norm": 0.8892808556556702,
	"learning_rate": 0.0004856373429084381,
	"loss": 23.7441,
	"step": 640
	},
	{
	"epoch": 0.5834829443447038,
	"grad_norm": 1.1114846467971802,
	"learning_rate": 0.0004854129263913824,
	"loss": 22.8734,
	"step": 650
	},
	{
	"epoch": 0.59245960502693,
	"grad_norm": 1.0558847188949585,
	"learning_rate": 0.0004851885098743268,
	"loss": 22.3045,
	"step": 660
	},
	{
	"epoch": 0.6014362657091562,
	"grad_norm": 0.8897343277931213,
	"learning_rate": 0.0004849640933572711,
	"loss": 21.977,
	"step": 670
	},
	{
	"epoch": 0.6104129263913824,
	"grad_norm": 0.9796168208122253,
	"learning_rate": 0.0004847396768402155,
	"loss": 21.5113,
	"step": 680
	},
	{
	"epoch": 0.6193895870736086,
	"grad_norm": 0.8519884943962097,
	"learning_rate": 0.0004845152603231598,
	"loss": 20.9744,
	"step": 690
	},
	{
	"epoch": 0.6283662477558348,
	"grad_norm": 1.1632051467895508,
	"learning_rate": 0.00048429084380610416,
	"loss": 20.271,
	"step": 700
	},
	{
	"epoch": 0.6373429084380611,
	"grad_norm": 0.9868700504302979,
	"learning_rate": 0.0004840664272890485,
	"loss": 19.9961,
	"step": 710
	},
	{
	"epoch": 0.6463195691202872,
	"grad_norm": 0.9679480791091919,
	"learning_rate": 0.0004838420107719928,
	"loss": 19.4405,
	"step": 720
	},
	{
	"epoch": 0.6552962298025135,
	"grad_norm": 1.0145677328109741,
	"learning_rate": 0.0004836175942549372,
	"loss": 19.2046,
	"step": 730
	},
	{
	"epoch": 0.6642728904847397,
	"grad_norm": 1.0279533863067627,
	"learning_rate": 0.00048339317773788147,
	"loss": 18.2792,
	"step": 740
	},
	{
	"epoch": 0.6732495511669659,
	"grad_norm": 1.2876602411270142,
	"learning_rate": 0.00048316876122082587,
	"loss": 17.8022,
	"step": 750
	},
	{
	"epoch": 0.6822262118491921,
	"grad_norm": 1.0419774055480957,
	"learning_rate": 0.0004829443447037702,
	"loss": 17.4577,
	"step": 760
	},
	{
	"epoch": 0.6912028725314183,
	"grad_norm": 1.0887730121612549,
	"learning_rate": 0.00048271992818671455,
	"loss": 16.5106,
	"step": 770
	},
	{
	"epoch": 0.7001795332136446,
	"grad_norm": 1.1203436851501465,
	"learning_rate": 0.0004824955116696589,
	"loss": 16.4582,
	"step": 780
	},
	{
	"epoch": 0.7091561938958707,
	"grad_norm": 1.0770111083984375,
	"learning_rate": 0.00048227109515260324,
	"loss": 16.003,
	"step": 790
	},
	{
	"epoch": 0.718132854578097,
	"grad_norm": 1.2158771753311157,
	"learning_rate": 0.0004820466786355476,
	"loss": 15.2694,
	"step": 800
	},
	{
	"epoch": 0.7271095152603232,
	"grad_norm": 1.1706403493881226,
	"learning_rate": 0.000481822262118492,
	"loss": 14.9252,
	"step": 810
	},
	{
	"epoch": 0.7360861759425493,
	"grad_norm": 1.189310908317566,
	"learning_rate": 0.00048159784560143626,
	"loss": 14.4921,
	"step": 820
	},
	{
	"epoch": 0.7450628366247756,
	"grad_norm": 1.6199108362197876,
	"learning_rate": 0.00048137342908438066,
	"loss": 13.9443,
	"step": 830
	},
	{
	"epoch": 0.7540394973070018,
	"grad_norm": 1.1757200956344604,
	"learning_rate": 0.00048114901256732494,
	"loss": 13.8288,
	"step": 840
	},
	{
	"epoch": 0.7630161579892281,
	"grad_norm": 1.2064054012298584,
	"learning_rate": 0.00048092459605026934,
	"loss": 12.9563,
	"step": 850
	},
	{
	"epoch": 0.7719928186714542,
	"grad_norm": 1.1954108476638794,
	"learning_rate": 0.00048070017953321363,
	"loss": 12.1382,
	"step": 860
	},
	{
	"epoch": 0.7809694793536804,
	"grad_norm": 1.5387598276138306,
	"learning_rate": 0.00048047576301615797,
	"loss": 12.1248,
	"step": 870
	},
	{
	"epoch": 0.7899461400359067,
	"grad_norm": 1.2923359870910645,
	"learning_rate": 0.00048025134649910237,
	"loss": 11.7902,
	"step": 880
	},
	{
	"epoch": 0.7989228007181328,
	"grad_norm": 0.9865145683288574,
	"learning_rate": 0.00048002692998204665,
	"loss": 10.7329,
	"step": 890
	},
	{
	"epoch": 0.8078994614003591,
	"grad_norm": 1.140541672706604,
	"learning_rate": 0.00047980251346499105,
	"loss": 10.5986,
	"step": 900
	},
	{
	"epoch": 0.8168761220825853,
	"grad_norm": 1.1022454500198364,
	"learning_rate": 0.00047957809694793534,
	"loss": 10.2782,
	"step": 910
	},
	{
	"epoch": 0.8258527827648114,
	"grad_norm": 0.8876429200172424,
	"learning_rate": 0.00047935368043087973,
	"loss": 9.3573,
	"step": 920
	},
	{
	"epoch": 0.8348294434470377,
	"grad_norm": 0.9144046306610107,
	"learning_rate": 0.0004791292639138241,
	"loss": 9.4616,
	"step": 930
	},
	{
	"epoch": 0.8438061041292639,
	"grad_norm": 1.022176742553711,
	"learning_rate": 0.0004789048473967684,
	"loss": 9.0571,
	"step": 940
	},
	{
	"epoch": 0.8527827648114902,
	"grad_norm": 0.9050130248069763,
	"learning_rate": 0.00047868043087971276,
	"loss": 8.4811,
	"step": 950
	},
	{
	"epoch": 0.8617594254937163,
	"grad_norm": 0.8372008800506592,
	"learning_rate": 0.0004784560143626571,
	"loss": 8.3873,
	"step": 960
	},
	{
	"epoch": 0.8707360861759426,
	"grad_norm": 0.8663610816001892,
	"learning_rate": 0.00047823159784560144,
	"loss": 8.0233,
	"step": 970
	},
	{
	"epoch": 0.8797127468581688,
	"grad_norm": 0.6936354637145996,
	"learning_rate": 0.00047800718132854584,
	"loss": 7.8054,
	"step": 980
	},
	{
	"epoch": 0.8886894075403949,
	"grad_norm": 0.5529871582984924,
	"learning_rate": 0.00047778276481149013,
	"loss": 7.6013,
	"step": 990
	},
	{
	"epoch": 0.8976660682226212,
	"grad_norm": 0.6260952353477478,
	"learning_rate": 0.00047755834829443447,
	"loss": 7.4237,
	"step": 1000
	},
	{
	"epoch": 0.9066427289048474,
	"grad_norm": 0.851337730884552,
	"learning_rate": 0.0004773339317773788,
	"loss": 7.2549,
	"step": 1010
	},
	{
	"epoch": 0.9156193895870736,
	"grad_norm": 0.6702756285667419,
	"learning_rate": 0.00047710951526032315,
	"loss": 7.0967,
	"step": 1020
	},
	{
	"epoch": 0.9245960502692998,
	"grad_norm": 0.6650304794311523,
	"learning_rate": 0.0004768850987432675,
	"loss": 6.9988,
	"step": 1030
	},
	{
	"epoch": 0.933572710951526,
	"grad_norm": 0.551717221736908,
	"learning_rate": 0.00047666068222621184,
	"loss": 6.5465,
	"step": 1040
	},
	{
	"epoch": 0.9425493716337523,
	"grad_norm": 0.4560067653656006,
	"learning_rate": 0.00047643626570915623,
	"loss": 6.5641,
	"step": 1050
	},
	{
	"epoch": 0.9515260323159784,
	"grad_norm": 0.4556948244571686,
	"learning_rate": 0.0004762118491921005,
	"loss": 6.6911,
	"step": 1060
	},
	{
	"epoch": 0.9605026929982047,
	"grad_norm": 0.8652740716934204,
	"learning_rate": 0.0004759874326750449,
	"loss": 6.7453,
	"step": 1070
	},
	{
	"epoch": 0.9694793536804309,
	"grad_norm": 0.32210618257522583,
	"learning_rate": 0.0004757630161579892,
	"loss": 6.5263,
	"step": 1080
	},
	{
	"epoch": 0.9784560143626571,
	"grad_norm": 1.9738398790359497,
	"learning_rate": 0.0004755385996409336,
	"loss": 6.4019,
	"step": 1090
	},
	{
	"epoch": 0.9874326750448833,
	"grad_norm": 0.31478866934776306,
	"learning_rate": 0.00047531418312387794,
	"loss": 6.266,
	"step": 1100
	},
	{
	"epoch": 0.9964093357271095,
	"grad_norm": 0.39359068870544434,
	"learning_rate": 0.0004750897666068223,
	"loss": 6.2422,
	"step": 1110
	},
	{
	"epoch": 1.0,
	"eval_loss": 5.025014400482178,
	"eval_runtime": 436.9889,
	"eval_samples_per_second": 10.197,
	"eval_steps_per_second": 1.275,
	"step": 1114
	},
	{
	"epoch": 1.0053859964093357,
	"grad_norm": 0.3087250888347626,
	"learning_rate": 0.0004748653500897666,
	"loss": 6.1059,
	"step": 1120
	},
	{
	"epoch": 1.014362657091562,
	"grad_norm": 0.4997764825820923,
	"learning_rate": 0.00047464093357271097,
	"loss": 6.1567,
	"step": 1130
	},
	{
	"epoch": 1.0233393177737882,
	"grad_norm": 0.4492017328739166,
	"learning_rate": 0.0004744165170556553,
	"loss": 6.0689,
	"step": 1140
	},
	{
	"epoch": 1.0323159784560143,
	"grad_norm": 0.35565611720085144,
	"learning_rate": 0.00047419210053859965,
	"loss": 5.9551,
	"step": 1150
	},
	{
	"epoch": 1.0412926391382407,
	"grad_norm": 0.28686025738716125,
	"learning_rate": 0.000473967684021544,
	"loss": 5.9306,
	"step": 1160
	},
	{
	"epoch": 1.0502692998204668,
	"grad_norm": 0.28098103404045105,
	"learning_rate": 0.00047374326750448834,
	"loss": 5.8205,
	"step": 1170
	},
	{
	"epoch": 1.059245960502693,
	"grad_norm": 0.3124157190322876,
	"learning_rate": 0.0004735188509874327,
	"loss": 5.7734,
	"step": 1180
	},
	{
	"epoch": 1.0682226211849193,
	"grad_norm": 0.27604150772094727,
	"learning_rate": 0.000473294434470377,
	"loss": 5.8549,
	"step": 1190
	},
	{
	"epoch": 1.0771992818671454,
	"grad_norm": 0.48105934262275696,
	"learning_rate": 0.00047307001795332136,
	"loss": 5.8208,
	"step": 1200
	},
	{
	"epoch": 1.0861759425493716,
	"grad_norm": 0.33073532581329346,
	"learning_rate": 0.0004728456014362657,
	"loss": 5.7798,
	"step": 1210
	},
	{
	"epoch": 1.095152603231598,
	"grad_norm": 0.24770517647266388,
	"learning_rate": 0.0004726211849192101,
	"loss": 5.6513,
	"step": 1220
	},
	{
	"epoch": 1.104129263913824,
	"grad_norm": 0.23116350173950195,
	"learning_rate": 0.0004723967684021544,
	"loss": 5.6458,
	"step": 1230
	},
	{
	"epoch": 1.1131059245960502,
	"grad_norm": 0.2757456302642822,
	"learning_rate": 0.0004721723518850988,
	"loss": 5.7592,
	"step": 1240
	},
	{
	"epoch": 1.1220825852782765,
	"grad_norm": 0.23286688327789307,
	"learning_rate": 0.00047194793536804307,
	"loss": 5.6889,
	"step": 1250
	},
	{
	"epoch": 1.1310592459605027,
	"grad_norm": 0.1967301219701767,
	"learning_rate": 0.00047172351885098747,
	"loss": 5.5865,
	"step": 1260
	},
	{
	"epoch": 1.140035906642729,
	"grad_norm": 0.22576653957366943,
	"learning_rate": 0.0004714991023339318,
	"loss": 5.4764,
	"step": 1270
	},
	{
	"epoch": 1.1490125673249552,
	"grad_norm": 0.217813640832901,
	"learning_rate": 0.00047127468581687615,
	"loss": 5.6309,
	"step": 1280
	},
	{
	"epoch": 1.1579892280071813,
	"grad_norm": 0.1798250824213028,
	"learning_rate": 0.0004710502692998205,
	"loss": 5.4452,
	"step": 1290
	},
	{
	"epoch": 1.1669658886894076,
	"grad_norm": 0.22210471332073212,
	"learning_rate": 0.0004708258527827648,
	"loss": 5.5905,
	"step": 1300
	},
	{
	"epoch": 1.1759425493716338,
	"grad_norm": 0.24236564338207245,
	"learning_rate": 0.0004706014362657092,
	"loss": 5.5106,
	"step": 1310
	},
	{
	"epoch": 1.18491921005386,
	"grad_norm": 0.205738365650177,
	"learning_rate": 0.00047037701974865346,
	"loss": 5.4863,
	"step": 1320
	},
	{
	"epoch": 1.1938958707360863,
	"grad_norm": 0.2275596708059311,
	"learning_rate": 0.00047015260323159786,
	"loss": 5.4782,
	"step": 1330
	},
	{
	"epoch": 1.2028725314183124,
	"grad_norm": 0.40637847781181335,
	"learning_rate": 0.0004699281867145422,
	"loss": 5.4103,
	"step": 1340
	},
	{
	"epoch": 1.2118491921005385,
	"grad_norm": 0.17678338289260864,
	"learning_rate": 0.00046970377019748654,
	"loss": 5.3858,
	"step": 1350
	},
	{
	"epoch": 1.220825852782765,
	"grad_norm": 0.1862853765487671,
	"learning_rate": 0.0004694793536804309,
	"loss": 5.379,
	"step": 1360
	},
	{
	"epoch": 1.229802513464991,
	"grad_norm": 0.12334032356739044,
	"learning_rate": 0.0004692549371633752,
	"loss": 5.396,
	"step": 1370
	},
	{
	"epoch": 1.2387791741472172,
	"grad_norm": 0.15632939338684082,
	"learning_rate": 0.00046903052064631957,
	"loss": 5.3853,
	"step": 1380
	},
	{
	"epoch": 1.2477558348294435,
	"grad_norm": 0.18021011352539062,
	"learning_rate": 0.00046880610412926396,
	"loss": 5.2905,
	"step": 1390
	},
	{
	"epoch": 1.2567324955116697,
	"grad_norm": 0.15651032328605652,
	"learning_rate": 0.00046858168761220825,
	"loss": 5.4102,
	"step": 1400
	},
	{
	"epoch": 1.2657091561938958,
	"grad_norm": 0.15990717709064484,
	"learning_rate": 0.00046835727109515265,
	"loss": 5.3213,
	"step": 1410
	},
	{
	"epoch": 1.2746858168761221,
	"grad_norm": 0.23683366179466248,
	"learning_rate": 0.00046813285457809694,
	"loss": 5.3596,
	"step": 1420
	},
	{
	"epoch": 1.2836624775583483,
	"grad_norm": 0.17186540365219116,
	"learning_rate": 0.0004679084380610413,
	"loss": 5.2734,
	"step": 1430
	},
	{
	"epoch": 1.2926391382405744,
	"grad_norm": 0.12084522843360901,
	"learning_rate": 0.0004676840215439857,
	"loss": 5.2741,
	"step": 1440
	},
	{
	"epoch": 1.3016157989228008,
	"grad_norm": 0.13929304480552673,
	"learning_rate": 0.00046745960502692996,
	"loss": 5.2734,
	"step": 1450
	},
	{
	"epoch": 1.310592459605027,
	"grad_norm": 0.22931580245494843,
	"learning_rate": 0.00046723518850987436,
	"loss": 5.2281,
	"step": 1460
	},
	{
	"epoch": 1.319569120287253,
	"grad_norm": 0.13986773788928986,
	"learning_rate": 0.00046701077199281865,
	"loss": 5.2185,
	"step": 1470
	},
	{
	"epoch": 1.3285457809694794,
	"grad_norm": 0.11496925354003906,
	"learning_rate": 0.00046678635547576304,
	"loss": 5.2082,
	"step": 1480
	},
	{
	"epoch": 1.3375224416517055,
	"grad_norm": 0.2594555616378784,
	"learning_rate": 0.00046656193895870733,
	"loss": 5.1917,
	"step": 1490
	},
	{
	"epoch": 1.3464991023339317,
	"grad_norm": 0.13332834839820862,
	"learning_rate": 0.0004663375224416517,
	"loss": 5.1701,
	"step": 1500
	},
	{
	"epoch": 1.355475763016158,
	"grad_norm": 0.1260669082403183,
	"learning_rate": 0.00046611310592459607,
	"loss": 5.1703,
	"step": 1510
	},
	{
	"epoch": 1.3644524236983842,
	"grad_norm": 0.17557017505168915,
	"learning_rate": 0.0004658886894075404,
	"loss": 5.1374,
	"step": 1520
	},
	{
	"epoch": 1.3734290843806103,
	"grad_norm": 0.1354808807373047,
	"learning_rate": 0.00046566427289048475,
	"loss": 5.1732,
	"step": 1530
	},
	{
	"epoch": 1.3824057450628366,
	"grad_norm": 0.16720908880233765,
	"learning_rate": 0.0004654398563734291,
	"loss": 5.3396,
	"step": 1540
	},
	{
	"epoch": 1.3913824057450628,
	"grad_norm": 0.19078396260738373,
	"learning_rate": 0.00046521543985637343,
	"loss": 5.1455,
	"step": 1550
	},
	{
	"epoch": 1.400359066427289,
	"grad_norm": 0.2168230563402176,
	"learning_rate": 0.00046499102333931783,
	"loss": 5.1026,
	"step": 1560
	},
	{
	"epoch": 1.4093357271095153,
	"grad_norm": 0.12317873537540436,
	"learning_rate": 0.0004647666068222621,
	"loss": 5.1632,
	"step": 1570
	},
	{
	"epoch": 1.4183123877917414,
	"grad_norm": 0.16298305988311768,
	"learning_rate": 0.00046454219030520646,
	"loss": 5.1489,
	"step": 1580
	},
	{
	"epoch": 1.4272890484739678,
	"grad_norm": 0.09502866864204407,
	"learning_rate": 0.0004643177737881508,
	"loss": 5.1068,
	"step": 1590
	},
	{
	"epoch": 1.436265709156194,
	"grad_norm": 0.15911273658275604,
	"learning_rate": 0.00046409335727109514,
	"loss": 5.0888,
	"step": 1600
	},
	{
	"epoch": 1.44524236983842,
	"grad_norm": 0.12198328226804733,
	"learning_rate": 0.00046386894075403954,
	"loss": 5.071,
	"step": 1610
	},
	{
	"epoch": 1.4542190305206464,
	"grad_norm": 0.11831381171941757,
	"learning_rate": 0.00046364452423698383,
	"loss": 5.0809,
	"step": 1620
	},
	{
	"epoch": 1.4631956912028725,
	"grad_norm": 0.1053285300731659,
	"learning_rate": 0.0004634201077199282,
	"loss": 5.0774,
	"step": 1630
	},
	{
	"epoch": 1.4721723518850989,
	"grad_norm": 0.1193586066365242,
	"learning_rate": 0.0004631956912028725,
	"loss": 5.0553,
	"step": 1640
	},
	{
	"epoch": 1.481149012567325,
	"grad_norm": 0.16306863725185394,
	"learning_rate": 0.0004629712746858169,
	"loss": 5.0607,
	"step": 1650
	},
	{
	"epoch": 1.4901256732495511,
	"grad_norm": 0.12861207127571106,
	"learning_rate": 0.0004627468581687612,
	"loss": 5.0656,
	"step": 1660
	},
	{
	"epoch": 1.4991023339317775,
	"grad_norm": 0.08006058633327484,
	"learning_rate": 0.0004625224416517056,
	"loss": 5.0515,
	"step": 1670
	},
	{
	"epoch": 1.5080789946140036,
	"grad_norm": 0.11404240876436234,
	"learning_rate": 0.00046229802513464993,
	"loss": 5.0098,
	"step": 1680
	},
	{
	"epoch": 1.5170556552962298,
	"grad_norm": 0.13075587153434753,
	"learning_rate": 0.0004620736086175943,
	"loss": 4.9911,
	"step": 1690
	},
	{
	"epoch": 1.5260323159784561,
	"grad_norm": 0.17212539911270142,
	"learning_rate": 0.0004618491921005386,
	"loss": 5.0541,
	"step": 1700
	},
	{
	"epoch": 1.5350089766606823,
	"grad_norm": 0.07674333453178406,
	"learning_rate": 0.00046162477558348296,
	"loss": 5.0126,
	"step": 1710
	},
	{
	"epoch": 1.5439856373429084,
	"grad_norm": 0.1121719628572464,
	"learning_rate": 0.0004614003590664273,
	"loss": 5.0082,
	"step": 1720
	},
	{
	"epoch": 1.5529622980251347,
	"grad_norm": 0.16214531660079956,
	"learning_rate": 0.00046117594254937164,
	"loss": 4.9905,
	"step": 1730
	},
	{
	"epoch": 1.5619389587073609,
	"grad_norm": 0.12353977560997009,
	"learning_rate": 0.000460951526032316,
	"loss": 4.9644,
	"step": 1740
	},
	{
	"epoch": 1.570915619389587,
	"grad_norm": 0.15267392992973328,
	"learning_rate": 0.0004607271095152603,
	"loss": 4.9708,
	"step": 1750
	},
	{
	"epoch": 1.5798922800718134,
	"grad_norm": 0.17361833155155182,
	"learning_rate": 0.00046050269299820467,
	"loss": 4.9869,
	"step": 1760
	},
	{
	"epoch": 1.5888689407540395,
	"grad_norm": 0.2920306622982025,
	"learning_rate": 0.000460278276481149,
	"loss": 4.9322,
	"step": 1770
	},
	{
	"epoch": 1.5978456014362656,
	"grad_norm": 0.09478717297315598,
	"learning_rate": 0.0004600538599640934,
	"loss": 4.9247,
	"step": 1780
	},
	{
	"epoch": 1.606822262118492,
	"grad_norm": 0.09164275228977203,
	"learning_rate": 0.0004598294434470377,
	"loss": 4.9086,
	"step": 1790
	},
	{
	"epoch": 1.6157989228007181,
	"grad_norm": 0.07962439954280853,
	"learning_rate": 0.0004596050269299821,
	"loss": 4.9412,
	"step": 1800
	},
	{
	"epoch": 1.6247755834829443,
	"grad_norm": 0.08752849698066711,
	"learning_rate": 0.0004593806104129264,
	"loss": 4.9291,
	"step": 1810
	},
	{
	"epoch": 1.6337522441651706,
	"grad_norm": 0.09293937683105469,
	"learning_rate": 0.0004591561938958708,
	"loss": 4.9652,
	"step": 1820
	},
	{
	"epoch": 1.6427289048473968,
	"grad_norm": 0.09523571282625198,
	"learning_rate": 0.00045893177737881506,
	"loss": 4.9137,
	"step": 1830
	},
	{
	"epoch": 1.6517055655296229,
	"grad_norm": 0.09075015783309937,
	"learning_rate": 0.00045870736086175946,
	"loss": 4.8925,
	"step": 1840
	},
	{
	"epoch": 1.6606822262118492,
	"grad_norm": 0.14088210463523865,
	"learning_rate": 0.0004584829443447038,
	"loss": 4.8941,
	"step": 1850
	},
	{
	"epoch": 1.6696588868940754,
	"grad_norm": 0.06859997659921646,
	"learning_rate": 0.0004582585278276481,
	"loss": 4.8731,
	"step": 1860
	},
	{
	"epoch": 1.6786355475763015,
	"grad_norm": 0.06676523387432098,
	"learning_rate": 0.0004580341113105925,
	"loss": 4.8615,
	"step": 1870
	},
	{
	"epoch": 1.6876122082585279,
	"grad_norm": 0.08721990138292313,
	"learning_rate": 0.00045780969479353677,
	"loss": 4.8847,
	"step": 1880
	},
	{
	"epoch": 1.696588868940754,
	"grad_norm": 0.08681096136569977,
	"learning_rate": 0.00045758527827648117,
	"loss": 4.884,
	"step": 1890
	},
	{
	"epoch": 1.7055655296229801,
	"grad_norm": 0.1754937767982483,
	"learning_rate": 0.0004573608617594255,
	"loss": 4.8663,
	"step": 1900
	},
	{
	"epoch": 1.7145421903052065,
	"grad_norm": 0.07060963660478592,
	"learning_rate": 0.00045713644524236985,
	"loss": 4.9142,
	"step": 1910
	},
	{
	"epoch": 1.7235188509874326,
	"grad_norm": 0.12035933881998062,
	"learning_rate": 0.0004569120287253142,
	"loss": 4.8599,
	"step": 1920
	},
	{
	"epoch": 1.7324955116696588,
	"grad_norm": 0.11212557554244995,
	"learning_rate": 0.00045668761220825853,
	"loss": 4.8899,
	"step": 1930
	},
	{
	"epoch": 1.7414721723518851,
	"grad_norm": 0.058452803641557693,
	"learning_rate": 0.0004564631956912029,
	"loss": 4.8454,
	"step": 1940
	},
	{
	"epoch": 1.7504488330341115,
	"grad_norm": 0.1073731780052185,
	"learning_rate": 0.0004562387791741472,
	"loss": 4.8546,
	"step": 1950
	},
	{
	"epoch": 1.7594254937163374,
	"grad_norm": 0.12025927007198334,
	"learning_rate": 0.00045601436265709156,
	"loss": 4.8446,
	"step": 1960
	},
	{
	"epoch": 1.7684021543985637,
	"grad_norm": 0.08838968724012375,
	"learning_rate": 0.00045578994614003596,
	"loss": 4.8625,
	"step": 1970
	},
	{
	"epoch": 1.77737881508079,
	"grad_norm": 0.0963386595249176,
	"learning_rate": 0.00045556552962298024,
	"loss": 4.8518,
	"step": 1980
	},
	{
	"epoch": 1.786355475763016,
	"grad_norm": 0.10317738354206085,
	"learning_rate": 0.00045534111310592464,
	"loss": 4.8161,
	"step": 1990
	},
	{
	"epoch": 1.7953321364452424,
	"grad_norm": 2.792144536972046,
	"learning_rate": 0.00045511669658886893,
	"loss": 4.8589,
	"step": 2000
	},
	{
	"epoch": 1.8043087971274687,
	"grad_norm": 0.08297235518693924,
	"learning_rate": 0.00045489228007181327,
	"loss": 4.8135,
	"step": 2010
	},
	{
	"epoch": 1.8132854578096946,
	"grad_norm": 0.080784372985363,
	"learning_rate": 0.00045466786355475767,
	"loss": 4.8436,
	"step": 2020
	},
	{
	"epoch": 1.822262118491921,
	"grad_norm": 0.08878181129693985,
	"learning_rate": 0.00045444344703770195,
	"loss": 4.8532,
	"step": 2030
	},
	{
	"epoch": 1.8312387791741473,
	"grad_norm": 0.0814034566283226,
	"learning_rate": 0.00045421903052064635,
	"loss": 4.7992,
	"step": 2040
	},
	{
	"epoch": 1.8402154398563735,
	"grad_norm": 0.05908200889825821,
	"learning_rate": 0.00045399461400359064,
	"loss": 4.8005,
	"step": 2050
	},
	{
	"epoch": 1.8491921005385996,
	"grad_norm": 0.06837856769561768,
	"learning_rate": 0.00045377019748653503,
	"loss": 4.802,
	"step": 2060
	},
	{
	"epoch": 1.858168761220826,
	"grad_norm": 0.06775591522455215,
	"learning_rate": 0.0004535457809694794,
	"loss": 4.7847,
	"step": 2070
	},
	{
	"epoch": 1.867145421903052,
	"grad_norm": 0.27018266916275024,
	"learning_rate": 0.0004533213644524237,
	"loss": 4.7918,
	"step": 2080
	},
	{
	"epoch": 1.8761220825852782,
	"grad_norm": 0.21435914933681488,
	"learning_rate": 0.00045309694793536806,
	"loss": 4.8033,
	"step": 2090
	},
	{
	"epoch": 1.8850987432675046,
	"grad_norm": 0.07224582880735397,
	"learning_rate": 0.0004528725314183124,
	"loss": 4.7639,
	"step": 2100
	},
	{
	"epoch": 1.8940754039497307,
	"grad_norm": 0.08708648383617401,
	"learning_rate": 0.00045264811490125674,
	"loss": 4.7894,
	"step": 2110
	},
	{
	"epoch": 1.9030520646319569,
	"grad_norm": 0.08637712150812149,
	"learning_rate": 0.0004524236983842011,
	"loss": 4.7745,
	"step": 2120
	},
	{
	"epoch": 1.9120287253141832,
	"grad_norm": 0.06233949586749077,
	"learning_rate": 0.0004521992818671454,
	"loss": 4.783,
	"step": 2130
	},
	{
	"epoch": 1.9210053859964094,
	"grad_norm": 0.07999356091022491,
	"learning_rate": 0.0004519748653500898,
	"loss": 4.7585,
	"step": 2140
	},
	{
	"epoch": 1.9299820466786355,
	"grad_norm": 0.09440754354000092,
	"learning_rate": 0.0004517504488330341,
	"loss": 4.7653,
	"step": 2150
	},
	{
	"epoch": 1.9389587073608618,
	"grad_norm": 0.09272520244121552,
	"learning_rate": 0.00045152603231597845,
	"loss": 4.7523,
	"step": 2160
	},
	{
	"epoch": 1.947935368043088,
	"grad_norm": 0.08041410148143768,
	"learning_rate": 0.0004513016157989228,
	"loss": 4.7593,
	"step": 2170
	},
	{
	"epoch": 1.9569120287253141,
	"grad_norm": 0.048107001930475235,
	"learning_rate": 0.00045107719928186714,
	"loss": 4.7392,
	"step": 2180
	},
	{
	"epoch": 1.9658886894075405,
	"grad_norm": 0.07445549219846725,
	"learning_rate": 0.00045085278276481153,
	"loss": 4.7288,
	"step": 2190
	},
	{
	"epoch": 1.9748653500897666,
	"grad_norm": 0.06540877372026443,
	"learning_rate": 0.0004506283662477558,
	"loss": 4.7311,
	"step": 2200
	},
	{
	"epoch": 1.9838420107719927,
	"grad_norm": 0.05422632023692131,
	"learning_rate": 0.0004504039497307002,
	"loss": 4.728,
	"step": 2210
	},
	{
	"epoch": 1.992818671454219,
	"grad_norm": 0.05353199318051338,
	"learning_rate": 0.0004501795332136445,
	"loss": 4.7274,
	"step": 2220
	},
	{
	"epoch": 2.0,
	"eval_loss": 4.539531707763672,
	"eval_runtime": 437.126,
	"eval_samples_per_second": 10.194,
	"eval_steps_per_second": 1.274,
	"step": 2228
	},
	{
	"epoch": 2.0017953321364454,
	"grad_norm": 0.05564208701252937,
	"learning_rate": 0.0004499551166965889,
	"loss": 4.7397,
	"step": 2230
	},
	{
	"epoch": 2.0107719928186714,
	"grad_norm": 0.05997077003121376,
	"learning_rate": 0.00044973070017953324,
	"loss": 4.714,
	"step": 2240
	},
	{
	"epoch": 2.0197486535008977,
	"grad_norm": 0.0496087446808815,
	"learning_rate": 0.0004495062836624776,
	"loss": 4.7431,
	"step": 2250
	},
	{
	"epoch": 2.028725314183124,
	"grad_norm": 0.08797594904899597,
	"learning_rate": 0.0004492818671454219,
	"loss": 4.7186,
	"step": 2260
	},
	{
	"epoch": 2.03770197486535,
	"grad_norm": 0.05270407721400261,
	"learning_rate": 0.00044905745062836627,
	"loss": 4.7419,
	"step": 2270
	},
	{
	"epoch": 2.0466786355475763,
	"grad_norm": 0.06538320332765579,
	"learning_rate": 0.0004488330341113106,
	"loss": 4.714,
	"step": 2280
	},
	{
	"epoch": 2.0556552962298027,
	"grad_norm": 0.060536161065101624,
	"learning_rate": 0.0004486086175942549,
	"loss": 4.691,
	"step": 2290
	},
	{
	"epoch": 2.0646319569120286,
	"grad_norm": 0.10158341377973557,
	"learning_rate": 0.0004483842010771993,
	"loss": 4.702,
	"step": 2300
	},
	{
	"epoch": 2.073608617594255,
	"grad_norm": 0.08171387016773224,
	"learning_rate": 0.00044815978456014363,
	"loss": 4.7029,
	"step": 2310
	},
	{
	"epoch": 2.0825852782764813,
	"grad_norm": 0.07701843976974487,
	"learning_rate": 0.000447935368043088,
	"loss": 4.6957,
	"step": 2320
	},
	{
	"epoch": 2.0915619389587072,
	"grad_norm": 0.06302302330732346,
	"learning_rate": 0.0004477109515260323,
	"loss": 4.6855,
	"step": 2330
	},
	{
	"epoch": 2.1005385996409336,
	"grad_norm": 0.12679466605186462,
	"learning_rate": 0.00044748653500897666,
	"loss": 4.7147,
	"step": 2340
	},
	{
	"epoch": 2.10951526032316,
	"grad_norm": 0.17339470982551575,
	"learning_rate": 0.000447262118491921,
	"loss": 4.6697,
	"step": 2350
	},
	{
	"epoch": 2.118491921005386,
	"grad_norm": 0.07397322356700897,
	"learning_rate": 0.0004470377019748654,
	"loss": 4.6642,
	"step": 2360
	},
	{
	"epoch": 2.127468581687612,
	"grad_norm": 0.0524037629365921,
	"learning_rate": 0.0004468132854578097,
	"loss": 4.6511,
	"step": 2370
	},
	{
	"epoch": 2.1364452423698386,
	"grad_norm": 0.06674987077713013,
	"learning_rate": 0.0004465888689407541,
	"loss": 4.6374,
	"step": 2380
	},
	{
	"epoch": 2.1454219030520645,
	"grad_norm": 0.04827852547168732,
	"learning_rate": 0.00044636445242369837,
	"loss": 4.6531,
	"step": 2390
	},
	{
	"epoch": 2.154398563734291,
	"grad_norm": 0.05094282329082489,
	"learning_rate": 0.00044614003590664277,
	"loss": 4.6554,
	"step": 2400
	},
	{
	"epoch": 2.163375224416517,
	"grad_norm": 0.0653914213180542,
	"learning_rate": 0.00044591561938958705,
	"loss": 4.6568,
	"step": 2410
	},
	{
	"epoch": 2.172351885098743,
	"grad_norm": 0.06652519851922989,
	"learning_rate": 0.00044569120287253145,
	"loss": 4.6533,
	"step": 2420
	},
	{
	"epoch": 2.1813285457809695,
	"grad_norm": 0.051527008414268494,
	"learning_rate": 0.0004454667863554758,
	"loss": 4.6438,
	"step": 2430
	},
	{
	"epoch": 2.190305206463196,
	"grad_norm": 0.047185543924570084,
	"learning_rate": 0.0004452423698384201,
	"loss": 4.6157,
	"step": 2440
	},
	{
	"epoch": 2.1992818671454217,
	"grad_norm": 0.0524996742606163,
	"learning_rate": 0.0004450179533213645,
	"loss": 4.6583,
	"step": 2450
	},
	{
	"epoch": 2.208258527827648,
	"grad_norm": 0.055864688009023666,
	"learning_rate": 0.00044479353680430876,
	"loss": 4.611,
	"step": 2460
	},
	{
	"epoch": 2.2172351885098744,
	"grad_norm": 0.055937688797712326,
	"learning_rate": 0.00044456912028725316,
	"loss": 4.6302,
	"step": 2470
	},
	{
	"epoch": 2.2262118491921004,
	"grad_norm": 0.07318311929702759,
	"learning_rate": 0.0004443447037701975,
	"loss": 4.6383,
	"step": 2480
	},
	{
	"epoch": 2.2351885098743267,
	"grad_norm": 0.05302512273192406,
	"learning_rate": 0.00044412028725314184,
	"loss": 4.6142,
	"step": 2490
	},
	{
	"epoch": 2.244165170556553,
	"grad_norm": 0.050843581557273865,
	"learning_rate": 0.0004438958707360862,
	"loss": 4.5937,
	"step": 2500
	},
	{
	"epoch": 2.253141831238779,
	"grad_norm": 0.0519312284886837,
	"learning_rate": 0.0004436714542190305,
	"loss": 4.6083,
	"step": 2510
	},
	{
	"epoch": 2.2621184919210053,
	"grad_norm": 0.05857894569635391,
	"learning_rate": 0.00044344703770197487,
	"loss": 4.5765,
	"step": 2520
	},
	{
	"epoch": 2.2710951526032317,
	"grad_norm": 0.05550041422247887,
	"learning_rate": 0.00044322262118491926,
	"loss": 4.5859,
	"step": 2530
	},
	{
	"epoch": 2.280071813285458,
	"grad_norm": 0.10349979996681213,
	"learning_rate": 0.00044299820466786355,
	"loss": 4.5765,
	"step": 2540
	},
	{
	"epoch": 2.289048473967684,
	"grad_norm": 0.1185607761144638,
	"learning_rate": 0.00044277378815080795,
	"loss": 4.5946,
	"step": 2550
	},
	{
	"epoch": 2.2980251346499103,
	"grad_norm": 0.09133188426494598,
	"learning_rate": 0.00044254937163375224,
	"loss": 4.578,
	"step": 2560
	},
	{
	"epoch": 2.3070017953321367,
	"grad_norm": 0.08713024109601974,
	"learning_rate": 0.00044232495511669663,
	"loss": 4.6011,
	"step": 2570
	},
	{
	"epoch": 2.3159784560143626,
	"grad_norm": 0.05465725436806679,
	"learning_rate": 0.0004421005385996409,
	"loss": 4.5755,
	"step": 2580
	},
	{
	"epoch": 2.324955116696589,
	"grad_norm": 0.056493621319532394,
	"learning_rate": 0.00044187612208258526,
	"loss": 4.5855,
	"step": 2590
	},
	{
	"epoch": 2.3339317773788153,
	"grad_norm": 0.047107528895139694,
	"learning_rate": 0.00044165170556552966,
	"loss": 4.5681,
	"step": 2600
	},
	{
	"epoch": 2.342908438061041,
	"grad_norm": 0.05533495545387268,
	"learning_rate": 0.00044142728904847394,
	"loss": 4.5581,
	"step": 2610
	},
	{
	"epoch": 2.3518850987432676,
	"grad_norm": 0.0478278249502182,
	"learning_rate": 0.00044120287253141834,
	"loss": 4.5425,
	"step": 2620
	},
	{
	"epoch": 2.360861759425494,
	"grad_norm": 0.06553395092487335,
	"learning_rate": 0.00044097845601436263,
	"loss": 4.5484,
	"step": 2630
	},
	{
	"epoch": 2.36983842010772,
	"grad_norm": 0.07375505566596985,
	"learning_rate": 0.000440754039497307,
	"loss": 4.541,
	"step": 2640
	},
	{
	"epoch": 2.378815080789946,
	"grad_norm": 0.20693852007389069,
	"learning_rate": 0.00044052962298025137,
	"loss": 4.5521,
	"step": 2650
	},
	{
	"epoch": 2.3877917414721725,
	"grad_norm": 0.056829433888196945,
	"learning_rate": 0.0004403052064631957,
	"loss": 4.5588,
	"step": 2660
	},
	{
	"epoch": 2.3967684021543985,
	"grad_norm": 0.05583564192056656,
	"learning_rate": 0.00044008078994614005,
	"loss": 4.5358,
	"step": 2670
	},
	{
	"epoch": 2.405745062836625,
	"grad_norm": 0.07319542020559311,
	"learning_rate": 0.0004398563734290844,
	"loss": 4.523,
	"step": 2680
	},
	{
	"epoch": 2.414721723518851,
	"grad_norm": 0.052402835339307785,
	"learning_rate": 0.00043963195691202873,
	"loss": 4.5096,
	"step": 2690
	},
	{
	"epoch": 2.423698384201077,
	"grad_norm": 0.05206010863184929,
	"learning_rate": 0.00043940754039497313,
	"loss": 4.5053,
	"step": 2700
	},
	{
	"epoch": 2.4326750448833034,
	"grad_norm": 0.05443358048796654,
	"learning_rate": 0.0004391831238779174,
	"loss": 4.5501,
	"step": 2710
	},
	{
	"epoch": 2.44165170556553,
	"grad_norm": 0.07843279093503952,
	"learning_rate": 0.00043895870736086176,
	"loss": 4.5027,
	"step": 2720
	},
	{
	"epoch": 2.4506283662477557,
	"grad_norm": 0.046305350959300995,
	"learning_rate": 0.0004387342908438061,
	"loss": 4.4975,
	"step": 2730
	},
	{
	"epoch": 2.459605026929982,
	"grad_norm": 0.22592291235923767,
	"learning_rate": 0.00043850987432675044,
	"loss": 4.5183,
	"step": 2740
	},
	{
	"epoch": 2.4685816876122084,
	"grad_norm": 0.05082382634282112,
	"learning_rate": 0.0004382854578096948,
	"loss": 4.4864,
	"step": 2750
	},
	{
	"epoch": 2.4775583482944343,
	"grad_norm": 0.06731193512678146,
	"learning_rate": 0.00043806104129263913,
	"loss": 4.4982,
	"step": 2760
	},
	{
	"epoch": 2.4865350089766607,
	"grad_norm": 185.7747039794922,
	"learning_rate": 0.0004378366247755835,
	"loss": 4.6692,
	"step": 2770
	},
	{
	"epoch": 2.495511669658887,
	"grad_norm": 0.058124568313360214,
	"learning_rate": 0.0004376122082585278,
	"loss": 4.4775,
	"step": 2780
	},
	{
	"epoch": 2.504488330341113,
	"grad_norm": 0.08968983590602875,
	"learning_rate": 0.0004373877917414722,
	"loss": 4.4944,
	"step": 2790
	},
	{
	"epoch": 2.5134649910233393,
	"grad_norm": 0.2788603901863098,
	"learning_rate": 0.0004371633752244165,
	"loss": 4.503,
	"step": 2800
	},
	{
	"epoch": 2.5224416517055657,
	"grad_norm": 0.05559522658586502,
	"learning_rate": 0.0004369389587073609,
	"loss": 4.4733,
	"step": 2810
	},
	{
	"epoch": 2.5314183123877916,
	"grad_norm": 0.05935097113251686,
	"learning_rate": 0.00043671454219030523,
	"loss": 4.4686,
	"step": 2820
	},
	{
	"epoch": 2.540394973070018,
	"grad_norm": 0.05860767886042595,
	"learning_rate": 0.0004364901256732496,
	"loss": 4.4593,
	"step": 2830
	},
	{
	"epoch": 2.5493716337522443,
	"grad_norm": 0.047259800136089325,
	"learning_rate": 0.0004362657091561939,
	"loss": 4.4479,
	"step": 2840
	},
	{
	"epoch": 2.55834829443447,
	"grad_norm": 0.04901234060525894,
	"learning_rate": 0.00043604129263913826,
	"loss": 4.4621,
	"step": 2850
	},
	{
	"epoch": 2.5673249551166966,
	"grad_norm": 0.05742761120200157,
	"learning_rate": 0.0004358168761220826,
	"loss": 4.4422,
	"step": 2860
	},
	{
	"epoch": 2.576301615798923,
	"grad_norm": 0.05717416852712631,
	"learning_rate": 0.0004355924596050269,
	"loss": 4.4248,
	"step": 2870
	},
	{
	"epoch": 2.585278276481149,
	"grad_norm": 0.0896502435207367,
	"learning_rate": 0.0004353680430879713,
	"loss": 4.4368,
	"step": 2880
	},
	{
	"epoch": 2.594254937163375,
	"grad_norm": 0.08746081590652466,
	"learning_rate": 0.0004351436265709156,
	"loss": 4.4282,
	"step": 2890
	},
	{
	"epoch": 2.6032315978456015,
	"grad_norm": 0.07144750654697418,
	"learning_rate": 0.00043491921005385997,
	"loss": 4.4794,
	"step": 2900
	},
	{
	"epoch": 2.6122082585278275,
	"grad_norm": 0.05990668013691902,
	"learning_rate": 0.0004346947935368043,
	"loss": 4.4117,
	"step": 2910
	},
	{
	"epoch": 2.621184919210054,
	"grad_norm": 0.07920947670936584,
	"learning_rate": 0.00043447037701974865,
	"loss": 4.4179,
	"step": 2920
	},
	{
	"epoch": 2.63016157989228,
	"grad_norm": 0.053824532777071,
	"learning_rate": 0.000434245960502693,
	"loss": 4.3963,
	"step": 2930
	},
	{
	"epoch": 2.639138240574506,
	"grad_norm": 0.06394129246473312,
	"learning_rate": 0.0004340215439856374,
	"loss": 4.4045,
	"step": 2940
	},
	{
	"epoch": 2.6481149012567324,
	"grad_norm": 0.2640804648399353,
	"learning_rate": 0.0004337971274685817,
	"loss": 4.3916,
	"step": 2950
	},
	{
	"epoch": 2.657091561938959,
	"grad_norm": 0.04887564107775688,
	"learning_rate": 0.0004335727109515261,
	"loss": 4.3892,
	"step": 2960
	},
	{
	"epoch": 2.6660682226211847,
	"grad_norm": 0.05104290321469307,
	"learning_rate": 0.00043334829443447036,
	"loss": 4.3883,
	"step": 2970
	},
	{
	"epoch": 2.675044883303411,
	"grad_norm": 0.18991751968860626,
	"learning_rate": 0.00043312387791741476,
	"loss": 4.3747,
	"step": 2980
	},
	{
	"epoch": 2.6840215439856374,
	"grad_norm": 0.3262752294540405,
	"learning_rate": 0.0004328994614003591,
	"loss": 4.3755,
	"step": 2990
	},
	{
	"epoch": 2.6929982046678633,
	"grad_norm": 0.6619095802307129,
	"learning_rate": 0.0004326750448833034,
	"loss": 4.3711,
	"step": 3000
	},
	{
	"epoch": 2.7019748653500897,
	"grad_norm": 0.06734511256217957,
	"learning_rate": 0.0004324506283662478,
	"loss": 4.3606,
	"step": 3010
	},
	{
	"epoch": 2.710951526032316,
	"grad_norm": 0.06055251508951187,
	"learning_rate": 0.00043222621184919207,
	"loss": 4.3639,
	"step": 3020
	},
	{
	"epoch": 2.719928186714542,
	"grad_norm": 0.08325715363025665,
	"learning_rate": 0.00043200179533213647,
	"loss": 4.3439,
	"step": 3030
	},
	{
	"epoch": 2.7289048473967683,
	"grad_norm": 0.06473597139120102,
	"learning_rate": 0.00043177737881508075,
	"loss": 4.3448,
	"step": 3040
	},
	{
	"epoch": 2.7378815080789947,
	"grad_norm": 0.062395766377449036,
	"learning_rate": 0.00043155296229802515,
	"loss": 4.3262,
	"step": 3050
	},
	{
	"epoch": 2.7468581687612206,
	"grad_norm": 0.054201096296310425,
	"learning_rate": 0.0004313285457809695,
	"loss": 4.3466,
	"step": 3060
	},
	{
	"epoch": 2.755834829443447,
	"grad_norm": 0.05278482288122177,
	"learning_rate": 0.00043110412926391383,
	"loss": 4.3493,
	"step": 3070
	},
	{
	"epoch": 2.7648114901256733,
	"grad_norm": 0.05334211513400078,
	"learning_rate": 0.0004308797127468582,
	"loss": 4.3308,
	"step": 3080
	},
	{
	"epoch": 2.773788150807899,
	"grad_norm": 0.06164594739675522,
	"learning_rate": 0.0004306552962298025,
	"loss": 4.3166,
	"step": 3090
	},
	{
	"epoch": 2.7827648114901256,
	"grad_norm": 0.07043807953596115,
	"learning_rate": 0.00043043087971274686,
	"loss": 4.3009,
	"step": 3100
	},
	{
	"epoch": 2.791741472172352,
	"grad_norm": 0.05904858186841011,
	"learning_rate": 0.00043020646319569126,
	"loss": 4.2982,
	"step": 3110
	},
	{
	"epoch": 2.800718132854578,
	"grad_norm": 0.3487374782562256,
	"learning_rate": 0.00042998204667863554,
	"loss": 4.2992,
	"step": 3120
	},
	{
	"epoch": 2.809694793536804,
	"grad_norm": 0.06090310215950012,
	"learning_rate": 0.00042975763016157994,
	"loss": 4.3199,
	"step": 3130
	},
	{
	"epoch": 2.8186714542190305,
	"grad_norm": 0.0674201026558876,
	"learning_rate": 0.0004295332136445242,
	"loss": 4.2813,
	"step": 3140
	},
	{
	"epoch": 2.827648114901257,
	"grad_norm": 0.0564940869808197,
	"learning_rate": 0.00042930879712746857,
	"loss": 4.2875,
	"step": 3150
	},
	{
	"epoch": 2.836624775583483,
	"grad_norm": 0.08277291059494019,
	"learning_rate": 0.00042908438061041297,
	"loss": 4.2741,
	"step": 3160
	},
	{
	"epoch": 2.845601436265709,
	"grad_norm": 0.05882051959633827,
	"learning_rate": 0.00042885996409335725,
	"loss": 4.2606,
	"step": 3170
	},
	{
	"epoch": 2.8545780969479355,
	"grad_norm": 0.056912124156951904,
	"learning_rate": 0.00042863554757630165,
	"loss": 4.2387,
	"step": 3180
	},
	{
	"epoch": 2.8635547576301614,
	"grad_norm": 0.06803829967975616,
	"learning_rate": 0.00042841113105924594,
	"loss": 4.2595,
	"step": 3190
	},
	{
	"epoch": 2.872531418312388,
	"grad_norm": 0.38242146372795105,
	"learning_rate": 0.00042818671454219033,
	"loss": 4.252,
	"step": 3200
	},
	{
	"epoch": 2.881508078994614,
	"grad_norm": 0.06552311778068542,
	"learning_rate": 0.0004279622980251346,
	"loss": 4.2386,
	"step": 3210
	},
	{
	"epoch": 2.89048473967684,
	"grad_norm": 0.06190953776240349,
	"learning_rate": 0.000427737881508079,
	"loss": 4.2279,
	"step": 3220
	},
	{
	"epoch": 2.8994614003590664,
	"grad_norm": 0.07202804833650589,
	"learning_rate": 0.00042751346499102336,
	"loss": 4.2397,
	"step": 3230
	},
	{
	"epoch": 2.9084380610412928,
	"grad_norm": 0.06415878981351852,
	"learning_rate": 0.0004272890484739677,
	"loss": 4.2124,
	"step": 3240
	},
	{
	"epoch": 2.917414721723519,
	"grad_norm": 0.06290468573570251,
	"learning_rate": 0.00042706463195691204,
	"loss": 4.2267,
	"step": 3250
	},
	{
	"epoch": 2.926391382405745,
	"grad_norm": 0.05975602567195892,
	"learning_rate": 0.0004268402154398564,
	"loss": 4.21,
	"step": 3260
	},
	{
	"epoch": 2.9353680430879714,
	"grad_norm": 0.08022774755954742,
	"learning_rate": 0.0004266157989228007,
	"loss": 4.2127,
	"step": 3270
	},
	{
	"epoch": 2.9443447037701977,
	"grad_norm": 0.11041318625211716,
	"learning_rate": 0.0004263913824057451,
	"loss": 4.1903,
	"step": 3280
	},
	{
	"epoch": 2.9533213644524237,
	"grad_norm": 0.06093136593699455,
	"learning_rate": 0.0004261669658886894,
	"loss": 4.1882,
	"step": 3290
	},
	{
	"epoch": 2.96229802513465,
	"grad_norm": 0.060306135565042496,
	"learning_rate": 0.00042594254937163375,
	"loss": 4.1874,
	"step": 3300
	},
	{
	"epoch": 2.9712746858168764,
	"grad_norm": 0.0592743381857872,
	"learning_rate": 0.0004257181328545781,
	"loss": 4.1881,
	"step": 3310
	},
	{
	"epoch": 2.9802513464991023,
	"grad_norm": 0.06113787367939949,
	"learning_rate": 0.00042549371633752244,
	"loss": 4.1788,
	"step": 3320
	},
	{
	"epoch": 2.9892280071813286,
	"grad_norm": 0.0978228747844696,
	"learning_rate": 0.0004252692998204668,
	"loss": 4.1714,
	"step": 3330
	},
	{
	"epoch": 2.998204667863555,
	"grad_norm": 0.06704937666654587,
	"learning_rate": 0.0004250448833034111,
	"loss": 4.1375,
	"step": 3340
	},
	{
	"epoch": 3.0,
	"eval_loss": 3.9158332347869873,
	"eval_runtime": 437.3013,
	"eval_samples_per_second": 10.19,
	"eval_steps_per_second": 1.274,
	"step": 3342
	}
	],
	"logging_steps": 10,
	"max_steps": 22280,
	"num_input_tokens_seen": 0,
	"num_train_epochs": 20,
	"save_steps": 500,
	"stateful_callbacks": {
	"EarlyStoppingCallback": {
	"args": {
	"early_stopping_patience": 3,
	"early_stopping_threshold": 0.0
	},
	"attributes": {
	"early_stopping_patience_counter": 0
	}
	},
	"TrainerControl": {
	"args": {
	"should_epoch_stop": false,
	"should_evaluate": false,
	"should_log": false,
	"should_save": true,
	"should_training_stop": false
	},
	"attributes": {}
	}
	},
	"total_flos": 3.661123660598477e+16,
	"train_batch_size": 8,
	"trial_name": null,
	"trial_params": null
	}