{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.999849510910459,
"eval_steps": 1000,
"global_step": 3322,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0030097817908201654,
"grad_norm": 0.5666791796684265,
"learning_rate": 6.006006006006006e-06,
"loss": 1.0909,
"step": 10
},
{
"epoch": 0.006019563581640331,
"grad_norm": 0.5926806926727295,
"learning_rate": 1.2012012012012012e-05,
"loss": 1.0813,
"step": 20
},
{
"epoch": 0.009029345372460496,
"grad_norm": 0.5680918097496033,
"learning_rate": 1.801801801801802e-05,
"loss": 1.051,
"step": 30
},
{
"epoch": 0.012039127163280662,
"grad_norm": 0.4019787907600403,
"learning_rate": 2.4024024024024024e-05,
"loss": 0.9406,
"step": 40
},
{
"epoch": 0.015048908954100828,
"grad_norm": 0.289296418428421,
"learning_rate": 3.0030030030030033e-05,
"loss": 0.9232,
"step": 50
},
{
"epoch": 0.01805869074492099,
"grad_norm": 0.23350511491298676,
"learning_rate": 3.603603603603604e-05,
"loss": 0.8762,
"step": 60
},
{
"epoch": 0.021068472535741158,
"grad_norm": 0.17287714779376984,
"learning_rate": 4.204204204204204e-05,
"loss": 0.8496,
"step": 70
},
{
"epoch": 0.024078254326561323,
"grad_norm": 0.16750749945640564,
"learning_rate": 4.804804804804805e-05,
"loss": 0.8424,
"step": 80
},
{
"epoch": 0.02708803611738149,
"grad_norm": 0.15370824933052063,
"learning_rate": 5.405405405405406e-05,
"loss": 0.8601,
"step": 90
},
{
"epoch": 0.030097817908201655,
"grad_norm": 0.16766728460788727,
"learning_rate": 6.0060060060060066e-05,
"loss": 0.8179,
"step": 100
},
{
"epoch": 0.03310759969902182,
"grad_norm": 0.2106652557849884,
"learning_rate": 6.606606606606607e-05,
"loss": 0.8094,
"step": 110
},
{
"epoch": 0.03611738148984198,
"grad_norm": 0.22345463931560516,
"learning_rate": 7.207207207207208e-05,
"loss": 0.8096,
"step": 120
},
{
"epoch": 0.03912716328066215,
"grad_norm": 0.23186658322811127,
"learning_rate": 7.807807807807808e-05,
"loss": 0.8195,
"step": 130
},
{
"epoch": 0.042136945071482315,
"grad_norm": 0.3011253774166107,
"learning_rate": 8.408408408408409e-05,
"loss": 0.8184,
"step": 140
},
{
"epoch": 0.045146726862302484,
"grad_norm": 0.21401682496070862,
"learning_rate": 9.009009009009009e-05,
"loss": 0.8,
"step": 150
},
{
"epoch": 0.04815650865312265,
"grad_norm": 0.2243528664112091,
"learning_rate": 9.60960960960961e-05,
"loss": 0.7739,
"step": 160
},
{
"epoch": 0.051166290443942816,
"grad_norm": 0.46177345514297485,
"learning_rate": 0.00010210210210210212,
"loss": 0.8172,
"step": 170
},
{
"epoch": 0.05417607223476298,
"grad_norm": 0.2084399163722992,
"learning_rate": 0.00010810810810810812,
"loss": 0.8181,
"step": 180
},
{
"epoch": 0.05718585402558315,
"grad_norm": 0.24123747646808624,
"learning_rate": 0.00011411411411411413,
"loss": 0.8032,
"step": 190
},
{
"epoch": 0.06019563581640331,
"grad_norm": 0.23546789586544037,
"learning_rate": 0.00012012012012012013,
"loss": 0.7775,
"step": 200
},
{
"epoch": 0.06320541760722348,
"grad_norm": 0.23672956228256226,
"learning_rate": 0.00012612612612612612,
"loss": 0.821,
"step": 210
},
{
"epoch": 0.06621519939804364,
"grad_norm": 0.22630153596401215,
"learning_rate": 0.00013213213213213214,
"loss": 0.7892,
"step": 220
},
{
"epoch": 0.0692249811888638,
"grad_norm": 0.2221691757440567,
"learning_rate": 0.00013813813813813813,
"loss": 0.7932,
"step": 230
},
{
"epoch": 0.07223476297968397,
"grad_norm": 0.2853372097015381,
"learning_rate": 0.00014414414414414415,
"loss": 0.7993,
"step": 240
},
{
"epoch": 0.07524454477050414,
"grad_norm": 0.24529512226581573,
"learning_rate": 0.00015015015015015014,
"loss": 0.8281,
"step": 250
},
{
"epoch": 0.0782543265613243,
"grad_norm": 0.20747515559196472,
"learning_rate": 0.00015615615615615616,
"loss": 0.7695,
"step": 260
},
{
"epoch": 0.08126410835214447,
"grad_norm": 0.21736431121826172,
"learning_rate": 0.00016216216216216218,
"loss": 0.7863,
"step": 270
},
{
"epoch": 0.08427389014296463,
"grad_norm": 0.25259608030319214,
"learning_rate": 0.00016816816816816817,
"loss": 0.7616,
"step": 280
},
{
"epoch": 0.0872836719337848,
"grad_norm": 0.19296181201934814,
"learning_rate": 0.0001741741741741742,
"loss": 0.7707,
"step": 290
},
{
"epoch": 0.09029345372460497,
"grad_norm": 0.26890674233436584,
"learning_rate": 0.00018018018018018018,
"loss": 0.7744,
"step": 300
},
{
"epoch": 0.09330323551542513,
"grad_norm": 0.2053564041852951,
"learning_rate": 0.0001861861861861862,
"loss": 0.743,
"step": 310
},
{
"epoch": 0.0963130173062453,
"grad_norm": 0.20555047690868378,
"learning_rate": 0.0001921921921921922,
"loss": 0.8089,
"step": 320
},
{
"epoch": 0.09932279909706546,
"grad_norm": 0.2425510585308075,
"learning_rate": 0.0001981981981981982,
"loss": 0.8012,
"step": 330
},
{
"epoch": 0.10233258088788563,
"grad_norm": 0.2260735034942627,
"learning_rate": 0.00019999729347501484,
"loss": 0.7843,
"step": 340
},
{
"epoch": 0.1053423626787058,
"grad_norm": 0.21512670814990997,
"learning_rate": 0.0001999840373787939,
"loss": 0.7992,
"step": 350
},
{
"epoch": 0.10835214446952596,
"grad_norm": 0.22542227804660797,
"learning_rate": 0.0001999597360570722,
"loss": 0.772,
"step": 360
},
{
"epoch": 0.11136192626034612,
"grad_norm": 0.15299277007579803,
"learning_rate": 0.0001999243921944139,
"loss": 0.7511,
"step": 370
},
{
"epoch": 0.1143717080511663,
"grad_norm": 0.27176716923713684,
"learning_rate": 0.00019987800969525164,
"loss": 0.7735,
"step": 380
},
{
"epoch": 0.11738148984198646,
"grad_norm": 0.28357240557670593,
"learning_rate": 0.00019982059368345496,
"loss": 0.7729,
"step": 390
},
{
"epoch": 0.12039127163280662,
"grad_norm": 0.18754735589027405,
"learning_rate": 0.00019975215050176433,
"loss": 0.795,
"step": 400
},
{
"epoch": 0.12340105342362678,
"grad_norm": 0.15982478857040405,
"learning_rate": 0.00019967268771109035,
"loss": 0.7671,
"step": 410
},
{
"epoch": 0.12641083521444696,
"grad_norm": 0.18540535867214203,
"learning_rate": 0.00019958221408967875,
"loss": 0.7491,
"step": 420
},
{
"epoch": 0.1294206170052671,
"grad_norm": 0.4123195707798004,
"learning_rate": 0.00019948073963214043,
"loss": 0.7766,
"step": 430
},
{
"epoch": 0.13243039879608728,
"grad_norm": 0.1608065813779831,
"learning_rate": 0.00019936827554834738,
"loss": 0.7904,
"step": 440
},
{
"epoch": 0.13544018058690746,
"grad_norm": 0.21908819675445557,
"learning_rate": 0.00019924483426219452,
"loss": 0.7816,
"step": 450
},
{
"epoch": 0.1384499623777276,
"grad_norm": 0.17256338894367218,
"learning_rate": 0.00019911042941022695,
"loss": 0.7982,
"step": 460
},
{
"epoch": 0.14145974416854779,
"grad_norm": 0.15286709368228912,
"learning_rate": 0.00019896507584013376,
"loss": 0.793,
"step": 470
},
{
"epoch": 0.14446952595936793,
"grad_norm": 0.1949639767408371,
"learning_rate": 0.00019880878960910772,
"loss": 0.745,
"step": 480
},
{
"epoch": 0.1474793077501881,
"grad_norm": 0.22766350209712982,
"learning_rate": 0.00019864158798207137,
"loss": 0.765,
"step": 490
},
{
"epoch": 0.1504890895410083,
"grad_norm": 0.17246346175670624,
"learning_rate": 0.0001984634894297699,
"loss": 0.7494,
"step": 500
},
{
"epoch": 0.15349887133182843,
"grad_norm": 0.18809442222118378,
"learning_rate": 0.00019827451362673052,
"loss": 0.7906,
"step": 510
},
{
"epoch": 0.1565086531226486,
"grad_norm": 0.287610799074173,
"learning_rate": 0.00019807468144908928,
"loss": 0.7991,
"step": 520
},
{
"epoch": 0.1595184349134688,
"grad_norm": 0.2345225214958191,
"learning_rate": 0.00019786401497228466,
"loss": 0.7593,
"step": 530
},
{
"epoch": 0.16252821670428894,
"grad_norm": 0.24265146255493164,
"learning_rate": 0.00019764253746861886,
"loss": 0.7966,
"step": 540
},
{
"epoch": 0.1655379984951091,
"grad_norm": 0.1692132204771042,
"learning_rate": 0.00019741027340468715,
"loss": 0.7525,
"step": 550
},
{
"epoch": 0.16854778028592926,
"grad_norm": 0.18456414341926575,
"learning_rate": 0.00019716724843867487,
"loss": 0.7706,
"step": 560
},
{
"epoch": 0.17155756207674944,
"grad_norm": 0.2189481407403946,
"learning_rate": 0.000196913489417523,
"loss": 0.7683,
"step": 570
},
{
"epoch": 0.1745673438675696,
"grad_norm": 0.24484622478485107,
"learning_rate": 0.00019664902437396245,
"loss": 0.821,
"step": 580
},
{
"epoch": 0.17757712565838976,
"grad_norm": 0.1760026067495346,
"learning_rate": 0.00019637388252341715,
"loss": 0.7744,
"step": 590
},
{
"epoch": 0.18058690744920994,
"grad_norm": 0.18553385138511658,
"learning_rate": 0.00019608809426077678,
"loss": 0.7607,
"step": 600
},
{
"epoch": 0.1835966892400301,
"grad_norm": 0.23498745262622833,
"learning_rate": 0.000195791691157039,
"loss": 0.7707,
"step": 610
},
{
"epoch": 0.18660647103085026,
"grad_norm": 0.20103472471237183,
"learning_rate": 0.00019548470595582166,
"loss": 0.7487,
"step": 620
},
{
"epoch": 0.18961625282167044,
"grad_norm": 0.18829959630966187,
"learning_rate": 0.00019516717256974592,
"loss": 0.7653,
"step": 630
},
{
"epoch": 0.1926260346124906,
"grad_norm": 0.24024085700511932,
"learning_rate": 0.00019483912607668965,
"loss": 0.7918,
"step": 640
},
{
"epoch": 0.19563581640331076,
"grad_norm": 0.18580889701843262,
"learning_rate": 0.00019450060271591243,
"loss": 0.8022,
"step": 650
},
{
"epoch": 0.1986455981941309,
"grad_norm": 0.2515565752983093,
"learning_rate": 0.0001941516398840524,
"loss": 0.7548,
"step": 660
},
{
"epoch": 0.2016553799849511,
"grad_norm": 0.18456920981407166,
"learning_rate": 0.00019379227613099473,
"loss": 0.7903,
"step": 670
},
{
"epoch": 0.20466516177577126,
"grad_norm": 0.23653873801231384,
"learning_rate": 0.00019342255115561337,
"loss": 0.7917,
"step": 680
},
{
"epoch": 0.2076749435665914,
"grad_norm": 0.26416492462158203,
"learning_rate": 0.00019304250580138524,
"loss": 0.7879,
"step": 690
},
{
"epoch": 0.2106847253574116,
"grad_norm": 0.22712339460849762,
"learning_rate": 0.0001926521820518784,
"loss": 0.8014,
"step": 700
},
{
"epoch": 0.21369450714823177,
"grad_norm": 0.17389413714408875,
"learning_rate": 0.00019225162302611412,
"loss": 0.7403,
"step": 710
},
{
"epoch": 0.21670428893905191,
"grad_norm": 0.23703350126743317,
"learning_rate": 0.00019184087297380344,
"loss": 0.7795,
"step": 720
},
{
"epoch": 0.2197140707298721,
"grad_norm": 0.20507369935512543,
"learning_rate": 0.000191419977270459,
"loss": 0.8047,
"step": 730
},
{
"epoch": 0.22272385252069224,
"grad_norm": 0.1989058554172516,
"learning_rate": 0.0001909889824123824,
"loss": 0.7786,
"step": 740
},
{
"epoch": 0.22573363431151242,
"grad_norm": 0.1992356926202774,
"learning_rate": 0.00019054793601152773,
"loss": 0.7352,
"step": 750
},
{
"epoch": 0.2287434161023326,
"grad_norm": 0.1841665804386139,
"learning_rate": 0.0001900968867902419,
"loss": 0.778,
"step": 760
},
{
"epoch": 0.23175319789315274,
"grad_norm": 0.1729171723127365,
"learning_rate": 0.00018963588457588228,
"loss": 0.7776,
"step": 770
},
{
"epoch": 0.23476297968397292,
"grad_norm": 0.18624809384346008,
"learning_rate": 0.00018916498029531223,
"loss": 0.7841,
"step": 780
},
{
"epoch": 0.23777276147479307,
"grad_norm": 0.19757080078125,
"learning_rate": 0.00018868422596927535,
"loss": 0.7628,
"step": 790
},
{
"epoch": 0.24078254326561324,
"grad_norm": 0.1995578110218048,
"learning_rate": 0.00018819367470664862,
"loss": 0.7403,
"step": 800
},
{
"epoch": 0.24379232505643342,
"grad_norm": 0.2151431441307068,
"learning_rate": 0.00018769338069857548,
"loss": 0.7581,
"step": 810
},
{
"epoch": 0.24680210684725357,
"grad_norm": 0.2272290140390396,
"learning_rate": 0.00018718339921247945,
"loss": 0.7914,
"step": 820
},
{
"epoch": 0.24981188863807374,
"grad_norm": 0.1463918834924698,
"learning_rate": 0.0001866637865859586,
"loss": 0.7953,
"step": 830
},
{
"epoch": 0.2528216704288939,
"grad_norm": 0.2657776176929474,
"learning_rate": 0.00018613460022056215,
"loss": 0.7576,
"step": 840
},
{
"epoch": 0.2558314522197141,
"grad_norm": 0.2566029131412506,
"learning_rate": 0.000185595898575449,
"loss": 0.7508,
"step": 850
},
{
"epoch": 0.2588412340105342,
"grad_norm": 0.2356068342924118,
"learning_rate": 0.00018504774116093008,
"loss": 0.7332,
"step": 860
},
{
"epoch": 0.2618510158013544,
"grad_norm": 0.20745734870433807,
"learning_rate": 0.00018449018853189403,
"loss": 0.756,
"step": 870
},
{
"epoch": 0.26486079759217457,
"grad_norm": 0.1839013397693634,
"learning_rate": 0.0001839233022811179,
"loss": 0.7776,
"step": 880
},
{
"epoch": 0.26787057938299474,
"grad_norm": 0.238608717918396,
"learning_rate": 0.00018334714503246273,
"loss": 0.7771,
"step": 890
},
{
"epoch": 0.2708803611738149,
"grad_norm": 0.17599129676818848,
"learning_rate": 0.00018276178043395586,
"loss": 0.7544,
"step": 900
},
{
"epoch": 0.27389014296463504,
"grad_norm": 0.23767121136188507,
"learning_rate": 0.00018216727315075945,
"loss": 0.7946,
"step": 910
},
{
"epoch": 0.2768999247554552,
"grad_norm": 0.180665984749794,
"learning_rate": 0.00018156368885802695,
"loss": 0.8202,
"step": 920
},
{
"epoch": 0.2799097065462754,
"grad_norm": 0.20493340492248535,
"learning_rate": 0.00018095109423364817,
"loss": 0.7823,
"step": 930
},
{
"epoch": 0.28291948833709557,
"grad_norm": 0.29762348532676697,
"learning_rate": 0.0001803295569508832,
"loss": 0.7637,
"step": 940
},
{
"epoch": 0.28592927012791575,
"grad_norm": 0.2135084718465805,
"learning_rate": 0.0001796991456708866,
"loss": 0.768,
"step": 950
},
{
"epoch": 0.28893905191873587,
"grad_norm": 0.21105533838272095,
"learning_rate": 0.0001790599300351225,
"loss": 0.7492,
"step": 960
},
{
"epoch": 0.29194883370955604,
"grad_norm": 0.2163008451461792,
"learning_rate": 0.00017841198065767107,
"loss": 0.758,
"step": 970
},
{
"epoch": 0.2949586155003762,
"grad_norm": 0.16340641677379608,
"learning_rate": 0.00017775536911742806,
"loss": 0.7739,
"step": 980
},
{
"epoch": 0.2979683972911964,
"grad_norm": 0.20751433074474335,
"learning_rate": 0.00017709016795019742,
"loss": 0.7692,
"step": 990
},
{
"epoch": 0.3009781790820166,
"grad_norm": 0.1802573949098587,
"learning_rate": 0.00017641645064067816,
"loss": 0.7886,
"step": 1000
},
{
"epoch": 0.3009781790820166,
"eval_loss": 0.8210044503211975,
"eval_runtime": 143.0609,
"eval_samples_per_second": 39.116,
"eval_steps_per_second": 4.893,
"step": 1000
},
{
"epoch": 0.3039879608728367,
"grad_norm": 0.17218339443206787,
"learning_rate": 0.0001757342916143466,
"loss": 0.7595,
"step": 1010
},
{
"epoch": 0.30699774266365687,
"grad_norm": 0.3174282908439636,
"learning_rate": 0.00017504376622923465,
"loss": 0.7821,
"step": 1020
},
{
"epoch": 0.31000752445447705,
"grad_norm": 0.1862519532442093,
"learning_rate": 0.00017434495076760483,
"loss": 0.7982,
"step": 1030
},
{
"epoch": 0.3130173062452972,
"grad_norm": 0.19561271369457245,
"learning_rate": 0.00017363792242752353,
"loss": 0.7422,
"step": 1040
},
{
"epoch": 0.3160270880361174,
"grad_norm": 0.24750228226184845,
"learning_rate": 0.000172922759314333,
"loss": 0.7425,
"step": 1050
},
{
"epoch": 0.3190368698269376,
"grad_norm": 0.22299301624298096,
"learning_rate": 0.0001721995404320228,
"loss": 0.7392,
"step": 1060
},
{
"epoch": 0.3220466516177577,
"grad_norm": 0.38551756739616394,
"learning_rate": 0.0001714683456745026,
"loss": 0.7913,
"step": 1070
},
{
"epoch": 0.32505643340857787,
"grad_norm": 0.19068562984466553,
"learning_rate": 0.00017072925581677594,
"loss": 0.7368,
"step": 1080
},
{
"epoch": 0.32806621519939805,
"grad_norm": 0.14717283844947815,
"learning_rate": 0.0001699823525060174,
"loss": 0.7938,
"step": 1090
},
{
"epoch": 0.3310759969902182,
"grad_norm": 0.20433548092842102,
"learning_rate": 0.00016922771825255263,
"loss": 0.7672,
"step": 1100
},
{
"epoch": 0.3340857787810384,
"grad_norm": 0.24607650935649872,
"learning_rate": 0.0001684654364207438,
"loss": 0.7971,
"step": 1110
},
{
"epoch": 0.3370955605718585,
"grad_norm": 0.1704825758934021,
"learning_rate": 0.00016769559121978026,
"loss": 0.7283,
"step": 1120
},
{
"epoch": 0.3401053423626787,
"grad_norm": 0.18218447268009186,
"learning_rate": 0.0001669182676943757,
"loss": 0.7405,
"step": 1130
},
{
"epoch": 0.3431151241534989,
"grad_norm": 0.25802308320999146,
"learning_rate": 0.0001661335517153737,
"loss": 0.7821,
"step": 1140
},
{
"epoch": 0.34612490594431905,
"grad_norm": 0.2192961871623993,
"learning_rate": 0.00016534152997026125,
"loss": 0.7392,
"step": 1150
},
{
"epoch": 0.3491346877351392,
"grad_norm": 0.19438660144805908,
"learning_rate": 0.00016454228995359252,
"loss": 0.7928,
"step": 1160
},
{
"epoch": 0.35214446952595935,
"grad_norm": 0.16023948788642883,
"learning_rate": 0.00016373591995732338,
"loss": 0.7542,
"step": 1170
},
{
"epoch": 0.3551542513167795,
"grad_norm": 0.32932865619659424,
"learning_rate": 0.0001629225090610577,
"loss": 0.7766,
"step": 1180
},
{
"epoch": 0.3581640331075997,
"grad_norm": 0.282248854637146,
"learning_rate": 0.00016210214712220687,
"loss": 0.7528,
"step": 1190
},
{
"epoch": 0.3611738148984199,
"grad_norm": 0.2124871164560318,
"learning_rate": 0.00016127492476606308,
"loss": 0.7874,
"step": 1200
},
{
"epoch": 0.36418359668924005,
"grad_norm": 0.23827847838401794,
"learning_rate": 0.00016044093337578815,
"loss": 0.7599,
"step": 1210
},
{
"epoch": 0.3671933784800602,
"grad_norm": 0.22505390644073486,
"learning_rate": 0.00015960026508231824,
"loss": 0.7707,
"step": 1220
},
{
"epoch": 0.37020316027088035,
"grad_norm": 0.17258504033088684,
"learning_rate": 0.00015875301275418638,
"loss": 0.8102,
"step": 1230
},
{
"epoch": 0.3732129420617005,
"grad_norm": 0.24378369748592377,
"learning_rate": 0.00015789926998726315,
"loss": 0.7388,
"step": 1240
},
{
"epoch": 0.3762227238525207,
"grad_norm": 0.22882601618766785,
"learning_rate": 0.00015703913109441713,
"loss": 0.7707,
"step": 1250
},
{
"epoch": 0.3792325056433409,
"grad_norm": 0.20203706622123718,
"learning_rate": 0.0001561726910950962,
"loss": 0.7444,
"step": 1260
},
{
"epoch": 0.382242287434161,
"grad_norm": 0.25836071372032166,
"learning_rate": 0.00015530004570483093,
"loss": 0.7838,
"step": 1270
},
{
"epoch": 0.3852520692249812,
"grad_norm": 0.16395032405853271,
"learning_rate": 0.00015442129132466054,
"loss": 0.7281,
"step": 1280
},
{
"epoch": 0.38826185101580135,
"grad_norm": 0.19731546938419342,
"learning_rate": 0.00015353652503048384,
"loss": 0.7471,
"step": 1290
},
{
"epoch": 0.3912716328066215,
"grad_norm": 0.23747088015079498,
"learning_rate": 0.00015264584456233502,
"loss": 0.7469,
"step": 1300
},
{
"epoch": 0.3942814145974417,
"grad_norm": 0.222218319773674,
"learning_rate": 0.0001517493483135864,
"loss": 0.7945,
"step": 1310
},
{
"epoch": 0.3972911963882618,
"grad_norm": 0.21308743953704834,
"learning_rate": 0.00015084713532007905,
"loss": 0.7659,
"step": 1320
},
{
"epoch": 0.400300978179082,
"grad_norm": 0.25059592723846436,
"learning_rate": 0.00014993930524918208,
"loss": 0.7718,
"step": 1330
},
{
"epoch": 0.4033107599699022,
"grad_norm": 0.2029498815536499,
"learning_rate": 0.00014902595838878256,
"loss": 0.759,
"step": 1340
},
{
"epoch": 0.40632054176072235,
"grad_norm": 0.26108860969543457,
"learning_rate": 0.0001481071956362067,
"loss": 0.7568,
"step": 1350
},
{
"epoch": 0.40933032355154253,
"grad_norm": 0.16724595427513123,
"learning_rate": 0.0001471831184870737,
"loss": 0.7504,
"step": 1360
},
{
"epoch": 0.4123401053423627,
"grad_norm": 0.2351473867893219,
"learning_rate": 0.00014625382902408356,
"loss": 0.7365,
"step": 1370
},
{
"epoch": 0.4153498871331828,
"grad_norm": 0.20350322127342224,
"learning_rate": 0.00014531942990573998,
"loss": 0.7444,
"step": 1380
},
{
"epoch": 0.418359668924003,
"grad_norm": 0.2595962882041931,
"learning_rate": 0.00014438002435500979,
"loss": 0.7574,
"step": 1390
},
{
"epoch": 0.4213694507148232,
"grad_norm": 0.28216540813446045,
"learning_rate": 0.0001434357161479198,
"loss": 0.74,
"step": 1400
},
{
"epoch": 0.42437923250564336,
"grad_norm": 0.19514194130897522,
"learning_rate": 0.0001424866096020927,
"loss": 0.7761,
"step": 1410
},
{
"epoch": 0.42738901429646353,
"grad_norm": 0.2227347493171692,
"learning_rate": 0.00014153280956522322,
"loss": 0.7895,
"step": 1420
},
{
"epoch": 0.43039879608728365,
"grad_norm": 0.21557843685150146,
"learning_rate": 0.00014057442140349543,
"loss": 0.794,
"step": 1430
},
{
"epoch": 0.43340857787810383,
"grad_norm": 0.1980540156364441,
"learning_rate": 0.00013961155098994309,
"loss": 0.7471,
"step": 1440
},
{
"epoch": 0.436418359668924,
"grad_norm": 0.2989167869091034,
"learning_rate": 0.00013864430469275377,
"loss": 0.745,
"step": 1450
},
{
"epoch": 0.4394281414597442,
"grad_norm": 0.2816371023654938,
"learning_rate": 0.00013767278936351854,
"loss": 0.7392,
"step": 1460
},
{
"epoch": 0.44243792325056436,
"grad_norm": 0.20301824808120728,
"learning_rate": 0.00013669711232542776,
"loss": 0.7486,
"step": 1470
},
{
"epoch": 0.4454477050413845,
"grad_norm": 0.18164554238319397,
"learning_rate": 0.00013571738136141555,
"loss": 0.7571,
"step": 1480
},
{
"epoch": 0.44845748683220465,
"grad_norm": 0.20167161524295807,
"learning_rate": 0.0001347337047022526,
"loss": 0.76,
"step": 1490
},
{
"epoch": 0.45146726862302483,
"grad_norm": 0.18913504481315613,
"learning_rate": 0.00013374619101459012,
"loss": 0.7444,
"step": 1500
},
{
"epoch": 0.454477050413845,
"grad_norm": 0.2532965838909149,
"learning_rate": 0.00013275494938895556,
"loss": 0.7755,
"step": 1510
},
{
"epoch": 0.4574868322046652,
"grad_norm": 0.16350123286247253,
"learning_rate": 0.00013176008932770113,
"loss": 0.755,
"step": 1520
},
{
"epoch": 0.4604966139954853,
"grad_norm": 0.18730787932872772,
"learning_rate": 0.00013076172073290724,
"loss": 0.7369,
"step": 1530
},
{
"epoch": 0.4635063957863055,
"grad_norm": 0.21689856052398682,
"learning_rate": 0.00012975995389424166,
"loss": 0.7773,
"step": 1540
},
{
"epoch": 0.46651617757712566,
"grad_norm": 0.21407072246074677,
"learning_rate": 0.0001287548994767758,
"loss": 0.7287,
"step": 1550
},
{
"epoch": 0.46952595936794583,
"grad_norm": 0.2083187699317932,
"learning_rate": 0.00012774666850875942,
"loss": 0.7717,
"step": 1560
},
{
"epoch": 0.472535741158766,
"grad_norm": 0.2223493456840515,
"learning_rate": 0.00012673537236935556,
"loss": 0.7613,
"step": 1570
},
{
"epoch": 0.47554552294958613,
"grad_norm": 0.2172088325023651,
"learning_rate": 0.00012572112277633649,
"loss": 0.7602,
"step": 1580
},
{
"epoch": 0.4785553047404063,
"grad_norm": 0.22197513282299042,
"learning_rate": 0.0001247040317737419,
"loss": 0.7241,
"step": 1590
},
{
"epoch": 0.4815650865312265,
"grad_norm": 0.1837187558412552,
"learning_rate": 0.00012368421171950192,
"loss": 0.7313,
"step": 1600
},
{
"epoch": 0.48457486832204666,
"grad_norm": 0.23295271396636963,
"learning_rate": 0.00012266177527302472,
"loss": 0.7432,
"step": 1610
},
{
"epoch": 0.48758465011286684,
"grad_norm": 0.21737614274024963,
"learning_rate": 0.0001216368353827508,
"loss": 0.7599,
"step": 1620
},
{
"epoch": 0.49059443190368696,
"grad_norm": 0.1803632527589798,
"learning_rate": 0.00012060950527367603,
"loss": 0.7386,
"step": 1630
},
{
"epoch": 0.49360421369450713,
"grad_norm": 0.1795688271522522,
"learning_rate": 0.00011957989843484345,
"loss": 0.7548,
"step": 1640
},
{
"epoch": 0.4966139954853273,
"grad_norm": 0.2615033984184265,
"learning_rate": 0.00011854812860680613,
"loss": 0.7838,
"step": 1650
},
{
"epoch": 0.4996237772761475,
"grad_norm": 0.22325466573238373,
"learning_rate": 0.00011751430976906233,
"loss": 0.7492,
"step": 1660
},
{
"epoch": 0.5026335590669676,
"grad_norm": 0.20187042653560638,
"learning_rate": 0.00011647855612746423,
"loss": 0.7897,
"step": 1670
},
{
"epoch": 0.5056433408577878,
"grad_norm": 0.3646783232688904,
"learning_rate": 0.00011544098210160152,
"loss": 0.7847,
"step": 1680
},
{
"epoch": 0.508653122648608,
"grad_norm": 0.23362590372562408,
"learning_rate": 0.00011440170231216154,
"loss": 0.7624,
"step": 1690
},
{
"epoch": 0.5116629044394282,
"grad_norm": 0.2684471011161804,
"learning_rate": 0.00011336083156826722,
"loss": 0.7973,
"step": 1700
},
{
"epoch": 0.5146726862302483,
"grad_norm": 0.15820720791816711,
"learning_rate": 0.00011231848485479395,
"loss": 0.7322,
"step": 1710
},
{
"epoch": 0.5176824680210684,
"grad_norm": 0.3005324900150299,
"learning_rate": 0.00011127477731966735,
"loss": 0.7449,
"step": 1720
},
{
"epoch": 0.5206922498118887,
"grad_norm": 0.19756928086280823,
"learning_rate": 0.00011022982426114292,
"loss": 0.7988,
"step": 1730
},
{
"epoch": 0.5237020316027088,
"grad_norm": 0.20102205872535706,
"learning_rate": 0.00010918374111506893,
"loss": 0.7273,
"step": 1740
},
{
"epoch": 0.526711813393529,
"grad_norm": 0.2193961888551712,
"learning_rate": 0.00010813664344213427,
"loss": 0.7367,
"step": 1750
},
{
"epoch": 0.5297215951843491,
"grad_norm": 0.2881150245666504,
"learning_rate": 0.00010708864691510254,
"loss": 0.7702,
"step": 1760
},
{
"epoch": 0.5327313769751693,
"grad_norm": 0.24800467491149902,
"learning_rate": 0.00010603986730603368,
"loss": 0.7853,
"step": 1770
},
{
"epoch": 0.5357411587659895,
"grad_norm": 0.18105538189411163,
"learning_rate": 0.00010499042047349455,
"loss": 0.7576,
"step": 1780
},
{
"epoch": 0.5387509405568096,
"grad_norm": 0.16151605546474457,
"learning_rate": 0.00010394042234976016,
"loss": 0.7363,
"step": 1790
},
{
"epoch": 0.5417607223476298,
"grad_norm": 0.2590145170688629,
"learning_rate": 0.00010288998892800657,
"loss": 0.7501,
"step": 1800
},
{
"epoch": 0.54477050413845,
"grad_norm": 0.22977079451084137,
"learning_rate": 0.0001018392362494972,
"loss": 0.7768,
"step": 1810
},
{
"epoch": 0.5477802859292701,
"grad_norm": 0.23535679280757904,
"learning_rate": 0.00010078828039076367,
"loss": 0.7803,
"step": 1820
},
{
"epoch": 0.5507900677200903,
"grad_norm": 0.43004941940307617,
"learning_rate": 9.973723745078296e-05,
"loss": 0.7686,
"step": 1830
},
{
"epoch": 0.5537998495109104,
"grad_norm": 0.25901973247528076,
"learning_rate": 9.868622353815188e-05,
"loss": 0.7623,
"step": 1840
},
{
"epoch": 0.5568096313017307,
"grad_norm": 0.2264987975358963,
"learning_rate": 9.763535475826054e-05,
"loss": 0.7439,
"step": 1850
},
{
"epoch": 0.5598194130925508,
"grad_norm": 0.24374960362911224,
"learning_rate": 9.658474720046637e-05,
"loss": 0.7825,
"step": 1860
},
{
"epoch": 0.5628291948833709,
"grad_norm": 0.20084655284881592,
"learning_rate": 9.553451692526954e-05,
"loss": 0.7802,
"step": 1870
},
{
"epoch": 0.5658389766741911,
"grad_norm": 0.1831783950328827,
"learning_rate": 9.448477995149182e-05,
"loss": 0.7328,
"step": 1880
},
{
"epoch": 0.5688487584650113,
"grad_norm": 0.18079884350299835,
"learning_rate": 9.343565224346013e-05,
"loss": 0.7464,
"step": 1890
},
{
"epoch": 0.5718585402558315,
"grad_norm": 0.248003751039505,
"learning_rate": 9.238724969819579e-05,
"loss": 0.7387,
"step": 1900
},
{
"epoch": 0.5748683220466516,
"grad_norm": 0.18932189047336578,
"learning_rate": 9.13396881326115e-05,
"loss": 0.7136,
"step": 1910
},
{
"epoch": 0.5778781038374717,
"grad_norm": 0.20788271725177765,
"learning_rate": 9.029308327071702e-05,
"loss": 0.7702,
"step": 1920
},
{
"epoch": 0.580887885628292,
"grad_norm": 0.19441328942775726,
"learning_rate": 8.924755073083517e-05,
"loss": 0.7901,
"step": 1930
},
{
"epoch": 0.5838976674191121,
"grad_norm": 0.21718983352184296,
"learning_rate": 8.820320601282949e-05,
"loss": 0.7771,
"step": 1940
},
{
"epoch": 0.5869074492099323,
"grad_norm": 0.19140474498271942,
"learning_rate": 8.71601644853449e-05,
"loss": 0.7601,
"step": 1950
},
{
"epoch": 0.5899172310007524,
"grad_norm": 0.26861098408699036,
"learning_rate": 8.61185413730631e-05,
"loss": 0.7575,
"step": 1960
},
{
"epoch": 0.5929270127915726,
"grad_norm": 0.19967325031757355,
"learning_rate": 8.507845174397357e-05,
"loss": 0.8136,
"step": 1970
},
{
"epoch": 0.5959367945823928,
"grad_norm": 0.19900915026664734,
"learning_rate": 8.404001049666211e-05,
"loss": 0.7205,
"step": 1980
},
{
"epoch": 0.5989465763732129,
"grad_norm": 0.25308361649513245,
"learning_rate": 8.300333234761787e-05,
"loss": 0.768,
"step": 1990
},
{
"epoch": 0.6019563581640331,
"grad_norm": 0.16838368773460388,
"learning_rate": 8.196853181856081e-05,
"loss": 0.7502,
"step": 2000
},
{
"epoch": 0.6019563581640331,
"eval_loss": 0.8055371046066284,
"eval_runtime": 143.0386,
"eval_samples_per_second": 39.122,
"eval_steps_per_second": 4.894,
"step": 2000
},
{
"epoch": 0.6049661399548533,
"grad_norm": 0.22218653559684753,
"learning_rate": 8.093572322379045e-05,
"loss": 0.7597,
"step": 2010
},
{
"epoch": 0.6079759217456734,
"grad_norm": 0.19360238313674927,
"learning_rate": 7.990502065755748e-05,
"loss": 0.7592,
"step": 2020
},
{
"epoch": 0.6109857035364936,
"grad_norm": 0.25584983825683594,
"learning_rate": 7.887653798145987e-05,
"loss": 0.7432,
"step": 2030
},
{
"epoch": 0.6139954853273137,
"grad_norm": 0.23388119041919708,
"learning_rate": 7.785038881186462e-05,
"loss": 0.7638,
"step": 2040
},
{
"epoch": 0.617005267118134,
"grad_norm": 0.24461683630943298,
"learning_rate": 7.682668650735645e-05,
"loss": 0.7451,
"step": 2050
},
{
"epoch": 0.6200150489089541,
"grad_norm": 0.20549526810646057,
"learning_rate": 7.580554415621522e-05,
"loss": 0.7432,
"step": 2060
},
{
"epoch": 0.6230248306997742,
"grad_norm": 0.22572945058345795,
"learning_rate": 7.478707456392302e-05,
"loss": 0.7924,
"step": 2070
},
{
"epoch": 0.6260346124905944,
"grad_norm": 0.33990249037742615,
"learning_rate": 7.377139024070254e-05,
"loss": 0.7471,
"step": 2080
},
{
"epoch": 0.6290443942814146,
"grad_norm": 0.2138250321149826,
"learning_rate": 7.275860338908815e-05,
"loss": 0.7535,
"step": 2090
},
{
"epoch": 0.6320541760722348,
"grad_norm": 0.17337530851364136,
"learning_rate": 7.174882589153076e-05,
"loss": 0.7523,
"step": 2100
},
{
"epoch": 0.6350639578630549,
"grad_norm": 0.2286282777786255,
"learning_rate": 7.07421692980384e-05,
"loss": 0.7507,
"step": 2110
},
{
"epoch": 0.6380737396538751,
"grad_norm": 0.19162622094154358,
"learning_rate": 6.973874481385312e-05,
"loss": 0.7671,
"step": 2120
},
{
"epoch": 0.6410835214446953,
"grad_norm": 0.22725722193717957,
"learning_rate": 6.873866328716614e-05,
"loss": 0.7595,
"step": 2130
},
{
"epoch": 0.6440933032355154,
"grad_norm": 0.28031495213508606,
"learning_rate": 6.774203519687265e-05,
"loss": 0.7384,
"step": 2140
},
{
"epoch": 0.6471030850263356,
"grad_norm": 0.31639155745506287,
"learning_rate": 6.674897064036706e-05,
"loss": 0.7629,
"step": 2150
},
{
"epoch": 0.6501128668171557,
"grad_norm": 0.2253294438123703,
"learning_rate": 6.575957932138057e-05,
"loss": 0.7793,
"step": 2160
},
{
"epoch": 0.653122648607976,
"grad_norm": 0.2402772158384323,
"learning_rate": 6.47739705378623e-05,
"loss": 0.7586,
"step": 2170
},
{
"epoch": 0.6561324303987961,
"grad_norm": 0.1558917760848999,
"learning_rate": 6.379225316990505e-05,
"loss": 0.7564,
"step": 2180
},
{
"epoch": 0.6591422121896162,
"grad_norm": 0.17613032460212708,
"learning_rate": 6.281453566771735e-05,
"loss": 0.7354,
"step": 2190
},
{
"epoch": 0.6621519939804364,
"grad_norm": 0.28283971548080444,
"learning_rate": 6.184092603964308e-05,
"loss": 0.8203,
"step": 2200
},
{
"epoch": 0.6651617757712566,
"grad_norm": 0.2295006364583969,
"learning_rate": 6.087153184022969e-05,
"loss": 0.7466,
"step": 2210
},
{
"epoch": 0.6681715575620768,
"grad_norm": 0.24166414141654968,
"learning_rate": 5.990646015834668e-05,
"loss": 0.76,
"step": 2220
},
{
"epoch": 0.6711813393528969,
"grad_norm": 0.5943595170974731,
"learning_rate": 5.894581760535549e-05,
"loss": 0.7662,
"step": 2230
},
{
"epoch": 0.674191121143717,
"grad_norm": 0.23785801231861115,
"learning_rate": 5.798971030333227e-05,
"loss": 0.7742,
"step": 2240
},
{
"epoch": 0.6772009029345373,
"grad_norm": 0.20206815004348755,
"learning_rate": 5.703824387334442e-05,
"loss": 0.7837,
"step": 2250
},
{
"epoch": 0.6802106847253574,
"grad_norm": 0.21082885563373566,
"learning_rate": 5.609152342378278e-05,
"loss": 0.758,
"step": 2260
},
{
"epoch": 0.6832204665161776,
"grad_norm": 0.1953180432319641,
"learning_rate": 5.514965353875019e-05,
"loss": 0.726,
"step": 2270
},
{
"epoch": 0.6862302483069977,
"grad_norm": 0.22035697102546692,
"learning_rate": 5.4212738266508245e-05,
"loss": 0.7199,
"step": 2280
},
{
"epoch": 0.6892400300978179,
"grad_norm": 0.17208510637283325,
"learning_rate": 5.3280881107982946e-05,
"loss": 0.7379,
"step": 2290
},
{
"epoch": 0.6922498118886381,
"grad_norm": 0.18434813618659973,
"learning_rate": 5.235418500533109e-05,
"loss": 0.7574,
"step": 2300
},
{
"epoch": 0.6952595936794582,
"grad_norm": 0.35970667004585266,
"learning_rate": 5.143275233056817e-05,
"loss": 0.7737,
"step": 2310
},
{
"epoch": 0.6982693754702785,
"grad_norm": 0.19193576276302338,
"learning_rate": 5.051668487425938e-05,
"loss": 0.7268,
"step": 2320
},
{
"epoch": 0.7012791572610986,
"grad_norm": 0.21549300849437714,
"learning_rate": 4.960608383427481e-05,
"loss": 0.7542,
"step": 2330
},
{
"epoch": 0.7042889390519187,
"grad_norm": 0.1781996786594391,
"learning_rate": 4.8701049804610265e-05,
"loss": 0.7493,
"step": 2340
},
{
"epoch": 0.7072987208427389,
"grad_norm": 0.17246761918067932,
"learning_rate": 4.780168276427441e-05,
"loss": 0.7468,
"step": 2350
},
{
"epoch": 0.710308502633559,
"grad_norm": 0.22511474788188934,
"learning_rate": 4.6908082066244275e-05,
"loss": 0.7667,
"step": 2360
},
{
"epoch": 0.7133182844243793,
"grad_norm": 0.22506214678287506,
"learning_rate": 4.602034642648968e-05,
"loss": 0.788,
"step": 2370
},
{
"epoch": 0.7163280662151994,
"grad_norm": 0.23567315936088562,
"learning_rate": 4.513857391306812e-05,
"loss": 0.772,
"step": 2380
},
{
"epoch": 0.7193378480060195,
"grad_norm": 0.21594233810901642,
"learning_rate": 4.4262861935291144e-05,
"loss": 0.7257,
"step": 2390
},
{
"epoch": 0.7223476297968398,
"grad_norm": 0.2095227688550949,
"learning_rate": 4.339330723296373e-05,
"loss": 0.7456,
"step": 2400
},
{
"epoch": 0.7253574115876599,
"grad_norm": 0.2402193695306778,
"learning_rate": 4.25300058656972e-05,
"loss": 0.7722,
"step": 2410
},
{
"epoch": 0.7283671933784801,
"grad_norm": 0.2426275908946991,
"learning_rate": 4.1673053202297676e-05,
"loss": 0.7871,
"step": 2420
},
{
"epoch": 0.7313769751693002,
"grad_norm": 0.1813143640756607,
"learning_rate": 4.0822543910230674e-05,
"loss": 0.7394,
"step": 2430
},
{
"epoch": 0.7343867569601203,
"grad_norm": 0.29781273007392883,
"learning_rate": 3.997857194516319e-05,
"loss": 0.7318,
"step": 2440
},
{
"epoch": 0.7373965387509406,
"grad_norm": 0.2380327582359314,
"learning_rate": 3.914123054058446e-05,
"loss": 0.7711,
"step": 2450
},
{
"epoch": 0.7404063205417607,
"grad_norm": 0.2532583773136139,
"learning_rate": 3.831061219750636e-05,
"loss": 0.7341,
"step": 2460
},
{
"epoch": 0.7434161023325809,
"grad_norm": 0.18625672161579132,
"learning_rate": 3.7486808674245047e-05,
"loss": 0.7349,
"step": 2470
},
{
"epoch": 0.746425884123401,
"grad_norm": 0.2749151289463043,
"learning_rate": 3.666991097628416e-05,
"loss": 0.7551,
"step": 2480
},
{
"epoch": 0.7494356659142212,
"grad_norm": 0.28592735528945923,
"learning_rate": 3.586000934622166e-05,
"loss": 0.7485,
"step": 2490
},
{
"epoch": 0.7524454477050414,
"grad_norm": 0.22789210081100464,
"learning_rate": 3.5057193253800624e-05,
"loss": 0.7308,
"step": 2500
},
{
"epoch": 0.7554552294958615,
"grad_norm": 0.2588096559047699,
"learning_rate": 3.426155138602558e-05,
"loss": 0.7717,
"step": 2510
},
{
"epoch": 0.7584650112866818,
"grad_norm": 0.2197035402059555,
"learning_rate": 3.347317163736524e-05,
"loss": 0.753,
"step": 2520
},
{
"epoch": 0.7614747930775019,
"grad_norm": 0.19545480608940125,
"learning_rate": 3.269214110004293e-05,
"loss": 0.7552,
"step": 2530
},
{
"epoch": 0.764484574868322,
"grad_norm": 0.1901889145374298,
"learning_rate": 3.191854605441527e-05,
"loss": 0.7146,
"step": 2540
},
{
"epoch": 0.7674943566591422,
"grad_norm": 0.19916728138923645,
"learning_rate": 3.115247195944102e-05,
"loss": 0.7733,
"step": 2550
},
{
"epoch": 0.7705041384499624,
"grad_norm": 0.2248535454273224,
"learning_rate": 3.039400344324035e-05,
"loss": 0.7948,
"step": 2560
},
{
"epoch": 0.7735139202407826,
"grad_norm": 0.238205224275589,
"learning_rate": 2.9643224293745954e-05,
"loss": 0.7637,
"step": 2570
},
{
"epoch": 0.7765237020316027,
"grad_norm": 0.26717284321784973,
"learning_rate": 2.8900217449447074e-05,
"loss": 0.7326,
"step": 2580
},
{
"epoch": 0.7795334838224228,
"grad_norm": 0.2582661509513855,
"learning_rate": 2.8165064990227252e-05,
"loss": 0.7571,
"step": 2590
},
{
"epoch": 0.782543265613243,
"grad_norm": 0.22693775594234467,
"learning_rate": 2.7437848128296982e-05,
"loss": 0.75,
"step": 2600
},
{
"epoch": 0.7855530474040632,
"grad_norm": 0.21676841378211975,
"learning_rate": 2.6718647199222214e-05,
"loss": 0.7693,
"step": 2610
},
{
"epoch": 0.7885628291948834,
"grad_norm": 0.21357332170009613,
"learning_rate": 2.600754165304966e-05,
"loss": 0.7499,
"step": 2620
},
{
"epoch": 0.7915726109857035,
"grad_norm": 0.2448240965604782,
"learning_rate": 2.530461004553001e-05,
"loss": 0.7565,
"step": 2630
},
{
"epoch": 0.7945823927765236,
"grad_norm": 0.16155965626239777,
"learning_rate": 2.460993002943983e-05,
"loss": 0.738,
"step": 2640
},
{
"epoch": 0.7975921745673439,
"grad_norm": 0.22366459667682648,
"learning_rate": 2.3923578346003363e-05,
"loss": 0.746,
"step": 2650
},
{
"epoch": 0.800601956358164,
"grad_norm": 0.21000875532627106,
"learning_rate": 2.32456308164148e-05,
"loss": 0.7615,
"step": 2660
},
{
"epoch": 0.8036117381489842,
"grad_norm": 0.23698626458644867,
"learning_rate": 2.2576162333462402e-05,
"loss": 0.8101,
"step": 2670
},
{
"epoch": 0.8066215199398044,
"grad_norm": 0.17663483321666718,
"learning_rate": 2.191524685325512e-05,
"loss": 0.7551,
"step": 2680
},
{
"epoch": 0.8096313017306245,
"grad_norm": 0.193350687623024,
"learning_rate": 2.126295738705262e-05,
"loss": 0.7621,
"step": 2690
},
{
"epoch": 0.8126410835214447,
"grad_norm": 0.2506263256072998,
"learning_rate": 2.0619365993199747e-05,
"loss": 0.7789,
"step": 2700
},
{
"epoch": 0.8156508653122648,
"grad_norm": 0.23012906312942505,
"learning_rate": 1.9984543769166265e-05,
"loss": 0.7246,
"step": 2710
},
{
"epoch": 0.8186606471030851,
"grad_norm": 0.18312327563762665,
"learning_rate": 1.9358560843692787e-05,
"loss": 0.7295,
"step": 2720
},
{
"epoch": 0.8216704288939052,
"grad_norm": 0.22966724634170532,
"learning_rate": 1.8741486369043505e-05,
"loss": 0.7851,
"step": 2730
},
{
"epoch": 0.8246802106847254,
"grad_norm": 0.160865917801857,
"learning_rate": 1.8133388513367078e-05,
"loss": 0.7278,
"step": 2740
},
{
"epoch": 0.8276899924755455,
"grad_norm": 0.26794126629829407,
"learning_rate": 1.7534334453166068e-05,
"loss": 0.7247,
"step": 2750
},
{
"epoch": 0.8306997742663657,
"grad_norm": 0.23241771757602692,
"learning_rate": 1.6944390365875952e-05,
"loss": 0.7741,
"step": 2760
},
{
"epoch": 0.8337095560571859,
"grad_norm": 0.20465601980686188,
"learning_rate": 1.6363621422554476e-05,
"loss": 0.7267,
"step": 2770
},
{
"epoch": 0.836719337848006,
"grad_norm": 0.21765820682048798,
"learning_rate": 1.579209178068234e-05,
"loss": 0.7566,
"step": 2780
},
{
"epoch": 0.8397291196388262,
"grad_norm": 0.21495480835437775,
"learning_rate": 1.5229864577075547e-05,
"loss": 0.8005,
"step": 2790
},
{
"epoch": 0.8427389014296464,
"grad_norm": 0.19566848874092102,
"learning_rate": 1.4677001920910827e-05,
"loss": 0.7494,
"step": 2800
},
{
"epoch": 0.8457486832204665,
"grad_norm": 0.1817813217639923,
"learning_rate": 1.4133564886864381e-05,
"loss": 0.7289,
"step": 2810
},
{
"epoch": 0.8487584650112867,
"grad_norm": 0.2588653266429901,
"learning_rate": 1.3599613508364984e-05,
"loss": 0.7493,
"step": 2820
},
{
"epoch": 0.8517682468021068,
"grad_norm": 0.1954454928636551,
"learning_rate": 1.307520677096209e-05,
"loss": 0.7504,
"step": 2830
},
{
"epoch": 0.8547780285929271,
"grad_norm": 0.17466256022453308,
"learning_rate": 1.2560402605809707e-05,
"loss": 0.7705,
"step": 2840
},
{
"epoch": 0.8577878103837472,
"grad_norm": 0.2213626205921173,
"learning_rate": 1.2055257883266791e-05,
"loss": 0.7307,
"step": 2850
},
{
"epoch": 0.8607975921745673,
"grad_norm": 0.19768081605434418,
"learning_rate": 1.1559828406614714e-05,
"loss": 0.7741,
"step": 2860
},
{
"epoch": 0.8638073739653875,
"grad_norm": 0.22028213739395142,
"learning_rate": 1.1074168905892702e-05,
"loss": 0.7238,
"step": 2870
},
{
"epoch": 0.8668171557562077,
"grad_norm": 0.2675631046295166,
"learning_rate": 1.0598333031851881e-05,
"loss": 0.7315,
"step": 2880
},
{
"epoch": 0.8698269375470279,
"grad_norm": 0.20498494803905487,
"learning_rate": 1.0132373350028313e-05,
"loss": 0.751,
"step": 2890
},
{
"epoch": 0.872836719337848,
"grad_norm": 0.22774334251880646,
"learning_rate": 9.676341334936346e-06,
"loss": 0.72,
"step": 2900
},
{
"epoch": 0.8758465011286681,
"grad_norm": 0.18757307529449463,
"learning_rate": 9.230287364382007e-06,
"loss": 0.7433,
"step": 2910
},
{
"epoch": 0.8788562829194884,
"grad_norm": 0.23557019233703613,
"learning_rate": 8.794260713897862e-06,
"loss": 0.7473,
"step": 2920
},
{
"epoch": 0.8818660647103085,
"grad_norm": 0.24945016205310822,
"learning_rate": 8.368309551299536e-06,
"loss": 0.746,
"step": 2930
},
{
"epoch": 0.8848758465011287,
"grad_norm": 0.5168988704681396,
"learning_rate": 7.952480931364658e-06,
"loss": 0.7523,
"step": 2940
},
{
"epoch": 0.8878856282919488,
"grad_norm": 0.21044060587882996,
"learning_rate": 7.546820790634646e-06,
"loss": 0.7359,
"step": 2950
},
{
"epoch": 0.890895410082769,
"grad_norm": 0.2304990291595459,
"learning_rate": 7.1513739423402e-06,
"loss": 0.7201,
"step": 2960
},
{
"epoch": 0.8939051918735892,
"grad_norm": 0.5010607242584229,
"learning_rate": 6.766184071450721e-06,
"loss": 0.7619,
"step": 2970
},
{
"epoch": 0.8969149736644093,
"grad_norm": 0.1829194277524948,
"learning_rate": 6.391293729848435e-06,
"loss": 0.7654,
"step": 2980
},
{
"epoch": 0.8999247554552295,
"grad_norm": 0.4193117320537567,
"learning_rate": 6.026744331627731e-06,
"loss": 0.7416,
"step": 2990
},
{
"epoch": 0.9029345372460497,
"grad_norm": 0.2364160269498825,
"learning_rate": 5.672576148520137e-06,
"loss": 0.7516,
"step": 3000
},
{
"epoch": 0.9029345372460497,
"eval_loss": 0.7987983226776123,
"eval_runtime": 142.8043,
"eval_samples_per_second": 39.187,
"eval_steps_per_second": 4.902,
"step": 3000
},
{
"epoch": 0.9059443190368698,
"grad_norm": 0.1936904937028885,
"learning_rate": 5.328828305445477e-06,
"loss": 0.7357,
"step": 3010
},
{
"epoch": 0.90895410082769,
"grad_norm": 0.194386288523674,
"learning_rate": 4.9955387761897785e-06,
"loss": 0.7725,
"step": 3020
},
{
"epoch": 0.9119638826185101,
"grad_norm": 0.24116984009742737,
"learning_rate": 4.672744379210336e-06,
"loss": 0.776,
"step": 3030
},
{
"epoch": 0.9149736644093304,
"grad_norm": 0.1920221745967865,
"learning_rate": 4.360480773568321e-06,
"loss": 0.7323,
"step": 3040
},
{
"epoch": 0.9179834462001505,
"grad_norm": 0.19343051314353943,
"learning_rate": 4.058782454989529e-06,
"loss": 0.7436,
"step": 3050
},
{
"epoch": 0.9209932279909706,
"grad_norm": 0.2893059253692627,
"learning_rate": 3.767682752053714e-06,
"loss": 0.7538,
"step": 3060
},
{
"epoch": 0.9240030097817908,
"grad_norm": 0.23565465211868286,
"learning_rate": 3.487213822512714e-06,
"loss": 0.7535,
"step": 3070
},
{
"epoch": 0.927012791572611,
"grad_norm": 0.2228977531194687,
"learning_rate": 3.2174066497380086e-06,
"loss": 0.7572,
"step": 3080
},
{
"epoch": 0.9300225733634312,
"grad_norm": 0.20145723223686218,
"learning_rate": 2.958291039298e-06,
"loss": 0.7114,
"step": 3090
},
{
"epoch": 0.9330323551542513,
"grad_norm": 0.2287394404411316,
"learning_rate": 2.709895615665392e-06,
"loss": 0.7656,
"step": 3100
},
{
"epoch": 0.9360421369450714,
"grad_norm": 0.21397553384304047,
"learning_rate": 2.472247819055029e-06,
"loss": 0.7361,
"step": 3110
},
{
"epoch": 0.9390519187358917,
"grad_norm": 0.22272509336471558,
"learning_rate": 2.2453739023926113e-06,
"loss": 0.7789,
"step": 3120
},
{
"epoch": 0.9420617005267118,
"grad_norm": 0.2435009628534317,
"learning_rate": 2.0292989284144915e-06,
"loss": 0.7756,
"step": 3130
},
{
"epoch": 0.945071482317532,
"grad_norm": 0.17544378340244293,
"learning_rate": 1.8240467668990457e-06,
"loss": 0.7319,
"step": 3140
},
{
"epoch": 0.9480812641083521,
"grad_norm": 0.19456711411476135,
"learning_rate": 1.6296400920297383e-06,
"loss": 0.7469,
"step": 3150
},
{
"epoch": 0.9510910458991723,
"grad_norm": 0.21780775487422943,
"learning_rate": 1.4461003798903695e-06,
"loss": 0.7587,
"step": 3160
},
{
"epoch": 0.9541008276899925,
"grad_norm": 0.21393659710884094,
"learning_rate": 1.2734479060925397e-06,
"loss": 0.7504,
"step": 3170
},
{
"epoch": 0.9571106094808126,
"grad_norm": 0.21127097308635712,
"learning_rate": 1.1117017435358423e-06,
"loss": 0.7628,
"step": 3180
},
{
"epoch": 0.9601203912716328,
"grad_norm": 0.19377408921718597,
"learning_rate": 9.608797603008812e-07,
"loss": 0.7195,
"step": 3190
},
{
"epoch": 0.963130173062453,
"grad_norm": 0.27100813388824463,
"learning_rate": 8.209986176753948e-07,
"loss": 0.7476,
"step": 3200
},
{
"epoch": 0.9661399548532731,
"grad_norm": 0.1997697949409485,
"learning_rate": 6.920737683136613e-07,
"loss": 0.7578,
"step": 3210
},
{
"epoch": 0.9691497366440933,
"grad_norm": 0.3180257976055145,
"learning_rate": 5.741194545294648e-07,
"loss": 0.7537,
"step": 3220
},
{
"epoch": 0.9721595184349134,
"grad_norm": 0.20285405218601227,
"learning_rate": 4.671487067227531e-07,
"loss": 0.7226,
"step": 3230
},
{
"epoch": 0.9751693002257337,
"grad_norm": 0.1977706104516983,
"learning_rate": 3.711733419401453e-07,
"loss": 0.7487,
"step": 3240
},
{
"epoch": 0.9781790820165538,
"grad_norm": 0.2490547150373459,
"learning_rate": 2.8620396256953117e-07,
"loss": 0.773,
"step": 3250
},
{
"epoch": 0.9811888638073739,
"grad_norm": 0.18216556310653687,
"learning_rate": 2.122499551688084e-07,
"loss": 0.7348,
"step": 3260
},
{
"epoch": 0.9841986455981941,
"grad_norm": 0.20423290133476257,
"learning_rate": 1.4931948942895624e-07,
"loss": 0.7116,
"step": 3270
},
{
"epoch": 0.9872084273890143,
"grad_norm": 0.3691667914390564,
"learning_rate": 9.741951727152421e-08,
"loss": 0.7242,
"step": 3280
},
{
"epoch": 0.9902182091798345,
"grad_norm": 0.1876600682735443,
"learning_rate": 5.655577208069085e-08,
"loss": 0.755,
"step": 3290
},
{
"epoch": 0.9932279909706546,
"grad_norm": 0.23210102319717407,
"learning_rate": 2.6732768069825943e-08,
"loss": 0.7697,
"step": 3300
},
{
"epoch": 0.9962377727614747,
"grad_norm": 0.18966606259346008,
"learning_rate": 7.953799782889349e-09,
"loss": 0.7166,
"step": 3310
},
{
"epoch": 0.999247554552295,
"grad_norm": 0.5191490650177002,
"learning_rate": 2.2094173039999277e-10,
"loss": 0.7366,
"step": 3320
},
{
"epoch": 0.999849510910459,
"step": 3322,
"total_flos": 3.156428776263385e+18,
"train_loss": 0.7673604141007986,
"train_runtime": 8289.1597,
"train_samples_per_second": 12.826,
"train_steps_per_second": 0.401
}
],
"logging_steps": 10,
"max_steps": 3322,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.156428776263385e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}