testypoos2 / last-checkpoint /trainer_state.json

Training in progress, step 3600, checkpoint

7c19f8e verified about 1 year ago

88.4 kB

	{
	"best_metric": 0.717534065246582,
	"best_model_checkpoint": "./output/checkpoint-450",
	"epoch": 156.52173913043478,
	"eval_steps": 150,
	"global_step": 3600,
	"is_hyper_param_search": false,
	"is_local_process_zero": true,
	"is_world_process_zero": true,
	"log_history": [
	{
	"epoch": 0.43478260869565216,
	"grad_norm": 1.5021440982818604,
	"learning_rate": 3e-06,
	"loss": 0.9061,
	"step": 10
	},
	{
	"epoch": 0.8695652173913043,
	"grad_norm": 1.6870536804199219,
	"learning_rate": 6e-06,
	"loss": 0.9023,
	"step": 20
	},
	{
	"epoch": 1.3043478260869565,
	"grad_norm": 1.729659080505371,
	"learning_rate": 9e-06,
	"loss": 0.9004,
	"step": 30
	},
	{
	"epoch": 1.7391304347826086,
	"grad_norm": 1.453600525856018,
	"learning_rate": 1.2e-05,
	"loss": 0.9091,
	"step": 40
	},
	{
	"epoch": 2.1739130434782608,
	"grad_norm": 1.3518075942993164,
	"learning_rate": 1.5e-05,
	"loss": 0.8362,
	"step": 50
	},
	{
	"epoch": 2.608695652173913,
	"grad_norm": 2.031172513961792,
	"learning_rate": 1.8e-05,
	"loss": 0.8893,
	"step": 60
	},
	{
	"epoch": 3.0434782608695654,
	"grad_norm": 1.484531283378601,
	"learning_rate": 2.1e-05,
	"loss": 0.8915,
	"step": 70
	},
	{
	"epoch": 3.4782608695652173,
	"grad_norm": 1.7294986248016357,
	"learning_rate": 2.4e-05,
	"loss": 0.8233,
	"step": 80
	},
	{
	"epoch": 3.9130434782608696,
	"grad_norm": 1.4242360591888428,
	"learning_rate": 2.7000000000000002e-05,
	"loss": 0.8527,
	"step": 90
	},
	{
	"epoch": 4.3478260869565215,
	"grad_norm": 1.3656773567199707,
	"learning_rate": 3e-05,
	"loss": 0.8648,
	"step": 100
	},
	{
	"epoch": 4.782608695652174,
	"grad_norm": 2.19753098487854,
	"learning_rate": 2.999999702723963e-05,
	"loss": 0.8225,
	"step": 110
	},
	{
	"epoch": 5.217391304347826,
	"grad_norm": 1.0726382732391357,
	"learning_rate": 2.9999988108959687e-05,
	"loss": 0.7654,
	"step": 120
	},
	{
	"epoch": 5.6521739130434785,
	"grad_norm": 1.5603922605514526,
	"learning_rate": 2.9999973245163716e-05,
	"loss": 0.7417,
	"step": 130
	},
	{
	"epoch": 6.086956521739131,
	"grad_norm": 1.9068461656570435,
	"learning_rate": 2.99999524358576e-05,
	"loss": 0.7654,
	"step": 140
	},
	{
	"epoch": 6.521739130434782,
	"grad_norm": 1.1220637559890747,
	"learning_rate": 2.9999925681049593e-05,
	"loss": 0.7857,
	"step": 150
	},
	{
	"epoch": 6.521739130434782,
	"eval_loss": 0.7963114976882935,
	"eval_runtime": 0.4908,
	"eval_samples_per_second": 20.374,
	"eval_steps_per_second": 20.374,
	"step": 150
	},
	{
	"epoch": 6.956521739130435,
	"grad_norm": 1.5331261157989502,
	"learning_rate": 2.9999892980750297e-05,
	"loss": 0.6585,
	"step": 160
	},
	{
	"epoch": 7.391304347826087,
	"grad_norm": 1.3447493314743042,
	"learning_rate": 2.9999854334972675e-05,
	"loss": 0.7388,
	"step": 170
	},
	{
	"epoch": 7.826086956521739,
	"grad_norm": 1.7259607315063477,
	"learning_rate": 2.999980974373204e-05,
	"loss": 0.7293,
	"step": 180
	},
	{
	"epoch": 8.26086956521739,
	"grad_norm": 1.5403547286987305,
	"learning_rate": 2.9999759207046075e-05,
	"loss": 0.6247,
	"step": 190
	},
	{
	"epoch": 8.695652173913043,
	"grad_norm": 1.7431354522705078,
	"learning_rate": 2.9999702724934804e-05,
	"loss": 0.6765,
	"step": 200
	},
	{
	"epoch": 9.130434782608695,
	"grad_norm": 1.0416122674942017,
	"learning_rate": 2.999964029742062e-05,
	"loss": 0.6523,
	"step": 210
	},
	{
	"epoch": 9.565217391304348,
	"grad_norm": 1.2200145721435547,
	"learning_rate": 2.9999571924528263e-05,
	"loss": 0.5592,
	"step": 220
	},
	{
	"epoch": 10.0,
	"grad_norm": 1.526785969734192,
	"learning_rate": 2.9999497606284837e-05,
	"loss": 0.756,
	"step": 230
	},
	{
	"epoch": 10.434782608695652,
	"grad_norm": 1.4215515851974487,
	"learning_rate": 2.9999417342719796e-05,
	"loss": 0.7117,
	"step": 240
	},
	{
	"epoch": 10.869565217391305,
	"grad_norm": 0.9789811372756958,
	"learning_rate": 2.9999331133864956e-05,
	"loss": 0.5896,
	"step": 250
	},
	{
	"epoch": 11.304347826086957,
	"grad_norm": 1.1944794654846191,
	"learning_rate": 2.9999238979754485e-05,
	"loss": 0.6547,
	"step": 260
	},
	{
	"epoch": 11.73913043478261,
	"grad_norm": 1.050191044807434,
	"learning_rate": 2.999914088042492e-05,
	"loss": 0.6475,
	"step": 270
	},
	{
	"epoch": 12.173913043478262,
	"grad_norm": 1.3121248483657837,
	"learning_rate": 2.9999036835915132e-05,
	"loss": 0.594,
	"step": 280
	},
	{
	"epoch": 12.608695652173914,
	"grad_norm": 1.082655906677246,
	"learning_rate": 2.9998926846266365e-05,
	"loss": 0.6326,
	"step": 290
	},
	{
	"epoch": 13.043478260869565,
	"grad_norm": 1.3888633251190186,
	"learning_rate": 2.9998810911522213e-05,
	"loss": 0.5806,
	"step": 300
	},
	{
	"epoch": 13.043478260869565,
	"eval_loss": 0.7309322357177734,
	"eval_runtime": 0.5145,
	"eval_samples_per_second": 19.436,
	"eval_steps_per_second": 19.436,
	"step": 300
	},
	{
	"Start_State_loss": 0.7309322357177734,
	"Start_State_runtime": 0.554,
	"Start_State_samples_per_second": 18.05,
	"Start_State_steps_per_second": 18.05,
	"epoch": 13.043478260869565,
	"step": 300
	},
	{
	"SWA_loss": 0.7309322357177734,
	"SWA_runtime": 0.5629,
	"SWA_samples_per_second": 17.765,
	"SWA_steps_per_second": 17.765,
	"epoch": 13.043478260869565,
	"step": 300
	},
	{
	"EMA_loss": 0.7309322357177734,
	"EMA_runtime": 0.5474,
	"EMA_samples_per_second": 18.268,
	"EMA_steps_per_second": 18.268,
	"epoch": 13.043478260869565,
	"step": 300
	},
	{
	"epoch": 13.478260869565217,
	"grad_norm": 1.7805578708648682,
	"learning_rate": 2.9998689031728636e-05,
	"loss": 0.5145,
	"step": 310
	},
	{
	"epoch": 13.91304347826087,
	"grad_norm": 1.533318042755127,
	"learning_rate": 2.9998561206933938e-05,
	"loss": 0.6497,
	"step": 320
	},
	{
	"epoch": 14.347826086956522,
	"grad_norm": 1.4810696840286255,
	"learning_rate": 2.9998427437188786e-05,
	"loss": 0.5741,
	"step": 330
	},
	{
	"epoch": 14.782608695652174,
	"grad_norm": 1.3101780414581299,
	"learning_rate": 2.99982877225462e-05,
	"loss": 0.6013,
	"step": 340
	},
	{
	"epoch": 15.217391304347826,
	"grad_norm": 0.9747373461723328,
	"learning_rate": 2.9998142063061564e-05,
	"loss": 0.4991,
	"step": 350
	},
	{
	"epoch": 15.652173913043478,
	"grad_norm": 1.6347649097442627,
	"learning_rate": 2.9997990458792603e-05,
	"loss": 0.5624,
	"step": 360
	},
	{
	"epoch": 16.08695652173913,
	"grad_norm": 1.6364760398864746,
	"learning_rate": 2.9997832909799417e-05,
	"loss": 0.667,
	"step": 370
	},
	{
	"epoch": 16.52173913043478,
	"grad_norm": 0.9518026113510132,
	"learning_rate": 2.9997669416144452e-05,
	"loss": 0.513,
	"step": 380
	},
	{
	"epoch": 16.956521739130434,
	"grad_norm": 0.9366481304168701,
	"learning_rate": 2.999749997789251e-05,
	"loss": 0.5798,
	"step": 390
	},
	{
	"epoch": 17.391304347826086,
	"grad_norm": 1.1163969039916992,
	"learning_rate": 2.9997324595110743e-05,
	"loss": 0.518,
	"step": 400
	},
	{
	"epoch": 17.82608695652174,
	"grad_norm": 1.2849133014678955,
	"learning_rate": 2.9997143267868683e-05,
	"loss": 0.5877,
	"step": 410
	},
	{
	"epoch": 18.26086956521739,
	"grad_norm": 1.1642106771469116,
	"learning_rate": 2.9996955996238192e-05,
	"loss": 0.506,
	"step": 420
	},
	{
	"epoch": 18.695652173913043,
	"grad_norm": 1.1996164321899414,
	"learning_rate": 2.9996762780293503e-05,
	"loss": 0.5315,
	"step": 430
	},
	{
	"epoch": 19.130434782608695,
	"grad_norm": 1.214064121246338,
	"learning_rate": 2.9996563620111197e-05,
	"loss": 0.5334,
	"step": 440
	},
	{
	"epoch": 19.565217391304348,
	"grad_norm": 1.4286197423934937,
	"learning_rate": 2.9996358515770218e-05,
	"loss": 0.5677,
	"step": 450
	},
	{
	"epoch": 19.565217391304348,
	"eval_loss": 0.717534065246582,
	"eval_runtime": 0.5321,
	"eval_samples_per_second": 18.792,
	"eval_steps_per_second": 18.792,
	"step": 450
	},
	{
	"Start_State_loss": 0.7309322357177734,
	"Start_State_runtime": 0.4001,
	"Start_State_samples_per_second": 24.995,
	"Start_State_steps_per_second": 24.995,
	"epoch": 19.565217391304348,
	"step": 450
	},
	{
	"Raw_Model_loss": 0.717534065246582,
	"Raw_Model_runtime": 0.4009,
	"Raw_Model_samples_per_second": 24.944,
	"Raw_Model_steps_per_second": 24.944,
	"epoch": 19.565217391304348,
	"step": 450
	},
	{
	"SWA_loss": 0.7233762741088867,
	"SWA_runtime": 0.3946,
	"SWA_samples_per_second": 25.341,
	"SWA_steps_per_second": 25.341,
	"epoch": 19.565217391304348,
	"step": 450
	},
	{
	"EMA_loss": 0.7309414744377136,
	"EMA_runtime": 0.4041,
	"EMA_samples_per_second": 24.748,
	"EMA_steps_per_second": 24.748,
	"epoch": 19.565217391304348,
	"step": 450
	},
	{
	"epoch": 20.0,
	"grad_norm": 2.118807792663574,
	"learning_rate": 2.9996147467351856e-05,
	"loss": 0.5147,
	"step": 460
	},
	{
	"epoch": 20.434782608695652,
	"grad_norm": 1.2774548530578613,
	"learning_rate": 2.9995930474939773e-05,
	"loss": 0.4785,
	"step": 470
	},
	{
	"epoch": 20.869565217391305,
	"grad_norm": 1.4731013774871826,
	"learning_rate": 2.9995707538619975e-05,
	"loss": 0.5703,
	"step": 480
	},
	{
	"epoch": 21.304347826086957,
	"grad_norm": 1.3251285552978516,
	"learning_rate": 2.9995478658480822e-05,
	"loss": 0.5164,
	"step": 490
	},
	{
	"epoch": 21.73913043478261,
	"grad_norm": 1.2412965297698975,
	"learning_rate": 2.9995243834613043e-05,
	"loss": 0.5204,
	"step": 500
	},
	{
	"epoch": 22.17391304347826,
	"grad_norm": 1.7840219736099243,
	"learning_rate": 2.9995003067109707e-05,
	"loss": 0.4838,
	"step": 510
	},
	{
	"epoch": 22.608695652173914,
	"grad_norm": 1.5308188199996948,
	"learning_rate": 2.9994756356066246e-05,
	"loss": 0.5616,
	"step": 520
	},
	{
	"epoch": 23.043478260869566,
	"grad_norm": 1.7345212697982788,
	"learning_rate": 2.999450370158046e-05,
	"loss": 0.4929,
	"step": 530
	},
	{
	"epoch": 23.47826086956522,
	"grad_norm": 1.3111943006515503,
	"learning_rate": 2.9994245103752478e-05,
	"loss": 0.4384,
	"step": 540
	},
	{
	"epoch": 23.91304347826087,
	"grad_norm": 1.234527349472046,
	"learning_rate": 2.999398056268481e-05,
	"loss": 0.5266,
	"step": 550
	},
	{
	"epoch": 24.347826086956523,
	"grad_norm": 1.4057211875915527,
	"learning_rate": 2.9993710078482306e-05,
	"loss": 0.5204,
	"step": 560
	},
	{
	"epoch": 24.782608695652176,
	"grad_norm": 0.9548116326332092,
	"learning_rate": 2.9993433651252185e-05,
	"loss": 0.4428,
	"step": 570
	},
	{
	"epoch": 25.217391304347824,
	"grad_norm": 1.7164983749389648,
	"learning_rate": 2.9993151281104006e-05,
	"loss": 0.5329,
	"step": 580
	},
	{
	"epoch": 25.652173913043477,
	"grad_norm": 1.1313426494598389,
	"learning_rate": 2.9992862968149695e-05,
	"loss": 0.4733,
	"step": 590
	},
	{
	"epoch": 26.08695652173913,
	"grad_norm": 1.1755690574645996,
	"learning_rate": 2.9992568712503533e-05,
	"loss": 0.4607,
	"step": 600
	},
	{
	"epoch": 26.08695652173913,
	"eval_loss": 0.7199033498764038,
	"eval_runtime": 0.3999,
	"eval_samples_per_second": 25.009,
	"eval_steps_per_second": 25.009,
	"step": 600
	},
	{
	"Start_State_loss": 0.7309322357177734,
	"Start_State_runtime": 0.3949,
	"Start_State_samples_per_second": 25.321,
	"Start_State_steps_per_second": 25.321,
	"epoch": 26.08695652173913,
	"step": 600
	},
	{
	"Raw_Model_loss": 0.7199033498764038,
	"Raw_Model_runtime": 0.3918,
	"Raw_Model_samples_per_second": 25.525,
	"Raw_Model_steps_per_second": 25.525,
	"epoch": 26.08695652173913,
	"step": 600
	},
	{
	"SWA_loss": 0.7180979251861572,
	"SWA_runtime": 0.3964,
	"SWA_samples_per_second": 25.23,
	"SWA_steps_per_second": 25.23,
	"epoch": 26.08695652173913,
	"step": 600
	},
	{
	"EMA_loss": 0.7309598326683044,
	"EMA_runtime": 0.3932,
	"EMA_samples_per_second": 25.43,
	"EMA_steps_per_second": 25.43,
	"epoch": 26.08695652173913,
	"step": 600
	},
	{
	"epoch": 26.52173913043478,
	"grad_norm": 1.095847249031067,
	"learning_rate": 2.9992268514282142e-05,
	"loss": 0.5118,
	"step": 610
	},
	{
	"epoch": 26.956521739130434,
	"grad_norm": 1.3382961750030518,
	"learning_rate": 2.999196237360452e-05,
	"loss": 0.4316,
	"step": 620
	},
	{
	"epoch": 27.391304347826086,
	"grad_norm": 1.2023630142211914,
	"learning_rate": 2.9991650290592016e-05,
	"loss": 0.4756,
	"step": 630
	},
	{
	"epoch": 27.82608695652174,
	"grad_norm": 1.3882129192352295,
	"learning_rate": 2.999133226536832e-05,
	"loss": 0.5011,
	"step": 640
	},
	{
	"epoch": 28.26086956521739,
	"grad_norm": 1.4160760641098022,
	"learning_rate": 2.9991008298059493e-05,
	"loss": 0.4106,
	"step": 650
	},
	{
	"epoch": 28.695652173913043,
	"grad_norm": 1.5552334785461426,
	"learning_rate": 2.9990678388793944e-05,
	"loss": 0.5064,
	"step": 660
	},
	{
	"epoch": 29.130434782608695,
	"grad_norm": 1.3141825199127197,
	"learning_rate": 2.999034253770244e-05,
	"loss": 0.4349,
	"step": 670
	},
	{
	"epoch": 29.565217391304348,
	"grad_norm": 1.0743430852890015,
	"learning_rate": 2.9990000744918097e-05,
	"loss": 0.4704,
	"step": 680
	},
	{
	"epoch": 30.0,
	"grad_norm": 2.4982922077178955,
	"learning_rate": 2.9989653010576392e-05,
	"loss": 0.4144,
	"step": 690
	},
	{
	"epoch": 30.434782608695652,
	"grad_norm": 1.3296608924865723,
	"learning_rate": 2.9989299334815158e-05,
	"loss": 0.4766,
	"step": 700
	},
	{
	"epoch": 30.869565217391305,
	"grad_norm": 1.62749445438385,
	"learning_rate": 2.9988939717774578e-05,
	"loss": 0.412,
	"step": 710
	},
	{
	"epoch": 31.304347826086957,
	"grad_norm": 0.9021294116973877,
	"learning_rate": 2.9988574159597194e-05,
	"loss": 0.4246,
	"step": 720
	},
	{
	"epoch": 31.73913043478261,
	"grad_norm": 1.641708254814148,
	"learning_rate": 2.9988202660427907e-05,
	"loss": 0.4827,
	"step": 730
	},
	{
	"epoch": 32.17391304347826,
	"grad_norm": 1.1983932256698608,
	"learning_rate": 2.9987825220413958e-05,
	"loss": 0.4382,
	"step": 740
	},
	{
	"epoch": 32.608695652173914,
	"grad_norm": 1.765030026435852,
	"learning_rate": 2.998744183970496e-05,
	"loss": 0.4731,
	"step": 750
	},
	{
	"epoch": 32.608695652173914,
	"eval_loss": 0.7314910888671875,
	"eval_runtime": 0.4917,
	"eval_samples_per_second": 20.337,
	"eval_steps_per_second": 20.337,
	"step": 750
	},
	{
	"Start_State_loss": 0.7309322357177734,
	"Start_State_runtime": 0.4719,
	"Start_State_samples_per_second": 21.19,
	"Start_State_steps_per_second": 21.19,
	"epoch": 32.608695652173914,
	"step": 750
	},
	{
	"Raw_Model_loss": 0.7314910888671875,
	"Raw_Model_runtime": 0.4163,
	"Raw_Model_samples_per_second": 24.018,
	"Raw_Model_steps_per_second": 24.018,
	"epoch": 32.608695652173914,
	"step": 750
	},
	{
	"SWA_loss": 0.7177615761756897,
	"SWA_runtime": 0.3941,
	"SWA_samples_per_second": 25.374,
	"SWA_steps_per_second": 25.374,
	"epoch": 32.608695652173914,
	"step": 750
	},
	{
	"EMA_loss": 0.7303470969200134,
	"EMA_runtime": 0.407,
	"EMA_samples_per_second": 24.567,
	"EMA_steps_per_second": 24.567,
	"epoch": 32.608695652173914,
	"step": 750
	},
	{
	"epoch": 33.04347826086956,
	"grad_norm": 1.495551347732544,
	"learning_rate": 2.998705251845287e-05,
	"loss": 0.4299,
	"step": 760
	},
	{
	"epoch": 33.47826086956522,
	"grad_norm": 1.644679069519043,
	"learning_rate": 2.9986657256812e-05,
	"loss": 0.4302,
	"step": 770
	},
	{
	"epoch": 33.91304347826087,
	"grad_norm": 1.3021020889282227,
	"learning_rate": 2.9986256054939022e-05,
	"loss": 0.4078,
	"step": 780
	},
	{
	"epoch": 34.34782608695652,
	"grad_norm": 1.483847975730896,
	"learning_rate": 2.9985848912992956e-05,
	"loss": 0.4026,
	"step": 790
	},
	{
	"epoch": 34.78260869565217,
	"grad_norm": 1.5579402446746826,
	"learning_rate": 2.9985435831135184e-05,
	"loss": 0.3833,
	"step": 800
	},
	{
	"epoch": 35.21739130434783,
	"grad_norm": 1.3132578134536743,
	"learning_rate": 2.9985016809529437e-05,
	"loss": 0.4742,
	"step": 810
	},
	{
	"epoch": 35.65217391304348,
	"grad_norm": 1.332205891609192,
	"learning_rate": 2.9984591848341806e-05,
	"loss": 0.4028,
	"step": 820
	},
	{
	"epoch": 36.08695652173913,
	"grad_norm": 1.0762503147125244,
	"learning_rate": 2.9984160947740723e-05,
	"loss": 0.4181,
	"step": 830
	},
	{
	"epoch": 36.52173913043478,
	"grad_norm": 1.1693116426467896,
	"learning_rate": 2.9983724107896993e-05,
	"loss": 0.3803,
	"step": 840
	},
	{
	"epoch": 36.95652173913044,
	"grad_norm": 1.4850109815597534,
	"learning_rate": 2.9983281328983757e-05,
	"loss": 0.4498,
	"step": 850
	},
	{
	"epoch": 37.391304347826086,
	"grad_norm": 1.8984379768371582,
	"learning_rate": 2.9982832611176523e-05,
	"loss": 0.4182,
	"step": 860
	},
	{
	"epoch": 37.82608695652174,
	"grad_norm": 1.2748432159423828,
	"learning_rate": 2.998237795465315e-05,
	"loss": 0.3716,
	"step": 870
	},
	{
	"epoch": 38.26086956521739,
	"grad_norm": 1.268835186958313,
	"learning_rate": 2.9981917359593843e-05,
	"loss": 0.4011,
	"step": 880
	},
	{
	"epoch": 38.69565217391305,
	"grad_norm": 1.446075439453125,
	"learning_rate": 2.9981450826181172e-05,
	"loss": 0.3551,
	"step": 890
	},
	{
	"epoch": 39.130434782608695,
	"grad_norm": 1.9400171041488647,
	"learning_rate": 2.9980978354600057e-05,
	"loss": 0.4631,
	"step": 900
	},
	{
	"epoch": 39.130434782608695,
	"eval_loss": 0.7510843276977539,
	"eval_runtime": 0.4747,
	"eval_samples_per_second": 21.065,
	"eval_steps_per_second": 21.065,
	"step": 900
	},
	{
	"Start_State_loss": 0.7309322357177734,
	"Start_State_runtime": 0.4104,
	"Start_State_samples_per_second": 24.364,
	"Start_State_steps_per_second": 24.364,
	"epoch": 39.130434782608695,
	"step": 900
	},
	{
	"Raw_Model_loss": 0.7510843276977539,
	"Raw_Model_runtime": 0.4105,
	"Raw_Model_samples_per_second": 24.36,
	"Raw_Model_steps_per_second": 24.36,
	"epoch": 39.130434782608695,
	"step": 900
	},
	{
	"SWA_loss": 0.7209179997444153,
	"SWA_runtime": 0.4135,
	"SWA_samples_per_second": 24.182,
	"SWA_steps_per_second": 24.182,
	"epoch": 39.130434782608695,
	"step": 900
	},
	{
	"EMA_loss": 0.7315307259559631,
	"EMA_runtime": 0.4077,
	"EMA_samples_per_second": 24.527,
	"EMA_steps_per_second": 24.527,
	"epoch": 39.130434782608695,
	"step": 900
	},
	{
	"epoch": 39.56521739130435,
	"grad_norm": 1.5488810539245605,
	"learning_rate": 2.9980499945037765e-05,
	"loss": 0.3835,
	"step": 910
	},
	{
	"epoch": 40.0,
	"grad_norm": 3.038804769515991,
	"learning_rate": 2.998001559768393e-05,
	"loss": 0.3862,
	"step": 920
	},
	{
	"epoch": 40.43478260869565,
	"grad_norm": 1.543023705482483,
	"learning_rate": 2.9979525312730525e-05,
	"loss": 0.4491,
	"step": 930
	},
	{
	"epoch": 40.869565217391305,
	"grad_norm": 1.6729778051376343,
	"learning_rate": 2.9979029090371885e-05,
	"loss": 0.3409,
	"step": 940
	},
	{
	"epoch": 41.30434782608695,
	"grad_norm": 2.224083662033081,
	"learning_rate": 2.99785269308047e-05,
	"loss": 0.3417,
	"step": 950
	},
	{
	"epoch": 41.73913043478261,
	"grad_norm": 1.5069278478622437,
	"learning_rate": 2.9978018834228007e-05,
	"loss": 0.3647,
	"step": 960
	},
	{
	"epoch": 42.17391304347826,
	"grad_norm": 1.5148930549621582,
	"learning_rate": 2.9977504800843197e-05,
	"loss": 0.4348,
	"step": 970
	},
	{
	"epoch": 42.608695652173914,
	"grad_norm": 1.5450372695922852,
	"learning_rate": 2.9976984830854022e-05,
	"loss": 0.3751,
	"step": 980
	},
	{
	"epoch": 43.04347826086956,
	"grad_norm": 1.6492244005203247,
	"learning_rate": 2.997645892446658e-05,
	"loss": 0.3672,
	"step": 990
	},
	{
	"epoch": 43.47826086956522,
	"grad_norm": 1.4176095724105835,
	"learning_rate": 2.9975927081889322e-05,
	"loss": 0.3908,
	"step": 1000
	},
	{
	"epoch": 43.91304347826087,
	"grad_norm": 1.192176342010498,
	"learning_rate": 2.9975389303333047e-05,
	"loss": 0.3461,
	"step": 1010
	},
	{
	"epoch": 44.34782608695652,
	"grad_norm": 2.0878190994262695,
	"learning_rate": 2.997484558901093e-05,
	"loss": 0.3918,
	"step": 1020
	},
	{
	"epoch": 44.78260869565217,
	"grad_norm": 1.6980842351913452,
	"learning_rate": 2.9974295939138465e-05,
	"loss": 0.3809,
	"step": 1030
	},
	{
	"epoch": 45.21739130434783,
	"grad_norm": 1.2522655725479126,
	"learning_rate": 2.9973740353933523e-05,
	"loss": 0.2649,
	"step": 1040
	},
	{
	"epoch": 45.65217391304348,
	"grad_norm": 1.678786039352417,
	"learning_rate": 2.997317883361632e-05,
	"loss": 0.3613,
	"step": 1050
	},
	{
	"epoch": 45.65217391304348,
	"eval_loss": 0.777652382850647,
	"eval_runtime": 0.507,
	"eval_samples_per_second": 19.723,
	"eval_steps_per_second": 19.723,
	"step": 1050
	},
	{
	"Start_State_loss": 0.7309322357177734,
	"Start_State_runtime": 0.452,
	"Start_State_samples_per_second": 22.123,
	"Start_State_steps_per_second": 22.123,
	"epoch": 45.65217391304348,
	"step": 1050
	},
	{
	"Raw_Model_loss": 0.777652382850647,
	"Raw_Model_runtime": 0.4426,
	"Raw_Model_samples_per_second": 22.594,
	"Raw_Model_steps_per_second": 22.594,
	"epoch": 45.65217391304348,
	"step": 1050
	},
	{
	"SWA_loss": 0.7228068709373474,
	"SWA_runtime": 0.4408,
	"SWA_samples_per_second": 22.684,
	"SWA_steps_per_second": 22.684,
	"epoch": 45.65217391304348,
	"step": 1050
	},
	{
	"EMA_loss": 0.7306644320487976,
	"EMA_runtime": 0.3993,
	"EMA_samples_per_second": 25.044,
	"EMA_steps_per_second": 25.044,
	"epoch": 45.65217391304348,
	"step": 1050
	},
	{
	"epoch": 46.08695652173913,
	"grad_norm": 1.7957395315170288,
	"learning_rate": 2.997261137840943e-05,
	"loss": 0.4103,
	"step": 1060
	},
	{
	"epoch": 46.52173913043478,
	"grad_norm": 2.156790256500244,
	"learning_rate": 2.9972037988537758e-05,
	"loss": 0.3785,
	"step": 1070
	},
	{
	"epoch": 46.95652173913044,
	"grad_norm": 1.9486017227172852,
	"learning_rate": 2.9971458664228595e-05,
	"loss": 0.3324,
	"step": 1080
	},
	{
	"epoch": 47.391304347826086,
	"grad_norm": 2.1510581970214844,
	"learning_rate": 2.997087340571156e-05,
	"loss": 0.3368,
	"step": 1090
	},
	{
	"epoch": 47.82608695652174,
	"grad_norm": 1.5172206163406372,
	"learning_rate": 2.997028221321863e-05,
	"loss": 0.3563,
	"step": 1100
	},
	{
	"epoch": 48.26086956521739,
	"grad_norm": 2.3161354064941406,
	"learning_rate": 2.9969685086984132e-05,
	"loss": 0.3734,
	"step": 1110
	},
	{
	"epoch": 48.69565217391305,
	"grad_norm": 1.6685658693313599,
	"learning_rate": 2.9969082027244755e-05,
	"loss": 0.3001,
	"step": 1120
	},
	{
	"epoch": 49.130434782608695,
	"grad_norm": 1.843396782875061,
	"learning_rate": 2.996847303423953e-05,
	"loss": 0.4154,
	"step": 1130
	},
	{
	"epoch": 49.56521739130435,
	"grad_norm": 1.3093624114990234,
	"learning_rate": 2.9967858108209838e-05,
	"loss": 0.3713,
	"step": 1140
	},
	{
	"epoch": 50.0,
	"grad_norm": 2.944302797317505,
	"learning_rate": 2.9967237249399417e-05,
	"loss": 0.292,
	"step": 1150
	},
	{
	"epoch": 50.43478260869565,
	"grad_norm": 1.5263242721557617,
	"learning_rate": 2.996661045805436e-05,
	"loss": 0.2961,
	"step": 1160
	},
	{
	"epoch": 50.869565217391305,
	"grad_norm": 1.8892343044281006,
	"learning_rate": 2.9965977734423106e-05,
	"loss": 0.3417,
	"step": 1170
	},
	{
	"epoch": 51.30434782608695,
	"grad_norm": 1.9361391067504883,
	"learning_rate": 2.9965339078756445e-05,
	"loss": 0.3541,
	"step": 1180
	},
	{
	"epoch": 51.73913043478261,
	"grad_norm": 1.1996322870254517,
	"learning_rate": 2.9964694491307514e-05,
	"loss": 0.2802,
	"step": 1190
	},
	{
	"epoch": 52.17391304347826,
	"grad_norm": 2.5254251956939697,
	"learning_rate": 2.996404397233182e-05,
	"loss": 0.4085,
	"step": 1200
	},
	{
	"epoch": 52.17391304347826,
	"eval_loss": 0.8014206886291504,
	"eval_runtime": 0.508,
	"eval_samples_per_second": 19.686,
	"eval_steps_per_second": 19.686,
	"step": 1200
	},
	{
	"Start_State_loss": 0.7309322357177734,
	"Start_State_runtime": 0.4,
	"Start_State_samples_per_second": 25.001,
	"Start_State_steps_per_second": 25.001,
	"epoch": 52.17391304347826,
	"step": 1200
	},
	{
	"Raw_Model_loss": 0.8014206886291504,
	"Raw_Model_runtime": 0.4013,
	"Raw_Model_samples_per_second": 24.922,
	"Raw_Model_steps_per_second": 24.922,
	"epoch": 52.17391304347826,
	"step": 1200
	},
	{
	"SWA_loss": 0.7281149625778198,
	"SWA_runtime": 0.4076,
	"SWA_samples_per_second": 24.535,
	"SWA_steps_per_second": 24.535,
	"epoch": 52.17391304347826,
	"step": 1200
	},
	{
	"EMA_loss": 0.7309959530830383,
	"EMA_runtime": 0.415,
	"EMA_samples_per_second": 24.094,
	"EMA_steps_per_second": 24.094,
	"epoch": 52.17391304347826,
	"step": 1200
	},
	{
	"epoch": 52.608695652173914,
	"grad_norm": 1.616297721862793,
	"learning_rate": 1.4982021986165911e-06,
	"loss": 0.2865,
	"step": 1210
	},
	{
	"epoch": 53.04347826086956,
	"grad_norm": 1.7724196910858154,
	"learning_rate": 2.9964043972331822e-06,
	"loss": 0.3649,
	"step": 1220
	},
	{
	"epoch": 53.47826086956522,
	"grad_norm": 1.3953560590744019,
	"learning_rate": 4.494606595849773e-06,
	"loss": 0.2719,
	"step": 1230
	},
	{
	"epoch": 53.91304347826087,
	"grad_norm": 1.9502956867218018,
	"learning_rate": 5.9928087944663644e-06,
	"loss": 0.334,
	"step": 1240
	},
	{
	"epoch": 54.34782608695652,
	"grad_norm": 1.9493101835250854,
	"learning_rate": 7.491010993082955e-06,
	"loss": 0.385,
	"step": 1250
	},
	{
	"epoch": 54.78260869565217,
	"grad_norm": 1.1656595468521118,
	"learning_rate": 8.989213191699545e-06,
	"loss": 0.284,
	"step": 1260
	},
	{
	"epoch": 55.21739130434783,
	"grad_norm": 1.5772318840026855,
	"learning_rate": 1.0487415390316136e-05,
	"loss": 0.3105,
	"step": 1270
	},
	{
	"epoch": 55.65217391304348,
	"grad_norm": 1.708022117614746,
	"learning_rate": 1.1985617588932729e-05,
	"loss": 0.3553,
	"step": 1280
	},
	{
	"epoch": 56.08695652173913,
	"grad_norm": 2.282125473022461,
	"learning_rate": 1.348381978754932e-05,
	"loss": 0.2844,
	"step": 1290
	},
	{
	"epoch": 56.52173913043478,
	"grad_norm": 1.458001971244812,
	"learning_rate": 1.498202198616591e-05,
	"loss": 0.3387,
	"step": 1300
	},
	{
	"epoch": 56.95652173913044,
	"grad_norm": 1.9748072624206543,
	"learning_rate": 1.4982020501567203e-05,
	"loss": 0.3318,
	"step": 1310
	},
	{
	"epoch": 57.391304347826086,
	"grad_norm": 1.4179987907409668,
	"learning_rate": 1.4982016047771664e-05,
	"loss": 0.3109,
	"step": 1320
	},
	{
	"epoch": 57.82608695652174,
	"grad_norm": 2.815448522567749,
	"learning_rate": 1.4982008624781062e-05,
	"loss": 0.3369,
	"step": 1330
	},
	{
	"epoch": 58.26086956521739,
	"grad_norm": 1.4394376277923584,
	"learning_rate": 1.4981998232598337e-05,
	"loss": 0.3303,
	"step": 1340
	},
	{
	"epoch": 58.69565217391305,
	"grad_norm": 1.8707002401351929,
	"learning_rate": 1.4981984871227611e-05,
	"loss": 0.3077,
	"step": 1350
	},
	{
	"epoch": 58.69565217391305,
	"eval_loss": 0.8195747137069702,
	"eval_runtime": 0.4109,
	"eval_samples_per_second": 24.335,
	"eval_steps_per_second": 24.335,
	"step": 1350
	},
	{
	"Start_State_loss": 0.7309322357177734,
	"Start_State_runtime": 0.3897,
	"Start_State_samples_per_second": 25.658,
	"Start_State_steps_per_second": 25.658,
	"epoch": 58.69565217391305,
	"step": 1350
	},
	{
	"Raw_Model_loss": 0.8195747137069702,
	"Raw_Model_runtime": 0.3979,
	"Raw_Model_samples_per_second": 25.132,
	"Raw_Model_steps_per_second": 25.132,
	"epoch": 58.69565217391305,
	"step": 1350
	},
	{
	"SWA_loss": 0.7320815324783325,
	"SWA_runtime": 0.395,
	"SWA_samples_per_second": 25.319,
	"SWA_steps_per_second": 25.319,
	"epoch": 58.69565217391305,
	"step": 1350
	},
	{
	"EMA_loss": 0.7313606142997742,
	"EMA_runtime": 0.3882,
	"EMA_samples_per_second": 25.759,
	"EMA_steps_per_second": 25.759,
	"epoch": 58.69565217391305,
	"step": 1350
	},
	{
	"epoch": 59.130434782608695,
	"grad_norm": 1.5965052843093872,
	"learning_rate": 1.4981968540674177e-05,
	"loss": 0.3206,
	"step": 1360
	},
	{
	"epoch": 59.56521739130435,
	"grad_norm": 1.3822482824325562,
	"learning_rate": 1.4981949240944509e-05,
	"loss": 0.3011,
	"step": 1370
	},
	{
	"epoch": 60.0,
	"grad_norm": 1.6288312673568726,
	"learning_rate": 1.4981926972046258e-05,
	"loss": 0.3095,
	"step": 1380
	},
	{
	"epoch": 60.43478260869565,
	"grad_norm": 1.9036870002746582,
	"learning_rate": 1.498190173398825e-05,
	"loss": 0.3173,
	"step": 1390
	},
	{
	"epoch": 60.869565217391305,
	"grad_norm": 1.5387356281280518,
	"learning_rate": 1.4981873526780487e-05,
	"loss": 0.3054,
	"step": 1400
	},
	{
	"epoch": 61.30434782608695,
	"grad_norm": 1.4343056678771973,
	"learning_rate": 1.4981842350434152e-05,
	"loss": 0.3046,
	"step": 1410
	},
	{
	"epoch": 61.73913043478261,
	"grad_norm": 1.4938664436340332,
	"learning_rate": 1.49818082049616e-05,
	"loss": 0.3205,
	"step": 1420
	},
	{
	"epoch": 62.17391304347826,
	"grad_norm": 2.177480459213257,
	"learning_rate": 1.4981771090376367e-05,
	"loss": 0.2865,
	"step": 1430
	},
	{
	"epoch": 62.608695652173914,
	"grad_norm": 1.8865878582000732,
	"learning_rate": 1.4981731006693164e-05,
	"loss": 0.3213,
	"step": 1440
	},
	{
	"epoch": 63.04347826086956,
	"grad_norm": 1.3152176141738892,
	"learning_rate": 1.4981687953927875e-05,
	"loss": 0.3125,
	"step": 1450
	},
	{
	"epoch": 63.47826086956522,
	"grad_norm": 1.9965901374816895,
	"learning_rate": 1.498164193209757e-05,
	"loss": 0.345,
	"step": 1460
	},
	{
	"epoch": 63.91304347826087,
	"grad_norm": 1.6480698585510254,
	"learning_rate": 1.498159294122049e-05,
	"loss": 0.2924,
	"step": 1470
	},
	{
	"epoch": 64.34782608695652,
	"grad_norm": 1.8093769550323486,
	"learning_rate": 1.4981540981316052e-05,
	"loss": 0.2688,
	"step": 1480
	},
	{
	"epoch": 64.78260869565217,
	"grad_norm": 1.529961347579956,
	"learning_rate": 1.4981486052404848e-05,
	"loss": 0.3585,
	"step": 1490
	},
	{
	"epoch": 65.21739130434783,
	"grad_norm": 1.4079116582870483,
	"learning_rate": 1.4981428154508652e-05,
	"loss": 0.269,
	"step": 1500
	},
	{
	"epoch": 65.21739130434783,
	"eval_loss": 0.8343552350997925,
	"eval_runtime": 0.4105,
	"eval_samples_per_second": 24.363,
	"eval_steps_per_second": 24.363,
	"step": 1500
	},
	{
	"Start_State_loss": 0.7309322357177734,
	"Start_State_runtime": 0.4152,
	"Start_State_samples_per_second": 24.085,
	"Start_State_steps_per_second": 24.085,
	"epoch": 65.21739130434783,
	"step": 1500
	},
	{
	"Raw_Model_loss": 0.8343552350997925,
	"Raw_Model_runtime": 0.399,
	"Raw_Model_samples_per_second": 25.06,
	"Raw_Model_steps_per_second": 25.06,
	"epoch": 65.21739130434783,
	"step": 1500
	},
	{
	"SWA_loss": 0.7395024299621582,
	"SWA_runtime": 0.4083,
	"SWA_samples_per_second": 24.489,
	"SWA_steps_per_second": 24.489,
	"epoch": 65.21739130434783,
	"step": 1500
	},
	{
	"EMA_loss": 0.7315851449966431,
	"EMA_runtime": 0.4003,
	"EMA_samples_per_second": 24.979,
	"EMA_steps_per_second": 24.979,
	"epoch": 65.21739130434783,
	"step": 1500
	},
	{
	"epoch": 65.65217391304348,
	"grad_norm": 2.2492856979370117,
	"learning_rate": 1.4981367287650419e-05,
	"loss": 0.3161,
	"step": 1510
	},
	{
	"epoch": 66.08695652173913,
	"grad_norm": 1.7571766376495361,
	"learning_rate": 1.4981303451854267e-05,
	"loss": 0.2947,
	"step": 1520
	},
	{
	"epoch": 66.52173913043478,
	"grad_norm": 1.7509160041809082,
	"learning_rate": 1.4981236647145501e-05,
	"loss": 0.3107,
	"step": 1530
	},
	{
	"epoch": 66.95652173913044,
	"grad_norm": 2.094277858734131,
	"learning_rate": 1.4981166873550601e-05,
	"loss": 0.3051,
	"step": 1540
	},
	{
	"epoch": 67.3913043478261,
	"grad_norm": 1.7601019144058228,
	"learning_rate": 1.4981094131097224e-05,
	"loss": 0.2711,
	"step": 1550
	},
	{
	"epoch": 67.82608695652173,
	"grad_norm": 2.0073230266571045,
	"learning_rate": 1.49810184198142e-05,
	"loss": 0.3434,
	"step": 1560
	},
	{
	"epoch": 68.26086956521739,
	"grad_norm": 2.084998846054077,
	"learning_rate": 1.498093973973154e-05,
	"loss": 0.2506,
	"step": 1570
	},
	{
	"epoch": 68.69565217391305,
	"grad_norm": 1.8126795291900635,
	"learning_rate": 1.4980858090880429e-05,
	"loss": 0.286,
	"step": 1580
	},
	{
	"epoch": 69.1304347826087,
	"grad_norm": 1.9416148662567139,
	"learning_rate": 1.4980773473293232e-05,
	"loss": 0.3681,
	"step": 1590
	},
	{
	"epoch": 69.56521739130434,
	"grad_norm": 1.978805422782898,
	"learning_rate": 1.4980685887003486e-05,
	"loss": 0.3073,
	"step": 1600
	},
	{
	"epoch": 70.0,
	"grad_norm": 1.6534956693649292,
	"learning_rate": 1.498059533204591e-05,
	"loss": 0.2691,
	"step": 1610
	},
	{
	"epoch": 70.43478260869566,
	"grad_norm": 2.2284836769104004,
	"learning_rate": 1.4980501808456398e-05,
	"loss": 0.3139,
	"step": 1620
	},
	{
	"epoch": 70.8695652173913,
	"grad_norm": 1.9585868120193481,
	"learning_rate": 1.4980405316272018e-05,
	"loss": 0.2997,
	"step": 1630
	},
	{
	"epoch": 71.30434782608695,
	"grad_norm": 2.346238851547241,
	"learning_rate": 1.4980305855531015e-05,
	"loss": 0.2891,
	"step": 1640
	},
	{
	"epoch": 71.73913043478261,
	"grad_norm": 1.851641058921814,
	"learning_rate": 1.4980203426272815e-05,
	"loss": 0.2627,
	"step": 1650
	},
	{
	"epoch": 71.73913043478261,
	"eval_loss": 0.8489276766777039,
	"eval_runtime": 0.4811,
	"eval_samples_per_second": 20.784,
	"eval_steps_per_second": 20.784,
	"step": 1650
	},
	{
	"Start_State_loss": 0.7309322357177734,
	"Start_State_runtime": 0.4395,
	"Start_State_samples_per_second": 22.752,
	"Start_State_steps_per_second": 22.752,
	"epoch": 71.73913043478261,
	"step": 1650
	},
	{
	"Raw_Model_loss": 0.8489276766777039,
	"Raw_Model_runtime": 0.4362,
	"Raw_Model_samples_per_second": 22.928,
	"Raw_Model_steps_per_second": 22.928,
	"epoch": 71.73913043478261,
	"step": 1650
	},
	{
	"SWA_loss": 0.7444645166397095,
	"SWA_runtime": 0.4365,
	"SWA_samples_per_second": 22.911,
	"SWA_steps_per_second": 22.911,
	"epoch": 71.73913043478261,
	"step": 1650
	},
	{
	"EMA_loss": 0.7310957312583923,
	"EMA_runtime": 0.4342,
	"EMA_samples_per_second": 23.032,
	"EMA_steps_per_second": 23.032,
	"epoch": 71.73913043478261,
	"step": 1650
	},
	{
	"epoch": 72.17391304347827,
	"grad_norm": 1.65473210811615,
	"learning_rate": 1.4980098028538014e-05,
	"loss": 0.328,
	"step": 1660
	},
	{
	"epoch": 72.6086956521739,
	"grad_norm": 2.0884604454040527,
	"learning_rate": 1.4979989662368391e-05,
	"loss": 0.2959,
	"step": 1670
	},
	{
	"epoch": 73.04347826086956,
	"grad_norm": 1.906488299369812,
	"learning_rate": 1.4979878327806899e-05,
	"loss": 0.3098,
	"step": 1680
	},
	{
	"epoch": 73.47826086956522,
	"grad_norm": 2.01023530960083,
	"learning_rate": 1.4979764024897668e-05,
	"loss": 0.2878,
	"step": 1690
	},
	{
	"epoch": 73.91304347826087,
	"grad_norm": 1.8358246088027954,
	"learning_rate": 1.4979646753686002e-05,
	"loss": 0.2796,
	"step": 1700
	},
	{
	"epoch": 74.34782608695652,
	"grad_norm": 1.3833634853363037,
	"learning_rate": 1.4979526514218385e-05,
	"loss": 0.2769,
	"step": 1710
	},
	{
	"epoch": 74.78260869565217,
	"grad_norm": 1.5111050605773926,
	"learning_rate": 1.4979403306542473e-05,
	"loss": 0.3278,
	"step": 1720
	},
	{
	"epoch": 75.21739130434783,
	"grad_norm": 1.5712664127349854,
	"learning_rate": 1.4979277130707107e-05,
	"loss": 0.2338,
	"step": 1730
	},
	{
	"epoch": 75.65217391304348,
	"grad_norm": 1.660670280456543,
	"learning_rate": 1.4979147986762295e-05,
	"loss": 0.3144,
	"step": 1740
	},
	{
	"epoch": 76.08695652173913,
	"grad_norm": 1.8221240043640137,
	"learning_rate": 1.4979015874759227e-05,
	"loss": 0.2694,
	"step": 1750
	},
	{
	"epoch": 76.52173913043478,
	"grad_norm": 1.8922370672225952,
	"learning_rate": 1.4978880794750266e-05,
	"loss": 0.2665,
	"step": 1760
	},
	{
	"epoch": 76.95652173913044,
	"grad_norm": 1.296356201171875,
	"learning_rate": 1.4978742746788957e-05,
	"loss": 0.3007,
	"step": 1770
	},
	{
	"epoch": 77.3913043478261,
	"grad_norm": 1.8244571685791016,
	"learning_rate": 1.4978601730930014e-05,
	"loss": 0.2842,
	"step": 1780
	},
	{
	"epoch": 77.82608695652173,
	"grad_norm": 1.8345180749893188,
	"learning_rate": 1.4978457747229335e-05,
	"loss": 0.2714,
	"step": 1790
	},
	{
	"epoch": 78.26086956521739,
	"grad_norm": 1.850252389907837,
	"learning_rate": 1.497831079574399e-05,
	"loss": 0.3055,
	"step": 1800
	},
	{
	"epoch": 78.26086956521739,
	"eval_loss": 0.8643280267715454,
	"eval_runtime": 0.4359,
	"eval_samples_per_second": 22.94,
	"eval_steps_per_second": 22.94,
	"step": 1800
	},
	{
	"Start_State_loss": 0.7309322357177734,
	"Start_State_runtime": 0.4203,
	"Start_State_samples_per_second": 23.79,
	"Start_State_steps_per_second": 23.79,
	"epoch": 78.26086956521739,
	"step": 1800
	},
	{
	"Raw_Model_loss": 0.8643280267715454,
	"Raw_Model_runtime": 0.4033,
	"Raw_Model_samples_per_second": 24.798,
	"Raw_Model_steps_per_second": 24.798,
	"epoch": 78.26086956521739,
	"step": 1800
	},
	{
	"SWA_loss": 0.7512942552566528,
	"SWA_runtime": 0.3891,
	"SWA_samples_per_second": 25.698,
	"SWA_steps_per_second": 25.698,
	"epoch": 78.26086956521739,
	"step": 1800
	},
	{
	"EMA_loss": 0.7310723066329956,
	"EMA_runtime": 0.3904,
	"EMA_samples_per_second": 25.613,
	"EMA_steps_per_second": 25.613,
	"epoch": 78.26086956521739,
	"step": 1800
	},
	{
	"epoch": 78.69565217391305,
	"grad_norm": 1.2641412019729614,
	"learning_rate": 1.4978160876532222e-05,
	"loss": 0.2824,
	"step": 1810
	},
	{
	"epoch": 79.1304347826087,
	"grad_norm": 1.9765238761901855,
	"learning_rate": 1.4978007989653455e-05,
	"loss": 0.2406,
	"step": 1820
	},
	{
	"epoch": 79.56521739130434,
	"grad_norm": 1.5835498571395874,
	"learning_rate": 1.4977852135168293e-05,
	"loss": 0.2607,
	"step": 1830
	},
	{
	"epoch": 80.0,
	"grad_norm": 1.8932580947875977,
	"learning_rate": 1.4977693313138507e-05,
	"loss": 0.3036,
	"step": 1840
	},
	{
	"epoch": 80.43478260869566,
	"grad_norm": 2.1030030250549316,
	"learning_rate": 1.4977531523627054e-05,
	"loss": 0.2799,
	"step": 1850
	},
	{
	"epoch": 80.8695652173913,
	"grad_norm": 1.2366570234298706,
	"learning_rate": 1.4977366766698058e-05,
	"loss": 0.2792,
	"step": 1860
	},
	{
	"epoch": 81.30434782608695,
	"grad_norm": 1.5485888719558716,
	"learning_rate": 1.4977199042416822e-05,
	"loss": 0.2311,
	"step": 1870
	},
	{
	"epoch": 81.73913043478261,
	"grad_norm": 1.5375139713287354,
	"learning_rate": 1.4977028350849831e-05,
	"loss": 0.3059,
	"step": 1880
	},
	{
	"epoch": 82.17391304347827,
	"grad_norm": 1.6247549057006836,
	"learning_rate": 1.4976854692064739e-05,
	"loss": 0.2147,
	"step": 1890
	},
	{
	"epoch": 82.6086956521739,
	"grad_norm": 1.8154581785202026,
	"learning_rate": 1.497667806613038e-05,
	"loss": 0.2594,
	"step": 1900
	},
	{
	"epoch": 83.04347826086956,
	"grad_norm": 1.579021692276001,
	"learning_rate": 1.497649847311676e-05,
	"loss": 0.3002,
	"step": 1910
	},
	{
	"epoch": 83.47826086956522,
	"grad_norm": 1.4831469058990479,
	"learning_rate": 1.4976315913095068e-05,
	"loss": 0.265,
	"step": 1920
	},
	{
	"epoch": 83.91304347826087,
	"grad_norm": 2.305431842803955,
	"learning_rate": 1.4976130386137666e-05,
	"loss": 0.3039,
	"step": 1930
	},
	{
	"epoch": 84.34782608695652,
	"grad_norm": 1.720330834388733,
	"learning_rate": 1.4975941892318084e-05,
	"loss": 0.2642,
	"step": 1940
	},
	{
	"epoch": 84.78260869565217,
	"grad_norm": 2.2541563510894775,
	"learning_rate": 1.497575043171104e-05,
	"loss": 0.2798,
	"step": 1950
	},
	{
	"epoch": 84.78260869565217,
	"eval_loss": 0.8888376355171204,
	"eval_runtime": 0.4413,
	"eval_samples_per_second": 22.658,
	"eval_steps_per_second": 22.658,
	"step": 1950
	},
	{
	"Start_State_loss": 0.7309322357177734,
	"Start_State_runtime": 0.4289,
	"Start_State_samples_per_second": 23.318,
	"Start_State_steps_per_second": 23.318,
	"epoch": 84.78260869565217,
	"step": 1950
	},
	{
	"Raw_Model_loss": 0.8888376355171204,
	"Raw_Model_runtime": 0.427,
	"Raw_Model_samples_per_second": 23.417,
	"Raw_Model_steps_per_second": 23.417,
	"epoch": 84.78260869565217,
	"step": 1950
	},
	{
	"SWA_loss": 0.7555862665176392,
	"SWA_runtime": 0.402,
	"SWA_samples_per_second": 24.875,
	"SWA_steps_per_second": 24.875,
	"epoch": 84.78260869565217,
	"step": 1950
	},
	{
	"EMA_loss": 0.7317630052566528,
	"EMA_runtime": 0.3962,
	"EMA_samples_per_second": 25.24,
	"EMA_steps_per_second": 25.24,
	"epoch": 84.78260869565217,
	"step": 1950
	},
	{
	"epoch": 85.21739130434783,
	"grad_norm": 2.295459270477295,
	"learning_rate": 7.487875215855521e-07,
	"loss": 0.2648,
	"step": 1960
	},
	{
	"epoch": 85.65217391304348,
	"grad_norm": 1.9124552011489868,
	"learning_rate": 1.4975750431711041e-06,
	"loss": 0.2705,
	"step": 1970
	},
	{
	"epoch": 86.08695652173913,
	"grad_norm": 2.192692756652832,
	"learning_rate": 2.2463625647566557e-06,
	"loss": 0.2532,
	"step": 1980
	},
	{
	"epoch": 86.52173913043478,
	"grad_norm": 1.792695164680481,
	"learning_rate": 2.9951500863422082e-06,
	"loss": 0.2765,
	"step": 1990
	},
	{
	"epoch": 86.95652173913044,
	"grad_norm": 2.0215790271759033,
	"learning_rate": 3.74393760792776e-06,
	"loss": 0.2769,
	"step": 2000
	},
	{
	"epoch": 87.3913043478261,
	"grad_norm": 1.4278439283370972,
	"learning_rate": 4.4927251295133115e-06,
	"loss": 0.278,
	"step": 2010
	},
	{
	"epoch": 87.82608695652173,
	"grad_norm": 1.9748132228851318,
	"learning_rate": 5.241512651098863e-06,
	"loss": 0.2587,
	"step": 2020
	},
	{
	"epoch": 88.26086956521739,
	"grad_norm": 2.0187323093414307,
	"learning_rate": 5.9903001726844164e-06,
	"loss": 0.2613,
	"step": 2030
	},
	{
	"epoch": 88.69565217391305,
	"grad_norm": 1.7434452772140503,
	"learning_rate": 6.739087694269968e-06,
	"loss": 0.2851,
	"step": 2040
	},
	{
	"epoch": 89.1304347826087,
	"grad_norm": 1.828153371810913,
	"learning_rate": 7.48787521585552e-06,
	"loss": 0.2918,
	"step": 2050
	},
	{
	"epoch": 89.56521739130434,
	"grad_norm": 1.5711168050765991,
	"learning_rate": 7.487874473866896e-06,
	"loss": 0.247,
	"step": 2060
	},
	{
	"epoch": 90.0,
	"grad_norm": 1.6228244304656982,
	"learning_rate": 7.487872247901318e-06,
	"loss": 0.2522,
	"step": 2070
	},
	{
	"epoch": 90.43478260869566,
	"grad_norm": 1.863221526145935,
	"learning_rate": 7.4878685379596685e-06,
	"loss": 0.2577,
	"step": 2080
	},
	{
	"epoch": 90.8695652173913,
	"grad_norm": 1.7543621063232422,
	"learning_rate": 7.487863344043418e-06,
	"loss": 0.283,
	"step": 2090
	},
	{
	"epoch": 91.30434782608695,
	"grad_norm": 1.765681266784668,
	"learning_rate": 7.487856666154626e-06,
	"loss": 0.2727,
	"step": 2100
	},
	{
	"epoch": 91.30434782608695,
	"eval_loss": 0.8941524624824524,
	"eval_runtime": 0.5508,
	"eval_samples_per_second": 18.155,
	"eval_steps_per_second": 18.155,
	"step": 2100
	},
	{
	"Start_State_loss": 0.7309322357177734,
	"Start_State_runtime": 0.4064,
	"Start_State_samples_per_second": 24.603,
	"Start_State_steps_per_second": 24.603,
	"epoch": 91.30434782608695,
	"step": 2100
	},
	{
	"Raw_Model_loss": 0.8941524624824524,
	"Raw_Model_runtime": 0.4788,
	"Raw_Model_samples_per_second": 20.886,
	"Raw_Model_steps_per_second": 20.886,
	"epoch": 91.30434782608695,
	"step": 2100
	},
	{
	"SWA_loss": 0.7625434994697571,
	"SWA_runtime": 0.5432,
	"SWA_samples_per_second": 18.408,
	"SWA_steps_per_second": 18.408,
	"epoch": 91.30434782608695,
	"step": 2100
	},
	{
	"EMA_loss": 0.7306564450263977,
	"EMA_runtime": 0.4703,
	"EMA_samples_per_second": 21.264,
	"EMA_steps_per_second": 21.264,
	"epoch": 91.30434782608695,
	"step": 2100
	},
	{
	"epoch": 91.73913043478261,
	"grad_norm": 2.2282097339630127,
	"learning_rate": 7.487848504295937e-06,
	"loss": 0.2597,
	"step": 2110
	},
	{
	"epoch": 92.17391304347827,
	"grad_norm": 2.146618127822876,
	"learning_rate": 7.4878388584705885e-06,
	"loss": 0.2901,
	"step": 2120
	},
	{
	"epoch": 92.6086956521739,
	"grad_norm": 1.9365864992141724,
	"learning_rate": 7.487827728682402e-06,
	"loss": 0.2796,
	"step": 2130
	},
	{
	"epoch": 93.04347826086956,
	"grad_norm": 1.677370309829712,
	"learning_rate": 7.487815114935791e-06,
	"loss": 0.2375,
	"step": 2140
	},
	{
	"epoch": 93.47826086956522,
	"grad_norm": 1.871509075164795,
	"learning_rate": 7.487801017235753e-06,
	"loss": 0.289,
	"step": 2150
	},
	{
	"epoch": 93.91304347826087,
	"grad_norm": 2.1130902767181396,
	"learning_rate": 7.4877854355878785e-06,
	"loss": 0.2698,
	"step": 2160
	},
	{
	"epoch": 94.34782608695652,
	"grad_norm": 1.9688533544540405,
	"learning_rate": 7.487768369998342e-06,
	"loss": 0.2168,
	"step": 2170
	},
	{
	"epoch": 94.78260869565217,
	"grad_norm": 2.1728529930114746,
	"learning_rate": 7.4877498204739075e-06,
	"loss": 0.2961,
	"step": 2180
	},
	{
	"epoch": 95.21739130434783,
	"grad_norm": 2.192168712615967,
	"learning_rate": 7.487729787021927e-06,
	"loss": 0.2599,
	"step": 2190
	},
	{
	"epoch": 95.65217391304348,
	"grad_norm": 2.4115936756134033,
	"learning_rate": 7.487708269650342e-06,
	"loss": 0.2587,
	"step": 2200
	},
	{
	"epoch": 96.08695652173913,
	"grad_norm": 2.353425979614258,
	"learning_rate": 7.487685268367682e-06,
	"loss": 0.259,
	"step": 2210
	},
	{
	"epoch": 96.52173913043478,
	"grad_norm": 1.855171799659729,
	"learning_rate": 7.487660783183063e-06,
	"loss": 0.2681,
	"step": 2220
	},
	{
	"epoch": 96.95652173913044,
	"grad_norm": 2.1836190223693848,
	"learning_rate": 7.48763481410619e-06,
	"loss": 0.2607,
	"step": 2230
	},
	{
	"epoch": 97.3913043478261,
	"grad_norm": 1.6038516759872437,
	"learning_rate": 7.487607361147356e-06,
	"loss": 0.2881,
	"step": 2240
	},
	{
	"epoch": 97.82608695652173,
	"grad_norm": 1.3469552993774414,
	"learning_rate": 7.487578424317443e-06,
	"loss": 0.2524,
	"step": 2250
	},
	{
	"epoch": 97.82608695652173,
	"eval_loss": 0.9057046175003052,
	"eval_runtime": 0.4015,
	"eval_samples_per_second": 24.909,
	"eval_steps_per_second": 24.909,
	"step": 2250
	},
	{
	"Start_State_loss": 0.7309322357177734,
	"Start_State_runtime": 0.3966,
	"Start_State_samples_per_second": 25.217,
	"Start_State_steps_per_second": 25.217,
	"epoch": 97.82608695652173,
	"step": 2250
	},
	{
	"Raw_Model_loss": 0.9057046175003052,
	"Raw_Model_runtime": 0.3945,
	"Raw_Model_samples_per_second": 25.347,
	"Raw_Model_steps_per_second": 25.347,
	"epoch": 97.82608695652173,
	"step": 2250
	},
	{
	"SWA_loss": 0.7665841579437256,
	"SWA_runtime": 0.3965,
	"SWA_samples_per_second": 25.221,
	"SWA_steps_per_second": 25.221,
	"epoch": 97.82608695652173,
	"step": 2250
	},
	{
	"EMA_loss": 0.7307609915733337,
	"EMA_runtime": 0.402,
	"EMA_samples_per_second": 24.875,
	"EMA_steps_per_second": 24.875,
	"epoch": 97.82608695652173,
	"step": 2250
	},
	{
	"epoch": 98.26086956521739,
	"grad_norm": 1.9246830940246582,
	"learning_rate": 7.487548003627922e-06,
	"loss": 0.2415,
	"step": 2260
	},
	{
	"epoch": 98.69565217391305,
	"grad_norm": 1.7473000288009644,
	"learning_rate": 7.487516099090849e-06,
	"loss": 0.278,
	"step": 2270
	},
	{
	"epoch": 99.1304347826087,
	"grad_norm": 2.0333516597747803,
	"learning_rate": 7.48748271071887e-06,
	"loss": 0.2488,
	"step": 2280
	},
	{
	"epoch": 99.56521739130434,
	"grad_norm": 2.3631269931793213,
	"learning_rate": 7.48744783852522e-06,
	"loss": 0.2882,
	"step": 2290
	},
	{
	"epoch": 100.0,
	"grad_norm": 2.6425907611846924,
	"learning_rate": 7.487411482523721e-06,
	"loss": 0.2322,
	"step": 2300
	},
	{
	"epoch": 100.43478260869566,
	"grad_norm": 2.703728437423706,
	"learning_rate": 7.4873736427287825e-06,
	"loss": 0.2371,
	"step": 2310
	},
	{
	"epoch": 100.8695652173913,
	"grad_norm": 1.7555862665176392,
	"learning_rate": 7.487334319155404e-06,
	"loss": 0.2697,
	"step": 2320
	},
	{
	"epoch": 101.30434782608695,
	"grad_norm": 2.5154976844787598,
	"learning_rate": 7.487293511819172e-06,
	"loss": 0.2417,
	"step": 2330
	},
	{
	"epoch": 101.73913043478261,
	"grad_norm": 1.7718055248260498,
	"learning_rate": 7.4872512207362605e-06,
	"loss": 0.2446,
	"step": 2340
	},
	{
	"epoch": 102.17391304347827,
	"grad_norm": 1.7671442031860352,
	"learning_rate": 7.487207445923432e-06,
	"loss": 0.2936,
	"step": 2350
	},
	{
	"epoch": 102.6086956521739,
	"grad_norm": 2.0610148906707764,
	"learning_rate": 7.487162187398039e-06,
	"loss": 0.2845,
	"step": 2360
	},
	{
	"epoch": 103.04347826086956,
	"grad_norm": 1.9395049810409546,
	"learning_rate": 7.487115445178019e-06,
	"loss": 0.2163,
	"step": 2370
	},
	{
	"epoch": 103.47826086956522,
	"grad_norm": 2.1225855350494385,
	"learning_rate": 7.487067219281901e-06,
	"loss": 0.2913,
	"step": 2380
	},
	{
	"epoch": 103.91304347826087,
	"grad_norm": 2.034578561782837,
	"learning_rate": 7.4870175097287985e-06,
	"loss": 0.2417,
	"step": 2390
	},
	{
	"epoch": 104.34782608695652,
	"grad_norm": 1.9769914150238037,
	"learning_rate": 7.486966316538416e-06,
	"loss": 0.2563,
	"step": 2400
	},
	{
	"epoch": 104.34782608695652,
	"eval_loss": 0.9094018936157227,
	"eval_runtime": 0.5284,
	"eval_samples_per_second": 18.926,
	"eval_steps_per_second": 18.926,
	"step": 2400
	},
	{
	"Start_State_loss": 0.7309322357177734,
	"Start_State_runtime": 0.491,
	"Start_State_samples_per_second": 20.366,
	"Start_State_steps_per_second": 20.366,
	"epoch": 104.34782608695652,
	"step": 2400
	},
	{
	"Raw_Model_loss": 0.9094018936157227,
	"Raw_Model_runtime": 0.5034,
	"Raw_Model_samples_per_second": 19.866,
	"Raw_Model_steps_per_second": 19.866,
	"epoch": 104.34782608695652,
	"step": 2400
	},
	{
	"SWA_loss": 0.7745841145515442,
	"SWA_runtime": 0.5911,
	"SWA_samples_per_second": 16.917,
	"SWA_steps_per_second": 16.917,
	"epoch": 104.34782608695652,
	"step": 2400
	},
	{
	"EMA_loss": 0.7307760119438171,
	"EMA_runtime": 0.4346,
	"EMA_samples_per_second": 23.011,
	"EMA_steps_per_second": 23.011,
	"epoch": 104.34782608695652,
	"step": 2400
	},
	{
	"epoch": 104.78260869565217,
	"grad_norm": 2.087158679962158,
	"learning_rate": 7.486913639731043e-06,
	"loss": 0.2497,
	"step": 2410
	},
	{
	"epoch": 105.21739130434783,
	"grad_norm": 1.996799349784851,
	"learning_rate": 7.48685947932756e-06,
	"loss": 0.2635,
	"step": 2420
	},
	{
	"epoch": 105.65217391304348,
	"grad_norm": 1.9105130434036255,
	"learning_rate": 7.4868038353494355e-06,
	"loss": 0.2602,
	"step": 2430
	},
	{
	"epoch": 106.08695652173913,
	"grad_norm": 2.1657402515411377,
	"learning_rate": 7.486746707818724e-06,
	"loss": 0.214,
	"step": 2440
	},
	{
	"epoch": 106.52173913043478,
	"grad_norm": 1.444199800491333,
	"learning_rate": 7.486688096758069e-06,
	"loss": 0.2819,
	"step": 2450
	},
	{
	"epoch": 106.95652173913044,
	"grad_norm": 1.8629169464111328,
	"learning_rate": 7.486628002190702e-06,
	"loss": 0.2444,
	"step": 2460
	},
	{
	"epoch": 107.3913043478261,
	"grad_norm": 2.290212631225586,
	"learning_rate": 7.486566424140442e-06,
	"loss": 0.304,
	"step": 2470
	},
	{
	"epoch": 107.82608695652173,
	"grad_norm": 2.3259527683258057,
	"learning_rate": 7.486503362631699e-06,
	"loss": 0.219,
	"step": 2480
	},
	{
	"epoch": 108.26086956521739,
	"grad_norm": 2.0435678958892822,
	"learning_rate": 7.486438817689465e-06,
	"loss": 0.2709,
	"step": 2490
	},
	{
	"epoch": 108.69565217391305,
	"grad_norm": 1.6399531364440918,
	"learning_rate": 7.486372789339326e-06,
	"loss": 0.2456,
	"step": 2500
	},
	{
	"epoch": 109.1304347826087,
	"grad_norm": 1.6286495923995972,
	"learning_rate": 7.486305277607452e-06,
	"loss": 0.2435,
	"step": 2510
	},
	{
	"epoch": 109.56521739130434,
	"grad_norm": 1.3312675952911377,
	"learning_rate": 7.486236282520606e-06,
	"loss": 0.2313,
	"step": 2520
	},
	{
	"epoch": 110.0,
	"grad_norm": 3.1992104053497314,
	"learning_rate": 7.48616580410613e-06,
	"loss": 0.2876,
	"step": 2530
	},
	{
	"epoch": 110.43478260869566,
	"grad_norm": 1.7260243892669678,
	"learning_rate": 7.486093842391963e-06,
	"loss": 0.2455,
	"step": 2540
	},
	{
	"epoch": 110.8695652173913,
	"grad_norm": 1.857021450996399,
	"learning_rate": 7.486020397406629e-06,
	"loss": 0.2697,
	"step": 2550
	},
	{
	"epoch": 110.8695652173913,
	"eval_loss": 0.9266101121902466,
	"eval_runtime": 0.4485,
	"eval_samples_per_second": 22.298,
	"eval_steps_per_second": 22.298,
	"step": 2550
	},
	{
	"Start_State_loss": 0.7309322357177734,
	"Start_State_runtime": 0.4407,
	"Start_State_samples_per_second": 22.69,
	"Start_State_steps_per_second": 22.69,
	"epoch": 110.8695652173913,
	"step": 2550
	},
	{
	"Raw_Model_loss": 0.9266101121902466,
	"Raw_Model_runtime": 0.4403,
	"Raw_Model_samples_per_second": 22.71,
	"Raw_Model_steps_per_second": 22.71,
	"epoch": 110.8695652173913,
	"step": 2550
	},
	{
	"SWA_loss": 0.7769848108291626,
	"SWA_runtime": 0.4455,
	"SWA_samples_per_second": 22.448,
	"SWA_steps_per_second": 22.448,
	"epoch": 110.8695652173913,
	"step": 2550
	},
	{
	"EMA_loss": 0.7314194440841675,
	"EMA_runtime": 0.4451,
	"EMA_samples_per_second": 22.468,
	"EMA_steps_per_second": 22.468,
	"epoch": 110.8695652173913,
	"step": 2550
	},
	{
	"epoch": 111.30434782608695,
	"grad_norm": 2.4148638248443604,
	"learning_rate": 7.485945469179237e-06,
	"loss": 0.282,
	"step": 2560
	},
	{
	"epoch": 111.73913043478261,
	"grad_norm": 2.007262945175171,
	"learning_rate": 7.485869057739486e-06,
	"loss": 0.228,
	"step": 2570
	},
	{
	"epoch": 112.17391304347827,
	"grad_norm": 2.0865132808685303,
	"learning_rate": 7.485791163117665e-06,
	"loss": 0.2463,
	"step": 2580
	},
	{
	"epoch": 112.6086956521739,
	"grad_norm": 1.6724177598953247,
	"learning_rate": 7.485711785344648e-06,
	"loss": 0.2463,
	"step": 2590
	},
	{
	"epoch": 113.04347826086956,
	"grad_norm": 2.1320908069610596,
	"learning_rate": 7.485630924451897e-06,
	"loss": 0.2661,
	"step": 2600
	},
	{
	"epoch": 113.47826086956522,
	"grad_norm": 1.8488856554031372,
	"learning_rate": 7.485548580471464e-06,
	"loss": 0.2261,
	"step": 2610
	},
	{
	"epoch": 113.91304347826087,
	"grad_norm": 2.1878151893615723,
	"learning_rate": 7.485464753435987e-06,
	"loss": 0.2756,
	"step": 2620
	},
	{
	"epoch": 114.34782608695652,
	"grad_norm": 1.984470009803772,
	"learning_rate": 7.485379443378693e-06,
	"loss": 0.2451,
	"step": 2630
	},
	{
	"epoch": 114.78260869565217,
	"grad_norm": 2.4623303413391113,
	"learning_rate": 7.485292650333394e-06,
	"loss": 0.2287,
	"step": 2640
	},
	{
	"epoch": 115.21739130434783,
	"grad_norm": 1.7331453561782837,
	"learning_rate": 7.485204374334494e-06,
	"loss": 0.2553,
	"step": 2650
	},
	{
	"epoch": 115.65217391304348,
	"grad_norm": 1.9090930223464966,
	"learning_rate": 7.485114615416982e-06,
	"loss": 0.2721,
	"step": 2660
	},
	{
	"epoch": 116.08695652173913,
	"grad_norm": 2.4040467739105225,
	"learning_rate": 7.485023373616437e-06,
	"loss": 0.2153,
	"step": 2670
	},
	{
	"epoch": 116.52173913043478,
	"grad_norm": 2.5749056339263916,
	"learning_rate": 7.484930648969023e-06,
	"loss": 0.245,
	"step": 2680
	},
	{
	"epoch": 116.95652173913044,
	"grad_norm": 1.6020243167877197,
	"learning_rate": 7.484836441511492e-06,
	"loss": 0.2443,
	"step": 2690
	},
	{
	"epoch": 117.3913043478261,
	"grad_norm": 1.6441881656646729,
	"learning_rate": 7.484740751281187e-06,
	"loss": 0.2361,
	"step": 2700
	},
	{
	"epoch": 117.3913043478261,
	"eval_loss": 0.9320739507675171,
	"eval_runtime": 0.408,
	"eval_samples_per_second": 24.509,
	"eval_steps_per_second": 24.509,
	"step": 2700
	},
	{
	"Start_State_loss": 0.7309322357177734,
	"Start_State_runtime": 0.418,
	"Start_State_samples_per_second": 23.925,
	"Start_State_steps_per_second": 23.925,
	"epoch": 117.3913043478261,
	"step": 2700
	},
	{
	"Raw_Model_loss": 0.9320739507675171,
	"Raw_Model_runtime": 0.402,
	"Raw_Model_samples_per_second": 24.874,
	"Raw_Model_steps_per_second": 24.874,
	"epoch": 117.3913043478261,
	"step": 2700
	},
	{
	"SWA_loss": 0.7840545177459717,
	"SWA_runtime": 0.401,
	"SWA_samples_per_second": 24.935,
	"SWA_steps_per_second": 24.935,
	"epoch": 117.3913043478261,
	"step": 2700
	},
	{
	"EMA_loss": 0.730692982673645,
	"EMA_runtime": 0.3918,
	"EMA_samples_per_second": 25.525,
	"EMA_steps_per_second": 25.525,
	"epoch": 117.3913043478261,
	"step": 2700
	},
	{
	"epoch": 117.82608695652173,
	"grad_norm": 3.2310845851898193,
	"learning_rate": 3.7931296624941244e-07,
	"loss": 0.2406,
	"step": 2710
	},
	{
	"epoch": 118.26086956521739,
	"grad_norm": 1.8407368659973145,
	"learning_rate": 7.586259324988249e-07,
	"loss": 0.2518,
	"step": 2720
	},
	{
	"epoch": 118.69565217391305,
	"grad_norm": 2.2956159114837646,
	"learning_rate": 1.1379388987482372e-06,
	"loss": 0.2412,
	"step": 2730
	},
	{
	"epoch": 119.1304347826087,
	"grad_norm": 2.297415256500244,
	"learning_rate": 1.5172518649976497e-06,
	"loss": 0.2602,
	"step": 2740
	},
	{
	"epoch": 119.56521739130434,
	"grad_norm": 2.2018797397613525,
	"learning_rate": 1.8965648312470621e-06,
	"loss": 0.2596,
	"step": 2750
	},
	{
	"epoch": 120.0,
	"grad_norm": 3.6682052612304688,
	"learning_rate": 2.2758777974964743e-06,
	"loss": 0.219,
	"step": 2760
	},
	{
	"epoch": 120.43478260869566,
	"grad_norm": 1.9333362579345703,
	"learning_rate": 2.6551907637458867e-06,
	"loss": 0.2545,
	"step": 2770
	},
	{
	"epoch": 120.8695652173913,
	"grad_norm": 1.7708905935287476,
	"learning_rate": 3.0345037299952995e-06,
	"loss": 0.2189,
	"step": 2780
	},
	{
	"epoch": 121.30434782608695,
	"grad_norm": 1.4095892906188965,
	"learning_rate": 3.413816696244712e-06,
	"loss": 0.2728,
	"step": 2790
	},
	{
	"epoch": 121.73913043478261,
	"grad_norm": 1.991544246673584,
	"learning_rate": 3.7931296624941243e-06,
	"loss": 0.2699,
	"step": 2800
	},
	{
	"epoch": 122.17391304347827,
	"grad_norm": 2.028014898300171,
	"learning_rate": 3.793129286625273e-06,
	"loss": 0.2196,
	"step": 2810
	},
	{
	"epoch": 122.6086956521739,
	"grad_norm": 1.7729160785675049,
	"learning_rate": 3.7931281590188667e-06,
	"loss": 0.2634,
	"step": 2820
	},
	{
	"epoch": 123.04347826086956,
	"grad_norm": 1.902854323387146,
	"learning_rate": 3.7931262796753532e-06,
	"loss": 0.251,
	"step": 2830
	},
	{
	"epoch": 123.47826086956522,
	"grad_norm": 2.2296345233917236,
	"learning_rate": 3.7931236485954773e-06,
	"loss": 0.2424,
	"step": 2840
	},
	{
	"epoch": 123.91304347826087,
	"grad_norm": 2.3609299659729004,
	"learning_rate": 3.793120265780282e-06,
	"loss": 0.2388,
	"step": 2850
	},
	{
	"epoch": 123.91304347826087,
	"eval_loss": 0.9395554661750793,
	"eval_runtime": 0.3971,
	"eval_samples_per_second": 25.184,
	"eval_steps_per_second": 25.184,
	"step": 2850
	},
	{
	"Start_State_loss": 0.7309322357177734,
	"Start_State_runtime": 0.4171,
	"Start_State_samples_per_second": 23.977,
	"Start_State_steps_per_second": 23.977,
	"epoch": 123.91304347826087,
	"step": 2850
	},
	{
	"Raw_Model_loss": 0.9395554661750793,
	"Raw_Model_runtime": 0.3896,
	"Raw_Model_samples_per_second": 25.669,
	"Raw_Model_steps_per_second": 25.669,
	"epoch": 123.91304347826087,
	"step": 2850
	},
	{
	"SWA_loss": 0.7867599725723267,
	"SWA_runtime": 0.4887,
	"SWA_samples_per_second": 20.462,
	"SWA_steps_per_second": 20.462,
	"epoch": 123.91304347826087,
	"step": 2850
	},
	{
	"EMA_loss": 0.7314862608909607,
	"EMA_runtime": 0.4923,
	"EMA_samples_per_second": 20.313,
	"EMA_steps_per_second": 20.313,
	"epoch": 123.91304347826087,
	"step": 2850
	},
	{
	"epoch": 124.34782608695652,
	"grad_norm": 1.6401152610778809,
	"learning_rate": 3.793116131231107e-06,
	"loss": 0.2257,
	"step": 2860
	},
	{
	"epoch": 124.78260869565217,
	"grad_norm": 1.6049269437789917,
	"learning_rate": 3.793111244949593e-06,
	"loss": 0.2303,
	"step": 2870
	},
	{
	"epoch": 125.21739130434783,
	"grad_norm": 2.0744292736053467,
	"learning_rate": 3.793105606937675e-06,
	"loss": 0.2692,
	"step": 2880
	},
	{
	"epoch": 125.65217391304348,
	"grad_norm": 2.102421998977661,
	"learning_rate": 3.7930992171975892e-06,
	"loss": 0.2458,
	"step": 2890
	},
	{
	"epoch": 126.08695652173913,
	"grad_norm": 2.300477981567383,
	"learning_rate": 3.793092075731867e-06,
	"loss": 0.2518,
	"step": 2900
	},
	{
	"epoch": 126.52173913043478,
	"grad_norm": 1.6764642000198364,
	"learning_rate": 3.79308418254334e-06,
	"loss": 0.2022,
	"step": 2910
	},
	{
	"epoch": 126.95652173913044,
	"grad_norm": 1.5686938762664795,
	"learning_rate": 3.7930755376351365e-06,
	"loss": 0.2903,
	"step": 2920
	},
	{
	"epoch": 127.3913043478261,
	"grad_norm": 2.0804359912872314,
	"learning_rate": 3.7930661410106833e-06,
	"loss": 0.2556,
	"step": 2930
	},
	{
	"epoch": 127.82608695652173,
	"grad_norm": 2.6569416522979736,
	"learning_rate": 3.793055992673704e-06,
	"loss": 0.2196,
	"step": 2940
	},
	{
	"epoch": 128.2608695652174,
	"grad_norm": 2.325507164001465,
	"learning_rate": 3.7930450926282215e-06,
	"loss": 0.2961,
	"step": 2950
	},
	{
	"epoch": 128.69565217391303,
	"grad_norm": 1.6577781438827515,
	"learning_rate": 3.793033440878557e-06,
	"loss": 0.2414,
	"step": 2960
	},
	{
	"epoch": 129.1304347826087,
	"grad_norm": 1.6468480825424194,
	"learning_rate": 3.7930210374293287e-06,
	"loss": 0.2031,
	"step": 2970
	},
	{
	"epoch": 129.56521739130434,
	"grad_norm": 1.8844521045684814,
	"learning_rate": 3.793007882285452e-06,
	"loss": 0.2411,
	"step": 2980
	},
	{
	"epoch": 130.0,
	"grad_norm": 5.029874801635742,
	"learning_rate": 3.7929939754521417e-06,
	"loss": 0.2465,
	"step": 2990
	},
	{
	"epoch": 130.43478260869566,
	"grad_norm": 2.3793535232543945,
	"learning_rate": 3.79297931693491e-06,
	"loss": 0.2374,
	"step": 3000
	},
	{
	"epoch": 130.43478260869566,
	"eval_loss": 0.9428585171699524,
	"eval_runtime": 0.4348,
	"eval_samples_per_second": 22.999,
	"eval_steps_per_second": 22.999,
	"step": 3000
	},
	{
	"Start_State_loss": 0.7309322357177734,
	"Start_State_runtime": 0.4391,
	"Start_State_samples_per_second": 22.776,
	"Start_State_steps_per_second": 22.776,
	"epoch": 130.43478260869566,
	"step": 3000
	},
	{
	"Raw_Model_loss": 0.9428585171699524,
	"Raw_Model_runtime": 0.4353,
	"Raw_Model_samples_per_second": 22.971,
	"Raw_Model_steps_per_second": 22.971,
	"epoch": 130.43478260869566,
	"step": 3000
	},
	{
	"SWA_loss": 0.7945135831832886,
	"SWA_runtime": 0.4368,
	"SWA_samples_per_second": 22.892,
	"SWA_steps_per_second": 22.892,
	"epoch": 130.43478260869566,
	"step": 3000
	},
	{
	"EMA_loss": 0.7300070524215698,
	"EMA_runtime": 0.4353,
	"EMA_samples_per_second": 22.971,
	"EMA_steps_per_second": 22.971,
	"epoch": 130.43478260869566,
	"step": 3000
	},
	{
	"epoch": 130.8695652173913,
	"grad_norm": 2.055912733078003,
	"learning_rate": 3.7929639067395674e-06,
	"loss": 0.2305,
	"step": 3010
	},
	{
	"epoch": 131.30434782608697,
	"grad_norm": 1.8568785190582275,
	"learning_rate": 3.7929477448722217e-06,
	"loss": 0.2706,
	"step": 3020
	},
	{
	"epoch": 131.7391304347826,
	"grad_norm": 1.9422987699508667,
	"learning_rate": 3.792930831339279e-06,
	"loss": 0.2616,
	"step": 3030
	},
	{
	"epoch": 132.17391304347825,
	"grad_norm": 1.81191885471344,
	"learning_rate": 3.7929131661474433e-06,
	"loss": 0.2272,
	"step": 3040
	},
	{
	"epoch": 132.6086956521739,
	"grad_norm": 2.1437313556671143,
	"learning_rate": 3.7928947493037164e-06,
	"loss": 0.253,
	"step": 3050
	},
	{
	"epoch": 133.04347826086956,
	"grad_norm": 2.685347318649292,
	"learning_rate": 3.792875580815398e-06,
	"loss": 0.2152,
	"step": 3060
	},
	{
	"epoch": 133.47826086956522,
	"grad_norm": 1.2992076873779297,
	"learning_rate": 3.7928556606900864e-06,
	"loss": 0.2486,
	"step": 3070
	},
	{
	"epoch": 133.91304347826087,
	"grad_norm": 2.3356173038482666,
	"learning_rate": 3.7928349889356773e-06,
	"loss": 0.2736,
	"step": 3080
	},
	{
	"epoch": 134.34782608695653,
	"grad_norm": 1.9858746528625488,
	"learning_rate": 3.7928135655603634e-06,
	"loss": 0.254,
	"step": 3090
	},
	{
	"epoch": 134.7826086956522,
	"grad_norm": 1.929052710533142,
	"learning_rate": 3.792791390572637e-06,
	"loss": 0.2063,
	"step": 3100
	},
	{
	"epoch": 135.2173913043478,
	"grad_norm": 2.71032977104187,
	"learning_rate": 3.7927684639812876e-06,
	"loss": 0.2441,
	"step": 3110
	},
	{
	"epoch": 135.65217391304347,
	"grad_norm": 1.8756812810897827,
	"learning_rate": 3.7927447857954023e-06,
	"loss": 0.2854,
	"step": 3120
	},
	{
	"epoch": 136.08695652173913,
	"grad_norm": 2.36094069480896,
	"learning_rate": 3.792720356024367e-06,
	"loss": 0.2128,
	"step": 3130
	},
	{
	"epoch": 136.52173913043478,
	"grad_norm": 2.351156711578369,
	"learning_rate": 3.7926951746778637e-06,
	"loss": 0.2385,
	"step": 3140
	},
	{
	"epoch": 136.95652173913044,
	"grad_norm": 2.7988734245300293,
	"learning_rate": 3.7926692417658747e-06,
	"loss": 0.2336,
	"step": 3150
	},
	{
	"epoch": 136.95652173913044,
	"eval_loss": 0.9436905980110168,
	"eval_runtime": 0.4896,
	"eval_samples_per_second": 20.427,
	"eval_steps_per_second": 20.427,
	"step": 3150
	},
	{
	"Start_State_loss": 0.7309322357177734,
	"Start_State_runtime": 0.578,
	"Start_State_samples_per_second": 17.301,
	"Start_State_steps_per_second": 17.301,
	"epoch": 136.95652173913044,
	"step": 3150
	},
	{
	"Raw_Model_loss": 0.9436905980110168,
	"Raw_Model_runtime": 0.5409,
	"Raw_Model_samples_per_second": 18.487,
	"Raw_Model_steps_per_second": 18.487,
	"epoch": 136.95652173913044,
	"step": 3150
	},
	{
	"SWA_loss": 0.7969022393226624,
	"SWA_runtime": 0.616,
	"SWA_samples_per_second": 16.235,
	"SWA_steps_per_second": 16.235,
	"epoch": 136.95652173913044,
	"step": 3150
	},
	{
	"EMA_loss": 0.7306665182113647,
	"EMA_runtime": 0.5117,
	"EMA_samples_per_second": 19.544,
	"EMA_steps_per_second": 19.544,
	"epoch": 136.95652173913044,
	"step": 3150
	},
	{
	"epoch": 137.3913043478261,
	"grad_norm": 1.775488257408142,
	"learning_rate": 3.792642557298678e-06,
	"loss": 0.2772,
	"step": 3160
	},
	{
	"epoch": 137.82608695652175,
	"grad_norm": 1.9294140338897705,
	"learning_rate": 3.7926151212868503e-06,
	"loss": 0.2351,
	"step": 3170
	},
	{
	"epoch": 138.2608695652174,
	"grad_norm": 1.642681360244751,
	"learning_rate": 3.792586933741268e-06,
	"loss": 0.2272,
	"step": 3180
	},
	{
	"epoch": 138.69565217391303,
	"grad_norm": 2.080634593963623,
	"learning_rate": 3.792557994673102e-06,
	"loss": 0.2754,
	"step": 3190
	},
	{
	"epoch": 139.1304347826087,
	"grad_norm": 1.3820661306381226,
	"learning_rate": 3.792528304093824e-06,
	"loss": 0.2258,
	"step": 3200
	},
	{
	"epoch": 139.56521739130434,
	"grad_norm": 2.019350051879883,
	"learning_rate": 3.7924978620152023e-06,
	"loss": 0.2705,
	"step": 3210
	},
	{
	"epoch": 140.0,
	"grad_norm": 2.975282907485962,
	"learning_rate": 3.7924666684493018e-06,
	"loss": 0.2302,
	"step": 3220
	},
	{
	"epoch": 140.43478260869566,
	"grad_norm": 2.264106273651123,
	"learning_rate": 3.792434723408488e-06,
	"loss": 0.2315,
	"step": 3230
	},
	{
	"epoch": 140.8695652173913,
	"grad_norm": 1.7037856578826904,
	"learning_rate": 3.7924020269054226e-06,
	"loss": 0.2381,
	"step": 3240
	},
	{
	"epoch": 141.30434782608697,
	"grad_norm": 1.9553606510162354,
	"learning_rate": 3.7923685789530654e-06,
	"loss": 0.2367,
	"step": 3250
	},
	{
	"epoch": 141.7391304347826,
	"grad_norm": 1.9915337562561035,
	"learning_rate": 3.7923343795646736e-06,
	"loss": 0.2491,
	"step": 3260
	},
	{
	"epoch": 142.17391304347825,
	"grad_norm": 1.7067251205444336,
	"learning_rate": 3.7922994287538036e-06,
	"loss": 0.2579,
	"step": 3270
	},
	{
	"epoch": 142.6086956521739,
	"grad_norm": 2.5622429847717285,
	"learning_rate": 3.792263726534308e-06,
	"loss": 0.2607,
	"step": 3280
	},
	{
	"epoch": 143.04347826086956,
	"grad_norm": 1.2580666542053223,
	"learning_rate": 3.7922272729203387e-06,
	"loss": 0.2155,
	"step": 3290
	},
	{
	"epoch": 143.47826086956522,
	"grad_norm": 1.8073185682296753,
	"learning_rate": 3.792190067926345e-06,
	"loss": 0.2478,
	"step": 3300
	},
	{
	"epoch": 143.47826086956522,
	"eval_loss": 0.9493485689163208,
	"eval_runtime": 0.41,
	"eval_samples_per_second": 24.39,
	"eval_steps_per_second": 24.39,
	"step": 3300
	},
	{
	"Start_State_loss": 0.7309322357177734,
	"Start_State_runtime": 0.3933,
	"Start_State_samples_per_second": 25.423,
	"Start_State_steps_per_second": 25.423,
	"epoch": 143.47826086956522,
	"step": 3300
	},
	{
	"Raw_Model_loss": 0.9493485689163208,
	"Raw_Model_runtime": 0.3998,
	"Raw_Model_samples_per_second": 25.013,
	"Raw_Model_steps_per_second": 25.013,
	"epoch": 143.47826086956522,
	"step": 3300
	},
	{
	"SWA_loss": 0.8028978109359741,
	"SWA_runtime": 0.4094,
	"SWA_samples_per_second": 24.424,
	"SWA_steps_per_second": 24.424,
	"epoch": 143.47826086956522,
	"step": 3300
	},
	{
	"EMA_loss": 0.7308684587478638,
	"EMA_runtime": 0.3971,
	"EMA_samples_per_second": 25.181,
	"EMA_steps_per_second": 25.181,
	"epoch": 143.47826086956522,
	"step": 3300
	},
	{
	"epoch": 143.91304347826087,
	"grad_norm": 1.8851526975631714,
	"learning_rate": 3.7921521115670724e-06,
	"loss": 0.2538,
	"step": 3310
	},
	{
	"epoch": 144.34782608695653,
	"grad_norm": 1.569898247718811,
	"learning_rate": 3.7921134038575663e-06,
	"loss": 0.2145,
	"step": 3320
	},
	{
	"epoch": 144.7826086956522,
	"grad_norm": 1.718190312385559,
	"learning_rate": 3.79207394481317e-06,
	"loss": 0.2708,
	"step": 3330
	},
	{
	"epoch": 145.2173913043478,
	"grad_norm": 2.9095687866210938,
	"learning_rate": 3.7920337344495226e-06,
	"loss": 0.2084,
	"step": 3340
	},
	{
	"epoch": 145.65217391304347,
	"grad_norm": 1.8533018827438354,
	"learning_rate": 3.791992772782563e-06,
	"loss": 0.2381,
	"step": 3350
	},
	{
	"epoch": 146.08695652173913,
	"grad_norm": 1.9780678749084473,
	"learning_rate": 3.791951059828527e-06,
	"loss": 0.2651,
	"step": 3360
	},
	{
	"epoch": 146.52173913043478,
	"grad_norm": 1.834191083908081,
	"learning_rate": 3.791908595603947e-06,
	"loss": 0.2269,
	"step": 3370
	},
	{
	"epoch": 146.95652173913044,
	"grad_norm": 1.6292699575424194,
	"learning_rate": 3.7918653801256568e-06,
	"loss": 0.2159,
	"step": 3380
	},
	{
	"epoch": 147.3913043478261,
	"grad_norm": 1.5715214014053345,
	"learning_rate": 3.791821413410784e-06,
	"loss": 0.2288,
	"step": 3390
	},
	{
	"epoch": 147.82608695652175,
	"grad_norm": 1.5430243015289307,
	"learning_rate": 3.791776695476756e-06,
	"loss": 0.2538,
	"step": 3400
	},
	{
	"epoch": 148.2608695652174,
	"grad_norm": 1.466277837753296,
	"learning_rate": 3.791731226341297e-06,
	"loss": 0.2156,
	"step": 3410
	},
	{
	"epoch": 148.69565217391303,
	"grad_norm": 1.8279281854629517,
	"learning_rate": 3.7916850060224308e-06,
	"loss": 0.2498,
	"step": 3420
	},
	{
	"epoch": 149.1304347826087,
	"grad_norm": 1.7966867685317993,
	"learning_rate": 3.791638034538477e-06,
	"loss": 0.2716,
	"step": 3430
	},
	{
	"epoch": 149.56521739130434,
	"grad_norm": 2.2440056800842285,
	"learning_rate": 3.7915903119080527e-06,
	"loss": 0.265,
	"step": 3440
	},
	{
	"epoch": 150.0,
	"grad_norm": 3.2762231826782227,
	"learning_rate": 3.7915418381500747e-06,
	"loss": 0.2208,
	"step": 3450
	},
	{
	"epoch": 150.0,
	"eval_loss": 0.9505823850631714,
	"eval_runtime": 0.4422,
	"eval_samples_per_second": 22.615,
	"eval_steps_per_second": 22.615,
	"step": 3450
	},
	{
	"Start_State_loss": 0.7309322357177734,
	"Start_State_runtime": 0.4072,
	"Start_State_samples_per_second": 24.558,
	"Start_State_steps_per_second": 24.558,
	"epoch": 150.0,
	"step": 3450
	},
	{
	"Raw_Model_loss": 0.9505823850631714,
	"Raw_Model_runtime": 0.4153,
	"Raw_Model_samples_per_second": 24.076,
	"Raw_Model_steps_per_second": 24.076,
	"epoch": 150.0,
	"step": 3450
	},
	{
	"SWA_loss": 0.8045159578323364,
	"SWA_runtime": 0.402,
	"SWA_samples_per_second": 24.876,
	"SWA_steps_per_second": 24.876,
	"epoch": 150.0,
	"step": 3450
	},
	{
	"EMA_loss": 0.7316843271255493,
	"EMA_runtime": 0.4,
	"EMA_samples_per_second": 25.002,
	"EMA_steps_per_second": 25.002,
	"epoch": 150.0,
	"step": 3450
	},
	{
	"epoch": 150.43478260869566,
	"grad_norm": 1.7541120052337646,
	"learning_rate": 2.4672082280509036e-07,
	"loss": 0.214,
	"step": 3460
	},
	{
	"epoch": 150.8695652173913,
	"grad_norm": 2.0008656978607178,
	"learning_rate": 4.934416456101807e-07,
	"loss": 0.2627,
	"step": 3470
	},
	{
	"epoch": 151.30434782608697,
	"grad_norm": 1.6539170742034912,
	"learning_rate": 7.40162468415271e-07,
	"loss": 0.2,
	"step": 3480
	},
	{
	"epoch": 151.7391304347826,
	"grad_norm": 2.369926691055298,
	"learning_rate": 9.868832912203614e-07,
	"loss": 0.2478,
	"step": 3490
	},
	{
	"epoch": 152.17391304347825,
	"grad_norm": 2.07112979888916,
	"learning_rate": 1.2336041140254517e-06,
	"loss": 0.2427,
	"step": 3500
	},
	{
	"epoch": 152.6086956521739,
	"grad_norm": 1.6030749082565308,
	"learning_rate": 1.480324936830542e-06,
	"loss": 0.2402,
	"step": 3510
	},
	{
	"epoch": 153.04347826086956,
	"grad_norm": 1.5949645042419434,
	"learning_rate": 1.7270457596356322e-06,
	"loss": 0.2072,
	"step": 3520
	},
	{
	"epoch": 153.47826086956522,
	"grad_norm": 2.338641881942749,
	"learning_rate": 1.973766582440723e-06,
	"loss": 0.2506,
	"step": 3530
	},
	{
	"epoch": 153.91304347826087,
	"grad_norm": 2.719093084335327,
	"learning_rate": 2.220487405245813e-06,
	"loss": 0.2321,
	"step": 3540
	},
	{
	"epoch": 154.34782608695653,
	"grad_norm": 2.292358636856079,
	"learning_rate": 2.4672082280509034e-06,
	"loss": 0.2404,
	"step": 3550
	},
	{
	"epoch": 154.7826086956522,
	"grad_norm": 2.0019381046295166,
	"learning_rate": 2.4672079835702752e-06,
	"loss": 0.2343,
	"step": 3560
	},
	{
	"epoch": 155.2173913043478,
	"grad_norm": 1.6779125928878784,
	"learning_rate": 2.4672072501284865e-06,
	"loss": 0.1963,
	"step": 3570
	},
	{
	"epoch": 155.65217391304347,
	"grad_norm": 2.0632243156433105,
	"learning_rate": 2.467206027725829e-06,
	"loss": 0.267,
	"step": 3580
	},
	{
	"epoch": 156.08695652173913,
	"grad_norm": 1.6089539527893066,
	"learning_rate": 2.467204316362787e-06,
	"loss": 0.2034,
	"step": 3590
	},
	{
	"epoch": 156.52173913043478,
	"grad_norm": 2.475633382797241,
	"learning_rate": 2.4672021160400387e-06,
	"loss": 0.2685,
	"step": 3600
	},
	{
	"epoch": 156.52173913043478,
	"eval_loss": 0.9592596292495728,
	"eval_runtime": 0.4813,
	"eval_samples_per_second": 20.778,
	"eval_steps_per_second": 20.778,
	"step": 3600
	},
	{
	"Start_State_loss": 0.7309322357177734,
	"Start_State_runtime": 0.4223,
	"Start_State_samples_per_second": 23.679,
	"Start_State_steps_per_second": 23.679,
	"epoch": 156.52173913043478,
	"step": 3600
	},
	{
	"Raw_Model_loss": 0.9592596292495728,
	"Raw_Model_runtime": 0.3944,
	"Raw_Model_samples_per_second": 25.356,
	"Raw_Model_steps_per_second": 25.356,
	"epoch": 156.52173913043478,
	"step": 3600
	},
	{
	"SWA_loss": 0.8119293451309204,
	"SWA_runtime": 0.3904,
	"SWA_samples_per_second": 25.615,
	"SWA_steps_per_second": 25.615,
	"epoch": 156.52173913043478,
	"step": 3600
	},
	{
	"EMA_loss": 0.7311049103736877,
	"EMA_runtime": 0.4017,
	"EMA_samples_per_second": 24.896,
	"EMA_steps_per_second": 24.896,
	"epoch": 156.52173913043478,
	"step": 3600
	}
	],
	"logging_steps": 10,
	"max_steps": 50000,
	"num_input_tokens_seen": 0,
	"num_train_epochs": 2174,
	"save_steps": 150,
	"stateful_callbacks": {
	"TrainerControl": {
	"args": {
	"should_epoch_stop": false,
	"should_evaluate": false,
	"should_log": false,
	"should_save": true,
	"should_training_stop": false
	},
	"attributes": {}
	}
	},
	"total_flos": 9.28760054861906e+16,
	"train_batch_size": 4,
	"trial_name": null,
	"trial_params": null
	}