testypoos2 / last-checkpoint /trainer_state.json
irishprancer's picture
Training in progress, step 3600, checkpoint
7c19f8e verified
raw
history blame
88.4 kB
{
"best_metric": 0.717534065246582,
"best_model_checkpoint": "./output/checkpoint-450",
"epoch": 156.52173913043478,
"eval_steps": 150,
"global_step": 3600,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.43478260869565216,
"grad_norm": 1.5021440982818604,
"learning_rate": 3e-06,
"loss": 0.9061,
"step": 10
},
{
"epoch": 0.8695652173913043,
"grad_norm": 1.6870536804199219,
"learning_rate": 6e-06,
"loss": 0.9023,
"step": 20
},
{
"epoch": 1.3043478260869565,
"grad_norm": 1.729659080505371,
"learning_rate": 9e-06,
"loss": 0.9004,
"step": 30
},
{
"epoch": 1.7391304347826086,
"grad_norm": 1.453600525856018,
"learning_rate": 1.2e-05,
"loss": 0.9091,
"step": 40
},
{
"epoch": 2.1739130434782608,
"grad_norm": 1.3518075942993164,
"learning_rate": 1.5e-05,
"loss": 0.8362,
"step": 50
},
{
"epoch": 2.608695652173913,
"grad_norm": 2.031172513961792,
"learning_rate": 1.8e-05,
"loss": 0.8893,
"step": 60
},
{
"epoch": 3.0434782608695654,
"grad_norm": 1.484531283378601,
"learning_rate": 2.1e-05,
"loss": 0.8915,
"step": 70
},
{
"epoch": 3.4782608695652173,
"grad_norm": 1.7294986248016357,
"learning_rate": 2.4e-05,
"loss": 0.8233,
"step": 80
},
{
"epoch": 3.9130434782608696,
"grad_norm": 1.4242360591888428,
"learning_rate": 2.7000000000000002e-05,
"loss": 0.8527,
"step": 90
},
{
"epoch": 4.3478260869565215,
"grad_norm": 1.3656773567199707,
"learning_rate": 3e-05,
"loss": 0.8648,
"step": 100
},
{
"epoch": 4.782608695652174,
"grad_norm": 2.19753098487854,
"learning_rate": 2.999999702723963e-05,
"loss": 0.8225,
"step": 110
},
{
"epoch": 5.217391304347826,
"grad_norm": 1.0726382732391357,
"learning_rate": 2.9999988108959687e-05,
"loss": 0.7654,
"step": 120
},
{
"epoch": 5.6521739130434785,
"grad_norm": 1.5603922605514526,
"learning_rate": 2.9999973245163716e-05,
"loss": 0.7417,
"step": 130
},
{
"epoch": 6.086956521739131,
"grad_norm": 1.9068461656570435,
"learning_rate": 2.99999524358576e-05,
"loss": 0.7654,
"step": 140
},
{
"epoch": 6.521739130434782,
"grad_norm": 1.1220637559890747,
"learning_rate": 2.9999925681049593e-05,
"loss": 0.7857,
"step": 150
},
{
"epoch": 6.521739130434782,
"eval_loss": 0.7963114976882935,
"eval_runtime": 0.4908,
"eval_samples_per_second": 20.374,
"eval_steps_per_second": 20.374,
"step": 150
},
{
"epoch": 6.956521739130435,
"grad_norm": 1.5331261157989502,
"learning_rate": 2.9999892980750297e-05,
"loss": 0.6585,
"step": 160
},
{
"epoch": 7.391304347826087,
"grad_norm": 1.3447493314743042,
"learning_rate": 2.9999854334972675e-05,
"loss": 0.7388,
"step": 170
},
{
"epoch": 7.826086956521739,
"grad_norm": 1.7259607315063477,
"learning_rate": 2.999980974373204e-05,
"loss": 0.7293,
"step": 180
},
{
"epoch": 8.26086956521739,
"grad_norm": 1.5403547286987305,
"learning_rate": 2.9999759207046075e-05,
"loss": 0.6247,
"step": 190
},
{
"epoch": 8.695652173913043,
"grad_norm": 1.7431354522705078,
"learning_rate": 2.9999702724934804e-05,
"loss": 0.6765,
"step": 200
},
{
"epoch": 9.130434782608695,
"grad_norm": 1.0416122674942017,
"learning_rate": 2.999964029742062e-05,
"loss": 0.6523,
"step": 210
},
{
"epoch": 9.565217391304348,
"grad_norm": 1.2200145721435547,
"learning_rate": 2.9999571924528263e-05,
"loss": 0.5592,
"step": 220
},
{
"epoch": 10.0,
"grad_norm": 1.526785969734192,
"learning_rate": 2.9999497606284837e-05,
"loss": 0.756,
"step": 230
},
{
"epoch": 10.434782608695652,
"grad_norm": 1.4215515851974487,
"learning_rate": 2.9999417342719796e-05,
"loss": 0.7117,
"step": 240
},
{
"epoch": 10.869565217391305,
"grad_norm": 0.9789811372756958,
"learning_rate": 2.9999331133864956e-05,
"loss": 0.5896,
"step": 250
},
{
"epoch": 11.304347826086957,
"grad_norm": 1.1944794654846191,
"learning_rate": 2.9999238979754485e-05,
"loss": 0.6547,
"step": 260
},
{
"epoch": 11.73913043478261,
"grad_norm": 1.050191044807434,
"learning_rate": 2.999914088042492e-05,
"loss": 0.6475,
"step": 270
},
{
"epoch": 12.173913043478262,
"grad_norm": 1.3121248483657837,
"learning_rate": 2.9999036835915132e-05,
"loss": 0.594,
"step": 280
},
{
"epoch": 12.608695652173914,
"grad_norm": 1.082655906677246,
"learning_rate": 2.9998926846266365e-05,
"loss": 0.6326,
"step": 290
},
{
"epoch": 13.043478260869565,
"grad_norm": 1.3888633251190186,
"learning_rate": 2.9998810911522213e-05,
"loss": 0.5806,
"step": 300
},
{
"epoch": 13.043478260869565,
"eval_loss": 0.7309322357177734,
"eval_runtime": 0.5145,
"eval_samples_per_second": 19.436,
"eval_steps_per_second": 19.436,
"step": 300
},
{
"Start_State_loss": 0.7309322357177734,
"Start_State_runtime": 0.554,
"Start_State_samples_per_second": 18.05,
"Start_State_steps_per_second": 18.05,
"epoch": 13.043478260869565,
"step": 300
},
{
"SWA_loss": 0.7309322357177734,
"SWA_runtime": 0.5629,
"SWA_samples_per_second": 17.765,
"SWA_steps_per_second": 17.765,
"epoch": 13.043478260869565,
"step": 300
},
{
"EMA_loss": 0.7309322357177734,
"EMA_runtime": 0.5474,
"EMA_samples_per_second": 18.268,
"EMA_steps_per_second": 18.268,
"epoch": 13.043478260869565,
"step": 300
},
{
"epoch": 13.478260869565217,
"grad_norm": 1.7805578708648682,
"learning_rate": 2.9998689031728636e-05,
"loss": 0.5145,
"step": 310
},
{
"epoch": 13.91304347826087,
"grad_norm": 1.533318042755127,
"learning_rate": 2.9998561206933938e-05,
"loss": 0.6497,
"step": 320
},
{
"epoch": 14.347826086956522,
"grad_norm": 1.4810696840286255,
"learning_rate": 2.9998427437188786e-05,
"loss": 0.5741,
"step": 330
},
{
"epoch": 14.782608695652174,
"grad_norm": 1.3101780414581299,
"learning_rate": 2.99982877225462e-05,
"loss": 0.6013,
"step": 340
},
{
"epoch": 15.217391304347826,
"grad_norm": 0.9747373461723328,
"learning_rate": 2.9998142063061564e-05,
"loss": 0.4991,
"step": 350
},
{
"epoch": 15.652173913043478,
"grad_norm": 1.6347649097442627,
"learning_rate": 2.9997990458792603e-05,
"loss": 0.5624,
"step": 360
},
{
"epoch": 16.08695652173913,
"grad_norm": 1.6364760398864746,
"learning_rate": 2.9997832909799417e-05,
"loss": 0.667,
"step": 370
},
{
"epoch": 16.52173913043478,
"grad_norm": 0.9518026113510132,
"learning_rate": 2.9997669416144452e-05,
"loss": 0.513,
"step": 380
},
{
"epoch": 16.956521739130434,
"grad_norm": 0.9366481304168701,
"learning_rate": 2.999749997789251e-05,
"loss": 0.5798,
"step": 390
},
{
"epoch": 17.391304347826086,
"grad_norm": 1.1163969039916992,
"learning_rate": 2.9997324595110743e-05,
"loss": 0.518,
"step": 400
},
{
"epoch": 17.82608695652174,
"grad_norm": 1.2849133014678955,
"learning_rate": 2.9997143267868683e-05,
"loss": 0.5877,
"step": 410
},
{
"epoch": 18.26086956521739,
"grad_norm": 1.1642106771469116,
"learning_rate": 2.9996955996238192e-05,
"loss": 0.506,
"step": 420
},
{
"epoch": 18.695652173913043,
"grad_norm": 1.1996164321899414,
"learning_rate": 2.9996762780293503e-05,
"loss": 0.5315,
"step": 430
},
{
"epoch": 19.130434782608695,
"grad_norm": 1.214064121246338,
"learning_rate": 2.9996563620111197e-05,
"loss": 0.5334,
"step": 440
},
{
"epoch": 19.565217391304348,
"grad_norm": 1.4286197423934937,
"learning_rate": 2.9996358515770218e-05,
"loss": 0.5677,
"step": 450
},
{
"epoch": 19.565217391304348,
"eval_loss": 0.717534065246582,
"eval_runtime": 0.5321,
"eval_samples_per_second": 18.792,
"eval_steps_per_second": 18.792,
"step": 450
},
{
"Start_State_loss": 0.7309322357177734,
"Start_State_runtime": 0.4001,
"Start_State_samples_per_second": 24.995,
"Start_State_steps_per_second": 24.995,
"epoch": 19.565217391304348,
"step": 450
},
{
"Raw_Model_loss": 0.717534065246582,
"Raw_Model_runtime": 0.4009,
"Raw_Model_samples_per_second": 24.944,
"Raw_Model_steps_per_second": 24.944,
"epoch": 19.565217391304348,
"step": 450
},
{
"SWA_loss": 0.7233762741088867,
"SWA_runtime": 0.3946,
"SWA_samples_per_second": 25.341,
"SWA_steps_per_second": 25.341,
"epoch": 19.565217391304348,
"step": 450
},
{
"EMA_loss": 0.7309414744377136,
"EMA_runtime": 0.4041,
"EMA_samples_per_second": 24.748,
"EMA_steps_per_second": 24.748,
"epoch": 19.565217391304348,
"step": 450
},
{
"epoch": 20.0,
"grad_norm": 2.118807792663574,
"learning_rate": 2.9996147467351856e-05,
"loss": 0.5147,
"step": 460
},
{
"epoch": 20.434782608695652,
"grad_norm": 1.2774548530578613,
"learning_rate": 2.9995930474939773e-05,
"loss": 0.4785,
"step": 470
},
{
"epoch": 20.869565217391305,
"grad_norm": 1.4731013774871826,
"learning_rate": 2.9995707538619975e-05,
"loss": 0.5703,
"step": 480
},
{
"epoch": 21.304347826086957,
"grad_norm": 1.3251285552978516,
"learning_rate": 2.9995478658480822e-05,
"loss": 0.5164,
"step": 490
},
{
"epoch": 21.73913043478261,
"grad_norm": 1.2412965297698975,
"learning_rate": 2.9995243834613043e-05,
"loss": 0.5204,
"step": 500
},
{
"epoch": 22.17391304347826,
"grad_norm": 1.7840219736099243,
"learning_rate": 2.9995003067109707e-05,
"loss": 0.4838,
"step": 510
},
{
"epoch": 22.608695652173914,
"grad_norm": 1.5308188199996948,
"learning_rate": 2.9994756356066246e-05,
"loss": 0.5616,
"step": 520
},
{
"epoch": 23.043478260869566,
"grad_norm": 1.7345212697982788,
"learning_rate": 2.999450370158046e-05,
"loss": 0.4929,
"step": 530
},
{
"epoch": 23.47826086956522,
"grad_norm": 1.3111943006515503,
"learning_rate": 2.9994245103752478e-05,
"loss": 0.4384,
"step": 540
},
{
"epoch": 23.91304347826087,
"grad_norm": 1.234527349472046,
"learning_rate": 2.999398056268481e-05,
"loss": 0.5266,
"step": 550
},
{
"epoch": 24.347826086956523,
"grad_norm": 1.4057211875915527,
"learning_rate": 2.9993710078482306e-05,
"loss": 0.5204,
"step": 560
},
{
"epoch": 24.782608695652176,
"grad_norm": 0.9548116326332092,
"learning_rate": 2.9993433651252185e-05,
"loss": 0.4428,
"step": 570
},
{
"epoch": 25.217391304347824,
"grad_norm": 1.7164983749389648,
"learning_rate": 2.9993151281104006e-05,
"loss": 0.5329,
"step": 580
},
{
"epoch": 25.652173913043477,
"grad_norm": 1.1313426494598389,
"learning_rate": 2.9992862968149695e-05,
"loss": 0.4733,
"step": 590
},
{
"epoch": 26.08695652173913,
"grad_norm": 1.1755690574645996,
"learning_rate": 2.9992568712503533e-05,
"loss": 0.4607,
"step": 600
},
{
"epoch": 26.08695652173913,
"eval_loss": 0.7199033498764038,
"eval_runtime": 0.3999,
"eval_samples_per_second": 25.009,
"eval_steps_per_second": 25.009,
"step": 600
},
{
"Start_State_loss": 0.7309322357177734,
"Start_State_runtime": 0.3949,
"Start_State_samples_per_second": 25.321,
"Start_State_steps_per_second": 25.321,
"epoch": 26.08695652173913,
"step": 600
},
{
"Raw_Model_loss": 0.7199033498764038,
"Raw_Model_runtime": 0.3918,
"Raw_Model_samples_per_second": 25.525,
"Raw_Model_steps_per_second": 25.525,
"epoch": 26.08695652173913,
"step": 600
},
{
"SWA_loss": 0.7180979251861572,
"SWA_runtime": 0.3964,
"SWA_samples_per_second": 25.23,
"SWA_steps_per_second": 25.23,
"epoch": 26.08695652173913,
"step": 600
},
{
"EMA_loss": 0.7309598326683044,
"EMA_runtime": 0.3932,
"EMA_samples_per_second": 25.43,
"EMA_steps_per_second": 25.43,
"epoch": 26.08695652173913,
"step": 600
},
{
"epoch": 26.52173913043478,
"grad_norm": 1.095847249031067,
"learning_rate": 2.9992268514282142e-05,
"loss": 0.5118,
"step": 610
},
{
"epoch": 26.956521739130434,
"grad_norm": 1.3382961750030518,
"learning_rate": 2.999196237360452e-05,
"loss": 0.4316,
"step": 620
},
{
"epoch": 27.391304347826086,
"grad_norm": 1.2023630142211914,
"learning_rate": 2.9991650290592016e-05,
"loss": 0.4756,
"step": 630
},
{
"epoch": 27.82608695652174,
"grad_norm": 1.3882129192352295,
"learning_rate": 2.999133226536832e-05,
"loss": 0.5011,
"step": 640
},
{
"epoch": 28.26086956521739,
"grad_norm": 1.4160760641098022,
"learning_rate": 2.9991008298059493e-05,
"loss": 0.4106,
"step": 650
},
{
"epoch": 28.695652173913043,
"grad_norm": 1.5552334785461426,
"learning_rate": 2.9990678388793944e-05,
"loss": 0.5064,
"step": 660
},
{
"epoch": 29.130434782608695,
"grad_norm": 1.3141825199127197,
"learning_rate": 2.999034253770244e-05,
"loss": 0.4349,
"step": 670
},
{
"epoch": 29.565217391304348,
"grad_norm": 1.0743430852890015,
"learning_rate": 2.9990000744918097e-05,
"loss": 0.4704,
"step": 680
},
{
"epoch": 30.0,
"grad_norm": 2.4982922077178955,
"learning_rate": 2.9989653010576392e-05,
"loss": 0.4144,
"step": 690
},
{
"epoch": 30.434782608695652,
"grad_norm": 1.3296608924865723,
"learning_rate": 2.9989299334815158e-05,
"loss": 0.4766,
"step": 700
},
{
"epoch": 30.869565217391305,
"grad_norm": 1.62749445438385,
"learning_rate": 2.9988939717774578e-05,
"loss": 0.412,
"step": 710
},
{
"epoch": 31.304347826086957,
"grad_norm": 0.9021294116973877,
"learning_rate": 2.9988574159597194e-05,
"loss": 0.4246,
"step": 720
},
{
"epoch": 31.73913043478261,
"grad_norm": 1.641708254814148,
"learning_rate": 2.9988202660427907e-05,
"loss": 0.4827,
"step": 730
},
{
"epoch": 32.17391304347826,
"grad_norm": 1.1983932256698608,
"learning_rate": 2.9987825220413958e-05,
"loss": 0.4382,
"step": 740
},
{
"epoch": 32.608695652173914,
"grad_norm": 1.765030026435852,
"learning_rate": 2.998744183970496e-05,
"loss": 0.4731,
"step": 750
},
{
"epoch": 32.608695652173914,
"eval_loss": 0.7314910888671875,
"eval_runtime": 0.4917,
"eval_samples_per_second": 20.337,
"eval_steps_per_second": 20.337,
"step": 750
},
{
"Start_State_loss": 0.7309322357177734,
"Start_State_runtime": 0.4719,
"Start_State_samples_per_second": 21.19,
"Start_State_steps_per_second": 21.19,
"epoch": 32.608695652173914,
"step": 750
},
{
"Raw_Model_loss": 0.7314910888671875,
"Raw_Model_runtime": 0.4163,
"Raw_Model_samples_per_second": 24.018,
"Raw_Model_steps_per_second": 24.018,
"epoch": 32.608695652173914,
"step": 750
},
{
"SWA_loss": 0.7177615761756897,
"SWA_runtime": 0.3941,
"SWA_samples_per_second": 25.374,
"SWA_steps_per_second": 25.374,
"epoch": 32.608695652173914,
"step": 750
},
{
"EMA_loss": 0.7303470969200134,
"EMA_runtime": 0.407,
"EMA_samples_per_second": 24.567,
"EMA_steps_per_second": 24.567,
"epoch": 32.608695652173914,
"step": 750
},
{
"epoch": 33.04347826086956,
"grad_norm": 1.495551347732544,
"learning_rate": 2.998705251845287e-05,
"loss": 0.4299,
"step": 760
},
{
"epoch": 33.47826086956522,
"grad_norm": 1.644679069519043,
"learning_rate": 2.9986657256812e-05,
"loss": 0.4302,
"step": 770
},
{
"epoch": 33.91304347826087,
"grad_norm": 1.3021020889282227,
"learning_rate": 2.9986256054939022e-05,
"loss": 0.4078,
"step": 780
},
{
"epoch": 34.34782608695652,
"grad_norm": 1.483847975730896,
"learning_rate": 2.9985848912992956e-05,
"loss": 0.4026,
"step": 790
},
{
"epoch": 34.78260869565217,
"grad_norm": 1.5579402446746826,
"learning_rate": 2.9985435831135184e-05,
"loss": 0.3833,
"step": 800
},
{
"epoch": 35.21739130434783,
"grad_norm": 1.3132578134536743,
"learning_rate": 2.9985016809529437e-05,
"loss": 0.4742,
"step": 810
},
{
"epoch": 35.65217391304348,
"grad_norm": 1.332205891609192,
"learning_rate": 2.9984591848341806e-05,
"loss": 0.4028,
"step": 820
},
{
"epoch": 36.08695652173913,
"grad_norm": 1.0762503147125244,
"learning_rate": 2.9984160947740723e-05,
"loss": 0.4181,
"step": 830
},
{
"epoch": 36.52173913043478,
"grad_norm": 1.1693116426467896,
"learning_rate": 2.9983724107896993e-05,
"loss": 0.3803,
"step": 840
},
{
"epoch": 36.95652173913044,
"grad_norm": 1.4850109815597534,
"learning_rate": 2.9983281328983757e-05,
"loss": 0.4498,
"step": 850
},
{
"epoch": 37.391304347826086,
"grad_norm": 1.8984379768371582,
"learning_rate": 2.9982832611176523e-05,
"loss": 0.4182,
"step": 860
},
{
"epoch": 37.82608695652174,
"grad_norm": 1.2748432159423828,
"learning_rate": 2.998237795465315e-05,
"loss": 0.3716,
"step": 870
},
{
"epoch": 38.26086956521739,
"grad_norm": 1.268835186958313,
"learning_rate": 2.9981917359593843e-05,
"loss": 0.4011,
"step": 880
},
{
"epoch": 38.69565217391305,
"grad_norm": 1.446075439453125,
"learning_rate": 2.9981450826181172e-05,
"loss": 0.3551,
"step": 890
},
{
"epoch": 39.130434782608695,
"grad_norm": 1.9400171041488647,
"learning_rate": 2.9980978354600057e-05,
"loss": 0.4631,
"step": 900
},
{
"epoch": 39.130434782608695,
"eval_loss": 0.7510843276977539,
"eval_runtime": 0.4747,
"eval_samples_per_second": 21.065,
"eval_steps_per_second": 21.065,
"step": 900
},
{
"Start_State_loss": 0.7309322357177734,
"Start_State_runtime": 0.4104,
"Start_State_samples_per_second": 24.364,
"Start_State_steps_per_second": 24.364,
"epoch": 39.130434782608695,
"step": 900
},
{
"Raw_Model_loss": 0.7510843276977539,
"Raw_Model_runtime": 0.4105,
"Raw_Model_samples_per_second": 24.36,
"Raw_Model_steps_per_second": 24.36,
"epoch": 39.130434782608695,
"step": 900
},
{
"SWA_loss": 0.7209179997444153,
"SWA_runtime": 0.4135,
"SWA_samples_per_second": 24.182,
"SWA_steps_per_second": 24.182,
"epoch": 39.130434782608695,
"step": 900
},
{
"EMA_loss": 0.7315307259559631,
"EMA_runtime": 0.4077,
"EMA_samples_per_second": 24.527,
"EMA_steps_per_second": 24.527,
"epoch": 39.130434782608695,
"step": 900
},
{
"epoch": 39.56521739130435,
"grad_norm": 1.5488810539245605,
"learning_rate": 2.9980499945037765e-05,
"loss": 0.3835,
"step": 910
},
{
"epoch": 40.0,
"grad_norm": 3.038804769515991,
"learning_rate": 2.998001559768393e-05,
"loss": 0.3862,
"step": 920
},
{
"epoch": 40.43478260869565,
"grad_norm": 1.543023705482483,
"learning_rate": 2.9979525312730525e-05,
"loss": 0.4491,
"step": 930
},
{
"epoch": 40.869565217391305,
"grad_norm": 1.6729778051376343,
"learning_rate": 2.9979029090371885e-05,
"loss": 0.3409,
"step": 940
},
{
"epoch": 41.30434782608695,
"grad_norm": 2.224083662033081,
"learning_rate": 2.99785269308047e-05,
"loss": 0.3417,
"step": 950
},
{
"epoch": 41.73913043478261,
"grad_norm": 1.5069278478622437,
"learning_rate": 2.9978018834228007e-05,
"loss": 0.3647,
"step": 960
},
{
"epoch": 42.17391304347826,
"grad_norm": 1.5148930549621582,
"learning_rate": 2.9977504800843197e-05,
"loss": 0.4348,
"step": 970
},
{
"epoch": 42.608695652173914,
"grad_norm": 1.5450372695922852,
"learning_rate": 2.9976984830854022e-05,
"loss": 0.3751,
"step": 980
},
{
"epoch": 43.04347826086956,
"grad_norm": 1.6492244005203247,
"learning_rate": 2.997645892446658e-05,
"loss": 0.3672,
"step": 990
},
{
"epoch": 43.47826086956522,
"grad_norm": 1.4176095724105835,
"learning_rate": 2.9975927081889322e-05,
"loss": 0.3908,
"step": 1000
},
{
"epoch": 43.91304347826087,
"grad_norm": 1.192176342010498,
"learning_rate": 2.9975389303333047e-05,
"loss": 0.3461,
"step": 1010
},
{
"epoch": 44.34782608695652,
"grad_norm": 2.0878190994262695,
"learning_rate": 2.997484558901093e-05,
"loss": 0.3918,
"step": 1020
},
{
"epoch": 44.78260869565217,
"grad_norm": 1.6980842351913452,
"learning_rate": 2.9974295939138465e-05,
"loss": 0.3809,
"step": 1030
},
{
"epoch": 45.21739130434783,
"grad_norm": 1.2522655725479126,
"learning_rate": 2.9973740353933523e-05,
"loss": 0.2649,
"step": 1040
},
{
"epoch": 45.65217391304348,
"grad_norm": 1.678786039352417,
"learning_rate": 2.997317883361632e-05,
"loss": 0.3613,
"step": 1050
},
{
"epoch": 45.65217391304348,
"eval_loss": 0.777652382850647,
"eval_runtime": 0.507,
"eval_samples_per_second": 19.723,
"eval_steps_per_second": 19.723,
"step": 1050
},
{
"Start_State_loss": 0.7309322357177734,
"Start_State_runtime": 0.452,
"Start_State_samples_per_second": 22.123,
"Start_State_steps_per_second": 22.123,
"epoch": 45.65217391304348,
"step": 1050
},
{
"Raw_Model_loss": 0.777652382850647,
"Raw_Model_runtime": 0.4426,
"Raw_Model_samples_per_second": 22.594,
"Raw_Model_steps_per_second": 22.594,
"epoch": 45.65217391304348,
"step": 1050
},
{
"SWA_loss": 0.7228068709373474,
"SWA_runtime": 0.4408,
"SWA_samples_per_second": 22.684,
"SWA_steps_per_second": 22.684,
"epoch": 45.65217391304348,
"step": 1050
},
{
"EMA_loss": 0.7306644320487976,
"EMA_runtime": 0.3993,
"EMA_samples_per_second": 25.044,
"EMA_steps_per_second": 25.044,
"epoch": 45.65217391304348,
"step": 1050
},
{
"epoch": 46.08695652173913,
"grad_norm": 1.7957395315170288,
"learning_rate": 2.997261137840943e-05,
"loss": 0.4103,
"step": 1060
},
{
"epoch": 46.52173913043478,
"grad_norm": 2.156790256500244,
"learning_rate": 2.9972037988537758e-05,
"loss": 0.3785,
"step": 1070
},
{
"epoch": 46.95652173913044,
"grad_norm": 1.9486017227172852,
"learning_rate": 2.9971458664228595e-05,
"loss": 0.3324,
"step": 1080
},
{
"epoch": 47.391304347826086,
"grad_norm": 2.1510581970214844,
"learning_rate": 2.997087340571156e-05,
"loss": 0.3368,
"step": 1090
},
{
"epoch": 47.82608695652174,
"grad_norm": 1.5172206163406372,
"learning_rate": 2.997028221321863e-05,
"loss": 0.3563,
"step": 1100
},
{
"epoch": 48.26086956521739,
"grad_norm": 2.3161354064941406,
"learning_rate": 2.9969685086984132e-05,
"loss": 0.3734,
"step": 1110
},
{
"epoch": 48.69565217391305,
"grad_norm": 1.6685658693313599,
"learning_rate": 2.9969082027244755e-05,
"loss": 0.3001,
"step": 1120
},
{
"epoch": 49.130434782608695,
"grad_norm": 1.843396782875061,
"learning_rate": 2.996847303423953e-05,
"loss": 0.4154,
"step": 1130
},
{
"epoch": 49.56521739130435,
"grad_norm": 1.3093624114990234,
"learning_rate": 2.9967858108209838e-05,
"loss": 0.3713,
"step": 1140
},
{
"epoch": 50.0,
"grad_norm": 2.944302797317505,
"learning_rate": 2.9967237249399417e-05,
"loss": 0.292,
"step": 1150
},
{
"epoch": 50.43478260869565,
"grad_norm": 1.5263242721557617,
"learning_rate": 2.996661045805436e-05,
"loss": 0.2961,
"step": 1160
},
{
"epoch": 50.869565217391305,
"grad_norm": 1.8892343044281006,
"learning_rate": 2.9965977734423106e-05,
"loss": 0.3417,
"step": 1170
},
{
"epoch": 51.30434782608695,
"grad_norm": 1.9361391067504883,
"learning_rate": 2.9965339078756445e-05,
"loss": 0.3541,
"step": 1180
},
{
"epoch": 51.73913043478261,
"grad_norm": 1.1996322870254517,
"learning_rate": 2.9964694491307514e-05,
"loss": 0.2802,
"step": 1190
},
{
"epoch": 52.17391304347826,
"grad_norm": 2.5254251956939697,
"learning_rate": 2.996404397233182e-05,
"loss": 0.4085,
"step": 1200
},
{
"epoch": 52.17391304347826,
"eval_loss": 0.8014206886291504,
"eval_runtime": 0.508,
"eval_samples_per_second": 19.686,
"eval_steps_per_second": 19.686,
"step": 1200
},
{
"Start_State_loss": 0.7309322357177734,
"Start_State_runtime": 0.4,
"Start_State_samples_per_second": 25.001,
"Start_State_steps_per_second": 25.001,
"epoch": 52.17391304347826,
"step": 1200
},
{
"Raw_Model_loss": 0.8014206886291504,
"Raw_Model_runtime": 0.4013,
"Raw_Model_samples_per_second": 24.922,
"Raw_Model_steps_per_second": 24.922,
"epoch": 52.17391304347826,
"step": 1200
},
{
"SWA_loss": 0.7281149625778198,
"SWA_runtime": 0.4076,
"SWA_samples_per_second": 24.535,
"SWA_steps_per_second": 24.535,
"epoch": 52.17391304347826,
"step": 1200
},
{
"EMA_loss": 0.7309959530830383,
"EMA_runtime": 0.415,
"EMA_samples_per_second": 24.094,
"EMA_steps_per_second": 24.094,
"epoch": 52.17391304347826,
"step": 1200
},
{
"epoch": 52.608695652173914,
"grad_norm": 1.616297721862793,
"learning_rate": 1.4982021986165911e-06,
"loss": 0.2865,
"step": 1210
},
{
"epoch": 53.04347826086956,
"grad_norm": 1.7724196910858154,
"learning_rate": 2.9964043972331822e-06,
"loss": 0.3649,
"step": 1220
},
{
"epoch": 53.47826086956522,
"grad_norm": 1.3953560590744019,
"learning_rate": 4.494606595849773e-06,
"loss": 0.2719,
"step": 1230
},
{
"epoch": 53.91304347826087,
"grad_norm": 1.9502956867218018,
"learning_rate": 5.9928087944663644e-06,
"loss": 0.334,
"step": 1240
},
{
"epoch": 54.34782608695652,
"grad_norm": 1.9493101835250854,
"learning_rate": 7.491010993082955e-06,
"loss": 0.385,
"step": 1250
},
{
"epoch": 54.78260869565217,
"grad_norm": 1.1656595468521118,
"learning_rate": 8.989213191699545e-06,
"loss": 0.284,
"step": 1260
},
{
"epoch": 55.21739130434783,
"grad_norm": 1.5772318840026855,
"learning_rate": 1.0487415390316136e-05,
"loss": 0.3105,
"step": 1270
},
{
"epoch": 55.65217391304348,
"grad_norm": 1.708022117614746,
"learning_rate": 1.1985617588932729e-05,
"loss": 0.3553,
"step": 1280
},
{
"epoch": 56.08695652173913,
"grad_norm": 2.282125473022461,
"learning_rate": 1.348381978754932e-05,
"loss": 0.2844,
"step": 1290
},
{
"epoch": 56.52173913043478,
"grad_norm": 1.458001971244812,
"learning_rate": 1.498202198616591e-05,
"loss": 0.3387,
"step": 1300
},
{
"epoch": 56.95652173913044,
"grad_norm": 1.9748072624206543,
"learning_rate": 1.4982020501567203e-05,
"loss": 0.3318,
"step": 1310
},
{
"epoch": 57.391304347826086,
"grad_norm": 1.4179987907409668,
"learning_rate": 1.4982016047771664e-05,
"loss": 0.3109,
"step": 1320
},
{
"epoch": 57.82608695652174,
"grad_norm": 2.815448522567749,
"learning_rate": 1.4982008624781062e-05,
"loss": 0.3369,
"step": 1330
},
{
"epoch": 58.26086956521739,
"grad_norm": 1.4394376277923584,
"learning_rate": 1.4981998232598337e-05,
"loss": 0.3303,
"step": 1340
},
{
"epoch": 58.69565217391305,
"grad_norm": 1.8707002401351929,
"learning_rate": 1.4981984871227611e-05,
"loss": 0.3077,
"step": 1350
},
{
"epoch": 58.69565217391305,
"eval_loss": 0.8195747137069702,
"eval_runtime": 0.4109,
"eval_samples_per_second": 24.335,
"eval_steps_per_second": 24.335,
"step": 1350
},
{
"Start_State_loss": 0.7309322357177734,
"Start_State_runtime": 0.3897,
"Start_State_samples_per_second": 25.658,
"Start_State_steps_per_second": 25.658,
"epoch": 58.69565217391305,
"step": 1350
},
{
"Raw_Model_loss": 0.8195747137069702,
"Raw_Model_runtime": 0.3979,
"Raw_Model_samples_per_second": 25.132,
"Raw_Model_steps_per_second": 25.132,
"epoch": 58.69565217391305,
"step": 1350
},
{
"SWA_loss": 0.7320815324783325,
"SWA_runtime": 0.395,
"SWA_samples_per_second": 25.319,
"SWA_steps_per_second": 25.319,
"epoch": 58.69565217391305,
"step": 1350
},
{
"EMA_loss": 0.7313606142997742,
"EMA_runtime": 0.3882,
"EMA_samples_per_second": 25.759,
"EMA_steps_per_second": 25.759,
"epoch": 58.69565217391305,
"step": 1350
},
{
"epoch": 59.130434782608695,
"grad_norm": 1.5965052843093872,
"learning_rate": 1.4981968540674177e-05,
"loss": 0.3206,
"step": 1360
},
{
"epoch": 59.56521739130435,
"grad_norm": 1.3822482824325562,
"learning_rate": 1.4981949240944509e-05,
"loss": 0.3011,
"step": 1370
},
{
"epoch": 60.0,
"grad_norm": 1.6288312673568726,
"learning_rate": 1.4981926972046258e-05,
"loss": 0.3095,
"step": 1380
},
{
"epoch": 60.43478260869565,
"grad_norm": 1.9036870002746582,
"learning_rate": 1.498190173398825e-05,
"loss": 0.3173,
"step": 1390
},
{
"epoch": 60.869565217391305,
"grad_norm": 1.5387356281280518,
"learning_rate": 1.4981873526780487e-05,
"loss": 0.3054,
"step": 1400
},
{
"epoch": 61.30434782608695,
"grad_norm": 1.4343056678771973,
"learning_rate": 1.4981842350434152e-05,
"loss": 0.3046,
"step": 1410
},
{
"epoch": 61.73913043478261,
"grad_norm": 1.4938664436340332,
"learning_rate": 1.49818082049616e-05,
"loss": 0.3205,
"step": 1420
},
{
"epoch": 62.17391304347826,
"grad_norm": 2.177480459213257,
"learning_rate": 1.4981771090376367e-05,
"loss": 0.2865,
"step": 1430
},
{
"epoch": 62.608695652173914,
"grad_norm": 1.8865878582000732,
"learning_rate": 1.4981731006693164e-05,
"loss": 0.3213,
"step": 1440
},
{
"epoch": 63.04347826086956,
"grad_norm": 1.3152176141738892,
"learning_rate": 1.4981687953927875e-05,
"loss": 0.3125,
"step": 1450
},
{
"epoch": 63.47826086956522,
"grad_norm": 1.9965901374816895,
"learning_rate": 1.498164193209757e-05,
"loss": 0.345,
"step": 1460
},
{
"epoch": 63.91304347826087,
"grad_norm": 1.6480698585510254,
"learning_rate": 1.498159294122049e-05,
"loss": 0.2924,
"step": 1470
},
{
"epoch": 64.34782608695652,
"grad_norm": 1.8093769550323486,
"learning_rate": 1.4981540981316052e-05,
"loss": 0.2688,
"step": 1480
},
{
"epoch": 64.78260869565217,
"grad_norm": 1.529961347579956,
"learning_rate": 1.4981486052404848e-05,
"loss": 0.3585,
"step": 1490
},
{
"epoch": 65.21739130434783,
"grad_norm": 1.4079116582870483,
"learning_rate": 1.4981428154508652e-05,
"loss": 0.269,
"step": 1500
},
{
"epoch": 65.21739130434783,
"eval_loss": 0.8343552350997925,
"eval_runtime": 0.4105,
"eval_samples_per_second": 24.363,
"eval_steps_per_second": 24.363,
"step": 1500
},
{
"Start_State_loss": 0.7309322357177734,
"Start_State_runtime": 0.4152,
"Start_State_samples_per_second": 24.085,
"Start_State_steps_per_second": 24.085,
"epoch": 65.21739130434783,
"step": 1500
},
{
"Raw_Model_loss": 0.8343552350997925,
"Raw_Model_runtime": 0.399,
"Raw_Model_samples_per_second": 25.06,
"Raw_Model_steps_per_second": 25.06,
"epoch": 65.21739130434783,
"step": 1500
},
{
"SWA_loss": 0.7395024299621582,
"SWA_runtime": 0.4083,
"SWA_samples_per_second": 24.489,
"SWA_steps_per_second": 24.489,
"epoch": 65.21739130434783,
"step": 1500
},
{
"EMA_loss": 0.7315851449966431,
"EMA_runtime": 0.4003,
"EMA_samples_per_second": 24.979,
"EMA_steps_per_second": 24.979,
"epoch": 65.21739130434783,
"step": 1500
},
{
"epoch": 65.65217391304348,
"grad_norm": 2.2492856979370117,
"learning_rate": 1.4981367287650419e-05,
"loss": 0.3161,
"step": 1510
},
{
"epoch": 66.08695652173913,
"grad_norm": 1.7571766376495361,
"learning_rate": 1.4981303451854267e-05,
"loss": 0.2947,
"step": 1520
},
{
"epoch": 66.52173913043478,
"grad_norm": 1.7509160041809082,
"learning_rate": 1.4981236647145501e-05,
"loss": 0.3107,
"step": 1530
},
{
"epoch": 66.95652173913044,
"grad_norm": 2.094277858734131,
"learning_rate": 1.4981166873550601e-05,
"loss": 0.3051,
"step": 1540
},
{
"epoch": 67.3913043478261,
"grad_norm": 1.7601019144058228,
"learning_rate": 1.4981094131097224e-05,
"loss": 0.2711,
"step": 1550
},
{
"epoch": 67.82608695652173,
"grad_norm": 2.0073230266571045,
"learning_rate": 1.49810184198142e-05,
"loss": 0.3434,
"step": 1560
},
{
"epoch": 68.26086956521739,
"grad_norm": 2.084998846054077,
"learning_rate": 1.498093973973154e-05,
"loss": 0.2506,
"step": 1570
},
{
"epoch": 68.69565217391305,
"grad_norm": 1.8126795291900635,
"learning_rate": 1.4980858090880429e-05,
"loss": 0.286,
"step": 1580
},
{
"epoch": 69.1304347826087,
"grad_norm": 1.9416148662567139,
"learning_rate": 1.4980773473293232e-05,
"loss": 0.3681,
"step": 1590
},
{
"epoch": 69.56521739130434,
"grad_norm": 1.978805422782898,
"learning_rate": 1.4980685887003486e-05,
"loss": 0.3073,
"step": 1600
},
{
"epoch": 70.0,
"grad_norm": 1.6534956693649292,
"learning_rate": 1.498059533204591e-05,
"loss": 0.2691,
"step": 1610
},
{
"epoch": 70.43478260869566,
"grad_norm": 2.2284836769104004,
"learning_rate": 1.4980501808456398e-05,
"loss": 0.3139,
"step": 1620
},
{
"epoch": 70.8695652173913,
"grad_norm": 1.9585868120193481,
"learning_rate": 1.4980405316272018e-05,
"loss": 0.2997,
"step": 1630
},
{
"epoch": 71.30434782608695,
"grad_norm": 2.346238851547241,
"learning_rate": 1.4980305855531015e-05,
"loss": 0.2891,
"step": 1640
},
{
"epoch": 71.73913043478261,
"grad_norm": 1.851641058921814,
"learning_rate": 1.4980203426272815e-05,
"loss": 0.2627,
"step": 1650
},
{
"epoch": 71.73913043478261,
"eval_loss": 0.8489276766777039,
"eval_runtime": 0.4811,
"eval_samples_per_second": 20.784,
"eval_steps_per_second": 20.784,
"step": 1650
},
{
"Start_State_loss": 0.7309322357177734,
"Start_State_runtime": 0.4395,
"Start_State_samples_per_second": 22.752,
"Start_State_steps_per_second": 22.752,
"epoch": 71.73913043478261,
"step": 1650
},
{
"Raw_Model_loss": 0.8489276766777039,
"Raw_Model_runtime": 0.4362,
"Raw_Model_samples_per_second": 22.928,
"Raw_Model_steps_per_second": 22.928,
"epoch": 71.73913043478261,
"step": 1650
},
{
"SWA_loss": 0.7444645166397095,
"SWA_runtime": 0.4365,
"SWA_samples_per_second": 22.911,
"SWA_steps_per_second": 22.911,
"epoch": 71.73913043478261,
"step": 1650
},
{
"EMA_loss": 0.7310957312583923,
"EMA_runtime": 0.4342,
"EMA_samples_per_second": 23.032,
"EMA_steps_per_second": 23.032,
"epoch": 71.73913043478261,
"step": 1650
},
{
"epoch": 72.17391304347827,
"grad_norm": 1.65473210811615,
"learning_rate": 1.4980098028538014e-05,
"loss": 0.328,
"step": 1660
},
{
"epoch": 72.6086956521739,
"grad_norm": 2.0884604454040527,
"learning_rate": 1.4979989662368391e-05,
"loss": 0.2959,
"step": 1670
},
{
"epoch": 73.04347826086956,
"grad_norm": 1.906488299369812,
"learning_rate": 1.4979878327806899e-05,
"loss": 0.3098,
"step": 1680
},
{
"epoch": 73.47826086956522,
"grad_norm": 2.01023530960083,
"learning_rate": 1.4979764024897668e-05,
"loss": 0.2878,
"step": 1690
},
{
"epoch": 73.91304347826087,
"grad_norm": 1.8358246088027954,
"learning_rate": 1.4979646753686002e-05,
"loss": 0.2796,
"step": 1700
},
{
"epoch": 74.34782608695652,
"grad_norm": 1.3833634853363037,
"learning_rate": 1.4979526514218385e-05,
"loss": 0.2769,
"step": 1710
},
{
"epoch": 74.78260869565217,
"grad_norm": 1.5111050605773926,
"learning_rate": 1.4979403306542473e-05,
"loss": 0.3278,
"step": 1720
},
{
"epoch": 75.21739130434783,
"grad_norm": 1.5712664127349854,
"learning_rate": 1.4979277130707107e-05,
"loss": 0.2338,
"step": 1730
},
{
"epoch": 75.65217391304348,
"grad_norm": 1.660670280456543,
"learning_rate": 1.4979147986762295e-05,
"loss": 0.3144,
"step": 1740
},
{
"epoch": 76.08695652173913,
"grad_norm": 1.8221240043640137,
"learning_rate": 1.4979015874759227e-05,
"loss": 0.2694,
"step": 1750
},
{
"epoch": 76.52173913043478,
"grad_norm": 1.8922370672225952,
"learning_rate": 1.4978880794750266e-05,
"loss": 0.2665,
"step": 1760
},
{
"epoch": 76.95652173913044,
"grad_norm": 1.296356201171875,
"learning_rate": 1.4978742746788957e-05,
"loss": 0.3007,
"step": 1770
},
{
"epoch": 77.3913043478261,
"grad_norm": 1.8244571685791016,
"learning_rate": 1.4978601730930014e-05,
"loss": 0.2842,
"step": 1780
},
{
"epoch": 77.82608695652173,
"grad_norm": 1.8345180749893188,
"learning_rate": 1.4978457747229335e-05,
"loss": 0.2714,
"step": 1790
},
{
"epoch": 78.26086956521739,
"grad_norm": 1.850252389907837,
"learning_rate": 1.497831079574399e-05,
"loss": 0.3055,
"step": 1800
},
{
"epoch": 78.26086956521739,
"eval_loss": 0.8643280267715454,
"eval_runtime": 0.4359,
"eval_samples_per_second": 22.94,
"eval_steps_per_second": 22.94,
"step": 1800
},
{
"Start_State_loss": 0.7309322357177734,
"Start_State_runtime": 0.4203,
"Start_State_samples_per_second": 23.79,
"Start_State_steps_per_second": 23.79,
"epoch": 78.26086956521739,
"step": 1800
},
{
"Raw_Model_loss": 0.8643280267715454,
"Raw_Model_runtime": 0.4033,
"Raw_Model_samples_per_second": 24.798,
"Raw_Model_steps_per_second": 24.798,
"epoch": 78.26086956521739,
"step": 1800
},
{
"SWA_loss": 0.7512942552566528,
"SWA_runtime": 0.3891,
"SWA_samples_per_second": 25.698,
"SWA_steps_per_second": 25.698,
"epoch": 78.26086956521739,
"step": 1800
},
{
"EMA_loss": 0.7310723066329956,
"EMA_runtime": 0.3904,
"EMA_samples_per_second": 25.613,
"EMA_steps_per_second": 25.613,
"epoch": 78.26086956521739,
"step": 1800
},
{
"epoch": 78.69565217391305,
"grad_norm": 1.2641412019729614,
"learning_rate": 1.4978160876532222e-05,
"loss": 0.2824,
"step": 1810
},
{
"epoch": 79.1304347826087,
"grad_norm": 1.9765238761901855,
"learning_rate": 1.4978007989653455e-05,
"loss": 0.2406,
"step": 1820
},
{
"epoch": 79.56521739130434,
"grad_norm": 1.5835498571395874,
"learning_rate": 1.4977852135168293e-05,
"loss": 0.2607,
"step": 1830
},
{
"epoch": 80.0,
"grad_norm": 1.8932580947875977,
"learning_rate": 1.4977693313138507e-05,
"loss": 0.3036,
"step": 1840
},
{
"epoch": 80.43478260869566,
"grad_norm": 2.1030030250549316,
"learning_rate": 1.4977531523627054e-05,
"loss": 0.2799,
"step": 1850
},
{
"epoch": 80.8695652173913,
"grad_norm": 1.2366570234298706,
"learning_rate": 1.4977366766698058e-05,
"loss": 0.2792,
"step": 1860
},
{
"epoch": 81.30434782608695,
"grad_norm": 1.5485888719558716,
"learning_rate": 1.4977199042416822e-05,
"loss": 0.2311,
"step": 1870
},
{
"epoch": 81.73913043478261,
"grad_norm": 1.5375139713287354,
"learning_rate": 1.4977028350849831e-05,
"loss": 0.3059,
"step": 1880
},
{
"epoch": 82.17391304347827,
"grad_norm": 1.6247549057006836,
"learning_rate": 1.4976854692064739e-05,
"loss": 0.2147,
"step": 1890
},
{
"epoch": 82.6086956521739,
"grad_norm": 1.8154581785202026,
"learning_rate": 1.497667806613038e-05,
"loss": 0.2594,
"step": 1900
},
{
"epoch": 83.04347826086956,
"grad_norm": 1.579021692276001,
"learning_rate": 1.497649847311676e-05,
"loss": 0.3002,
"step": 1910
},
{
"epoch": 83.47826086956522,
"grad_norm": 1.4831469058990479,
"learning_rate": 1.4976315913095068e-05,
"loss": 0.265,
"step": 1920
},
{
"epoch": 83.91304347826087,
"grad_norm": 2.305431842803955,
"learning_rate": 1.4976130386137666e-05,
"loss": 0.3039,
"step": 1930
},
{
"epoch": 84.34782608695652,
"grad_norm": 1.720330834388733,
"learning_rate": 1.4975941892318084e-05,
"loss": 0.2642,
"step": 1940
},
{
"epoch": 84.78260869565217,
"grad_norm": 2.2541563510894775,
"learning_rate": 1.497575043171104e-05,
"loss": 0.2798,
"step": 1950
},
{
"epoch": 84.78260869565217,
"eval_loss": 0.8888376355171204,
"eval_runtime": 0.4413,
"eval_samples_per_second": 22.658,
"eval_steps_per_second": 22.658,
"step": 1950
},
{
"Start_State_loss": 0.7309322357177734,
"Start_State_runtime": 0.4289,
"Start_State_samples_per_second": 23.318,
"Start_State_steps_per_second": 23.318,
"epoch": 84.78260869565217,
"step": 1950
},
{
"Raw_Model_loss": 0.8888376355171204,
"Raw_Model_runtime": 0.427,
"Raw_Model_samples_per_second": 23.417,
"Raw_Model_steps_per_second": 23.417,
"epoch": 84.78260869565217,
"step": 1950
},
{
"SWA_loss": 0.7555862665176392,
"SWA_runtime": 0.402,
"SWA_samples_per_second": 24.875,
"SWA_steps_per_second": 24.875,
"epoch": 84.78260869565217,
"step": 1950
},
{
"EMA_loss": 0.7317630052566528,
"EMA_runtime": 0.3962,
"EMA_samples_per_second": 25.24,
"EMA_steps_per_second": 25.24,
"epoch": 84.78260869565217,
"step": 1950
},
{
"epoch": 85.21739130434783,
"grad_norm": 2.295459270477295,
"learning_rate": 7.487875215855521e-07,
"loss": 0.2648,
"step": 1960
},
{
"epoch": 85.65217391304348,
"grad_norm": 1.9124552011489868,
"learning_rate": 1.4975750431711041e-06,
"loss": 0.2705,
"step": 1970
},
{
"epoch": 86.08695652173913,
"grad_norm": 2.192692756652832,
"learning_rate": 2.2463625647566557e-06,
"loss": 0.2532,
"step": 1980
},
{
"epoch": 86.52173913043478,
"grad_norm": 1.792695164680481,
"learning_rate": 2.9951500863422082e-06,
"loss": 0.2765,
"step": 1990
},
{
"epoch": 86.95652173913044,
"grad_norm": 2.0215790271759033,
"learning_rate": 3.74393760792776e-06,
"loss": 0.2769,
"step": 2000
},
{
"epoch": 87.3913043478261,
"grad_norm": 1.4278439283370972,
"learning_rate": 4.4927251295133115e-06,
"loss": 0.278,
"step": 2010
},
{
"epoch": 87.82608695652173,
"grad_norm": 1.9748132228851318,
"learning_rate": 5.241512651098863e-06,
"loss": 0.2587,
"step": 2020
},
{
"epoch": 88.26086956521739,
"grad_norm": 2.0187323093414307,
"learning_rate": 5.9903001726844164e-06,
"loss": 0.2613,
"step": 2030
},
{
"epoch": 88.69565217391305,
"grad_norm": 1.7434452772140503,
"learning_rate": 6.739087694269968e-06,
"loss": 0.2851,
"step": 2040
},
{
"epoch": 89.1304347826087,
"grad_norm": 1.828153371810913,
"learning_rate": 7.48787521585552e-06,
"loss": 0.2918,
"step": 2050
},
{
"epoch": 89.56521739130434,
"grad_norm": 1.5711168050765991,
"learning_rate": 7.487874473866896e-06,
"loss": 0.247,
"step": 2060
},
{
"epoch": 90.0,
"grad_norm": 1.6228244304656982,
"learning_rate": 7.487872247901318e-06,
"loss": 0.2522,
"step": 2070
},
{
"epoch": 90.43478260869566,
"grad_norm": 1.863221526145935,
"learning_rate": 7.4878685379596685e-06,
"loss": 0.2577,
"step": 2080
},
{
"epoch": 90.8695652173913,
"grad_norm": 1.7543621063232422,
"learning_rate": 7.487863344043418e-06,
"loss": 0.283,
"step": 2090
},
{
"epoch": 91.30434782608695,
"grad_norm": 1.765681266784668,
"learning_rate": 7.487856666154626e-06,
"loss": 0.2727,
"step": 2100
},
{
"epoch": 91.30434782608695,
"eval_loss": 0.8941524624824524,
"eval_runtime": 0.5508,
"eval_samples_per_second": 18.155,
"eval_steps_per_second": 18.155,
"step": 2100
},
{
"Start_State_loss": 0.7309322357177734,
"Start_State_runtime": 0.4064,
"Start_State_samples_per_second": 24.603,
"Start_State_steps_per_second": 24.603,
"epoch": 91.30434782608695,
"step": 2100
},
{
"Raw_Model_loss": 0.8941524624824524,
"Raw_Model_runtime": 0.4788,
"Raw_Model_samples_per_second": 20.886,
"Raw_Model_steps_per_second": 20.886,
"epoch": 91.30434782608695,
"step": 2100
},
{
"SWA_loss": 0.7625434994697571,
"SWA_runtime": 0.5432,
"SWA_samples_per_second": 18.408,
"SWA_steps_per_second": 18.408,
"epoch": 91.30434782608695,
"step": 2100
},
{
"EMA_loss": 0.7306564450263977,
"EMA_runtime": 0.4703,
"EMA_samples_per_second": 21.264,
"EMA_steps_per_second": 21.264,
"epoch": 91.30434782608695,
"step": 2100
},
{
"epoch": 91.73913043478261,
"grad_norm": 2.2282097339630127,
"learning_rate": 7.487848504295937e-06,
"loss": 0.2597,
"step": 2110
},
{
"epoch": 92.17391304347827,
"grad_norm": 2.146618127822876,
"learning_rate": 7.4878388584705885e-06,
"loss": 0.2901,
"step": 2120
},
{
"epoch": 92.6086956521739,
"grad_norm": 1.9365864992141724,
"learning_rate": 7.487827728682402e-06,
"loss": 0.2796,
"step": 2130
},
{
"epoch": 93.04347826086956,
"grad_norm": 1.677370309829712,
"learning_rate": 7.487815114935791e-06,
"loss": 0.2375,
"step": 2140
},
{
"epoch": 93.47826086956522,
"grad_norm": 1.871509075164795,
"learning_rate": 7.487801017235753e-06,
"loss": 0.289,
"step": 2150
},
{
"epoch": 93.91304347826087,
"grad_norm": 2.1130902767181396,
"learning_rate": 7.4877854355878785e-06,
"loss": 0.2698,
"step": 2160
},
{
"epoch": 94.34782608695652,
"grad_norm": 1.9688533544540405,
"learning_rate": 7.487768369998342e-06,
"loss": 0.2168,
"step": 2170
},
{
"epoch": 94.78260869565217,
"grad_norm": 2.1728529930114746,
"learning_rate": 7.4877498204739075e-06,
"loss": 0.2961,
"step": 2180
},
{
"epoch": 95.21739130434783,
"grad_norm": 2.192168712615967,
"learning_rate": 7.487729787021927e-06,
"loss": 0.2599,
"step": 2190
},
{
"epoch": 95.65217391304348,
"grad_norm": 2.4115936756134033,
"learning_rate": 7.487708269650342e-06,
"loss": 0.2587,
"step": 2200
},
{
"epoch": 96.08695652173913,
"grad_norm": 2.353425979614258,
"learning_rate": 7.487685268367682e-06,
"loss": 0.259,
"step": 2210
},
{
"epoch": 96.52173913043478,
"grad_norm": 1.855171799659729,
"learning_rate": 7.487660783183063e-06,
"loss": 0.2681,
"step": 2220
},
{
"epoch": 96.95652173913044,
"grad_norm": 2.1836190223693848,
"learning_rate": 7.48763481410619e-06,
"loss": 0.2607,
"step": 2230
},
{
"epoch": 97.3913043478261,
"grad_norm": 1.6038516759872437,
"learning_rate": 7.487607361147356e-06,
"loss": 0.2881,
"step": 2240
},
{
"epoch": 97.82608695652173,
"grad_norm": 1.3469552993774414,
"learning_rate": 7.487578424317443e-06,
"loss": 0.2524,
"step": 2250
},
{
"epoch": 97.82608695652173,
"eval_loss": 0.9057046175003052,
"eval_runtime": 0.4015,
"eval_samples_per_second": 24.909,
"eval_steps_per_second": 24.909,
"step": 2250
},
{
"Start_State_loss": 0.7309322357177734,
"Start_State_runtime": 0.3966,
"Start_State_samples_per_second": 25.217,
"Start_State_steps_per_second": 25.217,
"epoch": 97.82608695652173,
"step": 2250
},
{
"Raw_Model_loss": 0.9057046175003052,
"Raw_Model_runtime": 0.3945,
"Raw_Model_samples_per_second": 25.347,
"Raw_Model_steps_per_second": 25.347,
"epoch": 97.82608695652173,
"step": 2250
},
{
"SWA_loss": 0.7665841579437256,
"SWA_runtime": 0.3965,
"SWA_samples_per_second": 25.221,
"SWA_steps_per_second": 25.221,
"epoch": 97.82608695652173,
"step": 2250
},
{
"EMA_loss": 0.7307609915733337,
"EMA_runtime": 0.402,
"EMA_samples_per_second": 24.875,
"EMA_steps_per_second": 24.875,
"epoch": 97.82608695652173,
"step": 2250
},
{
"epoch": 98.26086956521739,
"grad_norm": 1.9246830940246582,
"learning_rate": 7.487548003627922e-06,
"loss": 0.2415,
"step": 2260
},
{
"epoch": 98.69565217391305,
"grad_norm": 1.7473000288009644,
"learning_rate": 7.487516099090849e-06,
"loss": 0.278,
"step": 2270
},
{
"epoch": 99.1304347826087,
"grad_norm": 2.0333516597747803,
"learning_rate": 7.48748271071887e-06,
"loss": 0.2488,
"step": 2280
},
{
"epoch": 99.56521739130434,
"grad_norm": 2.3631269931793213,
"learning_rate": 7.48744783852522e-06,
"loss": 0.2882,
"step": 2290
},
{
"epoch": 100.0,
"grad_norm": 2.6425907611846924,
"learning_rate": 7.487411482523721e-06,
"loss": 0.2322,
"step": 2300
},
{
"epoch": 100.43478260869566,
"grad_norm": 2.703728437423706,
"learning_rate": 7.4873736427287825e-06,
"loss": 0.2371,
"step": 2310
},
{
"epoch": 100.8695652173913,
"grad_norm": 1.7555862665176392,
"learning_rate": 7.487334319155404e-06,
"loss": 0.2697,
"step": 2320
},
{
"epoch": 101.30434782608695,
"grad_norm": 2.5154976844787598,
"learning_rate": 7.487293511819172e-06,
"loss": 0.2417,
"step": 2330
},
{
"epoch": 101.73913043478261,
"grad_norm": 1.7718055248260498,
"learning_rate": 7.4872512207362605e-06,
"loss": 0.2446,
"step": 2340
},
{
"epoch": 102.17391304347827,
"grad_norm": 1.7671442031860352,
"learning_rate": 7.487207445923432e-06,
"loss": 0.2936,
"step": 2350
},
{
"epoch": 102.6086956521739,
"grad_norm": 2.0610148906707764,
"learning_rate": 7.487162187398039e-06,
"loss": 0.2845,
"step": 2360
},
{
"epoch": 103.04347826086956,
"grad_norm": 1.9395049810409546,
"learning_rate": 7.487115445178019e-06,
"loss": 0.2163,
"step": 2370
},
{
"epoch": 103.47826086956522,
"grad_norm": 2.1225855350494385,
"learning_rate": 7.487067219281901e-06,
"loss": 0.2913,
"step": 2380
},
{
"epoch": 103.91304347826087,
"grad_norm": 2.034578561782837,
"learning_rate": 7.4870175097287985e-06,
"loss": 0.2417,
"step": 2390
},
{
"epoch": 104.34782608695652,
"grad_norm": 1.9769914150238037,
"learning_rate": 7.486966316538416e-06,
"loss": 0.2563,
"step": 2400
},
{
"epoch": 104.34782608695652,
"eval_loss": 0.9094018936157227,
"eval_runtime": 0.5284,
"eval_samples_per_second": 18.926,
"eval_steps_per_second": 18.926,
"step": 2400
},
{
"Start_State_loss": 0.7309322357177734,
"Start_State_runtime": 0.491,
"Start_State_samples_per_second": 20.366,
"Start_State_steps_per_second": 20.366,
"epoch": 104.34782608695652,
"step": 2400
},
{
"Raw_Model_loss": 0.9094018936157227,
"Raw_Model_runtime": 0.5034,
"Raw_Model_samples_per_second": 19.866,
"Raw_Model_steps_per_second": 19.866,
"epoch": 104.34782608695652,
"step": 2400
},
{
"SWA_loss": 0.7745841145515442,
"SWA_runtime": 0.5911,
"SWA_samples_per_second": 16.917,
"SWA_steps_per_second": 16.917,
"epoch": 104.34782608695652,
"step": 2400
},
{
"EMA_loss": 0.7307760119438171,
"EMA_runtime": 0.4346,
"EMA_samples_per_second": 23.011,
"EMA_steps_per_second": 23.011,
"epoch": 104.34782608695652,
"step": 2400
},
{
"epoch": 104.78260869565217,
"grad_norm": 2.087158679962158,
"learning_rate": 7.486913639731043e-06,
"loss": 0.2497,
"step": 2410
},
{
"epoch": 105.21739130434783,
"grad_norm": 1.996799349784851,
"learning_rate": 7.48685947932756e-06,
"loss": 0.2635,
"step": 2420
},
{
"epoch": 105.65217391304348,
"grad_norm": 1.9105130434036255,
"learning_rate": 7.4868038353494355e-06,
"loss": 0.2602,
"step": 2430
},
{
"epoch": 106.08695652173913,
"grad_norm": 2.1657402515411377,
"learning_rate": 7.486746707818724e-06,
"loss": 0.214,
"step": 2440
},
{
"epoch": 106.52173913043478,
"grad_norm": 1.444199800491333,
"learning_rate": 7.486688096758069e-06,
"loss": 0.2819,
"step": 2450
},
{
"epoch": 106.95652173913044,
"grad_norm": 1.8629169464111328,
"learning_rate": 7.486628002190702e-06,
"loss": 0.2444,
"step": 2460
},
{
"epoch": 107.3913043478261,
"grad_norm": 2.290212631225586,
"learning_rate": 7.486566424140442e-06,
"loss": 0.304,
"step": 2470
},
{
"epoch": 107.82608695652173,
"grad_norm": 2.3259527683258057,
"learning_rate": 7.486503362631699e-06,
"loss": 0.219,
"step": 2480
},
{
"epoch": 108.26086956521739,
"grad_norm": 2.0435678958892822,
"learning_rate": 7.486438817689465e-06,
"loss": 0.2709,
"step": 2490
},
{
"epoch": 108.69565217391305,
"grad_norm": 1.6399531364440918,
"learning_rate": 7.486372789339326e-06,
"loss": 0.2456,
"step": 2500
},
{
"epoch": 109.1304347826087,
"grad_norm": 1.6286495923995972,
"learning_rate": 7.486305277607452e-06,
"loss": 0.2435,
"step": 2510
},
{
"epoch": 109.56521739130434,
"grad_norm": 1.3312675952911377,
"learning_rate": 7.486236282520606e-06,
"loss": 0.2313,
"step": 2520
},
{
"epoch": 110.0,
"grad_norm": 3.1992104053497314,
"learning_rate": 7.48616580410613e-06,
"loss": 0.2876,
"step": 2530
},
{
"epoch": 110.43478260869566,
"grad_norm": 1.7260243892669678,
"learning_rate": 7.486093842391963e-06,
"loss": 0.2455,
"step": 2540
},
{
"epoch": 110.8695652173913,
"grad_norm": 1.857021450996399,
"learning_rate": 7.486020397406629e-06,
"loss": 0.2697,
"step": 2550
},
{
"epoch": 110.8695652173913,
"eval_loss": 0.9266101121902466,
"eval_runtime": 0.4485,
"eval_samples_per_second": 22.298,
"eval_steps_per_second": 22.298,
"step": 2550
},
{
"Start_State_loss": 0.7309322357177734,
"Start_State_runtime": 0.4407,
"Start_State_samples_per_second": 22.69,
"Start_State_steps_per_second": 22.69,
"epoch": 110.8695652173913,
"step": 2550
},
{
"Raw_Model_loss": 0.9266101121902466,
"Raw_Model_runtime": 0.4403,
"Raw_Model_samples_per_second": 22.71,
"Raw_Model_steps_per_second": 22.71,
"epoch": 110.8695652173913,
"step": 2550
},
{
"SWA_loss": 0.7769848108291626,
"SWA_runtime": 0.4455,
"SWA_samples_per_second": 22.448,
"SWA_steps_per_second": 22.448,
"epoch": 110.8695652173913,
"step": 2550
},
{
"EMA_loss": 0.7314194440841675,
"EMA_runtime": 0.4451,
"EMA_samples_per_second": 22.468,
"EMA_steps_per_second": 22.468,
"epoch": 110.8695652173913,
"step": 2550
},
{
"epoch": 111.30434782608695,
"grad_norm": 2.4148638248443604,
"learning_rate": 7.485945469179237e-06,
"loss": 0.282,
"step": 2560
},
{
"epoch": 111.73913043478261,
"grad_norm": 2.007262945175171,
"learning_rate": 7.485869057739486e-06,
"loss": 0.228,
"step": 2570
},
{
"epoch": 112.17391304347827,
"grad_norm": 2.0865132808685303,
"learning_rate": 7.485791163117665e-06,
"loss": 0.2463,
"step": 2580
},
{
"epoch": 112.6086956521739,
"grad_norm": 1.6724177598953247,
"learning_rate": 7.485711785344648e-06,
"loss": 0.2463,
"step": 2590
},
{
"epoch": 113.04347826086956,
"grad_norm": 2.1320908069610596,
"learning_rate": 7.485630924451897e-06,
"loss": 0.2661,
"step": 2600
},
{
"epoch": 113.47826086956522,
"grad_norm": 1.8488856554031372,
"learning_rate": 7.485548580471464e-06,
"loss": 0.2261,
"step": 2610
},
{
"epoch": 113.91304347826087,
"grad_norm": 2.1878151893615723,
"learning_rate": 7.485464753435987e-06,
"loss": 0.2756,
"step": 2620
},
{
"epoch": 114.34782608695652,
"grad_norm": 1.984470009803772,
"learning_rate": 7.485379443378693e-06,
"loss": 0.2451,
"step": 2630
},
{
"epoch": 114.78260869565217,
"grad_norm": 2.4623303413391113,
"learning_rate": 7.485292650333394e-06,
"loss": 0.2287,
"step": 2640
},
{
"epoch": 115.21739130434783,
"grad_norm": 1.7331453561782837,
"learning_rate": 7.485204374334494e-06,
"loss": 0.2553,
"step": 2650
},
{
"epoch": 115.65217391304348,
"grad_norm": 1.9090930223464966,
"learning_rate": 7.485114615416982e-06,
"loss": 0.2721,
"step": 2660
},
{
"epoch": 116.08695652173913,
"grad_norm": 2.4040467739105225,
"learning_rate": 7.485023373616437e-06,
"loss": 0.2153,
"step": 2670
},
{
"epoch": 116.52173913043478,
"grad_norm": 2.5749056339263916,
"learning_rate": 7.484930648969023e-06,
"loss": 0.245,
"step": 2680
},
{
"epoch": 116.95652173913044,
"grad_norm": 1.6020243167877197,
"learning_rate": 7.484836441511492e-06,
"loss": 0.2443,
"step": 2690
},
{
"epoch": 117.3913043478261,
"grad_norm": 1.6441881656646729,
"learning_rate": 7.484740751281187e-06,
"loss": 0.2361,
"step": 2700
},
{
"epoch": 117.3913043478261,
"eval_loss": 0.9320739507675171,
"eval_runtime": 0.408,
"eval_samples_per_second": 24.509,
"eval_steps_per_second": 24.509,
"step": 2700
},
{
"Start_State_loss": 0.7309322357177734,
"Start_State_runtime": 0.418,
"Start_State_samples_per_second": 23.925,
"Start_State_steps_per_second": 23.925,
"epoch": 117.3913043478261,
"step": 2700
},
{
"Raw_Model_loss": 0.9320739507675171,
"Raw_Model_runtime": 0.402,
"Raw_Model_samples_per_second": 24.874,
"Raw_Model_steps_per_second": 24.874,
"epoch": 117.3913043478261,
"step": 2700
},
{
"SWA_loss": 0.7840545177459717,
"SWA_runtime": 0.401,
"SWA_samples_per_second": 24.935,
"SWA_steps_per_second": 24.935,
"epoch": 117.3913043478261,
"step": 2700
},
{
"EMA_loss": 0.730692982673645,
"EMA_runtime": 0.3918,
"EMA_samples_per_second": 25.525,
"EMA_steps_per_second": 25.525,
"epoch": 117.3913043478261,
"step": 2700
},
{
"epoch": 117.82608695652173,
"grad_norm": 3.2310845851898193,
"learning_rate": 3.7931296624941244e-07,
"loss": 0.2406,
"step": 2710
},
{
"epoch": 118.26086956521739,
"grad_norm": 1.8407368659973145,
"learning_rate": 7.586259324988249e-07,
"loss": 0.2518,
"step": 2720
},
{
"epoch": 118.69565217391305,
"grad_norm": 2.2956159114837646,
"learning_rate": 1.1379388987482372e-06,
"loss": 0.2412,
"step": 2730
},
{
"epoch": 119.1304347826087,
"grad_norm": 2.297415256500244,
"learning_rate": 1.5172518649976497e-06,
"loss": 0.2602,
"step": 2740
},
{
"epoch": 119.56521739130434,
"grad_norm": 2.2018797397613525,
"learning_rate": 1.8965648312470621e-06,
"loss": 0.2596,
"step": 2750
},
{
"epoch": 120.0,
"grad_norm": 3.6682052612304688,
"learning_rate": 2.2758777974964743e-06,
"loss": 0.219,
"step": 2760
},
{
"epoch": 120.43478260869566,
"grad_norm": 1.9333362579345703,
"learning_rate": 2.6551907637458867e-06,
"loss": 0.2545,
"step": 2770
},
{
"epoch": 120.8695652173913,
"grad_norm": 1.7708905935287476,
"learning_rate": 3.0345037299952995e-06,
"loss": 0.2189,
"step": 2780
},
{
"epoch": 121.30434782608695,
"grad_norm": 1.4095892906188965,
"learning_rate": 3.413816696244712e-06,
"loss": 0.2728,
"step": 2790
},
{
"epoch": 121.73913043478261,
"grad_norm": 1.991544246673584,
"learning_rate": 3.7931296624941243e-06,
"loss": 0.2699,
"step": 2800
},
{
"epoch": 122.17391304347827,
"grad_norm": 2.028014898300171,
"learning_rate": 3.793129286625273e-06,
"loss": 0.2196,
"step": 2810
},
{
"epoch": 122.6086956521739,
"grad_norm": 1.7729160785675049,
"learning_rate": 3.7931281590188667e-06,
"loss": 0.2634,
"step": 2820
},
{
"epoch": 123.04347826086956,
"grad_norm": 1.902854323387146,
"learning_rate": 3.7931262796753532e-06,
"loss": 0.251,
"step": 2830
},
{
"epoch": 123.47826086956522,
"grad_norm": 2.2296345233917236,
"learning_rate": 3.7931236485954773e-06,
"loss": 0.2424,
"step": 2840
},
{
"epoch": 123.91304347826087,
"grad_norm": 2.3609299659729004,
"learning_rate": 3.793120265780282e-06,
"loss": 0.2388,
"step": 2850
},
{
"epoch": 123.91304347826087,
"eval_loss": 0.9395554661750793,
"eval_runtime": 0.3971,
"eval_samples_per_second": 25.184,
"eval_steps_per_second": 25.184,
"step": 2850
},
{
"Start_State_loss": 0.7309322357177734,
"Start_State_runtime": 0.4171,
"Start_State_samples_per_second": 23.977,
"Start_State_steps_per_second": 23.977,
"epoch": 123.91304347826087,
"step": 2850
},
{
"Raw_Model_loss": 0.9395554661750793,
"Raw_Model_runtime": 0.3896,
"Raw_Model_samples_per_second": 25.669,
"Raw_Model_steps_per_second": 25.669,
"epoch": 123.91304347826087,
"step": 2850
},
{
"SWA_loss": 0.7867599725723267,
"SWA_runtime": 0.4887,
"SWA_samples_per_second": 20.462,
"SWA_steps_per_second": 20.462,
"epoch": 123.91304347826087,
"step": 2850
},
{
"EMA_loss": 0.7314862608909607,
"EMA_runtime": 0.4923,
"EMA_samples_per_second": 20.313,
"EMA_steps_per_second": 20.313,
"epoch": 123.91304347826087,
"step": 2850
},
{
"epoch": 124.34782608695652,
"grad_norm": 1.6401152610778809,
"learning_rate": 3.793116131231107e-06,
"loss": 0.2257,
"step": 2860
},
{
"epoch": 124.78260869565217,
"grad_norm": 1.6049269437789917,
"learning_rate": 3.793111244949593e-06,
"loss": 0.2303,
"step": 2870
},
{
"epoch": 125.21739130434783,
"grad_norm": 2.0744292736053467,
"learning_rate": 3.793105606937675e-06,
"loss": 0.2692,
"step": 2880
},
{
"epoch": 125.65217391304348,
"grad_norm": 2.102421998977661,
"learning_rate": 3.7930992171975892e-06,
"loss": 0.2458,
"step": 2890
},
{
"epoch": 126.08695652173913,
"grad_norm": 2.300477981567383,
"learning_rate": 3.793092075731867e-06,
"loss": 0.2518,
"step": 2900
},
{
"epoch": 126.52173913043478,
"grad_norm": 1.6764642000198364,
"learning_rate": 3.79308418254334e-06,
"loss": 0.2022,
"step": 2910
},
{
"epoch": 126.95652173913044,
"grad_norm": 1.5686938762664795,
"learning_rate": 3.7930755376351365e-06,
"loss": 0.2903,
"step": 2920
},
{
"epoch": 127.3913043478261,
"grad_norm": 2.0804359912872314,
"learning_rate": 3.7930661410106833e-06,
"loss": 0.2556,
"step": 2930
},
{
"epoch": 127.82608695652173,
"grad_norm": 2.6569416522979736,
"learning_rate": 3.793055992673704e-06,
"loss": 0.2196,
"step": 2940
},
{
"epoch": 128.2608695652174,
"grad_norm": 2.325507164001465,
"learning_rate": 3.7930450926282215e-06,
"loss": 0.2961,
"step": 2950
},
{
"epoch": 128.69565217391303,
"grad_norm": 1.6577781438827515,
"learning_rate": 3.793033440878557e-06,
"loss": 0.2414,
"step": 2960
},
{
"epoch": 129.1304347826087,
"grad_norm": 1.6468480825424194,
"learning_rate": 3.7930210374293287e-06,
"loss": 0.2031,
"step": 2970
},
{
"epoch": 129.56521739130434,
"grad_norm": 1.8844521045684814,
"learning_rate": 3.793007882285452e-06,
"loss": 0.2411,
"step": 2980
},
{
"epoch": 130.0,
"grad_norm": 5.029874801635742,
"learning_rate": 3.7929939754521417e-06,
"loss": 0.2465,
"step": 2990
},
{
"epoch": 130.43478260869566,
"grad_norm": 2.3793535232543945,
"learning_rate": 3.79297931693491e-06,
"loss": 0.2374,
"step": 3000
},
{
"epoch": 130.43478260869566,
"eval_loss": 0.9428585171699524,
"eval_runtime": 0.4348,
"eval_samples_per_second": 22.999,
"eval_steps_per_second": 22.999,
"step": 3000
},
{
"Start_State_loss": 0.7309322357177734,
"Start_State_runtime": 0.4391,
"Start_State_samples_per_second": 22.776,
"Start_State_steps_per_second": 22.776,
"epoch": 130.43478260869566,
"step": 3000
},
{
"Raw_Model_loss": 0.9428585171699524,
"Raw_Model_runtime": 0.4353,
"Raw_Model_samples_per_second": 22.971,
"Raw_Model_steps_per_second": 22.971,
"epoch": 130.43478260869566,
"step": 3000
},
{
"SWA_loss": 0.7945135831832886,
"SWA_runtime": 0.4368,
"SWA_samples_per_second": 22.892,
"SWA_steps_per_second": 22.892,
"epoch": 130.43478260869566,
"step": 3000
},
{
"EMA_loss": 0.7300070524215698,
"EMA_runtime": 0.4353,
"EMA_samples_per_second": 22.971,
"EMA_steps_per_second": 22.971,
"epoch": 130.43478260869566,
"step": 3000
},
{
"epoch": 130.8695652173913,
"grad_norm": 2.055912733078003,
"learning_rate": 3.7929639067395674e-06,
"loss": 0.2305,
"step": 3010
},
{
"epoch": 131.30434782608697,
"grad_norm": 1.8568785190582275,
"learning_rate": 3.7929477448722217e-06,
"loss": 0.2706,
"step": 3020
},
{
"epoch": 131.7391304347826,
"grad_norm": 1.9422987699508667,
"learning_rate": 3.792930831339279e-06,
"loss": 0.2616,
"step": 3030
},
{
"epoch": 132.17391304347825,
"grad_norm": 1.81191885471344,
"learning_rate": 3.7929131661474433e-06,
"loss": 0.2272,
"step": 3040
},
{
"epoch": 132.6086956521739,
"grad_norm": 2.1437313556671143,
"learning_rate": 3.7928947493037164e-06,
"loss": 0.253,
"step": 3050
},
{
"epoch": 133.04347826086956,
"grad_norm": 2.685347318649292,
"learning_rate": 3.792875580815398e-06,
"loss": 0.2152,
"step": 3060
},
{
"epoch": 133.47826086956522,
"grad_norm": 1.2992076873779297,
"learning_rate": 3.7928556606900864e-06,
"loss": 0.2486,
"step": 3070
},
{
"epoch": 133.91304347826087,
"grad_norm": 2.3356173038482666,
"learning_rate": 3.7928349889356773e-06,
"loss": 0.2736,
"step": 3080
},
{
"epoch": 134.34782608695653,
"grad_norm": 1.9858746528625488,
"learning_rate": 3.7928135655603634e-06,
"loss": 0.254,
"step": 3090
},
{
"epoch": 134.7826086956522,
"grad_norm": 1.929052710533142,
"learning_rate": 3.792791390572637e-06,
"loss": 0.2063,
"step": 3100
},
{
"epoch": 135.2173913043478,
"grad_norm": 2.71032977104187,
"learning_rate": 3.7927684639812876e-06,
"loss": 0.2441,
"step": 3110
},
{
"epoch": 135.65217391304347,
"grad_norm": 1.8756812810897827,
"learning_rate": 3.7927447857954023e-06,
"loss": 0.2854,
"step": 3120
},
{
"epoch": 136.08695652173913,
"grad_norm": 2.36094069480896,
"learning_rate": 3.792720356024367e-06,
"loss": 0.2128,
"step": 3130
},
{
"epoch": 136.52173913043478,
"grad_norm": 2.351156711578369,
"learning_rate": 3.7926951746778637e-06,
"loss": 0.2385,
"step": 3140
},
{
"epoch": 136.95652173913044,
"grad_norm": 2.7988734245300293,
"learning_rate": 3.7926692417658747e-06,
"loss": 0.2336,
"step": 3150
},
{
"epoch": 136.95652173913044,
"eval_loss": 0.9436905980110168,
"eval_runtime": 0.4896,
"eval_samples_per_second": 20.427,
"eval_steps_per_second": 20.427,
"step": 3150
},
{
"Start_State_loss": 0.7309322357177734,
"Start_State_runtime": 0.578,
"Start_State_samples_per_second": 17.301,
"Start_State_steps_per_second": 17.301,
"epoch": 136.95652173913044,
"step": 3150
},
{
"Raw_Model_loss": 0.9436905980110168,
"Raw_Model_runtime": 0.5409,
"Raw_Model_samples_per_second": 18.487,
"Raw_Model_steps_per_second": 18.487,
"epoch": 136.95652173913044,
"step": 3150
},
{
"SWA_loss": 0.7969022393226624,
"SWA_runtime": 0.616,
"SWA_samples_per_second": 16.235,
"SWA_steps_per_second": 16.235,
"epoch": 136.95652173913044,
"step": 3150
},
{
"EMA_loss": 0.7306665182113647,
"EMA_runtime": 0.5117,
"EMA_samples_per_second": 19.544,
"EMA_steps_per_second": 19.544,
"epoch": 136.95652173913044,
"step": 3150
},
{
"epoch": 137.3913043478261,
"grad_norm": 1.775488257408142,
"learning_rate": 3.792642557298678e-06,
"loss": 0.2772,
"step": 3160
},
{
"epoch": 137.82608695652175,
"grad_norm": 1.9294140338897705,
"learning_rate": 3.7926151212868503e-06,
"loss": 0.2351,
"step": 3170
},
{
"epoch": 138.2608695652174,
"grad_norm": 1.642681360244751,
"learning_rate": 3.792586933741268e-06,
"loss": 0.2272,
"step": 3180
},
{
"epoch": 138.69565217391303,
"grad_norm": 2.080634593963623,
"learning_rate": 3.792557994673102e-06,
"loss": 0.2754,
"step": 3190
},
{
"epoch": 139.1304347826087,
"grad_norm": 1.3820661306381226,
"learning_rate": 3.792528304093824e-06,
"loss": 0.2258,
"step": 3200
},
{
"epoch": 139.56521739130434,
"grad_norm": 2.019350051879883,
"learning_rate": 3.7924978620152023e-06,
"loss": 0.2705,
"step": 3210
},
{
"epoch": 140.0,
"grad_norm": 2.975282907485962,
"learning_rate": 3.7924666684493018e-06,
"loss": 0.2302,
"step": 3220
},
{
"epoch": 140.43478260869566,
"grad_norm": 2.264106273651123,
"learning_rate": 3.792434723408488e-06,
"loss": 0.2315,
"step": 3230
},
{
"epoch": 140.8695652173913,
"grad_norm": 1.7037856578826904,
"learning_rate": 3.7924020269054226e-06,
"loss": 0.2381,
"step": 3240
},
{
"epoch": 141.30434782608697,
"grad_norm": 1.9553606510162354,
"learning_rate": 3.7923685789530654e-06,
"loss": 0.2367,
"step": 3250
},
{
"epoch": 141.7391304347826,
"grad_norm": 1.9915337562561035,
"learning_rate": 3.7923343795646736e-06,
"loss": 0.2491,
"step": 3260
},
{
"epoch": 142.17391304347825,
"grad_norm": 1.7067251205444336,
"learning_rate": 3.7922994287538036e-06,
"loss": 0.2579,
"step": 3270
},
{
"epoch": 142.6086956521739,
"grad_norm": 2.5622429847717285,
"learning_rate": 3.792263726534308e-06,
"loss": 0.2607,
"step": 3280
},
{
"epoch": 143.04347826086956,
"grad_norm": 1.2580666542053223,
"learning_rate": 3.7922272729203387e-06,
"loss": 0.2155,
"step": 3290
},
{
"epoch": 143.47826086956522,
"grad_norm": 1.8073185682296753,
"learning_rate": 3.792190067926345e-06,
"loss": 0.2478,
"step": 3300
},
{
"epoch": 143.47826086956522,
"eval_loss": 0.9493485689163208,
"eval_runtime": 0.41,
"eval_samples_per_second": 24.39,
"eval_steps_per_second": 24.39,
"step": 3300
},
{
"Start_State_loss": 0.7309322357177734,
"Start_State_runtime": 0.3933,
"Start_State_samples_per_second": 25.423,
"Start_State_steps_per_second": 25.423,
"epoch": 143.47826086956522,
"step": 3300
},
{
"Raw_Model_loss": 0.9493485689163208,
"Raw_Model_runtime": 0.3998,
"Raw_Model_samples_per_second": 25.013,
"Raw_Model_steps_per_second": 25.013,
"epoch": 143.47826086956522,
"step": 3300
},
{
"SWA_loss": 0.8028978109359741,
"SWA_runtime": 0.4094,
"SWA_samples_per_second": 24.424,
"SWA_steps_per_second": 24.424,
"epoch": 143.47826086956522,
"step": 3300
},
{
"EMA_loss": 0.7308684587478638,
"EMA_runtime": 0.3971,
"EMA_samples_per_second": 25.181,
"EMA_steps_per_second": 25.181,
"epoch": 143.47826086956522,
"step": 3300
},
{
"epoch": 143.91304347826087,
"grad_norm": 1.8851526975631714,
"learning_rate": 3.7921521115670724e-06,
"loss": 0.2538,
"step": 3310
},
{
"epoch": 144.34782608695653,
"grad_norm": 1.569898247718811,
"learning_rate": 3.7921134038575663e-06,
"loss": 0.2145,
"step": 3320
},
{
"epoch": 144.7826086956522,
"grad_norm": 1.718190312385559,
"learning_rate": 3.79207394481317e-06,
"loss": 0.2708,
"step": 3330
},
{
"epoch": 145.2173913043478,
"grad_norm": 2.9095687866210938,
"learning_rate": 3.7920337344495226e-06,
"loss": 0.2084,
"step": 3340
},
{
"epoch": 145.65217391304347,
"grad_norm": 1.8533018827438354,
"learning_rate": 3.791992772782563e-06,
"loss": 0.2381,
"step": 3350
},
{
"epoch": 146.08695652173913,
"grad_norm": 1.9780678749084473,
"learning_rate": 3.791951059828527e-06,
"loss": 0.2651,
"step": 3360
},
{
"epoch": 146.52173913043478,
"grad_norm": 1.834191083908081,
"learning_rate": 3.791908595603947e-06,
"loss": 0.2269,
"step": 3370
},
{
"epoch": 146.95652173913044,
"grad_norm": 1.6292699575424194,
"learning_rate": 3.7918653801256568e-06,
"loss": 0.2159,
"step": 3380
},
{
"epoch": 147.3913043478261,
"grad_norm": 1.5715214014053345,
"learning_rate": 3.791821413410784e-06,
"loss": 0.2288,
"step": 3390
},
{
"epoch": 147.82608695652175,
"grad_norm": 1.5430243015289307,
"learning_rate": 3.791776695476756e-06,
"loss": 0.2538,
"step": 3400
},
{
"epoch": 148.2608695652174,
"grad_norm": 1.466277837753296,
"learning_rate": 3.791731226341297e-06,
"loss": 0.2156,
"step": 3410
},
{
"epoch": 148.69565217391303,
"grad_norm": 1.8279281854629517,
"learning_rate": 3.7916850060224308e-06,
"loss": 0.2498,
"step": 3420
},
{
"epoch": 149.1304347826087,
"grad_norm": 1.7966867685317993,
"learning_rate": 3.791638034538477e-06,
"loss": 0.2716,
"step": 3430
},
{
"epoch": 149.56521739130434,
"grad_norm": 2.2440056800842285,
"learning_rate": 3.7915903119080527e-06,
"loss": 0.265,
"step": 3440
},
{
"epoch": 150.0,
"grad_norm": 3.2762231826782227,
"learning_rate": 3.7915418381500747e-06,
"loss": 0.2208,
"step": 3450
},
{
"epoch": 150.0,
"eval_loss": 0.9505823850631714,
"eval_runtime": 0.4422,
"eval_samples_per_second": 22.615,
"eval_steps_per_second": 22.615,
"step": 3450
},
{
"Start_State_loss": 0.7309322357177734,
"Start_State_runtime": 0.4072,
"Start_State_samples_per_second": 24.558,
"Start_State_steps_per_second": 24.558,
"epoch": 150.0,
"step": 3450
},
{
"Raw_Model_loss": 0.9505823850631714,
"Raw_Model_runtime": 0.4153,
"Raw_Model_samples_per_second": 24.076,
"Raw_Model_steps_per_second": 24.076,
"epoch": 150.0,
"step": 3450
},
{
"SWA_loss": 0.8045159578323364,
"SWA_runtime": 0.402,
"SWA_samples_per_second": 24.876,
"SWA_steps_per_second": 24.876,
"epoch": 150.0,
"step": 3450
},
{
"EMA_loss": 0.7316843271255493,
"EMA_runtime": 0.4,
"EMA_samples_per_second": 25.002,
"EMA_steps_per_second": 25.002,
"epoch": 150.0,
"step": 3450
},
{
"epoch": 150.43478260869566,
"grad_norm": 1.7541120052337646,
"learning_rate": 2.4672082280509036e-07,
"loss": 0.214,
"step": 3460
},
{
"epoch": 150.8695652173913,
"grad_norm": 2.0008656978607178,
"learning_rate": 4.934416456101807e-07,
"loss": 0.2627,
"step": 3470
},
{
"epoch": 151.30434782608697,
"grad_norm": 1.6539170742034912,
"learning_rate": 7.40162468415271e-07,
"loss": 0.2,
"step": 3480
},
{
"epoch": 151.7391304347826,
"grad_norm": 2.369926691055298,
"learning_rate": 9.868832912203614e-07,
"loss": 0.2478,
"step": 3490
},
{
"epoch": 152.17391304347825,
"grad_norm": 2.07112979888916,
"learning_rate": 1.2336041140254517e-06,
"loss": 0.2427,
"step": 3500
},
{
"epoch": 152.6086956521739,
"grad_norm": 1.6030749082565308,
"learning_rate": 1.480324936830542e-06,
"loss": 0.2402,
"step": 3510
},
{
"epoch": 153.04347826086956,
"grad_norm": 1.5949645042419434,
"learning_rate": 1.7270457596356322e-06,
"loss": 0.2072,
"step": 3520
},
{
"epoch": 153.47826086956522,
"grad_norm": 2.338641881942749,
"learning_rate": 1.973766582440723e-06,
"loss": 0.2506,
"step": 3530
},
{
"epoch": 153.91304347826087,
"grad_norm": 2.719093084335327,
"learning_rate": 2.220487405245813e-06,
"loss": 0.2321,
"step": 3540
},
{
"epoch": 154.34782608695653,
"grad_norm": 2.292358636856079,
"learning_rate": 2.4672082280509034e-06,
"loss": 0.2404,
"step": 3550
},
{
"epoch": 154.7826086956522,
"grad_norm": 2.0019381046295166,
"learning_rate": 2.4672079835702752e-06,
"loss": 0.2343,
"step": 3560
},
{
"epoch": 155.2173913043478,
"grad_norm": 1.6779125928878784,
"learning_rate": 2.4672072501284865e-06,
"loss": 0.1963,
"step": 3570
},
{
"epoch": 155.65217391304347,
"grad_norm": 2.0632243156433105,
"learning_rate": 2.467206027725829e-06,
"loss": 0.267,
"step": 3580
},
{
"epoch": 156.08695652173913,
"grad_norm": 1.6089539527893066,
"learning_rate": 2.467204316362787e-06,
"loss": 0.2034,
"step": 3590
},
{
"epoch": 156.52173913043478,
"grad_norm": 2.475633382797241,
"learning_rate": 2.4672021160400387e-06,
"loss": 0.2685,
"step": 3600
},
{
"epoch": 156.52173913043478,
"eval_loss": 0.9592596292495728,
"eval_runtime": 0.4813,
"eval_samples_per_second": 20.778,
"eval_steps_per_second": 20.778,
"step": 3600
},
{
"Start_State_loss": 0.7309322357177734,
"Start_State_runtime": 0.4223,
"Start_State_samples_per_second": 23.679,
"Start_State_steps_per_second": 23.679,
"epoch": 156.52173913043478,
"step": 3600
},
{
"Raw_Model_loss": 0.9592596292495728,
"Raw_Model_runtime": 0.3944,
"Raw_Model_samples_per_second": 25.356,
"Raw_Model_steps_per_second": 25.356,
"epoch": 156.52173913043478,
"step": 3600
},
{
"SWA_loss": 0.8119293451309204,
"SWA_runtime": 0.3904,
"SWA_samples_per_second": 25.615,
"SWA_steps_per_second": 25.615,
"epoch": 156.52173913043478,
"step": 3600
},
{
"EMA_loss": 0.7311049103736877,
"EMA_runtime": 0.4017,
"EMA_samples_per_second": 24.896,
"EMA_steps_per_second": 24.896,
"epoch": 156.52173913043478,
"step": 3600
}
],
"logging_steps": 10,
"max_steps": 50000,
"num_input_tokens_seen": 0,
"num_train_epochs": 2174,
"save_steps": 150,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 9.28760054861906e+16,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}