diff --git "a/last-checkpoint/trainer_state.json" "b/last-checkpoint/trainer_state.json" --- "a/last-checkpoint/trainer_state.json" +++ "b/last-checkpoint/trainer_state.json" @@ -1,3927 +1,125 @@ { - "best_metric": 0.7177689671516418, - "best_model_checkpoint": "./output/checkpoint-450", - "epoch": 176.08695652173913, + "best_metric": 0.7963114976882935, + "best_model_checkpoint": "./output/checkpoint-150", + "epoch": 6.521739130434782, "eval_steps": 150, - "global_step": 4050, + "global_step": 150, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.43478260869565216, - "grad_norm": 1.5021398067474365, + "grad_norm": 1.5021440982818604, "learning_rate": 3e-06, - "loss": 0.906, + "loss": 0.9061, "step": 10 }, { "epoch": 0.8695652173913043, - "grad_norm": 1.6870626211166382, + "grad_norm": 1.6870536804199219, "learning_rate": 6e-06, - "loss": 0.9025, + "loss": 0.9023, "step": 20 }, { "epoch": 1.3043478260869565, - "grad_norm": 1.7296104431152344, + "grad_norm": 1.729659080505371, "learning_rate": 9e-06, - "loss": 0.9005, + "loss": 0.9004, "step": 30 }, { "epoch": 1.7391304347826086, - "grad_norm": 1.4458670616149902, + "grad_norm": 1.453600525856018, "learning_rate": 1.2e-05, "loss": 0.9091, "step": 40 }, { "epoch": 2.1739130434782608, - "grad_norm": 1.3441009521484375, + "grad_norm": 1.3518075942993164, "learning_rate": 1.5e-05, - "loss": 0.8359, + "loss": 0.8362, "step": 50 }, { "epoch": 2.608695652173913, - "grad_norm": 2.031062364578247, + "grad_norm": 2.031172513961792, "learning_rate": 1.8e-05, - "loss": 0.8894, + "loss": 0.8893, "step": 60 }, { "epoch": 3.0434782608695654, - "grad_norm": 1.4844555854797363, + "grad_norm": 1.484531283378601, "learning_rate": 2.1e-05, - "loss": 0.8913, + "loss": 0.8915, "step": 70 }, { "epoch": 3.4782608695652173, - "grad_norm": 1.7371587753295898, + "grad_norm": 1.7294986248016357, "learning_rate": 2.4e-05, - "loss": 0.8235, + "loss": 0.8233, "step": 80 }, { "epoch": 3.9130434782608696, - "grad_norm": 1.424005150794983, + "grad_norm": 1.4242360591888428, "learning_rate": 2.7000000000000002e-05, "loss": 0.8527, "step": 90 }, { "epoch": 4.3478260869565215, - "grad_norm": 1.3659029006958008, + "grad_norm": 1.3656773567199707, "learning_rate": 3e-05, - "loss": 0.865, + "loss": 0.8648, "step": 100 }, { "epoch": 4.782608695652174, - "grad_norm": 2.198240041732788, + "grad_norm": 2.19753098487854, "learning_rate": 2.999999702723963e-05, - "loss": 0.8224, + "loss": 0.8225, "step": 110 }, { "epoch": 5.217391304347826, - "grad_norm": 1.0725653171539307, + "grad_norm": 1.0726382732391357, "learning_rate": 2.9999988108959687e-05, - "loss": 0.7653, + "loss": 0.7654, "step": 120 }, { "epoch": 5.6521739130434785, - "grad_norm": 1.56694757938385, + "grad_norm": 1.5603922605514526, "learning_rate": 2.9999973245163716e-05, - "loss": 0.741, + "loss": 0.7417, "step": 130 }, { "epoch": 6.086956521739131, - "grad_norm": 1.9073944091796875, + "grad_norm": 1.9068461656570435, "learning_rate": 2.99999524358576e-05, - "loss": 0.7653, + "loss": 0.7654, "step": 140 }, { "epoch": 6.521739130434782, - "grad_norm": 1.1215864419937134, + "grad_norm": 1.1220637559890747, "learning_rate": 2.9999925681049593e-05, - "loss": 0.7854, + "loss": 0.7857, "step": 150 }, { "epoch": 6.521739130434782, - "eval_loss": 0.7962762713432312, - "eval_runtime": 0.4855, - "eval_samples_per_second": 20.599, - "eval_steps_per_second": 20.599, + "eval_loss": 0.7963114976882935, + "eval_runtime": 0.4908, + "eval_samples_per_second": 20.374, + "eval_steps_per_second": 20.374, "step": 150 - }, - { - "Start_State_loss": 0.861186683177948, - "Start_State_runtime": 0.4118, - "Start_State_samples_per_second": 24.285, - "Start_State_steps_per_second": 24.285, - "epoch": 6.521739130434782, - "step": 150 - }, - { - "Raw_Model_loss": 0.7962762713432312, - "Raw_Model_runtime": 0.4168, - "Raw_Model_samples_per_second": 23.993, - "Raw_Model_steps_per_second": 23.993, - "epoch": 6.521739130434782, - "step": 150 - }, - { - "SWA_loss": 0.861186683177948, - "SWA_runtime": 0.4016, - "SWA_samples_per_second": 24.9, - "SWA_steps_per_second": 24.9, - "epoch": 6.521739130434782, - "step": 150 - }, - { - "EMA_loss": 0.8616974949836731, - "EMA_runtime": 0.3952, - "EMA_samples_per_second": 25.302, - "EMA_steps_per_second": 25.302, - "epoch": 6.521739130434782, - "step": 150 - }, - { - "epoch": 6.956521739130435, - "grad_norm": 1.5400996208190918, - "learning_rate": 2.9999892980750297e-05, - "loss": 0.6586, - "step": 160 - }, - { - "epoch": 7.391304347826087, - "grad_norm": 1.3464864492416382, - "learning_rate": 2.9999854334972675e-05, - "loss": 0.7388, - "step": 170 - }, - { - "epoch": 7.826086956521739, - "grad_norm": 1.7265626192092896, - "learning_rate": 2.999980974373204e-05, - "loss": 0.7289, - "step": 180 - }, - { - "epoch": 8.26086956521739, - "grad_norm": 1.5396337509155273, - "learning_rate": 2.9999759207046075e-05, - "loss": 0.6244, - "step": 190 - }, - { - "epoch": 8.695652173913043, - "grad_norm": 1.7341505289077759, - "learning_rate": 2.9999702724934804e-05, - "loss": 0.6762, - "step": 200 - }, - { - "epoch": 9.130434782608695, - "grad_norm": 1.0384011268615723, - "learning_rate": 2.999964029742062e-05, - "loss": 0.6522, - "step": 210 - }, - { - "epoch": 9.565217391304348, - "grad_norm": 1.220421314239502, - "learning_rate": 2.9999571924528263e-05, - "loss": 0.5591, - "step": 220 - }, - { - "epoch": 10.0, - "grad_norm": 1.5278071165084839, - "learning_rate": 2.9999497606284837e-05, - "loss": 0.7558, - "step": 230 - }, - { - "epoch": 10.434782608695652, - "grad_norm": 1.4214218854904175, - "learning_rate": 2.9999417342719796e-05, - "loss": 0.7117, - "step": 240 - }, - { - "epoch": 10.869565217391305, - "grad_norm": 0.974699854850769, - "learning_rate": 2.9999331133864956e-05, - "loss": 0.5899, - "step": 250 - }, - { - "epoch": 11.304347826086957, - "grad_norm": 1.194456696510315, - "learning_rate": 2.9999238979754485e-05, - "loss": 0.6546, - "step": 260 - }, - { - "epoch": 11.73913043478261, - "grad_norm": 1.048299789428711, - "learning_rate": 2.999914088042492e-05, - "loss": 0.6477, - "step": 270 - }, - { - "epoch": 12.173913043478262, - "grad_norm": 1.3111830949783325, - "learning_rate": 2.9999036835915132e-05, - "loss": 0.5937, - "step": 280 - }, - { - "epoch": 12.608695652173914, - "grad_norm": 1.0831282138824463, - "learning_rate": 2.9998926846266365e-05, - "loss": 0.6326, - "step": 290 - }, - { - "epoch": 13.043478260869565, - "grad_norm": 1.3862762451171875, - "learning_rate": 2.9998810911522213e-05, - "loss": 0.5811, - "step": 300 - }, - { - "epoch": 13.043478260869565, - "eval_loss": 0.7311000227928162, - "eval_runtime": 0.5741, - "eval_samples_per_second": 17.419, - "eval_steps_per_second": 17.419, - "step": 300 - }, - { - "Start_State_loss": 0.861186683177948, - "Start_State_runtime": 0.5169, - "Start_State_samples_per_second": 19.347, - "Start_State_steps_per_second": 19.347, - "epoch": 13.043478260869565, - "step": 300 - }, - { - "Raw_Model_loss": 0.7311000227928162, - "Raw_Model_runtime": 0.5563, - "Raw_Model_samples_per_second": 17.977, - "Raw_Model_steps_per_second": 17.977, - "epoch": 13.043478260869565, - "step": 300 - }, - { - "SWA_loss": 0.7750778794288635, - "SWA_runtime": 0.5366, - "SWA_samples_per_second": 18.637, - "SWA_steps_per_second": 18.637, - "epoch": 13.043478260869565, - "step": 300 - }, - { - "EMA_loss": 0.8611688613891602, - "EMA_runtime": 0.452, - "EMA_samples_per_second": 22.125, - "EMA_steps_per_second": 22.125, - "epoch": 13.043478260869565, - "step": 300 - }, - { - "epoch": 13.478260869565217, - "grad_norm": 1.7830702066421509, - "learning_rate": 2.9998689031728636e-05, - "loss": 0.5142, - "step": 310 - }, - { - "epoch": 13.91304347826087, - "grad_norm": 1.5307224988937378, - "learning_rate": 2.9998561206933938e-05, - "loss": 0.6497, - "step": 320 - }, - { - "epoch": 14.347826086956522, - "grad_norm": 1.4800869226455688, - "learning_rate": 2.9998427437188786e-05, - "loss": 0.5744, - "step": 330 - }, - { - "epoch": 14.782608695652174, - "grad_norm": 1.3100613355636597, - "learning_rate": 2.99982877225462e-05, - "loss": 0.6011, - "step": 340 - }, - { - "epoch": 15.217391304347826, - "grad_norm": 0.9753432869911194, - "learning_rate": 2.9998142063061564e-05, - "loss": 0.4987, - "step": 350 - }, - { - "epoch": 15.652173913043478, - "grad_norm": 1.64210844039917, - "learning_rate": 2.9997990458792603e-05, - "loss": 0.5629, - "step": 360 - }, - { - "epoch": 16.08695652173913, - "grad_norm": 1.6374964714050293, - "learning_rate": 2.9997832909799417e-05, - "loss": 0.6673, - "step": 370 - }, - { - "epoch": 16.52173913043478, - "grad_norm": 0.9534397125244141, - "learning_rate": 2.9997669416144452e-05, - "loss": 0.5129, - "step": 380 - }, - { - "epoch": 16.956521739130434, - "grad_norm": 0.9338690638542175, - "learning_rate": 2.999749997789251e-05, - "loss": 0.5796, - "step": 390 - }, - { - "epoch": 17.391304347826086, - "grad_norm": 1.113961100578308, - "learning_rate": 2.9997324595110743e-05, - "loss": 0.5177, - "step": 400 - }, - { - "epoch": 17.82608695652174, - "grad_norm": 1.2833036184310913, - "learning_rate": 2.9997143267868683e-05, - "loss": 0.5877, - "step": 410 - }, - { - "epoch": 18.26086956521739, - "grad_norm": 1.161442518234253, - "learning_rate": 2.9996955996238192e-05, - "loss": 0.5058, - "step": 420 - }, - { - "epoch": 18.695652173913043, - "grad_norm": 1.1988558769226074, - "learning_rate": 2.9996762780293503e-05, - "loss": 0.5316, - "step": 430 - }, - { - "epoch": 19.130434782608695, - "grad_norm": 1.2128249406814575, - "learning_rate": 2.9996563620111197e-05, - "loss": 0.5336, - "step": 440 - }, - { - "epoch": 19.565217391304348, - "grad_norm": 1.4281935691833496, - "learning_rate": 2.9996358515770218e-05, - "loss": 0.5677, - "step": 450 - }, - { - "epoch": 19.565217391304348, - "eval_loss": 0.7177689671516418, - "eval_runtime": 0.4582, - "eval_samples_per_second": 21.823, - "eval_steps_per_second": 21.823, - "step": 450 - }, - { - "Start_State_loss": 0.861186683177948, - "Start_State_runtime": 0.4487, - "Start_State_samples_per_second": 22.286, - "Start_State_steps_per_second": 22.286, - "epoch": 19.565217391304348, - "step": 450 - }, - { - "Raw_Model_loss": 0.7177689671516418, - "Raw_Model_runtime": 0.4474, - "Raw_Model_samples_per_second": 22.35, - "Raw_Model_steps_per_second": 22.35, - "epoch": 19.565217391304348, - "step": 450 - }, - { - "SWA_loss": 0.7554013133049011, - "SWA_runtime": 0.4473, - "SWA_samples_per_second": 22.355, - "SWA_steps_per_second": 22.355, - "epoch": 19.565217391304348, - "step": 450 - }, - { - "EMA_loss": 0.8611768484115601, - "EMA_runtime": 0.4393, - "EMA_samples_per_second": 22.765, - "EMA_steps_per_second": 22.765, - "epoch": 19.565217391304348, - "step": 450 - }, - { - "epoch": 20.0, - "grad_norm": 2.1267194747924805, - "learning_rate": 2.9996147467351856e-05, - "loss": 0.5149, - "step": 460 - }, - { - "epoch": 20.434782608695652, - "grad_norm": 1.276945948600769, - "learning_rate": 2.9995930474939773e-05, - "loss": 0.4782, - "step": 470 - }, - { - "epoch": 20.869565217391305, - "grad_norm": 1.471908688545227, - "learning_rate": 2.9995707538619975e-05, - "loss": 0.5705, - "step": 480 - }, - { - "epoch": 21.304347826086957, - "grad_norm": 1.3230303525924683, - "learning_rate": 2.9995478658480822e-05, - "loss": 0.5164, - "step": 490 - }, - { - "epoch": 21.73913043478261, - "grad_norm": 1.2438148260116577, - "learning_rate": 2.9995243834613043e-05, - "loss": 0.5207, - "step": 500 - }, - { - "epoch": 22.17391304347826, - "grad_norm": 1.7769482135772705, - "learning_rate": 2.9995003067109707e-05, - "loss": 0.4836, - "step": 510 - }, - { - "epoch": 22.608695652173914, - "grad_norm": 1.5336769819259644, - "learning_rate": 2.9994756356066246e-05, - "loss": 0.5618, - "step": 520 - }, - { - "epoch": 23.043478260869566, - "grad_norm": 1.7468706369400024, - "learning_rate": 2.999450370158046e-05, - "loss": 0.4928, - "step": 530 - }, - { - "epoch": 23.47826086956522, - "grad_norm": 1.309330701828003, - "learning_rate": 2.9994245103752478e-05, - "loss": 0.4384, - "step": 540 - }, - { - "epoch": 23.91304347826087, - "grad_norm": 1.232712745666504, - "learning_rate": 2.999398056268481e-05, - "loss": 0.5266, - "step": 550 - }, - { - "epoch": 24.347826086956523, - "grad_norm": 1.4076601266860962, - "learning_rate": 2.9993710078482306e-05, - "loss": 0.5203, - "step": 560 - }, - { - "epoch": 24.782608695652176, - "grad_norm": 0.9518764019012451, - "learning_rate": 2.9993433651252185e-05, - "loss": 0.4432, - "step": 570 - }, - { - "epoch": 25.217391304347824, - "grad_norm": 1.7226718664169312, - "learning_rate": 2.9993151281104006e-05, - "loss": 0.5325, - "step": 580 - }, - { - "epoch": 25.652173913043477, - "grad_norm": 1.1378567218780518, - "learning_rate": 2.9992862968149695e-05, - "loss": 0.4736, - "step": 590 - }, - { - "epoch": 26.08695652173913, - "grad_norm": 1.1712965965270996, - "learning_rate": 2.9992568712503533e-05, - "loss": 0.4608, - "step": 600 - }, - { - "epoch": 26.08695652173913, - "eval_loss": 0.7202123403549194, - "eval_runtime": 0.4189, - "eval_samples_per_second": 23.87, - "eval_steps_per_second": 23.87, - "step": 600 - }, - { - "Start_State_loss": 0.861186683177948, - "Start_State_runtime": 0.4129, - "Start_State_samples_per_second": 24.217, - "Start_State_steps_per_second": 24.217, - "epoch": 26.08695652173913, - "step": 600 - }, - { - "Raw_Model_loss": 0.7202123403549194, - "Raw_Model_runtime": 0.4312, - "Raw_Model_samples_per_second": 23.189, - "Raw_Model_steps_per_second": 23.189, - "epoch": 26.08695652173913, - "step": 600 - }, - { - "SWA_loss": 0.734836220741272, - "SWA_runtime": 0.416, - "SWA_samples_per_second": 24.039, - "SWA_steps_per_second": 24.039, - "epoch": 26.08695652173913, - "step": 600 - }, - { - "EMA_loss": 0.8605602383613586, - "EMA_runtime": 0.4018, - "EMA_samples_per_second": 24.887, - "EMA_steps_per_second": 24.887, - "epoch": 26.08695652173913, - "step": 600 - }, - { - "epoch": 26.52173913043478, - "grad_norm": 1.0910325050354004, - "learning_rate": 2.9992268514282142e-05, - "loss": 0.5118, - "step": 610 - }, - { - "epoch": 26.956521739130434, - "grad_norm": 1.3383877277374268, - "learning_rate": 2.999196237360452e-05, - "loss": 0.4316, - "step": 620 - }, - { - "epoch": 27.391304347826086, - "grad_norm": 1.2017790079116821, - "learning_rate": 2.9991650290592016e-05, - "loss": 0.4754, - "step": 630 - }, - { - "epoch": 27.82608695652174, - "grad_norm": 1.3913823366165161, - "learning_rate": 2.999133226536832e-05, - "loss": 0.5013, - "step": 640 - }, - { - "epoch": 28.26086956521739, - "grad_norm": 1.4073095321655273, - "learning_rate": 2.9991008298059493e-05, - "loss": 0.4108, - "step": 650 - }, - { - "epoch": 28.695652173913043, - "grad_norm": 1.5551748275756836, - "learning_rate": 2.9990678388793944e-05, - "loss": 0.5063, - "step": 660 - }, - { - "epoch": 29.130434782608695, - "grad_norm": 1.3158533573150635, - "learning_rate": 2.999034253770244e-05, - "loss": 0.4348, - "step": 670 - }, - { - "epoch": 29.565217391304348, - "grad_norm": 1.0734323263168335, - "learning_rate": 2.9990000744918097e-05, - "loss": 0.4704, - "step": 680 - }, - { - "epoch": 30.0, - "grad_norm": 2.4888181686401367, - "learning_rate": 2.9989653010576392e-05, - "loss": 0.4143, - "step": 690 - }, - { - "epoch": 30.434782608695652, - "grad_norm": 1.3325855731964111, - "learning_rate": 2.9989299334815158e-05, - "loss": 0.4765, - "step": 700 - }, - { - "epoch": 30.869565217391305, - "grad_norm": 1.6293697357177734, - "learning_rate": 2.9988939717774578e-05, - "loss": 0.4124, - "step": 710 - }, - { - "epoch": 31.304347826086957, - "grad_norm": 0.9005484580993652, - "learning_rate": 2.9988574159597194e-05, - "loss": 0.4244, - "step": 720 - }, - { - "epoch": 31.73913043478261, - "grad_norm": 1.6467411518096924, - "learning_rate": 2.9988202660427907e-05, - "loss": 0.4824, - "step": 730 - }, - { - "epoch": 32.17391304347826, - "grad_norm": 1.2035257816314697, - "learning_rate": 2.9987825220413958e-05, - "loss": 0.4381, - "step": 740 - }, - { - "epoch": 32.608695652173914, - "grad_norm": 1.7662363052368164, - "learning_rate": 2.998744183970496e-05, - "loss": 0.4733, - "step": 750 - }, - { - "epoch": 32.608695652173914, - "eval_loss": 0.7324522733688354, - "eval_runtime": 0.5029, - "eval_samples_per_second": 19.886, - "eval_steps_per_second": 19.886, - "step": 750 - }, - { - "Start_State_loss": 0.861186683177948, - "Start_State_runtime": 0.4839, - "Start_State_samples_per_second": 20.667, - "Start_State_steps_per_second": 20.667, - "epoch": 32.608695652173914, - "step": 750 - }, - { - "Raw_Model_loss": 0.7324522733688354, - "Raw_Model_runtime": 0.4871, - "Raw_Model_samples_per_second": 20.531, - "Raw_Model_steps_per_second": 20.531, - "epoch": 32.608695652173914, - "step": 750 - }, - { - "SWA_loss": 0.7293017506599426, - "SWA_runtime": 0.507, - "SWA_samples_per_second": 19.723, - "SWA_steps_per_second": 19.723, - "epoch": 32.608695652173914, - "step": 750 - }, - { - "EMA_loss": 0.8609846234321594, - "EMA_runtime": 0.4713, - "EMA_samples_per_second": 21.218, - "EMA_steps_per_second": 21.218, - "epoch": 32.608695652173914, - "step": 750 - }, - { - "epoch": 33.04347826086956, - "grad_norm": 1.4953798055648804, - "learning_rate": 2.998705251845287e-05, - "loss": 0.4301, - "step": 760 - }, - { - "epoch": 33.47826086956522, - "grad_norm": 1.6487337350845337, - "learning_rate": 2.9986657256812e-05, - "loss": 0.4301, - "step": 770 - }, - { - "epoch": 33.91304347826087, - "grad_norm": 1.302030086517334, - "learning_rate": 2.9986256054939022e-05, - "loss": 0.4078, - "step": 780 - }, - { - "epoch": 34.34782608695652, - "grad_norm": 1.4749252796173096, - "learning_rate": 2.9985848912992956e-05, - "loss": 0.4026, - "step": 790 - }, - { - "epoch": 34.78260869565217, - "grad_norm": 1.5546329021453857, - "learning_rate": 2.9985435831135184e-05, - "loss": 0.3832, - "step": 800 - }, - { - "epoch": 35.21739130434783, - "grad_norm": 1.3067240715026855, - "learning_rate": 2.9985016809529437e-05, - "loss": 0.4745, - "step": 810 - }, - { - "epoch": 35.65217391304348, - "grad_norm": 1.3341012001037598, - "learning_rate": 2.9984591848341806e-05, - "loss": 0.4026, - "step": 820 - }, - { - "epoch": 36.08695652173913, - "grad_norm": 1.0843638181686401, - "learning_rate": 2.9984160947740723e-05, - "loss": 0.4178, - "step": 830 - }, - { - "epoch": 36.52173913043478, - "grad_norm": 1.1590487957000732, - "learning_rate": 2.9983724107896993e-05, - "loss": 0.3803, - "step": 840 - }, - { - "epoch": 36.95652173913044, - "grad_norm": 1.4765552282333374, - "learning_rate": 2.9983281328983757e-05, - "loss": 0.4499, - "step": 850 - }, - { - "epoch": 37.391304347826086, - "grad_norm": 1.9148926734924316, - "learning_rate": 2.9982832611176523e-05, - "loss": 0.4183, - "step": 860 - }, - { - "epoch": 37.82608695652174, - "grad_norm": 1.2662427425384521, - "learning_rate": 2.998237795465315e-05, - "loss": 0.3714, - "step": 870 - }, - { - "epoch": 38.26086956521739, - "grad_norm": 1.2768863439559937, - "learning_rate": 2.9981917359593843e-05, - "loss": 0.4015, - "step": 880 - }, - { - "epoch": 38.69565217391305, - "grad_norm": 1.4421806335449219, - "learning_rate": 2.9981450826181172e-05, - "loss": 0.3553, - "step": 890 - }, - { - "epoch": 39.130434782608695, - "grad_norm": 1.9427447319030762, - "learning_rate": 2.9980978354600057e-05, - "loss": 0.4626, - "step": 900 - }, - { - "epoch": 39.130434782608695, - "eval_loss": 0.7509682774543762, - "eval_runtime": 0.4364, - "eval_samples_per_second": 22.914, - "eval_steps_per_second": 22.914, - "step": 900 - }, - { - "Start_State_loss": 0.861186683177948, - "Start_State_runtime": 0.4232, - "Start_State_samples_per_second": 23.629, - "Start_State_steps_per_second": 23.629, - "epoch": 39.130434782608695, - "step": 900 - }, - { - "Raw_Model_loss": 0.7509682774543762, - "Raw_Model_runtime": 0.4354, - "Raw_Model_samples_per_second": 22.969, - "Raw_Model_steps_per_second": 22.969, - "epoch": 39.130434782608695, - "step": 900 - }, - { - "SWA_loss": 0.723495602607727, - "SWA_runtime": 0.4244, - "SWA_samples_per_second": 23.565, - "SWA_steps_per_second": 23.565, - "epoch": 39.130434782608695, - "step": 900 - }, - { - "EMA_loss": 0.8611685633659363, - "EMA_runtime": 0.4188, - "EMA_samples_per_second": 23.877, - "EMA_steps_per_second": 23.877, - "epoch": 39.130434782608695, - "step": 900 - }, - { - "epoch": 39.56521739130435, - "grad_norm": 1.5449541807174683, - "learning_rate": 2.9980499945037765e-05, - "loss": 0.3833, - "step": 910 - }, - { - "epoch": 40.0, - "grad_norm": 3.072025775909424, - "learning_rate": 2.998001559768393e-05, - "loss": 0.3867, - "step": 920 - }, - { - "epoch": 40.43478260869565, - "grad_norm": 1.5371581315994263, - "learning_rate": 2.9979525312730525e-05, - "loss": 0.4492, - "step": 930 - }, - { - "epoch": 40.869565217391305, - "grad_norm": 1.6783299446105957, - "learning_rate": 2.9979029090371885e-05, - "loss": 0.3413, - "step": 940 - }, - { - "epoch": 41.30434782608695, - "grad_norm": 2.2113845348358154, - "learning_rate": 2.99785269308047e-05, - "loss": 0.3421, - "step": 950 - }, - { - "epoch": 41.73913043478261, - "grad_norm": 1.5125486850738525, - "learning_rate": 2.9978018834228007e-05, - "loss": 0.3652, - "step": 960 - }, - { - "epoch": 42.17391304347826, - "grad_norm": 1.5023707151412964, - "learning_rate": 2.9977504800843197e-05, - "loss": 0.4348, - "step": 970 - }, - { - "epoch": 42.608695652173914, - "grad_norm": 1.5505523681640625, - "learning_rate": 2.9976984830854022e-05, - "loss": 0.3753, - "step": 980 - }, - { - "epoch": 43.04347826086956, - "grad_norm": 1.646248459815979, - "learning_rate": 2.997645892446658e-05, - "loss": 0.3667, - "step": 990 - }, - { - "epoch": 43.47826086956522, - "grad_norm": 1.427502989768982, - "learning_rate": 2.9975927081889322e-05, - "loss": 0.3914, - "step": 1000 - }, - { - "epoch": 43.91304347826087, - "grad_norm": 1.1922435760498047, - "learning_rate": 2.9975389303333047e-05, - "loss": 0.3459, - "step": 1010 - }, - { - "epoch": 44.34782608695652, - "grad_norm": 2.117877960205078, - "learning_rate": 2.997484558901093e-05, - "loss": 0.3921, - "step": 1020 - }, - { - "epoch": 44.78260869565217, - "grad_norm": 1.6940711736679077, - "learning_rate": 2.9974295939138465e-05, - "loss": 0.3805, - "step": 1030 - }, - { - "epoch": 45.21739130434783, - "grad_norm": 1.2534048557281494, - "learning_rate": 2.9973740353933523e-05, - "loss": 0.2646, - "step": 1040 - }, - { - "epoch": 45.65217391304348, - "grad_norm": 1.6683976650238037, - "learning_rate": 2.997317883361632e-05, - "loss": 0.3614, - "step": 1050 - }, - { - "epoch": 45.65217391304348, - "eval_loss": 0.7766519784927368, - "eval_runtime": 0.5173, - "eval_samples_per_second": 19.332, - "eval_steps_per_second": 19.332, - "step": 1050 - }, - { - "Start_State_loss": 0.861186683177948, - "Start_State_runtime": 0.4919, - "Start_State_samples_per_second": 20.328, - "Start_State_steps_per_second": 20.328, - "epoch": 45.65217391304348, - "step": 1050 - }, - { - "Raw_Model_loss": 0.7766519784927368, - "Raw_Model_runtime": 0.5097, - "Raw_Model_samples_per_second": 19.618, - "Raw_Model_steps_per_second": 19.618, - "epoch": 45.65217391304348, - "step": 1050 - }, - { - "SWA_loss": 0.7227691411972046, - "SWA_runtime": 0.4081, - "SWA_samples_per_second": 24.506, - "SWA_steps_per_second": 24.506, - "epoch": 45.65217391304348, - "step": 1050 - }, - { - "EMA_loss": 0.8606861233711243, - "EMA_runtime": 0.4167, - "EMA_samples_per_second": 23.999, - "EMA_steps_per_second": 23.999, - "epoch": 45.65217391304348, - "step": 1050 - }, - { - "epoch": 46.08695652173913, - "grad_norm": 1.779891014099121, - "learning_rate": 2.997261137840943e-05, - "loss": 0.4102, - "step": 1060 - }, - { - "epoch": 46.52173913043478, - "grad_norm": 2.14410662651062, - "learning_rate": 2.9972037988537758e-05, - "loss": 0.3785, - "step": 1070 - }, - { - "epoch": 46.95652173913044, - "grad_norm": 1.9252405166625977, - "learning_rate": 2.9971458664228595e-05, - "loss": 0.3326, - "step": 1080 - }, - { - "epoch": 47.391304347826086, - "grad_norm": 2.159163236618042, - "learning_rate": 2.997087340571156e-05, - "loss": 0.3368, - "step": 1090 - }, - { - "epoch": 47.82608695652174, - "grad_norm": 1.5358129739761353, - "learning_rate": 2.997028221321863e-05, - "loss": 0.3563, - "step": 1100 - }, - { - "epoch": 48.26086956521739, - "grad_norm": 2.3225314617156982, - "learning_rate": 2.9969685086984132e-05, - "loss": 0.3735, - "step": 1110 - }, - { - "epoch": 48.69565217391305, - "grad_norm": 1.663732647895813, - "learning_rate": 2.9969082027244755e-05, - "loss": 0.3003, - "step": 1120 - }, - { - "epoch": 49.130434782608695, - "grad_norm": 1.8256464004516602, - "learning_rate": 2.996847303423953e-05, - "loss": 0.4151, - "step": 1130 - }, - { - "epoch": 49.56521739130435, - "grad_norm": 1.3243038654327393, - "learning_rate": 2.9967858108209838e-05, - "loss": 0.371, - "step": 1140 - }, - { - "epoch": 50.0, - "grad_norm": 3.001286506652832, - "learning_rate": 2.9967237249399417e-05, - "loss": 0.2918, - "step": 1150 - }, - { - "epoch": 50.43478260869565, - "grad_norm": 1.505326509475708, - "learning_rate": 2.996661045805436e-05, - "loss": 0.296, - "step": 1160 - }, - { - "epoch": 50.869565217391305, - "grad_norm": 1.8840476274490356, - "learning_rate": 2.9965977734423106e-05, - "loss": 0.3418, - "step": 1170 - }, - { - "epoch": 51.30434782608695, - "grad_norm": 1.9088075160980225, - "learning_rate": 2.9965339078756445e-05, - "loss": 0.3538, - "step": 1180 - }, - { - "epoch": 51.73913043478261, - "grad_norm": 1.201934814453125, - "learning_rate": 2.9964694491307514e-05, - "loss": 0.2799, - "step": 1190 - }, - { - "epoch": 52.17391304347826, - "grad_norm": 2.5051469802856445, - "learning_rate": 2.996404397233182e-05, - "loss": 0.4083, - "step": 1200 - }, - { - "epoch": 52.17391304347826, - "eval_loss": 0.8017474412918091, - "eval_runtime": 0.4089, - "eval_samples_per_second": 24.455, - "eval_steps_per_second": 24.455, - "step": 1200 - }, - { - "Start_State_loss": 0.861186683177948, - "Start_State_runtime": 0.4047, - "Start_State_samples_per_second": 24.712, - "Start_State_steps_per_second": 24.712, - "epoch": 52.17391304347826, - "step": 1200 - }, - { - "Raw_Model_loss": 0.8017474412918091, - "Raw_Model_runtime": 0.4185, - "Raw_Model_samples_per_second": 23.893, - "Raw_Model_steps_per_second": 23.893, - "epoch": 52.17391304347826, - "step": 1200 - }, - { - "SWA_loss": 0.7242206335067749, - "SWA_runtime": 0.4069, - "SWA_samples_per_second": 24.576, - "SWA_steps_per_second": 24.576, - "epoch": 52.17391304347826, - "step": 1200 - }, - { - "EMA_loss": 0.8609538078308105, - "EMA_runtime": 0.4011, - "EMA_samples_per_second": 24.929, - "EMA_steps_per_second": 24.929, - "epoch": 52.17391304347826, - "step": 1200 - }, - { - "epoch": 52.608695652173914, - "grad_norm": 1.6022043228149414, - "learning_rate": 1.4982021986165911e-06, - "loss": 0.2864, - "step": 1210 - }, - { - "epoch": 53.04347826086956, - "grad_norm": 1.779731273651123, - "learning_rate": 2.9964043972331822e-06, - "loss": 0.365, - "step": 1220 - }, - { - "epoch": 53.47826086956522, - "grad_norm": 1.4084794521331787, - "learning_rate": 4.494606595849773e-06, - "loss": 0.272, - "step": 1230 - }, - { - "epoch": 53.91304347826087, - "grad_norm": 1.9352147579193115, - "learning_rate": 5.9928087944663644e-06, - "loss": 0.3341, - "step": 1240 - }, - { - "epoch": 54.34782608695652, - "grad_norm": 1.9726593494415283, - "learning_rate": 7.491010993082955e-06, - "loss": 0.385, - "step": 1250 - }, - { - "epoch": 54.78260869565217, - "grad_norm": 1.1675161123275757, - "learning_rate": 8.989213191699545e-06, - "loss": 0.2836, - "step": 1260 - }, - { - "epoch": 55.21739130434783, - "grad_norm": 1.5592656135559082, - "learning_rate": 1.0487415390316136e-05, - "loss": 0.3109, - "step": 1270 - }, - { - "epoch": 55.65217391304348, - "grad_norm": 1.7126950025558472, - "learning_rate": 1.1985617588932729e-05, - "loss": 0.3556, - "step": 1280 - }, - { - "epoch": 56.08695652173913, - "grad_norm": 2.3063290119171143, - "learning_rate": 1.348381978754932e-05, - "loss": 0.2845, - "step": 1290 - }, - { - "epoch": 56.52173913043478, - "grad_norm": 1.4599252939224243, - "learning_rate": 1.498202198616591e-05, - "loss": 0.339, - "step": 1300 - }, - { - "epoch": 56.95652173913044, - "grad_norm": 1.9888345003128052, - "learning_rate": 1.4982020501567203e-05, - "loss": 0.3316, - "step": 1310 - }, - { - "epoch": 57.391304347826086, - "grad_norm": 1.411199927330017, - "learning_rate": 1.4982016047771664e-05, - "loss": 0.3113, - "step": 1320 - }, - { - "epoch": 57.82608695652174, - "grad_norm": 2.7845284938812256, - "learning_rate": 1.4982008624781062e-05, - "loss": 0.3372, - "step": 1330 - }, - { - "epoch": 58.26086956521739, - "grad_norm": 1.4198298454284668, - "learning_rate": 1.4981998232598337e-05, - "loss": 0.3301, - "step": 1340 - }, - { - "epoch": 58.69565217391305, - "grad_norm": 1.7863256931304932, - "learning_rate": 1.4981984871227611e-05, - "loss": 0.3077, - "step": 1350 - }, - { - "epoch": 58.69565217391305, - "eval_loss": 0.8207708597183228, - "eval_runtime": 0.4025, - "eval_samples_per_second": 24.846, - "eval_steps_per_second": 24.846, - "step": 1350 - }, - { - "Start_State_loss": 0.861186683177948, - "Start_State_runtime": 0.4066, - "Start_State_samples_per_second": 24.595, - "Start_State_steps_per_second": 24.595, - "epoch": 58.69565217391305, - "step": 1350 - }, - { - "Raw_Model_loss": 0.8207708597183228, - "Raw_Model_runtime": 0.4225, - "Raw_Model_samples_per_second": 23.668, - "Raw_Model_steps_per_second": 23.668, - "epoch": 58.69565217391305, - "step": 1350 - }, - { - "SWA_loss": 0.7249630689620972, - "SWA_runtime": 0.4213, - "SWA_samples_per_second": 23.737, - "SWA_steps_per_second": 23.737, - "epoch": 58.69565217391305, - "step": 1350 - }, - { - "EMA_loss": 0.8608277440071106, - "EMA_runtime": 0.4189, - "EMA_samples_per_second": 23.873, - "EMA_steps_per_second": 23.873, - "epoch": 58.69565217391305, - "step": 1350 - }, - { - "epoch": 59.130434782608695, - "grad_norm": 1.607054591178894, - "learning_rate": 1.4981968540674177e-05, - "loss": 0.3205, - "step": 1360 - }, - { - "epoch": 59.56521739130435, - "grad_norm": 1.3625128269195557, - "learning_rate": 1.4981949240944509e-05, - "loss": 0.3009, - "step": 1370 - }, - { - "epoch": 60.0, - "grad_norm": 1.6340285539627075, - "learning_rate": 1.4981926972046258e-05, - "loss": 0.3097, - "step": 1380 - }, - { - "epoch": 60.43478260869565, - "grad_norm": 1.8730145692825317, - "learning_rate": 1.498190173398825e-05, - "loss": 0.317, - "step": 1390 - }, - { - "epoch": 60.869565217391305, - "grad_norm": 1.536399483680725, - "learning_rate": 1.4981873526780487e-05, - "loss": 0.305, - "step": 1400 - }, - { - "epoch": 61.30434782608695, - "grad_norm": 1.4285606145858765, - "learning_rate": 1.4981842350434152e-05, - "loss": 0.3042, - "step": 1410 - }, - { - "epoch": 61.73913043478261, - "grad_norm": 1.4928925037384033, - "learning_rate": 1.49818082049616e-05, - "loss": 0.3203, - "step": 1420 - }, - { - "epoch": 62.17391304347826, - "grad_norm": 2.1700551509857178, - "learning_rate": 1.4981771090376367e-05, - "loss": 0.2862, - "step": 1430 - }, - { - "epoch": 62.608695652173914, - "grad_norm": 1.9556642770767212, - "learning_rate": 1.4981731006693164e-05, - "loss": 0.3212, - "step": 1440 - }, - { - "epoch": 63.04347826086956, - "grad_norm": 1.322287678718567, - "learning_rate": 1.4981687953927875e-05, - "loss": 0.3127, - "step": 1450 - }, - { - "epoch": 63.47826086956522, - "grad_norm": 1.992517113685608, - "learning_rate": 1.498164193209757e-05, - "loss": 0.3448, - "step": 1460 - }, - { - "epoch": 63.91304347826087, - "grad_norm": 1.6658724546432495, - "learning_rate": 1.498159294122049e-05, - "loss": 0.2927, - "step": 1470 - }, - { - "epoch": 64.34782608695652, - "grad_norm": 1.7991540431976318, - "learning_rate": 1.4981540981316052e-05, - "loss": 0.269, - "step": 1480 - }, - { - "epoch": 64.78260869565217, - "grad_norm": 1.4999051094055176, - "learning_rate": 1.4981486052404848e-05, - "loss": 0.3586, - "step": 1490 - }, - { - "epoch": 65.21739130434783, - "grad_norm": 1.407665729522705, - "learning_rate": 1.4981428154508652e-05, - "loss": 0.2692, - "step": 1500 - }, - { - "epoch": 65.21739130434783, - "eval_loss": 0.8339105844497681, - "eval_runtime": 0.424, - "eval_samples_per_second": 23.583, - "eval_steps_per_second": 23.583, - "step": 1500 - }, - { - "Start_State_loss": 0.861186683177948, - "Start_State_runtime": 0.4059, - "Start_State_samples_per_second": 24.637, - "Start_State_steps_per_second": 24.637, - "epoch": 65.21739130434783, - "step": 1500 - }, - { - "Raw_Model_loss": 0.8339105844497681, - "Raw_Model_runtime": 0.4071, - "Raw_Model_samples_per_second": 24.561, - "Raw_Model_steps_per_second": 24.561, - "epoch": 65.21739130434783, - "step": 1500 - }, - { - "SWA_loss": 0.7287951707839966, - "SWA_runtime": 0.4129, - "SWA_samples_per_second": 24.217, - "SWA_steps_per_second": 24.217, - "epoch": 65.21739130434783, - "step": 1500 - }, - { - "EMA_loss": 0.8608457446098328, - "EMA_runtime": 0.4068, - "EMA_samples_per_second": 24.582, - "EMA_steps_per_second": 24.582, - "epoch": 65.21739130434783, - "step": 1500 - }, - { - "epoch": 65.65217391304348, - "grad_norm": 2.232325792312622, - "learning_rate": 1.4981367287650419e-05, - "loss": 0.3162, - "step": 1510 - }, - { - "epoch": 66.08695652173913, - "grad_norm": 1.7753976583480835, - "learning_rate": 1.4981303451854267e-05, - "loss": 0.2948, - "step": 1520 - }, - { - "epoch": 66.52173913043478, - "grad_norm": 1.7147599458694458, - "learning_rate": 1.4981236647145501e-05, - "loss": 0.3102, - "step": 1530 - }, - { - "epoch": 66.95652173913044, - "grad_norm": 2.0997328758239746, - "learning_rate": 1.4981166873550601e-05, - "loss": 0.3052, - "step": 1540 - }, - { - "epoch": 67.3913043478261, - "grad_norm": 1.7865650653839111, - "learning_rate": 1.4981094131097224e-05, - "loss": 0.271, - "step": 1550 - }, - { - "epoch": 67.82608695652173, - "grad_norm": 2.0209579467773438, - "learning_rate": 1.49810184198142e-05, - "loss": 0.3436, - "step": 1560 - }, - { - "epoch": 68.26086956521739, - "grad_norm": 2.070268392562866, - "learning_rate": 1.498093973973154e-05, - "loss": 0.2505, - "step": 1570 - }, - { - "epoch": 68.69565217391305, - "grad_norm": 1.8239392042160034, - "learning_rate": 1.4980858090880429e-05, - "loss": 0.2862, - "step": 1580 - }, - { - "epoch": 69.1304347826087, - "grad_norm": 1.9371496438980103, - "learning_rate": 1.4980773473293232e-05, - "loss": 0.3683, - "step": 1590 - }, - { - "epoch": 69.56521739130434, - "grad_norm": 2.0029847621917725, - "learning_rate": 1.4980685887003486e-05, - "loss": 0.3073, - "step": 1600 - }, - { - "epoch": 70.0, - "grad_norm": 1.6579641103744507, - "learning_rate": 1.498059533204591e-05, - "loss": 0.269, - "step": 1610 - }, - { - "epoch": 70.43478260869566, - "grad_norm": 2.2111008167266846, - "learning_rate": 1.4980501808456398e-05, - "loss": 0.3138, - "step": 1620 - }, - { - "epoch": 70.8695652173913, - "grad_norm": 1.944257140159607, - "learning_rate": 1.4980405316272018e-05, - "loss": 0.2998, - "step": 1630 - }, - { - "epoch": 71.30434782608695, - "grad_norm": 2.359985113143921, - "learning_rate": 1.4980305855531015e-05, - "loss": 0.2886, - "step": 1640 - }, - { - "epoch": 71.73913043478261, - "grad_norm": 1.8482180833816528, - "learning_rate": 1.4980203426272815e-05, - "loss": 0.2625, - "step": 1650 - }, - { - "epoch": 71.73913043478261, - "eval_loss": 0.8492822647094727, - "eval_runtime": 0.4449, - "eval_samples_per_second": 22.475, - "eval_steps_per_second": 22.475, - "step": 1650 - }, - { - "Start_State_loss": 0.861186683177948, - "Start_State_runtime": 0.4434, - "Start_State_samples_per_second": 22.551, - "Start_State_steps_per_second": 22.551, - "epoch": 71.73913043478261, - "step": 1650 - }, - { - "Raw_Model_loss": 0.8492822647094727, - "Raw_Model_runtime": 0.4159, - "Raw_Model_samples_per_second": 24.045, - "Raw_Model_steps_per_second": 24.045, - "epoch": 71.73913043478261, - "step": 1650 - }, - { - "SWA_loss": 0.7319540977478027, - "SWA_runtime": 0.3989, - "SWA_samples_per_second": 25.072, - "SWA_steps_per_second": 25.072, - "epoch": 71.73913043478261, - "step": 1650 - }, - { - "EMA_loss": 0.8609111905097961, - "EMA_runtime": 0.4379, - "EMA_samples_per_second": 22.836, - "EMA_steps_per_second": 22.836, - "epoch": 71.73913043478261, - "step": 1650 - }, - { - "epoch": 72.17391304347827, - "grad_norm": 1.6176584959030151, - "learning_rate": 1.4980098028538014e-05, - "loss": 0.3276, - "step": 1660 - }, - { - "epoch": 72.6086956521739, - "grad_norm": 2.0951242446899414, - "learning_rate": 1.4979989662368391e-05, - "loss": 0.2962, - "step": 1670 - }, - { - "epoch": 73.04347826086956, - "grad_norm": 1.9010318517684937, - "learning_rate": 1.4979878327806899e-05, - "loss": 0.3096, - "step": 1680 - }, - { - "epoch": 73.47826086956522, - "grad_norm": 1.990721344947815, - "learning_rate": 1.4979764024897668e-05, - "loss": 0.2877, - "step": 1690 - }, - { - "epoch": 73.91304347826087, - "grad_norm": 1.8217382431030273, - "learning_rate": 1.4979646753686002e-05, - "loss": 0.2797, - "step": 1700 - }, - { - "epoch": 74.34782608695652, - "grad_norm": 1.3920949697494507, - "learning_rate": 1.4979526514218385e-05, - "loss": 0.277, - "step": 1710 - }, - { - "epoch": 74.78260869565217, - "grad_norm": 1.4952901601791382, - "learning_rate": 1.4979403306542473e-05, - "loss": 0.3281, - "step": 1720 - }, - { - "epoch": 75.21739130434783, - "grad_norm": 1.6056287288665771, - "learning_rate": 1.4979277130707107e-05, - "loss": 0.234, - "step": 1730 - }, - { - "epoch": 75.65217391304348, - "grad_norm": 1.6877388954162598, - "learning_rate": 1.4979147986762295e-05, - "loss": 0.3147, - "step": 1740 - }, - { - "epoch": 76.08695652173913, - "grad_norm": 1.7907490730285645, - "learning_rate": 1.4979015874759227e-05, - "loss": 0.2696, - "step": 1750 - }, - { - "epoch": 76.52173913043478, - "grad_norm": 1.866333246231079, - "learning_rate": 1.4978880794750266e-05, - "loss": 0.2663, - "step": 1760 - }, - { - "epoch": 76.95652173913044, - "grad_norm": 1.275960087776184, - "learning_rate": 1.4978742746788957e-05, - "loss": 0.3004, - "step": 1770 - }, - { - "epoch": 77.3913043478261, - "grad_norm": 1.8372234106063843, - "learning_rate": 1.4978601730930014e-05, - "loss": 0.2842, - "step": 1780 - }, - { - "epoch": 77.82608695652173, - "grad_norm": 1.8203933238983154, - "learning_rate": 1.4978457747229335e-05, - "loss": 0.2714, - "step": 1790 - }, - { - "epoch": 78.26086956521739, - "grad_norm": 1.8666887283325195, - "learning_rate": 1.497831079574399e-05, - "loss": 0.3054, - "step": 1800 - }, - { - "epoch": 78.26086956521739, - "eval_loss": 0.8706566691398621, - "eval_runtime": 0.4194, - "eval_samples_per_second": 23.844, - "eval_steps_per_second": 23.844, - "step": 1800 - }, - { - "Start_State_loss": 0.861186683177948, - "Start_State_runtime": 0.4248, - "Start_State_samples_per_second": 23.542, - "Start_State_steps_per_second": 23.542, - "epoch": 78.26086956521739, - "step": 1800 - }, - { - "Raw_Model_loss": 0.8706566691398621, - "Raw_Model_runtime": 0.4195, - "Raw_Model_samples_per_second": 23.839, - "Raw_Model_steps_per_second": 23.839, - "epoch": 78.26086956521739, - "step": 1800 - }, - { - "SWA_loss": 0.7383162379264832, - "SWA_runtime": 0.4011, - "SWA_samples_per_second": 24.93, - "SWA_steps_per_second": 24.93, - "epoch": 78.26086956521739, - "step": 1800 - }, - { - "EMA_loss": 0.8612034916877747, - "EMA_runtime": 0.4073, - "EMA_samples_per_second": 24.55, - "EMA_steps_per_second": 24.55, - "epoch": 78.26086956521739, - "step": 1800 - }, - { - "epoch": 78.69565217391305, - "grad_norm": 1.2506405115127563, - "learning_rate": 1.4978160876532222e-05, - "loss": 0.2825, - "step": 1810 - }, - { - "epoch": 79.1304347826087, - "grad_norm": 1.9981244802474976, - "learning_rate": 1.4978007989653455e-05, - "loss": 0.2405, - "step": 1820 - }, - { - "epoch": 79.56521739130434, - "grad_norm": 1.615986704826355, - "learning_rate": 1.4977852135168293e-05, - "loss": 0.2606, - "step": 1830 - }, - { - "epoch": 80.0, - "grad_norm": 1.8828160762786865, - "learning_rate": 1.4977693313138507e-05, - "loss": 0.3032, - "step": 1840 - }, - { - "epoch": 80.43478260869566, - "grad_norm": 2.1056859493255615, - "learning_rate": 1.4977531523627054e-05, - "loss": 0.28, - "step": 1850 - }, - { - "epoch": 80.8695652173913, - "grad_norm": 1.2708007097244263, - "learning_rate": 1.4977366766698058e-05, - "loss": 0.2793, - "step": 1860 - }, - { - "epoch": 81.30434782608695, - "grad_norm": 1.5546568632125854, - "learning_rate": 1.4977199042416822e-05, - "loss": 0.231, - "step": 1870 - }, - { - "epoch": 81.73913043478261, - "grad_norm": 1.5425999164581299, - "learning_rate": 1.4977028350849831e-05, - "loss": 0.306, - "step": 1880 - }, - { - "epoch": 82.17391304347827, - "grad_norm": 1.6000920534133911, - "learning_rate": 1.4976854692064739e-05, - "loss": 0.2147, - "step": 1890 - }, - { - "epoch": 82.6086956521739, - "grad_norm": 1.8370901346206665, - "learning_rate": 1.497667806613038e-05, - "loss": 0.2592, - "step": 1900 - }, - { - "epoch": 83.04347826086956, - "grad_norm": 1.573868989944458, - "learning_rate": 1.497649847311676e-05, - "loss": 0.3003, - "step": 1910 - }, - { - "epoch": 83.47826086956522, - "grad_norm": 1.478338599205017, - "learning_rate": 1.4976315913095068e-05, - "loss": 0.2651, - "step": 1920 - }, - { - "epoch": 83.91304347826087, - "grad_norm": 2.278778076171875, - "learning_rate": 1.4976130386137666e-05, - "loss": 0.3041, - "step": 1930 - }, - { - "epoch": 84.34782608695652, - "grad_norm": 1.6957610845565796, - "learning_rate": 1.4975941892318084e-05, - "loss": 0.2639, - "step": 1940 - }, - { - "epoch": 84.78260869565217, - "grad_norm": 2.2558460235595703, - "learning_rate": 1.497575043171104e-05, - "loss": 0.2798, - "step": 1950 - }, - { - "epoch": 84.78260869565217, - "eval_loss": 0.8924320340156555, - "eval_runtime": 0.4145, - "eval_samples_per_second": 24.125, - "eval_steps_per_second": 24.125, - "step": 1950 - }, - { - "Start_State_loss": 0.861186683177948, - "Start_State_runtime": 0.4181, - "Start_State_samples_per_second": 23.92, - "Start_State_steps_per_second": 23.92, - "epoch": 84.78260869565217, - "step": 1950 - }, - { - "Raw_Model_loss": 0.8924320340156555, - "Raw_Model_runtime": 0.3985, - "Raw_Model_samples_per_second": 25.093, - "Raw_Model_steps_per_second": 25.093, - "epoch": 84.78260869565217, - "step": 1950 - }, - { - "SWA_loss": 0.740094780921936, - "SWA_runtime": 0.4024, - "SWA_samples_per_second": 24.849, - "SWA_steps_per_second": 24.849, - "epoch": 84.78260869565217, - "step": 1950 - }, - { - "EMA_loss": 0.8611790537834167, - "EMA_runtime": 0.3965, - "EMA_samples_per_second": 25.224, - "EMA_steps_per_second": 25.224, - "epoch": 84.78260869565217, - "step": 1950 - }, - { - "epoch": 85.21739130434783, - "grad_norm": 2.3564674854278564, - "learning_rate": 7.487875215855521e-07, - "loss": 0.265, - "step": 1960 - }, - { - "epoch": 85.65217391304348, - "grad_norm": 1.8858447074890137, - "learning_rate": 1.4975750431711041e-06, - "loss": 0.2704, - "step": 1970 - }, - { - "epoch": 86.08695652173913, - "grad_norm": 2.1835811138153076, - "learning_rate": 2.2463625647566557e-06, - "loss": 0.2531, - "step": 1980 - }, - { - "epoch": 86.52173913043478, - "grad_norm": 1.674813985824585, - "learning_rate": 2.9951500863422082e-06, - "loss": 0.2767, - "step": 1990 - }, - { - "epoch": 86.95652173913044, - "grad_norm": 2.0097134113311768, - "learning_rate": 3.74393760792776e-06, - "loss": 0.2766, - "step": 2000 - }, - { - "epoch": 87.3913043478261, - "grad_norm": 1.4214787483215332, - "learning_rate": 4.4927251295133115e-06, - "loss": 0.2779, - "step": 2010 - }, - { - "epoch": 87.82608695652173, - "grad_norm": 2.0007896423339844, - "learning_rate": 5.241512651098863e-06, - "loss": 0.2588, - "step": 2020 - }, - { - "epoch": 88.26086956521739, - "grad_norm": 2.0449113845825195, - "learning_rate": 5.9903001726844164e-06, - "loss": 0.2614, - "step": 2030 - }, - { - "epoch": 88.69565217391305, - "grad_norm": 1.7983092069625854, - "learning_rate": 6.739087694269968e-06, - "loss": 0.2852, - "step": 2040 - }, - { - "epoch": 89.1304347826087, - "grad_norm": 1.8373875617980957, - "learning_rate": 7.48787521585552e-06, - "loss": 0.2914, - "step": 2050 - }, - { - "epoch": 89.56521739130434, - "grad_norm": 1.543720006942749, - "learning_rate": 7.487874473866896e-06, - "loss": 0.2467, - "step": 2060 - }, - { - "epoch": 90.0, - "grad_norm": 1.6378145217895508, - "learning_rate": 7.487872247901318e-06, - "loss": 0.2524, - "step": 2070 - }, - { - "epoch": 90.43478260869566, - "grad_norm": 1.8025075197219849, - "learning_rate": 7.4878685379596685e-06, - "loss": 0.2572, - "step": 2080 - }, - { - "epoch": 90.8695652173913, - "grad_norm": 1.7167291641235352, - "learning_rate": 7.487863344043418e-06, - "loss": 0.283, - "step": 2090 - }, - { - "epoch": 91.30434782608695, - "grad_norm": 1.7985183000564575, - "learning_rate": 7.487856666154626e-06, - "loss": 0.2721, - "step": 2100 - }, - { - "epoch": 91.30434782608695, - "eval_loss": 0.8964352607727051, - "eval_runtime": 0.4971, - "eval_samples_per_second": 20.118, - "eval_steps_per_second": 20.118, - "step": 2100 - }, - { - "Start_State_loss": 0.861186683177948, - "Start_State_runtime": 0.5202, - "Start_State_samples_per_second": 19.223, - "Start_State_steps_per_second": 19.223, - "epoch": 91.30434782608695, - "step": 2100 - }, - { - "Raw_Model_loss": 0.8964352607727051, - "Raw_Model_runtime": 0.4702, - "Raw_Model_samples_per_second": 21.269, - "Raw_Model_steps_per_second": 21.269, - "epoch": 91.30434782608695, - "step": 2100 - }, - { - "SWA_loss": 0.7461259365081787, - "SWA_runtime": 0.3981, - "SWA_samples_per_second": 25.122, - "SWA_steps_per_second": 25.122, - "epoch": 91.30434782608695, - "step": 2100 - }, - { - "EMA_loss": 0.8607404828071594, - "EMA_runtime": 0.3991, - "EMA_samples_per_second": 25.058, - "EMA_steps_per_second": 25.058, - "epoch": 91.30434782608695, - "step": 2100 - }, - { - "epoch": 91.73913043478261, - "grad_norm": 2.307114601135254, - "learning_rate": 7.487848504295937e-06, - "loss": 0.2596, - "step": 2110 - }, - { - "epoch": 92.17391304347827, - "grad_norm": 2.0132083892822266, - "learning_rate": 7.4878388584705885e-06, - "loss": 0.2902, - "step": 2120 - }, - { - "epoch": 92.6086956521739, - "grad_norm": 1.9218742847442627, - "learning_rate": 7.487827728682402e-06, - "loss": 0.2791, - "step": 2130 - }, - { - "epoch": 93.04347826086956, - "grad_norm": 1.6972328424453735, - "learning_rate": 7.487815114935791e-06, - "loss": 0.2376, - "step": 2140 - }, - { - "epoch": 93.47826086956522, - "grad_norm": 1.8078455924987793, - "learning_rate": 7.487801017235753e-06, - "loss": 0.289, - "step": 2150 - }, - { - "epoch": 93.91304347826087, - "grad_norm": 2.128847599029541, - "learning_rate": 7.4877854355878785e-06, - "loss": 0.27, - "step": 2160 - }, - { - "epoch": 94.34782608695652, - "grad_norm": 1.9462212324142456, - "learning_rate": 7.487768369998342e-06, - "loss": 0.2166, - "step": 2170 - }, - { - "epoch": 94.78260869565217, - "grad_norm": 2.225867986679077, - "learning_rate": 7.4877498204739075e-06, - "loss": 0.2959, - "step": 2180 - }, - { - "epoch": 95.21739130434783, - "grad_norm": 2.1711599826812744, - "learning_rate": 7.487729787021927e-06, - "loss": 0.2598, - "step": 2190 - }, - { - "epoch": 95.65217391304348, - "grad_norm": 2.3892881870269775, - "learning_rate": 7.487708269650342e-06, - "loss": 0.2585, - "step": 2200 - }, - { - "epoch": 96.08695652173913, - "grad_norm": 2.3716413974761963, - "learning_rate": 7.487685268367682e-06, - "loss": 0.2593, - "step": 2210 - }, - { - "epoch": 96.52173913043478, - "grad_norm": 1.8392366170883179, - "learning_rate": 7.487660783183063e-06, - "loss": 0.2681, - "step": 2220 - }, - { - "epoch": 96.95652173913044, - "grad_norm": 2.1921820640563965, - "learning_rate": 7.48763481410619e-06, - "loss": 0.2609, - "step": 2230 - }, - { - "epoch": 97.3913043478261, - "grad_norm": 1.5945699214935303, - "learning_rate": 7.487607361147356e-06, - "loss": 0.2883, - "step": 2240 - }, - { - "epoch": 97.82608695652173, - "grad_norm": 1.3208949565887451, - "learning_rate": 7.487578424317443e-06, - "loss": 0.2525, - "step": 2250 - }, - { - "epoch": 97.82608695652173, - "eval_loss": 0.9061517715454102, - "eval_runtime": 0.4781, - "eval_samples_per_second": 20.916, - "eval_steps_per_second": 20.916, - "step": 2250 - }, - { - "Start_State_loss": 0.861186683177948, - "Start_State_runtime": 0.4046, - "Start_State_samples_per_second": 24.715, - "Start_State_steps_per_second": 24.715, - "epoch": 97.82608695652173, - "step": 2250 - }, - { - "Raw_Model_loss": 0.9061517715454102, - "Raw_Model_runtime": 0.4112, - "Raw_Model_samples_per_second": 24.319, - "Raw_Model_steps_per_second": 24.319, - "epoch": 97.82608695652173, - "step": 2250 - }, - { - "SWA_loss": 0.74998939037323, - "SWA_runtime": 0.4166, - "SWA_samples_per_second": 24.004, - "SWA_steps_per_second": 24.004, - "epoch": 97.82608695652173, - "step": 2250 - }, - { - "EMA_loss": 0.8602108955383301, - "EMA_runtime": 0.398, - "EMA_samples_per_second": 25.124, - "EMA_steps_per_second": 25.124, - "epoch": 97.82608695652173, - "step": 2250 - }, - { - "epoch": 98.26086956521739, - "grad_norm": 1.9422506093978882, - "learning_rate": 7.487548003627922e-06, - "loss": 0.2414, - "step": 2260 - }, - { - "epoch": 98.69565217391305, - "grad_norm": 1.745564341545105, - "learning_rate": 7.487516099090849e-06, - "loss": 0.278, - "step": 2270 - }, - { - "epoch": 99.1304347826087, - "grad_norm": 2.0466256141662598, - "learning_rate": 7.48748271071887e-06, - "loss": 0.2487, - "step": 2280 - }, - { - "epoch": 99.56521739130434, - "grad_norm": 2.3589112758636475, - "learning_rate": 7.48744783852522e-06, - "loss": 0.2882, - "step": 2290 - }, - { - "epoch": 100.0, - "grad_norm": 2.6583240032196045, - "learning_rate": 7.487411482523721e-06, - "loss": 0.2324, - "step": 2300 - }, - { - "epoch": 100.43478260869566, - "grad_norm": 2.685478448867798, - "learning_rate": 7.4873736427287825e-06, - "loss": 0.2368, - "step": 2310 - }, - { - "epoch": 100.8695652173913, - "grad_norm": 1.7692900896072388, - "learning_rate": 7.487334319155404e-06, - "loss": 0.2694, - "step": 2320 - }, - { - "epoch": 101.30434782608695, - "grad_norm": 2.5517287254333496, - "learning_rate": 7.487293511819172e-06, - "loss": 0.2417, - "step": 2330 - }, - { - "epoch": 101.73913043478261, - "grad_norm": 1.7970623970031738, - "learning_rate": 7.4872512207362605e-06, - "loss": 0.2446, - "step": 2340 - }, - { - "epoch": 102.17391304347827, - "grad_norm": 1.792651653289795, - "learning_rate": 7.487207445923432e-06, - "loss": 0.2934, - "step": 2350 - }, - { - "epoch": 102.6086956521739, - "grad_norm": 2.1051220893859863, - "learning_rate": 7.487162187398039e-06, - "loss": 0.2844, - "step": 2360 - }, - { - "epoch": 103.04347826086956, - "grad_norm": 1.9311975240707397, - "learning_rate": 7.487115445178019e-06, - "loss": 0.2162, - "step": 2370 - }, - { - "epoch": 103.47826086956522, - "grad_norm": 2.12684965133667, - "learning_rate": 7.487067219281901e-06, - "loss": 0.2911, - "step": 2380 - }, - { - "epoch": 103.91304347826087, - "grad_norm": 2.0107476711273193, - "learning_rate": 7.4870175097287985e-06, - "loss": 0.2413, - "step": 2390 - }, - { - "epoch": 104.34782608695652, - "grad_norm": 1.9675108194351196, - "learning_rate": 7.486966316538416e-06, - "loss": 0.2557, - "step": 2400 - }, - { - "epoch": 104.34782608695652, - "eval_loss": 0.9136893153190613, - "eval_runtime": 0.4193, - "eval_samples_per_second": 23.849, - "eval_steps_per_second": 23.849, - "step": 2400 - }, - { - "Start_State_loss": 0.861186683177948, - "Start_State_runtime": 0.4057, - "Start_State_samples_per_second": 24.646, - "Start_State_steps_per_second": 24.646, - "epoch": 104.34782608695652, - "step": 2400 - }, - { - "Raw_Model_loss": 0.9136893153190613, - "Raw_Model_runtime": 0.4082, - "Raw_Model_samples_per_second": 24.497, - "Raw_Model_steps_per_second": 24.497, - "epoch": 104.34782608695652, - "step": 2400 - }, - { - "SWA_loss": 0.7567933797836304, - "SWA_runtime": 0.4029, - "SWA_samples_per_second": 24.818, - "SWA_steps_per_second": 24.818, - "epoch": 104.34782608695652, - "step": 2400 - }, - { - "EMA_loss": 0.8605263829231262, - "EMA_runtime": 0.4051, - "EMA_samples_per_second": 24.683, - "EMA_steps_per_second": 24.683, - "epoch": 104.34782608695652, - "step": 2400 - }, - { - "epoch": 104.78260869565217, - "grad_norm": 2.10827898979187, - "learning_rate": 7.486913639731043e-06, - "loss": 0.2495, - "step": 2410 - }, - { - "epoch": 105.21739130434783, - "grad_norm": 2.025355815887451, - "learning_rate": 7.48685947932756e-06, - "loss": 0.2637, - "step": 2420 - }, - { - "epoch": 105.65217391304348, - "grad_norm": 1.9276680946350098, - "learning_rate": 7.4868038353494355e-06, - "loss": 0.2603, - "step": 2430 - }, - { - "epoch": 106.08695652173913, - "grad_norm": 2.324167490005493, - "learning_rate": 7.486746707818724e-06, - "loss": 0.2141, - "step": 2440 - }, - { - "epoch": 106.52173913043478, - "grad_norm": 1.4006412029266357, - "learning_rate": 7.486688096758069e-06, - "loss": 0.2816, - "step": 2450 - }, - { - "epoch": 106.95652173913044, - "grad_norm": 1.8922216892242432, - "learning_rate": 7.486628002190702e-06, - "loss": 0.2444, - "step": 2460 - }, - { - "epoch": 107.3913043478261, - "grad_norm": 2.3611834049224854, - "learning_rate": 7.486566424140442e-06, - "loss": 0.3039, - "step": 2470 - }, - { - "epoch": 107.82608695652173, - "grad_norm": 2.2470717430114746, - "learning_rate": 7.486503362631699e-06, - "loss": 0.2188, - "step": 2480 - }, - { - "epoch": 108.26086956521739, - "grad_norm": 2.0604355335235596, - "learning_rate": 7.486438817689465e-06, - "loss": 0.2706, - "step": 2490 - }, - { - "epoch": 108.69565217391305, - "grad_norm": 1.6355359554290771, - "learning_rate": 7.486372789339326e-06, - "loss": 0.2454, - "step": 2500 - }, - { - "epoch": 109.1304347826087, - "grad_norm": 1.6156138181686401, - "learning_rate": 7.486305277607452e-06, - "loss": 0.2437, - "step": 2510 - }, - { - "epoch": 109.56521739130434, - "grad_norm": 1.3432440757751465, - "learning_rate": 7.486236282520606e-06, - "loss": 0.2309, - "step": 2520 - }, - { - "epoch": 110.0, - "grad_norm": 3.2272891998291016, - "learning_rate": 7.48616580410613e-06, - "loss": 0.2874, - "step": 2530 - }, - { - "epoch": 110.43478260869566, - "grad_norm": 1.7123788595199585, - "learning_rate": 7.486093842391963e-06, - "loss": 0.2452, - "step": 2540 - }, - { - "epoch": 110.8695652173913, - "grad_norm": 1.8407248258590698, - "learning_rate": 7.486020397406629e-06, - "loss": 0.2698, - "step": 2550 - }, - { - "epoch": 110.8695652173913, - "eval_loss": 0.9264782071113586, - "eval_runtime": 0.5916, - "eval_samples_per_second": 16.903, - "eval_steps_per_second": 16.903, - "step": 2550 - }, - { - "Start_State_loss": 0.861186683177948, - "Start_State_runtime": 0.4032, - "Start_State_samples_per_second": 24.805, - "Start_State_steps_per_second": 24.805, - "epoch": 110.8695652173913, - "step": 2550 - }, - { - "Raw_Model_loss": 0.9264782071113586, - "Raw_Model_runtime": 0.4014, - "Raw_Model_samples_per_second": 24.913, - "Raw_Model_steps_per_second": 24.913, - "epoch": 110.8695652173913, - "step": 2550 - }, - { - "SWA_loss": 0.759516716003418, - "SWA_runtime": 0.403, - "SWA_samples_per_second": 24.814, - "SWA_steps_per_second": 24.814, - "epoch": 110.8695652173913, - "step": 2550 - }, - { - "EMA_loss": 0.8597530126571655, - "EMA_runtime": 0.415, - "EMA_samples_per_second": 24.098, - "EMA_steps_per_second": 24.098, - "epoch": 110.8695652173913, - "step": 2550 - }, - { - "epoch": 111.30434782608695, - "grad_norm": 2.409362554550171, - "learning_rate": 7.485945469179237e-06, - "loss": 0.2816, - "step": 2560 - }, - { - "epoch": 111.73913043478261, - "grad_norm": 2.021090030670166, - "learning_rate": 7.485869057739486e-06, - "loss": 0.228, - "step": 2570 - }, - { - "epoch": 112.17391304347827, - "grad_norm": 2.0017611980438232, - "learning_rate": 7.485791163117665e-06, - "loss": 0.2461, - "step": 2580 - }, - { - "epoch": 112.6086956521739, - "grad_norm": 1.6572258472442627, - "learning_rate": 7.485711785344648e-06, - "loss": 0.2461, - "step": 2590 - }, - { - "epoch": 113.04347826086956, - "grad_norm": 2.1028172969818115, - "learning_rate": 7.485630924451897e-06, - "loss": 0.2658, - "step": 2600 - }, - { - "epoch": 113.47826086956522, - "grad_norm": 1.8281258344650269, - "learning_rate": 7.485548580471464e-06, - "loss": 0.2257, - "step": 2610 - }, - { - "epoch": 113.91304347826087, - "grad_norm": 2.1749765872955322, - "learning_rate": 7.485464753435987e-06, - "loss": 0.2756, - "step": 2620 - }, - { - "epoch": 114.34782608695652, - "grad_norm": 2.009671688079834, - "learning_rate": 7.485379443378693e-06, - "loss": 0.2447, - "step": 2630 - }, - { - "epoch": 114.78260869565217, - "grad_norm": 2.52178955078125, - "learning_rate": 7.485292650333394e-06, - "loss": 0.2289, - "step": 2640 - }, - { - "epoch": 115.21739130434783, - "grad_norm": 1.7975748777389526, - "learning_rate": 7.485204374334494e-06, - "loss": 0.2551, - "step": 2650 - }, - { - "epoch": 115.65217391304348, - "grad_norm": 1.9182255268096924, - "learning_rate": 7.485114615416982e-06, - "loss": 0.2721, - "step": 2660 - }, - { - "epoch": 116.08695652173913, - "grad_norm": 2.3910796642303467, - "learning_rate": 7.485023373616437e-06, - "loss": 0.2156, - "step": 2670 - }, - { - "epoch": 116.52173913043478, - "grad_norm": 2.55471134185791, - "learning_rate": 7.484930648969023e-06, - "loss": 0.2447, - "step": 2680 - }, - { - "epoch": 116.95652173913044, - "grad_norm": 1.5849785804748535, - "learning_rate": 7.484836441511492e-06, - "loss": 0.2441, - "step": 2690 - }, - { - "epoch": 117.3913043478261, - "grad_norm": 1.6347429752349854, - "learning_rate": 7.484740751281187e-06, - "loss": 0.2362, - "step": 2700 - }, - { - "epoch": 117.3913043478261, - "eval_loss": 0.9330541491508484, - "eval_runtime": 0.4063, - "eval_samples_per_second": 24.614, - "eval_steps_per_second": 24.614, - "step": 2700 - }, - { - "Start_State_loss": 0.861186683177948, - "Start_State_runtime": 0.4124, - "Start_State_samples_per_second": 24.246, - "Start_State_steps_per_second": 24.246, - "epoch": 117.3913043478261, - "step": 2700 - }, - { - "Raw_Model_loss": 0.9330541491508484, - "Raw_Model_runtime": 0.4024, - "Raw_Model_samples_per_second": 24.852, - "Raw_Model_steps_per_second": 24.852, - "epoch": 117.3913043478261, - "step": 2700 - }, - { - "SWA_loss": 0.7649438977241516, - "SWA_runtime": 0.4267, - "SWA_samples_per_second": 23.437, - "SWA_steps_per_second": 23.437, - "epoch": 117.3913043478261, - "step": 2700 - }, - { - "EMA_loss": 0.8608808517456055, - "EMA_runtime": 0.4102, - "EMA_samples_per_second": 24.381, - "EMA_steps_per_second": 24.381, - "epoch": 117.3913043478261, - "step": 2700 - }, - { - "epoch": 117.82608695652173, - "grad_norm": 3.383168935775757, - "learning_rate": 3.9100869016093776e-07, - "loss": 0.2406, - "step": 2710 - }, - { - "epoch": 118.26086956521739, - "grad_norm": 1.8521851301193237, - "learning_rate": 7.820173803218755e-07, - "loss": 0.2516, - "step": 2720 - }, - { - "epoch": 118.69565217391305, - "grad_norm": 2.2895267009735107, - "learning_rate": 1.1730260704828132e-06, - "loss": 0.2411, - "step": 2730 - }, - { - "epoch": 119.1304347826087, - "grad_norm": 2.308509349822998, - "learning_rate": 1.564034760643751e-06, - "loss": 0.2601, - "step": 2740 - }, - { - "epoch": 119.56521739130434, - "grad_norm": 2.1916897296905518, - "learning_rate": 1.9550434508046887e-06, - "loss": 0.2596, - "step": 2750 - }, - { - "epoch": 120.0, - "grad_norm": 4.756087779998779, - "learning_rate": 2.3460521409656263e-06, - "loss": 0.2188, - "step": 2760 - }, - { - "epoch": 120.43478260869566, - "grad_norm": 1.9395239353179932, - "learning_rate": 2.737060831126564e-06, - "loss": 0.2546, - "step": 2770 - }, - { - "epoch": 120.8695652173913, - "grad_norm": 1.7763605117797852, - "learning_rate": 3.128069521287502e-06, - "loss": 0.219, - "step": 2780 - }, - { - "epoch": 121.30434782608695, - "grad_norm": 1.3847345113754272, - "learning_rate": 3.5190782114484397e-06, - "loss": 0.2731, - "step": 2790 - }, - { - "epoch": 121.73913043478261, - "grad_norm": 2.002622365951538, - "learning_rate": 3.910086901609377e-06, - "loss": 0.2701, - "step": 2800 - }, - { - "epoch": 122.17391304347827, - "grad_norm": 2.062542200088501, - "learning_rate": 3.910086514150998e-06, - "loss": 0.2201, - "step": 2810 - }, - { - "epoch": 122.6086956521739, - "grad_norm": 1.7233972549438477, - "learning_rate": 3.910085351776012e-06, - "loss": 0.2632, - "step": 2820 - }, - { - "epoch": 123.04347826086956, - "grad_norm": 1.894426703453064, - "learning_rate": 3.91008341448488e-06, - "loss": 0.2512, - "step": 2830 - }, - { - "epoch": 123.47826086956522, - "grad_norm": 2.2210731506347656, - "learning_rate": 3.910080702278371e-06, - "loss": 0.2425, - "step": 2840 - }, - { - "epoch": 123.91304347826087, - "grad_norm": 2.2751896381378174, - "learning_rate": 3.91007721515756e-06, - "loss": 0.2389, - "step": 2850 - }, - { - "epoch": 123.91304347826087, - "eval_loss": 0.9399979710578918, - "eval_runtime": 0.5741, - "eval_samples_per_second": 17.419, - "eval_steps_per_second": 17.419, - "step": 2850 - }, - { - "Start_State_loss": 0.861186683177948, - "Start_State_runtime": 0.5559, - "Start_State_samples_per_second": 17.99, - "Start_State_steps_per_second": 17.99, - "epoch": 123.91304347826087, - "step": 2850 - }, - { - "Raw_Model_loss": 0.9399979710578918, - "Raw_Model_runtime": 0.5508, - "Raw_Model_samples_per_second": 18.155, - "Raw_Model_steps_per_second": 18.155, - "epoch": 123.91304347826087, - "step": 2850 - }, - { - "SWA_loss": 0.7678512334823608, - "SWA_runtime": 0.554, - "SWA_samples_per_second": 18.049, - "SWA_steps_per_second": 18.049, - "epoch": 123.91304347826087, - "step": 2850 - }, - { - "EMA_loss": 0.8602371215820312, - "EMA_runtime": 0.5253, - "EMA_samples_per_second": 19.037, - "EMA_steps_per_second": 19.037, - "epoch": 123.91304347826087, - "step": 2850 - }, - { - "epoch": 124.34782608695652, - "grad_norm": 1.6300952434539795, - "learning_rate": 3.910072953123827e-06, - "loss": 0.2256, - "step": 2860 - }, - { - "epoch": 124.78260869565217, - "grad_norm": 1.5945820808410645, - "learning_rate": 3.910067916178865e-06, - "loss": 0.2304, - "step": 2870 - }, - { - "epoch": 125.21739130434783, - "grad_norm": 2.0118942260742188, - "learning_rate": 3.9100621043246675e-06, - "loss": 0.2693, - "step": 2880 - }, - { - "epoch": 125.65217391304348, - "grad_norm": 2.1449036598205566, - "learning_rate": 3.910055517563539e-06, - "loss": 0.2454, - "step": 2890 - }, - { - "epoch": 126.08695652173913, - "grad_norm": 2.3814568519592285, - "learning_rate": 3.9100481558980905e-06, - "loss": 0.2517, - "step": 2900 - }, - { - "epoch": 126.52173913043478, - "grad_norm": 1.680646300315857, - "learning_rate": 3.91004001933124e-06, - "loss": 0.2023, - "step": 2910 - }, - { - "epoch": 126.95652173913044, - "grad_norm": 1.567590355873108, - "learning_rate": 3.9100311078662124e-06, - "loss": 0.2903, - "step": 2920 - }, - { - "epoch": 127.3913043478261, - "grad_norm": 2.0478575229644775, - "learning_rate": 3.9100214215065405e-06, - "loss": 0.2554, - "step": 2930 - }, - { - "epoch": 127.82608695652173, - "grad_norm": 2.724403142929077, - "learning_rate": 3.910010960256062e-06, - "loss": 0.2195, - "step": 2940 - }, - { - "epoch": 128.2608695652174, - "grad_norm": 2.3156094551086426, - "learning_rate": 3.909999724118925e-06, - "loss": 0.2952, - "step": 2950 - }, - { - "epoch": 128.69565217391303, - "grad_norm": 1.6324609518051147, - "learning_rate": 3.909987713099583e-06, - "loss": 0.2409, - "step": 2960 - }, - { - "epoch": 129.1304347826087, - "grad_norm": 1.66539466381073, - "learning_rate": 3.909974927202796e-06, - "loss": 0.2029, - "step": 2970 - }, - { - "epoch": 129.56521739130434, - "grad_norm": 1.877989649772644, - "learning_rate": 3.909961366433632e-06, - "loss": 0.2407, - "step": 2980 - }, - { - "epoch": 130.0, - "grad_norm": 5.461711406707764, - "learning_rate": 3.909947030797467e-06, - "loss": 0.2466, - "step": 2990 - }, - { - "epoch": 130.43478260869566, - "grad_norm": 2.4120867252349854, - "learning_rate": 3.909931920299982e-06, - "loss": 0.2372, - "step": 3000 - }, - { - "epoch": 130.43478260869566, - "eval_loss": 0.9418841600418091, - "eval_runtime": 0.4153, - "eval_samples_per_second": 24.08, - "eval_steps_per_second": 24.08, - "step": 3000 - }, - { - "Start_State_loss": 0.861186683177948, - "Start_State_runtime": 0.4056, - "Start_State_samples_per_second": 24.655, - "Start_State_steps_per_second": 24.655, - "epoch": 130.43478260869566, - "step": 3000 - }, - { - "Raw_Model_loss": 0.9418841600418091, - "Raw_Model_runtime": 0.4028, - "Raw_Model_samples_per_second": 24.829, - "Raw_Model_steps_per_second": 24.829, - "epoch": 130.43478260869566, - "step": 3000 - }, - { - "SWA_loss": 0.7738855481147766, - "SWA_runtime": 0.4063, - "SWA_samples_per_second": 24.613, - "SWA_steps_per_second": 24.613, - "epoch": 130.43478260869566, - "step": 3000 - }, - { - "EMA_loss": 0.8603588938713074, - "EMA_runtime": 0.4125, - "EMA_samples_per_second": 24.244, - "EMA_steps_per_second": 24.244, - "epoch": 130.43478260869566, - "step": 3000 - }, - { - "epoch": 130.8695652173913, - "grad_norm": 2.050870418548584, - "learning_rate": 3.9099160349471675e-06, - "loss": 0.23, - "step": 3010 - }, - { - "epoch": 131.30434782608697, - "grad_norm": 1.7972759008407593, - "learning_rate": 3.90989937474532e-06, - "loss": 0.2704, - "step": 3020 - }, - { - "epoch": 131.7391304347826, - "grad_norm": 1.958837628364563, - "learning_rate": 3.909881939701041e-06, - "loss": 0.2614, - "step": 3030 - }, - { - "epoch": 132.17391304347825, - "grad_norm": 1.825850486755371, - "learning_rate": 3.909863729821243e-06, - "loss": 0.2269, - "step": 3040 - }, - { - "epoch": 132.6086956521739, - "grad_norm": 2.1669623851776123, - "learning_rate": 3.9098447451131435e-06, - "loss": 0.2528, - "step": 3050 - }, - { - "epoch": 133.04347826086956, - "grad_norm": 2.685922622680664, - "learning_rate": 3.909824985584268e-06, - "loss": 0.215, - "step": 3060 - }, - { - "epoch": 133.47826086956522, - "grad_norm": 1.285071611404419, - "learning_rate": 3.9098044512424475e-06, - "loss": 0.2484, - "step": 3070 - }, - { - "epoch": 133.91304347826087, - "grad_norm": 2.4123470783233643, - "learning_rate": 3.909783142095821e-06, - "loss": 0.2733, - "step": 3080 - }, - { - "epoch": 134.34782608695653, - "grad_norm": 1.9801201820373535, - "learning_rate": 3.909761058152836e-06, - "loss": 0.2539, - "step": 3090 - }, - { - "epoch": 134.7826086956522, - "grad_norm": 1.934043049812317, - "learning_rate": 3.9097381994222444e-06, - "loss": 0.206, - "step": 3100 - }, - { - "epoch": 135.2173913043478, - "grad_norm": 2.4174482822418213, - "learning_rate": 3.9097145659131085e-06, - "loss": 0.244, - "step": 3110 - }, - { - "epoch": 135.65217391304347, - "grad_norm": 1.85491943359375, - "learning_rate": 3.909690157634794e-06, - "loss": 0.2852, - "step": 3120 - }, - { - "epoch": 136.08695652173913, - "grad_norm": 2.3516900539398193, - "learning_rate": 3.909664974596977e-06, - "loss": 0.2128, - "step": 3130 - }, - { - "epoch": 136.52173913043478, - "grad_norm": 2.355637788772583, - "learning_rate": 3.909639016809639e-06, - "loss": 0.2381, - "step": 3140 - }, - { - "epoch": 136.95652173913044, - "grad_norm": 2.8338263034820557, - "learning_rate": 3.909612284283068e-06, - "loss": 0.2338, - "step": 3150 - }, - { - "epoch": 136.95652173913044, - "eval_loss": 0.9423562288284302, - "eval_runtime": 0.4463, - "eval_samples_per_second": 22.407, - "eval_steps_per_second": 22.407, - "step": 3150 - }, - { - "Start_State_loss": 0.861186683177948, - "Start_State_runtime": 0.394, - "Start_State_samples_per_second": 25.38, - "Start_State_steps_per_second": 25.38, - "epoch": 136.95652173913044, - "step": 3150 - }, - { - "Raw_Model_loss": 0.9423562288284302, - "Raw_Model_runtime": 0.4021, - "Raw_Model_samples_per_second": 24.868, - "Raw_Model_steps_per_second": 24.868, - "epoch": 136.95652173913044, - "step": 3150 - }, - { - "SWA_loss": 0.7764584422111511, - "SWA_runtime": 0.4066, - "SWA_samples_per_second": 24.596, - "SWA_steps_per_second": 24.596, - "epoch": 136.95652173913044, - "step": 3150 - }, - { - "EMA_loss": 0.861250102519989, - "EMA_runtime": 0.391, - "EMA_samples_per_second": 25.577, - "EMA_steps_per_second": 25.577, - "epoch": 136.95652173913044, - "step": 3150 - }, - { - "epoch": 137.3913043478261, - "grad_norm": 1.8168175220489502, - "learning_rate": 3.90958477702786e-06, - "loss": 0.2772, - "step": 3160 - }, - { - "epoch": 137.82608695652175, - "grad_norm": 1.9455727338790894, - "learning_rate": 3.909556495054918e-06, - "loss": 0.235, - "step": 3170 - }, - { - "epoch": 138.2608695652174, - "grad_norm": 1.652616024017334, - "learning_rate": 3.9095274383754535e-06, - "loss": 0.2271, - "step": 3180 - }, - { - "epoch": 138.69565217391303, - "grad_norm": 2.0651702880859375, - "learning_rate": 3.9094976070009825e-06, - "loss": 0.275, - "step": 3190 - }, - { - "epoch": 139.1304347826087, - "grad_norm": 1.3532943725585938, - "learning_rate": 3.90946700094333e-06, - "loss": 0.2252, - "step": 3200 - }, - { - "epoch": 139.56521739130434, - "grad_norm": 1.9652019739151, - "learning_rate": 3.909435620214626e-06, - "loss": 0.2701, - "step": 3210 - }, - { - "epoch": 140.0, - "grad_norm": 2.921708106994629, - "learning_rate": 3.909403464827308e-06, - "loss": 0.2301, - "step": 3220 - }, - { - "epoch": 140.43478260869566, - "grad_norm": 2.249617576599121, - "learning_rate": 3.909370534794125e-06, - "loss": 0.2311, - "step": 3230 - }, - { - "epoch": 140.8695652173913, - "grad_norm": 1.7195583581924438, - "learning_rate": 3.9093368301281256e-06, - "loss": 0.238, - "step": 3240 - }, - { - "epoch": 141.30434782608697, - "grad_norm": 1.9796561002731323, - "learning_rate": 3.909302350842671e-06, - "loss": 0.2368, - "step": 3250 - }, - { - "epoch": 141.7391304347826, - "grad_norm": 2.028313636779785, - "learning_rate": 3.909267096951428e-06, - "loss": 0.2492, - "step": 3260 - }, - { - "epoch": 142.17391304347825, - "grad_norm": 1.7353265285491943, - "learning_rate": 3.90923106846837e-06, - "loss": 0.2578, - "step": 3270 - }, - { - "epoch": 142.6086956521739, - "grad_norm": 2.5716023445129395, - "learning_rate": 3.9091942654077765e-06, - "loss": 0.2606, - "step": 3280 - }, - { - "epoch": 143.04347826086956, - "grad_norm": 1.2427494525909424, - "learning_rate": 3.9091566877842355e-06, - "loss": 0.2151, - "step": 3290 - }, - { - "epoch": 143.47826086956522, - "grad_norm": 1.797579288482666, - "learning_rate": 3.9091183356126425e-06, - "loss": 0.2476, - "step": 3300 - }, - { - "epoch": 143.47826086956522, - "eval_loss": 0.9522649049758911, - "eval_runtime": 0.4144, - "eval_samples_per_second": 24.129, - "eval_steps_per_second": 24.129, - "step": 3300 - }, - { - "Start_State_loss": 0.861186683177948, - "Start_State_runtime": 0.4101, - "Start_State_samples_per_second": 24.382, - "Start_State_steps_per_second": 24.382, - "epoch": 143.47826086956522, - "step": 3300 - }, - { - "Raw_Model_loss": 0.9522649049758911, - "Raw_Model_runtime": 0.4058, - "Raw_Model_samples_per_second": 24.644, - "Raw_Model_steps_per_second": 24.644, - "epoch": 143.47826086956522, - "step": 3300 - }, - { - "SWA_loss": 0.7822953462600708, - "SWA_runtime": 0.4238, - "SWA_samples_per_second": 23.595, - "SWA_steps_per_second": 23.595, - "epoch": 143.47826086956522, - "step": 3300 - }, - { - "EMA_loss": 0.8601328134536743, - "EMA_runtime": 0.4018, - "EMA_samples_per_second": 24.887, - "EMA_steps_per_second": 24.887, - "epoch": 143.47826086956522, - "step": 3300 - }, - { - "epoch": 143.91304347826087, - "grad_norm": 1.9059531688690186, - "learning_rate": 3.909079208908198e-06, - "loss": 0.2535, - "step": 3310 - }, - { - "epoch": 144.34782608695653, - "grad_norm": 1.500815510749817, - "learning_rate": 3.909039307686411e-06, - "loss": 0.2141, - "step": 3320 - }, - { - "epoch": 144.7826086956522, - "grad_norm": 1.7481781244277954, - "learning_rate": 3.908998631963098e-06, - "loss": 0.2706, - "step": 3330 - }, - { - "epoch": 145.2173913043478, - "grad_norm": 2.9067223072052, - "learning_rate": 3.908957181754379e-06, - "loss": 0.2078, - "step": 3340 - }, - { - "epoch": 145.65217391304347, - "grad_norm": 1.8537293672561646, - "learning_rate": 3.908914957076686e-06, - "loss": 0.2382, - "step": 3350 - }, - { - "epoch": 146.08695652173913, - "grad_norm": 2.053541421890259, - "learning_rate": 3.908871957946754e-06, - "loss": 0.265, - "step": 3360 - }, - { - "epoch": 146.52173913043478, - "grad_norm": 2.026669979095459, - "learning_rate": 3.908828184381628e-06, - "loss": 0.2265, - "step": 3370 - }, - { - "epoch": 146.95652173913044, - "grad_norm": 1.6259890794754028, - "learning_rate": 3.908783636398657e-06, - "loss": 0.2153, - "step": 3380 - }, - { - "epoch": 147.3913043478261, - "grad_norm": 1.7665131092071533, - "learning_rate": 3.908738314015499e-06, - "loss": 0.2287, - "step": 3390 - }, - { - "epoch": 147.82608695652175, - "grad_norm": 1.5578436851501465, - "learning_rate": 3.908692217250118e-06, - "loss": 0.2535, - "step": 3400 - }, - { - "epoch": 148.2608695652174, - "grad_norm": 1.5355435609817505, - "learning_rate": 3.908645346120786e-06, - "loss": 0.2154, - "step": 3410 - }, - { - "epoch": 148.69565217391303, - "grad_norm": 1.8538081645965576, - "learning_rate": 3.908597700646081e-06, - "loss": 0.2498, - "step": 3420 - }, - { - "epoch": 149.1304347826087, - "grad_norm": 1.8780725002288818, - "learning_rate": 3.908549280844888e-06, - "loss": 0.2714, - "step": 3430 - }, - { - "epoch": 149.56521739130434, - "grad_norm": 2.210402250289917, - "learning_rate": 3.908500086736398e-06, - "loss": 0.2647, - "step": 3440 - }, - { - "epoch": 150.0, - "grad_norm": 3.242107391357422, - "learning_rate": 3.908450118340112e-06, - "loss": 0.2203, - "step": 3450 - }, - { - "epoch": 150.0, - "eval_loss": 0.9529827237129211, - "eval_runtime": 0.4227, - "eval_samples_per_second": 23.657, - "eval_steps_per_second": 23.657, - "step": 3450 - }, - { - "Start_State_loss": 0.861186683177948, - "Start_State_runtime": 0.4313, - "Start_State_samples_per_second": 23.184, - "Start_State_steps_per_second": 23.184, - "epoch": 150.0, - "step": 3450 - }, - { - "Raw_Model_loss": 0.9529827237129211, - "Raw_Model_runtime": 0.4191, - "Raw_Model_samples_per_second": 23.858, - "Raw_Model_steps_per_second": 23.858, - "epoch": 150.0, - "step": 3450 - }, - { - "SWA_loss": 0.7833188772201538, - "SWA_runtime": 0.4121, - "SWA_samples_per_second": 24.265, - "SWA_steps_per_second": 24.265, - "epoch": 150.0, - "step": 3450 - }, - { - "EMA_loss": 0.8595975637435913, - "EMA_runtime": 0.4017, - "EMA_samples_per_second": 24.892, - "EMA_steps_per_second": 24.892, - "epoch": 150.0, - "step": 3450 - }, - { - "epoch": 150.43478260869566, - "grad_norm": 1.7044988870620728, - "learning_rate": 2.513945738151511e-07, - "loss": 0.2135, - "step": 3460 - }, - { - "epoch": 150.8695652173913, - "grad_norm": 2.001293897628784, - "learning_rate": 5.027891476303022e-07, - "loss": 0.2623, - "step": 3470 - }, - { - "epoch": 151.30434782608697, - "grad_norm": 1.6400986909866333, - "learning_rate": 7.541837214454532e-07, - "loss": 0.1997, - "step": 3480 - }, - { - "epoch": 151.7391304347826, - "grad_norm": 2.337966203689575, - "learning_rate": 1.0055782952606044e-06, - "loss": 0.2472, - "step": 3490 - }, - { - "epoch": 152.17391304347825, - "grad_norm": 2.081322431564331, - "learning_rate": 1.2569728690757554e-06, - "loss": 0.2426, - "step": 3500 - }, - { - "epoch": 152.6086956521739, - "grad_norm": 1.6173598766326904, - "learning_rate": 1.5083674428909064e-06, - "loss": 0.2398, - "step": 3510 - }, - { - "epoch": 153.04347826086956, - "grad_norm": 1.571141004562378, - "learning_rate": 1.7597620167060574e-06, - "loss": 0.2069, - "step": 3520 - }, - { - "epoch": 153.47826086956522, - "grad_norm": 2.327928066253662, - "learning_rate": 2.011156590521209e-06, - "loss": 0.2502, - "step": 3530 - }, - { - "epoch": 153.91304347826087, - "grad_norm": 2.673839807510376, - "learning_rate": 2.2625511643363598e-06, - "loss": 0.232, - "step": 3540 - }, - { - "epoch": 154.34782608695653, - "grad_norm": 2.2869648933410645, - "learning_rate": 2.5139457381515108e-06, - "loss": 0.2399, - "step": 3550 - }, - { - "epoch": 154.7826086956522, - "grad_norm": 2.043811798095703, - "learning_rate": 2.5139454890395686e-06, - "loss": 0.2345, - "step": 3560 - }, - { - "epoch": 155.2173913043478, - "grad_norm": 1.682305932044983, - "learning_rate": 2.51394474170384e-06, - "loss": 0.1958, - "step": 3570 - }, - { - "epoch": 155.65217391304347, - "grad_norm": 2.0729916095733643, - "learning_rate": 2.5139434961446224e-06, - "loss": 0.2663, - "step": 3580 - }, - { - "epoch": 156.08695652173913, - "grad_norm": 1.6533286571502686, - "learning_rate": 2.513941752362408e-06, - "loss": 0.2031, - "step": 3590 - }, - { - "epoch": 156.52173913043478, - "grad_norm": 2.51108980178833, - "learning_rate": 2.5139395103578894e-06, - "loss": 0.2679, - "step": 3600 - }, - { - "epoch": 156.52173913043478, - "eval_loss": 0.9608185887336731, - "eval_runtime": 0.4253, - "eval_samples_per_second": 23.515, - "eval_steps_per_second": 23.515, - "step": 3600 - }, - { - "Start_State_loss": 0.861186683177948, - "Start_State_runtime": 0.413, - "Start_State_samples_per_second": 24.215, - "Start_State_steps_per_second": 24.215, - "epoch": 156.52173913043478, - "step": 3600 - }, - { - "Raw_Model_loss": 0.9608185887336731, - "Raw_Model_runtime": 0.419, - "Raw_Model_samples_per_second": 23.864, - "Raw_Model_steps_per_second": 23.864, - "epoch": 156.52173913043478, - "step": 3600 - }, - { - "SWA_loss": 0.7903212308883667, - "SWA_runtime": 0.4071, - "SWA_samples_per_second": 24.562, - "SWA_steps_per_second": 24.562, - "epoch": 156.52173913043478, - "step": 3600 - }, - { - "EMA_loss": 0.8596304059028625, - "EMA_runtime": 0.4003, - "EMA_samples_per_second": 24.982, - "EMA_steps_per_second": 24.982, - "epoch": 156.52173913043478, - "step": 3600 - }, - { - "epoch": 156.95652173913044, - "grad_norm": 2.5946836471557617, - "learning_rate": 2.513936770131954e-06, - "loss": 0.1973, - "step": 3610 - }, - { - "epoch": 157.3913043478261, - "grad_norm": 1.8816180229187012, - "learning_rate": 2.5139335316856892e-06, - "loss": 0.2551, - "step": 3620 - }, - { - "epoch": 157.82608695652175, - "grad_norm": 1.969436764717102, - "learning_rate": 2.5139297950203775e-06, - "loss": 0.2349, - "step": 3630 - }, - { - "epoch": 158.2608695652174, - "grad_norm": 2.1921560764312744, - "learning_rate": 2.5139255601375007e-06, - "loss": 0.2243, - "step": 3640 - }, - { - "epoch": 158.69565217391303, - "grad_norm": 3.598989725112915, - "learning_rate": 2.513920827038737e-06, - "loss": 0.2276, - "step": 3650 - }, - { - "epoch": 159.1304347826087, - "grad_norm": 2.583705186843872, - "learning_rate": 2.513915595725963e-06, - "loss": 0.2528, - "step": 3660 - }, - { - "epoch": 159.56521739130434, - "grad_norm": 1.8946772813796997, - "learning_rate": 2.5139098662012514e-06, - "loss": 0.2368, - "step": 3670 - }, - { - "epoch": 160.0, - "grad_norm": 2.685317039489746, - "learning_rate": 2.513903638466874e-06, - "loss": 0.2026, - "step": 3680 - }, - { - "epoch": 160.43478260869566, - "grad_norm": 1.9969098567962646, - "learning_rate": 2.5138969125252985e-06, - "loss": 0.228, - "step": 3690 - }, - { - "epoch": 160.8695652173913, - "grad_norm": 1.5398179292678833, - "learning_rate": 2.5138896883791913e-06, - "loss": 0.2437, - "step": 3700 - }, - { - "epoch": 161.30434782608697, - "grad_norm": 1.6144198179244995, - "learning_rate": 2.5138819660314154e-06, - "loss": 0.2764, - "step": 3710 - }, - { - "epoch": 161.7391304347826, - "grad_norm": 2.053276777267456, - "learning_rate": 2.513873745485033e-06, - "loss": 0.2278, - "step": 3720 - }, - { - "epoch": 162.17391304347825, - "grad_norm": 2.3131282329559326, - "learning_rate": 2.513865026743301e-06, - "loss": 0.2157, - "step": 3730 - }, - { - "epoch": 162.6086956521739, - "grad_norm": 2.0463197231292725, - "learning_rate": 2.5138558098096753e-06, - "loss": 0.2233, - "step": 3740 - }, - { - "epoch": 163.04347826086956, - "grad_norm": 2.3754689693450928, - "learning_rate": 2.51384609468781e-06, - "loss": 0.2231, - "step": 3750 - }, - { - "epoch": 163.04347826086956, - "eval_loss": 0.9596047401428223, - "eval_runtime": 0.4563, - "eval_samples_per_second": 21.916, - "eval_steps_per_second": 21.916, - "step": 3750 - }, - { - "Start_State_loss": 0.861186683177948, - "Start_State_runtime": 0.456, - "Start_State_samples_per_second": 21.93, - "Start_State_steps_per_second": 21.93, - "epoch": 163.04347826086956, - "step": 3750 - }, - { - "Raw_Model_loss": 0.9596047401428223, - "Raw_Model_runtime": 0.4822, - "Raw_Model_samples_per_second": 20.737, - "Raw_Model_steps_per_second": 20.737, - "epoch": 163.04347826086956, - "step": 3750 - }, - { - "SWA_loss": 0.7939289808273315, - "SWA_runtime": 0.4295, - "SWA_samples_per_second": 23.281, - "SWA_steps_per_second": 23.281, - "epoch": 163.04347826086956, - "step": 3750 - }, - { - "EMA_loss": 0.8596266508102417, - "EMA_runtime": 0.4196, - "EMA_samples_per_second": 23.833, - "EMA_steps_per_second": 23.833, - "epoch": 163.04347826086956, - "step": 3750 - }, - { - "epoch": 163.47826086956522, - "grad_norm": 1.9908124208450317, - "learning_rate": 2.5138358813815557e-06, - "loss": 0.2365, - "step": 3760 - }, - { - "epoch": 163.91304347826087, - "grad_norm": 2.154146194458008, - "learning_rate": 2.5138251698949603e-06, - "loss": 0.2379, - "step": 3770 - }, - { - "epoch": 164.34782608695653, - "grad_norm": 1.2993087768554688, - "learning_rate": 2.5138139602322698e-06, - "loss": 0.2233, - "step": 3780 - }, - { - "epoch": 164.7826086956522, - "grad_norm": 2.012446641921997, - "learning_rate": 2.513802252397927e-06, - "loss": 0.2327, - "step": 3790 - }, - { - "epoch": 165.2173913043478, - "grad_norm": 2.131314277648926, - "learning_rate": 2.513790046396573e-06, - "loss": 0.261, - "step": 3800 - }, - { - "epoch": 165.65217391304347, - "grad_norm": 1.9400966167449951, - "learning_rate": 2.5137773422330448e-06, - "loss": 0.2375, - "step": 3810 - }, - { - "epoch": 166.08695652173913, - "grad_norm": 2.3741443157196045, - "learning_rate": 2.5137641399123794e-06, - "loss": 0.1815, - "step": 3820 - }, - { - "epoch": 166.52173913043478, - "grad_norm": 2.0041890144348145, - "learning_rate": 2.5137504394398086e-06, - "loss": 0.2567, - "step": 3830 - }, - { - "epoch": 166.95652173913044, - "grad_norm": 1.6475896835327148, - "learning_rate": 2.5137362408207634e-06, - "loss": 0.2357, - "step": 3840 - }, - { - "epoch": 167.3913043478261, - "grad_norm": 2.971357583999634, - "learning_rate": 2.5137215440608716e-06, - "loss": 0.2192, - "step": 3850 - }, - { - "epoch": 167.82608695652175, - "grad_norm": 3.0954818725585938, - "learning_rate": 2.5137063491659585e-06, - "loss": 0.218, - "step": 3860 - }, - { - "epoch": 168.2608695652174, - "grad_norm": 2.0101890563964844, - "learning_rate": 2.513690656142047e-06, - "loss": 0.2871, - "step": 3870 - }, - { - "epoch": 168.69565217391303, - "grad_norm": 1.8766013383865356, - "learning_rate": 2.513674464995357e-06, - "loss": 0.2125, - "step": 3880 - }, - { - "epoch": 169.1304347826087, - "grad_norm": 2.223860025405884, - "learning_rate": 2.5136577757323066e-06, - "loss": 0.1979, - "step": 3890 - }, - { - "epoch": 169.56521739130434, - "grad_norm": 1.864652395248413, - "learning_rate": 2.5136405883595107e-06, - "loss": 0.2454, - "step": 3900 - }, - { - "epoch": 169.56521739130434, - "eval_loss": 0.967199444770813, - "eval_runtime": 0.4027, - "eval_samples_per_second": 24.832, - "eval_steps_per_second": 24.832, - "step": 3900 - }, - { - "Start_State_loss": 0.861186683177948, - "Start_State_runtime": 0.4115, - "Start_State_samples_per_second": 24.298, - "Start_State_steps_per_second": 24.298, - "epoch": 169.56521739130434, - "step": 3900 - }, - { - "Raw_Model_loss": 0.967199444770813, - "Raw_Model_runtime": 0.4276, - "Raw_Model_samples_per_second": 23.385, - "Raw_Model_steps_per_second": 23.385, - "epoch": 169.56521739130434, - "step": 3900 - }, - { - "SWA_loss": 0.7983330488204956, - "SWA_runtime": 0.4151, - "SWA_samples_per_second": 24.09, - "SWA_steps_per_second": 24.09, - "epoch": 169.56521739130434, - "step": 3900 - }, - { - "EMA_loss": 0.8598569631576538, - "EMA_runtime": 0.4151, - "EMA_samples_per_second": 24.093, - "EMA_steps_per_second": 24.093, - "epoch": 169.56521739130434, - "step": 3900 - }, - { - "epoch": 170.0, - "grad_norm": 3.2512736320495605, - "learning_rate": 2.5136229028837813e-06, - "loss": 0.2311, - "step": 3910 - }, - { - "epoch": 170.43478260869566, - "grad_norm": 1.862411618232727, - "learning_rate": 2.5136047193121285e-06, - "loss": 0.2351, - "step": 3920 - }, - { - "epoch": 170.8695652173913, - "grad_norm": 2.6634721755981445, - "learning_rate": 2.513586037651761e-06, - "loss": 0.2343, - "step": 3930 - }, - { - "epoch": 171.30434782608697, - "grad_norm": 2.177884340286255, - "learning_rate": 2.5135668579100817e-06, - "loss": 0.2296, - "step": 3940 - }, - { - "epoch": 171.7391304347826, - "grad_norm": 1.8351444005966187, - "learning_rate": 2.5135471800946947e-06, - "loss": 0.2501, - "step": 3950 - }, - { - "epoch": 172.17391304347825, - "grad_norm": 2.0342533588409424, - "learning_rate": 2.513527004213398e-06, - "loss": 0.2407, - "step": 3960 - }, - { - "epoch": 172.6086956521739, - "grad_norm": 2.0113251209259033, - "learning_rate": 2.5135063302741893e-06, - "loss": 0.234, - "step": 3970 - }, - { - "epoch": 173.04347826086956, - "grad_norm": 1.9626580476760864, - "learning_rate": 2.5134851582852637e-06, - "loss": 0.2137, - "step": 3980 - }, - { - "epoch": 173.47826086956522, - "grad_norm": 2.3283474445343018, - "learning_rate": 2.5134634882550122e-06, - "loss": 0.2388, - "step": 3990 - }, - { - "epoch": 173.91304347826087, - "grad_norm": 2.082240104675293, - "learning_rate": 2.5134413201920244e-06, - "loss": 0.2468, - "step": 4000 - }, - { - "epoch": 174.34782608695653, - "grad_norm": 2.389084815979004, - "learning_rate": 2.513418654105087e-06, - "loss": 0.2283, - "step": 4010 - }, - { - "epoch": 174.7826086956522, - "grad_norm": 2.385908365249634, - "learning_rate": 2.5133954900031847e-06, - "loss": 0.2295, - "step": 4020 - }, - { - "epoch": 175.2173913043478, - "grad_norm": 2.003931760787964, - "learning_rate": 2.513371827895498e-06, - "loss": 0.2329, - "step": 4030 - }, - { - "epoch": 175.65217391304347, - "grad_norm": 2.265186071395874, - "learning_rate": 2.5133476677914065e-06, - "loss": 0.2145, - "step": 4040 - }, - { - "epoch": 176.08695652173913, - "grad_norm": 1.8546191453933716, - "learning_rate": 2.5133230097004866e-06, - "loss": 0.2419, - "step": 4050 - }, - { - "epoch": 176.08695652173913, - "eval_loss": 0.9715728759765625, - "eval_runtime": 0.4124, - "eval_samples_per_second": 24.249, - "eval_steps_per_second": 24.249, - "step": 4050 - }, - { - "Start_State_loss": 0.861186683177948, - "Start_State_runtime": 0.4406, - "Start_State_samples_per_second": 22.694, - "Start_State_steps_per_second": 22.694, - "epoch": 176.08695652173913, - "step": 4050 - }, - { - "Raw_Model_loss": 0.9715728759765625, - "Raw_Model_runtime": 0.5139, - "Raw_Model_samples_per_second": 19.459, - "Raw_Model_steps_per_second": 19.459, - "epoch": 176.08695652173913, - "step": 4050 - }, - { - "SWA_loss": 0.800355076789856, - "SWA_runtime": 0.4756, - "SWA_samples_per_second": 21.028, - "SWA_steps_per_second": 21.028, - "epoch": 176.08695652173913, - "step": 4050 - }, - { - "EMA_loss": 0.8603526949882507, - "EMA_runtime": 0.4374, - "EMA_samples_per_second": 22.861, - "EMA_steps_per_second": 22.861, - "epoch": 176.08695652173913, - "step": 4050 } ], "logging_steps": 10, @@ -3941,7 +139,7 @@ "attributes": {} } }, - "total_flos": 1.0440404596622131e+17, + "total_flos": 3894839614291968.0, "train_batch_size": 4, "trial_name": null, "trial_params": null