Training in progress, step 700, checkpoint

Browse files

Files changed (5) hide show

last-checkpoint/model.safetensors +1 -1
last-checkpoint/optimizer.pt +1 -1
last-checkpoint/rng_state.pth +1 -1
last-checkpoint/scheduler.pt +1 -1
last-checkpoint/trainer_state.json +711 -3

last-checkpoint/model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:134e1237af4835ec7c09b2ddfb5c01c7318849193ac07665aeebcb20602a7c35
 size 373077376

 version https://git-lfs.github.com/spec/v1
+oid sha256:aa231a4fb18485169d08c9d1e7878f2c6c2747cf33272ebb7b91a615a73da69f
 size 373077376

last-checkpoint/optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:68a372c381851fbd34898ce48030e1bd1636e6f679c35483af912455776bdccf
 size 422377675

 version https://git-lfs.github.com/spec/v1
+oid sha256:ff95fd30c41364a06356f6550493cfc79f8b5f14e8279f05b156b0d50603cfb7
 size 422377675

last-checkpoint/rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:229d1a5c5efb9c108732a998af8aefc8b44d8ea5fb5a5844f3f1fd3716527d07
 size 14645

 version https://git-lfs.github.com/spec/v1
+oid sha256:36f1c8cafda7ec05bcf717e4cbc9d475e180378b36391598d72523001d0947ee
 size 14645

last-checkpoint/scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c3d5de6a60842cf6ceecc89c24a89facabb892cb2713ea74e41bdeaeb1177e51
 size 1401

 version https://git-lfs.github.com/spec/v1
+oid sha256:b636e22decc0690abb4217d3b016f329ae73b4d12bae4602c74bba0c4d4ffdc1
 size 1401

last-checkpoint/trainer_state.json CHANGED Viewed

@@ -2,9 +2,9 @@
   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.6230529595015576,
   "eval_steps": 100,
-  "global_step": 600,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -4256,6 +4256,714 @@
       "eval_samples_per_second": 9.525,
       "eval_steps_per_second": 1.191,
       "step": 600
     }
   ],
   "logging_steps": 1,
@@ -4275,7 +4983,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 7.64170916069376e+16,
   "train_batch_size": 16,
   "trial_name": null,
   "trial_params": null

   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 0.726895119418484,
   "eval_steps": 100,
+  "global_step": 700,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
       "eval_samples_per_second": 9.525,
       "eval_steps_per_second": 1.191,
       "step": 600
+    },
+    {
+      "epoch": 0.6240913811007269,
+      "grad_norm": 1.609375,
+      "learning_rate": 0.0003412480773376864,
+      "loss": 6.2849,
+      "step": 601
+    },
+    {
+      "epoch": 0.6251298026998962,
+      "grad_norm": 1.5234375,
+      "learning_rate": 0.00033961934808841024,
+      "loss": 6.1491,
+      "step": 602
+    },
+    {
+      "epoch": 0.6261682242990654,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.0003379925136196088,
+      "loss": 6.281,
+      "step": 603
+    },
+    {
+      "epoch": 0.6272066458982347,
+      "grad_norm": 1.5625,
+      "learning_rate": 0.0003363675931511455,
+      "loss": 6.2496,
+      "step": 604
+    },
+    {
+      "epoch": 0.6282450674974039,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.0003347446058802708,
+      "loss": 6.2776,
+      "step": 605
+    },
+    {
+      "epoch": 0.6292834890965732,
+      "grad_norm": 1.46875,
+      "learning_rate": 0.00033312357098139617,
+      "loss": 6.3928,
+      "step": 606
+    },
+    {
+      "epoch": 0.6303219106957425,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.0003315045076058671,
+      "loss": 6.2803,
+      "step": 607
+    },
+    {
+      "epoch": 0.6313603322949117,
+      "grad_norm": 1.4296875,
+      "learning_rate": 0.00032988743488173697,
+      "loss": 6.2263,
+      "step": 608
+    },
+    {
+      "epoch": 0.632398753894081,
+      "grad_norm": 1.46875,
+      "learning_rate": 0.000328272371913541,
+      "loss": 6.1383,
+      "step": 609
+    },
+    {
+      "epoch": 0.6334371754932503,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0003266593377820708,
+      "loss": 6.2603,
+      "step": 610
+    },
+    {
+      "epoch": 0.6344755970924195,
+      "grad_norm": 2.25,
+      "learning_rate": 0.0003250483515441485,
+      "loss": 6.3359,
+      "step": 611
+    },
+    {
+      "epoch": 0.6355140186915887,
+      "grad_norm": 1.6015625,
+      "learning_rate": 0.0003234394322324019,
+      "loss": 6.1653,
+      "step": 612
+    },
+    {
+      "epoch": 0.6365524402907581,
+      "grad_norm": 2.125,
+      "learning_rate": 0.00032183259885504,
+      "loss": 6.2869,
+      "step": 613
+    },
+    {
+      "epoch": 0.6375908618899273,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.00032022787039562745,
+      "loss": 6.3017,
+      "step": 614
+    },
+    {
+      "epoch": 0.6386292834890965,
+      "grad_norm": 4.3125,
+      "learning_rate": 0.0003186252658128611,
+      "loss": 6.1045,
+      "step": 615
+    },
+    {
+      "epoch": 0.6396677050882659,
+      "grad_norm": 1.6171875,
+      "learning_rate": 0.00031702480404034565,
+      "loss": 6.3121,
+      "step": 616
+    },
+    {
+      "epoch": 0.6407061266874351,
+      "grad_norm": 1.8515625,
+      "learning_rate": 0.00031542650398637016,
+      "loss": 6.1043,
+      "step": 617
+    },
+    {
+      "epoch": 0.6417445482866043,
+      "grad_norm": 1.4375,
+      "learning_rate": 0.0003138303845336844,
+      "loss": 6.2402,
+      "step": 618
+    },
+    {
+      "epoch": 0.6427829698857737,
+      "grad_norm": 1.46875,
+      "learning_rate": 0.0003122364645392762,
+      "loss": 6.2972,
+      "step": 619
+    },
+    {
+      "epoch": 0.6438213914849429,
+      "grad_norm": 2.234375,
+      "learning_rate": 0.0003106447628341482,
+      "loss": 6.2454,
+      "step": 620
+    },
+    {
+      "epoch": 0.6448598130841121,
+      "grad_norm": 1.4921875,
+      "learning_rate": 0.0003090552982230954,
+      "loss": 6.1745,
+      "step": 621
+    },
+    {
+      "epoch": 0.6458982346832814,
+      "grad_norm": 1.3984375,
+      "learning_rate": 0.00030746808948448366,
+      "loss": 6.224,
+      "step": 622
+    },
+    {
+      "epoch": 0.6469366562824507,
+      "grad_norm": 1.4921875,
+      "learning_rate": 0.0003058831553700268,
+      "loss": 6.3142,
+      "step": 623
+    },
+    {
+      "epoch": 0.6479750778816199,
+      "grad_norm": 1.9140625,
+      "learning_rate": 0.00030430051460456596,
+      "loss": 6.2258,
+      "step": 624
+    },
+    {
+      "epoch": 0.6490134994807892,
+      "grad_norm": 1.6171875,
+      "learning_rate": 0.0003027201858858479,
+      "loss": 6.1406,
+      "step": 625
+    },
+    {
+      "epoch": 0.6500519210799585,
+      "grad_norm": 1.59375,
+      "learning_rate": 0.00030114218788430437,
+      "loss": 5.7233,
+      "step": 626
+    },
+    {
+      "epoch": 0.6510903426791277,
+      "grad_norm": 1.5234375,
+      "learning_rate": 0.0002995665392428313,
+      "loss": 6.0472,
+      "step": 627
+    },
+    {
+      "epoch": 0.652128764278297,
+      "grad_norm": 2.265625,
+      "learning_rate": 0.00029799325857656855,
+      "loss": 6.161,
+      "step": 628
+    },
+    {
+      "epoch": 0.6531671858774662,
+      "grad_norm": 1.53125,
+      "learning_rate": 0.00029642236447268024,
+      "loss": 6.112,
+      "step": 629
+    },
+    {
+      "epoch": 0.6542056074766355,
+      "grad_norm": 1.4140625,
+      "learning_rate": 0.00029485387549013485,
+      "loss": 6.2132,
+      "step": 630
+    },
+    {
+      "epoch": 0.6552440290758048,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.00029328781015948625,
+      "loss": 6.2657,
+      "step": 631
+    },
+    {
+      "epoch": 0.656282450674974,
+      "grad_norm": 1.5078125,
+      "learning_rate": 0.00029172418698265444,
+      "loss": 6.1711,
+      "step": 632
+    },
+    {
+      "epoch": 0.6573208722741433,
+      "grad_norm": 1.5859375,
+      "learning_rate": 0.0002901630244327075,
+      "loss": 6.1935,
+      "step": 633
+    },
+    {
+      "epoch": 0.6583592938733126,
+      "grad_norm": 1.6484375,
+      "learning_rate": 0.00028860434095364263,
+      "loss": 6.4055,
+      "step": 634
+    },
+    {
+      "epoch": 0.6593977154724818,
+      "grad_norm": 1.96875,
+      "learning_rate": 0.00028704815496016875,
+      "loss": 5.9916,
+      "step": 635
+    },
+    {
+      "epoch": 0.660436137071651,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.00028549448483748886,
+      "loss": 6.1303,
+      "step": 636
+    },
+    {
+      "epoch": 0.6614745586708204,
+      "grad_norm": 1.65625,
+      "learning_rate": 0.0002839433489410828,
+      "loss": 6.1363,
+      "step": 637
+    },
+    {
+      "epoch": 0.6625129802699896,
+      "grad_norm": 1.734375,
+      "learning_rate": 0.0002823947655964901,
+      "loss": 6.1624,
+      "step": 638
+    },
+    {
+      "epoch": 0.6635514018691588,
+      "grad_norm": 1.453125,
+      "learning_rate": 0.000280848753099094,
+      "loss": 5.9139,
+      "step": 639
+    },
+    {
+      "epoch": 0.6645898234683282,
+      "grad_norm": 2.625,
+      "learning_rate": 0.0002793053297139054,
+      "loss": 6.3661,
+      "step": 640
+    },
+    {
+      "epoch": 0.6656282450674974,
+      "grad_norm": 1.7578125,
+      "learning_rate": 0.0002777645136753459,
+      "loss": 6.3797,
+      "step": 641
+    },
+    {
+      "epoch": 0.6666666666666666,
+      "grad_norm": 1.8671875,
+      "learning_rate": 0.0002762263231870339,
+      "loss": 6.0671,
+      "step": 642
+    },
+    {
+      "epoch": 0.667705088265836,
+      "grad_norm": 1.6328125,
+      "learning_rate": 0.00027469077642156844,
+      "loss": 6.1843,
+      "step": 643
+    },
+    {
+      "epoch": 0.6687435098650052,
+      "grad_norm": 1.625,
+      "learning_rate": 0.000273157891520315,
+      "loss": 6.4473,
+      "step": 644
+    },
+    {
+      "epoch": 0.6697819314641744,
+      "grad_norm": 1.6953125,
+      "learning_rate": 0.00027162768659319114,
+      "loss": 6.3164,
+      "step": 645
+    },
+    {
+      "epoch": 0.6708203530633438,
+      "grad_norm": 1.6328125,
+      "learning_rate": 0.00027010017971845264,
+      "loss": 6.4004,
+      "step": 646
+    },
+    {
+      "epoch": 0.671858774662513,
+      "grad_norm": 1.6171875,
+      "learning_rate": 0.00026857538894247947,
+      "loss": 6.242,
+      "step": 647
+    },
+    {
+      "epoch": 0.6728971962616822,
+      "grad_norm": 1.28125,
+      "learning_rate": 0.00026705333227956303,
+      "loss": 6.2597,
+      "step": 648
+    },
+    {
+      "epoch": 0.6739356178608515,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.000265534027711693,
+      "loss": 6.3972,
+      "step": 649
+    },
+    {
+      "epoch": 0.6749740394600208,
+      "grad_norm": 1.90625,
+      "learning_rate": 0.00026401749318834527,
+      "loss": 6.4521,
+      "step": 650
+    },
+    {
+      "epoch": 0.67601246105919,
+      "grad_norm": 1.40625,
+      "learning_rate": 0.0002625037466262696,
+      "loss": 5.9594,
+      "step": 651
+    },
+    {
+      "epoch": 0.6770508826583593,
+      "grad_norm": 1.6015625,
+      "learning_rate": 0.0002609928059092779,
+      "loss": 6.2181,
+      "step": 652
+    },
+    {
+      "epoch": 0.6780893042575286,
+      "grad_norm": 1.3828125,
+      "learning_rate": 0.00025948468888803324,
+      "loss": 6.2781,
+      "step": 653
+    },
+    {
+      "epoch": 0.6791277258566978,
+      "grad_norm": 1.65625,
+      "learning_rate": 0.00025797941337983875,
+      "loss": 6.1757,
+      "step": 654
+    },
+    {
+      "epoch": 0.6801661474558671,
+      "grad_norm": 1.4609375,
+      "learning_rate": 0.0002564769971684271,
+      "loss": 6.266,
+      "step": 655
+    },
+    {
+      "epoch": 0.6812045690550363,
+      "grad_norm": 1.390625,
+      "learning_rate": 0.00025497745800375036,
+      "loss": 6.3151,
+      "step": 656
+    },
+    {
+      "epoch": 0.6822429906542056,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.0002534808136017707,
+      "loss": 6.2002,
+      "step": 657
+    },
+    {
+      "epoch": 0.6832814122533749,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.00025198708164425045,
+      "loss": 6.1243,
+      "step": 658
+    },
+    {
+      "epoch": 0.6843198338525441,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.0002504962797785435,
+      "loss": 6.2666,
+      "step": 659
+    },
+    {
+      "epoch": 0.6853582554517134,
+      "grad_norm": 4.4375,
+      "learning_rate": 0.00024900842561738736,
+      "loss": 5.9076,
+      "step": 660
+    },
+    {
+      "epoch": 0.6863966770508827,
+      "grad_norm": 1.484375,
+      "learning_rate": 0.00024752353673869405,
+      "loss": 6.1968,
+      "step": 661
+    },
+    {
+      "epoch": 0.6874350986500519,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.00024604163068534315,
+      "loss": 6.2919,
+      "step": 662
+    },
+    {
+      "epoch": 0.6884735202492211,
+      "grad_norm": 1.46875,
+      "learning_rate": 0.00024456272496497415,
+      "loss": 6.1599,
+      "step": 663
+    },
+    {
+      "epoch": 0.6895119418483905,
+      "grad_norm": 1.8984375,
+      "learning_rate": 0.00024308683704978002,
+      "loss": 6.2052,
+      "step": 664
+    },
+    {
+      "epoch": 0.6905503634475597,
+      "grad_norm": 1.375,
+      "learning_rate": 0.00024161398437630045,
+      "loss": 6.3025,
+      "step": 665
+    },
+    {
+      "epoch": 0.6915887850467289,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0002401441843452159,
+      "loss": 6.1938,
+      "step": 666
+    },
+    {
+      "epoch": 0.6926272066458983,
+      "grad_norm": 1.46875,
+      "learning_rate": 0.0002386774543211423,
+      "loss": 6.3049,
+      "step": 667
+    },
+    {
+      "epoch": 0.6936656282450675,
+      "grad_norm": 1.8515625,
+      "learning_rate": 0.0002372138116324254,
+      "loss": 6.3624,
+      "step": 668
+    },
+    {
+      "epoch": 0.6947040498442367,
+      "grad_norm": 1.84375,
+      "learning_rate": 0.00023575327357093658,
+      "loss": 6.3294,
+      "step": 669
+    },
+    {
+      "epoch": 0.6957424714434061,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.0002342958573918682,
+      "loss": 6.097,
+      "step": 670
+    },
+    {
+      "epoch": 0.6967808930425753,
+      "grad_norm": 1.8203125,
+      "learning_rate": 0.0002328415803135298,
+      "loss": 6.1121,
+      "step": 671
+    },
+    {
+      "epoch": 0.6978193146417445,
+      "grad_norm": 1.5859375,
+      "learning_rate": 0.0002313904595171447,
+      "loss": 6.1288,
+      "step": 672
+    },
+    {
+      "epoch": 0.6988577362409139,
+      "grad_norm": 1.5,
+      "learning_rate": 0.0002299425121466475,
+      "loss": 6.0291,
+      "step": 673
+    },
+    {
+      "epoch": 0.6998961578400831,
+      "grad_norm": 1.5,
+      "learning_rate": 0.00022849775530848056,
+      "loss": 6.3553,
+      "step": 674
+    },
+    {
+      "epoch": 0.7009345794392523,
+      "grad_norm": 1.4765625,
+      "learning_rate": 0.00022705620607139254,
+      "loss": 6.2154,
+      "step": 675
+    },
+    {
+      "epoch": 0.7019730010384216,
+      "grad_norm": 1.3984375,
+      "learning_rate": 0.00022561788146623679,
+      "loss": 6.1831,
+      "step": 676
+    },
+    {
+      "epoch": 0.7030114226375909,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.0002241827984857698,
+      "loss": 6.1788,
+      "step": 677
+    },
+    {
+      "epoch": 0.7040498442367601,
+      "grad_norm": 1.6875,
+      "learning_rate": 0.00022275097408445076,
+      "loss": 6.3325,
+      "step": 678
+    },
+    {
+      "epoch": 0.7050882658359294,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.00022132242517824115,
+      "loss": 6.2826,
+      "step": 679
+    },
+    {
+      "epoch": 0.7061266874350987,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.0002198971686444047,
+      "loss": 6.1395,
+      "step": 680
+    },
+    {
+      "epoch": 0.7071651090342679,
+      "grad_norm": 1.4296875,
+      "learning_rate": 0.00021847522132130827,
+      "loss": 6.3967,
+      "step": 681
+    },
+    {
+      "epoch": 0.7082035306334372,
+      "grad_norm": 2.234375,
+      "learning_rate": 0.00021705660000822285,
+      "loss": 6.4163,
+      "step": 682
+    },
+    {
+      "epoch": 0.7092419522326064,
+      "grad_norm": 1.5546875,
+      "learning_rate": 0.00021564132146512495,
+      "loss": 6.1454,
+      "step": 683
+    },
+    {
+      "epoch": 0.7102803738317757,
+      "grad_norm": 1.9296875,
+      "learning_rate": 0.00021422940241249872,
+      "loss": 6.2889,
+      "step": 684
+    },
+    {
+      "epoch": 0.711318795430945,
+      "grad_norm": 1.4765625,
+      "learning_rate": 0.0002128208595311384,
+      "loss": 6.227,
+      "step": 685
+    },
+    {
+      "epoch": 0.7123572170301142,
+      "grad_norm": 1.5078125,
+      "learning_rate": 0.00021141570946195105,
+      "loss": 6.2655,
+      "step": 686
+    },
+    {
+      "epoch": 0.7133956386292835,
+      "grad_norm": 1.4140625,
+      "learning_rate": 0.00021001396880576063,
+      "loss": 6.1813,
+      "step": 687
+    },
+    {
+      "epoch": 0.7144340602284528,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.0002086156541231109,
+      "loss": 6.3148,
+      "step": 688
+    },
+    {
+      "epoch": 0.715472481827622,
+      "grad_norm": 1.6015625,
+      "learning_rate": 0.00020722078193407035,
+      "loss": 6.2127,
+      "step": 689
+    },
+    {
+      "epoch": 0.7165109034267912,
+      "grad_norm": 1.53125,
+      "learning_rate": 0.00020582936871803693,
+      "loss": 6.2863,
+      "step": 690
+    },
+    {
+      "epoch": 0.7175493250259606,
+      "grad_norm": 1.203125,
+      "learning_rate": 0.0002044414309135434,
+      "loss": 6.236,
+      "step": 691
+    },
+    {
+      "epoch": 0.7185877466251298,
+      "grad_norm": 1.5546875,
+      "learning_rate": 0.00020305698491806295,
+      "loss": 6.3918,
+      "step": 692
+    },
+    {
+      "epoch": 0.719626168224299,
+      "grad_norm": 1.59375,
+      "learning_rate": 0.0002016760470878158,
+      "loss": 5.9227,
+      "step": 693
+    },
+    {
+      "epoch": 0.7206645898234684,
+      "grad_norm": 1.4921875,
+      "learning_rate": 0.0002002986337375755,
+      "loss": 6.0943,
+      "step": 694
+    },
+    {
+      "epoch": 0.7217030114226376,
+      "grad_norm": 1.4921875,
+      "learning_rate": 0.00019892476114047664,
+      "loss": 6.2113,
+      "step": 695
+    },
+    {
+      "epoch": 0.7227414330218068,
+      "grad_norm": 1.4921875,
+      "learning_rate": 0.00019755444552782225,
+      "loss": 6.3502,
+      "step": 696
+    },
+    {
+      "epoch": 0.7237798546209762,
+      "grad_norm": 1.515625,
+      "learning_rate": 0.00019618770308889227,
+      "loss": 5.9658,
+      "step": 697
+    },
+    {
+      "epoch": 0.7248182762201454,
+      "grad_norm": 1.5,
+      "learning_rate": 0.00019482454997075228,
+      "loss": 6.1518,
+      "step": 698
+    },
+    {
+      "epoch": 0.7258566978193146,
+      "grad_norm": 1.4609375,
+      "learning_rate": 0.00019346500227806218,
+      "loss": 6.1651,
+      "step": 699
+    },
+    {
+      "epoch": 0.726895119418484,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.00019210907607288723,
+      "loss": 6.2656,
+      "step": 700
+    },
+    {
+      "epoch": 0.726895119418484,
+      "eval_loss": 6.260857105255127,
+      "eval_runtime": 1.646,
+      "eval_samples_per_second": 9.72,
+      "eval_steps_per_second": 1.215,
+      "step": 700
     }
   ],
   "logging_steps": 1,
       "attributes": {}
     }
   },
+  "total_flos": 8.91532735414272e+16,
   "train_batch_size": 16,
   "trial_name": null,
   "trial_params": null