Training in progress, step 129000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1410301944
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:857ead76dd55a0ff132114f3566b2633c2c5cdde85ae73d0787d641584b91007
|
| 3 |
size 1410301944
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 2820185786
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4d51187329bb716afa734f026372750945e338e23b7c661997a4d4207a6fd698
|
| 3 |
size 2820185786
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:01c311980c8b0da96dd9e638e23b1e84aa50fb6a11433bc22a347279b706965b
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:60a157573f4024c9cf3f191281f1d04ef870f25b0126e228157b25abffaa2ebf
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 1.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -22792,11 +22792,189 @@
|
|
| 22792 |
"eval_steps_per_second": 14.933,
|
| 22793 |
"num_input_tokens_seen": 67098059328,
|
| 22794 |
"step": 128000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22795 |
}
|
| 22796 |
],
|
| 22797 |
"logging_steps": 50,
|
| 22798 |
"max_steps": 140000,
|
| 22799 |
-
"num_input_tokens_seen":
|
| 22800 |
"num_train_epochs": 2,
|
| 22801 |
"save_steps": 1000,
|
| 22802 |
"stateful_callbacks": {
|
|
@@ -22811,7 +22989,7 @@
|
|
| 22811 |
"attributes": {}
|
| 22812 |
}
|
| 22813 |
},
|
| 22814 |
-
"total_flos": 1.
|
| 22815 |
"train_batch_size": 32,
|
| 22816 |
"trial_name": null,
|
| 22817 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 1.230668876778325,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 129000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 22792 |
"eval_steps_per_second": 14.933,
|
| 22793 |
"num_input_tokens_seen": 67098059328,
|
| 22794 |
"step": 128000
|
| 22795 |
+
},
|
| 22796 |
+
{
|
| 22797 |
+
"epoch": 1.2216058289707479,
|
| 22798 |
+
"grad_norm": 0.13075117766857147,
|
| 22799 |
+
"learning_rate": 0.00038600663175469667,
|
| 22800 |
+
"loss": 2.0582,
|
| 22801 |
+
"num_input_tokens_seen": 67124264448,
|
| 22802 |
+
"step": 128050
|
| 22803 |
+
},
|
| 22804 |
+
{
|
| 22805 |
+
"epoch": 1.222082831486936,
|
| 22806 |
+
"grad_norm": 0.1297282576560974,
|
| 22807 |
+
"learning_rate": 0.00038327731807204744,
|
| 22808 |
+
"loss": 2.0595,
|
| 22809 |
+
"num_input_tokens_seen": 67150472320,
|
| 22810 |
+
"step": 128100
|
| 22811 |
+
},
|
| 22812 |
+
{
|
| 22813 |
+
"epoch": 1.2225598340031243,
|
| 22814 |
+
"grad_norm": 0.12640318274497986,
|
| 22815 |
+
"learning_rate": 0.00038055167787050134,
|
| 22816 |
+
"loss": 2.0525,
|
| 22817 |
+
"num_input_tokens_seen": 67176672192,
|
| 22818 |
+
"step": 128150
|
| 22819 |
+
},
|
| 22820 |
+
{
|
| 22821 |
+
"epoch": 1.2230368365193127,
|
| 22822 |
+
"grad_norm": 0.1315733790397644,
|
| 22823 |
+
"learning_rate": 0.00037782979693105293,
|
| 22824 |
+
"loss": 2.0499,
|
| 22825 |
+
"num_input_tokens_seen": 67202877408,
|
| 22826 |
+
"step": 128200
|
| 22827 |
+
},
|
| 22828 |
+
{
|
| 22829 |
+
"epoch": 1.223513839035501,
|
| 22830 |
+
"grad_norm": 0.12865200638771057,
|
| 22831 |
+
"learning_rate": 0.0003751117609163865,
|
| 22832 |
+
"loss": 2.051,
|
| 22833 |
+
"num_input_tokens_seen": 67229091168,
|
| 22834 |
+
"step": 128250
|
| 22835 |
+
},
|
| 22836 |
+
{
|
| 22837 |
+
"epoch": 1.2239908415516891,
|
| 22838 |
+
"grad_norm": 0.1271800547838211,
|
| 22839 |
+
"learning_rate": 0.00037239765536817873,
|
| 22840 |
+
"loss": 2.0555,
|
| 22841 |
+
"num_input_tokens_seen": 67255304768,
|
| 22842 |
+
"step": 128300
|
| 22843 |
+
},
|
| 22844 |
+
{
|
| 22845 |
+
"epoch": 1.2244678440678776,
|
| 22846 |
+
"grad_norm": 0.13572408258914948,
|
| 22847 |
+
"learning_rate": 0.0003696875657044073,
|
| 22848 |
+
"loss": 2.0622,
|
| 22849 |
+
"num_input_tokens_seen": 67281509184,
|
| 22850 |
+
"step": 128350
|
| 22851 |
+
},
|
| 22852 |
+
{
|
| 22853 |
+
"epoch": 1.2249448465840658,
|
| 22854 |
+
"grad_norm": 0.12558363378047943,
|
| 22855 |
+
"learning_rate": 0.0003669815772166625,
|
| 22856 |
+
"loss": 2.0548,
|
| 22857 |
+
"num_input_tokens_seen": 67307717088,
|
| 22858 |
+
"step": 128400
|
| 22859 |
+
},
|
| 22860 |
+
{
|
| 22861 |
+
"epoch": 1.225421849100254,
|
| 22862 |
+
"grad_norm": 0.13062912225723267,
|
| 22863 |
+
"learning_rate": 0.0003642797750674629,
|
| 22864 |
+
"loss": 2.0473,
|
| 22865 |
+
"num_input_tokens_seen": 67333928800,
|
| 22866 |
+
"step": 128450
|
| 22867 |
+
},
|
| 22868 |
+
{
|
| 22869 |
+
"epoch": 1.2258988516164422,
|
| 22870 |
+
"grad_norm": 0.1351100355386734,
|
| 22871 |
+
"learning_rate": 0.00036158224428757535,
|
| 22872 |
+
"loss": 2.0475,
|
| 22873 |
+
"num_input_tokens_seen": 67360131616,
|
| 22874 |
+
"step": 128500
|
| 22875 |
+
},
|
| 22876 |
+
{
|
| 22877 |
+
"epoch": 1.2258988516164422,
|
| 22878 |
+
"eval_loss": 1.9701597690582275,
|
| 22879 |
+
"eval_runtime": 82.4081,
|
| 22880 |
+
"eval_samples_per_second": 60.674,
|
| 22881 |
+
"eval_steps_per_second": 15.168,
|
| 22882 |
+
"num_input_tokens_seen": 67360131616,
|
| 22883 |
+
"step": 128500
|
| 22884 |
+
},
|
| 22885 |
+
{
|
| 22886 |
+
"epoch": 1.2263758541326306,
|
| 22887 |
+
"grad_norm": 0.13211333751678467,
|
| 22888 |
+
"learning_rate": 0.00035888906977333857,
|
| 22889 |
+
"loss": 2.0622,
|
| 22890 |
+
"num_input_tokens_seen": 67386344736,
|
| 22891 |
+
"step": 128550
|
| 22892 |
+
},
|
| 22893 |
+
{
|
| 22894 |
+
"epoch": 1.2268528566488188,
|
| 22895 |
+
"grad_norm": 0.12648384273052216,
|
| 22896 |
+
"learning_rate": 0.0003562003362839914,
|
| 22897 |
+
"loss": 2.051,
|
| 22898 |
+
"num_input_tokens_seen": 67412555520,
|
| 22899 |
+
"step": 128600
|
| 22900 |
+
},
|
| 22901 |
+
{
|
| 22902 |
+
"epoch": 1.227329859165007,
|
| 22903 |
+
"grad_norm": 0.13109999895095825,
|
| 22904 |
+
"learning_rate": 0.00035351612843900553,
|
| 22905 |
+
"loss": 2.0529,
|
| 22906 |
+
"num_input_tokens_seen": 67438769504,
|
| 22907 |
+
"step": 128650
|
| 22908 |
+
},
|
| 22909 |
+
{
|
| 22910 |
+
"epoch": 1.2278068616811955,
|
| 22911 |
+
"grad_norm": 0.12981992959976196,
|
| 22912 |
+
"learning_rate": 0.000350836530715422,
|
| 22913 |
+
"loss": 2.045,
|
| 22914 |
+
"num_input_tokens_seen": 67464972864,
|
| 22915 |
+
"step": 128700
|
| 22916 |
+
},
|
| 22917 |
+
{
|
| 22918 |
+
"epoch": 1.2282838641973837,
|
| 22919 |
+
"grad_norm": 0.1246839389204979,
|
| 22920 |
+
"learning_rate": 0.00034816162744519263,
|
| 22921 |
+
"loss": 2.0569,
|
| 22922 |
+
"num_input_tokens_seen": 67491186176,
|
| 22923 |
+
"step": 128750
|
| 22924 |
+
},
|
| 22925 |
+
{
|
| 22926 |
+
"epoch": 1.2287608667135719,
|
| 22927 |
+
"grad_norm": 0.13077682256698608,
|
| 22928 |
+
"learning_rate": 0.00034549150281252633,
|
| 22929 |
+
"loss": 2.0461,
|
| 22930 |
+
"num_input_tokens_seen": 67517399168,
|
| 22931 |
+
"step": 128800
|
| 22932 |
+
},
|
| 22933 |
+
{
|
| 22934 |
+
"epoch": 1.22923786922976,
|
| 22935 |
+
"grad_norm": 0.12939219176769257,
|
| 22936 |
+
"learning_rate": 0.000342826240851239,
|
| 22937 |
+
"loss": 2.047,
|
| 22938 |
+
"num_input_tokens_seen": 67543606592,
|
| 22939 |
+
"step": 128850
|
| 22940 |
+
},
|
| 22941 |
+
{
|
| 22942 |
+
"epoch": 1.2297148717459485,
|
| 22943 |
+
"grad_norm": 0.12711487710475922,
|
| 22944 |
+
"learning_rate": 0.00034016592544210936,
|
| 22945 |
+
"loss": 2.0411,
|
| 22946 |
+
"num_input_tokens_seen": 67569807488,
|
| 22947 |
+
"step": 128900
|
| 22948 |
+
},
|
| 22949 |
+
{
|
| 22950 |
+
"epoch": 1.2301918742621367,
|
| 22951 |
+
"grad_norm": 0.13154172897338867,
|
| 22952 |
+
"learning_rate": 0.00033751064031023887,
|
| 22953 |
+
"loss": 2.0536,
|
| 22954 |
+
"num_input_tokens_seen": 67596020896,
|
| 22955 |
+
"step": 128950
|
| 22956 |
+
},
|
| 22957 |
+
{
|
| 22958 |
+
"epoch": 1.230668876778325,
|
| 22959 |
+
"grad_norm": 0.1312495321035385,
|
| 22960 |
+
"learning_rate": 0.00033486046902241664,
|
| 22961 |
+
"loss": 2.0558,
|
| 22962 |
+
"num_input_tokens_seen": 67622231264,
|
| 22963 |
+
"step": 129000
|
| 22964 |
+
},
|
| 22965 |
+
{
|
| 22966 |
+
"epoch": 1.230668876778325,
|
| 22967 |
+
"eval_loss": 1.9686726331710815,
|
| 22968 |
+
"eval_runtime": 82.3322,
|
| 22969 |
+
"eval_samples_per_second": 60.73,
|
| 22970 |
+
"eval_steps_per_second": 15.182,
|
| 22971 |
+
"num_input_tokens_seen": 67622231264,
|
| 22972 |
+
"step": 129000
|
| 22973 |
}
|
| 22974 |
],
|
| 22975 |
"logging_steps": 50,
|
| 22976 |
"max_steps": 140000,
|
| 22977 |
+
"num_input_tokens_seen": 67622231264,
|
| 22978 |
"num_train_epochs": 2,
|
| 22979 |
"save_steps": 1000,
|
| 22980 |
"stateful_callbacks": {
|
|
|
|
| 22989 |
"attributes": {}
|
| 22990 |
}
|
| 22991 |
},
|
| 22992 |
+
"total_flos": 1.1967901504229745e+20,
|
| 22993 |
"train_batch_size": 32,
|
| 22994 |
"trial_name": null,
|
| 22995 |
"trial_params": null
|