Training in progress, step 157000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1410301944
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f6fe11c67454b196001e4ed79995fab7a74672309c3dd2962d5f80ffb63bc57b
|
| 3 |
size 1410301944
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 2820185786
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b377b9ca632415e5cc259fc332790fb52abc8eed12ac83f3e71264ccc731be8f
|
| 3 |
size 2820185786
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a7eb45c212b2ad29aa591c00be2416908d2afe25e585ed34f3beb9e136a92cef
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:50e7a91a2d6de7899f3ef55596029edda575e3992fc80bcaed05a5c6cf935dee
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 3.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -27776,11 +27776,189 @@
|
|
| 27776 |
"eval_steps_per_second": 15.575,
|
| 27777 |
"num_input_tokens_seen": 90165273280,
|
| 27778 |
"step": 156000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27779 |
}
|
| 27780 |
],
|
| 27781 |
"logging_steps": 50,
|
| 27782 |
"max_steps": 200000,
|
| 27783 |
-
"num_input_tokens_seen":
|
| 27784 |
"num_train_epochs": 5,
|
| 27785 |
"save_steps": 1000,
|
| 27786 |
"stateful_callbacks": {
|
|
@@ -27795,7 +27973,7 @@
|
|
| 27795 |
"attributes": {}
|
| 27796 |
}
|
| 27797 |
},
|
| 27798 |
-
"total_flos": 1.
|
| 27799 |
"train_batch_size": 32,
|
| 27800 |
"trial_name": null,
|
| 27801 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 3.4487226393866526,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 157000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 27776 |
"eval_steps_per_second": 15.575,
|
| 27777 |
"num_input_tokens_seen": 90165273280,
|
| 27778 |
"step": 156000
|
| 27779 |
+
},
|
| 27780 |
+
{
|
| 27781 |
+
"epoch": 3.4278548839814578,
|
| 27782 |
+
"grad_norm": 0.09176173806190491,
|
| 27783 |
+
"learning_rate": 0.0001,
|
| 27784 |
+
"loss": 2.3456,
|
| 27785 |
+
"num_input_tokens_seen": 90217696288,
|
| 27786 |
+
"step": 156050
|
| 27787 |
+
},
|
| 27788 |
+
{
|
| 27789 |
+
"epoch": 3.428953186897521,
|
| 27790 |
+
"grad_norm": 0.089874766767025,
|
| 27791 |
+
"learning_rate": 0.0001,
|
| 27792 |
+
"loss": 2.3475,
|
| 27793 |
+
"num_input_tokens_seen": 90270125088,
|
| 27794 |
+
"step": 156100
|
| 27795 |
+
},
|
| 27796 |
+
{
|
| 27797 |
+
"epoch": 3.4300514898135837,
|
| 27798 |
+
"grad_norm": 0.0928313210606575,
|
| 27799 |
+
"learning_rate": 0.0001,
|
| 27800 |
+
"loss": 2.3515,
|
| 27801 |
+
"num_input_tokens_seen": 90322553888,
|
| 27802 |
+
"step": 156150
|
| 27803 |
+
},
|
| 27804 |
+
{
|
| 27805 |
+
"epoch": 3.4311497927296464,
|
| 27806 |
+
"grad_norm": 0.09114927798509598,
|
| 27807 |
+
"learning_rate": 0.0001,
|
| 27808 |
+
"loss": 2.3494,
|
| 27809 |
+
"num_input_tokens_seen": 90374980192,
|
| 27810 |
+
"step": 156200
|
| 27811 |
+
},
|
| 27812 |
+
{
|
| 27813 |
+
"epoch": 3.4322480956457095,
|
| 27814 |
+
"grad_norm": 0.09503049403429031,
|
| 27815 |
+
"learning_rate": 0.0001,
|
| 27816 |
+
"loss": 2.3439,
|
| 27817 |
+
"num_input_tokens_seen": 90427408992,
|
| 27818 |
+
"step": 156250
|
| 27819 |
+
},
|
| 27820 |
+
{
|
| 27821 |
+
"epoch": 3.4333463985617723,
|
| 27822 |
+
"grad_norm": 0.09301070868968964,
|
| 27823 |
+
"learning_rate": 0.0001,
|
| 27824 |
+
"loss": 2.3487,
|
| 27825 |
+
"num_input_tokens_seen": 90479837120,
|
| 27826 |
+
"step": 156300
|
| 27827 |
+
},
|
| 27828 |
+
{
|
| 27829 |
+
"epoch": 3.434444701477835,
|
| 27830 |
+
"grad_norm": 0.09190023690462112,
|
| 27831 |
+
"learning_rate": 0.0001,
|
| 27832 |
+
"loss": 2.3448,
|
| 27833 |
+
"num_input_tokens_seen": 90532258144,
|
| 27834 |
+
"step": 156350
|
| 27835 |
+
},
|
| 27836 |
+
{
|
| 27837 |
+
"epoch": 3.435543004393898,
|
| 27838 |
+
"grad_norm": 0.09636224061250687,
|
| 27839 |
+
"learning_rate": 0.0001,
|
| 27840 |
+
"loss": 2.3456,
|
| 27841 |
+
"num_input_tokens_seen": 90584686944,
|
| 27842 |
+
"step": 156400
|
| 27843 |
+
},
|
| 27844 |
+
{
|
| 27845 |
+
"epoch": 3.436641307309961,
|
| 27846 |
+
"grad_norm": 0.09875821322202682,
|
| 27847 |
+
"learning_rate": 0.0001,
|
| 27848 |
+
"loss": 2.3415,
|
| 27849 |
+
"num_input_tokens_seen": 90637115744,
|
| 27850 |
+
"step": 156450
|
| 27851 |
+
},
|
| 27852 |
+
{
|
| 27853 |
+
"epoch": 3.437739610226024,
|
| 27854 |
+
"grad_norm": 0.09387224912643433,
|
| 27855 |
+
"learning_rate": 0.0001,
|
| 27856 |
+
"loss": 2.3483,
|
| 27857 |
+
"num_input_tokens_seen": 90689540064,
|
| 27858 |
+
"step": 156500
|
| 27859 |
+
},
|
| 27860 |
+
{
|
| 27861 |
+
"epoch": 3.437739610226024,
|
| 27862 |
+
"eval_loss": 2.2581753730773926,
|
| 27863 |
+
"eval_runtime": 79.6955,
|
| 27864 |
+
"eval_samples_per_second": 62.739,
|
| 27865 |
+
"eval_steps_per_second": 15.685,
|
| 27866 |
+
"num_input_tokens_seen": 90689540064,
|
| 27867 |
+
"step": 156500
|
| 27868 |
+
},
|
| 27869 |
+
{
|
| 27870 |
+
"epoch": 3.4388379131420868,
|
| 27871 |
+
"grad_norm": 0.08944698423147202,
|
| 27872 |
+
"learning_rate": 0.0001,
|
| 27873 |
+
"loss": 2.3459,
|
| 27874 |
+
"num_input_tokens_seen": 90741968864,
|
| 27875 |
+
"step": 156550
|
| 27876 |
+
},
|
| 27877 |
+
{
|
| 27878 |
+
"epoch": 3.4399362160581495,
|
| 27879 |
+
"grad_norm": 0.09725566953420639,
|
| 27880 |
+
"learning_rate": 0.0001,
|
| 27881 |
+
"loss": 2.3399,
|
| 27882 |
+
"num_input_tokens_seen": 90794397664,
|
| 27883 |
+
"step": 156600
|
| 27884 |
+
},
|
| 27885 |
+
{
|
| 27886 |
+
"epoch": 3.4410345189742126,
|
| 27887 |
+
"grad_norm": 0.09932785481214523,
|
| 27888 |
+
"learning_rate": 0.0001,
|
| 27889 |
+
"loss": 2.3475,
|
| 27890 |
+
"num_input_tokens_seen": 90846826464,
|
| 27891 |
+
"step": 156650
|
| 27892 |
+
},
|
| 27893 |
+
{
|
| 27894 |
+
"epoch": 3.4421328218902754,
|
| 27895 |
+
"grad_norm": 0.09854361414909363,
|
| 27896 |
+
"learning_rate": 0.0001,
|
| 27897 |
+
"loss": 2.3449,
|
| 27898 |
+
"num_input_tokens_seen": 90899255264,
|
| 27899 |
+
"step": 156700
|
| 27900 |
+
},
|
| 27901 |
+
{
|
| 27902 |
+
"epoch": 3.4432311248063385,
|
| 27903 |
+
"grad_norm": 0.09402545541524887,
|
| 27904 |
+
"learning_rate": 0.0001,
|
| 27905 |
+
"loss": 2.3434,
|
| 27906 |
+
"num_input_tokens_seen": 90951684064,
|
| 27907 |
+
"step": 156750
|
| 27908 |
+
},
|
| 27909 |
+
{
|
| 27910 |
+
"epoch": 3.4443294277224012,
|
| 27911 |
+
"grad_norm": 0.09715921431779861,
|
| 27912 |
+
"learning_rate": 0.0001,
|
| 27913 |
+
"loss": 2.3455,
|
| 27914 |
+
"num_input_tokens_seen": 91004112864,
|
| 27915 |
+
"step": 156800
|
| 27916 |
+
},
|
| 27917 |
+
{
|
| 27918 |
+
"epoch": 3.445427730638464,
|
| 27919 |
+
"grad_norm": 0.09590257704257965,
|
| 27920 |
+
"learning_rate": 0.0001,
|
| 27921 |
+
"loss": 2.348,
|
| 27922 |
+
"num_input_tokens_seen": 91056541664,
|
| 27923 |
+
"step": 156850
|
| 27924 |
+
},
|
| 27925 |
+
{
|
| 27926 |
+
"epoch": 3.446526033554527,
|
| 27927 |
+
"grad_norm": 0.10155434161424637,
|
| 27928 |
+
"learning_rate": 0.0001,
|
| 27929 |
+
"loss": 2.3403,
|
| 27930 |
+
"num_input_tokens_seen": 91108970464,
|
| 27931 |
+
"step": 156900
|
| 27932 |
+
},
|
| 27933 |
+
{
|
| 27934 |
+
"epoch": 3.44762433647059,
|
| 27935 |
+
"grad_norm": 0.09132086485624313,
|
| 27936 |
+
"learning_rate": 0.0001,
|
| 27937 |
+
"loss": 2.3569,
|
| 27938 |
+
"num_input_tokens_seen": 91161399264,
|
| 27939 |
+
"step": 156950
|
| 27940 |
+
},
|
| 27941 |
+
{
|
| 27942 |
+
"epoch": 3.4487226393866526,
|
| 27943 |
+
"grad_norm": 0.0917491465806961,
|
| 27944 |
+
"learning_rate": 0.0001,
|
| 27945 |
+
"loss": 2.3454,
|
| 27946 |
+
"num_input_tokens_seen": 91213822304,
|
| 27947 |
+
"step": 157000
|
| 27948 |
+
},
|
| 27949 |
+
{
|
| 27950 |
+
"epoch": 3.4487226393866526,
|
| 27951 |
+
"eval_loss": 2.2578930854797363,
|
| 27952 |
+
"eval_runtime": 80.5074,
|
| 27953 |
+
"eval_samples_per_second": 62.106,
|
| 27954 |
+
"eval_steps_per_second": 15.527,
|
| 27955 |
+
"num_input_tokens_seen": 91213822304,
|
| 27956 |
+
"step": 157000
|
| 27957 |
}
|
| 27958 |
],
|
| 27959 |
"logging_steps": 50,
|
| 27960 |
"max_steps": 200000,
|
| 27961 |
+
"num_input_tokens_seen": 91213822304,
|
| 27962 |
"num_train_epochs": 5,
|
| 27963 |
"save_steps": 1000,
|
| 27964 |
"stateful_callbacks": {
|
|
|
|
| 27973 |
"attributes": {}
|
| 27974 |
}
|
| 27975 |
},
|
| 27976 |
+
"total_flos": 1.614318280768918e+20,
|
| 27977 |
"train_batch_size": 32,
|
| 27978 |
"trial_name": null,
|
| 27979 |
"trial_params": null
|