Training in progress, step 45000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 301235464
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c3ffbf5a816a6aa824466bdde4390b737dfef3183acb26f39844f7b4017bf30d
|
| 3 |
size 301235464
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 602335994
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c6b19fbbf1f84052b99affd1a4abf045aa0dc4dae5e3396c29093fc71d96182f
|
| 3 |
size 602335994
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c2efdaece0c1a392cf0dde4c3fd595f174e50c13358c4a6e5301669f684c3b3b
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4373b3ab47408a8ab65ab61c7aee7bfdf3c940344f36a198973da2bfc9da86a8
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -7840,11 +7840,189 @@
|
|
| 7840 |
"eval_steps_per_second": 24.302,
|
| 7841 |
"num_input_tokens_seen": 11534331456,
|
| 7842 |
"step": 44000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7843 |
}
|
| 7844 |
],
|
| 7845 |
"logging_steps": 50,
|
| 7846 |
"max_steps": 70000,
|
| 7847 |
-
"num_input_tokens_seen":
|
| 7848 |
"num_train_epochs": 1,
|
| 7849 |
"save_steps": 1000,
|
| 7850 |
"stateful_callbacks": {
|
|
@@ -7859,7 +8037,7 @@
|
|
| 7859 |
"attributes": {}
|
| 7860 |
}
|
| 7861 |
},
|
| 7862 |
-
"total_flos": 3.
|
| 7863 |
"train_batch_size": 64,
|
| 7864 |
"trial_name": null,
|
| 7865 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.2146511322847228,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 45000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 7840 |
"eval_steps_per_second": 24.302,
|
| 7841 |
"num_input_tokens_seen": 11534331456,
|
| 7842 |
"step": 44000
|
| 7843 |
+
},
|
| 7844 |
+
{
|
| 7845 |
+
"epoch": 0.21011960838093421,
|
| 7846 |
+
"grad_norm": 0.19737008213996887,
|
| 7847 |
+
"learning_rate": 0.001,
|
| 7848 |
+
"loss": 2.6272,
|
| 7849 |
+
"num_input_tokens_seen": 11547438656,
|
| 7850 |
+
"step": 44050
|
| 7851 |
+
},
|
| 7852 |
+
{
|
| 7853 |
+
"epoch": 0.21035810963902835,
|
| 7854 |
+
"grad_norm": 0.1984977424144745,
|
| 7855 |
+
"learning_rate": 0.001,
|
| 7856 |
+
"loss": 2.6417,
|
| 7857 |
+
"num_input_tokens_seen": 11560545856,
|
| 7858 |
+
"step": 44100
|
| 7859 |
+
},
|
| 7860 |
+
{
|
| 7861 |
+
"epoch": 0.21059661089712248,
|
| 7862 |
+
"grad_norm": 0.19575904309749603,
|
| 7863 |
+
"learning_rate": 0.001,
|
| 7864 |
+
"loss": 2.6277,
|
| 7865 |
+
"num_input_tokens_seen": 11573653056,
|
| 7866 |
+
"step": 44150
|
| 7867 |
+
},
|
| 7868 |
+
{
|
| 7869 |
+
"epoch": 0.2108351121552166,
|
| 7870 |
+
"grad_norm": 0.19875651597976685,
|
| 7871 |
+
"learning_rate": 0.001,
|
| 7872 |
+
"loss": 2.6362,
|
| 7873 |
+
"num_input_tokens_seen": 11586760256,
|
| 7874 |
+
"step": 44200
|
| 7875 |
+
},
|
| 7876 |
+
{
|
| 7877 |
+
"epoch": 0.21107361341331077,
|
| 7878 |
+
"grad_norm": 0.20936185121536255,
|
| 7879 |
+
"learning_rate": 0.001,
|
| 7880 |
+
"loss": 2.6217,
|
| 7881 |
+
"num_input_tokens_seen": 11599867456,
|
| 7882 |
+
"step": 44250
|
| 7883 |
+
},
|
| 7884 |
+
{
|
| 7885 |
+
"epoch": 0.2113121146714049,
|
| 7886 |
+
"grad_norm": 0.19474463164806366,
|
| 7887 |
+
"learning_rate": 0.001,
|
| 7888 |
+
"loss": 2.6235,
|
| 7889 |
+
"num_input_tokens_seen": 11612974656,
|
| 7890 |
+
"step": 44300
|
| 7891 |
+
},
|
| 7892 |
+
{
|
| 7893 |
+
"epoch": 0.21155061592949903,
|
| 7894 |
+
"grad_norm": 0.20833207666873932,
|
| 7895 |
+
"learning_rate": 0.001,
|
| 7896 |
+
"loss": 2.6,
|
| 7897 |
+
"num_input_tokens_seen": 11626081856,
|
| 7898 |
+
"step": 44350
|
| 7899 |
+
},
|
| 7900 |
+
{
|
| 7901 |
+
"epoch": 0.21178911718759316,
|
| 7902 |
+
"grad_norm": 0.19269512593746185,
|
| 7903 |
+
"learning_rate": 0.001,
|
| 7904 |
+
"loss": 2.6211,
|
| 7905 |
+
"num_input_tokens_seen": 11639189056,
|
| 7906 |
+
"step": 44400
|
| 7907 |
+
},
|
| 7908 |
+
{
|
| 7909 |
+
"epoch": 0.2120276184456873,
|
| 7910 |
+
"grad_norm": 0.21018226444721222,
|
| 7911 |
+
"learning_rate": 0.001,
|
| 7912 |
+
"loss": 2.6294,
|
| 7913 |
+
"num_input_tokens_seen": 11652296256,
|
| 7914 |
+
"step": 44450
|
| 7915 |
+
},
|
| 7916 |
+
{
|
| 7917 |
+
"epoch": 0.21226611970378143,
|
| 7918 |
+
"grad_norm": 0.19836543500423431,
|
| 7919 |
+
"learning_rate": 0.001,
|
| 7920 |
+
"loss": 2.6051,
|
| 7921 |
+
"num_input_tokens_seen": 11665403456,
|
| 7922 |
+
"step": 44500
|
| 7923 |
+
},
|
| 7924 |
+
{
|
| 7925 |
+
"epoch": 0.21226611970378143,
|
| 7926 |
+
"eval_loss": 2.499817132949829,
|
| 7927 |
+
"eval_runtime": 50.9003,
|
| 7928 |
+
"eval_samples_per_second": 98.231,
|
| 7929 |
+
"eval_steps_per_second": 24.558,
|
| 7930 |
+
"num_input_tokens_seen": 11665403456,
|
| 7931 |
+
"step": 44500
|
| 7932 |
+
},
|
| 7933 |
+
{
|
| 7934 |
+
"epoch": 0.21250462096187558,
|
| 7935 |
+
"grad_norm": 0.18411967158317566,
|
| 7936 |
+
"learning_rate": 0.001,
|
| 7937 |
+
"loss": 2.6228,
|
| 7938 |
+
"num_input_tokens_seen": 11678510656,
|
| 7939 |
+
"step": 44550
|
| 7940 |
+
},
|
| 7941 |
+
{
|
| 7942 |
+
"epoch": 0.21274312221996972,
|
| 7943 |
+
"grad_norm": 0.19387467205524445,
|
| 7944 |
+
"learning_rate": 0.001,
|
| 7945 |
+
"loss": 2.5902,
|
| 7946 |
+
"num_input_tokens_seen": 11691617856,
|
| 7947 |
+
"step": 44600
|
| 7948 |
+
},
|
| 7949 |
+
{
|
| 7950 |
+
"epoch": 0.21298162347806385,
|
| 7951 |
+
"grad_norm": 0.22076952457427979,
|
| 7952 |
+
"learning_rate": 0.001,
|
| 7953 |
+
"loss": 2.613,
|
| 7954 |
+
"num_input_tokens_seen": 11704725056,
|
| 7955 |
+
"step": 44650
|
| 7956 |
+
},
|
| 7957 |
+
{
|
| 7958 |
+
"epoch": 0.21322012473615798,
|
| 7959 |
+
"grad_norm": 0.33861082792282104,
|
| 7960 |
+
"learning_rate": 0.001,
|
| 7961 |
+
"loss": 2.6142,
|
| 7962 |
+
"num_input_tokens_seen": 11717832256,
|
| 7963 |
+
"step": 44700
|
| 7964 |
+
},
|
| 7965 |
+
{
|
| 7966 |
+
"epoch": 0.2134586259942521,
|
| 7967 |
+
"grad_norm": 0.20097902417182922,
|
| 7968 |
+
"learning_rate": 0.001,
|
| 7969 |
+
"loss": 2.6549,
|
| 7970 |
+
"num_input_tokens_seen": 11730939456,
|
| 7971 |
+
"step": 44750
|
| 7972 |
+
},
|
| 7973 |
+
{
|
| 7974 |
+
"epoch": 0.21369712725234627,
|
| 7975 |
+
"grad_norm": 0.24534635245800018,
|
| 7976 |
+
"learning_rate": 0.001,
|
| 7977 |
+
"loss": 2.6293,
|
| 7978 |
+
"num_input_tokens_seen": 11744046656,
|
| 7979 |
+
"step": 44800
|
| 7980 |
+
},
|
| 7981 |
+
{
|
| 7982 |
+
"epoch": 0.2139356285104404,
|
| 7983 |
+
"grad_norm": 0.2439020723104477,
|
| 7984 |
+
"learning_rate": 0.001,
|
| 7985 |
+
"loss": 2.635,
|
| 7986 |
+
"num_input_tokens_seen": 11757153856,
|
| 7987 |
+
"step": 44850
|
| 7988 |
+
},
|
| 7989 |
+
{
|
| 7990 |
+
"epoch": 0.21417412976853453,
|
| 7991 |
+
"grad_norm": 0.24259154498577118,
|
| 7992 |
+
"learning_rate": 0.001,
|
| 7993 |
+
"loss": 2.6232,
|
| 7994 |
+
"num_input_tokens_seen": 11770261056,
|
| 7995 |
+
"step": 44900
|
| 7996 |
+
},
|
| 7997 |
+
{
|
| 7998 |
+
"epoch": 0.21441263102662866,
|
| 7999 |
+
"grad_norm": 0.23554636538028717,
|
| 8000 |
+
"learning_rate": 0.001,
|
| 8001 |
+
"loss": 2.6061,
|
| 8002 |
+
"num_input_tokens_seen": 11783368256,
|
| 8003 |
+
"step": 44950
|
| 8004 |
+
},
|
| 8005 |
+
{
|
| 8006 |
+
"epoch": 0.2146511322847228,
|
| 8007 |
+
"grad_norm": 0.20377275347709656,
|
| 8008 |
+
"learning_rate": 0.001,
|
| 8009 |
+
"loss": 2.6156,
|
| 8010 |
+
"num_input_tokens_seen": 11796475456,
|
| 8011 |
+
"step": 45000
|
| 8012 |
+
},
|
| 8013 |
+
{
|
| 8014 |
+
"epoch": 0.2146511322847228,
|
| 8015 |
+
"eval_loss": 2.503781318664551,
|
| 8016 |
+
"eval_runtime": 51.1656,
|
| 8017 |
+
"eval_samples_per_second": 97.722,
|
| 8018 |
+
"eval_steps_per_second": 24.43,
|
| 8019 |
+
"num_input_tokens_seen": 11796475456,
|
| 8020 |
+
"step": 45000
|
| 8021 |
}
|
| 8022 |
],
|
| 8023 |
"logging_steps": 50,
|
| 8024 |
"max_steps": 70000,
|
| 8025 |
+
"num_input_tokens_seen": 11796475456,
|
| 8026 |
"num_train_epochs": 1,
|
| 8027 |
"save_steps": 1000,
|
| 8028 |
"stateful_callbacks": {
|
|
|
|
| 8037 |
"attributes": {}
|
| 8038 |
}
|
| 8039 |
},
|
| 8040 |
+
"total_flos": 3.1556723180804506e+18,
|
| 8041 |
"train_batch_size": 64,
|
| 8042 |
"trial_name": null,
|
| 8043 |
"trial_params": null
|