Training in progress, step 158000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1410301944
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:416215f0149ccdc8795d454a743c8eca2679e864ee093c2c92b9a7dc89d715bb
|
| 3 |
size 1410301944
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 2820185786
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f007001464c0735c2a1e81f89ec3fb177e1babb5501510e4e9c8bbe36278a009
|
| 3 |
size 2820185786
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:09a1e45eb7b9bd5bee8831d08f28097d6e76d93bacd9f185db082ca81501cddf
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:30aae8d205cf2e798817f5244eb6202efaefc5672b52769c357c6911e1e312c2
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 3.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -27954,11 +27954,189 @@
|
|
| 27954 |
"eval_steps_per_second": 15.527,
|
| 27955 |
"num_input_tokens_seen": 91213822304,
|
| 27956 |
"step": 157000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27957 |
}
|
| 27958 |
],
|
| 27959 |
"logging_steps": 50,
|
| 27960 |
"max_steps": 200000,
|
| 27961 |
-
"num_input_tokens_seen":
|
| 27962 |
"num_train_epochs": 5,
|
| 27963 |
"save_steps": 1000,
|
| 27964 |
"stateful_callbacks": {
|
|
@@ -27973,7 +28151,7 @@
|
|
| 27973 |
"attributes": {}
|
| 27974 |
}
|
| 27975 |
},
|
| 27976 |
-
"total_flos": 1.
|
| 27977 |
"train_batch_size": 32,
|
| 27978 |
"trial_name": null,
|
| 27979 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 3.4706886977079106,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 158000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 27954 |
"eval_steps_per_second": 15.527,
|
| 27955 |
"num_input_tokens_seen": 91213822304,
|
| 27956 |
"step": 157000
|
| 27957 |
+
},
|
| 27958 |
+
{
|
| 27959 |
+
"epoch": 3.4498209423027157,
|
| 27960 |
+
"grad_norm": 0.09102839231491089,
|
| 27961 |
+
"learning_rate": 0.0001,
|
| 27962 |
+
"loss": 2.3407,
|
| 27963 |
+
"num_input_tokens_seen": 91266251104,
|
| 27964 |
+
"step": 157050
|
| 27965 |
+
},
|
| 27966 |
+
{
|
| 27967 |
+
"epoch": 3.4509192452187785,
|
| 27968 |
+
"grad_norm": 0.09813258051872253,
|
| 27969 |
+
"learning_rate": 0.0001,
|
| 27970 |
+
"loss": 2.3461,
|
| 27971 |
+
"num_input_tokens_seen": 91318679904,
|
| 27972 |
+
"step": 157100
|
| 27973 |
+
},
|
| 27974 |
+
{
|
| 27975 |
+
"epoch": 3.452017548134841,
|
| 27976 |
+
"grad_norm": 0.09532950073480606,
|
| 27977 |
+
"learning_rate": 0.0001,
|
| 27978 |
+
"loss": 2.3457,
|
| 27979 |
+
"num_input_tokens_seen": 91371108704,
|
| 27980 |
+
"step": 157150
|
| 27981 |
+
},
|
| 27982 |
+
{
|
| 27983 |
+
"epoch": 3.4531158510509044,
|
| 27984 |
+
"grad_norm": 0.10110923647880554,
|
| 27985 |
+
"learning_rate": 0.0001,
|
| 27986 |
+
"loss": 2.3426,
|
| 27987 |
+
"num_input_tokens_seen": 91423537504,
|
| 27988 |
+
"step": 157200
|
| 27989 |
+
},
|
| 27990 |
+
{
|
| 27991 |
+
"epoch": 3.454214153966967,
|
| 27992 |
+
"grad_norm": 0.09686494618654251,
|
| 27993 |
+
"learning_rate": 0.0001,
|
| 27994 |
+
"loss": 2.342,
|
| 27995 |
+
"num_input_tokens_seen": 91475959552,
|
| 27996 |
+
"step": 157250
|
| 27997 |
+
},
|
| 27998 |
+
{
|
| 27999 |
+
"epoch": 3.4553124568830302,
|
| 28000 |
+
"grad_norm": 0.09327523410320282,
|
| 28001 |
+
"learning_rate": 0.0001,
|
| 28002 |
+
"loss": 2.3477,
|
| 28003 |
+
"num_input_tokens_seen": 91528385536,
|
| 28004 |
+
"step": 157300
|
| 28005 |
+
},
|
| 28006 |
+
{
|
| 28007 |
+
"epoch": 3.456410759799093,
|
| 28008 |
+
"grad_norm": 0.10524465143680573,
|
| 28009 |
+
"learning_rate": 0.0001,
|
| 28010 |
+
"loss": 2.351,
|
| 28011 |
+
"num_input_tokens_seen": 91580812064,
|
| 28012 |
+
"step": 157350
|
| 28013 |
+
},
|
| 28014 |
+
{
|
| 28015 |
+
"epoch": 3.4575090627151557,
|
| 28016 |
+
"grad_norm": 0.08858100324869156,
|
| 28017 |
+
"learning_rate": 0.0001,
|
| 28018 |
+
"loss": 2.3443,
|
| 28019 |
+
"num_input_tokens_seen": 91633240864,
|
| 28020 |
+
"step": 157400
|
| 28021 |
+
},
|
| 28022 |
+
{
|
| 28023 |
+
"epoch": 3.458607365631219,
|
| 28024 |
+
"grad_norm": 0.0905861109495163,
|
| 28025 |
+
"learning_rate": 0.0001,
|
| 28026 |
+
"loss": 2.338,
|
| 28027 |
+
"num_input_tokens_seen": 91685669664,
|
| 28028 |
+
"step": 157450
|
| 28029 |
+
},
|
| 28030 |
+
{
|
| 28031 |
+
"epoch": 3.4597056685472816,
|
| 28032 |
+
"grad_norm": 0.0902877077460289,
|
| 28033 |
+
"learning_rate": 0.0001,
|
| 28034 |
+
"loss": 2.3447,
|
| 28035 |
+
"num_input_tokens_seen": 91738098464,
|
| 28036 |
+
"step": 157500
|
| 28037 |
+
},
|
| 28038 |
+
{
|
| 28039 |
+
"epoch": 3.4597056685472816,
|
| 28040 |
+
"eval_loss": 2.257868766784668,
|
| 28041 |
+
"eval_runtime": 80.702,
|
| 28042 |
+
"eval_samples_per_second": 61.956,
|
| 28043 |
+
"eval_steps_per_second": 15.489,
|
| 28044 |
+
"num_input_tokens_seen": 91738098464,
|
| 28045 |
+
"step": 157500
|
| 28046 |
+
},
|
| 28047 |
+
{
|
| 28048 |
+
"epoch": 3.4608039714633447,
|
| 28049 |
+
"grad_norm": 0.09488774091005325,
|
| 28050 |
+
"learning_rate": 0.0001,
|
| 28051 |
+
"loss": 2.348,
|
| 28052 |
+
"num_input_tokens_seen": 91790527264,
|
| 28053 |
+
"step": 157550
|
| 28054 |
+
},
|
| 28055 |
+
{
|
| 28056 |
+
"epoch": 3.4619022743794075,
|
| 28057 |
+
"grad_norm": 0.09437818825244904,
|
| 28058 |
+
"learning_rate": 0.0001,
|
| 28059 |
+
"loss": 2.3366,
|
| 28060 |
+
"num_input_tokens_seen": 91842955104,
|
| 28061 |
+
"step": 157600
|
| 28062 |
+
},
|
| 28063 |
+
{
|
| 28064 |
+
"epoch": 3.46300057729547,
|
| 28065 |
+
"grad_norm": 0.09216772764921188,
|
| 28066 |
+
"learning_rate": 0.0001,
|
| 28067 |
+
"loss": 2.3455,
|
| 28068 |
+
"num_input_tokens_seen": 91895383904,
|
| 28069 |
+
"step": 157650
|
| 28070 |
+
},
|
| 28071 |
+
{
|
| 28072 |
+
"epoch": 3.464098880211533,
|
| 28073 |
+
"grad_norm": 0.0893646627664566,
|
| 28074 |
+
"learning_rate": 0.0001,
|
| 28075 |
+
"loss": 2.3436,
|
| 28076 |
+
"num_input_tokens_seen": 91947812704,
|
| 28077 |
+
"step": 157700
|
| 28078 |
+
},
|
| 28079 |
+
{
|
| 28080 |
+
"epoch": 3.465197183127596,
|
| 28081 |
+
"grad_norm": 0.10555808991193771,
|
| 28082 |
+
"learning_rate": 0.0001,
|
| 28083 |
+
"loss": 2.3407,
|
| 28084 |
+
"num_input_tokens_seen": 92000241504,
|
| 28085 |
+
"step": 157750
|
| 28086 |
+
},
|
| 28087 |
+
{
|
| 28088 |
+
"epoch": 3.466295486043659,
|
| 28089 |
+
"grad_norm": 0.09263647347688675,
|
| 28090 |
+
"learning_rate": 0.0001,
|
| 28091 |
+
"loss": 2.3455,
|
| 28092 |
+
"num_input_tokens_seen": 92052667776,
|
| 28093 |
+
"step": 157800
|
| 28094 |
+
},
|
| 28095 |
+
{
|
| 28096 |
+
"epoch": 3.467393788959722,
|
| 28097 |
+
"grad_norm": 0.09790777415037155,
|
| 28098 |
+
"learning_rate": 0.0001,
|
| 28099 |
+
"loss": 2.3426,
|
| 28100 |
+
"num_input_tokens_seen": 92105094528,
|
| 28101 |
+
"step": 157850
|
| 28102 |
+
},
|
| 28103 |
+
{
|
| 28104 |
+
"epoch": 3.4684920918757847,
|
| 28105 |
+
"grad_norm": 0.0883532464504242,
|
| 28106 |
+
"learning_rate": 0.0001,
|
| 28107 |
+
"loss": 2.3453,
|
| 28108 |
+
"num_input_tokens_seen": 92157523328,
|
| 28109 |
+
"step": 157900
|
| 28110 |
+
},
|
| 28111 |
+
{
|
| 28112 |
+
"epoch": 3.4695903947918474,
|
| 28113 |
+
"grad_norm": 0.09379395842552185,
|
| 28114 |
+
"learning_rate": 0.0001,
|
| 28115 |
+
"loss": 2.3452,
|
| 28116 |
+
"num_input_tokens_seen": 92209949248,
|
| 28117 |
+
"step": 157950
|
| 28118 |
+
},
|
| 28119 |
+
{
|
| 28120 |
+
"epoch": 3.4706886977079106,
|
| 28121 |
+
"grad_norm": 0.09533659368753433,
|
| 28122 |
+
"learning_rate": 0.0001,
|
| 28123 |
+
"loss": 2.3454,
|
| 28124 |
+
"num_input_tokens_seen": 92262378048,
|
| 28125 |
+
"step": 158000
|
| 28126 |
+
},
|
| 28127 |
+
{
|
| 28128 |
+
"epoch": 3.4706886977079106,
|
| 28129 |
+
"eval_loss": 2.258249521255493,
|
| 28130 |
+
"eval_runtime": 80.6484,
|
| 28131 |
+
"eval_samples_per_second": 61.998,
|
| 28132 |
+
"eval_steps_per_second": 15.499,
|
| 28133 |
+
"num_input_tokens_seen": 92262378048,
|
| 28134 |
+
"step": 158000
|
| 28135 |
}
|
| 28136 |
],
|
| 28137 |
"logging_steps": 50,
|
| 28138 |
"max_steps": 200000,
|
| 28139 |
+
"num_input_tokens_seen": 92262378048,
|
| 28140 |
"num_train_epochs": 5,
|
| 28141 |
"save_steps": 1000,
|
| 28142 |
"stateful_callbacks": {
|
|
|
|
| 28151 |
"attributes": {}
|
| 28152 |
}
|
| 28153 |
},
|
| 28154 |
+
"total_flos": 1.632875804872041e+20,
|
| 28155 |
"train_batch_size": 32,
|
| 28156 |
"trial_name": null,
|
| 28157 |
"trial_params": null
|