Training in progress, step 41000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 301235464
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7fb18d4c27c64f6607996dc76ab059b3274f96bf50194e20861ca91446bac906
|
| 3 |
size 301235464
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 602335994
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f4a71156c2d2f2da1c265821c7ca99486fbc72cc466c418215c7150c425f5836
|
| 3 |
size 602335994
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:028c63076d3d8e5d0c73e4da1b6fc8793d1c56810af68c19f7f253b3016ce7ac
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9bafdd2692f3ffed299379761090a99347b59a938d0713ea16130141db6dd54e
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -7128,11 +7128,189 @@
|
|
| 7128 |
"eval_steps_per_second": 24.51,
|
| 7129 |
"num_input_tokens_seen": 10485755456,
|
| 7130 |
"step": 40000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7131 |
}
|
| 7132 |
],
|
| 7133 |
"logging_steps": 50,
|
| 7134 |
"max_steps": 70000,
|
| 7135 |
-
"num_input_tokens_seen":
|
| 7136 |
"num_train_epochs": 1,
|
| 7137 |
"save_steps": 1000,
|
| 7138 |
"stateful_callbacks": {
|
|
@@ -7147,7 +7325,7 @@
|
|
| 7147 |
"attributes": {}
|
| 7148 |
}
|
| 7149 |
},
|
| 7150 |
-
"total_flos": 2.
|
| 7151 |
"train_batch_size": 64,
|
| 7152 |
"trial_name": null,
|
| 7153 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.1955710316371919,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 41000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 7128 |
"eval_steps_per_second": 24.51,
|
| 7129 |
"num_input_tokens_seen": 10485755456,
|
| 7130 |
"step": 40000
|
| 7131 |
+
},
|
| 7132 |
+
{
|
| 7133 |
+
"epoch": 0.19103950773340328,
|
| 7134 |
+
"grad_norm": 0.18717694282531738,
|
| 7135 |
+
"learning_rate": 0.001,
|
| 7136 |
+
"loss": 2.6512,
|
| 7137 |
+
"num_input_tokens_seen": 10498862656,
|
| 7138 |
+
"step": 40050
|
| 7139 |
+
},
|
| 7140 |
+
{
|
| 7141 |
+
"epoch": 0.19127800899149744,
|
| 7142 |
+
"grad_norm": 0.2009858638048172,
|
| 7143 |
+
"learning_rate": 0.001,
|
| 7144 |
+
"loss": 2.6289,
|
| 7145 |
+
"num_input_tokens_seen": 10511969856,
|
| 7146 |
+
"step": 40100
|
| 7147 |
+
},
|
| 7148 |
+
{
|
| 7149 |
+
"epoch": 0.19151651024959157,
|
| 7150 |
+
"grad_norm": 0.2515949010848999,
|
| 7151 |
+
"learning_rate": 0.001,
|
| 7152 |
+
"loss": 2.6342,
|
| 7153 |
+
"num_input_tokens_seen": 10525077056,
|
| 7154 |
+
"step": 40150
|
| 7155 |
+
},
|
| 7156 |
+
{
|
| 7157 |
+
"epoch": 0.1917550115076857,
|
| 7158 |
+
"grad_norm": 0.19864948093891144,
|
| 7159 |
+
"learning_rate": 0.001,
|
| 7160 |
+
"loss": 2.6191,
|
| 7161 |
+
"num_input_tokens_seen": 10538184256,
|
| 7162 |
+
"step": 40200
|
| 7163 |
+
},
|
| 7164 |
+
{
|
| 7165 |
+
"epoch": 0.19199351276577983,
|
| 7166 |
+
"grad_norm": 0.17704185843467712,
|
| 7167 |
+
"learning_rate": 0.001,
|
| 7168 |
+
"loss": 2.6176,
|
| 7169 |
+
"num_input_tokens_seen": 10551291456,
|
| 7170 |
+
"step": 40250
|
| 7171 |
+
},
|
| 7172 |
+
{
|
| 7173 |
+
"epoch": 0.19223201402387396,
|
| 7174 |
+
"grad_norm": 0.2097242772579193,
|
| 7175 |
+
"learning_rate": 0.001,
|
| 7176 |
+
"loss": 2.6509,
|
| 7177 |
+
"num_input_tokens_seen": 10564398656,
|
| 7178 |
+
"step": 40300
|
| 7179 |
+
},
|
| 7180 |
+
{
|
| 7181 |
+
"epoch": 0.19247051528196812,
|
| 7182 |
+
"grad_norm": 0.18630579113960266,
|
| 7183 |
+
"learning_rate": 0.001,
|
| 7184 |
+
"loss": 2.6273,
|
| 7185 |
+
"num_input_tokens_seen": 10577505856,
|
| 7186 |
+
"step": 40350
|
| 7187 |
+
},
|
| 7188 |
+
{
|
| 7189 |
+
"epoch": 0.19270901654006226,
|
| 7190 |
+
"grad_norm": 0.24162743985652924,
|
| 7191 |
+
"learning_rate": 0.001,
|
| 7192 |
+
"loss": 2.6405,
|
| 7193 |
+
"num_input_tokens_seen": 10590613056,
|
| 7194 |
+
"step": 40400
|
| 7195 |
+
},
|
| 7196 |
+
{
|
| 7197 |
+
"epoch": 0.1929475177981564,
|
| 7198 |
+
"grad_norm": 0.19576874375343323,
|
| 7199 |
+
"learning_rate": 0.001,
|
| 7200 |
+
"loss": 2.6403,
|
| 7201 |
+
"num_input_tokens_seen": 10603720256,
|
| 7202 |
+
"step": 40450
|
| 7203 |
+
},
|
| 7204 |
+
{
|
| 7205 |
+
"epoch": 0.19318601905625052,
|
| 7206 |
+
"grad_norm": 0.18408045172691345,
|
| 7207 |
+
"learning_rate": 0.001,
|
| 7208 |
+
"loss": 2.6149,
|
| 7209 |
+
"num_input_tokens_seen": 10616827456,
|
| 7210 |
+
"step": 40500
|
| 7211 |
+
},
|
| 7212 |
+
{
|
| 7213 |
+
"epoch": 0.19318601905625052,
|
| 7214 |
+
"eval_loss": 2.511899709701538,
|
| 7215 |
+
"eval_runtime": 51.5326,
|
| 7216 |
+
"eval_samples_per_second": 97.026,
|
| 7217 |
+
"eval_steps_per_second": 24.257,
|
| 7218 |
+
"num_input_tokens_seen": 10616827456,
|
| 7219 |
+
"step": 40500
|
| 7220 |
+
},
|
| 7221 |
+
{
|
| 7222 |
+
"epoch": 0.19342452031434465,
|
| 7223 |
+
"grad_norm": 0.20845313370227814,
|
| 7224 |
+
"learning_rate": 0.001,
|
| 7225 |
+
"loss": 2.6242,
|
| 7226 |
+
"num_input_tokens_seen": 10629934656,
|
| 7227 |
+
"step": 40550
|
| 7228 |
+
},
|
| 7229 |
+
{
|
| 7230 |
+
"epoch": 0.19366302157243878,
|
| 7231 |
+
"grad_norm": 0.20603816211223602,
|
| 7232 |
+
"learning_rate": 0.001,
|
| 7233 |
+
"loss": 2.6305,
|
| 7234 |
+
"num_input_tokens_seen": 10643041856,
|
| 7235 |
+
"step": 40600
|
| 7236 |
+
},
|
| 7237 |
+
{
|
| 7238 |
+
"epoch": 0.19390152283053294,
|
| 7239 |
+
"grad_norm": 0.2180013507604599,
|
| 7240 |
+
"learning_rate": 0.001,
|
| 7241 |
+
"loss": 2.6271,
|
| 7242 |
+
"num_input_tokens_seen": 10656149056,
|
| 7243 |
+
"step": 40650
|
| 7244 |
+
},
|
| 7245 |
+
{
|
| 7246 |
+
"epoch": 0.19414002408862707,
|
| 7247 |
+
"grad_norm": 0.22217005491256714,
|
| 7248 |
+
"learning_rate": 0.001,
|
| 7249 |
+
"loss": 2.6407,
|
| 7250 |
+
"num_input_tokens_seen": 10669256256,
|
| 7251 |
+
"step": 40700
|
| 7252 |
+
},
|
| 7253 |
+
{
|
| 7254 |
+
"epoch": 0.1943785253467212,
|
| 7255 |
+
"grad_norm": 0.21379347145557404,
|
| 7256 |
+
"learning_rate": 0.001,
|
| 7257 |
+
"loss": 2.6209,
|
| 7258 |
+
"num_input_tokens_seen": 10682363456,
|
| 7259 |
+
"step": 40750
|
| 7260 |
+
},
|
| 7261 |
+
{
|
| 7262 |
+
"epoch": 0.19461702660481534,
|
| 7263 |
+
"grad_norm": 0.2011626958847046,
|
| 7264 |
+
"learning_rate": 0.001,
|
| 7265 |
+
"loss": 2.6471,
|
| 7266 |
+
"num_input_tokens_seen": 10695470656,
|
| 7267 |
+
"step": 40800
|
| 7268 |
+
},
|
| 7269 |
+
{
|
| 7270 |
+
"epoch": 0.19485552786290947,
|
| 7271 |
+
"grad_norm": 0.1946493685245514,
|
| 7272 |
+
"learning_rate": 0.001,
|
| 7273 |
+
"loss": 2.6267,
|
| 7274 |
+
"num_input_tokens_seen": 10708577856,
|
| 7275 |
+
"step": 40850
|
| 7276 |
+
},
|
| 7277 |
+
{
|
| 7278 |
+
"epoch": 0.19509402912100363,
|
| 7279 |
+
"grad_norm": 0.19157454371452332,
|
| 7280 |
+
"learning_rate": 0.001,
|
| 7281 |
+
"loss": 2.6362,
|
| 7282 |
+
"num_input_tokens_seen": 10721685056,
|
| 7283 |
+
"step": 40900
|
| 7284 |
+
},
|
| 7285 |
+
{
|
| 7286 |
+
"epoch": 0.19533253037909776,
|
| 7287 |
+
"grad_norm": 0.1978122442960739,
|
| 7288 |
+
"learning_rate": 0.001,
|
| 7289 |
+
"loss": 2.6448,
|
| 7290 |
+
"num_input_tokens_seen": 10734792256,
|
| 7291 |
+
"step": 40950
|
| 7292 |
+
},
|
| 7293 |
+
{
|
| 7294 |
+
"epoch": 0.1955710316371919,
|
| 7295 |
+
"grad_norm": 0.19996555149555206,
|
| 7296 |
+
"learning_rate": 0.001,
|
| 7297 |
+
"loss": 2.626,
|
| 7298 |
+
"num_input_tokens_seen": 10747899456,
|
| 7299 |
+
"step": 41000
|
| 7300 |
+
},
|
| 7301 |
+
{
|
| 7302 |
+
"epoch": 0.1955710316371919,
|
| 7303 |
+
"eval_loss": 2.5084941387176514,
|
| 7304 |
+
"eval_runtime": 51.6987,
|
| 7305 |
+
"eval_samples_per_second": 96.714,
|
| 7306 |
+
"eval_steps_per_second": 24.179,
|
| 7307 |
+
"num_input_tokens_seen": 10747899456,
|
| 7308 |
+
"step": 41000
|
| 7309 |
}
|
| 7310 |
],
|
| 7311 |
"logging_steps": 50,
|
| 7312 |
"max_steps": 70000,
|
| 7313 |
+
"num_input_tokens_seen": 10747899456,
|
| 7314 |
"num_train_epochs": 1,
|
| 7315 |
"save_steps": 1000,
|
| 7316 |
"stateful_callbacks": {
|
|
|
|
| 7325 |
"attributes": {}
|
| 7326 |
}
|
| 7327 |
},
|
| 7328 |
+
"total_flos": 2.8751680039786906e+18,
|
| 7329 |
"train_batch_size": 64,
|
| 7330 |
"trial_name": null,
|
| 7331 |
"trial_params": null
|