Training in progress, step 42000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 301235464
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b793c31018c10b83151888a761e5fecf881d8cfcf10fe82ad108fb7a30b9cb35
|
| 3 |
size 301235464
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 602335994
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c753061fb3a47402b7408e67c6f3761fca04d13fb94ac46b9adfdfc16d0184d4
|
| 3 |
size 602335994
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9aaf95bbf390f32ec661a712de605a0c816388cfa815f81914058fe6bdabdcd9
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a94a7467707318fda39e274661a096a9de559314c283be40d75a871d8d1d3d18
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -7306,11 +7306,189 @@
|
|
| 7306 |
"eval_steps_per_second": 24.179,
|
| 7307 |
"num_input_tokens_seen": 10747899456,
|
| 7308 |
"step": 41000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7309 |
}
|
| 7310 |
],
|
| 7311 |
"logging_steps": 50,
|
| 7312 |
"max_steps": 70000,
|
| 7313 |
-
"num_input_tokens_seen":
|
| 7314 |
"num_train_epochs": 1,
|
| 7315 |
"save_steps": 1000,
|
| 7316 |
"stateful_callbacks": {
|
|
@@ -7325,7 +7503,7 @@
|
|
| 7325 |
"attributes": {}
|
| 7326 |
}
|
| 7327 |
},
|
| 7328 |
-
"total_flos": 2.
|
| 7329 |
"train_batch_size": 64,
|
| 7330 |
"trial_name": null,
|
| 7331 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.2003410567990746,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 42000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 7306 |
"eval_steps_per_second": 24.179,
|
| 7307 |
"num_input_tokens_seen": 10747899456,
|
| 7308 |
"step": 41000
|
| 7309 |
+
},
|
| 7310 |
+
{
|
| 7311 |
+
"epoch": 0.19580953289528602,
|
| 7312 |
+
"grad_norm": 0.20298945903778076,
|
| 7313 |
+
"learning_rate": 0.001,
|
| 7314 |
+
"loss": 2.6233,
|
| 7315 |
+
"num_input_tokens_seen": 10761006656,
|
| 7316 |
+
"step": 41050
|
| 7317 |
+
},
|
| 7318 |
+
{
|
| 7319 |
+
"epoch": 0.19604803415338015,
|
| 7320 |
+
"grad_norm": 0.2280716896057129,
|
| 7321 |
+
"learning_rate": 0.001,
|
| 7322 |
+
"loss": 2.6427,
|
| 7323 |
+
"num_input_tokens_seen": 10774113856,
|
| 7324 |
+
"step": 41100
|
| 7325 |
+
},
|
| 7326 |
+
{
|
| 7327 |
+
"epoch": 0.19628653541147428,
|
| 7328 |
+
"grad_norm": 0.19223643839359283,
|
| 7329 |
+
"learning_rate": 0.001,
|
| 7330 |
+
"loss": 2.6263,
|
| 7331 |
+
"num_input_tokens_seen": 10787221056,
|
| 7332 |
+
"step": 41150
|
| 7333 |
+
},
|
| 7334 |
+
{
|
| 7335 |
+
"epoch": 0.19652503666956844,
|
| 7336 |
+
"grad_norm": 0.19221842288970947,
|
| 7337 |
+
"learning_rate": 0.001,
|
| 7338 |
+
"loss": 2.6401,
|
| 7339 |
+
"num_input_tokens_seen": 10800328256,
|
| 7340 |
+
"step": 41200
|
| 7341 |
+
},
|
| 7342 |
+
{
|
| 7343 |
+
"epoch": 0.19676353792766257,
|
| 7344 |
+
"grad_norm": 0.19479979574680328,
|
| 7345 |
+
"learning_rate": 0.001,
|
| 7346 |
+
"loss": 2.6269,
|
| 7347 |
+
"num_input_tokens_seen": 10813435456,
|
| 7348 |
+
"step": 41250
|
| 7349 |
+
},
|
| 7350 |
+
{
|
| 7351 |
+
"epoch": 0.1970020391857567,
|
| 7352 |
+
"grad_norm": 0.24501195549964905,
|
| 7353 |
+
"learning_rate": 0.001,
|
| 7354 |
+
"loss": 2.618,
|
| 7355 |
+
"num_input_tokens_seen": 10826542656,
|
| 7356 |
+
"step": 41300
|
| 7357 |
+
},
|
| 7358 |
+
{
|
| 7359 |
+
"epoch": 0.19724054044385084,
|
| 7360 |
+
"grad_norm": 0.1994044929742813,
|
| 7361 |
+
"learning_rate": 0.001,
|
| 7362 |
+
"loss": 2.64,
|
| 7363 |
+
"num_input_tokens_seen": 10839649856,
|
| 7364 |
+
"step": 41350
|
| 7365 |
+
},
|
| 7366 |
+
{
|
| 7367 |
+
"epoch": 0.19747904170194497,
|
| 7368 |
+
"grad_norm": 0.20831650495529175,
|
| 7369 |
+
"learning_rate": 0.001,
|
| 7370 |
+
"loss": 2.6513,
|
| 7371 |
+
"num_input_tokens_seen": 10852757056,
|
| 7372 |
+
"step": 41400
|
| 7373 |
+
},
|
| 7374 |
+
{
|
| 7375 |
+
"epoch": 0.19771754296003913,
|
| 7376 |
+
"grad_norm": 0.21919438242912292,
|
| 7377 |
+
"learning_rate": 0.001,
|
| 7378 |
+
"loss": 2.6379,
|
| 7379 |
+
"num_input_tokens_seen": 10865864256,
|
| 7380 |
+
"step": 41450
|
| 7381 |
+
},
|
| 7382 |
+
{
|
| 7383 |
+
"epoch": 0.19795604421813326,
|
| 7384 |
+
"grad_norm": 0.23088768124580383,
|
| 7385 |
+
"learning_rate": 0.001,
|
| 7386 |
+
"loss": 2.6449,
|
| 7387 |
+
"num_input_tokens_seen": 10878971456,
|
| 7388 |
+
"step": 41500
|
| 7389 |
+
},
|
| 7390 |
+
{
|
| 7391 |
+
"epoch": 0.19795604421813326,
|
| 7392 |
+
"eval_loss": 2.5156567096710205,
|
| 7393 |
+
"eval_runtime": 51.6776,
|
| 7394 |
+
"eval_samples_per_second": 96.754,
|
| 7395 |
+
"eval_steps_per_second": 24.188,
|
| 7396 |
+
"num_input_tokens_seen": 10878971456,
|
| 7397 |
+
"step": 41500
|
| 7398 |
+
},
|
| 7399 |
+
{
|
| 7400 |
+
"epoch": 0.1981945454762274,
|
| 7401 |
+
"grad_norm": 0.1982518881559372,
|
| 7402 |
+
"learning_rate": 0.001,
|
| 7403 |
+
"loss": 2.6304,
|
| 7404 |
+
"num_input_tokens_seen": 10892078656,
|
| 7405 |
+
"step": 41550
|
| 7406 |
+
},
|
| 7407 |
+
{
|
| 7408 |
+
"epoch": 0.19843304673432152,
|
| 7409 |
+
"grad_norm": 0.2099853903055191,
|
| 7410 |
+
"learning_rate": 0.001,
|
| 7411 |
+
"loss": 2.6305,
|
| 7412 |
+
"num_input_tokens_seen": 10905185856,
|
| 7413 |
+
"step": 41600
|
| 7414 |
+
},
|
| 7415 |
+
{
|
| 7416 |
+
"epoch": 0.19867154799241565,
|
| 7417 |
+
"grad_norm": 0.19403131306171417,
|
| 7418 |
+
"learning_rate": 0.001,
|
| 7419 |
+
"loss": 2.6419,
|
| 7420 |
+
"num_input_tokens_seen": 10918293056,
|
| 7421 |
+
"step": 41650
|
| 7422 |
+
},
|
| 7423 |
+
{
|
| 7424 |
+
"epoch": 0.19891004925050979,
|
| 7425 |
+
"grad_norm": 0.20865993201732635,
|
| 7426 |
+
"learning_rate": 0.001,
|
| 7427 |
+
"loss": 2.6116,
|
| 7428 |
+
"num_input_tokens_seen": 10931400256,
|
| 7429 |
+
"step": 41700
|
| 7430 |
+
},
|
| 7431 |
+
{
|
| 7432 |
+
"epoch": 0.19914855050860394,
|
| 7433 |
+
"grad_norm": 0.19042626023292542,
|
| 7434 |
+
"learning_rate": 0.001,
|
| 7435 |
+
"loss": 2.6271,
|
| 7436 |
+
"num_input_tokens_seen": 10944507456,
|
| 7437 |
+
"step": 41750
|
| 7438 |
+
},
|
| 7439 |
+
{
|
| 7440 |
+
"epoch": 0.19938705176669808,
|
| 7441 |
+
"grad_norm": 0.20514579117298126,
|
| 7442 |
+
"learning_rate": 0.001,
|
| 7443 |
+
"loss": 2.6348,
|
| 7444 |
+
"num_input_tokens_seen": 10957614656,
|
| 7445 |
+
"step": 41800
|
| 7446 |
+
},
|
| 7447 |
+
{
|
| 7448 |
+
"epoch": 0.1996255530247922,
|
| 7449 |
+
"grad_norm": 0.21224668622016907,
|
| 7450 |
+
"learning_rate": 0.001,
|
| 7451 |
+
"loss": 2.6314,
|
| 7452 |
+
"num_input_tokens_seen": 10970721856,
|
| 7453 |
+
"step": 41850
|
| 7454 |
+
},
|
| 7455 |
+
{
|
| 7456 |
+
"epoch": 0.19986405428288634,
|
| 7457 |
+
"grad_norm": 0.18857082724571228,
|
| 7458 |
+
"learning_rate": 0.001,
|
| 7459 |
+
"loss": 2.6217,
|
| 7460 |
+
"num_input_tokens_seen": 10983829056,
|
| 7461 |
+
"step": 41900
|
| 7462 |
+
},
|
| 7463 |
+
{
|
| 7464 |
+
"epoch": 0.20010255554098047,
|
| 7465 |
+
"grad_norm": 0.18431074917316437,
|
| 7466 |
+
"learning_rate": 0.001,
|
| 7467 |
+
"loss": 2.6267,
|
| 7468 |
+
"num_input_tokens_seen": 10996936256,
|
| 7469 |
+
"step": 41950
|
| 7470 |
+
},
|
| 7471 |
+
{
|
| 7472 |
+
"epoch": 0.2003410567990746,
|
| 7473 |
+
"grad_norm": 0.20570099353790283,
|
| 7474 |
+
"learning_rate": 0.001,
|
| 7475 |
+
"loss": 2.6016,
|
| 7476 |
+
"num_input_tokens_seen": 11010043456,
|
| 7477 |
+
"step": 42000
|
| 7478 |
+
},
|
| 7479 |
+
{
|
| 7480 |
+
"epoch": 0.2003410567990746,
|
| 7481 |
+
"eval_loss": 2.506241798400879,
|
| 7482 |
+
"eval_runtime": 51.5548,
|
| 7483 |
+
"eval_samples_per_second": 96.984,
|
| 7484 |
+
"eval_steps_per_second": 24.246,
|
| 7485 |
+
"num_input_tokens_seen": 11010043456,
|
| 7486 |
+
"step": 42000
|
| 7487 |
}
|
| 7488 |
],
|
| 7489 |
"logging_steps": 50,
|
| 7490 |
"max_steps": 70000,
|
| 7491 |
+
"num_input_tokens_seen": 11010043456,
|
| 7492 |
"num_train_epochs": 1,
|
| 7493 |
"save_steps": 1000,
|
| 7494 |
"stateful_callbacks": {
|
|
|
|
| 7503 |
"attributes": {}
|
| 7504 |
}
|
| 7505 |
},
|
| 7506 |
+
"total_flos": 2.9452940825041306e+18,
|
| 7507 |
"train_batch_size": 64,
|
| 7508 |
"trial_name": null,
|
| 7509 |
"trial_params": null
|