Training in progress, step 127000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1410301944
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f9d1695bc9de636b5aaeaf2dd7d5f58cbc5a682eb69ac9b38095e92d54ec5937
|
| 3 |
size 1410301944
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 2820185786
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4dc4491cbd42db47871ad0a656d153441e2ea2d0c5e68c9fdfe29f91fdedede3
|
| 3 |
size 2820185786
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8542f0951d699465323349728bdecbda5c5f0e8274e699cbba04806de2fddeeb
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:27a3a16e476801029c30325a569467f804e448c3ecc89accd2bd78b3749ec27f
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 1.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -22436,11 +22436,189 @@
|
|
| 22436 |
"eval_steps_per_second": 15.278,
|
| 22437 |
"num_input_tokens_seen": 66049692768,
|
| 22438 |
"step": 126000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22439 |
}
|
| 22440 |
],
|
| 22441 |
"logging_steps": 50,
|
| 22442 |
"max_steps": 140000,
|
| 22443 |
-
"num_input_tokens_seen":
|
| 22444 |
"num_train_epochs": 2,
|
| 22445 |
"save_steps": 1000,
|
| 22446 |
"stateful_callbacks": {
|
|
@@ -22455,7 +22633,7 @@
|
|
| 22455 |
"attributes": {}
|
| 22456 |
}
|
| 22457 |
},
|
| 22458 |
-
"total_flos": 1.
|
| 22459 |
"train_batch_size": 32,
|
| 22460 |
"trial_name": null,
|
| 22461 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 1.2115887761307942,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 127000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 22436 |
"eval_steps_per_second": 15.278,
|
| 22437 |
"num_input_tokens_seen": 66049692768,
|
| 22438 |
"step": 126000
|
| 22439 |
+
},
|
| 22440 |
+
{
|
| 22441 |
+
"epoch": 1.202525728323217,
|
| 22442 |
+
"grad_norm": 0.1393454372882843,
|
| 22443 |
+
"learning_rate": 0.0004971950212723752,
|
| 22444 |
+
"loss": 2.0569,
|
| 22445 |
+
"num_input_tokens_seen": 66075907072,
|
| 22446 |
+
"step": 126050
|
| 22447 |
+
},
|
| 22448 |
+
{
|
| 22449 |
+
"epoch": 1.2030027308394051,
|
| 22450 |
+
"grad_norm": 0.1390795111656189,
|
| 22451 |
+
"learning_rate": 0.0004943901308226771,
|
| 22452 |
+
"loss": 2.0579,
|
| 22453 |
+
"num_input_tokens_seen": 66102120320,
|
| 22454 |
+
"step": 126100
|
| 22455 |
+
},
|
| 22456 |
+
{
|
| 22457 |
+
"epoch": 1.2034797333555933,
|
| 22458 |
+
"grad_norm": 0.136804461479187,
|
| 22459 |
+
"learning_rate": 0.0004915854169260539,
|
| 22460 |
+
"loss": 2.0594,
|
| 22461 |
+
"num_input_tokens_seen": 66128330880,
|
| 22462 |
+
"step": 126150
|
| 22463 |
+
},
|
| 22464 |
+
{
|
| 22465 |
+
"epoch": 1.2039567358717818,
|
| 22466 |
+
"grad_norm": 0.14418946206569672,
|
| 22467 |
+
"learning_rate": 0.0004887809678520976,
|
| 22468 |
+
"loss": 2.0521,
|
| 22469 |
+
"num_input_tokens_seen": 66154537216,
|
| 22470 |
+
"step": 126200
|
| 22471 |
+
},
|
| 22472 |
+
{
|
| 22473 |
+
"epoch": 1.20443373838797,
|
| 22474 |
+
"grad_norm": 0.1406649798154831,
|
| 22475 |
+
"learning_rate": 0.00048597687186206556,
|
| 22476 |
+
"loss": 2.0604,
|
| 22477 |
+
"num_input_tokens_seen": 66180744192,
|
| 22478 |
+
"step": 126250
|
| 22479 |
+
},
|
| 22480 |
+
{
|
| 22481 |
+
"epoch": 1.2049107409041582,
|
| 22482 |
+
"grad_norm": 0.13004782795906067,
|
| 22483 |
+
"learning_rate": 0.0004831732172061032,
|
| 22484 |
+
"loss": 2.0633,
|
| 22485 |
+
"num_input_tokens_seen": 66206951232,
|
| 22486 |
+
"step": 126300
|
| 22487 |
+
},
|
| 22488 |
+
{
|
| 22489 |
+
"epoch": 1.2053877434203466,
|
| 22490 |
+
"grad_norm": 0.1319655478000641,
|
| 22491 |
+
"learning_rate": 0.00048037009212046586,
|
| 22492 |
+
"loss": 2.0609,
|
| 22493 |
+
"num_input_tokens_seen": 66233151744,
|
| 22494 |
+
"step": 126350
|
| 22495 |
+
},
|
| 22496 |
+
{
|
| 22497 |
+
"epoch": 1.2058647459365348,
|
| 22498 |
+
"grad_norm": 0.13051386177539825,
|
| 22499 |
+
"learning_rate": 0.0004775675848247427,
|
| 22500 |
+
"loss": 2.0591,
|
| 22501 |
+
"num_input_tokens_seen": 66259358592,
|
| 22502 |
+
"step": 126400
|
| 22503 |
+
},
|
| 22504 |
+
{
|
| 22505 |
+
"epoch": 1.206341748452723,
|
| 22506 |
+
"grad_norm": 0.12983474135398865,
|
| 22507 |
+
"learning_rate": 0.0004747657835190795,
|
| 22508 |
+
"loss": 2.0571,
|
| 22509 |
+
"num_input_tokens_seen": 66285559520,
|
| 22510 |
+
"step": 126450
|
| 22511 |
+
},
|
| 22512 |
+
{
|
| 22513 |
+
"epoch": 1.2068187509689114,
|
| 22514 |
+
"grad_norm": 0.12744031846523285,
|
| 22515 |
+
"learning_rate": 0.00047196477638140405,
|
| 22516 |
+
"loss": 2.0581,
|
| 22517 |
+
"num_input_tokens_seen": 66311770112,
|
| 22518 |
+
"step": 126500
|
| 22519 |
+
},
|
| 22520 |
+
{
|
| 22521 |
+
"epoch": 1.2068187509689114,
|
| 22522 |
+
"eval_loss": 1.9767038822174072,
|
| 22523 |
+
"eval_runtime": 82.0094,
|
| 22524 |
+
"eval_samples_per_second": 60.969,
|
| 22525 |
+
"eval_steps_per_second": 15.242,
|
| 22526 |
+
"num_input_tokens_seen": 66311770112,
|
| 22527 |
+
"step": 126500
|
| 22528 |
+
},
|
| 22529 |
+
{
|
| 22530 |
+
"epoch": 1.2072957534850997,
|
| 22531 |
+
"grad_norm": 0.13606679439544678,
|
| 22532 |
+
"learning_rate": 0.00046916465156464924,
|
| 22533 |
+
"loss": 2.062,
|
| 22534 |
+
"num_input_tokens_seen": 66337979200,
|
| 22535 |
+
"step": 126550
|
| 22536 |
+
},
|
| 22537 |
+
{
|
| 22538 |
+
"epoch": 1.2077727560012879,
|
| 22539 |
+
"grad_norm": 0.12876896560192108,
|
| 22540 |
+
"learning_rate": 0.0004663654971939802,
|
| 22541 |
+
"loss": 2.0627,
|
| 22542 |
+
"num_input_tokens_seen": 66364192640,
|
| 22543 |
+
"step": 126600
|
| 22544 |
+
},
|
| 22545 |
+
{
|
| 22546 |
+
"epoch": 1.2082497585174763,
|
| 22547 |
+
"grad_norm": 0.18826884031295776,
|
| 22548 |
+
"learning_rate": 0.00046356740136402,
|
| 22549 |
+
"loss": 2.0573,
|
| 22550 |
+
"num_input_tokens_seen": 66390404768,
|
| 22551 |
+
"step": 126650
|
| 22552 |
+
},
|
| 22553 |
+
{
|
| 22554 |
+
"epoch": 1.2087267610336645,
|
| 22555 |
+
"grad_norm": 0.1488431692123413,
|
| 22556 |
+
"learning_rate": 0.0004607704521360776,
|
| 22557 |
+
"loss": 2.0592,
|
| 22558 |
+
"num_input_tokens_seen": 66416613920,
|
| 22559 |
+
"step": 126700
|
| 22560 |
+
},
|
| 22561 |
+
{
|
| 22562 |
+
"epoch": 1.2092037635498527,
|
| 22563 |
+
"grad_norm": 0.12901978194713593,
|
| 22564 |
+
"learning_rate": 0.0004579747375353763,
|
| 22565 |
+
"loss": 2.0601,
|
| 22566 |
+
"num_input_tokens_seen": 66442820800,
|
| 22567 |
+
"step": 126750
|
| 22568 |
+
},
|
| 22569 |
+
{
|
| 22570 |
+
"epoch": 1.209680766066041,
|
| 22571 |
+
"grad_norm": 0.13032038509845734,
|
| 22572 |
+
"learning_rate": 0.0004551803455482833,
|
| 22573 |
+
"loss": 2.0675,
|
| 22574 |
+
"num_input_tokens_seen": 66469028480,
|
| 22575 |
+
"step": 126800
|
| 22576 |
+
},
|
| 22577 |
+
{
|
| 22578 |
+
"epoch": 1.2101577685822293,
|
| 22579 |
+
"grad_norm": 0.13756315410137177,
|
| 22580 |
+
"learning_rate": 0.00045238736411954073,
|
| 22581 |
+
"loss": 2.0543,
|
| 22582 |
+
"num_input_tokens_seen": 66495230816,
|
| 22583 |
+
"step": 126850
|
| 22584 |
+
},
|
| 22585 |
+
{
|
| 22586 |
+
"epoch": 1.2106347710984176,
|
| 22587 |
+
"grad_norm": 0.13066066801548004,
|
| 22588 |
+
"learning_rate": 0.0004495958811494978,
|
| 22589 |
+
"loss": 2.0545,
|
| 22590 |
+
"num_input_tokens_seen": 66521443360,
|
| 22591 |
+
"step": 126900
|
| 22592 |
+
},
|
| 22593 |
+
{
|
| 22594 |
+
"epoch": 1.2111117736146058,
|
| 22595 |
+
"grad_norm": 0.13837099075317383,
|
| 22596 |
+
"learning_rate": 0.00044680598449134434,
|
| 22597 |
+
"loss": 2.0557,
|
| 22598 |
+
"num_input_tokens_seen": 66547651488,
|
| 22599 |
+
"step": 126950
|
| 22600 |
+
},
|
| 22601 |
+
{
|
| 22602 |
+
"epoch": 1.2115887761307942,
|
| 22603 |
+
"grad_norm": 0.13125094771385193,
|
| 22604 |
+
"learning_rate": 0.0004440177619483461,
|
| 22605 |
+
"loss": 2.0633,
|
| 22606 |
+
"num_input_tokens_seen": 66573856704,
|
| 22607 |
+
"step": 127000
|
| 22608 |
+
},
|
| 22609 |
+
{
|
| 22610 |
+
"epoch": 1.2115887761307942,
|
| 22611 |
+
"eval_loss": 1.9741461277008057,
|
| 22612 |
+
"eval_runtime": 82.3333,
|
| 22613 |
+
"eval_samples_per_second": 60.729,
|
| 22614 |
+
"eval_steps_per_second": 15.182,
|
| 22615 |
+
"num_input_tokens_seen": 66573856704,
|
| 22616 |
+
"step": 127000
|
| 22617 |
}
|
| 22618 |
],
|
| 22619 |
"logging_steps": 50,
|
| 22620 |
"max_steps": 140000,
|
| 22621 |
+
"num_input_tokens_seen": 66573856704,
|
| 22622 |
"num_train_epochs": 2,
|
| 22623 |
"save_steps": 1000,
|
| 22624 |
"stateful_callbacks": {
|
|
|
|
| 22633 |
"attributes": {}
|
| 22634 |
}
|
| 22635 |
},
|
| 22636 |
+
"total_flos": 1.1782358329461719e+20,
|
| 22637 |
"train_batch_size": 32,
|
| 22638 |
"trial_name": null,
|
| 22639 |
"trial_params": null
|