Training in progress, step 66000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 301235464
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a6627f46453f0eddcb5503378a89a14a6529d63c8f3e731e04b523860ef73959
|
| 3 |
size 301235464
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 602335994
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:55adb983e10ce2c91d34635b0e2c61b12341302e3599339214fbe162d24db56d
|
| 3 |
size 602335994
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5506f8ab70fc0520e3fcff77fee663d3576573119296fd847d8ec1a26a45a3cf
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d828325c04baaeca4bef8dd14dbbff2a89fb26da8a22793521965c92d2ced694
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -11578,11 +11578,189 @@
|
|
| 11578 |
"eval_steps_per_second": 23.574,
|
| 11579 |
"num_input_tokens_seen": 17039360000,
|
| 11580 |
"step": 65000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11581 |
}
|
| 11582 |
],
|
| 11583 |
"logging_steps": 50,
|
| 11584 |
"max_steps": 70000,
|
| 11585 |
-
"num_input_tokens_seen":
|
| 11586 |
"num_train_epochs": 1,
|
| 11587 |
"save_steps": 1000,
|
| 11588 |
"stateful_callbacks": {
|
|
@@ -11597,7 +11775,7 @@
|
|
| 11597 |
"attributes": {}
|
| 11598 |
}
|
| 11599 |
},
|
| 11600 |
-
"total_flos": 4.
|
| 11601 |
"train_batch_size": 64,
|
| 11602 |
"trial_name": null,
|
| 11603 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.44394892569404854,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 66000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 11578 |
"eval_steps_per_second": 23.574,
|
| 11579 |
"num_input_tokens_seen": 17039360000,
|
| 11580 |
"step": 65000
|
| 11581 |
+
},
|
| 11582 |
+
{
|
| 11583 |
+
"epoch": 0.4375587517636039,
|
| 11584 |
+
"grad_norm": 0.16188842058181763,
|
| 11585 |
+
"learning_rate": 0.00021814686889249158,
|
| 11586 |
+
"loss": 2.9812,
|
| 11587 |
+
"num_input_tokens_seen": 17052467200,
|
| 11588 |
+
"step": 65050
|
| 11589 |
+
},
|
| 11590 |
+
{
|
| 11591 |
+
"epoch": 0.43789507670731154,
|
| 11592 |
+
"grad_norm": 0.14550812542438507,
|
| 11593 |
+
"learning_rate": 0.00021410601988619394,
|
| 11594 |
+
"loss": 2.9856,
|
| 11595 |
+
"num_input_tokens_seen": 17065574400,
|
| 11596 |
+
"step": 65100
|
| 11597 |
+
},
|
| 11598 |
+
{
|
| 11599 |
+
"epoch": 0.43823140165101915,
|
| 11600 |
+
"grad_norm": 0.1500539779663086,
|
| 11601 |
+
"learning_rate": 0.00021009272593674322,
|
| 11602 |
+
"loss": 2.9827,
|
| 11603 |
+
"num_input_tokens_seen": 17078681600,
|
| 11604 |
+
"step": 65150
|
| 11605 |
+
},
|
| 11606 |
+
{
|
| 11607 |
+
"epoch": 0.43856772659472676,
|
| 11608 |
+
"grad_norm": 0.1571357101202011,
|
| 11609 |
+
"learning_rate": 0.00020610737385376348,
|
| 11610 |
+
"loss": 2.9788,
|
| 11611 |
+
"num_input_tokens_seen": 17091788800,
|
| 11612 |
+
"step": 65200
|
| 11613 |
+
},
|
| 11614 |
+
{
|
| 11615 |
+
"epoch": 0.43890405153843437,
|
| 11616 |
+
"grad_norm": 0.1671544760465622,
|
| 11617 |
+
"learning_rate": 0.00020215034775378332,
|
| 11618 |
+
"loss": 2.9758,
|
| 11619 |
+
"num_input_tokens_seen": 17104896000,
|
| 11620 |
+
"step": 65250
|
| 11621 |
+
},
|
| 11622 |
+
{
|
| 11623 |
+
"epoch": 0.439240376482142,
|
| 11624 |
+
"grad_norm": 0.15525776147842407,
|
| 11625 |
+
"learning_rate": 0.0001982220290232143,
|
| 11626 |
+
"loss": 2.9823,
|
| 11627 |
+
"num_input_tokens_seen": 17118003200,
|
| 11628 |
+
"step": 65300
|
| 11629 |
+
},
|
| 11630 |
+
{
|
| 11631 |
+
"epoch": 0.4395767014258496,
|
| 11632 |
+
"grad_norm": 0.14799903333187103,
|
| 11633 |
+
"learning_rate": 0.00019432279628159188,
|
| 11634 |
+
"loss": 2.9781,
|
| 11635 |
+
"num_input_tokens_seen": 17131110400,
|
| 11636 |
+
"step": 65350
|
| 11637 |
+
},
|
| 11638 |
+
{
|
| 11639 |
+
"epoch": 0.4399130263695572,
|
| 11640 |
+
"grad_norm": 0.16087676584720612,
|
| 11641 |
+
"learning_rate": 0.00019045302534508295,
|
| 11642 |
+
"loss": 2.9805,
|
| 11643 |
+
"num_input_tokens_seen": 17144217600,
|
| 11644 |
+
"step": 65400
|
| 11645 |
+
},
|
| 11646 |
+
{
|
| 11647 |
+
"epoch": 0.4402493513132648,
|
| 11648 |
+
"grad_norm": 0.15892113745212555,
|
| 11649 |
+
"learning_rate": 0.0001866130891902653,
|
| 11650 |
+
"loss": 2.9823,
|
| 11651 |
+
"num_input_tokens_seen": 17157324800,
|
| 11652 |
+
"step": 65450
|
| 11653 |
+
},
|
| 11654 |
+
{
|
| 11655 |
+
"epoch": 0.4405856762569724,
|
| 11656 |
+
"grad_norm": 0.187602236866951,
|
| 11657 |
+
"learning_rate": 0.00018280335791817732,
|
| 11658 |
+
"loss": 2.9804,
|
| 11659 |
+
"num_input_tokens_seen": 17170432000,
|
| 11660 |
+
"step": 65500
|
| 11661 |
+
},
|
| 11662 |
+
{
|
| 11663 |
+
"epoch": 0.4405856762569724,
|
| 11664 |
+
"eval_loss": 2.875824451446533,
|
| 11665 |
+
"eval_runtime": 53.0867,
|
| 11666 |
+
"eval_samples_per_second": 94.186,
|
| 11667 |
+
"eval_steps_per_second": 23.546,
|
| 11668 |
+
"num_input_tokens_seen": 17170432000,
|
| 11669 |
+
"step": 65500
|
| 11670 |
+
},
|
| 11671 |
+
{
|
| 11672 |
+
"epoch": 0.44092200120068004,
|
| 11673 |
+
"grad_norm": 0.15579210221767426,
|
| 11674 |
+
"learning_rate": 0.0001790241987186485,
|
| 11675 |
+
"loss": 2.9734,
|
| 11676 |
+
"num_input_tokens_seen": 17183539200,
|
| 11677 |
+
"step": 65550
|
| 11678 |
+
},
|
| 11679 |
+
{
|
| 11680 |
+
"epoch": 0.44125832614438765,
|
| 11681 |
+
"grad_norm": 0.15250550210475922,
|
| 11682 |
+
"learning_rate": 0.00017527597583490823,
|
| 11683 |
+
"loss": 2.9787,
|
| 11684 |
+
"num_input_tokens_seen": 17196646400,
|
| 11685 |
+
"step": 65600
|
| 11686 |
+
},
|
| 11687 |
+
{
|
| 11688 |
+
"epoch": 0.44159465108809526,
|
| 11689 |
+
"grad_norm": 0.15954890847206116,
|
| 11690 |
+
"learning_rate": 0.00017155905052847938,
|
| 11691 |
+
"loss": 2.978,
|
| 11692 |
+
"num_input_tokens_seen": 17209753600,
|
| 11693 |
+
"step": 65650
|
| 11694 |
+
},
|
| 11695 |
+
{
|
| 11696 |
+
"epoch": 0.44193097603180287,
|
| 11697 |
+
"grad_norm": 0.15598754584789276,
|
| 11698 |
+
"learning_rate": 0.00016787378104435928,
|
| 11699 |
+
"loss": 2.9809,
|
| 11700 |
+
"num_input_tokens_seen": 17222860800,
|
| 11701 |
+
"step": 65700
|
| 11702 |
+
},
|
| 11703 |
+
{
|
| 11704 |
+
"epoch": 0.4422673009755105,
|
| 11705 |
+
"grad_norm": 0.14709477126598358,
|
| 11706 |
+
"learning_rate": 0.00016422052257649078,
|
| 11707 |
+
"loss": 2.9793,
|
| 11708 |
+
"num_input_tokens_seen": 17235968000,
|
| 11709 |
+
"step": 65750
|
| 11710 |
+
},
|
| 11711 |
+
{
|
| 11712 |
+
"epoch": 0.4426036259192181,
|
| 11713 |
+
"grad_norm": 0.15505217015743256,
|
| 11714 |
+
"learning_rate": 0.0001605996272335291,
|
| 11715 |
+
"loss": 2.9763,
|
| 11716 |
+
"num_input_tokens_seen": 17249075200,
|
| 11717 |
+
"step": 65800
|
| 11718 |
+
},
|
| 11719 |
+
{
|
| 11720 |
+
"epoch": 0.4429399508629257,
|
| 11721 |
+
"grad_norm": 0.14491549134254456,
|
| 11722 |
+
"learning_rate": 0.0001570114440049037,
|
| 11723 |
+
"loss": 2.9756,
|
| 11724 |
+
"num_input_tokens_seen": 17262182400,
|
| 11725 |
+
"step": 65850
|
| 11726 |
+
},
|
| 11727 |
+
{
|
| 11728 |
+
"epoch": 0.4432762758066333,
|
| 11729 |
+
"grad_norm": 0.1571652740240097,
|
| 11730 |
+
"learning_rate": 0.00015345631872718213,
|
| 11731 |
+
"loss": 2.977,
|
| 11732 |
+
"num_input_tokens_seen": 17275289600,
|
| 11733 |
+
"step": 65900
|
| 11734 |
+
},
|
| 11735 |
+
{
|
| 11736 |
+
"epoch": 0.4436126007503409,
|
| 11737 |
+
"grad_norm": 0.18299035727977753,
|
| 11738 |
+
"learning_rate": 0.00014993459405073824,
|
| 11739 |
+
"loss": 2.9788,
|
| 11740 |
+
"num_input_tokens_seen": 17288396800,
|
| 11741 |
+
"step": 65950
|
| 11742 |
+
},
|
| 11743 |
+
{
|
| 11744 |
+
"epoch": 0.44394892569404854,
|
| 11745 |
+
"grad_norm": 0.14829285442829132,
|
| 11746 |
+
"learning_rate": 0.00014644660940672628,
|
| 11747 |
+
"loss": 2.9851,
|
| 11748 |
+
"num_input_tokens_seen": 17301504000,
|
| 11749 |
+
"step": 66000
|
| 11750 |
+
},
|
| 11751 |
+
{
|
| 11752 |
+
"epoch": 0.44394892569404854,
|
| 11753 |
+
"eval_loss": 2.8729286193847656,
|
| 11754 |
+
"eval_runtime": 53.2839,
|
| 11755 |
+
"eval_samples_per_second": 93.837,
|
| 11756 |
+
"eval_steps_per_second": 23.459,
|
| 11757 |
+
"num_input_tokens_seen": 17301504000,
|
| 11758 |
+
"step": 66000
|
| 11759 |
}
|
| 11760 |
],
|
| 11761 |
"logging_steps": 50,
|
| 11762 |
"max_steps": 70000,
|
| 11763 |
+
"num_input_tokens_seen": 17301504000,
|
| 11764 |
"num_train_epochs": 1,
|
| 11765 |
"save_steps": 1000,
|
| 11766 |
"stateful_callbacks": {
|
|
|
|
| 11775 |
"attributes": {}
|
| 11776 |
}
|
| 11777 |
},
|
| 11778 |
+
"total_flos": 4.62832118267904e+18,
|
| 11779 |
"train_batch_size": 64,
|
| 11780 |
"trial_name": null,
|
| 11781 |
"trial_params": null
|