Training in progress, step 27000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 517931840
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b5b523d0237ff4825791520de6c6899e7d737f3dbfe8441d833895f1e2466285
|
| 3 |
size 517931840
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1035661434
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:149a5f50fc47d3d0a29e92a6c18a1d78db3365d41cfd7f18ae74185f9b0fbc4e
|
| 3 |
size 1035661434
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4d1d738e0f013e71559a982b5bed46734a8c7b8ac496ca76379bed24380a52a1
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5585e9833c9684d1dabff9cec651205ae9bf4f81ab2bb2b589702ce44919fbb3
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -4636,11 +4636,189 @@
|
|
| 4636 |
"eval_steps_per_second": 18.896,
|
| 4637 |
"num_input_tokens_seen": 27262972160,
|
| 4638 |
"step": 26000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4639 |
}
|
| 4640 |
],
|
| 4641 |
"logging_steps": 50,
|
| 4642 |
"max_steps": 200000,
|
| 4643 |
-
"num_input_tokens_seen":
|
| 4644 |
"num_train_epochs": 5,
|
| 4645 |
"save_steps": 1000,
|
| 4646 |
"stateful_callbacks": {
|
|
@@ -4655,7 +4833,7 @@
|
|
| 4655 |
"attributes": {}
|
| 4656 |
}
|
| 4657 |
},
|
| 4658 |
-
"total_flos": 1.
|
| 4659 |
"train_batch_size": 64,
|
| 4660 |
"trial_name": null,
|
| 4661 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.5930831675583508,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 27000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 4636 |
"eval_steps_per_second": 18.896,
|
| 4637 |
"num_input_tokens_seen": 27262972160,
|
| 4638 |
"step": 26000
|
| 4639 |
+
},
|
| 4640 |
+
{
|
| 4641 |
+
"epoch": 0.5722154264775939,
|
| 4642 |
+
"grad_norm": 0.1538383513689041,
|
| 4643 |
+
"learning_rate": 0.001,
|
| 4644 |
+
"loss": 2.6783,
|
| 4645 |
+
"num_input_tokens_seen": 27315400960,
|
| 4646 |
+
"step": 26050
|
| 4647 |
+
},
|
| 4648 |
+
{
|
| 4649 |
+
"epoch": 0.5733137286397391,
|
| 4650 |
+
"grad_norm": 0.15545998513698578,
|
| 4651 |
+
"learning_rate": 0.001,
|
| 4652 |
+
"loss": 2.6798,
|
| 4653 |
+
"num_input_tokens_seen": 27367829760,
|
| 4654 |
+
"step": 26100
|
| 4655 |
+
},
|
| 4656 |
+
{
|
| 4657 |
+
"epoch": 0.5744120308018842,
|
| 4658 |
+
"grad_norm": 0.15456970036029816,
|
| 4659 |
+
"learning_rate": 0.001,
|
| 4660 |
+
"loss": 2.6836,
|
| 4661 |
+
"num_input_tokens_seen": 27420258560,
|
| 4662 |
+
"step": 26150
|
| 4663 |
+
},
|
| 4664 |
+
{
|
| 4665 |
+
"epoch": 0.5755103329640292,
|
| 4666 |
+
"grad_norm": 0.1353277862071991,
|
| 4667 |
+
"learning_rate": 0.001,
|
| 4668 |
+
"loss": 2.6777,
|
| 4669 |
+
"num_input_tokens_seen": 27472687360,
|
| 4670 |
+
"step": 26200
|
| 4671 |
+
},
|
| 4672 |
+
{
|
| 4673 |
+
"epoch": 0.5766086351261743,
|
| 4674 |
+
"grad_norm": 0.15124258399009705,
|
| 4675 |
+
"learning_rate": 0.001,
|
| 4676 |
+
"loss": 2.681,
|
| 4677 |
+
"num_input_tokens_seen": 27525116160,
|
| 4678 |
+
"step": 26250
|
| 4679 |
+
},
|
| 4680 |
+
{
|
| 4681 |
+
"epoch": 0.5777069372883195,
|
| 4682 |
+
"grad_norm": 0.14200901985168457,
|
| 4683 |
+
"learning_rate": 0.001,
|
| 4684 |
+
"loss": 2.6827,
|
| 4685 |
+
"num_input_tokens_seen": 27577544960,
|
| 4686 |
+
"step": 26300
|
| 4687 |
+
},
|
| 4688 |
+
{
|
| 4689 |
+
"epoch": 0.5788052394504645,
|
| 4690 |
+
"grad_norm": 0.15356388688087463,
|
| 4691 |
+
"learning_rate": 0.001,
|
| 4692 |
+
"loss": 2.6802,
|
| 4693 |
+
"num_input_tokens_seen": 27629973760,
|
| 4694 |
+
"step": 26350
|
| 4695 |
+
},
|
| 4696 |
+
{
|
| 4697 |
+
"epoch": 0.5799035416126096,
|
| 4698 |
+
"grad_norm": 0.17395390570163727,
|
| 4699 |
+
"learning_rate": 0.001,
|
| 4700 |
+
"loss": 2.6921,
|
| 4701 |
+
"num_input_tokens_seen": 27682402560,
|
| 4702 |
+
"step": 26400
|
| 4703 |
+
},
|
| 4704 |
+
{
|
| 4705 |
+
"epoch": 0.5810018437747547,
|
| 4706 |
+
"grad_norm": 0.1507692188024521,
|
| 4707 |
+
"learning_rate": 0.001,
|
| 4708 |
+
"loss": 2.6811,
|
| 4709 |
+
"num_input_tokens_seen": 27734831360,
|
| 4710 |
+
"step": 26450
|
| 4711 |
+
},
|
| 4712 |
+
{
|
| 4713 |
+
"epoch": 0.5821001459368998,
|
| 4714 |
+
"grad_norm": 0.14512786269187927,
|
| 4715 |
+
"learning_rate": 0.001,
|
| 4716 |
+
"loss": 2.6798,
|
| 4717 |
+
"num_input_tokens_seen": 27787260160,
|
| 4718 |
+
"step": 26500
|
| 4719 |
+
},
|
| 4720 |
+
{
|
| 4721 |
+
"epoch": 0.5821001459368998,
|
| 4722 |
+
"eval_loss": 2.5802626609802246,
|
| 4723 |
+
"eval_runtime": 67.1032,
|
| 4724 |
+
"eval_samples_per_second": 74.512,
|
| 4725 |
+
"eval_steps_per_second": 18.628,
|
| 4726 |
+
"num_input_tokens_seen": 27787260160,
|
| 4727 |
+
"step": 26500
|
| 4728 |
+
},
|
| 4729 |
+
{
|
| 4730 |
+
"epoch": 0.5831984480990449,
|
| 4731 |
+
"grad_norm": 0.15365912020206451,
|
| 4732 |
+
"learning_rate": 0.001,
|
| 4733 |
+
"loss": 2.6813,
|
| 4734 |
+
"num_input_tokens_seen": 27839688960,
|
| 4735 |
+
"step": 26550
|
| 4736 |
+
},
|
| 4737 |
+
{
|
| 4738 |
+
"epoch": 0.58429675026119,
|
| 4739 |
+
"grad_norm": 0.14015646278858185,
|
| 4740 |
+
"learning_rate": 0.001,
|
| 4741 |
+
"loss": 2.6774,
|
| 4742 |
+
"num_input_tokens_seen": 27892117760,
|
| 4743 |
+
"step": 26600
|
| 4744 |
+
},
|
| 4745 |
+
{
|
| 4746 |
+
"epoch": 0.5853950524233351,
|
| 4747 |
+
"grad_norm": 0.1529797911643982,
|
| 4748 |
+
"learning_rate": 0.001,
|
| 4749 |
+
"loss": 2.6751,
|
| 4750 |
+
"num_input_tokens_seen": 27944546560,
|
| 4751 |
+
"step": 26650
|
| 4752 |
+
},
|
| 4753 |
+
{
|
| 4754 |
+
"epoch": 0.5864933545854801,
|
| 4755 |
+
"grad_norm": 0.16909636557102203,
|
| 4756 |
+
"learning_rate": 0.001,
|
| 4757 |
+
"loss": 2.6795,
|
| 4758 |
+
"num_input_tokens_seen": 27996975360,
|
| 4759 |
+
"step": 26700
|
| 4760 |
+
},
|
| 4761 |
+
{
|
| 4762 |
+
"epoch": 0.5875916567476253,
|
| 4763 |
+
"grad_norm": 0.14130276441574097,
|
| 4764 |
+
"learning_rate": 0.001,
|
| 4765 |
+
"loss": 2.6809,
|
| 4766 |
+
"num_input_tokens_seen": 28049404160,
|
| 4767 |
+
"step": 26750
|
| 4768 |
+
},
|
| 4769 |
+
{
|
| 4770 |
+
"epoch": 0.5886899589097704,
|
| 4771 |
+
"grad_norm": 0.15182790160179138,
|
| 4772 |
+
"learning_rate": 0.001,
|
| 4773 |
+
"loss": 2.685,
|
| 4774 |
+
"num_input_tokens_seen": 28101832960,
|
| 4775 |
+
"step": 26800
|
| 4776 |
+
},
|
| 4777 |
+
{
|
| 4778 |
+
"epoch": 0.5897882610719154,
|
| 4779 |
+
"grad_norm": 0.12757331132888794,
|
| 4780 |
+
"learning_rate": 0.001,
|
| 4781 |
+
"loss": 2.6766,
|
| 4782 |
+
"num_input_tokens_seen": 28154261760,
|
| 4783 |
+
"step": 26850
|
| 4784 |
+
},
|
| 4785 |
+
{
|
| 4786 |
+
"epoch": 0.5908865632340605,
|
| 4787 |
+
"grad_norm": 0.1527504026889801,
|
| 4788 |
+
"learning_rate": 0.001,
|
| 4789 |
+
"loss": 2.6767,
|
| 4790 |
+
"num_input_tokens_seen": 28206690560,
|
| 4791 |
+
"step": 26900
|
| 4792 |
+
},
|
| 4793 |
+
{
|
| 4794 |
+
"epoch": 0.5919848653962057,
|
| 4795 |
+
"grad_norm": 0.18337304890155792,
|
| 4796 |
+
"learning_rate": 0.001,
|
| 4797 |
+
"loss": 2.6752,
|
| 4798 |
+
"num_input_tokens_seen": 28259119360,
|
| 4799 |
+
"step": 26950
|
| 4800 |
+
},
|
| 4801 |
+
{
|
| 4802 |
+
"epoch": 0.5930831675583508,
|
| 4803 |
+
"grad_norm": 0.1472473442554474,
|
| 4804 |
+
"learning_rate": 0.001,
|
| 4805 |
+
"loss": 2.6717,
|
| 4806 |
+
"num_input_tokens_seen": 28311548160,
|
| 4807 |
+
"step": 27000
|
| 4808 |
+
},
|
| 4809 |
+
{
|
| 4810 |
+
"epoch": 0.5930831675583508,
|
| 4811 |
+
"eval_loss": 2.5781941413879395,
|
| 4812 |
+
"eval_runtime": 66.2194,
|
| 4813 |
+
"eval_samples_per_second": 75.507,
|
| 4814 |
+
"eval_steps_per_second": 18.877,
|
| 4815 |
+
"num_input_tokens_seen": 28311548160,
|
| 4816 |
+
"step": 27000
|
| 4817 |
}
|
| 4818 |
],
|
| 4819 |
"logging_steps": 50,
|
| 4820 |
"max_steps": 200000,
|
| 4821 |
+
"num_input_tokens_seen": 28311548160,
|
| 4822 |
"num_train_epochs": 5,
|
| 4823 |
"save_steps": 1000,
|
| 4824 |
"stateful_callbacks": {
|
|
|
|
| 4833 |
"attributes": {}
|
| 4834 |
}
|
| 4835 |
},
|
| 4836 |
+
"total_flos": 1.6123634144144916e+19,
|
| 4837 |
"train_batch_size": 64,
|
| 4838 |
"trial_name": null,
|
| 4839 |
"trial_params": null
|