Training in progress, step 33000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 517931840
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b4d3f9e40108aa240d3ccb2dec6c98e3c8dee794d5b181e301f16cb825f4f24c
|
| 3 |
size 517931840
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1035661434
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:34f35e26a2da9f3a49992a7401bf48035da49e1863e9c12106da901a102fce6c
|
| 3 |
size 1035661434
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f916fc54175e9c81473454541a77405165ddc25577e0b82acf56f2d60728d556
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:281c918d3dcf25df4f5a9bbf64a4fd88f0fa5c69087d3374f9f2ce6266f988a9
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -5704,11 +5704,189 @@
|
|
| 5704 |
"eval_steps_per_second": 18.716,
|
| 5705 |
"num_input_tokens_seen": 33554428160,
|
| 5706 |
"step": 32000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5707 |
}
|
| 5708 |
],
|
| 5709 |
"logging_steps": 50,
|
| 5710 |
"max_steps": 200000,
|
| 5711 |
-
"num_input_tokens_seen":
|
| 5712 |
"num_train_epochs": 5,
|
| 5713 |
"save_steps": 1000,
|
| 5714 |
"stateful_callbacks": {
|
|
@@ -5723,7 +5901,7 @@
|
|
| 5723 |
"attributes": {}
|
| 5724 |
}
|
| 5725 |
},
|
| 5726 |
-
"total_flos": 1.
|
| 5727 |
"train_batch_size": 64,
|
| 5728 |
"trial_name": null,
|
| 5729 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.724879427015762,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 33000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 5704 |
"eval_steps_per_second": 18.716,
|
| 5705 |
"num_input_tokens_seen": 33554428160,
|
| 5706 |
"step": 32000
|
| 5707 |
+
},
|
| 5708 |
+
{
|
| 5709 |
+
"epoch": 0.7040116859350052,
|
| 5710 |
+
"grad_norm": 0.15333816409111023,
|
| 5711 |
+
"learning_rate": 0.001,
|
| 5712 |
+
"loss": 2.6605,
|
| 5713 |
+
"num_input_tokens_seen": 33606856960,
|
| 5714 |
+
"step": 32050
|
| 5715 |
+
},
|
| 5716 |
+
{
|
| 5717 |
+
"epoch": 0.7051099880971503,
|
| 5718 |
+
"grad_norm": 0.14965052902698517,
|
| 5719 |
+
"learning_rate": 0.001,
|
| 5720 |
+
"loss": 2.6551,
|
| 5721 |
+
"num_input_tokens_seen": 33659285760,
|
| 5722 |
+
"step": 32100
|
| 5723 |
+
},
|
| 5724 |
+
{
|
| 5725 |
+
"epoch": 0.7062082902592954,
|
| 5726 |
+
"grad_norm": 0.1994074285030365,
|
| 5727 |
+
"learning_rate": 0.001,
|
| 5728 |
+
"loss": 2.6652,
|
| 5729 |
+
"num_input_tokens_seen": 33711714560,
|
| 5730 |
+
"step": 32150
|
| 5731 |
+
},
|
| 5732 |
+
{
|
| 5733 |
+
"epoch": 0.7073065924214406,
|
| 5734 |
+
"grad_norm": 0.3089894652366638,
|
| 5735 |
+
"learning_rate": 0.001,
|
| 5736 |
+
"loss": 2.6814,
|
| 5737 |
+
"num_input_tokens_seen": 33764143360,
|
| 5738 |
+
"step": 32200
|
| 5739 |
+
},
|
| 5740 |
+
{
|
| 5741 |
+
"epoch": 0.7084048945835856,
|
| 5742 |
+
"grad_norm": 0.14903652667999268,
|
| 5743 |
+
"learning_rate": 0.001,
|
| 5744 |
+
"loss": 2.6834,
|
| 5745 |
+
"num_input_tokens_seen": 33816572160,
|
| 5746 |
+
"step": 32250
|
| 5747 |
+
},
|
| 5748 |
+
{
|
| 5749 |
+
"epoch": 0.7095031967457307,
|
| 5750 |
+
"grad_norm": 0.17594854533672333,
|
| 5751 |
+
"learning_rate": 0.001,
|
| 5752 |
+
"loss": 2.6618,
|
| 5753 |
+
"num_input_tokens_seen": 33869000960,
|
| 5754 |
+
"step": 32300
|
| 5755 |
+
},
|
| 5756 |
+
{
|
| 5757 |
+
"epoch": 0.7106014989078758,
|
| 5758 |
+
"grad_norm": 0.15634667873382568,
|
| 5759 |
+
"learning_rate": 0.001,
|
| 5760 |
+
"loss": 2.6663,
|
| 5761 |
+
"num_input_tokens_seen": 33921429760,
|
| 5762 |
+
"step": 32350
|
| 5763 |
+
},
|
| 5764 |
+
{
|
| 5765 |
+
"epoch": 0.7116998010700208,
|
| 5766 |
+
"grad_norm": 0.13893702626228333,
|
| 5767 |
+
"learning_rate": 0.001,
|
| 5768 |
+
"loss": 2.67,
|
| 5769 |
+
"num_input_tokens_seen": 33973858560,
|
| 5770 |
+
"step": 32400
|
| 5771 |
+
},
|
| 5772 |
+
{
|
| 5773 |
+
"epoch": 0.712798103232166,
|
| 5774 |
+
"grad_norm": 0.16974663734436035,
|
| 5775 |
+
"learning_rate": 0.001,
|
| 5776 |
+
"loss": 2.6686,
|
| 5777 |
+
"num_input_tokens_seen": 34026287360,
|
| 5778 |
+
"step": 32450
|
| 5779 |
+
},
|
| 5780 |
+
{
|
| 5781 |
+
"epoch": 0.7138964053943111,
|
| 5782 |
+
"grad_norm": 0.15336968004703522,
|
| 5783 |
+
"learning_rate": 0.001,
|
| 5784 |
+
"loss": 2.6703,
|
| 5785 |
+
"num_input_tokens_seen": 34078716160,
|
| 5786 |
+
"step": 32500
|
| 5787 |
+
},
|
| 5788 |
+
{
|
| 5789 |
+
"epoch": 0.7138964053943111,
|
| 5790 |
+
"eval_loss": 2.5648574829101562,
|
| 5791 |
+
"eval_runtime": 66.0796,
|
| 5792 |
+
"eval_samples_per_second": 75.666,
|
| 5793 |
+
"eval_steps_per_second": 18.917,
|
| 5794 |
+
"num_input_tokens_seen": 34078716160,
|
| 5795 |
+
"step": 32500
|
| 5796 |
+
},
|
| 5797 |
+
{
|
| 5798 |
+
"epoch": 0.7149947075564561,
|
| 5799 |
+
"grad_norm": 1.428727626800537,
|
| 5800 |
+
"learning_rate": 0.001,
|
| 5801 |
+
"loss": 2.8433,
|
| 5802 |
+
"num_input_tokens_seen": 34131144960,
|
| 5803 |
+
"step": 32550
|
| 5804 |
+
},
|
| 5805 |
+
{
|
| 5806 |
+
"epoch": 0.7160930097186012,
|
| 5807 |
+
"grad_norm": 0.1666879504919052,
|
| 5808 |
+
"learning_rate": 0.001,
|
| 5809 |
+
"loss": 2.7236,
|
| 5810 |
+
"num_input_tokens_seen": 34183573760,
|
| 5811 |
+
"step": 32600
|
| 5812 |
+
},
|
| 5813 |
+
{
|
| 5814 |
+
"epoch": 0.7171913118807464,
|
| 5815 |
+
"grad_norm": 0.16038021445274353,
|
| 5816 |
+
"learning_rate": 0.001,
|
| 5817 |
+
"loss": 2.6876,
|
| 5818 |
+
"num_input_tokens_seen": 34236002560,
|
| 5819 |
+
"step": 32650
|
| 5820 |
+
},
|
| 5821 |
+
{
|
| 5822 |
+
"epoch": 0.7182896140428915,
|
| 5823 |
+
"grad_norm": 0.1514110267162323,
|
| 5824 |
+
"learning_rate": 0.001,
|
| 5825 |
+
"loss": 2.6717,
|
| 5826 |
+
"num_input_tokens_seen": 34288431360,
|
| 5827 |
+
"step": 32700
|
| 5828 |
+
},
|
| 5829 |
+
{
|
| 5830 |
+
"epoch": 0.7193879162050365,
|
| 5831 |
+
"grad_norm": 0.13304661214351654,
|
| 5832 |
+
"learning_rate": 0.001,
|
| 5833 |
+
"loss": 2.6664,
|
| 5834 |
+
"num_input_tokens_seen": 34340860160,
|
| 5835 |
+
"step": 32750
|
| 5836 |
+
},
|
| 5837 |
+
{
|
| 5838 |
+
"epoch": 0.7204862183671816,
|
| 5839 |
+
"grad_norm": 0.15957415103912354,
|
| 5840 |
+
"learning_rate": 0.001,
|
| 5841 |
+
"loss": 2.6683,
|
| 5842 |
+
"num_input_tokens_seen": 34393288960,
|
| 5843 |
+
"step": 32800
|
| 5844 |
+
},
|
| 5845 |
+
{
|
| 5846 |
+
"epoch": 0.7215845205293268,
|
| 5847 |
+
"grad_norm": 0.14532499015331268,
|
| 5848 |
+
"learning_rate": 0.001,
|
| 5849 |
+
"loss": 2.6632,
|
| 5850 |
+
"num_input_tokens_seen": 34445717760,
|
| 5851 |
+
"step": 32850
|
| 5852 |
+
},
|
| 5853 |
+
{
|
| 5854 |
+
"epoch": 0.7226828226914718,
|
| 5855 |
+
"grad_norm": 0.1402454972267151,
|
| 5856 |
+
"learning_rate": 0.001,
|
| 5857 |
+
"loss": 2.6631,
|
| 5858 |
+
"num_input_tokens_seen": 34498146560,
|
| 5859 |
+
"step": 32900
|
| 5860 |
+
},
|
| 5861 |
+
{
|
| 5862 |
+
"epoch": 0.7237811248536169,
|
| 5863 |
+
"grad_norm": 0.17248420417308807,
|
| 5864 |
+
"learning_rate": 0.001,
|
| 5865 |
+
"loss": 2.6743,
|
| 5866 |
+
"num_input_tokens_seen": 34550575360,
|
| 5867 |
+
"step": 32950
|
| 5868 |
+
},
|
| 5869 |
+
{
|
| 5870 |
+
"epoch": 0.724879427015762,
|
| 5871 |
+
"grad_norm": 0.1455400288105011,
|
| 5872 |
+
"learning_rate": 0.001,
|
| 5873 |
+
"loss": 2.6598,
|
| 5874 |
+
"num_input_tokens_seen": 34603004160,
|
| 5875 |
+
"step": 33000
|
| 5876 |
+
},
|
| 5877 |
+
{
|
| 5878 |
+
"epoch": 0.724879427015762,
|
| 5879 |
+
"eval_loss": 2.5639312267303467,
|
| 5880 |
+
"eval_runtime": 66.9575,
|
| 5881 |
+
"eval_samples_per_second": 74.674,
|
| 5882 |
+
"eval_steps_per_second": 18.669,
|
| 5883 |
+
"num_input_tokens_seen": 34603004160,
|
| 5884 |
+
"step": 33000
|
| 5885 |
}
|
| 5886 |
],
|
| 5887 |
"logging_steps": 50,
|
| 5888 |
"max_steps": 200000,
|
| 5889 |
+
"num_input_tokens_seen": 34603004160,
|
| 5890 |
"num_train_epochs": 5,
|
| 5891 |
"save_steps": 1000,
|
| 5892 |
"stateful_callbacks": {
|
|
|
|
| 5901 |
"attributes": {}
|
| 5902 |
}
|
| 5903 |
},
|
| 5904 |
+
"total_flos": 1.9706664439934484e+19,
|
| 5905 |
"train_batch_size": 64,
|
| 5906 |
"trial_name": null,
|
| 5907 |
"trial_params": null
|