Training in progress, step 140000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1410301944
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b6e43382fe5ddb78fed06a23ba6c7b8489c50f8ee7949d8db86e49cd8910036e
|
| 3 |
size 1410301944
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 2820185786
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1c18874d88aac76ea7c7006e997509fca95df88b10d2c13b5a6816de7643ed6e
|
| 3 |
size 2820185786
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:82993dca9aea22266a253201514efb5478f36bf5a374573dc48fbab5e03c52d6
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bf74877c1fcc66d6df58cb7c2b28db5c3be81aec77034ec2a9ace3e30449eb22
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 1.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -24750,11 +24750,189 @@
|
|
| 24750 |
"eval_steps_per_second": 15.14,
|
| 24751 |
"num_input_tokens_seen": 72864248896,
|
| 24752 |
"step": 139000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24753 |
}
|
| 24754 |
],
|
| 24755 |
"logging_steps": 50,
|
| 24756 |
"max_steps": 140000,
|
| 24757 |
-
"num_input_tokens_seen":
|
| 24758 |
"num_train_epochs": 2,
|
| 24759 |
"save_steps": 1000,
|
| 24760 |
"stateful_callbacks": {
|
|
@@ -24764,12 +24942,12 @@
|
|
| 24764 |
"should_evaluate": false,
|
| 24765 |
"should_log": false,
|
| 24766 |
"should_save": true,
|
| 24767 |
-
"should_training_stop":
|
| 24768 |
},
|
| 24769 |
"attributes": {}
|
| 24770 |
}
|
| 24771 |
},
|
| 24772 |
-
"total_flos": 1.
|
| 24773 |
"train_batch_size": 32,
|
| 24774 |
"trial_name": null,
|
| 24775 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 1.335609430339745,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 140000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 24750 |
"eval_steps_per_second": 15.14,
|
| 24751 |
"num_input_tokens_seen": 72864248896,
|
| 24752 |
"step": 139000
|
| 24753 |
+
},
|
| 24754 |
+
{
|
| 24755 |
+
"epoch": 1.3265463825321677,
|
| 24756 |
+
"grad_norm": 0.11243559420108795,
|
| 24757 |
+
"learning_rate": 2.837655575097964e-06,
|
| 24758 |
+
"loss": 2.0318,
|
| 24759 |
+
"num_input_tokens_seen": 72890458688,
|
| 24760 |
+
"step": 139050
|
| 24761 |
+
},
|
| 24762 |
+
{
|
| 24763 |
+
"epoch": 1.3270233850483562,
|
| 24764 |
+
"grad_norm": 0.11617834120988846,
|
| 24765 |
+
"learning_rate": 2.547062725623828e-06,
|
| 24766 |
+
"loss": 2.0384,
|
| 24767 |
+
"num_input_tokens_seen": 72916673088,
|
| 24768 |
+
"step": 139100
|
| 24769 |
+
},
|
| 24770 |
+
{
|
| 24771 |
+
"epoch": 1.3275003875645444,
|
| 24772 |
+
"grad_norm": 0.11737903952598572,
|
| 24773 |
+
"learning_rate": 2.2721256504567023e-06,
|
| 24774 |
+
"loss": 2.0235,
|
| 24775 |
+
"num_input_tokens_seen": 72942884768,
|
| 24776 |
+
"step": 139150
|
| 24777 |
+
},
|
| 24778 |
+
{
|
| 24779 |
+
"epoch": 1.3279773900807328,
|
| 24780 |
+
"grad_norm": 0.10866422206163406,
|
| 24781 |
+
"learning_rate": 2.012853002380466e-06,
|
| 24782 |
+
"loss": 2.024,
|
| 24783 |
+
"num_input_tokens_seen": 72969088544,
|
| 24784 |
+
"step": 139200
|
| 24785 |
+
},
|
| 24786 |
+
{
|
| 24787 |
+
"epoch": 1.328454392596921,
|
| 24788 |
+
"grad_norm": 0.11547800898551941,
|
| 24789 |
+
"learning_rate": 1.769252941190458e-06,
|
| 24790 |
+
"loss": 2.0323,
|
| 24791 |
+
"num_input_tokens_seen": 72995301472,
|
| 24792 |
+
"step": 139250
|
| 24793 |
+
},
|
| 24794 |
+
{
|
| 24795 |
+
"epoch": 1.3289313951131092,
|
| 24796 |
+
"grad_norm": 0.11617856472730637,
|
| 24797 |
+
"learning_rate": 1.541333133436018e-06,
|
| 24798 |
+
"loss": 2.0294,
|
| 24799 |
+
"num_input_tokens_seen": 73021507392,
|
| 24800 |
+
"step": 139300
|
| 24801 |
+
},
|
| 24802 |
+
{
|
| 24803 |
+
"epoch": 1.3294083976292974,
|
| 24804 |
+
"grad_norm": 0.11435816437005997,
|
| 24805 |
+
"learning_rate": 1.3291007521799014e-06,
|
| 24806 |
+
"loss": 2.0288,
|
| 24807 |
+
"num_input_tokens_seen": 73047719968,
|
| 24808 |
+
"step": 139350
|
| 24809 |
+
},
|
| 24810 |
+
{
|
| 24811 |
+
"epoch": 1.3298854001454858,
|
| 24812 |
+
"grad_norm": 0.11262206733226776,
|
| 24813 |
+
"learning_rate": 1.132562476771959e-06,
|
| 24814 |
+
"loss": 2.0301,
|
| 24815 |
+
"num_input_tokens_seen": 73073924576,
|
| 24816 |
+
"step": 139400
|
| 24817 |
+
},
|
| 24818 |
+
{
|
| 24819 |
+
"epoch": 1.330362402661674,
|
| 24820 |
+
"grad_norm": 0.11383078992366791,
|
| 24821 |
+
"learning_rate": 9.517244926393609e-07,
|
| 24822 |
+
"loss": 2.0187,
|
| 24823 |
+
"num_input_tokens_seen": 73100138976,
|
| 24824 |
+
"step": 139450
|
| 24825 |
+
},
|
| 24826 |
+
{
|
| 24827 |
+
"epoch": 1.3308394051778623,
|
| 24828 |
+
"grad_norm": 0.1159028634428978,
|
| 24829 |
+
"learning_rate": 7.865924910916978e-07,
|
| 24830 |
+
"loss": 2.0366,
|
| 24831 |
+
"num_input_tokens_seen": 73126349984,
|
| 24832 |
+
"step": 139500
|
| 24833 |
+
},
|
| 24834 |
+
{
|
| 24835 |
+
"epoch": 1.3308394051778623,
|
| 24836 |
+
"eval_loss": 1.9510103464126587,
|
| 24837 |
+
"eval_runtime": 82.8489,
|
| 24838 |
+
"eval_samples_per_second": 60.351,
|
| 24839 |
+
"eval_steps_per_second": 15.088,
|
| 24840 |
+
"num_input_tokens_seen": 73126349984,
|
| 24841 |
+
"step": 139500
|
| 24842 |
+
},
|
| 24843 |
+
{
|
| 24844 |
+
"epoch": 1.3313164076940507,
|
| 24845 |
+
"grad_norm": 0.1160767450928688,
|
| 24846 |
+
"learning_rate": 6.371716691419005e-07,
|
| 24847 |
+
"loss": 2.0374,
|
| 24848 |
+
"num_input_tokens_seen": 73152559296,
|
| 24849 |
+
"step": 139550
|
| 24850 |
+
},
|
| 24851 |
+
{
|
| 24852 |
+
"epoch": 1.331793410210239,
|
| 24853 |
+
"grad_norm": 0.11154640465974808,
|
| 24854 |
+
"learning_rate": 5.034667293427053e-07,
|
| 24855 |
+
"loss": 2.0385,
|
| 24856 |
+
"num_input_tokens_seen": 73178773696,
|
| 24857 |
+
"step": 139600
|
| 24858 |
+
},
|
| 24859 |
+
{
|
| 24860 |
+
"epoch": 1.332270412726427,
|
| 24861 |
+
"grad_norm": 0.11127237975597382,
|
| 24862 |
+
"learning_rate": 3.854818796385495e-07,
|
| 24863 |
+
"loss": 2.0281,
|
| 24864 |
+
"num_input_tokens_seen": 73204985664,
|
| 24865 |
+
"step": 139650
|
| 24866 |
+
},
|
| 24867 |
+
{
|
| 24868 |
+
"epoch": 1.3327474152426153,
|
| 24869 |
+
"grad_norm": 0.11270651966333389,
|
| 24870 |
+
"learning_rate": 2.8322083323334415e-07,
|
| 24871 |
+
"loss": 2.022,
|
| 24872 |
+
"num_input_tokens_seen": 73231192992,
|
| 24873 |
+
"step": 139700
|
| 24874 |
+
},
|
| 24875 |
+
{
|
| 24876 |
+
"epoch": 1.3332244177588037,
|
| 24877 |
+
"grad_norm": 0.11388963460922241,
|
| 24878 |
+
"learning_rate": 1.9668680847356734e-07,
|
| 24879 |
+
"loss": 2.0305,
|
| 24880 |
+
"num_input_tokens_seen": 73257397792,
|
| 24881 |
+
"step": 139750
|
| 24882 |
+
},
|
| 24883 |
+
{
|
| 24884 |
+
"epoch": 1.333701420274992,
|
| 24885 |
+
"grad_norm": 0.11808367073535919,
|
| 24886 |
+
"learning_rate": 1.2588252874673466e-07,
|
| 24887 |
+
"loss": 2.0302,
|
| 24888 |
+
"num_input_tokens_seen": 73283607648,
|
| 24889 |
+
"step": 139800
|
| 24890 |
+
},
|
| 24891 |
+
{
|
| 24892 |
+
"epoch": 1.3341784227911802,
|
| 24893 |
+
"grad_norm": 0.11369805783033371,
|
| 24894 |
+
"learning_rate": 7.081022239591173e-08,
|
| 24895 |
+
"loss": 2.0355,
|
| 24896 |
+
"num_input_tokens_seen": 73309822048,
|
| 24897 |
+
"step": 139850
|
| 24898 |
+
},
|
| 24899 |
+
{
|
| 24900 |
+
"epoch": 1.3346554253073686,
|
| 24901 |
+
"grad_norm": 0.11115424335002899,
|
| 24902 |
+
"learning_rate": 3.147162264971471e-08,
|
| 24903 |
+
"loss": 2.027,
|
| 24904 |
+
"num_input_tokens_seen": 73336032384,
|
| 24905 |
+
"step": 139900
|
| 24906 |
+
},
|
| 24907 |
+
{
|
| 24908 |
+
"epoch": 1.3351324278235568,
|
| 24909 |
+
"grad_norm": 0.11730392277240753,
|
| 24910 |
+
"learning_rate": 7.867967567354306e-09,
|
| 24911 |
+
"loss": 2.0268,
|
| 24912 |
+
"num_input_tokens_seen": 73362242112,
|
| 24913 |
+
"step": 139950
|
| 24914 |
+
},
|
| 24915 |
+
{
|
| 24916 |
+
"epoch": 1.335609430339745,
|
| 24917 |
+
"grad_norm": 0.11209023743867874,
|
| 24918 |
+
"learning_rate": 0.0,
|
| 24919 |
+
"loss": 2.0315,
|
| 24920 |
+
"num_input_tokens_seen": 73388446624,
|
| 24921 |
+
"step": 140000
|
| 24922 |
+
},
|
| 24923 |
+
{
|
| 24924 |
+
"epoch": 1.335609430339745,
|
| 24925 |
+
"eval_loss": 1.9509990215301514,
|
| 24926 |
+
"eval_runtime": 82.6099,
|
| 24927 |
+
"eval_samples_per_second": 60.525,
|
| 24928 |
+
"eval_steps_per_second": 15.131,
|
| 24929 |
+
"num_input_tokens_seen": 73388446624,
|
| 24930 |
+
"step": 140000
|
| 24931 |
}
|
| 24932 |
],
|
| 24933 |
"logging_steps": 50,
|
| 24934 |
"max_steps": 140000,
|
| 24935 |
+
"num_input_tokens_seen": 73388446624,
|
| 24936 |
"num_train_epochs": 2,
|
| 24937 |
"save_steps": 1000,
|
| 24938 |
"stateful_callbacks": {
|
|
|
|
| 24942 |
"should_evaluate": false,
|
| 24943 |
"should_log": false,
|
| 24944 |
"should_save": true,
|
| 24945 |
+
"should_training_stop": true
|
| 24946 |
},
|
| 24947 |
"attributes": {}
|
| 24948 |
}
|
| 24949 |
},
|
| 24950 |
+
"total_flos": 1.2988416447181578e+20,
|
| 24951 |
"train_batch_size": 32,
|
| 24952 |
"trial_name": null,
|
| 24953 |
"trial_params": null
|