Training in progress, step 125000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1410301944
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:811ff470374af97e47c736f298958834b69f1700c42f81f0e13b4a5264484ae8
|
| 3 |
size 1410301944
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 2820185786
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6d11d76e5f8cf7c010d7ddfa9e036517a28b9f13eec8d65dc499e46a38c1f4b3
|
| 3 |
size 2820185786
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bc320281dd48fee58a87ebe65c5af2ea4c357e61810ad0f123ab838f0f93b01f
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:31f9a4be764158103ef48222bfd8b15ec527d59f5ba7b3fa5af00980fe9404f9
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 1.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -22080,11 +22080,189 @@
|
|
| 22080 |
"eval_steps_per_second": 15.101,
|
| 22081 |
"num_input_tokens_seen": 65001257824,
|
| 22082 |
"step": 124000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22083 |
}
|
| 22084 |
],
|
| 22085 |
"logging_steps": 50,
|
| 22086 |
"max_steps": 140000,
|
| 22087 |
-
"num_input_tokens_seen":
|
| 22088 |
"num_train_epochs": 2,
|
| 22089 |
"save_steps": 1000,
|
| 22090 |
"stateful_callbacks": {
|
|
@@ -22099,7 +22277,7 @@
|
|
| 22099 |
"attributes": {}
|
| 22100 |
}
|
| 22101 |
},
|
| 22102 |
-
"total_flos": 1.
|
| 22103 |
"train_batch_size": 32,
|
| 22104 |
"trial_name": null,
|
| 22105 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 1.1925086754832632,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 125000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 22080 |
"eval_steps_per_second": 15.101,
|
| 22081 |
"num_input_tokens_seen": 65001257824,
|
| 22082 |
"step": 124000
|
| 22083 |
+
},
|
| 22084 |
+
{
|
| 22085 |
+
"epoch": 1.183445627675686,
|
| 22086 |
+
"grad_norm": 0.12966303527355194,
|
| 22087 |
+
"learning_rate": 0.0006085240641360281,
|
| 22088 |
+
"loss": 2.0655,
|
| 22089 |
+
"num_input_tokens_seen": 65027466432,
|
| 22090 |
+
"step": 124050
|
| 22091 |
+
},
|
| 22092 |
+
{
|
| 22093 |
+
"epoch": 1.1839226301918742,
|
| 22094 |
+
"grad_norm": 0.13216206431388855,
|
| 22095 |
+
"learning_rate": 0.0006057842458386314,
|
| 22096 |
+
"loss": 2.0787,
|
| 22097 |
+
"num_input_tokens_seen": 65053680192,
|
| 22098 |
+
"step": 124100
|
| 22099 |
+
},
|
| 22100 |
+
{
|
| 22101 |
+
"epoch": 1.1843996327080626,
|
| 22102 |
+
"grad_norm": 0.13295891880989075,
|
| 22103 |
+
"learning_rate": 0.0006030410983131733,
|
| 22104 |
+
"loss": 2.0654,
|
| 22105 |
+
"num_input_tokens_seen": 65079892928,
|
| 22106 |
+
"step": 124150
|
| 22107 |
+
},
|
| 22108 |
+
{
|
| 22109 |
+
"epoch": 1.1848766352242508,
|
| 22110 |
+
"grad_norm": 0.14478819072246552,
|
| 22111 |
+
"learning_rate": 0.0006002947078916364,
|
| 22112 |
+
"loss": 2.0638,
|
| 22113 |
+
"num_input_tokens_seen": 65106107328,
|
| 22114 |
+
"step": 124200
|
| 22115 |
+
},
|
| 22116 |
+
{
|
| 22117 |
+
"epoch": 1.185353637740439,
|
| 22118 |
+
"grad_norm": 0.13410045206546783,
|
| 22119 |
+
"learning_rate": 0.0005975451610080642,
|
| 22120 |
+
"loss": 2.0711,
|
| 22121 |
+
"num_input_tokens_seen": 65132321728,
|
| 22122 |
+
"step": 124250
|
| 22123 |
+
},
|
| 22124 |
+
{
|
| 22125 |
+
"epoch": 1.1858306402566274,
|
| 22126 |
+
"grad_norm": 0.14699777960777283,
|
| 22127 |
+
"learning_rate": 0.0005947925441958392,
|
| 22128 |
+
"loss": 2.0574,
|
| 22129 |
+
"num_input_tokens_seen": 65158534656,
|
| 22130 |
+
"step": 124300
|
| 22131 |
+
},
|
| 22132 |
+
{
|
| 22133 |
+
"epoch": 1.1863076427728156,
|
| 22134 |
+
"grad_norm": 0.13368327915668488,
|
| 22135 |
+
"learning_rate": 0.0005920369440849609,
|
| 22136 |
+
"loss": 2.0626,
|
| 22137 |
+
"num_input_tokens_seen": 65184748736,
|
| 22138 |
+
"step": 124350
|
| 22139 |
+
},
|
| 22140 |
+
{
|
| 22141 |
+
"epoch": 1.1867846452890038,
|
| 22142 |
+
"grad_norm": 0.13047395646572113,
|
| 22143 |
+
"learning_rate": 0.0005892784473993184,
|
| 22144 |
+
"loss": 2.06,
|
| 22145 |
+
"num_input_tokens_seen": 65210950912,
|
| 22146 |
+
"step": 124400
|
| 22147 |
+
},
|
| 22148 |
+
{
|
| 22149 |
+
"epoch": 1.187261647805192,
|
| 22150 |
+
"grad_norm": 0.13072432577610016,
|
| 22151 |
+
"learning_rate": 0.0005865171409539613,
|
| 22152 |
+
"loss": 2.0869,
|
| 22153 |
+
"num_input_tokens_seen": 65237165312,
|
| 22154 |
+
"step": 124450
|
| 22155 |
+
},
|
| 22156 |
+
{
|
| 22157 |
+
"epoch": 1.1877386503213805,
|
| 22158 |
+
"grad_norm": 0.14443765580654144,
|
| 22159 |
+
"learning_rate": 0.0005837531116523682,
|
| 22160 |
+
"loss": 2.0675,
|
| 22161 |
+
"num_input_tokens_seen": 65263378112,
|
| 22162 |
+
"step": 124500
|
| 22163 |
+
},
|
| 22164 |
+
{
|
| 22165 |
+
"epoch": 1.1877386503213805,
|
| 22166 |
+
"eval_loss": 1.9832085371017456,
|
| 22167 |
+
"eval_runtime": 83.5278,
|
| 22168 |
+
"eval_samples_per_second": 59.86,
|
| 22169 |
+
"eval_steps_per_second": 14.965,
|
| 22170 |
+
"num_input_tokens_seen": 65263378112,
|
| 22171 |
+
"step": 124500
|
| 22172 |
+
},
|
| 22173 |
+
{
|
| 22174 |
+
"epoch": 1.1882156528375687,
|
| 22175 |
+
"grad_norm": 0.13271184265613556,
|
| 22176 |
+
"learning_rate": 0.0005809864464837105,
|
| 22177 |
+
"loss": 2.0507,
|
| 22178 |
+
"num_input_tokens_seen": 65289588448,
|
| 22179 |
+
"step": 124550
|
| 22180 |
+
},
|
| 22181 |
+
{
|
| 22182 |
+
"epoch": 1.188692655353757,
|
| 22183 |
+
"grad_norm": 0.13720299303531647,
|
| 22184 |
+
"learning_rate": 0.0005782172325201155,
|
| 22185 |
+
"loss": 2.0728,
|
| 22186 |
+
"num_input_tokens_seen": 65315802432,
|
| 22187 |
+
"step": 124600
|
| 22188 |
+
},
|
| 22189 |
+
{
|
| 22190 |
+
"epoch": 1.1891696578699453,
|
| 22191 |
+
"grad_norm": 0.12747812271118164,
|
| 22192 |
+
"learning_rate": 0.0005754455569139257,
|
| 22193 |
+
"loss": 2.0786,
|
| 22194 |
+
"num_input_tokens_seen": 65342011648,
|
| 22195 |
+
"step": 124650
|
| 22196 |
+
},
|
| 22197 |
+
{
|
| 22198 |
+
"epoch": 1.1896466603861335,
|
| 22199 |
+
"grad_norm": 0.13649390637874603,
|
| 22200 |
+
"learning_rate": 0.0005726715068949564,
|
| 22201 |
+
"loss": 2.0578,
|
| 22202 |
+
"num_input_tokens_seen": 65368225184,
|
| 22203 |
+
"step": 124700
|
| 22204 |
+
},
|
| 22205 |
+
{
|
| 22206 |
+
"epoch": 1.1901236629023217,
|
| 22207 |
+
"grad_norm": 0.13283640146255493,
|
| 22208 |
+
"learning_rate": 0.0005698951697677498,
|
| 22209 |
+
"loss": 2.0616,
|
| 22210 |
+
"num_input_tokens_seen": 65394434464,
|
| 22211 |
+
"step": 124750
|
| 22212 |
+
},
|
| 22213 |
+
{
|
| 22214 |
+
"epoch": 1.1906006654185102,
|
| 22215 |
+
"grad_norm": 0.13304251432418823,
|
| 22216 |
+
"learning_rate": 0.0005671166329088278,
|
| 22217 |
+
"loss": 2.0657,
|
| 22218 |
+
"num_input_tokens_seen": 65420648864,
|
| 22219 |
+
"step": 124800
|
| 22220 |
+
},
|
| 22221 |
+
{
|
| 22222 |
+
"epoch": 1.1910776679346984,
|
| 22223 |
+
"grad_norm": 0.1442023664712906,
|
| 22224 |
+
"learning_rate": 0.000564335983763942,
|
| 22225 |
+
"loss": 2.0584,
|
| 22226 |
+
"num_input_tokens_seen": 65446854944,
|
| 22227 |
+
"step": 124850
|
| 22228 |
+
},
|
| 22229 |
+
{
|
| 22230 |
+
"epoch": 1.1915546704508866,
|
| 22231 |
+
"grad_norm": 0.13637055456638336,
|
| 22232 |
+
"learning_rate": 0.0005615533098453215,
|
| 22233 |
+
"loss": 2.0719,
|
| 22234 |
+
"num_input_tokens_seen": 65473067296,
|
| 22235 |
+
"step": 124900
|
| 22236 |
+
},
|
| 22237 |
+
{
|
| 22238 |
+
"epoch": 1.192031672967075,
|
| 22239 |
+
"grad_norm": 0.13165481388568878,
|
| 22240 |
+
"learning_rate": 0.0005587686987289189,
|
| 22241 |
+
"loss": 2.0594,
|
| 22242 |
+
"num_input_tokens_seen": 65499281184,
|
| 22243 |
+
"step": 124950
|
| 22244 |
+
},
|
| 22245 |
+
{
|
| 22246 |
+
"epoch": 1.1925086754832632,
|
| 22247 |
+
"grad_norm": 0.14200669527053833,
|
| 22248 |
+
"learning_rate": 0.0005559822380516539,
|
| 22249 |
+
"loss": 2.0692,
|
| 22250 |
+
"num_input_tokens_seen": 65525493280,
|
| 22251 |
+
"step": 125000
|
| 22252 |
+
},
|
| 22253 |
+
{
|
| 22254 |
+
"epoch": 1.1925086754832632,
|
| 22255 |
+
"eval_loss": 1.982203722000122,
|
| 22256 |
+
"eval_runtime": 82.3332,
|
| 22257 |
+
"eval_samples_per_second": 60.729,
|
| 22258 |
+
"eval_steps_per_second": 15.182,
|
| 22259 |
+
"num_input_tokens_seen": 65525493280,
|
| 22260 |
+
"step": 125000
|
| 22261 |
}
|
| 22262 |
],
|
| 22263 |
"logging_steps": 50,
|
| 22264 |
"max_steps": 140000,
|
| 22265 |
+
"num_input_tokens_seen": 65525493280,
|
| 22266 |
"num_train_epochs": 2,
|
| 22267 |
"save_steps": 1000,
|
| 22268 |
"stateful_callbacks": {
|
|
|
|
| 22277 |
"attributes": {}
|
| 22278 |
}
|
| 22279 |
},
|
| 22280 |
+
"total_flos": 1.1596817125562573e+20,
|
| 22281 |
"train_batch_size": 32,
|
| 22282 |
"trial_name": null,
|
| 22283 |
"trial_params": null
|