Training in progress, step 58000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 301235464
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:24ebb1df57ac2ee9b586e62f321c007518f59293b5104f6e4c9cd4556be49e20
|
| 3 |
size 301235464
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 602335994
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:284d00e91b8ed248cc64cf350da118b741fc38fb51627a69c88a312c68a088a3
|
| 3 |
size 602335994
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ec1bfb0db1c21e8b4cd52af95928aa8366b624cdfe8a7ae4baa053e84325dfb8
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:546d8e8727a1368f14dcaccf9c4cddd7ddc8e71b1cf1d15c1ef9e8250409d1c7
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -10154,11 +10154,189 @@
|
|
| 10154 |
"eval_steps_per_second": 23.418,
|
| 10155 |
"num_input_tokens_seen": 14942203456,
|
| 10156 |
"step": 57000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10157 |
}
|
| 10158 |
],
|
| 10159 |
"logging_steps": 50,
|
| 10160 |
"max_steps": 70000,
|
| 10161 |
-
"num_input_tokens_seen":
|
| 10162 |
"num_train_epochs": 1,
|
| 10163 |
"save_steps": 1000,
|
| 10164 |
"stateful_callbacks": {
|
|
@@ -10173,7 +10351,7 @@
|
|
| 10173 |
"attributes": {}
|
| 10174 |
}
|
| 10175 |
},
|
| 10176 |
-
"total_flos":
|
| 10177 |
"train_batch_size": 64,
|
| 10178 |
"trial_name": null,
|
| 10179 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.2766614593891983,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 58000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 10154 |
"eval_steps_per_second": 23.418,
|
| 10155 |
"num_input_tokens_seen": 14942203456,
|
| 10156 |
"step": 57000
|
| 10157 |
+
},
|
| 10158 |
+
{
|
| 10159 |
+
"epoch": 0.2721299354854097,
|
| 10160 |
+
"grad_norm": 0.22044019401073456,
|
| 10161 |
+
"learning_rate": 0.0009861849601988384,
|
| 10162 |
+
"loss": 2.6119,
|
| 10163 |
+
"num_input_tokens_seen": 14955310656,
|
| 10164 |
+
"step": 57050
|
| 10165 |
+
},
|
| 10166 |
+
{
|
| 10167 |
+
"epoch": 0.2723684367435038,
|
| 10168 |
+
"grad_norm": 0.2155238389968872,
|
| 10169 |
+
"learning_rate": 0.0009848447601883434,
|
| 10170 |
+
"loss": 2.5869,
|
| 10171 |
+
"num_input_tokens_seen": 14968417856,
|
| 10172 |
+
"step": 57100
|
| 10173 |
+
},
|
| 10174 |
+
{
|
| 10175 |
+
"epoch": 0.27260693800159796,
|
| 10176 |
+
"grad_norm": 0.21131549775600433,
|
| 10177 |
+
"learning_rate": 0.0009834435247725033,
|
| 10178 |
+
"loss": 2.5988,
|
| 10179 |
+
"num_input_tokens_seen": 14981525056,
|
| 10180 |
+
"step": 57150
|
| 10181 |
+
},
|
| 10182 |
+
{
|
| 10183 |
+
"epoch": 0.27284543925969207,
|
| 10184 |
+
"grad_norm": 0.21247337758541107,
|
| 10185 |
+
"learning_rate": 0.0009819814303479266,
|
| 10186 |
+
"loss": 2.6198,
|
| 10187 |
+
"num_input_tokens_seen": 14994632256,
|
| 10188 |
+
"step": 57200
|
| 10189 |
+
},
|
| 10190 |
+
{
|
| 10191 |
+
"epoch": 0.27308394051778623,
|
| 10192 |
+
"grad_norm": 0.21916711330413818,
|
| 10193 |
+
"learning_rate": 0.00098045866097255,
|
| 10194 |
+
"loss": 2.6019,
|
| 10195 |
+
"num_input_tokens_seen": 15007739456,
|
| 10196 |
+
"step": 57250
|
| 10197 |
+
},
|
| 10198 |
+
{
|
| 10199 |
+
"epoch": 0.2733224417758804,
|
| 10200 |
+
"grad_norm": 0.1925441473722458,
|
| 10201 |
+
"learning_rate": 0.0009788754083424652,
|
| 10202 |
+
"loss": 2.6143,
|
| 10203 |
+
"num_input_tokens_seen": 15020846656,
|
| 10204 |
+
"step": 57300
|
| 10205 |
+
},
|
| 10206 |
+
{
|
| 10207 |
+
"epoch": 0.2735609430339745,
|
| 10208 |
+
"grad_norm": 0.38578665256500244,
|
| 10209 |
+
"learning_rate": 0.0009772318717677904,
|
| 10210 |
+
"loss": 2.6037,
|
| 10211 |
+
"num_input_tokens_seen": 15033953856,
|
| 10212 |
+
"step": 57350
|
| 10213 |
+
},
|
| 10214 |
+
{
|
| 10215 |
+
"epoch": 0.27379944429206865,
|
| 10216 |
+
"grad_norm": 0.19650611281394958,
|
| 10217 |
+
"learning_rate": 0.0009755282581475768,
|
| 10218 |
+
"loss": 2.5745,
|
| 10219 |
+
"num_input_tokens_seen": 15047061056,
|
| 10220 |
+
"step": 57400
|
| 10221 |
+
},
|
| 10222 |
+
{
|
| 10223 |
+
"epoch": 0.27403794555016275,
|
| 10224 |
+
"grad_norm": 0.2376088798046112,
|
| 10225 |
+
"learning_rate": 0.0009737647819437645,
|
| 10226 |
+
"loss": 2.5968,
|
| 10227 |
+
"num_input_tokens_seen": 15060168256,
|
| 10228 |
+
"step": 57450
|
| 10229 |
+
},
|
| 10230 |
+
{
|
| 10231 |
+
"epoch": 0.2742764468082569,
|
| 10232 |
+
"grad_norm": 0.21746863424777985,
|
| 10233 |
+
"learning_rate": 0.0009719416651541838,
|
| 10234 |
+
"loss": 2.5965,
|
| 10235 |
+
"num_input_tokens_seen": 15073275456,
|
| 10236 |
+
"step": 57500
|
| 10237 |
+
},
|
| 10238 |
+
{
|
| 10239 |
+
"epoch": 0.2742764468082569,
|
| 10240 |
+
"eval_loss": 2.483751058578491,
|
| 10241 |
+
"eval_runtime": 53.9622,
|
| 10242 |
+
"eval_samples_per_second": 92.657,
|
| 10243 |
+
"eval_steps_per_second": 23.164,
|
| 10244 |
+
"num_input_tokens_seen": 15073275456,
|
| 10245 |
+
"step": 57500
|
| 10246 |
+
},
|
| 10247 |
+
{
|
| 10248 |
+
"epoch": 0.27451494806635107,
|
| 10249 |
+
"grad_norm": 0.2898815870285034,
|
| 10250 |
+
"learning_rate": 0.0009700591372846095,
|
| 10251 |
+
"loss": 2.6105,
|
| 10252 |
+
"num_input_tokens_seen": 15086382656,
|
| 10253 |
+
"step": 57550
|
| 10254 |
+
},
|
| 10255 |
+
{
|
| 10256 |
+
"epoch": 0.2747534493244452,
|
| 10257 |
+
"grad_norm": 0.24887384474277496,
|
| 10258 |
+
"learning_rate": 0.0009681174353198686,
|
| 10259 |
+
"loss": 2.6103,
|
| 10260 |
+
"num_input_tokens_seen": 15099489856,
|
| 10261 |
+
"step": 57600
|
| 10262 |
+
},
|
| 10263 |
+
{
|
| 10264 |
+
"epoch": 0.27499195058253934,
|
| 10265 |
+
"grad_norm": 0.26613715291023254,
|
| 10266 |
+
"learning_rate": 0.0009661168036940071,
|
| 10267 |
+
"loss": 2.6296,
|
| 10268 |
+
"num_input_tokens_seen": 15112597056,
|
| 10269 |
+
"step": 57650
|
| 10270 |
+
},
|
| 10271 |
+
{
|
| 10272 |
+
"epoch": 0.27523045184063344,
|
| 10273 |
+
"grad_norm": 0.23983849585056305,
|
| 10274 |
+
"learning_rate": 0.0009640574942595195,
|
| 10275 |
+
"loss": 2.6008,
|
| 10276 |
+
"num_input_tokens_seen": 15125704256,
|
| 10277 |
+
"step": 57700
|
| 10278 |
+
},
|
| 10279 |
+
{
|
| 10280 |
+
"epoch": 0.2754689530987276,
|
| 10281 |
+
"grad_norm": 0.23169022798538208,
|
| 10282 |
+
"learning_rate": 0.0009619397662556434,
|
| 10283 |
+
"loss": 2.596,
|
| 10284 |
+
"num_input_tokens_seen": 15138811456,
|
| 10285 |
+
"step": 57750
|
| 10286 |
+
},
|
| 10287 |
+
{
|
| 10288 |
+
"epoch": 0.27570745435682176,
|
| 10289 |
+
"grad_norm": 0.21353812515735626,
|
| 10290 |
+
"learning_rate": 0.0009597638862757254,
|
| 10291 |
+
"loss": 2.6039,
|
| 10292 |
+
"num_input_tokens_seen": 15151918656,
|
| 10293 |
+
"step": 57800
|
| 10294 |
+
},
|
| 10295 |
+
{
|
| 10296 |
+
"epoch": 0.27594595561491586,
|
| 10297 |
+
"grad_norm": 0.2561227083206177,
|
| 10298 |
+
"learning_rate": 0.00095753012823366,
|
| 10299 |
+
"loss": 2.6046,
|
| 10300 |
+
"num_input_tokens_seen": 15165025856,
|
| 10301 |
+
"step": 57850
|
| 10302 |
+
},
|
| 10303 |
+
{
|
| 10304 |
+
"epoch": 0.27618445687301,
|
| 10305 |
+
"grad_norm": 0.20380394160747528,
|
| 10306 |
+
"learning_rate": 0.000955238773329408,
|
| 10307 |
+
"loss": 2.5968,
|
| 10308 |
+
"num_input_tokens_seen": 15178133056,
|
| 10309 |
+
"step": 57900
|
| 10310 |
+
},
|
| 10311 |
+
{
|
| 10312 |
+
"epoch": 0.2764229581311041,
|
| 10313 |
+
"grad_norm": 0.26447024941444397,
|
| 10314 |
+
"learning_rate": 0.000952890110013597,
|
| 10315 |
+
"loss": 2.5848,
|
| 10316 |
+
"num_input_tokens_seen": 15191240256,
|
| 10317 |
+
"step": 57950
|
| 10318 |
+
},
|
| 10319 |
+
{
|
| 10320 |
+
"epoch": 0.2766614593891983,
|
| 10321 |
+
"grad_norm": 0.23530781269073486,
|
| 10322 |
+
"learning_rate": 0.0009504844339512095,
|
| 10323 |
+
"loss": 2.582,
|
| 10324 |
+
"num_input_tokens_seen": 15204347456,
|
| 10325 |
+
"step": 58000
|
| 10326 |
+
},
|
| 10327 |
+
{
|
| 10328 |
+
"epoch": 0.2766614593891983,
|
| 10329 |
+
"eval_loss": 2.482050895690918,
|
| 10330 |
+
"eval_runtime": 53.5775,
|
| 10331 |
+
"eval_samples_per_second": 93.323,
|
| 10332 |
+
"eval_steps_per_second": 23.331,
|
| 10333 |
+
"num_input_tokens_seen": 15204347456,
|
| 10334 |
+
"step": 58000
|
| 10335 |
}
|
| 10336 |
],
|
| 10337 |
"logging_steps": 50,
|
| 10338 |
"max_steps": 70000,
|
| 10339 |
+
"num_input_tokens_seen": 15204347456,
|
| 10340 |
"num_train_epochs": 1,
|
| 10341 |
"save_steps": 1000,
|
| 10342 |
"stateful_callbacks": {
|
|
|
|
| 10351 |
"attributes": {}
|
| 10352 |
}
|
| 10353 |
},
|
| 10354 |
+
"total_flos": 4.0673113389111706e+18,
|
| 10355 |
"train_batch_size": 64,
|
| 10356 |
"trial_name": null,
|
| 10357 |
"trial_params": null
|