Training in progress, step 11838, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 328277848
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:321e47c1dc832a190631f95fac9772b430fc73140d3d5243eca49fa4976c0528
|
| 3 |
size 328277848
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 318646859
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a2507c13555041b38460e977baf58f0396d26b9357458953b37b5599c0ee7222
|
| 3 |
size 318646859
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1465
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4b172ef1a2b23540cb3d53eed9b6dcd9ee9e06553bb8c4f5a46142cb0fe60689
|
| 3 |
size 1465
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch":
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -8249,6 +8249,237 @@
|
|
| 8249 |
"eval_samples_per_second": 246.128,
|
| 8250 |
"eval_steps_per_second": 5.169,
|
| 8251 |
"step": 11500
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8252 |
}
|
| 8253 |
],
|
| 8254 |
"logging_steps": 10,
|
|
@@ -8263,12 +8494,12 @@
|
|
| 8263 |
"should_evaluate": false,
|
| 8264 |
"should_log": false,
|
| 8265 |
"should_save": true,
|
| 8266 |
-
"should_training_stop":
|
| 8267 |
},
|
| 8268 |
"attributes": {}
|
| 8269 |
}
|
| 8270 |
},
|
| 8271 |
-
"total_flos": 3.
|
| 8272 |
"train_batch_size": 48,
|
| 8273 |
"trial_name": null,
|
| 8274 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 2.0,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 11838,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 8249 |
"eval_samples_per_second": 246.128,
|
| 8250 |
"eval_steps_per_second": 5.169,
|
| 8251 |
"step": 11500
|
| 8252 |
+
},
|
| 8253 |
+
{
|
| 8254 |
+
"epoch": 1.9445852339922283,
|
| 8255 |
+
"grad_norm": 0.4352650046348572,
|
| 8256 |
+
"learning_rate": 8.270650415620584e-07,
|
| 8257 |
+
"loss": 4.2965538024902346,
|
| 8258 |
+
"step": 11510
|
| 8259 |
+
},
|
| 8260 |
+
{
|
| 8261 |
+
"epoch": 1.9462747085656362,
|
| 8262 |
+
"grad_norm": 0.43585142493247986,
|
| 8263 |
+
"learning_rate": 7.775944524542055e-07,
|
| 8264 |
+
"loss": 4.270129776000976,
|
| 8265 |
+
"step": 11520
|
| 8266 |
+
},
|
| 8267 |
+
{
|
| 8268 |
+
"epoch": 1.9479641831390437,
|
| 8269 |
+
"grad_norm": 0.4469541907310486,
|
| 8270 |
+
"learning_rate": 7.296455308872406e-07,
|
| 8271 |
+
"loss": 4.283909606933594,
|
| 8272 |
+
"step": 11530
|
| 8273 |
+
},
|
| 8274 |
+
{
|
| 8275 |
+
"epoch": 1.9496536577124515,
|
| 8276 |
+
"grad_norm": 0.4361380636692047,
|
| 8277 |
+
"learning_rate": 6.832187658113441e-07,
|
| 8278 |
+
"loss": 4.296160125732422,
|
| 8279 |
+
"step": 11540
|
| 8280 |
+
},
|
| 8281 |
+
{
|
| 8282 |
+
"epoch": 1.9513431322858592,
|
| 8283 |
+
"grad_norm": 0.44409504532814026,
|
| 8284 |
+
"learning_rate": 6.383146306547626e-07,
|
| 8285 |
+
"loss": 4.304541778564453,
|
| 8286 |
+
"step": 11550
|
| 8287 |
+
},
|
| 8288 |
+
{
|
| 8289 |
+
"epoch": 1.9530326068592667,
|
| 8290 |
+
"grad_norm": 0.45062074065208435,
|
| 8291 |
+
"learning_rate": 5.949335833189628e-07,
|
| 8292 |
+
"loss": 4.3281913757324215,
|
| 8293 |
+
"step": 11560
|
| 8294 |
+
},
|
| 8295 |
+
{
|
| 8296 |
+
"epoch": 1.9547220814326745,
|
| 8297 |
+
"grad_norm": 0.45208507776260376,
|
| 8298 |
+
"learning_rate": 5.530760661741018e-07,
|
| 8299 |
+
"loss": 4.3035846710205075,
|
| 8300 |
+
"step": 11570
|
| 8301 |
+
},
|
| 8302 |
+
{
|
| 8303 |
+
"epoch": 1.9564115560060822,
|
| 8304 |
+
"grad_norm": 0.44333794713020325,
|
| 8305 |
+
"learning_rate": 5.127425060543478e-07,
|
| 8306 |
+
"loss": 4.278887939453125,
|
| 8307 |
+
"step": 11580
|
| 8308 |
+
},
|
| 8309 |
+
{
|
| 8310 |
+
"epoch": 1.9581010305794897,
|
| 8311 |
+
"grad_norm": 0.44367748498916626,
|
| 8312 |
+
"learning_rate": 4.7393331425364943e-07,
|
| 8313 |
+
"loss": 4.281793594360352,
|
| 8314 |
+
"step": 11590
|
| 8315 |
+
},
|
| 8316 |
+
{
|
| 8317 |
+
"epoch": 1.9597905051528974,
|
| 8318 |
+
"grad_norm": 0.4411092698574066,
|
| 8319 |
+
"learning_rate": 4.3664888652144017e-07,
|
| 8320 |
+
"loss": 4.278807067871094,
|
| 8321 |
+
"step": 11600
|
| 8322 |
+
},
|
| 8323 |
+
{
|
| 8324 |
+
"epoch": 1.9614799797263052,
|
| 8325 |
+
"grad_norm": 0.44609910249710083,
|
| 8326 |
+
"learning_rate": 4.008896030587072e-07,
|
| 8327 |
+
"loss": 4.270274353027344,
|
| 8328 |
+
"step": 11610
|
| 8329 |
+
},
|
| 8330 |
+
{
|
| 8331 |
+
"epoch": 1.9631694542997127,
|
| 8332 |
+
"grad_norm": 0.43740522861480713,
|
| 8333 |
+
"learning_rate": 3.6665582851406195e-07,
|
| 8334 |
+
"loss": 4.296014785766602,
|
| 8335 |
+
"step": 11620
|
| 8336 |
+
},
|
| 8337 |
+
{
|
| 8338 |
+
"epoch": 1.9648589288731204,
|
| 8339 |
+
"grad_norm": 0.44448962807655334,
|
| 8340 |
+
"learning_rate": 3.3394791198000927e-07,
|
| 8341 |
+
"loss": 4.282284927368164,
|
| 8342 |
+
"step": 11630
|
| 8343 |
+
},
|
| 8344 |
+
{
|
| 8345 |
+
"epoch": 1.9665484034465281,
|
| 8346 |
+
"grad_norm": 0.45065152645111084,
|
| 8347 |
+
"learning_rate": 3.027661869893672e-07,
|
| 8348 |
+
"loss": 4.2820892333984375,
|
| 8349 |
+
"step": 11640
|
| 8350 |
+
},
|
| 8351 |
+
{
|
| 8352 |
+
"epoch": 1.9682378780199357,
|
| 8353 |
+
"grad_norm": 0.4398045539855957,
|
| 8354 |
+
"learning_rate": 2.731109715119861e-07,
|
| 8355 |
+
"loss": 4.281244277954102,
|
| 8356 |
+
"step": 11650
|
| 8357 |
+
},
|
| 8358 |
+
{
|
| 8359 |
+
"epoch": 1.9699273525933436,
|
| 8360 |
+
"grad_norm": 0.4467960000038147,
|
| 8361 |
+
"learning_rate": 2.4498256795135173e-07,
|
| 8362 |
+
"loss": 4.307322311401367,
|
| 8363 |
+
"step": 11660
|
| 8364 |
+
},
|
| 8365 |
+
{
|
| 8366 |
+
"epoch": 1.9716168271667511,
|
| 8367 |
+
"grad_norm": 0.4327242970466614,
|
| 8368 |
+
"learning_rate": 2.183812631415871e-07,
|
| 8369 |
+
"loss": 4.275672149658203,
|
| 8370 |
+
"step": 11670
|
| 8371 |
+
},
|
| 8372 |
+
{
|
| 8373 |
+
"epoch": 1.9733063017401589,
|
| 8374 |
+
"grad_norm": 0.43306484818458557,
|
| 8375 |
+
"learning_rate": 1.933073283445219e-07,
|
| 8376 |
+
"loss": 4.291437149047852,
|
| 8377 |
+
"step": 11680
|
| 8378 |
+
},
|
| 8379 |
+
{
|
| 8380 |
+
"epoch": 1.9749957763135666,
|
| 8381 |
+
"grad_norm": 0.4464097023010254,
|
| 8382 |
+
"learning_rate": 1.697610192469112e-07,
|
| 8383 |
+
"loss": 4.312542343139649,
|
| 8384 |
+
"step": 11690
|
| 8385 |
+
},
|
| 8386 |
+
{
|
| 8387 |
+
"epoch": 1.976685250886974,
|
| 8388 |
+
"grad_norm": 0.4436480700969696,
|
| 8389 |
+
"learning_rate": 1.4774257595783766e-07,
|
| 8390 |
+
"loss": 4.300673294067383,
|
| 8391 |
+
"step": 11700
|
| 8392 |
+
},
|
| 8393 |
+
{
|
| 8394 |
+
"epoch": 1.9783747254603818,
|
| 8395 |
+
"grad_norm": 0.44450485706329346,
|
| 8396 |
+
"learning_rate": 1.272522230062467e-07,
|
| 8397 |
+
"loss": 4.290340805053711,
|
| 8398 |
+
"step": 11710
|
| 8399 |
+
},
|
| 8400 |
+
{
|
| 8401 |
+
"epoch": 1.9800642000337896,
|
| 8402 |
+
"grad_norm": 0.4362986981868744,
|
| 8403 |
+
"learning_rate": 1.0829016933869838e-07,
|
| 8404 |
+
"loss": 4.2894245147705075,
|
| 8405 |
+
"step": 11720
|
| 8406 |
+
},
|
| 8407 |
+
{
|
| 8408 |
+
"epoch": 1.981753674607197,
|
| 8409 |
+
"grad_norm": 0.43450725078582764,
|
| 8410 |
+
"learning_rate": 9.085660831715247e-08,
|
| 8411 |
+
"loss": 4.298795700073242,
|
| 8412 |
+
"step": 11730
|
| 8413 |
+
},
|
| 8414 |
+
{
|
| 8415 |
+
"epoch": 1.9834431491806048,
|
| 8416 |
+
"grad_norm": 0.44246765971183777,
|
| 8417 |
+
"learning_rate": 7.495171771710328e-08,
|
| 8418 |
+
"loss": 4.293585968017578,
|
| 8419 |
+
"step": 11740
|
| 8420 |
+
},
|
| 8421 |
+
{
|
| 8422 |
+
"epoch": 1.9851326237540126,
|
| 8423 |
+
"grad_norm": 0.43929263949394226,
|
| 8424 |
+
"learning_rate": 6.057565972568123e-08,
|
| 8425 |
+
"loss": 4.293174743652344,
|
| 8426 |
+
"step": 11750
|
| 8427 |
+
},
|
| 8428 |
+
{
|
| 8429 |
+
"epoch": 1.98682209832742,
|
| 8430 |
+
"grad_norm": 0.4450415372848511,
|
| 8431 |
+
"learning_rate": 4.772858094005405e-08,
|
| 8432 |
+
"loss": 4.3004913330078125,
|
| 8433 |
+
"step": 11760
|
| 8434 |
+
},
|
| 8435 |
+
{
|
| 8436 |
+
"epoch": 1.9885115729008278,
|
| 8437 |
+
"grad_norm": 0.4472520053386688,
|
| 8438 |
+
"learning_rate": 3.641061236591136e-08,
|
| 8439 |
+
"loss": 4.2836250305175785,
|
| 8440 |
+
"step": 11770
|
| 8441 |
+
},
|
| 8442 |
+
{
|
| 8443 |
+
"epoch": 1.9902010474742355,
|
| 8444 |
+
"grad_norm": 0.44302183389663696,
|
| 8445 |
+
"learning_rate": 2.6621869416099118e-08,
|
| 8446 |
+
"loss": 4.290175247192383,
|
| 8447 |
+
"step": 11780
|
| 8448 |
+
},
|
| 8449 |
+
{
|
| 8450 |
+
"epoch": 1.991890522047643,
|
| 8451 |
+
"grad_norm": 0.4414844512939453,
|
| 8452 |
+
"learning_rate": 1.8362451909520458e-08,
|
| 8453 |
+
"loss": 4.286873245239258,
|
| 8454 |
+
"step": 11790
|
| 8455 |
+
},
|
| 8456 |
+
{
|
| 8457 |
+
"epoch": 1.993579996621051,
|
| 8458 |
+
"grad_norm": 0.44598934054374695,
|
| 8459 |
+
"learning_rate": 1.16324440700033e-08,
|
| 8460 |
+
"loss": 4.297615051269531,
|
| 8461 |
+
"step": 11800
|
| 8462 |
+
},
|
| 8463 |
+
{
|
| 8464 |
+
"epoch": 1.9952694711944585,
|
| 8465 |
+
"grad_norm": 0.4523853361606598,
|
| 8466 |
+
"learning_rate": 6.431914525567572e-09,
|
| 8467 |
+
"loss": 4.289733505249023,
|
| 8468 |
+
"step": 11810
|
| 8469 |
+
},
|
| 8470 |
+
{
|
| 8471 |
+
"epoch": 1.996958945767866,
|
| 8472 |
+
"grad_norm": 0.4494129419326782,
|
| 8473 |
+
"learning_rate": 2.760916307625871e-09,
|
| 8474 |
+
"loss": 4.304800415039063,
|
| 8475 |
+
"step": 11820
|
| 8476 |
+
},
|
| 8477 |
+
{
|
| 8478 |
+
"epoch": 1.998648420341274,
|
| 8479 |
+
"grad_norm": 0.4344528913497925,
|
| 8480 |
+
"learning_rate": 6.194868504838524e-10,
|
| 8481 |
+
"loss": 4.279055786132813,
|
| 8482 |
+
"step": 11830
|
| 8483 |
}
|
| 8484 |
],
|
| 8485 |
"logging_steps": 10,
|
|
|
|
| 8494 |
"should_evaluate": false,
|
| 8495 |
"should_log": false,
|
| 8496 |
"should_save": true,
|
| 8497 |
+
"should_training_stop": true
|
| 8498 |
},
|
| 8499 |
"attributes": {}
|
| 8500 |
}
|
| 8501 |
},
|
| 8502 |
+
"total_flos": 3.959258038224814e+17,
|
| 8503 |
"train_batch_size": 48,
|
| 8504 |
"trial_name": null,
|
| 8505 |
"trial_params": null
|