Training in progress, step 37000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 517931840
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2ce938a644f0cf4d10d231b631256c1bcbd8d98d79787b20ca3ed148b88756be
|
| 3 |
size 517931840
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1035661434
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:64656c8de22e45c2941d2ea854ec0d370243cfeea2920fb181966f363dd14777
|
| 3 |
size 1035661434
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0451e520bbe84b70e4cd2907956e95cd6d56464539f21e68e26c043e5cf63b1e
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:90815e584013ee668de6d5b656c515902fbacbb32f54a71d2d1d29e05110019f
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -6416,11 +6416,189 @@
|
|
| 6416 |
"eval_steps_per_second": 18.606,
|
| 6417 |
"num_input_tokens_seen": 37748732160,
|
| 6418 |
"step": 36000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6419 |
}
|
| 6420 |
],
|
| 6421 |
"logging_steps": 50,
|
| 6422 |
"max_steps": 200000,
|
| 6423 |
-
"num_input_tokens_seen":
|
| 6424 |
"num_train_epochs": 5,
|
| 6425 |
"save_steps": 1000,
|
| 6426 |
"stateful_callbacks": {
|
|
@@ -6435,7 +6613,7 @@
|
|
| 6435 |
"attributes": {}
|
| 6436 |
}
|
| 6437 |
},
|
| 6438 |
-
"total_flos": 2.
|
| 6439 |
"train_batch_size": 64,
|
| 6440 |
"trial_name": null,
|
| 6441 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.8127435999873696,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 37000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 6416 |
"eval_steps_per_second": 18.606,
|
| 6417 |
"num_input_tokens_seen": 37748732160,
|
| 6418 |
"step": 36000
|
| 6419 |
+
},
|
| 6420 |
+
{
|
| 6421 |
+
"epoch": 0.7918758589066127,
|
| 6422 |
+
"grad_norm": 0.16054154932498932,
|
| 6423 |
+
"learning_rate": 0.001,
|
| 6424 |
+
"loss": 2.6516,
|
| 6425 |
+
"num_input_tokens_seen": 37801160960,
|
| 6426 |
+
"step": 36050
|
| 6427 |
+
},
|
| 6428 |
+
{
|
| 6429 |
+
"epoch": 0.7929741610687578,
|
| 6430 |
+
"grad_norm": 0.15180550515651703,
|
| 6431 |
+
"learning_rate": 0.001,
|
| 6432 |
+
"loss": 2.6508,
|
| 6433 |
+
"num_input_tokens_seen": 37853589760,
|
| 6434 |
+
"step": 36100
|
| 6435 |
+
},
|
| 6436 |
+
{
|
| 6437 |
+
"epoch": 0.794072463230903,
|
| 6438 |
+
"grad_norm": 0.19564937055110931,
|
| 6439 |
+
"learning_rate": 0.001,
|
| 6440 |
+
"loss": 2.6532,
|
| 6441 |
+
"num_input_tokens_seen": 37906018560,
|
| 6442 |
+
"step": 36150
|
| 6443 |
+
},
|
| 6444 |
+
{
|
| 6445 |
+
"epoch": 0.795170765393048,
|
| 6446 |
+
"grad_norm": 0.15047501027584076,
|
| 6447 |
+
"learning_rate": 0.001,
|
| 6448 |
+
"loss": 2.6567,
|
| 6449 |
+
"num_input_tokens_seen": 37958447360,
|
| 6450 |
+
"step": 36200
|
| 6451 |
+
},
|
| 6452 |
+
{
|
| 6453 |
+
"epoch": 0.7962690675551931,
|
| 6454 |
+
"grad_norm": 0.1420314759016037,
|
| 6455 |
+
"learning_rate": 0.001,
|
| 6456 |
+
"loss": 2.6511,
|
| 6457 |
+
"num_input_tokens_seen": 38010876160,
|
| 6458 |
+
"step": 36250
|
| 6459 |
+
},
|
| 6460 |
+
{
|
| 6461 |
+
"epoch": 0.7973673697173382,
|
| 6462 |
+
"grad_norm": 0.14328153431415558,
|
| 6463 |
+
"learning_rate": 0.001,
|
| 6464 |
+
"loss": 2.6601,
|
| 6465 |
+
"num_input_tokens_seen": 38063304960,
|
| 6466 |
+
"step": 36300
|
| 6467 |
+
},
|
| 6468 |
+
{
|
| 6469 |
+
"epoch": 0.7984656718794833,
|
| 6470 |
+
"grad_norm": 0.15527622401714325,
|
| 6471 |
+
"learning_rate": 0.001,
|
| 6472 |
+
"loss": 2.6598,
|
| 6473 |
+
"num_input_tokens_seen": 38115733760,
|
| 6474 |
+
"step": 36350
|
| 6475 |
+
},
|
| 6476 |
+
{
|
| 6477 |
+
"epoch": 0.7995639740416284,
|
| 6478 |
+
"grad_norm": 0.15956974029541016,
|
| 6479 |
+
"learning_rate": 0.001,
|
| 6480 |
+
"loss": 2.6522,
|
| 6481 |
+
"num_input_tokens_seen": 38168162560,
|
| 6482 |
+
"step": 36400
|
| 6483 |
+
},
|
| 6484 |
+
{
|
| 6485 |
+
"epoch": 0.8006622762037735,
|
| 6486 |
+
"grad_norm": 0.15193034708499908,
|
| 6487 |
+
"learning_rate": 0.001,
|
| 6488 |
+
"loss": 2.6561,
|
| 6489 |
+
"num_input_tokens_seen": 38220591360,
|
| 6490 |
+
"step": 36450
|
| 6491 |
+
},
|
| 6492 |
+
{
|
| 6493 |
+
"epoch": 0.8017605783659186,
|
| 6494 |
+
"grad_norm": 0.1692439615726471,
|
| 6495 |
+
"learning_rate": 0.001,
|
| 6496 |
+
"loss": 2.653,
|
| 6497 |
+
"num_input_tokens_seen": 38273020160,
|
| 6498 |
+
"step": 36500
|
| 6499 |
+
},
|
| 6500 |
+
{
|
| 6501 |
+
"epoch": 0.8017605783659186,
|
| 6502 |
+
"eval_loss": 2.553743362426758,
|
| 6503 |
+
"eval_runtime": 66.3488,
|
| 6504 |
+
"eval_samples_per_second": 75.359,
|
| 6505 |
+
"eval_steps_per_second": 18.84,
|
| 6506 |
+
"num_input_tokens_seen": 38273020160,
|
| 6507 |
+
"step": 36500
|
| 6508 |
+
},
|
| 6509 |
+
{
|
| 6510 |
+
"epoch": 0.8028588805280636,
|
| 6511 |
+
"grad_norm": 0.473707377910614,
|
| 6512 |
+
"learning_rate": 0.001,
|
| 6513 |
+
"loss": 2.6604,
|
| 6514 |
+
"num_input_tokens_seen": 38325448960,
|
| 6515 |
+
"step": 36550
|
| 6516 |
+
},
|
| 6517 |
+
{
|
| 6518 |
+
"epoch": 0.8039571826902088,
|
| 6519 |
+
"grad_norm": 0.16226574778556824,
|
| 6520 |
+
"learning_rate": 0.001,
|
| 6521 |
+
"loss": 2.6615,
|
| 6522 |
+
"num_input_tokens_seen": 38377877760,
|
| 6523 |
+
"step": 36600
|
| 6524 |
+
},
|
| 6525 |
+
{
|
| 6526 |
+
"epoch": 0.8050554848523539,
|
| 6527 |
+
"grad_norm": 0.17274035513401031,
|
| 6528 |
+
"learning_rate": 0.001,
|
| 6529 |
+
"loss": 2.6616,
|
| 6530 |
+
"num_input_tokens_seen": 38430306560,
|
| 6531 |
+
"step": 36650
|
| 6532 |
+
},
|
| 6533 |
+
{
|
| 6534 |
+
"epoch": 0.8061537870144989,
|
| 6535 |
+
"grad_norm": 0.14171990752220154,
|
| 6536 |
+
"learning_rate": 0.001,
|
| 6537 |
+
"loss": 2.6628,
|
| 6538 |
+
"num_input_tokens_seen": 38482735360,
|
| 6539 |
+
"step": 36700
|
| 6540 |
+
},
|
| 6541 |
+
{
|
| 6542 |
+
"epoch": 0.807252089176644,
|
| 6543 |
+
"grad_norm": 0.3828020989894867,
|
| 6544 |
+
"learning_rate": 0.001,
|
| 6545 |
+
"loss": 2.6717,
|
| 6546 |
+
"num_input_tokens_seen": 38535164160,
|
| 6547 |
+
"step": 36750
|
| 6548 |
+
},
|
| 6549 |
+
{
|
| 6550 |
+
"epoch": 0.8083503913387892,
|
| 6551 |
+
"grad_norm": 0.20836575329303741,
|
| 6552 |
+
"learning_rate": 0.001,
|
| 6553 |
+
"loss": 2.685,
|
| 6554 |
+
"num_input_tokens_seen": 38587592960,
|
| 6555 |
+
"step": 36800
|
| 6556 |
+
},
|
| 6557 |
+
{
|
| 6558 |
+
"epoch": 0.8094486935009343,
|
| 6559 |
+
"grad_norm": 0.14613227546215057,
|
| 6560 |
+
"learning_rate": 0.001,
|
| 6561 |
+
"loss": 2.6687,
|
| 6562 |
+
"num_input_tokens_seen": 38640021760,
|
| 6563 |
+
"step": 36850
|
| 6564 |
+
},
|
| 6565 |
+
{
|
| 6566 |
+
"epoch": 0.8105469956630793,
|
| 6567 |
+
"grad_norm": 0.16505028307437897,
|
| 6568 |
+
"learning_rate": 0.001,
|
| 6569 |
+
"loss": 2.6654,
|
| 6570 |
+
"num_input_tokens_seen": 38692450560,
|
| 6571 |
+
"step": 36900
|
| 6572 |
+
},
|
| 6573 |
+
{
|
| 6574 |
+
"epoch": 0.8116452978252244,
|
| 6575 |
+
"grad_norm": 0.15305323898792267,
|
| 6576 |
+
"learning_rate": 0.001,
|
| 6577 |
+
"loss": 2.6612,
|
| 6578 |
+
"num_input_tokens_seen": 38744879360,
|
| 6579 |
+
"step": 36950
|
| 6580 |
+
},
|
| 6581 |
+
{
|
| 6582 |
+
"epoch": 0.8127435999873696,
|
| 6583 |
+
"grad_norm": 0.2416296899318695,
|
| 6584 |
+
"learning_rate": 0.001,
|
| 6585 |
+
"loss": 2.6614,
|
| 6586 |
+
"num_input_tokens_seen": 38797308160,
|
| 6587 |
+
"step": 37000
|
| 6588 |
+
},
|
| 6589 |
+
{
|
| 6590 |
+
"epoch": 0.8127435999873696,
|
| 6591 |
+
"eval_loss": 2.5642571449279785,
|
| 6592 |
+
"eval_runtime": 66.5631,
|
| 6593 |
+
"eval_samples_per_second": 75.117,
|
| 6594 |
+
"eval_steps_per_second": 18.779,
|
| 6595 |
+
"num_input_tokens_seen": 38797308160,
|
| 6596 |
+
"step": 37000
|
| 6597 |
}
|
| 6598 |
],
|
| 6599 |
"logging_steps": 50,
|
| 6600 |
"max_steps": 200000,
|
| 6601 |
+
"num_input_tokens_seen": 38797308160,
|
| 6602 |
"num_train_epochs": 5,
|
| 6603 |
"save_steps": 1000,
|
| 6604 |
"stateful_callbacks": {
|
|
|
|
| 6613 |
"attributes": {}
|
| 6614 |
}
|
| 6615 |
},
|
| 6616 |
+
"total_flos": 2.2095351303794196e+19,
|
| 6617 |
"train_batch_size": 64,
|
| 6618 |
"trial_name": null,
|
| 6619 |
"trial_params": null
|