Training in progress, step 10000, checkpoint
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:21e9c11e02543045a52d1d10e85b29deee320e577ed8c40299be1aac88002bab
 size 2384234968
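The weights are tracked with Git LFS, so the repository diff only shows the three-line pointer file: the spec version, the sha256 oid of the blob, and its size in bytes (here 2384234968, roughly 2.4 GB). A minimal sketch of checking a downloaded blob against such a pointer (file paths are illustrative, and the helper names are ours, not part of any library):

import hashlib
import os

def read_lfs_pointer(path):
    """Parse a Git LFS pointer file into a dict of its key/value fields."""
    fields = {}
    with open(path) as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    return fields

def verify_lfs_object(pointer_path, object_path):
    """Check a downloaded file against the pointer's sha256 oid and size."""
    fields = read_lfs_pointer(pointer_path)
    expected_oid = fields["oid"].split(":", 1)[1]
    expected_size = int(fields["size"])
    sha = hashlib.sha256()
    with open(object_path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            sha.update(chunk)
    return (sha.hexdigest() == expected_oid
            and os.path.getsize(object_path) == expected_size)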
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:4c87bdbbf96a91780aaf4a58c008036f2bfda78e91f3d428d63005f735fe1e0c
 size 4768663315
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:33e6b43d263edc3fb19dbc74c4a7ae9df523ccc7c2602c8a0c606ae6abf92007
 size 1465
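Together with trainer_state.json below, these files form a complete, resumable Trainer checkpoint: model.safetensors holds the weights, optimizer.pt and scheduler.pt hold the optimizer and learning-rate-scheduler state dicts, and trainer_state.json holds the step counter and log history. In practice transformers restores all of them via trainer.train(resume_from_checkpoint="last-checkpoint"); as a minimal inspection sketch (assuming the checkpoint directory has been downloaded locally):

import json

import torch
from safetensors.torch import load_file

ckpt_dir = "last-checkpoint"

# Trainer bookkeeping: step counter, epoch, and the full training log.
with open(f"{ckpt_dir}/trainer_state.json") as f:
    state = json.load(f)
print(state["global_step"], state["epoch"])  # 10000 2.8320883633660214

# Model weights (the LFS-tracked blob) plus optimizer / scheduler state dicts.
weights = load_file(f"{ckpt_dir}/model.safetensors")
optimizer_state = torch.load(f"{ckpt_dir}/optimizer.pt", map_location="cpu")
scheduler_state = torch.load(f"{ckpt_dir}/scheduler.pt", map_location="cpu")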
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 2.
+  "epoch": 2.8320883633660214,
   "eval_steps": 100,
-  "global_step":
+  "global_step": 10000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -8558,6 +8558,456 @@
       "mean_token_accuracy": 0.7248899202793837,
       "num_tokens": 77821952.0,
       "step": 9500
+    },
+    {
+      "epoch": 2.693312564166106,
+      "grad_norm": 1.6963791847229004,
+      "learning_rate": 1.137102695898458e-06,
+      "loss": 0.1448,
+      "mean_token_accuracy": 0.7517612528055906,
+      "num_tokens": 77903872.0,
+      "step": 9510
+    },
+    {
+      "epoch": 2.6961447233334512,
+      "grad_norm": 1.5691133737564087,
+      "learning_rate": 1.1266128186300221e-06,
+      "loss": 0.1062,
+      "mean_token_accuracy": 0.7805895309895277,
+      "num_tokens": 77985792.0,
+      "step": 9520
+    },
+    {
+      "epoch": 2.6989768825007965,
+      "grad_norm": 1.3455393314361572,
+      "learning_rate": 1.1161229413615862e-06,
+      "loss": 0.1181,
+      "mean_token_accuracy": 0.7727250501513481,
+      "num_tokens": 78067712.0,
+      "step": 9530
+    },
+    {
+      "epoch": 2.7018090416681417,
+      "grad_norm": 1.7499293088912964,
+      "learning_rate": 1.1056330640931503e-06,
+      "loss": 0.1621,
+      "mean_token_accuracy": 0.7321550864726305,
+      "num_tokens": 78149632.0,
+      "step": 9540
+    },
+    {
+      "epoch": 2.704641200835487,
+      "grad_norm": 1.078167200088501,
+      "learning_rate": 1.0951431868247141e-06,
+      "loss": 0.1142,
+      "mean_token_accuracy": 0.7804916825145483,
+      "num_tokens": 78231552.0,
+      "step": 9550
+    },
+    {
+      "epoch": 2.7074733600028322,
+      "grad_norm": 1.411314845085144,
+      "learning_rate": 1.0846533095562784e-06,
+      "loss": 0.1143,
+      "mean_token_accuracy": 0.7715141884982586,
+      "num_tokens": 78313472.0,
+      "step": 9560
+    },
+    {
+      "epoch": 2.7103055191701775,
+      "grad_norm": 1.734834909439087,
+      "learning_rate": 1.0741634322878423e-06,
+      "loss": 0.1425,
+      "mean_token_accuracy": 0.7414505925029516,
+      "num_tokens": 78395392.0,
+      "step": 9570
+    },
+    {
+      "epoch": 2.7131376783375227,
+      "grad_norm": 1.7494261264801025,
+      "learning_rate": 1.0636735550194063e-06,
+      "loss": 0.1213,
+      "mean_token_accuracy": 0.759222112223506,
+      "num_tokens": 78477312.0,
+      "step": 9580
+    },
+    {
+      "epoch": 2.7159698375048675,
+      "grad_norm": 1.2814098596572876,
+      "learning_rate": 1.0531836777509704e-06,
+      "loss": 0.1335,
+      "mean_token_accuracy": 0.7782045040279627,
+      "num_tokens": 78559232.0,
+      "step": 9590
+    },
+    {
+      "epoch": 2.7188019966722132,
+      "grad_norm": 1.2416023015975952,
+      "learning_rate": 1.0426938004825345e-06,
+      "loss": 0.131,
+      "mean_token_accuracy": 0.7627446163445711,
+      "num_tokens": 78641152.0,
+      "step": 9600
+    },
+    {
+      "epoch": 2.721634155839558,
+      "grad_norm": 1.2916755676269531,
+      "learning_rate": 1.0322039232140984e-06,
+      "loss": 0.1292,
+      "mean_token_accuracy": 0.7665728945285082,
+      "num_tokens": 78723072.0,
+      "step": 9610
+    },
+    {
+      "epoch": 2.7244663150069033,
+      "grad_norm": 0.9685536026954651,
+      "learning_rate": 1.0217140459456624e-06,
+      "loss": 0.0966,
+      "mean_token_accuracy": 0.7933586113154888,
+      "num_tokens": 78804992.0,
+      "step": 9620
+    },
+    {
+      "epoch": 2.7272984741742485,
+      "grad_norm": 1.0701133012771606,
+      "learning_rate": 1.0112241686772265e-06,
+      "loss": 0.0929,
+      "mean_token_accuracy": 0.768480920419097,
+      "num_tokens": 78886912.0,
+      "step": 9630
+    },
+    {
+      "epoch": 2.730130633341594,
+      "grad_norm": 1.155450701713562,
+      "learning_rate": 1.0007342914087906e-06,
+      "loss": 0.1217,
+      "mean_token_accuracy": 0.7712084148079157,
+      "num_tokens": 78968832.0,
+      "step": 9640
+    },
+    {
+      "epoch": 2.732962792508939,
+      "grad_norm": 1.2108891010284424,
+      "learning_rate": 9.902444141403547e-07,
+      "loss": 0.1269,
+      "mean_token_accuracy": 0.759784734621644,
+      "num_tokens": 79050752.0,
+      "step": 9650
+    },
+    {
+      "epoch": 2.7357949516762843,
+      "grad_norm": 1.3404109477996826,
+      "learning_rate": 9.797545368719187e-07,
+      "loss": 0.115,
+      "mean_token_accuracy": 0.7742294497787953,
+      "num_tokens": 79132672.0,
+      "step": 9660
+    },
+    {
+      "epoch": 2.7386271108436295,
+      "grad_norm": 0.9352473616600037,
+      "learning_rate": 9.692646596034828e-07,
+      "loss": 0.1153,
+      "mean_token_accuracy": 0.7558341480791568,
+      "num_tokens": 79214592.0,
+      "step": 9670
+    },
+    {
+      "epoch": 2.741459270010975,
+      "grad_norm": 1.2585588693618774,
+      "learning_rate": 9.587747823350467e-07,
+      "loss": 0.1447,
+      "mean_token_accuracy": 0.7387475546449422,
+      "num_tokens": 79296512.0,
+      "step": 9680
+    },
+    {
+      "epoch": 2.74429142917832,
+      "grad_norm": 1.4785575866699219,
+      "learning_rate": 9.482849050666109e-07,
+      "loss": 0.1194,
+      "mean_token_accuracy": 0.7637353233993054,
+      "num_tokens": 79378432.0,
+      "step": 9690
+    },
+    {
+      "epoch": 2.747123588345665,
+      "grad_norm": 0.9869931936264038,
+      "learning_rate": 9.377950277981748e-07,
+      "loss": 0.1237,
+      "mean_token_accuracy": 0.7830968666821718,
+      "num_tokens": 79460352.0,
+      "step": 9700
+    },
+    {
+      "epoch": 2.74995574751301,
+      "grad_norm": 1.2523363828659058,
+      "learning_rate": 9.273051505297388e-07,
+      "loss": 0.1295,
+      "mean_token_accuracy": 0.7593199610710144,
+      "num_tokens": 79542272.0,
+      "step": 9710
+    },
+    {
+      "epoch": 2.7527879066803553,
+      "grad_norm": 1.2600061893463135,
+      "learning_rate": 9.16815273261303e-07,
+      "loss": 0.1209,
+      "mean_token_accuracy": 0.7813111554831267,
+      "num_tokens": 79624192.0,
+      "step": 9720
+    },
+    {
+      "epoch": 2.7556200658477006,
+      "grad_norm": 0.9577277898788452,
+      "learning_rate": 9.063253959928669e-07,
+      "loss": 0.1156,
+      "mean_token_accuracy": 0.7740337550640106,
+      "num_tokens": 79706112.0,
+      "step": 9730
+    },
+    {
+      "epoch": 2.758452225015046,
+      "grad_norm": 1.1340205669403076,
+      "learning_rate": 8.958355187244309e-07,
+      "loss": 0.1038,
+      "mean_token_accuracy": 0.7865215256810189,
+      "num_tokens": 79788032.0,
+      "step": 9740
+    },
+    {
+      "epoch": 2.761284384182391,
+      "grad_norm": 1.5387784242630005,
+      "learning_rate": 8.853456414559951e-07,
+      "loss": 0.1328,
+      "mean_token_accuracy": 0.7656678043305873,
+      "num_tokens": 79869952.0,
+      "step": 9750
+    },
+    {
+      "epoch": 2.7641165433497363,
+      "grad_norm": 1.7430437803268433,
+      "learning_rate": 8.748557641875591e-07,
+      "loss": 0.095,
+      "mean_token_accuracy": 0.7949119359254837,
+      "num_tokens": 79951872.0,
+      "step": 9760
+    },
+    {
+      "epoch": 2.7669487025170816,
+      "grad_norm": 1.7460997104644775,
+      "learning_rate": 8.64365886919123e-07,
+      "loss": 0.1196,
+      "mean_token_accuracy": 0.775464779511094,
+      "num_tokens": 80033792.0,
+      "step": 9770
+    },
+    {
+      "epoch": 2.769780861684427,
+      "grad_norm": 1.1114528179168701,
+      "learning_rate": 8.538760096506872e-07,
+      "loss": 0.1293,
+      "mean_token_accuracy": 0.7551736798137426,
+      "num_tokens": 80115712.0,
+      "step": 9780
+    },
+    {
+      "epoch": 2.7726130208517716,
+      "grad_norm": 1.3568215370178223,
+      "learning_rate": 8.433861323822512e-07,
+      "loss": 0.0965,
+      "mean_token_accuracy": 0.7976272024214268,
+      "num_tokens": 80197632.0,
+      "step": 9790
+    },
+    {
+      "epoch": 2.7754451800191173,
+      "grad_norm": 1.039504885673523,
+      "learning_rate": 8.328962551138151e-07,
+      "loss": 0.1181,
+      "mean_token_accuracy": 0.7570694729685783,
+      "num_tokens": 80279552.0,
+      "step": 9800
+    },
+    {
+      "epoch": 2.778277339186462,
+      "grad_norm": 0.9073276519775391,
+      "learning_rate": 8.224063778453793e-07,
+      "loss": 0.1327,
+      "mean_token_accuracy": 0.7564946163445712,
+      "num_tokens": 80361472.0,
+      "step": 9810
+    },
+    {
+      "epoch": 2.7811094983538074,
+      "grad_norm": 2.061521291732788,
+      "learning_rate": 8.119165005769433e-07,
+      "loss": 0.1195,
+      "mean_token_accuracy": 0.7596379648894072,
+      "num_tokens": 80443392.0,
+      "step": 9820
+    },
+    {
+      "epoch": 2.7839416575211526,
+      "grad_norm": 1.21349036693573,
+      "learning_rate": 8.014266233085073e-07,
+      "loss": 0.126,
+      "mean_token_accuracy": 0.7478473570197821,
+      "num_tokens": 80525312.0,
+      "step": 9830
+    },
+    {
+      "epoch": 2.786773816688498,
+      "grad_norm": 1.4586316347122192,
+      "learning_rate": 7.909367460400715e-07,
+      "loss": 0.1223,
+      "mean_token_accuracy": 0.770731408149004,
+      "num_tokens": 80607232.0,
+      "step": 9840
+    },
+    {
+      "epoch": 2.789605975855843,
+      "grad_norm": 1.3496206998825073,
+      "learning_rate": 7.804468687716354e-07,
+      "loss": 0.1015,
+      "mean_token_accuracy": 0.7758072383701802,
+      "num_tokens": 80689152.0,
+      "step": 9850
+    },
+    {
+      "epoch": 2.7924381350231884,
+      "grad_norm": 1.2071694135665894,
+      "learning_rate": 7.699569915031994e-07,
+      "loss": 0.1146,
+      "mean_token_accuracy": 0.774987768009305,
+      "num_tokens": 80771072.0,
+      "step": 9860
+    },
+    {
+      "epoch": 2.7952702941905336,
+      "grad_norm": 1.2012773752212524,
+      "learning_rate": 7.594671142347636e-07,
+      "loss": 0.1262,
+      "mean_token_accuracy": 0.7779476504772902,
+      "num_tokens": 80852992.0,
+      "step": 9870
+    },
+    {
+      "epoch": 2.7981024533578784,
+      "grad_norm": 1.2166376113891602,
+      "learning_rate": 7.489772369663275e-07,
+      "loss": 0.1141,
+      "mean_token_accuracy": 0.779562134295702,
+      "num_tokens": 80934912.0,
+      "step": 9880
+    },
+    {
+      "epoch": 2.800934612525224,
+      "grad_norm": 1.269511103630066,
+      "learning_rate": 7.384873596978916e-07,
+      "loss": 0.1247,
+      "mean_token_accuracy": 0.7734099797904491,
+      "num_tokens": 81016832.0,
+      "step": 9890
+    },
+    {
+      "epoch": 2.803766771692569,
+      "grad_norm": 1.0128493309020996,
+      "learning_rate": 7.279974824294557e-07,
+      "loss": 0.1261,
+      "mean_token_accuracy": 0.7762353252619505,
+      "num_tokens": 81098752.0,
+      "step": 9900
+    },
+    {
+      "epoch": 2.806598930859914,
+      "grad_norm": 1.538405179977417,
+      "learning_rate": 7.175076051610197e-07,
+      "loss": 0.1291,
+      "mean_token_accuracy": 0.7826198644936084,
+      "num_tokens": 81180672.0,
+      "step": 9910
+    },
+    {
+      "epoch": 2.8094310900272594,
+      "grad_norm": 1.5747365951538086,
+      "learning_rate": 7.070177278925837e-07,
+      "loss": 0.1306,
+      "mean_token_accuracy": 0.776382091268897,
+      "num_tokens": 81262592.0,
+      "step": 9920
+    },
+    {
+      "epoch": 2.8122632491946047,
+      "grad_norm": 1.071977972984314,
+      "learning_rate": 6.965278506241478e-07,
+      "loss": 0.1108,
+      "mean_token_accuracy": 0.7729818969964981,
+      "num_tokens": 81344512.0,
+      "step": 9930
+    },
+    {
+      "epoch": 2.81509540836195,
+      "grad_norm": 1.172013282775879,
+      "learning_rate": 6.860379733557118e-07,
+      "loss": 0.1414,
+      "mean_token_accuracy": 0.7524706482887268,
+      "num_tokens": 81426432.0,
+      "step": 9940
+    },
+    {
+      "epoch": 2.817927567529295,
+      "grad_norm": 1.3133201599121094,
+      "learning_rate": 6.755480960872759e-07,
+      "loss": 0.1192,
+      "mean_token_accuracy": 0.7713796466588974,
+      "num_tokens": 81508352.0,
+      "step": 9950
+    },
+    {
+      "epoch": 2.8207597266966404,
+      "grad_norm": 1.6226385831832886,
+      "learning_rate": 6.650582188188398e-07,
+      "loss": 0.1244,
+      "mean_token_accuracy": 0.7702299427241087,
+      "num_tokens": 81590272.0,
+      "step": 9960
+    },
+    {
+      "epoch": 2.8235918858639857,
+      "grad_norm": 1.5946696996688843,
+      "learning_rate": 6.545683415504039e-07,
+      "loss": 0.1186,
+      "mean_token_accuracy": 0.7681873787194491,
+      "num_tokens": 81672192.0,
+      "step": 9970
+    },
+    {
+      "epoch": 2.826424045031331,
+      "grad_norm": 1.3367503881454468,
+      "learning_rate": 6.44078464281968e-07,
+      "loss": 0.129,
+      "mean_token_accuracy": 0.7627201572060585,
+      "num_tokens": 81754112.0,
+      "step": 9980
+    },
+    {
+      "epoch": 2.8292562041986757,
+      "grad_norm": 1.6041656732559204,
+      "learning_rate": 6.335885870135319e-07,
+      "loss": 0.0955,
+      "mean_token_accuracy": 0.7912304297089576,
+      "num_tokens": 81836032.0,
+      "step": 9990
+    },
+    {
+      "epoch": 2.8320883633660214,
+      "grad_norm": 1.589345097541809,
+      "learning_rate": 6.23098709745096e-07,
+      "loss": 0.1391,
+      "mean_token_accuracy": 0.7658023487776517,
+      "num_tokens": 81917952.0,
+      "step": 10000
     }
   ],
   "logging_steps": 10,
@@ -8577,7 +9027,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 2.
+  "total_flos": 2.1649315150902067e+17,
   "train_batch_size": 2,
   "trial_name": null,
   "trial_params": null
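The numbers in these log entries are internally consistent and can be sanity-checked straight from the JSON. A minimal sketch (assumes the file is local and laid out as Trainer writes it, with the entries under "log_history"):

import json

with open("last-checkpoint/trainer_state.json") as f:
    state = json.load(f)

a, b = state["log_history"][-2], state["log_history"][-1]  # steps 9990 and 10000
d_step = b["step"] - a["step"]                             # 10
d_epoch = b["epoch"] - a["epoch"]                          # ~0.0028322

print("steps per epoch ~", d_step / d_epoch)                               # ~3531
print("tokens per step ~", (b["num_tokens"] - a["num_tokens"]) / d_step)   # 8192.0

At ~3531 optimizer steps per epoch, step 10000 lands at epoch ≈ 2.832, matching the updated header, and the constant 8192 tokens per step fits fixed-length packed batches (with "train_batch_size": 2 per device, the remaining factor presumably comes from sequence length and any gradient accumulation or data parallelism). The learning-rate column also falls linearly, by about 1.05e-09 per step, which extrapolates to zero near step 10,594, suggesting a linear decay schedule ending at roughly three full epochs (3 × ~3531 ≈ 10,593).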