Training in progress, step 1000, checkpoint
Browse files
last-checkpoint/adapter_model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 332316480
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:60f39bf0e2d04be586e3cdf61f38e4002d8f4ccf2b3e7506a28c3eebf56ae883
|
| 3 |
size 332316480
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 169158677
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d52cab9303197d3ac09cad8604a16b3cd57e8bcfc3339e579cfd996a7044c903
|
| 3 |
size 169158677
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14645
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:391d73924ccf821cbbf0fdb4254c0376997b82cf4a16ef088c45a6a109b91100
|
| 3 |
size 14645
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1465
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1f94ebdc28f5491fc51fc2ecbab5d9e2e3ba6be348d92d880d778a28fcd2cbce
|
| 3 |
size 1465
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0
|
| 6 |
"eval_steps": 50,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -10678,6 +10678,567 @@
|
|
| 10678 |
"memory/max_active (GiB)": 7.78,
|
| 10679 |
"memory/max_allocated (GiB)": 7.78,
|
| 10680 |
"step": 950
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10681 |
}
|
| 10682 |
],
|
| 10683 |
"logging_steps": 1,
|
|
@@ -10692,12 +11253,12 @@
|
|
| 10692 |
"should_evaluate": false,
|
| 10693 |
"should_log": false,
|
| 10694 |
"should_save": true,
|
| 10695 |
-
"should_training_stop":
|
| 10696 |
},
|
| 10697 |
"attributes": {}
|
| 10698 |
}
|
| 10699 |
},
|
| 10700 |
-
"total_flos":
|
| 10701 |
"train_batch_size": 1,
|
| 10702 |
"trial_name": null,
|
| 10703 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 1.0,
|
| 6 |
"eval_steps": 50,
|
| 7 |
+
"global_step": 1000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 10678 |
"memory/max_active (GiB)": 7.78,
|
| 10679 |
"memory/max_allocated (GiB)": 7.78,
|
| 10680 |
"step": 950
|
| 10681 |
+
},
|
| 10682 |
+
{
|
| 10683 |
+
"epoch": 0.951,
|
| 10684 |
+
"grad_norm": 0.8099629878997803,
|
| 10685 |
+
"learning_rate": 1.30832912661093e-06,
|
| 10686 |
+
"loss": 2.287,
|
| 10687 |
+
"memory/device_reserved (GiB)": 17.74,
|
| 10688 |
+
"memory/max_active (GiB)": 17.43,
|
| 10689 |
+
"memory/max_allocated (GiB)": 17.43,
|
| 10690 |
+
"step": 951,
|
| 10691 |
+
"tokens_per_second_per_gpu": 997.07
|
| 10692 |
+
},
|
| 10693 |
+
{
|
| 10694 |
+
"epoch": 0.952,
|
| 10695 |
+
"grad_norm": 0.910591185092926,
|
| 10696 |
+
"learning_rate": 1.2566280820298426e-06,
|
| 10697 |
+
"loss": 2.2643,
|
| 10698 |
+
"memory/device_reserved (GiB)": 17.78,
|
| 10699 |
+
"memory/max_active (GiB)": 17.43,
|
| 10700 |
+
"memory/max_allocated (GiB)": 17.43,
|
| 10701 |
+
"step": 952,
|
| 10702 |
+
"tokens_per_second_per_gpu": 821.07
|
| 10703 |
+
},
|
| 10704 |
+
{
|
| 10705 |
+
"epoch": 0.953,
|
| 10706 |
+
"grad_norm": 0.8110288381576538,
|
| 10707 |
+
"learning_rate": 1.2059628086956044e-06,
|
| 10708 |
+
"loss": 2.3573,
|
| 10709 |
+
"memory/device_reserved (GiB)": 17.78,
|
| 10710 |
+
"memory/max_active (GiB)": 17.43,
|
| 10711 |
+
"memory/max_allocated (GiB)": 17.43,
|
| 10712 |
+
"step": 953,
|
| 10713 |
+
"tokens_per_second_per_gpu": 1025.55
|
| 10714 |
+
},
|
| 10715 |
+
{
|
| 10716 |
+
"epoch": 0.954,
|
| 10717 |
+
"grad_norm": 0.8043859004974365,
|
| 10718 |
+
"learning_rate": 1.1563338380629618e-06,
|
| 10719 |
+
"loss": 2.5223,
|
| 10720 |
+
"memory/device_reserved (GiB)": 17.78,
|
| 10721 |
+
"memory/max_active (GiB)": 17.43,
|
| 10722 |
+
"memory/max_allocated (GiB)": 17.43,
|
| 10723 |
+
"step": 954,
|
| 10724 |
+
"tokens_per_second_per_gpu": 1124.93
|
| 10725 |
+
},
|
| 10726 |
+
{
|
| 10727 |
+
"epoch": 0.955,
|
| 10728 |
+
"grad_norm": 0.7169449329376221,
|
| 10729 |
+
"learning_rate": 1.1077416907163574e-06,
|
| 10730 |
+
"loss": 2.1511,
|
| 10731 |
+
"memory/device_reserved (GiB)": 17.79,
|
| 10732 |
+
"memory/max_active (GiB)": 17.43,
|
| 10733 |
+
"memory/max_allocated (GiB)": 17.43,
|
| 10734 |
+
"step": 955,
|
| 10735 |
+
"tokens_per_second_per_gpu": 1207.88
|
| 10736 |
+
},
|
| 10737 |
+
{
|
| 10738 |
+
"epoch": 0.956,
|
| 10739 |
+
"grad_norm": 0.9546728730201721,
|
| 10740 |
+
"learning_rate": 1.0601868763643996e-06,
|
| 10741 |
+
"loss": 2.3221,
|
| 10742 |
+
"memory/device_reserved (GiB)": 17.79,
|
| 10743 |
+
"memory/max_active (GiB)": 17.43,
|
| 10744 |
+
"memory/max_allocated (GiB)": 17.43,
|
| 10745 |
+
"step": 956,
|
| 10746 |
+
"tokens_per_second_per_gpu": 715.9
|
| 10747 |
+
},
|
| 10748 |
+
{
|
| 10749 |
+
"epoch": 0.957,
|
| 10750 |
+
"grad_norm": 0.9332824945449829,
|
| 10751 |
+
"learning_rate": 1.0136698938346011e-06,
|
| 10752 |
+
"loss": 2.4007,
|
| 10753 |
+
"memory/device_reserved (GiB)": 17.79,
|
| 10754 |
+
"memory/max_active (GiB)": 17.43,
|
| 10755 |
+
"memory/max_allocated (GiB)": 17.43,
|
| 10756 |
+
"step": 957,
|
| 10757 |
+
"tokens_per_second_per_gpu": 909.16
|
| 10758 |
+
},
|
| 10759 |
+
{
|
| 10760 |
+
"epoch": 0.958,
|
| 10761 |
+
"grad_norm": 0.948166012763977,
|
| 10762 |
+
"learning_rate": 9.68191231068083e-07,
|
| 10763 |
+
"loss": 2.2667,
|
| 10764 |
+
"memory/device_reserved (GiB)": 17.79,
|
| 10765 |
+
"memory/max_active (GiB)": 17.43,
|
| 10766 |
+
"memory/max_allocated (GiB)": 17.43,
|
| 10767 |
+
"step": 958,
|
| 10768 |
+
"tokens_per_second_per_gpu": 739.18
|
| 10769 |
+
},
|
| 10770 |
+
{
|
| 10771 |
+
"epoch": 0.959,
|
| 10772 |
+
"grad_norm": 0.7676699161529541,
|
| 10773 |
+
"learning_rate": 9.237513651145225e-07,
|
| 10774 |
+
"loss": 2.1496,
|
| 10775 |
+
"memory/device_reserved (GiB)": 17.79,
|
| 10776 |
+
"memory/max_active (GiB)": 17.43,
|
| 10777 |
+
"memory/max_allocated (GiB)": 17.43,
|
| 10778 |
+
"step": 959,
|
| 10779 |
+
"tokens_per_second_per_gpu": 1144.45
|
| 10780 |
+
},
|
| 10781 |
+
{
|
| 10782 |
+
"epoch": 0.96,
|
| 10783 |
+
"grad_norm": 0.7553421854972839,
|
| 10784 |
+
"learning_rate": 8.803507621270579e-07,
|
| 10785 |
+
"loss": 2.4495,
|
| 10786 |
+
"memory/device_reserved (GiB)": 17.79,
|
| 10787 |
+
"memory/max_active (GiB)": 17.43,
|
| 10788 |
+
"memory/max_allocated (GiB)": 17.43,
|
| 10789 |
+
"step": 960,
|
| 10790 |
+
"tokens_per_second_per_gpu": 1184.03
|
| 10791 |
+
},
|
| 10792 |
+
{
|
| 10793 |
+
"epoch": 0.961,
|
| 10794 |
+
"grad_norm": 0.8734245896339417,
|
| 10795 |
+
"learning_rate": 8.379898773574924e-07,
|
| 10796 |
+
"loss": 2.4696,
|
| 10797 |
+
"memory/device_reserved (GiB)": 17.79,
|
| 10798 |
+
"memory/max_active (GiB)": 17.43,
|
| 10799 |
+
"memory/max_allocated (GiB)": 17.43,
|
| 10800 |
+
"step": 961,
|
| 10801 |
+
"tokens_per_second_per_gpu": 985.05
|
| 10802 |
+
},
|
| 10803 |
+
{
|
| 10804 |
+
"epoch": 0.962,
|
| 10805 |
+
"grad_norm": 0.9553205370903015,
|
| 10806 |
+
"learning_rate": 7.966691551514527e-07,
|
| 10807 |
+
"loss": 2.5366,
|
| 10808 |
+
"memory/device_reserved (GiB)": 17.79,
|
| 10809 |
+
"memory/max_active (GiB)": 17.43,
|
| 10810 |
+
"memory/max_allocated (GiB)": 17.43,
|
| 10811 |
+
"step": 962,
|
| 10812 |
+
"tokens_per_second_per_gpu": 789.81
|
| 10813 |
+
},
|
| 10814 |
+
{
|
| 10815 |
+
"epoch": 0.963,
|
| 10816 |
+
"grad_norm": 1.0424203872680664,
|
| 10817 |
+
"learning_rate": 7.563890289437825e-07,
|
| 10818 |
+
"loss": 2.3128,
|
| 10819 |
+
"memory/device_reserved (GiB)": 17.79,
|
| 10820 |
+
"memory/max_active (GiB)": 17.43,
|
| 10821 |
+
"memory/max_allocated (GiB)": 17.43,
|
| 10822 |
+
"step": 963,
|
| 10823 |
+
"tokens_per_second_per_gpu": 642.35
|
| 10824 |
+
},
|
| 10825 |
+
{
|
| 10826 |
+
"epoch": 0.964,
|
| 10827 |
+
"grad_norm": 0.9237273931503296,
|
| 10828 |
+
"learning_rate": 7.171499212539123e-07,
|
| 10829 |
+
"loss": 2.3017,
|
| 10830 |
+
"memory/device_reserved (GiB)": 17.79,
|
| 10831 |
+
"memory/max_active (GiB)": 17.43,
|
| 10832 |
+
"memory/max_allocated (GiB)": 17.43,
|
| 10833 |
+
"step": 964,
|
| 10834 |
+
"tokens_per_second_per_gpu": 771.48
|
| 10835 |
+
},
|
| 10836 |
+
{
|
| 10837 |
+
"epoch": 0.965,
|
| 10838 |
+
"grad_norm": 0.821221649646759,
|
| 10839 |
+
"learning_rate": 6.78952243681541e-07,
|
| 10840 |
+
"loss": 2.3436,
|
| 10841 |
+
"memory/device_reserved (GiB)": 17.79,
|
| 10842 |
+
"memory/max_active (GiB)": 17.43,
|
| 10843 |
+
"memory/max_allocated (GiB)": 17.43,
|
| 10844 |
+
"step": 965,
|
| 10845 |
+
"tokens_per_second_per_gpu": 1111.51
|
| 10846 |
+
},
|
| 10847 |
+
{
|
| 10848 |
+
"epoch": 0.966,
|
| 10849 |
+
"grad_norm": 0.8634496331214905,
|
| 10850 |
+
"learning_rate": 6.41796396902239e-07,
|
| 10851 |
+
"loss": 2.0172,
|
| 10852 |
+
"memory/device_reserved (GiB)": 17.79,
|
| 10853 |
+
"memory/max_active (GiB)": 17.43,
|
| 10854 |
+
"memory/max_allocated (GiB)": 17.43,
|
| 10855 |
+
"step": 966,
|
| 10856 |
+
"tokens_per_second_per_gpu": 789.7
|
| 10857 |
+
},
|
| 10858 |
+
{
|
| 10859 |
+
"epoch": 0.967,
|
| 10860 |
+
"grad_norm": 0.8414502143859863,
|
| 10861 |
+
"learning_rate": 6.056827706632185e-07,
|
| 10862 |
+
"loss": 2.282,
|
| 10863 |
+
"memory/device_reserved (GiB)": 17.79,
|
| 10864 |
+
"memory/max_active (GiB)": 17.43,
|
| 10865 |
+
"memory/max_allocated (GiB)": 17.43,
|
| 10866 |
+
"step": 967,
|
| 10867 |
+
"tokens_per_second_per_gpu": 1027.47
|
| 10868 |
+
},
|
| 10869 |
+
{
|
| 10870 |
+
"epoch": 0.968,
|
| 10871 |
+
"grad_norm": 1.1495898962020874,
|
| 10872 |
+
"learning_rate": 5.706117437793701e-07,
|
| 10873 |
+
"loss": 2.3397,
|
| 10874 |
+
"memory/device_reserved (GiB)": 17.79,
|
| 10875 |
+
"memory/max_active (GiB)": 17.43,
|
| 10876 |
+
"memory/max_allocated (GiB)": 17.43,
|
| 10877 |
+
"step": 968,
|
| 10878 |
+
"tokens_per_second_per_gpu": 534.09
|
| 10879 |
+
},
|
| 10880 |
+
{
|
| 10881 |
+
"epoch": 0.969,
|
| 10882 |
+
"grad_norm": 0.8324930667877197,
|
| 10883 |
+
"learning_rate": 5.365836841291438e-07,
|
| 10884 |
+
"loss": 2.4827,
|
| 10885 |
+
"memory/device_reserved (GiB)": 17.79,
|
| 10886 |
+
"memory/max_active (GiB)": 17.43,
|
| 10887 |
+
"memory/max_allocated (GiB)": 17.43,
|
| 10888 |
+
"step": 969,
|
| 10889 |
+
"tokens_per_second_per_gpu": 1116.92
|
| 10890 |
+
},
|
| 10891 |
+
{
|
| 10892 |
+
"epoch": 0.97,
|
| 10893 |
+
"grad_norm": 1.0874335765838623,
|
| 10894 |
+
"learning_rate": 5.035989486508075e-07,
|
| 10895 |
+
"loss": 2.3351,
|
| 10896 |
+
"memory/device_reserved (GiB)": 17.79,
|
| 10897 |
+
"memory/max_active (GiB)": 17.43,
|
| 10898 |
+
"memory/max_allocated (GiB)": 17.43,
|
| 10899 |
+
"step": 970,
|
| 10900 |
+
"tokens_per_second_per_gpu": 579.44
|
| 10901 |
+
},
|
| 10902 |
+
{
|
| 10903 |
+
"epoch": 0.971,
|
| 10904 |
+
"grad_norm": 0.791401743888855,
|
| 10905 |
+
"learning_rate": 4.7165788333860536e-07,
|
| 10906 |
+
"loss": 2.2922,
|
| 10907 |
+
"memory/device_reserved (GiB)": 17.79,
|
| 10908 |
+
"memory/max_active (GiB)": 17.43,
|
| 10909 |
+
"memory/max_allocated (GiB)": 17.43,
|
| 10910 |
+
"step": 971,
|
| 10911 |
+
"tokens_per_second_per_gpu": 1109.94
|
| 10912 |
+
},
|
| 10913 |
+
{
|
| 10914 |
+
"epoch": 0.972,
|
| 10915 |
+
"grad_norm": 0.8731902241706848,
|
| 10916 |
+
"learning_rate": 4.4076082323920576e-07,
|
| 10917 |
+
"loss": 2.4829,
|
| 10918 |
+
"memory/device_reserved (GiB)": 17.79,
|
| 10919 |
+
"memory/max_active (GiB)": 17.43,
|
| 10920 |
+
"memory/max_allocated (GiB)": 17.43,
|
| 10921 |
+
"step": 972,
|
| 10922 |
+
"tokens_per_second_per_gpu": 877.97
|
| 10923 |
+
},
|
| 10924 |
+
{
|
| 10925 |
+
"epoch": 0.973,
|
| 10926 |
+
"grad_norm": 0.8560281991958618,
|
| 10927 |
+
"learning_rate": 4.1090809244814785e-07,
|
| 10928 |
+
"loss": 2.2317,
|
| 10929 |
+
"memory/device_reserved (GiB)": 17.79,
|
| 10930 |
+
"memory/max_active (GiB)": 17.43,
|
| 10931 |
+
"memory/max_allocated (GiB)": 17.43,
|
| 10932 |
+
"step": 973,
|
| 10933 |
+
"tokens_per_second_per_gpu": 932.27
|
| 10934 |
+
},
|
| 10935 |
+
{
|
| 10936 |
+
"epoch": 0.974,
|
| 10937 |
+
"grad_norm": 0.9274902939796448,
|
| 10938 |
+
"learning_rate": 3.82100004106456e-07,
|
| 10939 |
+
"loss": 2.5177,
|
| 10940 |
+
"memory/device_reserved (GiB)": 17.79,
|
| 10941 |
+
"memory/max_active (GiB)": 17.43,
|
| 10942 |
+
"memory/max_allocated (GiB)": 17.43,
|
| 10943 |
+
"step": 974,
|
| 10944 |
+
"tokens_per_second_per_gpu": 967.07
|
| 10945 |
+
},
|
| 10946 |
+
{
|
| 10947 |
+
"epoch": 0.975,
|
| 10948 |
+
"grad_norm": 0.9513389468193054,
|
| 10949 |
+
"learning_rate": 3.543368603973529e-07,
|
| 10950 |
+
"loss": 2.3584,
|
| 10951 |
+
"memory/device_reserved (GiB)": 17.79,
|
| 10952 |
+
"memory/max_active (GiB)": 17.43,
|
| 10953 |
+
"memory/max_allocated (GiB)": 17.43,
|
| 10954 |
+
"step": 975,
|
| 10955 |
+
"tokens_per_second_per_gpu": 776.96
|
| 10956 |
+
},
|
| 10957 |
+
{
|
| 10958 |
+
"epoch": 0.976,
|
| 10959 |
+
"grad_norm": 0.8030345439910889,
|
| 10960 |
+
"learning_rate": 3.2761895254306287e-07,
|
| 10961 |
+
"loss": 2.1989,
|
| 10962 |
+
"memory/device_reserved (GiB)": 17.79,
|
| 10963 |
+
"memory/max_active (GiB)": 17.43,
|
| 10964 |
+
"memory/max_allocated (GiB)": 17.43,
|
| 10965 |
+
"step": 976,
|
| 10966 |
+
"tokens_per_second_per_gpu": 1064.08
|
| 10967 |
+
},
|
| 10968 |
+
{
|
| 10969 |
+
"epoch": 0.977,
|
| 10970 |
+
"grad_norm": 0.8223397135734558,
|
| 10971 |
+
"learning_rate": 3.019465608018024e-07,
|
| 10972 |
+
"loss": 2.3525,
|
| 10973 |
+
"memory/device_reserved (GiB)": 17.79,
|
| 10974 |
+
"memory/max_active (GiB)": 17.43,
|
| 10975 |
+
"memory/max_allocated (GiB)": 17.43,
|
| 10976 |
+
"step": 977,
|
| 10977 |
+
"tokens_per_second_per_gpu": 1110.89
|
| 10978 |
+
},
|
| 10979 |
+
{
|
| 10980 |
+
"epoch": 0.978,
|
| 10981 |
+
"grad_norm": 1.0492770671844482,
|
| 10982 |
+
"learning_rate": 2.773199544648164e-07,
|
| 10983 |
+
"loss": 2.336,
|
| 10984 |
+
"memory/device_reserved (GiB)": 17.79,
|
| 10985 |
+
"memory/max_active (GiB)": 17.43,
|
| 10986 |
+
"memory/max_allocated (GiB)": 17.43,
|
| 10987 |
+
"step": 978,
|
| 10988 |
+
"tokens_per_second_per_gpu": 779.71
|
| 10989 |
+
},
|
| 10990 |
+
{
|
| 10991 |
+
"epoch": 0.979,
|
| 10992 |
+
"grad_norm": 0.897686779499054,
|
| 10993 |
+
"learning_rate": 2.537393918535358e-07,
|
| 10994 |
+
"loss": 2.357,
|
| 10995 |
+
"memory/device_reserved (GiB)": 17.79,
|
| 10996 |
+
"memory/max_active (GiB)": 17.43,
|
| 10997 |
+
"memory/max_allocated (GiB)": 17.43,
|
| 10998 |
+
"step": 979,
|
| 10999 |
+
"tokens_per_second_per_gpu": 980.21
|
| 11000 |
+
},
|
| 11001 |
+
{
|
| 11002 |
+
"epoch": 0.98,
|
| 11003 |
+
"grad_norm": 0.8448941707611084,
|
| 11004 |
+
"learning_rate": 2.312051203169352e-07,
|
| 11005 |
+
"loss": 2.176,
|
| 11006 |
+
"memory/device_reserved (GiB)": 17.79,
|
| 11007 |
+
"memory/max_active (GiB)": 17.43,
|
| 11008 |
+
"memory/max_allocated (GiB)": 17.43,
|
| 11009 |
+
"step": 980,
|
| 11010 |
+
"tokens_per_second_per_gpu": 1001.24
|
| 11011 |
+
},
|
| 11012 |
+
{
|
| 11013 |
+
"epoch": 0.981,
|
| 11014 |
+
"grad_norm": 0.9005848169326782,
|
| 11015 |
+
"learning_rate": 2.0971737622883515e-07,
|
| 11016 |
+
"loss": 2.2181,
|
| 11017 |
+
"memory/device_reserved (GiB)": 17.79,
|
| 11018 |
+
"memory/max_active (GiB)": 17.43,
|
| 11019 |
+
"memory/max_allocated (GiB)": 17.43,
|
| 11020 |
+
"step": 981,
|
| 11021 |
+
"tokens_per_second_per_gpu": 900.62
|
| 11022 |
+
},
|
| 11023 |
+
{
|
| 11024 |
+
"epoch": 0.982,
|
| 11025 |
+
"grad_norm": 0.8972439169883728,
|
| 11026 |
+
"learning_rate": 1.8927638498551502e-07,
|
| 11027 |
+
"loss": 2.2886,
|
| 11028 |
+
"memory/device_reserved (GiB)": 17.79,
|
| 11029 |
+
"memory/max_active (GiB)": 17.43,
|
| 11030 |
+
"memory/max_allocated (GiB)": 17.43,
|
| 11031 |
+
"step": 982,
|
| 11032 |
+
"tokens_per_second_per_gpu": 875.01
|
| 11033 |
+
},
|
| 11034 |
+
{
|
| 11035 |
+
"epoch": 0.983,
|
| 11036 |
+
"grad_norm": 0.8892665505409241,
|
| 11037 |
+
"learning_rate": 1.6988236100329292e-07,
|
| 11038 |
+
"loss": 2.2567,
|
| 11039 |
+
"memory/device_reserved (GiB)": 17.79,
|
| 11040 |
+
"memory/max_active (GiB)": 17.43,
|
| 11041 |
+
"memory/max_allocated (GiB)": 17.43,
|
| 11042 |
+
"step": 983,
|
| 11043 |
+
"tokens_per_second_per_gpu": 844.42
|
| 11044 |
+
},
|
| 11045 |
+
{
|
| 11046 |
+
"epoch": 0.984,
|
| 11047 |
+
"grad_norm": 0.915696918964386,
|
| 11048 |
+
"learning_rate": 1.5153550771630498e-07,
|
| 11049 |
+
"loss": 2.3351,
|
| 11050 |
+
"memory/device_reserved (GiB)": 17.79,
|
| 11051 |
+
"memory/max_active (GiB)": 17.43,
|
| 11052 |
+
"memory/max_allocated (GiB)": 17.43,
|
| 11053 |
+
"step": 984,
|
| 11054 |
+
"tokens_per_second_per_gpu": 815.64
|
| 11055 |
+
},
|
| 11056 |
+
{
|
| 11057 |
+
"epoch": 0.985,
|
| 11058 |
+
"grad_norm": 0.9981441497802734,
|
| 11059 |
+
"learning_rate": 1.3423601757436287e-07,
|
| 11060 |
+
"loss": 2.2343,
|
| 11061 |
+
"memory/device_reserved (GiB)": 17.79,
|
| 11062 |
+
"memory/max_active (GiB)": 17.43,
|
| 11063 |
+
"memory/max_allocated (GiB)": 17.43,
|
| 11064 |
+
"step": 985,
|
| 11065 |
+
"tokens_per_second_per_gpu": 702.43
|
| 11066 |
+
},
|
| 11067 |
+
{
|
| 11068 |
+
"epoch": 0.986,
|
| 11069 |
+
"grad_norm": 0.8215169906616211,
|
| 11070 |
+
"learning_rate": 1.179840720409331e-07,
|
| 11071 |
+
"loss": 2.192,
|
| 11072 |
+
"memory/device_reserved (GiB)": 17.79,
|
| 11073 |
+
"memory/max_active (GiB)": 17.43,
|
| 11074 |
+
"memory/max_allocated (GiB)": 17.43,
|
| 11075 |
+
"step": 986,
|
| 11076 |
+
"tokens_per_second_per_gpu": 1012.4
|
| 11077 |
+
},
|
| 11078 |
+
{
|
| 11079 |
+
"epoch": 0.987,
|
| 11080 |
+
"grad_norm": 1.0433471202850342,
|
| 11081 |
+
"learning_rate": 1.0277984159122733e-07,
|
| 11082 |
+
"loss": 2.3544,
|
| 11083 |
+
"memory/device_reserved (GiB)": 17.79,
|
| 11084 |
+
"memory/max_active (GiB)": 17.43,
|
| 11085 |
+
"memory/max_allocated (GiB)": 17.43,
|
| 11086 |
+
"step": 987,
|
| 11087 |
+
"tokens_per_second_per_gpu": 829.67
|
| 11088 |
+
},
|
| 11089 |
+
{
|
| 11090 |
+
"epoch": 0.988,
|
| 11091 |
+
"grad_norm": 0.8312088847160339,
|
| 11092 |
+
"learning_rate": 8.862348571043733e-08,
|
| 11093 |
+
"loss": 2.3737,
|
| 11094 |
+
"memory/device_reserved (GiB)": 17.79,
|
| 11095 |
+
"memory/max_active (GiB)": 17.43,
|
| 11096 |
+
"memory/max_allocated (GiB)": 17.43,
|
| 11097 |
+
"step": 988,
|
| 11098 |
+
"tokens_per_second_per_gpu": 1123.09
|
| 11099 |
+
},
|
| 11100 |
+
{
|
| 11101 |
+
"epoch": 0.989,
|
| 11102 |
+
"grad_norm": 1.0085126161575317,
|
| 11103 |
+
"learning_rate": 7.551515289203615e-08,
|
| 11104 |
+
"loss": 2.0985,
|
| 11105 |
+
"memory/device_reserved (GiB)": 17.79,
|
| 11106 |
+
"memory/max_active (GiB)": 17.43,
|
| 11107 |
+
"memory/max_allocated (GiB)": 17.43,
|
| 11108 |
+
"step": 989,
|
| 11109 |
+
"tokens_per_second_per_gpu": 642.57
|
| 11110 |
+
},
|
| 11111 |
+
{
|
| 11112 |
+
"epoch": 0.99,
|
| 11113 |
+
"grad_norm": 0.9324679970741272,
|
| 11114 |
+
"learning_rate": 6.34549806362239e-08,
|
| 11115 |
+
"loss": 2.5521,
|
| 11116 |
+
"memory/device_reserved (GiB)": 17.79,
|
| 11117 |
+
"memory/max_active (GiB)": 17.43,
|
| 11118 |
+
"memory/max_allocated (GiB)": 17.43,
|
| 11119 |
+
"step": 990,
|
| 11120 |
+
"tokens_per_second_per_gpu": 979.2
|
| 11121 |
+
},
|
| 11122 |
+
{
|
| 11123 |
+
"epoch": 0.991,
|
| 11124 |
+
"grad_norm": 0.8679972290992737,
|
| 11125 |
+
"learning_rate": 5.2443095448506674e-08,
|
| 11126 |
+
"loss": 2.2688,
|
| 11127 |
+
"memory/device_reserved (GiB)": 17.79,
|
| 11128 |
+
"memory/max_active (GiB)": 17.43,
|
| 11129 |
+
"memory/max_allocated (GiB)": 17.43,
|
| 11130 |
+
"step": 991,
|
| 11131 |
+
"tokens_per_second_per_gpu": 857.33
|
| 11132 |
+
},
|
| 11133 |
+
{
|
| 11134 |
+
"epoch": 0.992,
|
| 11135 |
+
"grad_norm": 0.8510658740997314,
|
| 11136 |
+
"learning_rate": 4.247961283835311e-08,
|
| 11137 |
+
"loss": 2.2254,
|
| 11138 |
+
"memory/device_reserved (GiB)": 17.79,
|
| 11139 |
+
"memory/max_active (GiB)": 17.43,
|
| 11140 |
+
"memory/max_allocated (GiB)": 17.43,
|
| 11141 |
+
"step": 992,
|
| 11142 |
+
"tokens_per_second_per_gpu": 952.28
|
| 11143 |
+
},
|
| 11144 |
+
{
|
| 11145 |
+
"epoch": 0.993,
|
| 11146 |
+
"grad_norm": 0.8851034641265869,
|
| 11147 |
+
"learning_rate": 3.356463731798432e-08,
|
| 11148 |
+
"loss": 2.3777,
|
| 11149 |
+
"memory/device_reserved (GiB)": 17.79,
|
| 11150 |
+
"memory/max_active (GiB)": 17.43,
|
| 11151 |
+
"memory/max_allocated (GiB)": 17.43,
|
| 11152 |
+
"step": 993,
|
| 11153 |
+
"tokens_per_second_per_gpu": 930.35
|
| 11154 |
+
},
|
| 11155 |
+
{
|
| 11156 |
+
"epoch": 0.994,
|
| 11157 |
+
"grad_norm": 0.847767174243927,
|
| 11158 |
+
"learning_rate": 2.5698262401263605e-08,
|
| 11159 |
+
"loss": 2.501,
|
| 11160 |
+
"memory/device_reserved (GiB)": 17.79,
|
| 11161 |
+
"memory/max_active (GiB)": 17.43,
|
| 11162 |
+
"memory/max_allocated (GiB)": 17.43,
|
| 11163 |
+
"step": 994,
|
| 11164 |
+
"tokens_per_second_per_gpu": 1058.39
|
| 11165 |
+
},
|
| 11166 |
+
{
|
| 11167 |
+
"epoch": 0.995,
|
| 11168 |
+
"grad_norm": 0.8249082565307617,
|
| 11169 |
+
"learning_rate": 1.888057060274173e-08,
|
| 11170 |
+
"loss": 2.0256,
|
| 11171 |
+
"memory/device_reserved (GiB)": 17.79,
|
| 11172 |
+
"memory/max_active (GiB)": 17.43,
|
| 11173 |
+
"memory/max_allocated (GiB)": 17.43,
|
| 11174 |
+
"step": 995,
|
| 11175 |
+
"tokens_per_second_per_gpu": 914.1
|
| 11176 |
+
},
|
| 11177 |
+
{
|
| 11178 |
+
"epoch": 0.996,
|
| 11179 |
+
"grad_norm": 1.0485869646072388,
|
| 11180 |
+
"learning_rate": 1.3111633436779791e-08,
|
| 11181 |
+
"loss": 2.3413,
|
| 11182 |
+
"memory/device_reserved (GiB)": 17.79,
|
| 11183 |
+
"memory/max_active (GiB)": 17.43,
|
| 11184 |
+
"memory/max_allocated (GiB)": 17.43,
|
| 11185 |
+
"step": 996,
|
| 11186 |
+
"tokens_per_second_per_gpu": 546.19
|
| 11187 |
+
},
|
| 11188 |
+
{
|
| 11189 |
+
"epoch": 0.997,
|
| 11190 |
+
"grad_norm": 0.923591136932373,
|
| 11191 |
+
"learning_rate": 8.391511416816489e-09,
|
| 11192 |
+
"loss": 2.4325,
|
| 11193 |
+
"memory/device_reserved (GiB)": 17.79,
|
| 11194 |
+
"memory/max_active (GiB)": 17.43,
|
| 11195 |
+
"memory/max_allocated (GiB)": 17.43,
|
| 11196 |
+
"step": 997,
|
| 11197 |
+
"tokens_per_second_per_gpu": 844.28
|
| 11198 |
+
},
|
| 11199 |
+
{
|
| 11200 |
+
"epoch": 0.998,
|
| 11201 |
+
"grad_norm": 1.0989410877227783,
|
| 11202 |
+
"learning_rate": 4.720254054679796e-09,
|
| 11203 |
+
"loss": 2.2315,
|
| 11204 |
+
"memory/device_reserved (GiB)": 17.79,
|
| 11205 |
+
"memory/max_active (GiB)": 17.43,
|
| 11206 |
+
"memory/max_allocated (GiB)": 17.43,
|
| 11207 |
+
"step": 998,
|
| 11208 |
+
"tokens_per_second_per_gpu": 518.76
|
| 11209 |
+
},
|
| 11210 |
+
{
|
| 11211 |
+
"epoch": 0.999,
|
| 11212 |
+
"grad_norm": 1.190458059310913,
|
| 11213 |
+
"learning_rate": 2.0978998601206556e-09,
|
| 11214 |
+
"loss": 2.5925,
|
| 11215 |
+
"memory/device_reserved (GiB)": 17.79,
|
| 11216 |
+
"memory/max_active (GiB)": 17.43,
|
| 11217 |
+
"memory/max_allocated (GiB)": 17.43,
|
| 11218 |
+
"step": 999,
|
| 11219 |
+
"tokens_per_second_per_gpu": 618.02
|
| 11220 |
+
},
|
| 11221 |
+
{
|
| 11222 |
+
"epoch": 1.0,
|
| 11223 |
+
"grad_norm": 0.9831822514533997,
|
| 11224 |
+
"learning_rate": 5.244763404133046e-10,
|
| 11225 |
+
"loss": 2.2586,
|
| 11226 |
+
"memory/device_reserved (GiB)": 17.79,
|
| 11227 |
+
"memory/max_active (GiB)": 17.43,
|
| 11228 |
+
"memory/max_allocated (GiB)": 17.43,
|
| 11229 |
+
"step": 1000,
|
| 11230 |
+
"tokens_per_second_per_gpu": 740.06
|
| 11231 |
+
},
|
| 11232 |
+
{
|
| 11233 |
+
"epoch": 1.0,
|
| 11234 |
+
"eval_loss": 2.245497226715088,
|
| 11235 |
+
"eval_runtime": 67.8857,
|
| 11236 |
+
"eval_samples_per_second": 2.887,
|
| 11237 |
+
"eval_steps_per_second": 1.444,
|
| 11238 |
+
"memory/device_reserved (GiB)": 17.79,
|
| 11239 |
+
"memory/max_active (GiB)": 7.78,
|
| 11240 |
+
"memory/max_allocated (GiB)": 7.78,
|
| 11241 |
+
"step": 1000
|
| 11242 |
}
|
| 11243 |
],
|
| 11244 |
"logging_steps": 1,
|
|
|
|
| 11253 |
"should_evaluate": false,
|
| 11254 |
"should_log": false,
|
| 11255 |
"should_save": true,
|
| 11256 |
+
"should_training_stop": true
|
| 11257 |
},
|
| 11258 |
"attributes": {}
|
| 11259 |
}
|
| 11260 |
},
|
| 11261 |
+
"total_flos": 2.0713221703021363e+17,
|
| 11262 |
"train_batch_size": 1,
|
| 11263 |
"trial_name": null,
|
| 11264 |
"trial_params": null
|