Training in progress, step 18000, checkpoint
Browse files- last-checkpoint/optimizer.pt +1 -1
- last-checkpoint/pytorch_model.bin +1 -1
- last-checkpoint/rng_state_0.pth +1 -1
- last-checkpoint/rng_state_1.pth +1 -1
- last-checkpoint/rng_state_2.pth +1 -1
- last-checkpoint/rng_state_3.pth +1 -1
- last-checkpoint/scheduler.pt +1 -1
- last-checkpoint/trainer_state.json +1403 -3
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 487156538
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:927ded2136161debbd96279849965288bf7431b685070e843239f647821d61f6
|
| 3 |
size 487156538
|
last-checkpoint/pytorch_model.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1059459406
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5d903e8e590da474361b4050df7262a9b6e838c971d4765f118a1f9c5c121e79
|
| 3 |
size 1059459406
|
last-checkpoint/rng_state_0.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14960
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9f63df0717d7d403241658b9d1ef68a022304bfea4ed08ee6b9ae2a0e774deb9
|
| 3 |
size 14960
|
last-checkpoint/rng_state_1.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14960
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3124bc54b5cf99f9af69a72fc0ea506085633799107751230612ddfb52753447
|
| 3 |
size 14960
|
last-checkpoint/rng_state_2.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14960
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3754b63410489156d62387b5a35a069ce6d7e085a76b453cde9b242ce0aa0610
|
| 3 |
size 14960
|
last-checkpoint/rng_state_3.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14960
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:93cbb7d637d48f06226dfd537b7544f80d61c3f2631cc41c955ddf30bc5b0a70
|
| 3 |
size 14960
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:add33ce1c647f1ad24436fdd2c7095ade5081fad618777000690c7e187278b49
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -11208,6 +11208,1406 @@
|
|
| 11208 |
"learning_rate": 0.0004949612511467957,
|
| 11209 |
"loss": 16.6007,
|
| 11210 |
"step": 16000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11211 |
}
|
| 11212 |
],
|
| 11213 |
"logging_steps": 10,
|
|
@@ -11227,7 +12627,7 @@
|
|
| 11227 |
"attributes": {}
|
| 11228 |
}
|
| 11229 |
},
|
| 11230 |
-
"total_flos":
|
| 11231 |
"train_batch_size": 48,
|
| 11232 |
"trial_name": null,
|
| 11233 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.035099521769015894,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 18000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 11208 |
"learning_rate": 0.0004949612511467957,
|
| 11209 |
"loss": 16.6007,
|
| 11210 |
"step": 16000
|
| 11211 |
+
},
|
| 11212 |
+
{
|
| 11213 |
+
"epoch": 0.03121907464010803,
|
| 11214 |
+
"grad_norm": 7.5,
|
| 11215 |
+
"learning_rate": 0.000494958000131341,
|
| 11216 |
+
"loss": 16.6753,
|
| 11217 |
+
"step": 16010
|
| 11218 |
+
},
|
| 11219 |
+
{
|
| 11220 |
+
"epoch": 0.03123857437442415,
|
| 11221 |
+
"grad_norm": 6.65625,
|
| 11222 |
+
"learning_rate": 0.0004949547491158863,
|
| 11223 |
+
"loss": 16.6359,
|
| 11224 |
+
"step": 16020
|
| 11225 |
+
},
|
| 11226 |
+
{
|
| 11227 |
+
"epoch": 0.03125807410874027,
|
| 11228 |
+
"grad_norm": 7.09375,
|
| 11229 |
+
"learning_rate": 0.0004949514981004317,
|
| 11230 |
+
"loss": 16.5558,
|
| 11231 |
+
"step": 16030
|
| 11232 |
+
},
|
| 11233 |
+
{
|
| 11234 |
+
"epoch": 0.03127757384305639,
|
| 11235 |
+
"grad_norm": 7.34375,
|
| 11236 |
+
"learning_rate": 0.000494948247084977,
|
| 11237 |
+
"loss": 16.643,
|
| 11238 |
+
"step": 16040
|
| 11239 |
+
},
|
| 11240 |
+
{
|
| 11241 |
+
"epoch": 0.031297073577372506,
|
| 11242 |
+
"grad_norm": 7.21875,
|
| 11243 |
+
"learning_rate": 0.0004949449960695223,
|
| 11244 |
+
"loss": 16.7139,
|
| 11245 |
+
"step": 16050
|
| 11246 |
+
},
|
| 11247 |
+
{
|
| 11248 |
+
"epoch": 0.03131657331168863,
|
| 11249 |
+
"grad_norm": 10.125,
|
| 11250 |
+
"learning_rate": 0.0004949417450540676,
|
| 11251 |
+
"loss": 16.5427,
|
| 11252 |
+
"step": 16060
|
| 11253 |
+
},
|
| 11254 |
+
{
|
| 11255 |
+
"epoch": 0.03133607304600475,
|
| 11256 |
+
"grad_norm": 101.5,
|
| 11257 |
+
"learning_rate": 0.000494938494038613,
|
| 11258 |
+
"loss": 16.6593,
|
| 11259 |
+
"step": 16070
|
| 11260 |
+
},
|
| 11261 |
+
{
|
| 11262 |
+
"epoch": 0.03135557278032087,
|
| 11263 |
+
"grad_norm": 8.25,
|
| 11264 |
+
"learning_rate": 0.0004949352430231583,
|
| 11265 |
+
"loss": 16.719,
|
| 11266 |
+
"step": 16080
|
| 11267 |
+
},
|
| 11268 |
+
{
|
| 11269 |
+
"epoch": 0.03137507251463699,
|
| 11270 |
+
"grad_norm": 13.9375,
|
| 11271 |
+
"learning_rate": 0.0004949319920077036,
|
| 11272 |
+
"loss": 16.6558,
|
| 11273 |
+
"step": 16090
|
| 11274 |
+
},
|
| 11275 |
+
{
|
| 11276 |
+
"epoch": 0.03139457224895311,
|
| 11277 |
+
"grad_norm": 7.6875,
|
| 11278 |
+
"learning_rate": 0.000494928740992249,
|
| 11279 |
+
"loss": 16.4561,
|
| 11280 |
+
"step": 16100
|
| 11281 |
+
},
|
| 11282 |
+
{
|
| 11283 |
+
"epoch": 0.031414071983269225,
|
| 11284 |
+
"grad_norm": 9.0625,
|
| 11285 |
+
"learning_rate": 0.0004949254899767943,
|
| 11286 |
+
"loss": 16.4558,
|
| 11287 |
+
"step": 16110
|
| 11288 |
+
},
|
| 11289 |
+
{
|
| 11290 |
+
"epoch": 0.031433571717585346,
|
| 11291 |
+
"grad_norm": 8.6875,
|
| 11292 |
+
"learning_rate": 0.0004949222389613396,
|
| 11293 |
+
"loss": 16.5572,
|
| 11294 |
+
"step": 16120
|
| 11295 |
+
},
|
| 11296 |
+
{
|
| 11297 |
+
"epoch": 0.03145307145190147,
|
| 11298 |
+
"grad_norm": 9.9375,
|
| 11299 |
+
"learning_rate": 0.0004949189879458849,
|
| 11300 |
+
"loss": 16.6075,
|
| 11301 |
+
"step": 16130
|
| 11302 |
+
},
|
| 11303 |
+
{
|
| 11304 |
+
"epoch": 0.03147257118621759,
|
| 11305 |
+
"grad_norm": 13.0,
|
| 11306 |
+
"learning_rate": 0.0004949157369304303,
|
| 11307 |
+
"loss": 16.552,
|
| 11308 |
+
"step": 16140
|
| 11309 |
+
},
|
| 11310 |
+
{
|
| 11311 |
+
"epoch": 0.03149207092053371,
|
| 11312 |
+
"grad_norm": 6.5,
|
| 11313 |
+
"learning_rate": 0.0004949124859149756,
|
| 11314 |
+
"loss": 16.5597,
|
| 11315 |
+
"step": 16150
|
| 11316 |
+
},
|
| 11317 |
+
{
|
| 11318 |
+
"epoch": 0.03151157065484983,
|
| 11319 |
+
"grad_norm": 8.3125,
|
| 11320 |
+
"learning_rate": 0.0004949092348995209,
|
| 11321 |
+
"loss": 16.758,
|
| 11322 |
+
"step": 16160
|
| 11323 |
+
},
|
| 11324 |
+
{
|
| 11325 |
+
"epoch": 0.03153107038916595,
|
| 11326 |
+
"grad_norm": 6.875,
|
| 11327 |
+
"learning_rate": 0.0004949059838840663,
|
| 11328 |
+
"loss": 16.614,
|
| 11329 |
+
"step": 16170
|
| 11330 |
+
},
|
| 11331 |
+
{
|
| 11332 |
+
"epoch": 0.031550570123482065,
|
| 11333 |
+
"grad_norm": 6.875,
|
| 11334 |
+
"learning_rate": 0.0004949027328686116,
|
| 11335 |
+
"loss": 16.6196,
|
| 11336 |
+
"step": 16180
|
| 11337 |
+
},
|
| 11338 |
+
{
|
| 11339 |
+
"epoch": 0.031570069857798186,
|
| 11340 |
+
"grad_norm": 6.75,
|
| 11341 |
+
"learning_rate": 0.0004948994818531569,
|
| 11342 |
+
"loss": 16.6579,
|
| 11343 |
+
"step": 16190
|
| 11344 |
+
},
|
| 11345 |
+
{
|
| 11346 |
+
"epoch": 0.03158956959211431,
|
| 11347 |
+
"grad_norm": 8.75,
|
| 11348 |
+
"learning_rate": 0.0004948962308377022,
|
| 11349 |
+
"loss": 16.7175,
|
| 11350 |
+
"step": 16200
|
| 11351 |
+
},
|
| 11352 |
+
{
|
| 11353 |
+
"epoch": 0.03160906932643043,
|
| 11354 |
+
"grad_norm": 8.25,
|
| 11355 |
+
"learning_rate": 0.0004948929798222476,
|
| 11356 |
+
"loss": 16.6421,
|
| 11357 |
+
"step": 16210
|
| 11358 |
+
},
|
| 11359 |
+
{
|
| 11360 |
+
"epoch": 0.03162856906074655,
|
| 11361 |
+
"grad_norm": 7.53125,
|
| 11362 |
+
"learning_rate": 0.0004948897288067928,
|
| 11363 |
+
"loss": 16.659,
|
| 11364 |
+
"step": 16220
|
| 11365 |
+
},
|
| 11366 |
+
{
|
| 11367 |
+
"epoch": 0.03164806879506267,
|
| 11368 |
+
"grad_norm": 6.625,
|
| 11369 |
+
"learning_rate": 0.0004948864777913381,
|
| 11370 |
+
"loss": 16.5863,
|
| 11371 |
+
"step": 16230
|
| 11372 |
+
},
|
| 11373 |
+
{
|
| 11374 |
+
"epoch": 0.031667568529378784,
|
| 11375 |
+
"grad_norm": 6.59375,
|
| 11376 |
+
"learning_rate": 0.0004948832267758834,
|
| 11377 |
+
"loss": 16.5491,
|
| 11378 |
+
"step": 16240
|
| 11379 |
+
},
|
| 11380 |
+
{
|
| 11381 |
+
"epoch": 0.031687068263694905,
|
| 11382 |
+
"grad_norm": 9.3125,
|
| 11383 |
+
"learning_rate": 0.0004948799757604288,
|
| 11384 |
+
"loss": 16.6378,
|
| 11385 |
+
"step": 16250
|
| 11386 |
+
},
|
| 11387 |
+
{
|
| 11388 |
+
"epoch": 0.031706567998011026,
|
| 11389 |
+
"grad_norm": 8.4375,
|
| 11390 |
+
"learning_rate": 0.0004948767247449741,
|
| 11391 |
+
"loss": 16.6742,
|
| 11392 |
+
"step": 16260
|
| 11393 |
+
},
|
| 11394 |
+
{
|
| 11395 |
+
"epoch": 0.03172606773232715,
|
| 11396 |
+
"grad_norm": 6.90625,
|
| 11397 |
+
"learning_rate": 0.0004948734737295194,
|
| 11398 |
+
"loss": 16.6117,
|
| 11399 |
+
"step": 16270
|
| 11400 |
+
},
|
| 11401 |
+
{
|
| 11402 |
+
"epoch": 0.03174556746664327,
|
| 11403 |
+
"grad_norm": 7.28125,
|
| 11404 |
+
"learning_rate": 0.0004948702227140648,
|
| 11405 |
+
"loss": 16.688,
|
| 11406 |
+
"step": 16280
|
| 11407 |
+
},
|
| 11408 |
+
{
|
| 11409 |
+
"epoch": 0.03176506720095939,
|
| 11410 |
+
"grad_norm": 7.375,
|
| 11411 |
+
"learning_rate": 0.0004948669716986101,
|
| 11412 |
+
"loss": 16.6857,
|
| 11413 |
+
"step": 16290
|
| 11414 |
+
},
|
| 11415 |
+
{
|
| 11416 |
+
"epoch": 0.03178456693527551,
|
| 11417 |
+
"grad_norm": 7.78125,
|
| 11418 |
+
"learning_rate": 0.0004948637206831554,
|
| 11419 |
+
"loss": 16.7109,
|
| 11420 |
+
"step": 16300
|
| 11421 |
+
},
|
| 11422 |
+
{
|
| 11423 |
+
"epoch": 0.031804066669591624,
|
| 11424 |
+
"grad_norm": 8.375,
|
| 11425 |
+
"learning_rate": 0.0004948604696677007,
|
| 11426 |
+
"loss": 16.6176,
|
| 11427 |
+
"step": 16310
|
| 11428 |
+
},
|
| 11429 |
+
{
|
| 11430 |
+
"epoch": 0.031823566403907745,
|
| 11431 |
+
"grad_norm": 7.25,
|
| 11432 |
+
"learning_rate": 0.0004948572186522461,
|
| 11433 |
+
"loss": 16.7202,
|
| 11434 |
+
"step": 16320
|
| 11435 |
+
},
|
| 11436 |
+
{
|
| 11437 |
+
"epoch": 0.031843066138223866,
|
| 11438 |
+
"grad_norm": 7.78125,
|
| 11439 |
+
"learning_rate": 0.0004948539676367914,
|
| 11440 |
+
"loss": 16.5284,
|
| 11441 |
+
"step": 16330
|
| 11442 |
+
},
|
| 11443 |
+
{
|
| 11444 |
+
"epoch": 0.03186256587253999,
|
| 11445 |
+
"grad_norm": 8.625,
|
| 11446 |
+
"learning_rate": 0.0004948507166213367,
|
| 11447 |
+
"loss": 16.6153,
|
| 11448 |
+
"step": 16340
|
| 11449 |
+
},
|
| 11450 |
+
{
|
| 11451 |
+
"epoch": 0.03188206560685611,
|
| 11452 |
+
"grad_norm": 6.9375,
|
| 11453 |
+
"learning_rate": 0.0004948474656058821,
|
| 11454 |
+
"loss": 16.5773,
|
| 11455 |
+
"step": 16350
|
| 11456 |
+
},
|
| 11457 |
+
{
|
| 11458 |
+
"epoch": 0.03190156534117223,
|
| 11459 |
+
"grad_norm": 268.0,
|
| 11460 |
+
"learning_rate": 0.0004948442145904274,
|
| 11461 |
+
"loss": 16.6544,
|
| 11462 |
+
"step": 16360
|
| 11463 |
+
},
|
| 11464 |
+
{
|
| 11465 |
+
"epoch": 0.03192106507548834,
|
| 11466 |
+
"grad_norm": 9.4375,
|
| 11467 |
+
"learning_rate": 0.0004948409635749726,
|
| 11468 |
+
"loss": 16.7142,
|
| 11469 |
+
"step": 16370
|
| 11470 |
+
},
|
| 11471 |
+
{
|
| 11472 |
+
"epoch": 0.031940564809804464,
|
| 11473 |
+
"grad_norm": 6.65625,
|
| 11474 |
+
"learning_rate": 0.0004948377125595179,
|
| 11475 |
+
"loss": 16.7066,
|
| 11476 |
+
"step": 16380
|
| 11477 |
+
},
|
| 11478 |
+
{
|
| 11479 |
+
"epoch": 0.031960064544120585,
|
| 11480 |
+
"grad_norm": 9.0,
|
| 11481 |
+
"learning_rate": 0.0004948344615440633,
|
| 11482 |
+
"loss": 16.6738,
|
| 11483 |
+
"step": 16390
|
| 11484 |
+
},
|
| 11485 |
+
{
|
| 11486 |
+
"epoch": 0.031979564278436706,
|
| 11487 |
+
"grad_norm": 7.15625,
|
| 11488 |
+
"learning_rate": 0.0004948312105286086,
|
| 11489 |
+
"loss": 16.5181,
|
| 11490 |
+
"step": 16400
|
| 11491 |
+
},
|
| 11492 |
+
{
|
| 11493 |
+
"epoch": 0.03199906401275283,
|
| 11494 |
+
"grad_norm": 7.34375,
|
| 11495 |
+
"learning_rate": 0.0004948279595131539,
|
| 11496 |
+
"loss": 16.6785,
|
| 11497 |
+
"step": 16410
|
| 11498 |
+
},
|
| 11499 |
+
{
|
| 11500 |
+
"epoch": 0.03201856374706895,
|
| 11501 |
+
"grad_norm": 6.5625,
|
| 11502 |
+
"learning_rate": 0.0004948247084976992,
|
| 11503 |
+
"loss": 16.5873,
|
| 11504 |
+
"step": 16420
|
| 11505 |
+
},
|
| 11506 |
+
{
|
| 11507 |
+
"epoch": 0.03203806348138507,
|
| 11508 |
+
"grad_norm": 6.71875,
|
| 11509 |
+
"learning_rate": 0.0004948214574822446,
|
| 11510 |
+
"loss": 16.5564,
|
| 11511 |
+
"step": 16430
|
| 11512 |
+
},
|
| 11513 |
+
{
|
| 11514 |
+
"epoch": 0.03205756321570118,
|
| 11515 |
+
"grad_norm": 6.90625,
|
| 11516 |
+
"learning_rate": 0.0004948182064667899,
|
| 11517 |
+
"loss": 16.5584,
|
| 11518 |
+
"step": 16440
|
| 11519 |
+
},
|
| 11520 |
+
{
|
| 11521 |
+
"epoch": 0.032077062950017304,
|
| 11522 |
+
"grad_norm": 6.59375,
|
| 11523 |
+
"learning_rate": 0.0004948149554513352,
|
| 11524 |
+
"loss": 16.5856,
|
| 11525 |
+
"step": 16450
|
| 11526 |
+
},
|
| 11527 |
+
{
|
| 11528 |
+
"epoch": 0.032096562684333425,
|
| 11529 |
+
"grad_norm": 8.125,
|
| 11530 |
+
"learning_rate": 0.0004948117044358806,
|
| 11531 |
+
"loss": 16.6783,
|
| 11532 |
+
"step": 16460
|
| 11533 |
+
},
|
| 11534 |
+
{
|
| 11535 |
+
"epoch": 0.032116062418649546,
|
| 11536 |
+
"grad_norm": 8.875,
|
| 11537 |
+
"learning_rate": 0.0004948084534204259,
|
| 11538 |
+
"loss": 16.5922,
|
| 11539 |
+
"step": 16470
|
| 11540 |
+
},
|
| 11541 |
+
{
|
| 11542 |
+
"epoch": 0.03213556215296567,
|
| 11543 |
+
"grad_norm": 9.5625,
|
| 11544 |
+
"learning_rate": 0.0004948052024049712,
|
| 11545 |
+
"loss": 16.541,
|
| 11546 |
+
"step": 16480
|
| 11547 |
+
},
|
| 11548 |
+
{
|
| 11549 |
+
"epoch": 0.03215506188728179,
|
| 11550 |
+
"grad_norm": 9.25,
|
| 11551 |
+
"learning_rate": 0.0004948019513895165,
|
| 11552 |
+
"loss": 16.4979,
|
| 11553 |
+
"step": 16490
|
| 11554 |
+
},
|
| 11555 |
+
{
|
| 11556 |
+
"epoch": 0.0321745616215979,
|
| 11557 |
+
"grad_norm": 7.4375,
|
| 11558 |
+
"learning_rate": 0.0004947987003740619,
|
| 11559 |
+
"loss": 16.5768,
|
| 11560 |
+
"step": 16500
|
| 11561 |
+
},
|
| 11562 |
+
{
|
| 11563 |
+
"epoch": 0.03219406135591402,
|
| 11564 |
+
"grad_norm": 6.8125,
|
| 11565 |
+
"learning_rate": 0.0004947954493586072,
|
| 11566 |
+
"loss": 16.6362,
|
| 11567 |
+
"step": 16510
|
| 11568 |
+
},
|
| 11569 |
+
{
|
| 11570 |
+
"epoch": 0.032213561090230144,
|
| 11571 |
+
"grad_norm": 7.25,
|
| 11572 |
+
"learning_rate": 0.0004947921983431524,
|
| 11573 |
+
"loss": 16.6625,
|
| 11574 |
+
"step": 16520
|
| 11575 |
+
},
|
| 11576 |
+
{
|
| 11577 |
+
"epoch": 0.032233060824546265,
|
| 11578 |
+
"grad_norm": 7.0625,
|
| 11579 |
+
"learning_rate": 0.0004947889473276978,
|
| 11580 |
+
"loss": 16.4659,
|
| 11581 |
+
"step": 16530
|
| 11582 |
+
},
|
| 11583 |
+
{
|
| 11584 |
+
"epoch": 0.032252560558862386,
|
| 11585 |
+
"grad_norm": 8.5625,
|
| 11586 |
+
"learning_rate": 0.0004947856963122431,
|
| 11587 |
+
"loss": 16.5525,
|
| 11588 |
+
"step": 16540
|
| 11589 |
+
},
|
| 11590 |
+
{
|
| 11591 |
+
"epoch": 0.03227206029317851,
|
| 11592 |
+
"grad_norm": 7.46875,
|
| 11593 |
+
"learning_rate": 0.0004947824452967884,
|
| 11594 |
+
"loss": 16.6808,
|
| 11595 |
+
"step": 16550
|
| 11596 |
+
},
|
| 11597 |
+
{
|
| 11598 |
+
"epoch": 0.03229156002749463,
|
| 11599 |
+
"grad_norm": 7.25,
|
| 11600 |
+
"learning_rate": 0.0004947791942813337,
|
| 11601 |
+
"loss": 16.6119,
|
| 11602 |
+
"step": 16560
|
| 11603 |
+
},
|
| 11604 |
+
{
|
| 11605 |
+
"epoch": 0.03231105976181074,
|
| 11606 |
+
"grad_norm": 6.59375,
|
| 11607 |
+
"learning_rate": 0.0004947759432658791,
|
| 11608 |
+
"loss": 16.623,
|
| 11609 |
+
"step": 16570
|
| 11610 |
+
},
|
| 11611 |
+
{
|
| 11612 |
+
"epoch": 0.032330559496126864,
|
| 11613 |
+
"grad_norm": 8.125,
|
| 11614 |
+
"learning_rate": 0.0004947726922504244,
|
| 11615 |
+
"loss": 16.5381,
|
| 11616 |
+
"step": 16580
|
| 11617 |
+
},
|
| 11618 |
+
{
|
| 11619 |
+
"epoch": 0.032350059230442985,
|
| 11620 |
+
"grad_norm": 7.40625,
|
| 11621 |
+
"learning_rate": 0.0004947694412349697,
|
| 11622 |
+
"loss": 16.7365,
|
| 11623 |
+
"step": 16590
|
| 11624 |
+
},
|
| 11625 |
+
{
|
| 11626 |
+
"epoch": 0.032369558964759106,
|
| 11627 |
+
"grad_norm": 6.5625,
|
| 11628 |
+
"learning_rate": 0.000494766190219515,
|
| 11629 |
+
"loss": 16.6286,
|
| 11630 |
+
"step": 16600
|
| 11631 |
+
},
|
| 11632 |
+
{
|
| 11633 |
+
"epoch": 0.03238905869907523,
|
| 11634 |
+
"grad_norm": 11.875,
|
| 11635 |
+
"learning_rate": 0.0004947629392040604,
|
| 11636 |
+
"loss": 16.5944,
|
| 11637 |
+
"step": 16610
|
| 11638 |
+
},
|
| 11639 |
+
{
|
| 11640 |
+
"epoch": 0.03240855843339135,
|
| 11641 |
+
"grad_norm": 7.0625,
|
| 11642 |
+
"learning_rate": 0.0004947596881886057,
|
| 11643 |
+
"loss": 16.6532,
|
| 11644 |
+
"step": 16620
|
| 11645 |
+
},
|
| 11646 |
+
{
|
| 11647 |
+
"epoch": 0.03242805816770746,
|
| 11648 |
+
"grad_norm": 7.03125,
|
| 11649 |
+
"learning_rate": 0.000494756437173151,
|
| 11650 |
+
"loss": 16.6551,
|
| 11651 |
+
"step": 16630
|
| 11652 |
+
},
|
| 11653 |
+
{
|
| 11654 |
+
"epoch": 0.03244755790202358,
|
| 11655 |
+
"grad_norm": 8.6875,
|
| 11656 |
+
"learning_rate": 0.0004947531861576964,
|
| 11657 |
+
"loss": 16.5883,
|
| 11658 |
+
"step": 16640
|
| 11659 |
+
},
|
| 11660 |
+
{
|
| 11661 |
+
"epoch": 0.032467057636339704,
|
| 11662 |
+
"grad_norm": 7.15625,
|
| 11663 |
+
"learning_rate": 0.0004947499351422417,
|
| 11664 |
+
"loss": 16.6404,
|
| 11665 |
+
"step": 16650
|
| 11666 |
+
},
|
| 11667 |
+
{
|
| 11668 |
+
"epoch": 0.032486557370655825,
|
| 11669 |
+
"grad_norm": 10.8125,
|
| 11670 |
+
"learning_rate": 0.000494746684126787,
|
| 11671 |
+
"loss": 16.6752,
|
| 11672 |
+
"step": 16660
|
| 11673 |
+
},
|
| 11674 |
+
{
|
| 11675 |
+
"epoch": 0.032506057104971946,
|
| 11676 |
+
"grad_norm": 15.4375,
|
| 11677 |
+
"learning_rate": 0.0004947434331113324,
|
| 11678 |
+
"loss": 16.7058,
|
| 11679 |
+
"step": 16670
|
| 11680 |
+
},
|
| 11681 |
+
{
|
| 11682 |
+
"epoch": 0.03252555683928807,
|
| 11683 |
+
"grad_norm": 8.9375,
|
| 11684 |
+
"learning_rate": 0.0004947401820958777,
|
| 11685 |
+
"loss": 16.6456,
|
| 11686 |
+
"step": 16680
|
| 11687 |
+
},
|
| 11688 |
+
{
|
| 11689 |
+
"epoch": 0.03254505657360419,
|
| 11690 |
+
"grad_norm": 7.5625,
|
| 11691 |
+
"learning_rate": 0.000494736931080423,
|
| 11692 |
+
"loss": 16.4961,
|
| 11693 |
+
"step": 16690
|
| 11694 |
+
},
|
| 11695 |
+
{
|
| 11696 |
+
"epoch": 0.0325645563079203,
|
| 11697 |
+
"grad_norm": 8.4375,
|
| 11698 |
+
"learning_rate": 0.0004947336800649683,
|
| 11699 |
+
"loss": 16.6947,
|
| 11700 |
+
"step": 16700
|
| 11701 |
+
},
|
| 11702 |
+
{
|
| 11703 |
+
"epoch": 0.03258405604223642,
|
| 11704 |
+
"grad_norm": 7.53125,
|
| 11705 |
+
"learning_rate": 0.0004947304290495137,
|
| 11706 |
+
"loss": 16.6582,
|
| 11707 |
+
"step": 16710
|
| 11708 |
+
},
|
| 11709 |
+
{
|
| 11710 |
+
"epoch": 0.032603555776552544,
|
| 11711 |
+
"grad_norm": 7.53125,
|
| 11712 |
+
"learning_rate": 0.0004947271780340589,
|
| 11713 |
+
"loss": 16.5912,
|
| 11714 |
+
"step": 16720
|
| 11715 |
+
},
|
| 11716 |
+
{
|
| 11717 |
+
"epoch": 0.032623055510868665,
|
| 11718 |
+
"grad_norm": 7.3125,
|
| 11719 |
+
"learning_rate": 0.0004947239270186042,
|
| 11720 |
+
"loss": 16.5378,
|
| 11721 |
+
"step": 16730
|
| 11722 |
+
},
|
| 11723 |
+
{
|
| 11724 |
+
"epoch": 0.032642555245184786,
|
| 11725 |
+
"grad_norm": 7.4375,
|
| 11726 |
+
"learning_rate": 0.0004947206760031495,
|
| 11727 |
+
"loss": 16.6431,
|
| 11728 |
+
"step": 16740
|
| 11729 |
+
},
|
| 11730 |
+
{
|
| 11731 |
+
"epoch": 0.03266205497950091,
|
| 11732 |
+
"grad_norm": 6.53125,
|
| 11733 |
+
"learning_rate": 0.0004947174249876949,
|
| 11734 |
+
"loss": 16.5495,
|
| 11735 |
+
"step": 16750
|
| 11736 |
+
},
|
| 11737 |
+
{
|
| 11738 |
+
"epoch": 0.03268155471381702,
|
| 11739 |
+
"grad_norm": 6.6875,
|
| 11740 |
+
"learning_rate": 0.0004947141739722402,
|
| 11741 |
+
"loss": 16.5369,
|
| 11742 |
+
"step": 16760
|
| 11743 |
+
},
|
| 11744 |
+
{
|
| 11745 |
+
"epoch": 0.03270105444813314,
|
| 11746 |
+
"grad_norm": 10.5,
|
| 11747 |
+
"learning_rate": 0.0004947109229567855,
|
| 11748 |
+
"loss": 16.6108,
|
| 11749 |
+
"step": 16770
|
| 11750 |
+
},
|
| 11751 |
+
{
|
| 11752 |
+
"epoch": 0.03272055418244926,
|
| 11753 |
+
"grad_norm": 7.34375,
|
| 11754 |
+
"learning_rate": 0.0004947076719413309,
|
| 11755 |
+
"loss": 16.7242,
|
| 11756 |
+
"step": 16780
|
| 11757 |
+
},
|
| 11758 |
+
{
|
| 11759 |
+
"epoch": 0.032740053916765384,
|
| 11760 |
+
"grad_norm": 10.4375,
|
| 11761 |
+
"learning_rate": 0.0004947044209258762,
|
| 11762 |
+
"loss": 16.5792,
|
| 11763 |
+
"step": 16790
|
| 11764 |
+
},
|
| 11765 |
+
{
|
| 11766 |
+
"epoch": 0.032759553651081505,
|
| 11767 |
+
"grad_norm": 7.84375,
|
| 11768 |
+
"learning_rate": 0.0004947011699104215,
|
| 11769 |
+
"loss": 16.6258,
|
| 11770 |
+
"step": 16800
|
| 11771 |
+
},
|
| 11772 |
+
{
|
| 11773 |
+
"epoch": 0.032779053385397626,
|
| 11774 |
+
"grad_norm": 9.625,
|
| 11775 |
+
"learning_rate": 0.0004946979188949668,
|
| 11776 |
+
"loss": 16.5363,
|
| 11777 |
+
"step": 16810
|
| 11778 |
+
},
|
| 11779 |
+
{
|
| 11780 |
+
"epoch": 0.03279855311971375,
|
| 11781 |
+
"grad_norm": 6.59375,
|
| 11782 |
+
"learning_rate": 0.0004946946678795122,
|
| 11783 |
+
"loss": 16.6006,
|
| 11784 |
+
"step": 16820
|
| 11785 |
+
},
|
| 11786 |
+
{
|
| 11787 |
+
"epoch": 0.03281805285402986,
|
| 11788 |
+
"grad_norm": 6.03125,
|
| 11789 |
+
"learning_rate": 0.0004946914168640575,
|
| 11790 |
+
"loss": 16.6048,
|
| 11791 |
+
"step": 16830
|
| 11792 |
+
},
|
| 11793 |
+
{
|
| 11794 |
+
"epoch": 0.03283755258834598,
|
| 11795 |
+
"grad_norm": 7.5625,
|
| 11796 |
+
"learning_rate": 0.0004946881658486028,
|
| 11797 |
+
"loss": 16.4615,
|
| 11798 |
+
"step": 16840
|
| 11799 |
+
},
|
| 11800 |
+
{
|
| 11801 |
+
"epoch": 0.0328570523226621,
|
| 11802 |
+
"grad_norm": 8.75,
|
| 11803 |
+
"learning_rate": 0.0004946849148331482,
|
| 11804 |
+
"loss": 16.6284,
|
| 11805 |
+
"step": 16850
|
| 11806 |
+
},
|
| 11807 |
+
{
|
| 11808 |
+
"epoch": 0.032876552056978224,
|
| 11809 |
+
"grad_norm": 7.46875,
|
| 11810 |
+
"learning_rate": 0.0004946816638176935,
|
| 11811 |
+
"loss": 16.6879,
|
| 11812 |
+
"step": 16860
|
| 11813 |
+
},
|
| 11814 |
+
{
|
| 11815 |
+
"epoch": 0.032896051791294345,
|
| 11816 |
+
"grad_norm": 7.96875,
|
| 11817 |
+
"learning_rate": 0.0004946784128022388,
|
| 11818 |
+
"loss": 16.5871,
|
| 11819 |
+
"step": 16870
|
| 11820 |
+
},
|
| 11821 |
+
{
|
| 11822 |
+
"epoch": 0.032915551525610466,
|
| 11823 |
+
"grad_norm": 8.75,
|
| 11824 |
+
"learning_rate": 0.0004946751617867841,
|
| 11825 |
+
"loss": 16.5434,
|
| 11826 |
+
"step": 16880
|
| 11827 |
+
},
|
| 11828 |
+
{
|
| 11829 |
+
"epoch": 0.03293505125992658,
|
| 11830 |
+
"grad_norm": 6.53125,
|
| 11831 |
+
"learning_rate": 0.0004946719107713295,
|
| 11832 |
+
"loss": 16.5516,
|
| 11833 |
+
"step": 16890
|
| 11834 |
+
},
|
| 11835 |
+
{
|
| 11836 |
+
"epoch": 0.0329545509942427,
|
| 11837 |
+
"grad_norm": 7.96875,
|
| 11838 |
+
"learning_rate": 0.0004946686597558748,
|
| 11839 |
+
"loss": 16.5549,
|
| 11840 |
+
"step": 16900
|
| 11841 |
+
},
|
| 11842 |
+
{
|
| 11843 |
+
"epoch": 0.03297405072855882,
|
| 11844 |
+
"grad_norm": 9.25,
|
| 11845 |
+
"learning_rate": 0.0004946654087404201,
|
| 11846 |
+
"loss": 16.5084,
|
| 11847 |
+
"step": 16910
|
| 11848 |
+
},
|
| 11849 |
+
{
|
| 11850 |
+
"epoch": 0.03299355046287494,
|
| 11851 |
+
"grad_norm": 6.9375,
|
| 11852 |
+
"learning_rate": 0.0004946621577249655,
|
| 11853 |
+
"loss": 16.5905,
|
| 11854 |
+
"step": 16920
|
| 11855 |
+
},
|
| 11856 |
+
{
|
| 11857 |
+
"epoch": 0.033013050197191064,
|
| 11858 |
+
"grad_norm": 7.0,
|
| 11859 |
+
"learning_rate": 0.0004946589067095108,
|
| 11860 |
+
"loss": 16.5883,
|
| 11861 |
+
"step": 16930
|
| 11862 |
+
},
|
| 11863 |
+
{
|
| 11864 |
+
"epoch": 0.033032549931507185,
|
| 11865 |
+
"grad_norm": 10.6875,
|
| 11866 |
+
"learning_rate": 0.0004946556556940561,
|
| 11867 |
+
"loss": 16.5743,
|
| 11868 |
+
"step": 16940
|
| 11869 |
+
},
|
| 11870 |
+
{
|
| 11871 |
+
"epoch": 0.033052049665823306,
|
| 11872 |
+
"grad_norm": 8.4375,
|
| 11873 |
+
"learning_rate": 0.0004946524046786014,
|
| 11874 |
+
"loss": 16.5847,
|
| 11875 |
+
"step": 16950
|
| 11876 |
+
},
|
| 11877 |
+
{
|
| 11878 |
+
"epoch": 0.03307154940013942,
|
| 11879 |
+
"grad_norm": 7.90625,
|
| 11880 |
+
"learning_rate": 0.0004946491536631467,
|
| 11881 |
+
"loss": 16.5937,
|
| 11882 |
+
"step": 16960
|
| 11883 |
+
},
|
| 11884 |
+
{
|
| 11885 |
+
"epoch": 0.03309104913445554,
|
| 11886 |
+
"grad_norm": 6.875,
|
| 11887 |
+
"learning_rate": 0.000494645902647692,
|
| 11888 |
+
"loss": 16.5317,
|
| 11889 |
+
"step": 16970
|
| 11890 |
+
},
|
| 11891 |
+
{
|
| 11892 |
+
"epoch": 0.03311054886877166,
|
| 11893 |
+
"grad_norm": 9.8125,
|
| 11894 |
+
"learning_rate": 0.0004946426516322373,
|
| 11895 |
+
"loss": 16.5498,
|
| 11896 |
+
"step": 16980
|
| 11897 |
+
},
|
| 11898 |
+
{
|
| 11899 |
+
"epoch": 0.03313004860308778,
|
| 11900 |
+
"grad_norm": 7.0625,
|
| 11901 |
+
"learning_rate": 0.0004946394006167826,
|
| 11902 |
+
"loss": 16.6557,
|
| 11903 |
+
"step": 16990
|
| 11904 |
+
},
|
| 11905 |
+
{
|
| 11906 |
+
"epoch": 0.033149548337403904,
|
| 11907 |
+
"grad_norm": 7.25,
|
| 11908 |
+
"learning_rate": 0.000494636149601328,
|
| 11909 |
+
"loss": 16.5932,
|
| 11910 |
+
"step": 17000
|
| 11911 |
+
},
|
| 11912 |
+
{
|
| 11913 |
+
"epoch": 0.033169048071720025,
|
| 11914 |
+
"grad_norm": 7.34375,
|
| 11915 |
+
"learning_rate": 0.0004946328985858733,
|
| 11916 |
+
"loss": 16.5905,
|
| 11917 |
+
"step": 17010
|
| 11918 |
+
},
|
| 11919 |
+
{
|
| 11920 |
+
"epoch": 0.03318854780603614,
|
| 11921 |
+
"grad_norm": 6.59375,
|
| 11922 |
+
"learning_rate": 0.0004946296475704186,
|
| 11923 |
+
"loss": 16.6248,
|
| 11924 |
+
"step": 17020
|
| 11925 |
+
},
|
| 11926 |
+
{
|
| 11927 |
+
"epoch": 0.03320804754035226,
|
| 11928 |
+
"grad_norm": 7.4375,
|
| 11929 |
+
"learning_rate": 0.000494626396554964,
|
| 11930 |
+
"loss": 16.6214,
|
| 11931 |
+
"step": 17030
|
| 11932 |
+
},
|
| 11933 |
+
{
|
| 11934 |
+
"epoch": 0.03322754727466838,
|
| 11935 |
+
"grad_norm": 7.4375,
|
| 11936 |
+
"learning_rate": 0.0004946231455395093,
|
| 11937 |
+
"loss": 16.656,
|
| 11938 |
+
"step": 17040
|
| 11939 |
+
},
|
| 11940 |
+
{
|
| 11941 |
+
"epoch": 0.0332470470089845,
|
| 11942 |
+
"grad_norm": 8.75,
|
| 11943 |
+
"learning_rate": 0.0004946198945240546,
|
| 11944 |
+
"loss": 16.5983,
|
| 11945 |
+
"step": 17050
|
| 11946 |
+
},
|
| 11947 |
+
{
|
| 11948 |
+
"epoch": 0.03326654674330062,
|
| 11949 |
+
"grad_norm": 9.125,
|
| 11950 |
+
"learning_rate": 0.0004946166435085999,
|
| 11951 |
+
"loss": 16.501,
|
| 11952 |
+
"step": 17060
|
| 11953 |
+
},
|
| 11954 |
+
{
|
| 11955 |
+
"epoch": 0.033286046477616744,
|
| 11956 |
+
"grad_norm": 7.78125,
|
| 11957 |
+
"learning_rate": 0.0004946133924931453,
|
| 11958 |
+
"loss": 16.6354,
|
| 11959 |
+
"step": 17070
|
| 11960 |
+
},
|
| 11961 |
+
{
|
| 11962 |
+
"epoch": 0.033305546211932865,
|
| 11963 |
+
"grad_norm": 10.0,
|
| 11964 |
+
"learning_rate": 0.0004946101414776906,
|
| 11965 |
+
"loss": 16.5313,
|
| 11966 |
+
"step": 17080
|
| 11967 |
+
},
|
| 11968 |
+
{
|
| 11969 |
+
"epoch": 0.03332504594624898,
|
| 11970 |
+
"grad_norm": 8.3125,
|
| 11971 |
+
"learning_rate": 0.0004946068904622359,
|
| 11972 |
+
"loss": 16.6214,
|
| 11973 |
+
"step": 17090
|
| 11974 |
+
},
|
| 11975 |
+
{
|
| 11976 |
+
"epoch": 0.0333445456805651,
|
| 11977 |
+
"grad_norm": 6.84375,
|
| 11978 |
+
"learning_rate": 0.0004946036394467813,
|
| 11979 |
+
"loss": 16.5653,
|
| 11980 |
+
"step": 17100
|
| 11981 |
+
},
|
| 11982 |
+
{
|
| 11983 |
+
"epoch": 0.03336404541488122,
|
| 11984 |
+
"grad_norm": 17.75,
|
| 11985 |
+
"learning_rate": 0.0004946003884313266,
|
| 11986 |
+
"loss": 16.6099,
|
| 11987 |
+
"step": 17110
|
| 11988 |
+
},
|
| 11989 |
+
{
|
| 11990 |
+
"epoch": 0.03338354514919734,
|
| 11991 |
+
"grad_norm": 7.75,
|
| 11992 |
+
"learning_rate": 0.0004945971374158719,
|
| 11993 |
+
"loss": 16.5001,
|
| 11994 |
+
"step": 17120
|
| 11995 |
+
},
|
| 11996 |
+
{
|
| 11997 |
+
"epoch": 0.033403044883513464,
|
| 11998 |
+
"grad_norm": 7.59375,
|
| 11999 |
+
"learning_rate": 0.0004945938864004172,
|
| 12000 |
+
"loss": 16.6556,
|
| 12001 |
+
"step": 17130
|
| 12002 |
+
},
|
| 12003 |
+
{
|
| 12004 |
+
"epoch": 0.033422544617829585,
|
| 12005 |
+
"grad_norm": 7.34375,
|
| 12006 |
+
"learning_rate": 0.0004945906353849625,
|
| 12007 |
+
"loss": 16.571,
|
| 12008 |
+
"step": 17140
|
| 12009 |
+
},
|
| 12010 |
+
{
|
| 12011 |
+
"epoch": 0.0334420443521457,
|
| 12012 |
+
"grad_norm": 9.25,
|
| 12013 |
+
"learning_rate": 0.0004945873843695078,
|
| 12014 |
+
"loss": 16.6326,
|
| 12015 |
+
"step": 17150
|
| 12016 |
+
},
|
| 12017 |
+
{
|
| 12018 |
+
"epoch": 0.03346154408646182,
|
| 12019 |
+
"grad_norm": 9.5625,
|
| 12020 |
+
"learning_rate": 0.0004945841333540531,
|
| 12021 |
+
"loss": 16.6164,
|
| 12022 |
+
"step": 17160
|
| 12023 |
+
},
|
| 12024 |
+
{
|
| 12025 |
+
"epoch": 0.03348104382077794,
|
| 12026 |
+
"grad_norm": 7.875,
|
| 12027 |
+
"learning_rate": 0.0004945808823385984,
|
| 12028 |
+
"loss": 16.5917,
|
| 12029 |
+
"step": 17170
|
| 12030 |
+
},
|
| 12031 |
+
{
|
| 12032 |
+
"epoch": 0.03350054355509406,
|
| 12033 |
+
"grad_norm": 6.5625,
|
| 12034 |
+
"learning_rate": 0.0004945776313231438,
|
| 12035 |
+
"loss": 16.6178,
|
| 12036 |
+
"step": 17180
|
| 12037 |
+
},
|
| 12038 |
+
{
|
| 12039 |
+
"epoch": 0.03352004328941018,
|
| 12040 |
+
"grad_norm": 6.65625,
|
| 12041 |
+
"learning_rate": 0.0004945743803076891,
|
| 12042 |
+
"loss": 16.5651,
|
| 12043 |
+
"step": 17190
|
| 12044 |
+
},
|
| 12045 |
+
{
|
| 12046 |
+
"epoch": 0.033539543023726304,
|
| 12047 |
+
"grad_norm": 8.0625,
|
| 12048 |
+
"learning_rate": 0.0004945711292922344,
|
| 12049 |
+
"loss": 16.7342,
|
| 12050 |
+
"step": 17200
|
| 12051 |
+
},
|
| 12052 |
+
{
|
| 12053 |
+
"epoch": 0.033559042758042425,
|
| 12054 |
+
"grad_norm": 6.5,
|
| 12055 |
+
"learning_rate": 0.0004945678782767798,
|
| 12056 |
+
"loss": 16.6286,
|
| 12057 |
+
"step": 17210
|
| 12058 |
+
},
|
| 12059 |
+
{
|
| 12060 |
+
"epoch": 0.03357854249235854,
|
| 12061 |
+
"grad_norm": 7.6875,
|
| 12062 |
+
"learning_rate": 0.0004945646272613251,
|
| 12063 |
+
"loss": 16.602,
|
| 12064 |
+
"step": 17220
|
| 12065 |
+
},
|
| 12066 |
+
{
|
| 12067 |
+
"epoch": 0.03359804222667466,
|
| 12068 |
+
"grad_norm": 7.53125,
|
| 12069 |
+
"learning_rate": 0.0004945613762458704,
|
| 12070 |
+
"loss": 16.652,
|
| 12071 |
+
"step": 17230
|
| 12072 |
+
},
|
| 12073 |
+
{
|
| 12074 |
+
"epoch": 0.03361754196099078,
|
| 12075 |
+
"grad_norm": 20.75,
|
| 12076 |
+
"learning_rate": 0.0004945581252304157,
|
| 12077 |
+
"loss": 16.5934,
|
| 12078 |
+
"step": 17240
|
| 12079 |
+
},
|
| 12080 |
+
{
|
| 12081 |
+
"epoch": 0.0336370416953069,
|
| 12082 |
+
"grad_norm": 7.6875,
|
| 12083 |
+
"learning_rate": 0.0004945548742149611,
|
| 12084 |
+
"loss": 16.5494,
|
| 12085 |
+
"step": 17250
|
| 12086 |
+
},
|
| 12087 |
+
{
|
| 12088 |
+
"epoch": 0.03365654142962302,
|
| 12089 |
+
"grad_norm": 7.09375,
|
| 12090 |
+
"learning_rate": 0.0004945516231995064,
|
| 12091 |
+
"loss": 16.6584,
|
| 12092 |
+
"step": 17260
|
| 12093 |
+
},
|
| 12094 |
+
{
|
| 12095 |
+
"epoch": 0.033676041163939144,
|
| 12096 |
+
"grad_norm": 8.625,
|
| 12097 |
+
"learning_rate": 0.0004945483721840517,
|
| 12098 |
+
"loss": 16.6333,
|
| 12099 |
+
"step": 17270
|
| 12100 |
+
},
|
| 12101 |
+
{
|
| 12102 |
+
"epoch": 0.03369554089825526,
|
| 12103 |
+
"grad_norm": 6.75,
|
| 12104 |
+
"learning_rate": 0.0004945451211685971,
|
| 12105 |
+
"loss": 16.6222,
|
| 12106 |
+
"step": 17280
|
| 12107 |
+
},
|
| 12108 |
+
{
|
| 12109 |
+
"epoch": 0.03371504063257138,
|
| 12110 |
+
"grad_norm": 7.125,
|
| 12111 |
+
"learning_rate": 0.0004945418701531423,
|
| 12112 |
+
"loss": 16.4551,
|
| 12113 |
+
"step": 17290
|
| 12114 |
+
},
|
| 12115 |
+
{
|
| 12116 |
+
"epoch": 0.0337345403668875,
|
| 12117 |
+
"grad_norm": 7.5625,
|
| 12118 |
+
"learning_rate": 0.0004945386191376876,
|
| 12119 |
+
"loss": 16.4826,
|
| 12120 |
+
"step": 17300
|
| 12121 |
+
},
|
| 12122 |
+
{
|
| 12123 |
+
"epoch": 0.03375404010120362,
|
| 12124 |
+
"grad_norm": 8.1875,
|
| 12125 |
+
"learning_rate": 0.0004945353681222329,
|
| 12126 |
+
"loss": 16.6638,
|
| 12127 |
+
"step": 17310
|
| 12128 |
+
},
|
| 12129 |
+
{
|
| 12130 |
+
"epoch": 0.03377353983551974,
|
| 12131 |
+
"grad_norm": 6.96875,
|
| 12132 |
+
"learning_rate": 0.0004945321171067783,
|
| 12133 |
+
"loss": 16.5386,
|
| 12134 |
+
"step": 17320
|
| 12135 |
+
},
|
| 12136 |
+
{
|
| 12137 |
+
"epoch": 0.03379303956983586,
|
| 12138 |
+
"grad_norm": 8.0,
|
| 12139 |
+
"learning_rate": 0.0004945288660913236,
|
| 12140 |
+
"loss": 16.6675,
|
| 12141 |
+
"step": 17330
|
| 12142 |
+
},
|
| 12143 |
+
{
|
| 12144 |
+
"epoch": 0.033812539304151984,
|
| 12145 |
+
"grad_norm": 9.0625,
|
| 12146 |
+
"learning_rate": 0.0004945256150758689,
|
| 12147 |
+
"loss": 16.6567,
|
| 12148 |
+
"step": 17340
|
| 12149 |
+
},
|
| 12150 |
+
{
|
| 12151 |
+
"epoch": 0.0338320390384681,
|
| 12152 |
+
"grad_norm": 7.28125,
|
| 12153 |
+
"learning_rate": 0.0004945223640604143,
|
| 12154 |
+
"loss": 16.641,
|
| 12155 |
+
"step": 17350
|
| 12156 |
+
},
|
| 12157 |
+
{
|
| 12158 |
+
"epoch": 0.03385153877278422,
|
| 12159 |
+
"grad_norm": 8.375,
|
| 12160 |
+
"learning_rate": 0.0004945191130449596,
|
| 12161 |
+
"loss": 16.5874,
|
| 12162 |
+
"step": 17360
|
| 12163 |
+
},
|
| 12164 |
+
{
|
| 12165 |
+
"epoch": 0.03387103850710034,
|
| 12166 |
+
"grad_norm": 6.84375,
|
| 12167 |
+
"learning_rate": 0.0004945158620295049,
|
| 12168 |
+
"loss": 16.6083,
|
| 12169 |
+
"step": 17370
|
| 12170 |
+
},
|
| 12171 |
+
{
|
| 12172 |
+
"epoch": 0.03389053824141646,
|
| 12173 |
+
"grad_norm": 7.3125,
|
| 12174 |
+
"learning_rate": 0.0004945126110140502,
|
| 12175 |
+
"loss": 16.5926,
|
| 12176 |
+
"step": 17380
|
| 12177 |
+
},
|
| 12178 |
+
{
|
| 12179 |
+
"epoch": 0.03391003797573258,
|
| 12180 |
+
"grad_norm": 7.5625,
|
| 12181 |
+
"learning_rate": 0.0004945093599985956,
|
| 12182 |
+
"loss": 16.5794,
|
| 12183 |
+
"step": 17390
|
| 12184 |
+
},
|
| 12185 |
+
{
|
| 12186 |
+
"epoch": 0.0339295377100487,
|
| 12187 |
+
"grad_norm": 7.40625,
|
| 12188 |
+
"learning_rate": 0.0004945061089831409,
|
| 12189 |
+
"loss": 16.5721,
|
| 12190 |
+
"step": 17400
|
| 12191 |
+
},
|
| 12192 |
+
{
|
| 12193 |
+
"epoch": 0.03394903744436482,
|
| 12194 |
+
"grad_norm": 6.5,
|
| 12195 |
+
"learning_rate": 0.0004945028579676862,
|
| 12196 |
+
"loss": 16.6443,
|
| 12197 |
+
"step": 17410
|
| 12198 |
+
},
|
| 12199 |
+
{
|
| 12200 |
+
"epoch": 0.03396853717868094,
|
| 12201 |
+
"grad_norm": 6.59375,
|
| 12202 |
+
"learning_rate": 0.0004944996069522316,
|
| 12203 |
+
"loss": 16.6037,
|
| 12204 |
+
"step": 17420
|
| 12205 |
+
},
|
| 12206 |
+
{
|
| 12207 |
+
"epoch": 0.03398803691299706,
|
| 12208 |
+
"grad_norm": 8.5,
|
| 12209 |
+
"learning_rate": 0.0004944963559367769,
|
| 12210 |
+
"loss": 16.5677,
|
| 12211 |
+
"step": 17430
|
| 12212 |
+
},
|
| 12213 |
+
{
|
| 12214 |
+
"epoch": 0.03400753664731318,
|
| 12215 |
+
"grad_norm": 10.1875,
|
| 12216 |
+
"learning_rate": 0.0004944931049213222,
|
| 12217 |
+
"loss": 16.5652,
|
| 12218 |
+
"step": 17440
|
| 12219 |
+
},
|
| 12220 |
+
{
|
| 12221 |
+
"epoch": 0.0340270363816293,
|
| 12222 |
+
"grad_norm": 6.9375,
|
| 12223 |
+
"learning_rate": 0.0004944898539058674,
|
| 12224 |
+
"loss": 16.635,
|
| 12225 |
+
"step": 17450
|
| 12226 |
+
},
|
| 12227 |
+
{
|
| 12228 |
+
"epoch": 0.03404653611594542,
|
| 12229 |
+
"grad_norm": 6.53125,
|
| 12230 |
+
"learning_rate": 0.0004944866028904128,
|
| 12231 |
+
"loss": 16.6619,
|
| 12232 |
+
"step": 17460
|
| 12233 |
+
},
|
| 12234 |
+
{
|
| 12235 |
+
"epoch": 0.03406603585026154,
|
| 12236 |
+
"grad_norm": 6.875,
|
| 12237 |
+
"learning_rate": 0.0004944833518749581,
|
| 12238 |
+
"loss": 16.6537,
|
| 12239 |
+
"step": 17470
|
| 12240 |
+
},
|
| 12241 |
+
{
|
| 12242 |
+
"epoch": 0.03408553558457766,
|
| 12243 |
+
"grad_norm": 7.59375,
|
| 12244 |
+
"learning_rate": 0.0004944801008595034,
|
| 12245 |
+
"loss": 16.6335,
|
| 12246 |
+
"step": 17480
|
| 12247 |
+
},
|
| 12248 |
+
{
|
| 12249 |
+
"epoch": 0.03410503531889378,
|
| 12250 |
+
"grad_norm": 6.6875,
|
| 12251 |
+
"learning_rate": 0.0004944768498440487,
|
| 12252 |
+
"loss": 16.6003,
|
| 12253 |
+
"step": 17490
|
| 12254 |
+
},
|
| 12255 |
+
{
|
| 12256 |
+
"epoch": 0.0341245350532099,
|
| 12257 |
+
"grad_norm": 7.375,
|
| 12258 |
+
"learning_rate": 0.0004944735988285941,
|
| 12259 |
+
"loss": 16.6049,
|
| 12260 |
+
"step": 17500
|
| 12261 |
+
},
|
| 12262 |
+
{
|
| 12263 |
+
"epoch": 0.03414403478752602,
|
| 12264 |
+
"grad_norm": 8.5625,
|
| 12265 |
+
"learning_rate": 0.0004944703478131394,
|
| 12266 |
+
"loss": 16.6644,
|
| 12267 |
+
"step": 17510
|
| 12268 |
+
},
|
| 12269 |
+
{
|
| 12270 |
+
"epoch": 0.03416353452184214,
|
| 12271 |
+
"grad_norm": 7.59375,
|
| 12272 |
+
"learning_rate": 0.0004944670967976847,
|
| 12273 |
+
"loss": 16.5921,
|
| 12274 |
+
"step": 17520
|
| 12275 |
+
},
|
| 12276 |
+
{
|
| 12277 |
+
"epoch": 0.03418303425615826,
|
| 12278 |
+
"grad_norm": 10.5625,
|
| 12279 |
+
"learning_rate": 0.0004944638457822301,
|
| 12280 |
+
"loss": 16.7162,
|
| 12281 |
+
"step": 17530
|
| 12282 |
+
},
|
| 12283 |
+
{
|
| 12284 |
+
"epoch": 0.034202533990474376,
|
| 12285 |
+
"grad_norm": 12.375,
|
| 12286 |
+
"learning_rate": 0.0004944605947667754,
|
| 12287 |
+
"loss": 16.6499,
|
| 12288 |
+
"step": 17540
|
| 12289 |
+
},
|
| 12290 |
+
{
|
| 12291 |
+
"epoch": 0.0342220337247905,
|
| 12292 |
+
"grad_norm": 6.625,
|
| 12293 |
+
"learning_rate": 0.0004944573437513207,
|
| 12294 |
+
"loss": 16.5904,
|
| 12295 |
+
"step": 17550
|
| 12296 |
+
},
|
| 12297 |
+
{
|
| 12298 |
+
"epoch": 0.03424153345910662,
|
| 12299 |
+
"grad_norm": 8.875,
|
| 12300 |
+
"learning_rate": 0.000494454092735866,
|
| 12301 |
+
"loss": 16.5917,
|
| 12302 |
+
"step": 17560
|
| 12303 |
+
},
|
| 12304 |
+
{
|
| 12305 |
+
"epoch": 0.03426103319342274,
|
| 12306 |
+
"grad_norm": 8.25,
|
| 12307 |
+
"learning_rate": 0.0004944508417204114,
|
| 12308 |
+
"loss": 16.5057,
|
| 12309 |
+
"step": 17570
|
| 12310 |
+
},
|
| 12311 |
+
{
|
| 12312 |
+
"epoch": 0.03428053292773886,
|
| 12313 |
+
"grad_norm": 9.875,
|
| 12314 |
+
"learning_rate": 0.0004944475907049567,
|
| 12315 |
+
"loss": 16.4435,
|
| 12316 |
+
"step": 17580
|
| 12317 |
+
},
|
| 12318 |
+
{
|
| 12319 |
+
"epoch": 0.03430003266205498,
|
| 12320 |
+
"grad_norm": 7.03125,
|
| 12321 |
+
"learning_rate": 0.000494444339689502,
|
| 12322 |
+
"loss": 16.5729,
|
| 12323 |
+
"step": 17590
|
| 12324 |
+
},
|
| 12325 |
+
{
|
| 12326 |
+
"epoch": 0.0343195323963711,
|
| 12327 |
+
"grad_norm": 7.8125,
|
| 12328 |
+
"learning_rate": 0.0004944410886740474,
|
| 12329 |
+
"loss": 16.4975,
|
| 12330 |
+
"step": 17600
|
| 12331 |
+
},
|
| 12332 |
+
{
|
| 12333 |
+
"epoch": 0.034339032130687216,
|
| 12334 |
+
"grad_norm": 8.625,
|
| 12335 |
+
"learning_rate": 0.0004944378376585927,
|
| 12336 |
+
"loss": 16.5803,
|
| 12337 |
+
"step": 17610
|
| 12338 |
+
},
|
| 12339 |
+
{
|
| 12340 |
+
"epoch": 0.03435853186500334,
|
| 12341 |
+
"grad_norm": 7.0,
|
| 12342 |
+
"learning_rate": 0.000494434586643138,
|
| 12343 |
+
"loss": 16.6196,
|
| 12344 |
+
"step": 17620
|
| 12345 |
+
},
|
| 12346 |
+
{
|
| 12347 |
+
"epoch": 0.03437803159931946,
|
| 12348 |
+
"grad_norm": 7.34375,
|
| 12349 |
+
"learning_rate": 0.0004944313356276833,
|
| 12350 |
+
"loss": 16.5954,
|
| 12351 |
+
"step": 17630
|
| 12352 |
+
},
|
| 12353 |
+
{
|
| 12354 |
+
"epoch": 0.03439753133363558,
|
| 12355 |
+
"grad_norm": 6.71875,
|
| 12356 |
+
"learning_rate": 0.0004944280846122287,
|
| 12357 |
+
"loss": 16.6301,
|
| 12358 |
+
"step": 17640
|
| 12359 |
+
},
|
| 12360 |
+
{
|
| 12361 |
+
"epoch": 0.0344170310679517,
|
| 12362 |
+
"grad_norm": 8.3125,
|
| 12363 |
+
"learning_rate": 0.000494424833596774,
|
| 12364 |
+
"loss": 16.5994,
|
| 12365 |
+
"step": 17650
|
| 12366 |
+
},
|
| 12367 |
+
{
|
| 12368 |
+
"epoch": 0.03443653080226782,
|
| 12369 |
+
"grad_norm": 7.78125,
|
| 12370 |
+
"learning_rate": 0.0004944215825813193,
|
| 12371 |
+
"loss": 16.584,
|
| 12372 |
+
"step": 17660
|
| 12373 |
+
},
|
| 12374 |
+
{
|
| 12375 |
+
"epoch": 0.034456030536583936,
|
| 12376 |
+
"grad_norm": 8.8125,
|
| 12377 |
+
"learning_rate": 0.0004944183315658647,
|
| 12378 |
+
"loss": 16.5309,
|
| 12379 |
+
"step": 17670
|
| 12380 |
+
},
|
| 12381 |
+
{
|
| 12382 |
+
"epoch": 0.03447553027090006,
|
| 12383 |
+
"grad_norm": 6.90625,
|
| 12384 |
+
"learning_rate": 0.00049441508055041,
|
| 12385 |
+
"loss": 16.6363,
|
| 12386 |
+
"step": 17680
|
| 12387 |
+
},
|
| 12388 |
+
{
|
| 12389 |
+
"epoch": 0.03449503000521618,
|
| 12390 |
+
"grad_norm": 6.65625,
|
| 12391 |
+
"learning_rate": 0.0004944118295349552,
|
| 12392 |
+
"loss": 16.6538,
|
| 12393 |
+
"step": 17690
|
| 12394 |
+
},
|
| 12395 |
+
{
|
| 12396 |
+
"epoch": 0.0345145297395323,
|
| 12397 |
+
"grad_norm": 58.0,
|
| 12398 |
+
"learning_rate": 0.0004944085785195005,
|
| 12399 |
+
"loss": 16.5892,
|
| 12400 |
+
"step": 17700
|
| 12401 |
+
},
|
| 12402 |
+
{
|
| 12403 |
+
"epoch": 0.03453402947384842,
|
| 12404 |
+
"grad_norm": 9.0,
|
| 12405 |
+
"learning_rate": 0.0004944053275040459,
|
| 12406 |
+
"loss": 16.5848,
|
| 12407 |
+
"step": 17710
|
| 12408 |
+
},
|
| 12409 |
+
{
|
| 12410 |
+
"epoch": 0.03455352920816454,
|
| 12411 |
+
"grad_norm": 7.15625,
|
| 12412 |
+
"learning_rate": 0.0004944020764885912,
|
| 12413 |
+
"loss": 16.5623,
|
| 12414 |
+
"step": 17720
|
| 12415 |
+
},
|
| 12416 |
+
{
|
| 12417 |
+
"epoch": 0.03457302894248066,
|
| 12418 |
+
"grad_norm": 8.5,
|
| 12419 |
+
"learning_rate": 0.0004943988254731365,
|
| 12420 |
+
"loss": 16.6047,
|
| 12421 |
+
"step": 17730
|
| 12422 |
+
},
|
| 12423 |
+
{
|
| 12424 |
+
"epoch": 0.034592528676796776,
|
| 12425 |
+
"grad_norm": 6.9375,
|
| 12426 |
+
"learning_rate": 0.0004943955744576818,
|
| 12427 |
+
"loss": 16.6139,
|
| 12428 |
+
"step": 17740
|
| 12429 |
+
},
|
| 12430 |
+
{
|
| 12431 |
+
"epoch": 0.0346120284111129,
|
| 12432 |
+
"grad_norm": 9.25,
|
| 12433 |
+
"learning_rate": 0.0004943923234422272,
|
| 12434 |
+
"loss": 16.602,
|
| 12435 |
+
"step": 17750
|
| 12436 |
+
},
|
| 12437 |
+
{
|
| 12438 |
+
"epoch": 0.03463152814542902,
|
| 12439 |
+
"grad_norm": 11.625,
|
| 12440 |
+
"learning_rate": 0.0004943890724267725,
|
| 12441 |
+
"loss": 16.5514,
|
| 12442 |
+
"step": 17760
|
| 12443 |
+
},
|
| 12444 |
+
{
|
| 12445 |
+
"epoch": 0.03465102787974514,
|
| 12446 |
+
"grad_norm": 8.0625,
|
| 12447 |
+
"learning_rate": 0.0004943858214113178,
|
| 12448 |
+
"loss": 16.5125,
|
| 12449 |
+
"step": 17770
|
| 12450 |
+
},
|
| 12451 |
+
{
|
| 12452 |
+
"epoch": 0.03467052761406126,
|
| 12453 |
+
"grad_norm": 7.0,
|
| 12454 |
+
"learning_rate": 0.0004943825703958632,
|
| 12455 |
+
"loss": 16.575,
|
| 12456 |
+
"step": 17780
|
| 12457 |
+
},
|
| 12458 |
+
{
|
| 12459 |
+
"epoch": 0.03469002734837738,
|
| 12460 |
+
"grad_norm": 8.125,
|
| 12461 |
+
"learning_rate": 0.0004943793193804085,
|
| 12462 |
+
"loss": 16.6736,
|
| 12463 |
+
"step": 17790
|
| 12464 |
+
},
|
| 12465 |
+
{
|
| 12466 |
+
"epoch": 0.034709527082693495,
|
| 12467 |
+
"grad_norm": 12.875,
|
| 12468 |
+
"learning_rate": 0.0004943760683649538,
|
| 12469 |
+
"loss": 16.6935,
|
| 12470 |
+
"step": 17800
|
| 12471 |
+
},
|
| 12472 |
+
{
|
| 12473 |
+
"epoch": 0.034729026817009616,
|
| 12474 |
+
"grad_norm": 9.4375,
|
| 12475 |
+
"learning_rate": 0.0004943728173494991,
|
| 12476 |
+
"loss": 16.5395,
|
| 12477 |
+
"step": 17810
|
| 12478 |
+
},
|
| 12479 |
+
{
|
| 12480 |
+
"epoch": 0.03474852655132574,
|
| 12481 |
+
"grad_norm": 7.25,
|
| 12482 |
+
"learning_rate": 0.0004943695663340445,
|
| 12483 |
+
"loss": 16.4912,
|
| 12484 |
+
"step": 17820
|
| 12485 |
+
},
|
| 12486 |
+
{
|
| 12487 |
+
"epoch": 0.03476802628564186,
|
| 12488 |
+
"grad_norm": 6.46875,
|
| 12489 |
+
"learning_rate": 0.0004943663153185898,
|
| 12490 |
+
"loss": 16.5062,
|
| 12491 |
+
"step": 17830
|
| 12492 |
+
},
|
| 12493 |
+
{
|
| 12494 |
+
"epoch": 0.03478752601995798,
|
| 12495 |
+
"grad_norm": 6.5,
|
| 12496 |
+
"learning_rate": 0.0004943630643031351,
|
| 12497 |
+
"loss": 16.5508,
|
| 12498 |
+
"step": 17840
|
| 12499 |
+
},
|
| 12500 |
+
{
|
| 12501 |
+
"epoch": 0.0348070257542741,
|
| 12502 |
+
"grad_norm": 9.375,
|
| 12503 |
+
"learning_rate": 0.0004943598132876805,
|
| 12504 |
+
"loss": 16.7509,
|
| 12505 |
+
"step": 17850
|
| 12506 |
+
},
|
| 12507 |
+
{
|
| 12508 |
+
"epoch": 0.03482652548859022,
|
| 12509 |
+
"grad_norm": 8.3125,
|
| 12510 |
+
"learning_rate": 0.0004943565622722258,
|
| 12511 |
+
"loss": 16.6271,
|
| 12512 |
+
"step": 17860
|
| 12513 |
+
},
|
| 12514 |
+
{
|
| 12515 |
+
"epoch": 0.034846025222906335,
|
| 12516 |
+
"grad_norm": 8.6875,
|
| 12517 |
+
"learning_rate": 0.0004943533112567711,
|
| 12518 |
+
"loss": 16.5867,
|
| 12519 |
+
"step": 17870
|
| 12520 |
+
},
|
| 12521 |
+
{
|
| 12522 |
+
"epoch": 0.034865524957222456,
|
| 12523 |
+
"grad_norm": 7.4375,
|
| 12524 |
+
"learning_rate": 0.0004943500602413164,
|
| 12525 |
+
"loss": 16.6566,
|
| 12526 |
+
"step": 17880
|
| 12527 |
+
},
|
| 12528 |
+
{
|
| 12529 |
+
"epoch": 0.03488502469153858,
|
| 12530 |
+
"grad_norm": 7.53125,
|
| 12531 |
+
"learning_rate": 0.0004943468092258618,
|
| 12532 |
+
"loss": 16.573,
|
| 12533 |
+
"step": 17890
|
| 12534 |
+
},
|
| 12535 |
+
{
|
| 12536 |
+
"epoch": 0.0349045244258547,
|
| 12537 |
+
"grad_norm": 7.21875,
|
| 12538 |
+
"learning_rate": 0.0004943435582104071,
|
| 12539 |
+
"loss": 16.6032,
|
| 12540 |
+
"step": 17900
|
| 12541 |
+
},
|
| 12542 |
+
{
|
| 12543 |
+
"epoch": 0.03492402416017082,
|
| 12544 |
+
"grad_norm": 7.6875,
|
| 12545 |
+
"learning_rate": 0.0004943403071949523,
|
| 12546 |
+
"loss": 16.6089,
|
| 12547 |
+
"step": 17910
|
| 12548 |
+
},
|
| 12549 |
+
{
|
| 12550 |
+
"epoch": 0.03494352389448694,
|
| 12551 |
+
"grad_norm": 7.46875,
|
| 12552 |
+
"learning_rate": 0.0004943370561794977,
|
| 12553 |
+
"loss": 16.6048,
|
| 12554 |
+
"step": 17920
|
| 12555 |
+
},
|
| 12556 |
+
{
|
| 12557 |
+
"epoch": 0.03496302362880306,
|
| 12558 |
+
"grad_norm": 7.78125,
|
| 12559 |
+
"learning_rate": 0.000494333805164043,
|
| 12560 |
+
"loss": 16.608,
|
| 12561 |
+
"step": 17930
|
| 12562 |
+
},
|
| 12563 |
+
{
|
| 12564 |
+
"epoch": 0.034982523363119175,
|
| 12565 |
+
"grad_norm": 8.0,
|
| 12566 |
+
"learning_rate": 0.0004943305541485883,
|
| 12567 |
+
"loss": 16.6105,
|
| 12568 |
+
"step": 17940
|
| 12569 |
+
},
|
| 12570 |
+
{
|
| 12571 |
+
"epoch": 0.035002023097435296,
|
| 12572 |
+
"grad_norm": 8.0,
|
| 12573 |
+
"learning_rate": 0.0004943273031331336,
|
| 12574 |
+
"loss": 16.5891,
|
| 12575 |
+
"step": 17950
|
| 12576 |
+
},
|
| 12577 |
+
{
|
| 12578 |
+
"epoch": 0.03502152283175142,
|
| 12579 |
+
"grad_norm": 9.1875,
|
| 12580 |
+
"learning_rate": 0.000494324052117679,
|
| 12581 |
+
"loss": 16.5736,
|
| 12582 |
+
"step": 17960
|
| 12583 |
+
},
|
| 12584 |
+
{
|
| 12585 |
+
"epoch": 0.03504102256606754,
|
| 12586 |
+
"grad_norm": 7.5,
|
| 12587 |
+
"learning_rate": 0.0004943208011022243,
|
| 12588 |
+
"loss": 16.6407,
|
| 12589 |
+
"step": 17970
|
| 12590 |
+
},
|
| 12591 |
+
{
|
| 12592 |
+
"epoch": 0.03506052230038366,
|
| 12593 |
+
"grad_norm": 9.625,
|
| 12594 |
+
"learning_rate": 0.0004943175500867696,
|
| 12595 |
+
"loss": 16.5699,
|
| 12596 |
+
"step": 17980
|
| 12597 |
+
},
|
| 12598 |
+
{
|
| 12599 |
+
"epoch": 0.03508002203469978,
|
| 12600 |
+
"grad_norm": 6.875,
|
| 12601 |
+
"learning_rate": 0.000494314299071315,
|
| 12602 |
+
"loss": 16.5092,
|
| 12603 |
+
"step": 17990
|
| 12604 |
+
},
|
| 12605 |
+
{
|
| 12606 |
+
"epoch": 0.035099521769015894,
|
| 12607 |
+
"grad_norm": 27.625,
|
| 12608 |
+
"learning_rate": 0.0004943110480558603,
|
| 12609 |
+
"loss": 16.482,
|
| 12610 |
+
"step": 18000
|
| 12611 |
}
|
| 12612 |
],
|
| 12613 |
"logging_steps": 10,
|
|
|
|
| 12627 |
"attributes": {}
|
| 12628 |
}
|
| 12629 |
},
|
| 12630 |
+
"total_flos": 4.004838781385651e+19,
|
| 12631 |
"train_batch_size": 48,
|
| 12632 |
"trial_name": null,
|
| 12633 |
"trial_params": null
|