Training in progress, step 50000
Browse files- optimizer.pt +1 -1
- rng_state_0.pth +1 -1
- rng_state_1.pth +1 -1
- runs/Jun07_20-36-29_829f6f605e43/events.out.tfevents.1717792718.829f6f605e43.85.0 +2 -2
- scheduler.pt +1 -1
- trainer_state.json +1762 -3
optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 11230198
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1875f0dc76db8717f7a2ebbdf55b28259425b5ec418cf9a61e01ccfc78df1b62
|
| 3 |
size 11230198
|
rng_state_0.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14512
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d5aeb0c54903210b6bb77aabf8f4802e4126d4bae40ff815b9d0b63767286cff
|
| 3 |
size 14512
|
rng_state_1.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14512
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2087fa1159897fc8e7870700fdb75275c4b88dbf7d3cd02c5397018e197c58f1
|
| 3 |
size 14512
|
runs/Jun07_20-36-29_829f6f605e43/events.out.tfevents.1717792718.829f6f605e43.85.0
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:160f7656dc8773a4a4cfb7600f73292b3ce736a95308ca1b31e7de2232bdb074
|
| 3 |
+
size 542274
|
scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:73ca23880fe7eefec74f8b2319f74a91041ac4fc69eab046e71d83c57c8c783c
|
| 3 |
size 1064
|
trainer_state.json
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
{
|
| 2 |
"best_metric": null,
|
| 3 |
"best_model_checkpoint": null,
|
| 4 |
-
"epoch": 0.
|
| 5 |
"eval_steps": 5000,
|
| 6 |
-
"global_step":
|
| 7 |
"is_hyper_param_search": false,
|
| 8 |
"is_local_process_zero": true,
|
| 9 |
"is_world_process_zero": true,
|
|
@@ -15838,6 +15838,1765 @@
|
|
| 15838 |
"eval_samples_per_second": 3575.081,
|
| 15839 |
"eval_steps_per_second": 3.494,
|
| 15840 |
"step": 45000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15841 |
}
|
| 15842 |
],
|
| 15843 |
"logging_steps": 20,
|
|
@@ -15857,7 +17616,7 @@
|
|
| 15857 |
"attributes": {}
|
| 15858 |
}
|
| 15859 |
},
|
| 15860 |
-
"total_flos":
|
| 15861 |
"train_batch_size": 512,
|
| 15862 |
"trial_name": null,
|
| 15863 |
"trial_params": null
|
|
|
|
| 1 |
{
|
| 2 |
"best_metric": null,
|
| 3 |
"best_model_checkpoint": null,
|
| 4 |
+
"epoch": 0.904895484571532,
|
| 5 |
"eval_steps": 5000,
|
| 6 |
+
"global_step": 50000,
|
| 7 |
"is_hyper_param_search": false,
|
| 8 |
"is_local_process_zero": true,
|
| 9 |
"is_world_process_zero": true,
|
|
|
|
| 15838 |
"eval_samples_per_second": 3575.081,
|
| 15839 |
"eval_steps_per_second": 3.494,
|
| 15840 |
"step": 45000
|
| 15841 |
+
},
|
| 15842 |
+
{
|
| 15843 |
+
"epoch": 0.8147678943082074,
|
| 15844 |
+
"grad_norm": NaN,
|
| 15845 |
+
"learning_rate": 0.0003422030524220305,
|
| 15846 |
+
"loss": 1.1013,
|
| 15847 |
+
"step": 45020
|
| 15848 |
+
},
|
| 15849 |
+
{
|
| 15850 |
+
"epoch": 0.8151298525020361,
|
| 15851 |
+
"grad_norm": NaN,
|
| 15852 |
+
"learning_rate": 0.0003422030524220305,
|
| 15853 |
+
"loss": 0.0,
|
| 15854 |
+
"step": 45040
|
| 15855 |
+
},
|
| 15856 |
+
{
|
| 15857 |
+
"epoch": 0.8154918106958646,
|
| 15858 |
+
"grad_norm": NaN,
|
| 15859 |
+
"learning_rate": 0.0003422030524220305,
|
| 15860 |
+
"loss": 1.0811,
|
| 15861 |
+
"step": 45060
|
| 15862 |
+
},
|
| 15863 |
+
{
|
| 15864 |
+
"epoch": 0.8158537688896932,
|
| 15865 |
+
"grad_norm": NaN,
|
| 15866 |
+
"learning_rate": 0.0003422030524220305,
|
| 15867 |
+
"loss": 0.5771,
|
| 15868 |
+
"step": 45080
|
| 15869 |
+
},
|
| 15870 |
+
{
|
| 15871 |
+
"epoch": 0.8162157270835219,
|
| 15872 |
+
"grad_norm": NaN,
|
| 15873 |
+
"learning_rate": 0.0003422030524220305,
|
| 15874 |
+
"loss": 1.2551,
|
| 15875 |
+
"step": 45100
|
| 15876 |
+
},
|
| 15877 |
+
{
|
| 15878 |
+
"epoch": 0.8165776852773504,
|
| 15879 |
+
"grad_norm": NaN,
|
| 15880 |
+
"learning_rate": 0.0003422573445134825,
|
| 15881 |
+
"loss": 1.5968,
|
| 15882 |
+
"step": 45120
|
| 15883 |
+
},
|
| 15884 |
+
{
|
| 15885 |
+
"epoch": 0.8169396434711791,
|
| 15886 |
+
"grad_norm": NaN,
|
| 15887 |
+
"learning_rate": 0.0003422573445134825,
|
| 15888 |
+
"loss": 0.6273,
|
| 15889 |
+
"step": 45140
|
| 15890 |
+
},
|
| 15891 |
+
{
|
| 15892 |
+
"epoch": 0.8173016016650076,
|
| 15893 |
+
"grad_norm": NaN,
|
| 15894 |
+
"learning_rate": 0.00034231163660493455,
|
| 15895 |
+
"loss": 1.762,
|
| 15896 |
+
"step": 45160
|
| 15897 |
+
},
|
| 15898 |
+
{
|
| 15899 |
+
"epoch": 0.8176635598588363,
|
| 15900 |
+
"grad_norm": NaN,
|
| 15901 |
+
"learning_rate": 0.00034231163660493455,
|
| 15902 |
+
"loss": 0.7482,
|
| 15903 |
+
"step": 45180
|
| 15904 |
+
},
|
| 15905 |
+
{
|
| 15906 |
+
"epoch": 0.8180255180526649,
|
| 15907 |
+
"grad_norm": NaN,
|
| 15908 |
+
"learning_rate": 0.00034231163660493455,
|
| 15909 |
+
"loss": 0.6348,
|
| 15910 |
+
"step": 45200
|
| 15911 |
+
},
|
| 15912 |
+
{
|
| 15913 |
+
"epoch": 0.8183874762464936,
|
| 15914 |
+
"grad_norm": NaN,
|
| 15915 |
+
"learning_rate": 0.00034231163660493455,
|
| 15916 |
+
"loss": 2.9022,
|
| 15917 |
+
"step": 45220
|
| 15918 |
+
},
|
| 15919 |
+
{
|
| 15920 |
+
"epoch": 0.8187494344403221,
|
| 15921 |
+
"grad_norm": NaN,
|
| 15922 |
+
"learning_rate": 0.00034231163660493455,
|
| 15923 |
+
"loss": 3.9902,
|
| 15924 |
+
"step": 45240
|
| 15925 |
+
},
|
| 15926 |
+
{
|
| 15927 |
+
"epoch": 0.8191113926341508,
|
| 15928 |
+
"grad_norm": NaN,
|
| 15929 |
+
"learning_rate": 0.00034231163660493455,
|
| 15930 |
+
"loss": 1.3003,
|
| 15931 |
+
"step": 45260
|
| 15932 |
+
},
|
| 15933 |
+
{
|
| 15934 |
+
"epoch": 0.8194733508279793,
|
| 15935 |
+
"grad_norm": NaN,
|
| 15936 |
+
"learning_rate": 0.00034231163660493455,
|
| 15937 |
+
"loss": 0.7519,
|
| 15938 |
+
"step": 45280
|
| 15939 |
+
},
|
| 15940 |
+
{
|
| 15941 |
+
"epoch": 0.819835309021808,
|
| 15942 |
+
"grad_norm": NaN,
|
| 15943 |
+
"learning_rate": 0.00034231163660493455,
|
| 15944 |
+
"loss": 3.7594,
|
| 15945 |
+
"step": 45300
|
| 15946 |
+
},
|
| 15947 |
+
{
|
| 15948 |
+
"epoch": 0.8201972672156366,
|
| 15949 |
+
"grad_norm": NaN,
|
| 15950 |
+
"learning_rate": 0.00034231163660493455,
|
| 15951 |
+
"loss": 0.5259,
|
| 15952 |
+
"step": 45320
|
| 15953 |
+
},
|
| 15954 |
+
{
|
| 15955 |
+
"epoch": 0.8205592254094652,
|
| 15956 |
+
"grad_norm": NaN,
|
| 15957 |
+
"learning_rate": 0.0003423659286963865,
|
| 15958 |
+
"loss": 1.3492,
|
| 15959 |
+
"step": 45340
|
| 15960 |
+
},
|
| 15961 |
+
{
|
| 15962 |
+
"epoch": 0.8209211836032938,
|
| 15963 |
+
"grad_norm": NaN,
|
| 15964 |
+
"learning_rate": 0.0003423659286963865,
|
| 15965 |
+
"loss": 1.8379,
|
| 15966 |
+
"step": 45360
|
| 15967 |
+
},
|
| 15968 |
+
{
|
| 15969 |
+
"epoch": 0.8212831417971225,
|
| 15970 |
+
"grad_norm": NaN,
|
| 15971 |
+
"learning_rate": 0.0003423659286963865,
|
| 15972 |
+
"loss": 0.9331,
|
| 15973 |
+
"step": 45380
|
| 15974 |
+
},
|
| 15975 |
+
{
|
| 15976 |
+
"epoch": 0.821645099990951,
|
| 15977 |
+
"grad_norm": NaN,
|
| 15978 |
+
"learning_rate": 0.0003423659286963865,
|
| 15979 |
+
"loss": 1.242,
|
| 15980 |
+
"step": 45400
|
| 15981 |
+
},
|
| 15982 |
+
{
|
| 15983 |
+
"epoch": 0.8220070581847797,
|
| 15984 |
+
"grad_norm": NaN,
|
| 15985 |
+
"learning_rate": 0.0003423659286963865,
|
| 15986 |
+
"loss": 1.2382,
|
| 15987 |
+
"step": 45420
|
| 15988 |
+
},
|
| 15989 |
+
{
|
| 15990 |
+
"epoch": 0.8223690163786083,
|
| 15991 |
+
"grad_norm": NaN,
|
| 15992 |
+
"learning_rate": 0.0003423659286963865,
|
| 15993 |
+
"loss": 3.0546,
|
| 15994 |
+
"step": 45440
|
| 15995 |
+
},
|
| 15996 |
+
{
|
| 15997 |
+
"epoch": 0.8227309745724369,
|
| 15998 |
+
"grad_norm": NaN,
|
| 15999 |
+
"learning_rate": 0.0003423659286963865,
|
| 16000 |
+
"loss": 1.1098,
|
| 16001 |
+
"step": 45460
|
| 16002 |
+
},
|
| 16003 |
+
{
|
| 16004 |
+
"epoch": 0.8230929327662655,
|
| 16005 |
+
"grad_norm": NaN,
|
| 16006 |
+
"learning_rate": 0.0003423659286963865,
|
| 16007 |
+
"loss": 4.5014,
|
| 16008 |
+
"step": 45480
|
| 16009 |
+
},
|
| 16010 |
+
{
|
| 16011 |
+
"epoch": 0.823454890960094,
|
| 16012 |
+
"grad_norm": NaN,
|
| 16013 |
+
"learning_rate": 0.0003423659286963865,
|
| 16014 |
+
"loss": 0.8949,
|
| 16015 |
+
"step": 45500
|
| 16016 |
+
},
|
| 16017 |
+
{
|
| 16018 |
+
"epoch": 0.8238168491539227,
|
| 16019 |
+
"grad_norm": NaN,
|
| 16020 |
+
"learning_rate": 0.0003423659286963865,
|
| 16021 |
+
"loss": 3.0434,
|
| 16022 |
+
"step": 45520
|
| 16023 |
+
},
|
| 16024 |
+
{
|
| 16025 |
+
"epoch": 0.8241788073477513,
|
| 16026 |
+
"grad_norm": NaN,
|
| 16027 |
+
"learning_rate": 0.0003423659286963865,
|
| 16028 |
+
"loss": 0.8785,
|
| 16029 |
+
"step": 45540
|
| 16030 |
+
},
|
| 16031 |
+
{
|
| 16032 |
+
"epoch": 0.82454076554158,
|
| 16033 |
+
"grad_norm": NaN,
|
| 16034 |
+
"learning_rate": 0.0003423659286963865,
|
| 16035 |
+
"loss": 3.6592,
|
| 16036 |
+
"step": 45560
|
| 16037 |
+
},
|
| 16038 |
+
{
|
| 16039 |
+
"epoch": 0.8249027237354085,
|
| 16040 |
+
"grad_norm": NaN,
|
| 16041 |
+
"learning_rate": 0.00034242022078783855,
|
| 16042 |
+
"loss": 3.3117,
|
| 16043 |
+
"step": 45580
|
| 16044 |
+
},
|
| 16045 |
+
{
|
| 16046 |
+
"epoch": 0.8252646819292372,
|
| 16047 |
+
"grad_norm": NaN,
|
| 16048 |
+
"learning_rate": 0.00034242022078783855,
|
| 16049 |
+
"loss": 0.6415,
|
| 16050 |
+
"step": 45600
|
| 16051 |
+
},
|
| 16052 |
+
{
|
| 16053 |
+
"epoch": 0.8256266401230657,
|
| 16054 |
+
"grad_norm": NaN,
|
| 16055 |
+
"learning_rate": 0.00034242022078783855,
|
| 16056 |
+
"loss": 0.855,
|
| 16057 |
+
"step": 45620
|
| 16058 |
+
},
|
| 16059 |
+
{
|
| 16060 |
+
"epoch": 0.8259885983168944,
|
| 16061 |
+
"grad_norm": NaN,
|
| 16062 |
+
"learning_rate": 0.00034242022078783855,
|
| 16063 |
+
"loss": 5.2258,
|
| 16064 |
+
"step": 45640
|
| 16065 |
+
},
|
| 16066 |
+
{
|
| 16067 |
+
"epoch": 0.826350556510723,
|
| 16068 |
+
"grad_norm": NaN,
|
| 16069 |
+
"learning_rate": 0.0003424745128792906,
|
| 16070 |
+
"loss": 1.9846,
|
| 16071 |
+
"step": 45660
|
| 16072 |
+
},
|
| 16073 |
+
{
|
| 16074 |
+
"epoch": 0.8267125147045516,
|
| 16075 |
+
"grad_norm": NaN,
|
| 16076 |
+
"learning_rate": 0.0003424745128792906,
|
| 16077 |
+
"loss": 0.6199,
|
| 16078 |
+
"step": 45680
|
| 16079 |
+
},
|
| 16080 |
+
{
|
| 16081 |
+
"epoch": 0.8270744728983802,
|
| 16082 |
+
"grad_norm": NaN,
|
| 16083 |
+
"learning_rate": 0.0003424745128792906,
|
| 16084 |
+
"loss": 1.4499,
|
| 16085 |
+
"step": 45700
|
| 16086 |
+
},
|
| 16087 |
+
{
|
| 16088 |
+
"epoch": 0.8274364310922089,
|
| 16089 |
+
"grad_norm": NaN,
|
| 16090 |
+
"learning_rate": 0.0003424745128792906,
|
| 16091 |
+
"loss": 0.9893,
|
| 16092 |
+
"step": 45720
|
| 16093 |
+
},
|
| 16094 |
+
{
|
| 16095 |
+
"epoch": 0.8277983892860374,
|
| 16096 |
+
"grad_norm": NaN,
|
| 16097 |
+
"learning_rate": 0.0003424745128792906,
|
| 16098 |
+
"loss": 1.379,
|
| 16099 |
+
"step": 45740
|
| 16100 |
+
},
|
| 16101 |
+
{
|
| 16102 |
+
"epoch": 0.8281603474798661,
|
| 16103 |
+
"grad_norm": NaN,
|
| 16104 |
+
"learning_rate": 0.0003424745128792906,
|
| 16105 |
+
"loss": 2.8108,
|
| 16106 |
+
"step": 45760
|
| 16107 |
+
},
|
| 16108 |
+
{
|
| 16109 |
+
"epoch": 0.8285223056736947,
|
| 16110 |
+
"grad_norm": NaN,
|
| 16111 |
+
"learning_rate": 0.0003424745128792906,
|
| 16112 |
+
"loss": 3.075,
|
| 16113 |
+
"step": 45780
|
| 16114 |
+
},
|
| 16115 |
+
{
|
| 16116 |
+
"epoch": 0.8288842638675233,
|
| 16117 |
+
"grad_norm": NaN,
|
| 16118 |
+
"learning_rate": 0.0003424745128792906,
|
| 16119 |
+
"loss": 3.9405,
|
| 16120 |
+
"step": 45800
|
| 16121 |
+
},
|
| 16122 |
+
{
|
| 16123 |
+
"epoch": 0.8292462220613519,
|
| 16124 |
+
"grad_norm": NaN,
|
| 16125 |
+
"learning_rate": 0.0003424745128792906,
|
| 16126 |
+
"loss": 3.0244,
|
| 16127 |
+
"step": 45820
|
| 16128 |
+
},
|
| 16129 |
+
{
|
| 16130 |
+
"epoch": 0.8296081802551806,
|
| 16131 |
+
"grad_norm": NaN,
|
| 16132 |
+
"learning_rate": 0.0003424745128792906,
|
| 16133 |
+
"loss": 0.9193,
|
| 16134 |
+
"step": 45840
|
| 16135 |
+
},
|
| 16136 |
+
{
|
| 16137 |
+
"epoch": 0.8299701384490091,
|
| 16138 |
+
"grad_norm": NaN,
|
| 16139 |
+
"learning_rate": 0.0003424745128792906,
|
| 16140 |
+
"loss": 2.2784,
|
| 16141 |
+
"step": 45860
|
| 16142 |
+
},
|
| 16143 |
+
{
|
| 16144 |
+
"epoch": 0.8303320966428378,
|
| 16145 |
+
"grad_norm": NaN,
|
| 16146 |
+
"learning_rate": 0.0003425830970621946,
|
| 16147 |
+
"loss": 5.6385,
|
| 16148 |
+
"step": 45880
|
| 16149 |
+
},
|
| 16150 |
+
{
|
| 16151 |
+
"epoch": 0.8306940548366664,
|
| 16152 |
+
"grad_norm": NaN,
|
| 16153 |
+
"learning_rate": 0.0003425830970621946,
|
| 16154 |
+
"loss": 0.6865,
|
| 16155 |
+
"step": 45900
|
| 16156 |
+
},
|
| 16157 |
+
{
|
| 16158 |
+
"epoch": 0.831056013030495,
|
| 16159 |
+
"grad_norm": NaN,
|
| 16160 |
+
"learning_rate": 0.0003425830970621946,
|
| 16161 |
+
"loss": 0.6382,
|
| 16162 |
+
"step": 45920
|
| 16163 |
+
},
|
| 16164 |
+
{
|
| 16165 |
+
"epoch": 0.8314179712243236,
|
| 16166 |
+
"grad_norm": NaN,
|
| 16167 |
+
"learning_rate": 0.0003425830970621946,
|
| 16168 |
+
"loss": 2.9461,
|
| 16169 |
+
"step": 45940
|
| 16170 |
+
},
|
| 16171 |
+
{
|
| 16172 |
+
"epoch": 0.8317799294181522,
|
| 16173 |
+
"grad_norm": NaN,
|
| 16174 |
+
"learning_rate": 0.0003425830970621946,
|
| 16175 |
+
"loss": 0.5432,
|
| 16176 |
+
"step": 45960
|
| 16177 |
+
},
|
| 16178 |
+
{
|
| 16179 |
+
"epoch": 0.8321418876119808,
|
| 16180 |
+
"grad_norm": NaN,
|
| 16181 |
+
"learning_rate": 0.0003425830970621946,
|
| 16182 |
+
"loss": 0.3098,
|
| 16183 |
+
"step": 45980
|
| 16184 |
+
},
|
| 16185 |
+
{
|
| 16186 |
+
"epoch": 0.8325038458058094,
|
| 16187 |
+
"grad_norm": NaN,
|
| 16188 |
+
"learning_rate": 0.0003425830970621946,
|
| 16189 |
+
"loss": 0.7931,
|
| 16190 |
+
"step": 46000
|
| 16191 |
+
},
|
| 16192 |
+
{
|
| 16193 |
+
"epoch": 0.832865803999638,
|
| 16194 |
+
"grad_norm": NaN,
|
| 16195 |
+
"learning_rate": 0.0003425830970621946,
|
| 16196 |
+
"loss": 1.3182,
|
| 16197 |
+
"step": 46020
|
| 16198 |
+
},
|
| 16199 |
+
{
|
| 16200 |
+
"epoch": 0.8332277621934666,
|
| 16201 |
+
"grad_norm": NaN,
|
| 16202 |
+
"learning_rate": 0.0003425830970621946,
|
| 16203 |
+
"loss": 0.883,
|
| 16204 |
+
"step": 46040
|
| 16205 |
+
},
|
| 16206 |
+
{
|
| 16207 |
+
"epoch": 0.8335897203872953,
|
| 16208 |
+
"grad_norm": NaN,
|
| 16209 |
+
"learning_rate": 0.0003425830970621946,
|
| 16210 |
+
"loss": 0.8214,
|
| 16211 |
+
"step": 46060
|
| 16212 |
+
},
|
| 16213 |
+
{
|
| 16214 |
+
"epoch": 0.8339516785811238,
|
| 16215 |
+
"grad_norm": NaN,
|
| 16216 |
+
"learning_rate": 0.0003425830970621946,
|
| 16217 |
+
"loss": 1.4961,
|
| 16218 |
+
"step": 46080
|
| 16219 |
+
},
|
| 16220 |
+
{
|
| 16221 |
+
"epoch": 0.8343136367749525,
|
| 16222 |
+
"grad_norm": NaN,
|
| 16223 |
+
"learning_rate": 0.0003425830970621946,
|
| 16224 |
+
"loss": 1.9674,
|
| 16225 |
+
"step": 46100
|
| 16226 |
+
},
|
| 16227 |
+
{
|
| 16228 |
+
"epoch": 0.8346755949687811,
|
| 16229 |
+
"grad_norm": NaN,
|
| 16230 |
+
"learning_rate": 0.0003426373891536466,
|
| 16231 |
+
"loss": 2.7082,
|
| 16232 |
+
"step": 46120
|
| 16233 |
+
},
|
| 16234 |
+
{
|
| 16235 |
+
"epoch": 0.8350375531626097,
|
| 16236 |
+
"grad_norm": NaN,
|
| 16237 |
+
"learning_rate": 0.0003426373891536466,
|
| 16238 |
+
"loss": 4.7325,
|
| 16239 |
+
"step": 46140
|
| 16240 |
+
},
|
| 16241 |
+
{
|
| 16242 |
+
"epoch": 0.8353995113564383,
|
| 16243 |
+
"grad_norm": NaN,
|
| 16244 |
+
"learning_rate": 0.0003426373891536466,
|
| 16245 |
+
"loss": 3.6999,
|
| 16246 |
+
"step": 46160
|
| 16247 |
+
},
|
| 16248 |
+
{
|
| 16249 |
+
"epoch": 0.835761469550267,
|
| 16250 |
+
"grad_norm": NaN,
|
| 16251 |
+
"learning_rate": 0.0003426373891536466,
|
| 16252 |
+
"loss": 1.4293,
|
| 16253 |
+
"step": 46180
|
| 16254 |
+
},
|
| 16255 |
+
{
|
| 16256 |
+
"epoch": 0.8361234277440955,
|
| 16257 |
+
"grad_norm": NaN,
|
| 16258 |
+
"learning_rate": 0.0003426373891536466,
|
| 16259 |
+
"loss": 2.882,
|
| 16260 |
+
"step": 46200
|
| 16261 |
+
},
|
| 16262 |
+
{
|
| 16263 |
+
"epoch": 0.8364853859379242,
|
| 16264 |
+
"grad_norm": NaN,
|
| 16265 |
+
"learning_rate": 0.0003426373891536466,
|
| 16266 |
+
"loss": 1.5639,
|
| 16267 |
+
"step": 46220
|
| 16268 |
+
},
|
| 16269 |
+
{
|
| 16270 |
+
"epoch": 0.8368473441317528,
|
| 16271 |
+
"grad_norm": NaN,
|
| 16272 |
+
"learning_rate": 0.0003426373891536466,
|
| 16273 |
+
"loss": 3.3409,
|
| 16274 |
+
"step": 46240
|
| 16275 |
+
},
|
| 16276 |
+
{
|
| 16277 |
+
"epoch": 0.8372093023255814,
|
| 16278 |
+
"grad_norm": NaN,
|
| 16279 |
+
"learning_rate": 0.0003426373891536466,
|
| 16280 |
+
"loss": 2.6536,
|
| 16281 |
+
"step": 46260
|
| 16282 |
+
},
|
| 16283 |
+
{
|
| 16284 |
+
"epoch": 0.83757126051941,
|
| 16285 |
+
"grad_norm": NaN,
|
| 16286 |
+
"learning_rate": 0.0003426373891536466,
|
| 16287 |
+
"loss": 1.5417,
|
| 16288 |
+
"step": 46280
|
| 16289 |
+
},
|
| 16290 |
+
{
|
| 16291 |
+
"epoch": 0.8379332187132387,
|
| 16292 |
+
"grad_norm": NaN,
|
| 16293 |
+
"learning_rate": 0.0003426916812450986,
|
| 16294 |
+
"loss": 1.6685,
|
| 16295 |
+
"step": 46300
|
| 16296 |
+
},
|
| 16297 |
+
{
|
| 16298 |
+
"epoch": 0.8382951769070672,
|
| 16299 |
+
"grad_norm": NaN,
|
| 16300 |
+
"learning_rate": 0.0003426916812450986,
|
| 16301 |
+
"loss": 2.3909,
|
| 16302 |
+
"step": 46320
|
| 16303 |
+
},
|
| 16304 |
+
{
|
| 16305 |
+
"epoch": 0.8386571351008959,
|
| 16306 |
+
"grad_norm": NaN,
|
| 16307 |
+
"learning_rate": 0.0003426916812450986,
|
| 16308 |
+
"loss": 0.3746,
|
| 16309 |
+
"step": 46340
|
| 16310 |
+
},
|
| 16311 |
+
{
|
| 16312 |
+
"epoch": 0.8390190932947245,
|
| 16313 |
+
"grad_norm": NaN,
|
| 16314 |
+
"learning_rate": 0.0003426916812450986,
|
| 16315 |
+
"loss": 2.5086,
|
| 16316 |
+
"step": 46360
|
| 16317 |
+
},
|
| 16318 |
+
{
|
| 16319 |
+
"epoch": 0.839381051488553,
|
| 16320 |
+
"grad_norm": NaN,
|
| 16321 |
+
"learning_rate": 0.0003426916812450986,
|
| 16322 |
+
"loss": 0.5264,
|
| 16323 |
+
"step": 46380
|
| 16324 |
+
},
|
| 16325 |
+
{
|
| 16326 |
+
"epoch": 0.8397430096823817,
|
| 16327 |
+
"grad_norm": NaN,
|
| 16328 |
+
"learning_rate": 0.0003426916812450986,
|
| 16329 |
+
"loss": 1.46,
|
| 16330 |
+
"step": 46400
|
| 16331 |
+
},
|
| 16332 |
+
{
|
| 16333 |
+
"epoch": 0.8401049678762103,
|
| 16334 |
+
"grad_norm": NaN,
|
| 16335 |
+
"learning_rate": 0.0003426916812450986,
|
| 16336 |
+
"loss": 3.4459,
|
| 16337 |
+
"step": 46420
|
| 16338 |
+
},
|
| 16339 |
+
{
|
| 16340 |
+
"epoch": 0.8404669260700389,
|
| 16341 |
+
"grad_norm": NaN,
|
| 16342 |
+
"learning_rate": 0.0003426916812450986,
|
| 16343 |
+
"loss": 3.2337,
|
| 16344 |
+
"step": 46440
|
| 16345 |
+
},
|
| 16346 |
+
{
|
| 16347 |
+
"epoch": 0.8408288842638675,
|
| 16348 |
+
"grad_norm": NaN,
|
| 16349 |
+
"learning_rate": 0.0003426916812450986,
|
| 16350 |
+
"loss": 2.0381,
|
| 16351 |
+
"step": 46460
|
| 16352 |
+
},
|
| 16353 |
+
{
|
| 16354 |
+
"epoch": 0.8411908424576962,
|
| 16355 |
+
"grad_norm": NaN,
|
| 16356 |
+
"learning_rate": 0.0003427459733365506,
|
| 16357 |
+
"loss": 1.5477,
|
| 16358 |
+
"step": 46480
|
| 16359 |
+
},
|
| 16360 |
+
{
|
| 16361 |
+
"epoch": 0.8415528006515247,
|
| 16362 |
+
"grad_norm": NaN,
|
| 16363 |
+
"learning_rate": 0.0003427459733365506,
|
| 16364 |
+
"loss": 2.866,
|
| 16365 |
+
"step": 46500
|
| 16366 |
+
},
|
| 16367 |
+
{
|
| 16368 |
+
"epoch": 0.8419147588453534,
|
| 16369 |
+
"grad_norm": NaN,
|
| 16370 |
+
"learning_rate": 0.0003427459733365506,
|
| 16371 |
+
"loss": 0.0,
|
| 16372 |
+
"step": 46520
|
| 16373 |
+
},
|
| 16374 |
+
{
|
| 16375 |
+
"epoch": 0.8422767170391819,
|
| 16376 |
+
"grad_norm": NaN,
|
| 16377 |
+
"learning_rate": 0.0003427459733365506,
|
| 16378 |
+
"loss": 0.6529,
|
| 16379 |
+
"step": 46540
|
| 16380 |
+
},
|
| 16381 |
+
{
|
| 16382 |
+
"epoch": 0.8426386752330106,
|
| 16383 |
+
"grad_norm": NaN,
|
| 16384 |
+
"learning_rate": 0.0003427459733365506,
|
| 16385 |
+
"loss": 1.8621,
|
| 16386 |
+
"step": 46560
|
| 16387 |
+
},
|
| 16388 |
+
{
|
| 16389 |
+
"epoch": 0.8430006334268392,
|
| 16390 |
+
"grad_norm": NaN,
|
| 16391 |
+
"learning_rate": 0.0003427459733365506,
|
| 16392 |
+
"loss": 1.5342,
|
| 16393 |
+
"step": 46580
|
| 16394 |
+
},
|
| 16395 |
+
{
|
| 16396 |
+
"epoch": 0.8433625916206678,
|
| 16397 |
+
"grad_norm": NaN,
|
| 16398 |
+
"learning_rate": 0.0003427459733365506,
|
| 16399 |
+
"loss": 0.3288,
|
| 16400 |
+
"step": 46600
|
| 16401 |
+
},
|
| 16402 |
+
{
|
| 16403 |
+
"epoch": 0.8437245498144964,
|
| 16404 |
+
"grad_norm": NaN,
|
| 16405 |
+
"learning_rate": 0.0003427459733365506,
|
| 16406 |
+
"loss": 2.7671,
|
| 16407 |
+
"step": 46620
|
| 16408 |
+
},
|
| 16409 |
+
{
|
| 16410 |
+
"epoch": 0.8440865080083251,
|
| 16411 |
+
"grad_norm": NaN,
|
| 16412 |
+
"learning_rate": 0.0003427459733365506,
|
| 16413 |
+
"loss": 4.344,
|
| 16414 |
+
"step": 46640
|
| 16415 |
+
},
|
| 16416 |
+
{
|
| 16417 |
+
"epoch": 0.8444484662021536,
|
| 16418 |
+
"grad_norm": NaN,
|
| 16419 |
+
"learning_rate": 0.0003427459733365506,
|
| 16420 |
+
"loss": 1.3153,
|
| 16421 |
+
"step": 46660
|
| 16422 |
+
},
|
| 16423 |
+
{
|
| 16424 |
+
"epoch": 0.8448104243959823,
|
| 16425 |
+
"grad_norm": NaN,
|
| 16426 |
+
"learning_rate": 0.0003427459733365506,
|
| 16427 |
+
"loss": 0.0,
|
| 16428 |
+
"step": 46680
|
| 16429 |
+
},
|
| 16430 |
+
{
|
| 16431 |
+
"epoch": 0.8451723825898109,
|
| 16432 |
+
"grad_norm": NaN,
|
| 16433 |
+
"learning_rate": 0.0003427459733365506,
|
| 16434 |
+
"loss": 3.0078,
|
| 16435 |
+
"step": 46700
|
| 16436 |
+
},
|
| 16437 |
+
{
|
| 16438 |
+
"epoch": 0.8455343407836395,
|
| 16439 |
+
"grad_norm": NaN,
|
| 16440 |
+
"learning_rate": 0.0003427459733365506,
|
| 16441 |
+
"loss": 2.3489,
|
| 16442 |
+
"step": 46720
|
| 16443 |
+
},
|
| 16444 |
+
{
|
| 16445 |
+
"epoch": 0.8458962989774681,
|
| 16446 |
+
"grad_norm": NaN,
|
| 16447 |
+
"learning_rate": 0.0003427459733365506,
|
| 16448 |
+
"loss": 1.9476,
|
| 16449 |
+
"step": 46740
|
| 16450 |
+
},
|
| 16451 |
+
{
|
| 16452 |
+
"epoch": 0.8462582571712968,
|
| 16453 |
+
"grad_norm": NaN,
|
| 16454 |
+
"learning_rate": 0.0003427459733365506,
|
| 16455 |
+
"loss": 0.0,
|
| 16456 |
+
"step": 46760
|
| 16457 |
+
},
|
| 16458 |
+
{
|
| 16459 |
+
"epoch": 0.8466202153651253,
|
| 16460 |
+
"grad_norm": NaN,
|
| 16461 |
+
"learning_rate": 0.0003427459733365506,
|
| 16462 |
+
"loss": 1.507,
|
| 16463 |
+
"step": 46780
|
| 16464 |
+
},
|
| 16465 |
+
{
|
| 16466 |
+
"epoch": 0.8469821735589539,
|
| 16467 |
+
"grad_norm": NaN,
|
| 16468 |
+
"learning_rate": 0.0003427459733365506,
|
| 16469 |
+
"loss": 4.0155,
|
| 16470 |
+
"step": 46800
|
| 16471 |
+
},
|
| 16472 |
+
{
|
| 16473 |
+
"epoch": 0.8473441317527826,
|
| 16474 |
+
"grad_norm": NaN,
|
| 16475 |
+
"learning_rate": 0.00034280026542800264,
|
| 16476 |
+
"loss": 6.1755,
|
| 16477 |
+
"step": 46820
|
| 16478 |
+
},
|
| 16479 |
+
{
|
| 16480 |
+
"epoch": 0.8477060899466111,
|
| 16481 |
+
"grad_norm": NaN,
|
| 16482 |
+
"learning_rate": 0.00034280026542800264,
|
| 16483 |
+
"loss": 2.4112,
|
| 16484 |
+
"step": 46840
|
| 16485 |
+
},
|
| 16486 |
+
{
|
| 16487 |
+
"epoch": 0.8480680481404398,
|
| 16488 |
+
"grad_norm": NaN,
|
| 16489 |
+
"learning_rate": 0.00034280026542800264,
|
| 16490 |
+
"loss": 0.2907,
|
| 16491 |
+
"step": 46860
|
| 16492 |
+
},
|
| 16493 |
+
{
|
| 16494 |
+
"epoch": 0.8484300063342683,
|
| 16495 |
+
"grad_norm": NaN,
|
| 16496 |
+
"learning_rate": 0.00034280026542800264,
|
| 16497 |
+
"loss": 3.5964,
|
| 16498 |
+
"step": 46880
|
| 16499 |
+
},
|
| 16500 |
+
{
|
| 16501 |
+
"epoch": 0.848791964528097,
|
| 16502 |
+
"grad_norm": NaN,
|
| 16503 |
+
"learning_rate": 0.00034280026542800264,
|
| 16504 |
+
"loss": 0.3755,
|
| 16505 |
+
"step": 46900
|
| 16506 |
+
},
|
| 16507 |
+
{
|
| 16508 |
+
"epoch": 0.8491539227219256,
|
| 16509 |
+
"grad_norm": NaN,
|
| 16510 |
+
"learning_rate": 0.00034280026542800264,
|
| 16511 |
+
"loss": 0.8192,
|
| 16512 |
+
"step": 46920
|
| 16513 |
+
},
|
| 16514 |
+
{
|
| 16515 |
+
"epoch": 0.8495158809157543,
|
| 16516 |
+
"grad_norm": NaN,
|
| 16517 |
+
"learning_rate": 0.00034280026542800264,
|
| 16518 |
+
"loss": 2.625,
|
| 16519 |
+
"step": 46940
|
| 16520 |
+
},
|
| 16521 |
+
{
|
| 16522 |
+
"epoch": 0.8498778391095828,
|
| 16523 |
+
"grad_norm": NaN,
|
| 16524 |
+
"learning_rate": 0.00034285455751945467,
|
| 16525 |
+
"loss": 3.4662,
|
| 16526 |
+
"step": 46960
|
| 16527 |
+
},
|
| 16528 |
+
{
|
| 16529 |
+
"epoch": 0.8502397973034115,
|
| 16530 |
+
"grad_norm": NaN,
|
| 16531 |
+
"learning_rate": 0.00034285455751945467,
|
| 16532 |
+
"loss": 0.3487,
|
| 16533 |
+
"step": 46980
|
| 16534 |
+
},
|
| 16535 |
+
{
|
| 16536 |
+
"epoch": 0.85060175549724,
|
| 16537 |
+
"grad_norm": NaN,
|
| 16538 |
+
"learning_rate": 0.00034285455751945467,
|
| 16539 |
+
"loss": 1.3247,
|
| 16540 |
+
"step": 47000
|
| 16541 |
+
},
|
| 16542 |
+
{
|
| 16543 |
+
"epoch": 0.8509637136910687,
|
| 16544 |
+
"grad_norm": NaN,
|
| 16545 |
+
"learning_rate": 0.00034285455751945467,
|
| 16546 |
+
"loss": 2.4207,
|
| 16547 |
+
"step": 47020
|
| 16548 |
+
},
|
| 16549 |
+
{
|
| 16550 |
+
"epoch": 0.8513256718848973,
|
| 16551 |
+
"grad_norm": NaN,
|
| 16552 |
+
"learning_rate": 0.00034285455751945467,
|
| 16553 |
+
"loss": 3.5487,
|
| 16554 |
+
"step": 47040
|
| 16555 |
+
},
|
| 16556 |
+
{
|
| 16557 |
+
"epoch": 0.8516876300787259,
|
| 16558 |
+
"grad_norm": NaN,
|
| 16559 |
+
"learning_rate": 0.00034285455751945467,
|
| 16560 |
+
"loss": 2.8014,
|
| 16561 |
+
"step": 47060
|
| 16562 |
+
},
|
| 16563 |
+
{
|
| 16564 |
+
"epoch": 0.8520495882725545,
|
| 16565 |
+
"grad_norm": NaN,
|
| 16566 |
+
"learning_rate": 0.00034285455751945467,
|
| 16567 |
+
"loss": 0.8827,
|
| 16568 |
+
"step": 47080
|
| 16569 |
+
},
|
| 16570 |
+
{
|
| 16571 |
+
"epoch": 0.8524115464663832,
|
| 16572 |
+
"grad_norm": NaN,
|
| 16573 |
+
"learning_rate": 0.00034285455751945467,
|
| 16574 |
+
"loss": 1.686,
|
| 16575 |
+
"step": 47100
|
| 16576 |
+
},
|
| 16577 |
+
{
|
| 16578 |
+
"epoch": 0.8527735046602117,
|
| 16579 |
+
"grad_norm": NaN,
|
| 16580 |
+
"learning_rate": 0.0003429088496109067,
|
| 16581 |
+
"loss": 3.4005,
|
| 16582 |
+
"step": 47120
|
| 16583 |
+
},
|
| 16584 |
+
{
|
| 16585 |
+
"epoch": 0.8531354628540404,
|
| 16586 |
+
"grad_norm": NaN,
|
| 16587 |
+
"learning_rate": 0.0003429088496109067,
|
| 16588 |
+
"loss": 4.9585,
|
| 16589 |
+
"step": 47140
|
| 16590 |
+
},
|
| 16591 |
+
{
|
| 16592 |
+
"epoch": 0.853497421047869,
|
| 16593 |
+
"grad_norm": NaN,
|
| 16594 |
+
"learning_rate": 0.0003429088496109067,
|
| 16595 |
+
"loss": 0.7554,
|
| 16596 |
+
"step": 47160
|
| 16597 |
+
},
|
| 16598 |
+
{
|
| 16599 |
+
"epoch": 0.8538593792416976,
|
| 16600 |
+
"grad_norm": NaN,
|
| 16601 |
+
"learning_rate": 0.0003429088496109067,
|
| 16602 |
+
"loss": 1.6588,
|
| 16603 |
+
"step": 47180
|
| 16604 |
+
},
|
| 16605 |
+
{
|
| 16606 |
+
"epoch": 0.8542213374355262,
|
| 16607 |
+
"grad_norm": NaN,
|
| 16608 |
+
"learning_rate": 0.0003429088496109067,
|
| 16609 |
+
"loss": 0.8945,
|
| 16610 |
+
"step": 47200
|
| 16611 |
+
},
|
| 16612 |
+
{
|
| 16613 |
+
"epoch": 0.8545832956293548,
|
| 16614 |
+
"grad_norm": NaN,
|
| 16615 |
+
"learning_rate": 0.0003429088496109067,
|
| 16616 |
+
"loss": 1.9265,
|
| 16617 |
+
"step": 47220
|
| 16618 |
+
},
|
| 16619 |
+
{
|
| 16620 |
+
"epoch": 0.8549452538231834,
|
| 16621 |
+
"grad_norm": NaN,
|
| 16622 |
+
"learning_rate": 0.0003429088496109067,
|
| 16623 |
+
"loss": 1.0519,
|
| 16624 |
+
"step": 47240
|
| 16625 |
+
},
|
| 16626 |
+
{
|
| 16627 |
+
"epoch": 0.855307212017012,
|
| 16628 |
+
"grad_norm": NaN,
|
| 16629 |
+
"learning_rate": 0.0003429088496109067,
|
| 16630 |
+
"loss": 0.7737,
|
| 16631 |
+
"step": 47260
|
| 16632 |
+
},
|
| 16633 |
+
{
|
| 16634 |
+
"epoch": 0.8556691702108407,
|
| 16635 |
+
"grad_norm": NaN,
|
| 16636 |
+
"learning_rate": 0.0003429088496109067,
|
| 16637 |
+
"loss": 1.9628,
|
| 16638 |
+
"step": 47280
|
| 16639 |
+
},
|
| 16640 |
+
{
|
| 16641 |
+
"epoch": 0.8560311284046692,
|
| 16642 |
+
"grad_norm": NaN,
|
| 16643 |
+
"learning_rate": 0.0003429088496109067,
|
| 16644 |
+
"loss": 2.8813,
|
| 16645 |
+
"step": 47300
|
| 16646 |
+
},
|
| 16647 |
+
{
|
| 16648 |
+
"epoch": 0.8563930865984979,
|
| 16649 |
+
"grad_norm": NaN,
|
| 16650 |
+
"learning_rate": 0.0003429088496109067,
|
| 16651 |
+
"loss": 0.3755,
|
| 16652 |
+
"step": 47320
|
| 16653 |
+
},
|
| 16654 |
+
{
|
| 16655 |
+
"epoch": 0.8567550447923264,
|
| 16656 |
+
"grad_norm": NaN,
|
| 16657 |
+
"learning_rate": 0.0003429088496109067,
|
| 16658 |
+
"loss": 0.3112,
|
| 16659 |
+
"step": 47340
|
| 16660 |
+
},
|
| 16661 |
+
{
|
| 16662 |
+
"epoch": 0.8571170029861551,
|
| 16663 |
+
"grad_norm": NaN,
|
| 16664 |
+
"learning_rate": 0.0003429088496109067,
|
| 16665 |
+
"loss": 0.2919,
|
| 16666 |
+
"step": 47360
|
| 16667 |
+
},
|
| 16668 |
+
{
|
| 16669 |
+
"epoch": 0.8574789611799837,
|
| 16670 |
+
"grad_norm": NaN,
|
| 16671 |
+
"learning_rate": 0.0003430174337938107,
|
| 16672 |
+
"loss": 3.1259,
|
| 16673 |
+
"step": 47380
|
| 16674 |
+
},
|
| 16675 |
+
{
|
| 16676 |
+
"epoch": 0.8578409193738123,
|
| 16677 |
+
"grad_norm": NaN,
|
| 16678 |
+
"learning_rate": 0.00034307172588526273,
|
| 16679 |
+
"loss": 2.6819,
|
| 16680 |
+
"step": 47400
|
| 16681 |
+
},
|
| 16682 |
+
{
|
| 16683 |
+
"epoch": 0.8582028775676409,
|
| 16684 |
+
"grad_norm": NaN,
|
| 16685 |
+
"learning_rate": 0.00034307172588526273,
|
| 16686 |
+
"loss": 0.0,
|
| 16687 |
+
"step": 47420
|
| 16688 |
+
},
|
| 16689 |
+
{
|
| 16690 |
+
"epoch": 0.8585648357614696,
|
| 16691 |
+
"grad_norm": NaN,
|
| 16692 |
+
"learning_rate": 0.00034307172588526273,
|
| 16693 |
+
"loss": 3.8217,
|
| 16694 |
+
"step": 47440
|
| 16695 |
+
},
|
| 16696 |
+
{
|
| 16697 |
+
"epoch": 0.8589267939552981,
|
| 16698 |
+
"grad_norm": NaN,
|
| 16699 |
+
"learning_rate": 0.0003431260179767147,
|
| 16700 |
+
"loss": 4.0489,
|
| 16701 |
+
"step": 47460
|
| 16702 |
+
},
|
| 16703 |
+
{
|
| 16704 |
+
"epoch": 0.8592887521491268,
|
| 16705 |
+
"grad_norm": NaN,
|
| 16706 |
+
"learning_rate": 0.0003431260179767147,
|
| 16707 |
+
"loss": 5.597,
|
| 16708 |
+
"step": 47480
|
| 16709 |
+
},
|
| 16710 |
+
{
|
| 16711 |
+
"epoch": 0.8596507103429554,
|
| 16712 |
+
"grad_norm": NaN,
|
| 16713 |
+
"learning_rate": 0.0003431260179767147,
|
| 16714 |
+
"loss": 0.8297,
|
| 16715 |
+
"step": 47500
|
| 16716 |
+
},
|
| 16717 |
+
{
|
| 16718 |
+
"epoch": 0.860012668536784,
|
| 16719 |
+
"grad_norm": NaN,
|
| 16720 |
+
"learning_rate": 0.0003431260179767147,
|
| 16721 |
+
"loss": 0.9444,
|
| 16722 |
+
"step": 47520
|
| 16723 |
+
},
|
| 16724 |
+
{
|
| 16725 |
+
"epoch": 0.8603746267306126,
|
| 16726 |
+
"grad_norm": NaN,
|
| 16727 |
+
"learning_rate": 0.0003431260179767147,
|
| 16728 |
+
"loss": 0.0,
|
| 16729 |
+
"step": 47540
|
| 16730 |
+
},
|
| 16731 |
+
{
|
| 16732 |
+
"epoch": 0.8607365849244413,
|
| 16733 |
+
"grad_norm": NaN,
|
| 16734 |
+
"learning_rate": 0.0003431260179767147,
|
| 16735 |
+
"loss": 0.911,
|
| 16736 |
+
"step": 47560
|
| 16737 |
+
},
|
| 16738 |
+
{
|
| 16739 |
+
"epoch": 0.8610985431182698,
|
| 16740 |
+
"grad_norm": NaN,
|
| 16741 |
+
"learning_rate": 0.0003431260179767147,
|
| 16742 |
+
"loss": 2.627,
|
| 16743 |
+
"step": 47580
|
| 16744 |
+
},
|
| 16745 |
+
{
|
| 16746 |
+
"epoch": 0.8614605013120985,
|
| 16747 |
+
"grad_norm": NaN,
|
| 16748 |
+
"learning_rate": 0.0003431260179767147,
|
| 16749 |
+
"loss": 2.7942,
|
| 16750 |
+
"step": 47600
|
| 16751 |
+
},
|
| 16752 |
+
{
|
| 16753 |
+
"epoch": 0.8618224595059271,
|
| 16754 |
+
"grad_norm": NaN,
|
| 16755 |
+
"learning_rate": 0.00034318031006816673,
|
| 16756 |
+
"loss": 1.1399,
|
| 16757 |
+
"step": 47620
|
| 16758 |
+
},
|
| 16759 |
+
{
|
| 16760 |
+
"epoch": 0.8621844176997556,
|
| 16761 |
+
"grad_norm": NaN,
|
| 16762 |
+
"learning_rate": 0.00034318031006816673,
|
| 16763 |
+
"loss": 0.0,
|
| 16764 |
+
"step": 47640
|
| 16765 |
+
},
|
| 16766 |
+
{
|
| 16767 |
+
"epoch": 0.8625463758935843,
|
| 16768 |
+
"grad_norm": NaN,
|
| 16769 |
+
"learning_rate": 0.00034318031006816673,
|
| 16770 |
+
"loss": 1.3527,
|
| 16771 |
+
"step": 47660
|
| 16772 |
+
},
|
| 16773 |
+
{
|
| 16774 |
+
"epoch": 0.8629083340874129,
|
| 16775 |
+
"grad_norm": NaN,
|
| 16776 |
+
"learning_rate": 0.00034318031006816673,
|
| 16777 |
+
"loss": 0.9181,
|
| 16778 |
+
"step": 47680
|
| 16779 |
+
},
|
| 16780 |
+
{
|
| 16781 |
+
"epoch": 0.8632702922812415,
|
| 16782 |
+
"grad_norm": NaN,
|
| 16783 |
+
"learning_rate": 0.00034323460215961876,
|
| 16784 |
+
"loss": 3.826,
|
| 16785 |
+
"step": 47700
|
| 16786 |
+
},
|
| 16787 |
+
{
|
| 16788 |
+
"epoch": 0.8636322504750701,
|
| 16789 |
+
"grad_norm": NaN,
|
| 16790 |
+
"learning_rate": 0.00034323460215961876,
|
| 16791 |
+
"loss": 1.3246,
|
| 16792 |
+
"step": 47720
|
| 16793 |
+
},
|
| 16794 |
+
{
|
| 16795 |
+
"epoch": 0.8639942086688988,
|
| 16796 |
+
"grad_norm": NaN,
|
| 16797 |
+
"learning_rate": 0.00034323460215961876,
|
| 16798 |
+
"loss": 0.9301,
|
| 16799 |
+
"step": 47740
|
| 16800 |
+
},
|
| 16801 |
+
{
|
| 16802 |
+
"epoch": 0.8643561668627273,
|
| 16803 |
+
"grad_norm": NaN,
|
| 16804 |
+
"learning_rate": 0.00034328889425107073,
|
| 16805 |
+
"loss": 2.455,
|
| 16806 |
+
"step": 47760
|
| 16807 |
+
},
|
| 16808 |
+
{
|
| 16809 |
+
"epoch": 0.864718125056556,
|
| 16810 |
+
"grad_norm": NaN,
|
| 16811 |
+
"learning_rate": 0.00034328889425107073,
|
| 16812 |
+
"loss": 0.3296,
|
| 16813 |
+
"step": 47780
|
| 16814 |
+
},
|
| 16815 |
+
{
|
| 16816 |
+
"epoch": 0.8650800832503845,
|
| 16817 |
+
"grad_norm": NaN,
|
| 16818 |
+
"learning_rate": 0.00034328889425107073,
|
| 16819 |
+
"loss": 3.0621,
|
| 16820 |
+
"step": 47800
|
| 16821 |
+
},
|
| 16822 |
+
{
|
| 16823 |
+
"epoch": 0.8654420414442132,
|
| 16824 |
+
"grad_norm": NaN,
|
| 16825 |
+
"learning_rate": 0.00034328889425107073,
|
| 16826 |
+
"loss": 1.9652,
|
| 16827 |
+
"step": 47820
|
| 16828 |
+
},
|
| 16829 |
+
{
|
| 16830 |
+
"epoch": 0.8658039996380418,
|
| 16831 |
+
"grad_norm": NaN,
|
| 16832 |
+
"learning_rate": 0.00034328889425107073,
|
| 16833 |
+
"loss": 2.605,
|
| 16834 |
+
"step": 47840
|
| 16835 |
+
},
|
| 16836 |
+
{
|
| 16837 |
+
"epoch": 0.8661659578318704,
|
| 16838 |
+
"grad_norm": NaN,
|
| 16839 |
+
"learning_rate": 0.00034328889425107073,
|
| 16840 |
+
"loss": 2.629,
|
| 16841 |
+
"step": 47860
|
| 16842 |
+
},
|
| 16843 |
+
{
|
| 16844 |
+
"epoch": 0.866527916025699,
|
| 16845 |
+
"grad_norm": NaN,
|
| 16846 |
+
"learning_rate": 0.00034328889425107073,
|
| 16847 |
+
"loss": 0.5865,
|
| 16848 |
+
"step": 47880
|
| 16849 |
+
},
|
| 16850 |
+
{
|
| 16851 |
+
"epoch": 0.8668898742195277,
|
| 16852 |
+
"grad_norm": NaN,
|
| 16853 |
+
"learning_rate": 0.00034328889425107073,
|
| 16854 |
+
"loss": 2.0123,
|
| 16855 |
+
"step": 47900
|
| 16856 |
+
},
|
| 16857 |
+
{
|
| 16858 |
+
"epoch": 0.8672518324133562,
|
| 16859 |
+
"grad_norm": NaN,
|
| 16860 |
+
"learning_rate": 0.00034328889425107073,
|
| 16861 |
+
"loss": 1.215,
|
| 16862 |
+
"step": 47920
|
| 16863 |
+
},
|
| 16864 |
+
{
|
| 16865 |
+
"epoch": 0.8676137906071849,
|
| 16866 |
+
"grad_norm": NaN,
|
| 16867 |
+
"learning_rate": 0.00034328889425107073,
|
| 16868 |
+
"loss": 2.0918,
|
| 16869 |
+
"step": 47940
|
| 16870 |
+
},
|
| 16871 |
+
{
|
| 16872 |
+
"epoch": 0.8679757488010135,
|
| 16873 |
+
"grad_norm": NaN,
|
| 16874 |
+
"learning_rate": 0.00034328889425107073,
|
| 16875 |
+
"loss": 0.4018,
|
| 16876 |
+
"step": 47960
|
| 16877 |
+
},
|
| 16878 |
+
{
|
| 16879 |
+
"epoch": 0.8683377069948421,
|
| 16880 |
+
"grad_norm": NaN,
|
| 16881 |
+
"learning_rate": 0.00034334318634252276,
|
| 16882 |
+
"loss": 6.8876,
|
| 16883 |
+
"step": 47980
|
| 16884 |
+
},
|
| 16885 |
+
{
|
| 16886 |
+
"epoch": 0.8686996651886707,
|
| 16887 |
+
"grad_norm": NaN,
|
| 16888 |
+
"learning_rate": 0.00034334318634252276,
|
| 16889 |
+
"loss": 3.6365,
|
| 16890 |
+
"step": 48000
|
| 16891 |
+
},
|
| 16892 |
+
{
|
| 16893 |
+
"epoch": 0.8690616233824994,
|
| 16894 |
+
"grad_norm": NaN,
|
| 16895 |
+
"learning_rate": 0.00034334318634252276,
|
| 16896 |
+
"loss": 1.1087,
|
| 16897 |
+
"step": 48020
|
| 16898 |
+
},
|
| 16899 |
+
{
|
| 16900 |
+
"epoch": 0.8694235815763279,
|
| 16901 |
+
"grad_norm": NaN,
|
| 16902 |
+
"learning_rate": 0.00034334318634252276,
|
| 16903 |
+
"loss": 0.7051,
|
| 16904 |
+
"step": 48040
|
| 16905 |
+
},
|
| 16906 |
+
{
|
| 16907 |
+
"epoch": 0.8697855397701566,
|
| 16908 |
+
"grad_norm": NaN,
|
| 16909 |
+
"learning_rate": 0.00034334318634252276,
|
| 16910 |
+
"loss": 0.0,
|
| 16911 |
+
"step": 48060
|
| 16912 |
+
},
|
| 16913 |
+
{
|
| 16914 |
+
"epoch": 0.8701474979639852,
|
| 16915 |
+
"grad_norm": NaN,
|
| 16916 |
+
"learning_rate": 0.00034334318634252276,
|
| 16917 |
+
"loss": 0.0,
|
| 16918 |
+
"step": 48080
|
| 16919 |
+
},
|
| 16920 |
+
{
|
| 16921 |
+
"epoch": 0.8705094561578137,
|
| 16922 |
+
"grad_norm": NaN,
|
| 16923 |
+
"learning_rate": 0.00034334318634252276,
|
| 16924 |
+
"loss": 2.3614,
|
| 16925 |
+
"step": 48100
|
| 16926 |
+
},
|
| 16927 |
+
{
|
| 16928 |
+
"epoch": 0.8708714143516424,
|
| 16929 |
+
"grad_norm": NaN,
|
| 16930 |
+
"learning_rate": 0.00034334318634252276,
|
| 16931 |
+
"loss": 2.1832,
|
| 16932 |
+
"step": 48120
|
| 16933 |
+
},
|
| 16934 |
+
{
|
| 16935 |
+
"epoch": 0.871233372545471,
|
| 16936 |
+
"grad_norm": NaN,
|
| 16937 |
+
"learning_rate": 0.00034334318634252276,
|
| 16938 |
+
"loss": 1.172,
|
| 16939 |
+
"step": 48140
|
| 16940 |
+
},
|
| 16941 |
+
{
|
| 16942 |
+
"epoch": 0.8715953307392996,
|
| 16943 |
+
"grad_norm": NaN,
|
| 16944 |
+
"learning_rate": 0.0003433974784339748,
|
| 16945 |
+
"loss": 2.1687,
|
| 16946 |
+
"step": 48160
|
| 16947 |
+
},
|
| 16948 |
+
{
|
| 16949 |
+
"epoch": 0.8719572889331282,
|
| 16950 |
+
"grad_norm": NaN,
|
| 16951 |
+
"learning_rate": 0.0003433974784339748,
|
| 16952 |
+
"loss": 0.786,
|
| 16953 |
+
"step": 48180
|
| 16954 |
+
},
|
| 16955 |
+
{
|
| 16956 |
+
"epoch": 0.8723192471269569,
|
| 16957 |
+
"grad_norm": NaN,
|
| 16958 |
+
"learning_rate": 0.0003433974784339748,
|
| 16959 |
+
"loss": 0.4405,
|
| 16960 |
+
"step": 48200
|
| 16961 |
+
},
|
| 16962 |
+
{
|
| 16963 |
+
"epoch": 0.8726812053207854,
|
| 16964 |
+
"grad_norm": NaN,
|
| 16965 |
+
"learning_rate": 0.0003433974784339748,
|
| 16966 |
+
"loss": 0.5855,
|
| 16967 |
+
"step": 48220
|
| 16968 |
+
},
|
| 16969 |
+
{
|
| 16970 |
+
"epoch": 0.8730431635146141,
|
| 16971 |
+
"grad_norm": NaN,
|
| 16972 |
+
"learning_rate": 0.0003433974784339748,
|
| 16973 |
+
"loss": 0.8813,
|
| 16974 |
+
"step": 48240
|
| 16975 |
+
},
|
| 16976 |
+
{
|
| 16977 |
+
"epoch": 0.8734051217084426,
|
| 16978 |
+
"grad_norm": NaN,
|
| 16979 |
+
"learning_rate": 0.0003435060626168788,
|
| 16980 |
+
"loss": 2.8654,
|
| 16981 |
+
"step": 48260
|
| 16982 |
+
},
|
| 16983 |
+
{
|
| 16984 |
+
"epoch": 0.8737670799022713,
|
| 16985 |
+
"grad_norm": NaN,
|
| 16986 |
+
"learning_rate": 0.0003435060626168788,
|
| 16987 |
+
"loss": 0.7883,
|
| 16988 |
+
"step": 48280
|
| 16989 |
+
},
|
| 16990 |
+
{
|
| 16991 |
+
"epoch": 0.8741290380960999,
|
| 16992 |
+
"grad_norm": NaN,
|
| 16993 |
+
"learning_rate": 0.0003435060626168788,
|
| 16994 |
+
"loss": 1.6977,
|
| 16995 |
+
"step": 48300
|
| 16996 |
+
},
|
| 16997 |
+
{
|
| 16998 |
+
"epoch": 0.8744909962899285,
|
| 16999 |
+
"grad_norm": NaN,
|
| 17000 |
+
"learning_rate": 0.0003435060626168788,
|
| 17001 |
+
"loss": 0.6638,
|
| 17002 |
+
"step": 48320
|
| 17003 |
+
},
|
| 17004 |
+
{
|
| 17005 |
+
"epoch": 0.8748529544837571,
|
| 17006 |
+
"grad_norm": NaN,
|
| 17007 |
+
"learning_rate": 0.0003435060626168788,
|
| 17008 |
+
"loss": 0.5823,
|
| 17009 |
+
"step": 48340
|
| 17010 |
+
},
|
| 17011 |
+
{
|
| 17012 |
+
"epoch": 0.8752149126775858,
|
| 17013 |
+
"grad_norm": NaN,
|
| 17014 |
+
"learning_rate": 0.0003435060626168788,
|
| 17015 |
+
"loss": 0.8745,
|
| 17016 |
+
"step": 48360
|
| 17017 |
+
},
|
| 17018 |
+
{
|
| 17019 |
+
"epoch": 0.8755768708714143,
|
| 17020 |
+
"grad_norm": NaN,
|
| 17021 |
+
"learning_rate": 0.0003435060626168788,
|
| 17022 |
+
"loss": 2.6587,
|
| 17023 |
+
"step": 48380
|
| 17024 |
+
},
|
| 17025 |
+
{
|
| 17026 |
+
"epoch": 0.875938829065243,
|
| 17027 |
+
"grad_norm": NaN,
|
| 17028 |
+
"learning_rate": 0.0003435603547083308,
|
| 17029 |
+
"loss": 1.6498,
|
| 17030 |
+
"step": 48400
|
| 17031 |
+
},
|
| 17032 |
+
{
|
| 17033 |
+
"epoch": 0.8763007872590716,
|
| 17034 |
+
"grad_norm": NaN,
|
| 17035 |
+
"learning_rate": 0.0003435603547083308,
|
| 17036 |
+
"loss": 1.0264,
|
| 17037 |
+
"step": 48420
|
| 17038 |
+
},
|
| 17039 |
+
{
|
| 17040 |
+
"epoch": 0.8766627454529002,
|
| 17041 |
+
"grad_norm": NaN,
|
| 17042 |
+
"learning_rate": 0.0003435603547083308,
|
| 17043 |
+
"loss": 4.9079,
|
| 17044 |
+
"step": 48440
|
| 17045 |
+
},
|
| 17046 |
+
{
|
| 17047 |
+
"epoch": 0.8770247036467288,
|
| 17048 |
+
"grad_norm": NaN,
|
| 17049 |
+
"learning_rate": 0.0003435603547083308,
|
| 17050 |
+
"loss": 0.5742,
|
| 17051 |
+
"step": 48460
|
| 17052 |
+
},
|
| 17053 |
+
{
|
| 17054 |
+
"epoch": 0.8773866618405575,
|
| 17055 |
+
"grad_norm": NaN,
|
| 17056 |
+
"learning_rate": 0.0003436146467997828,
|
| 17057 |
+
"loss": 3.0971,
|
| 17058 |
+
"step": 48480
|
| 17059 |
+
},
|
| 17060 |
+
{
|
| 17061 |
+
"epoch": 0.877748620034386,
|
| 17062 |
+
"grad_norm": NaN,
|
| 17063 |
+
"learning_rate": 0.0003436146467997828,
|
| 17064 |
+
"loss": 1.5435,
|
| 17065 |
+
"step": 48500
|
| 17066 |
+
},
|
| 17067 |
+
{
|
| 17068 |
+
"epoch": 0.8781105782282146,
|
| 17069 |
+
"grad_norm": NaN,
|
| 17070 |
+
"learning_rate": 0.0003436146467997828,
|
| 17071 |
+
"loss": 0.8805,
|
| 17072 |
+
"step": 48520
|
| 17073 |
+
},
|
| 17074 |
+
{
|
| 17075 |
+
"epoch": 0.8784725364220433,
|
| 17076 |
+
"grad_norm": NaN,
|
| 17077 |
+
"learning_rate": 0.0003436146467997828,
|
| 17078 |
+
"loss": 1.0995,
|
| 17079 |
+
"step": 48540
|
| 17080 |
+
},
|
| 17081 |
+
{
|
| 17082 |
+
"epoch": 0.8788344946158718,
|
| 17083 |
+
"grad_norm": NaN,
|
| 17084 |
+
"learning_rate": 0.0003436689388912348,
|
| 17085 |
+
"loss": 2.2358,
|
| 17086 |
+
"step": 48560
|
| 17087 |
+
},
|
| 17088 |
+
{
|
| 17089 |
+
"epoch": 0.8791964528097005,
|
| 17090 |
+
"grad_norm": NaN,
|
| 17091 |
+
"learning_rate": 0.0003436689388912348,
|
| 17092 |
+
"loss": 1.6608,
|
| 17093 |
+
"step": 48580
|
| 17094 |
+
},
|
| 17095 |
+
{
|
| 17096 |
+
"epoch": 0.879558411003529,
|
| 17097 |
+
"grad_norm": NaN,
|
| 17098 |
+
"learning_rate": 0.0003436689388912348,
|
| 17099 |
+
"loss": 2.8619,
|
| 17100 |
+
"step": 48600
|
| 17101 |
+
},
|
| 17102 |
+
{
|
| 17103 |
+
"epoch": 0.8799203691973577,
|
| 17104 |
+
"grad_norm": NaN,
|
| 17105 |
+
"learning_rate": 0.0003436689388912348,
|
| 17106 |
+
"loss": 0.9254,
|
| 17107 |
+
"step": 48620
|
| 17108 |
+
},
|
| 17109 |
+
{
|
| 17110 |
+
"epoch": 0.8802823273911863,
|
| 17111 |
+
"grad_norm": NaN,
|
| 17112 |
+
"learning_rate": 0.00034372323098268685,
|
| 17113 |
+
"loss": 4.944,
|
| 17114 |
+
"step": 48640
|
| 17115 |
+
},
|
| 17116 |
+
{
|
| 17117 |
+
"epoch": 0.880644285585015,
|
| 17118 |
+
"grad_norm": NaN,
|
| 17119 |
+
"learning_rate": 0.00034372323098268685,
|
| 17120 |
+
"loss": 1.3753,
|
| 17121 |
+
"step": 48660
|
| 17122 |
+
},
|
| 17123 |
+
{
|
| 17124 |
+
"epoch": 0.8810062437788435,
|
| 17125 |
+
"grad_norm": NaN,
|
| 17126 |
+
"learning_rate": 0.00034372323098268685,
|
| 17127 |
+
"loss": 1.2057,
|
| 17128 |
+
"step": 48680
|
| 17129 |
+
},
|
| 17130 |
+
{
|
| 17131 |
+
"epoch": 0.8813682019726722,
|
| 17132 |
+
"grad_norm": NaN,
|
| 17133 |
+
"learning_rate": 0.00034372323098268685,
|
| 17134 |
+
"loss": 0.8484,
|
| 17135 |
+
"step": 48700
|
| 17136 |
+
},
|
| 17137 |
+
{
|
| 17138 |
+
"epoch": 0.8817301601665007,
|
| 17139 |
+
"grad_norm": NaN,
|
| 17140 |
+
"learning_rate": 0.00034372323098268685,
|
| 17141 |
+
"loss": 1.9288,
|
| 17142 |
+
"step": 48720
|
| 17143 |
+
},
|
| 17144 |
+
{
|
| 17145 |
+
"epoch": 0.8820921183603294,
|
| 17146 |
+
"grad_norm": NaN,
|
| 17147 |
+
"learning_rate": 0.0003437775230741388,
|
| 17148 |
+
"loss": 2.7616,
|
| 17149 |
+
"step": 48740
|
| 17150 |
+
},
|
| 17151 |
+
{
|
| 17152 |
+
"epoch": 0.882454076554158,
|
| 17153 |
+
"grad_norm": NaN,
|
| 17154 |
+
"learning_rate": 0.0003437775230741388,
|
| 17155 |
+
"loss": 1.1404,
|
| 17156 |
+
"step": 48760
|
| 17157 |
+
},
|
| 17158 |
+
{
|
| 17159 |
+
"epoch": 0.8828160347479866,
|
| 17160 |
+
"grad_norm": NaN,
|
| 17161 |
+
"learning_rate": 0.0003437775230741388,
|
| 17162 |
+
"loss": 0.9172,
|
| 17163 |
+
"step": 48780
|
| 17164 |
+
},
|
| 17165 |
+
{
|
| 17166 |
+
"epoch": 0.8831779929418152,
|
| 17167 |
+
"grad_norm": NaN,
|
| 17168 |
+
"learning_rate": 0.0003437775230741388,
|
| 17169 |
+
"loss": 0.4789,
|
| 17170 |
+
"step": 48800
|
| 17171 |
+
},
|
| 17172 |
+
{
|
| 17173 |
+
"epoch": 0.8835399511356439,
|
| 17174 |
+
"grad_norm": NaN,
|
| 17175 |
+
"learning_rate": 0.0003437775230741388,
|
| 17176 |
+
"loss": 2.6007,
|
| 17177 |
+
"step": 48820
|
| 17178 |
+
},
|
| 17179 |
+
{
|
| 17180 |
+
"epoch": 0.8839019093294724,
|
| 17181 |
+
"grad_norm": NaN,
|
| 17182 |
+
"learning_rate": 0.0003437775230741388,
|
| 17183 |
+
"loss": 2.2257,
|
| 17184 |
+
"step": 48840
|
| 17185 |
+
},
|
| 17186 |
+
{
|
| 17187 |
+
"epoch": 0.8842638675233011,
|
| 17188 |
+
"grad_norm": NaN,
|
| 17189 |
+
"learning_rate": 0.0003437775230741388,
|
| 17190 |
+
"loss": 1.0041,
|
| 17191 |
+
"step": 48860
|
| 17192 |
+
},
|
| 17193 |
+
{
|
| 17194 |
+
"epoch": 0.8846258257171297,
|
| 17195 |
+
"grad_norm": NaN,
|
| 17196 |
+
"learning_rate": 0.0003437775230741388,
|
| 17197 |
+
"loss": 0.4423,
|
| 17198 |
+
"step": 48880
|
| 17199 |
+
},
|
| 17200 |
+
{
|
| 17201 |
+
"epoch": 0.8849877839109583,
|
| 17202 |
+
"grad_norm": NaN,
|
| 17203 |
+
"learning_rate": 0.0003437775230741388,
|
| 17204 |
+
"loss": 0.2653,
|
| 17205 |
+
"step": 48900
|
| 17206 |
+
},
|
| 17207 |
+
{
|
| 17208 |
+
"epoch": 0.8853497421047869,
|
| 17209 |
+
"grad_norm": NaN,
|
| 17210 |
+
"learning_rate": 0.0003437775230741388,
|
| 17211 |
+
"loss": 2.8073,
|
| 17212 |
+
"step": 48920
|
| 17213 |
+
},
|
| 17214 |
+
{
|
| 17215 |
+
"epoch": 0.8857117002986155,
|
| 17216 |
+
"grad_norm": NaN,
|
| 17217 |
+
"learning_rate": 0.0003437775230741388,
|
| 17218 |
+
"loss": 1.0093,
|
| 17219 |
+
"step": 48940
|
| 17220 |
+
},
|
| 17221 |
+
{
|
| 17222 |
+
"epoch": 0.8860736584924441,
|
| 17223 |
+
"grad_norm": NaN,
|
| 17224 |
+
"learning_rate": 0.0003437775230741388,
|
| 17225 |
+
"loss": 0.0,
|
| 17226 |
+
"step": 48960
|
| 17227 |
+
},
|
| 17228 |
+
{
|
| 17229 |
+
"epoch": 0.8864356166862727,
|
| 17230 |
+
"grad_norm": NaN,
|
| 17231 |
+
"learning_rate": 0.0003437775230741388,
|
| 17232 |
+
"loss": 0.2918,
|
| 17233 |
+
"step": 48980
|
| 17234 |
+
},
|
| 17235 |
+
{
|
| 17236 |
+
"epoch": 0.8867975748801014,
|
| 17237 |
+
"grad_norm": NaN,
|
| 17238 |
+
"learning_rate": 0.0003437775230741388,
|
| 17239 |
+
"loss": 0.4383,
|
| 17240 |
+
"step": 49000
|
| 17241 |
+
},
|
| 17242 |
+
{
|
| 17243 |
+
"epoch": 0.8871595330739299,
|
| 17244 |
+
"grad_norm": NaN,
|
| 17245 |
+
"learning_rate": 0.00034383181516559085,
|
| 17246 |
+
"loss": 2.0041,
|
| 17247 |
+
"step": 49020
|
| 17248 |
+
},
|
| 17249 |
+
{
|
| 17250 |
+
"epoch": 0.8875214912677586,
|
| 17251 |
+
"grad_norm": NaN,
|
| 17252 |
+
"learning_rate": 0.00034383181516559085,
|
| 17253 |
+
"loss": 1.1767,
|
| 17254 |
+
"step": 49040
|
| 17255 |
+
},
|
| 17256 |
+
{
|
| 17257 |
+
"epoch": 0.8878834494615871,
|
| 17258 |
+
"grad_norm": NaN,
|
| 17259 |
+
"learning_rate": 0.00034383181516559085,
|
| 17260 |
+
"loss": 1.8047,
|
| 17261 |
+
"step": 49060
|
| 17262 |
+
},
|
| 17263 |
+
{
|
| 17264 |
+
"epoch": 0.8882454076554158,
|
| 17265 |
+
"grad_norm": NaN,
|
| 17266 |
+
"learning_rate": 0.00034383181516559085,
|
| 17267 |
+
"loss": 0.7776,
|
| 17268 |
+
"step": 49080
|
| 17269 |
+
},
|
| 17270 |
+
{
|
| 17271 |
+
"epoch": 0.8886073658492444,
|
| 17272 |
+
"grad_norm": NaN,
|
| 17273 |
+
"learning_rate": 0.00034383181516559085,
|
| 17274 |
+
"loss": 2.9128,
|
| 17275 |
+
"step": 49100
|
| 17276 |
+
},
|
| 17277 |
+
{
|
| 17278 |
+
"epoch": 0.888969324043073,
|
| 17279 |
+
"grad_norm": NaN,
|
| 17280 |
+
"learning_rate": 0.00034383181516559085,
|
| 17281 |
+
"loss": 3.8472,
|
| 17282 |
+
"step": 49120
|
| 17283 |
+
},
|
| 17284 |
+
{
|
| 17285 |
+
"epoch": 0.8893312822369016,
|
| 17286 |
+
"grad_norm": NaN,
|
| 17287 |
+
"learning_rate": 0.00034383181516559085,
|
| 17288 |
+
"loss": 1.9238,
|
| 17289 |
+
"step": 49140
|
| 17290 |
+
},
|
| 17291 |
+
{
|
| 17292 |
+
"epoch": 0.8896932404307303,
|
| 17293 |
+
"grad_norm": NaN,
|
| 17294 |
+
"learning_rate": 0.00034383181516559085,
|
| 17295 |
+
"loss": 0.8734,
|
| 17296 |
+
"step": 49160
|
| 17297 |
+
},
|
| 17298 |
+
{
|
| 17299 |
+
"epoch": 0.8900551986245588,
|
| 17300 |
+
"grad_norm": NaN,
|
| 17301 |
+
"learning_rate": 0.00034383181516559085,
|
| 17302 |
+
"loss": 1.8095,
|
| 17303 |
+
"step": 49180
|
| 17304 |
+
},
|
| 17305 |
+
{
|
| 17306 |
+
"epoch": 0.8904171568183875,
|
| 17307 |
+
"grad_norm": NaN,
|
| 17308 |
+
"learning_rate": 0.00034383181516559085,
|
| 17309 |
+
"loss": 3.3127,
|
| 17310 |
+
"step": 49200
|
| 17311 |
+
},
|
| 17312 |
+
{
|
| 17313 |
+
"epoch": 0.8907791150122161,
|
| 17314 |
+
"grad_norm": NaN,
|
| 17315 |
+
"learning_rate": 0.00034383181516559085,
|
| 17316 |
+
"loss": 0.3799,
|
| 17317 |
+
"step": 49220
|
| 17318 |
+
},
|
| 17319 |
+
{
|
| 17320 |
+
"epoch": 0.8911410732060447,
|
| 17321 |
+
"grad_norm": NaN,
|
| 17322 |
+
"learning_rate": 0.00034383181516559085,
|
| 17323 |
+
"loss": 1.4065,
|
| 17324 |
+
"step": 49240
|
| 17325 |
+
},
|
| 17326 |
+
{
|
| 17327 |
+
"epoch": 0.8915030313998733,
|
| 17328 |
+
"grad_norm": NaN,
|
| 17329 |
+
"learning_rate": 0.00034383181516559085,
|
| 17330 |
+
"loss": 1.4969,
|
| 17331 |
+
"step": 49260
|
| 17332 |
+
},
|
| 17333 |
+
{
|
| 17334 |
+
"epoch": 0.891864989593702,
|
| 17335 |
+
"grad_norm": NaN,
|
| 17336 |
+
"learning_rate": 0.00034383181516559085,
|
| 17337 |
+
"loss": 1.4145,
|
| 17338 |
+
"step": 49280
|
| 17339 |
+
},
|
| 17340 |
+
{
|
| 17341 |
+
"epoch": 0.8922269477875305,
|
| 17342 |
+
"grad_norm": NaN,
|
| 17343 |
+
"learning_rate": 0.00034383181516559085,
|
| 17344 |
+
"loss": 1.1898,
|
| 17345 |
+
"step": 49300
|
| 17346 |
+
},
|
| 17347 |
+
{
|
| 17348 |
+
"epoch": 0.8925889059813592,
|
| 17349 |
+
"grad_norm": NaN,
|
| 17350 |
+
"learning_rate": 0.00034383181516559085,
|
| 17351 |
+
"loss": 0.3764,
|
| 17352 |
+
"step": 49320
|
| 17353 |
+
},
|
| 17354 |
+
{
|
| 17355 |
+
"epoch": 0.8929508641751878,
|
| 17356 |
+
"grad_norm": NaN,
|
| 17357 |
+
"learning_rate": 0.00034383181516559085,
|
| 17358 |
+
"loss": 1.0612,
|
| 17359 |
+
"step": 49340
|
| 17360 |
+
},
|
| 17361 |
+
{
|
| 17362 |
+
"epoch": 0.8933128223690163,
|
| 17363 |
+
"grad_norm": NaN,
|
| 17364 |
+
"learning_rate": 0.00034383181516559085,
|
| 17365 |
+
"loss": 0.6792,
|
| 17366 |
+
"step": 49360
|
| 17367 |
+
},
|
| 17368 |
+
{
|
| 17369 |
+
"epoch": 0.893674780562845,
|
| 17370 |
+
"grad_norm": NaN,
|
| 17371 |
+
"learning_rate": 0.00034383181516559085,
|
| 17372 |
+
"loss": 1.563,
|
| 17373 |
+
"step": 49380
|
| 17374 |
+
},
|
| 17375 |
+
{
|
| 17376 |
+
"epoch": 0.8940367387566736,
|
| 17377 |
+
"grad_norm": NaN,
|
| 17378 |
+
"learning_rate": 0.00034383181516559085,
|
| 17379 |
+
"loss": 1.8383,
|
| 17380 |
+
"step": 49400
|
| 17381 |
+
},
|
| 17382 |
+
{
|
| 17383 |
+
"epoch": 0.8943986969505022,
|
| 17384 |
+
"grad_norm": NaN,
|
| 17385 |
+
"learning_rate": 0.00034383181516559085,
|
| 17386 |
+
"loss": 2.0042,
|
| 17387 |
+
"step": 49420
|
| 17388 |
+
},
|
| 17389 |
+
{
|
| 17390 |
+
"epoch": 0.8947606551443308,
|
| 17391 |
+
"grad_norm": NaN,
|
| 17392 |
+
"learning_rate": 0.00034383181516559085,
|
| 17393 |
+
"loss": 1.3138,
|
| 17394 |
+
"step": 49440
|
| 17395 |
+
},
|
| 17396 |
+
{
|
| 17397 |
+
"epoch": 0.8951226133381595,
|
| 17398 |
+
"grad_norm": NaN,
|
| 17399 |
+
"learning_rate": 0.00034383181516559085,
|
| 17400 |
+
"loss": 2.119,
|
| 17401 |
+
"step": 49460
|
| 17402 |
+
},
|
| 17403 |
+
{
|
| 17404 |
+
"epoch": 0.895484571531988,
|
| 17405 |
+
"grad_norm": NaN,
|
| 17406 |
+
"learning_rate": 0.00034383181516559085,
|
| 17407 |
+
"loss": 1.8186,
|
| 17408 |
+
"step": 49480
|
| 17409 |
+
},
|
| 17410 |
+
{
|
| 17411 |
+
"epoch": 0.8958465297258167,
|
| 17412 |
+
"grad_norm": NaN,
|
| 17413 |
+
"learning_rate": 0.0003438861072570429,
|
| 17414 |
+
"loss": 0.7511,
|
| 17415 |
+
"step": 49500
|
| 17416 |
+
},
|
| 17417 |
+
{
|
| 17418 |
+
"epoch": 0.8962084879196452,
|
| 17419 |
+
"grad_norm": NaN,
|
| 17420 |
+
"learning_rate": 0.0003438861072570429,
|
| 17421 |
+
"loss": 1.9937,
|
| 17422 |
+
"step": 49520
|
| 17423 |
+
},
|
| 17424 |
+
{
|
| 17425 |
+
"epoch": 0.8965704461134739,
|
| 17426 |
+
"grad_norm": NaN,
|
| 17427 |
+
"learning_rate": 0.0003438861072570429,
|
| 17428 |
+
"loss": 1.9893,
|
| 17429 |
+
"step": 49540
|
| 17430 |
+
},
|
| 17431 |
+
{
|
| 17432 |
+
"epoch": 0.8969324043073025,
|
| 17433 |
+
"grad_norm": NaN,
|
| 17434 |
+
"learning_rate": 0.0003439403993484949,
|
| 17435 |
+
"loss": 0.5879,
|
| 17436 |
+
"step": 49560
|
| 17437 |
+
},
|
| 17438 |
+
{
|
| 17439 |
+
"epoch": 0.8972943625011311,
|
| 17440 |
+
"grad_norm": NaN,
|
| 17441 |
+
"learning_rate": 0.0003439403993484949,
|
| 17442 |
+
"loss": 0.0,
|
| 17443 |
+
"step": 49580
|
| 17444 |
+
},
|
| 17445 |
+
{
|
| 17446 |
+
"epoch": 0.8976563206949597,
|
| 17447 |
+
"grad_norm": NaN,
|
| 17448 |
+
"learning_rate": 0.0003439403993484949,
|
| 17449 |
+
"loss": 1.9287,
|
| 17450 |
+
"step": 49600
|
| 17451 |
+
},
|
| 17452 |
+
{
|
| 17453 |
+
"epoch": 0.8980182788887884,
|
| 17454 |
+
"grad_norm": NaN,
|
| 17455 |
+
"learning_rate": 0.0003439403993484949,
|
| 17456 |
+
"loss": 2.0609,
|
| 17457 |
+
"step": 49620
|
| 17458 |
+
},
|
| 17459 |
+
{
|
| 17460 |
+
"epoch": 0.8983802370826169,
|
| 17461 |
+
"grad_norm": NaN,
|
| 17462 |
+
"learning_rate": 0.0003439403993484949,
|
| 17463 |
+
"loss": 2.6145,
|
| 17464 |
+
"step": 49640
|
| 17465 |
+
},
|
| 17466 |
+
{
|
| 17467 |
+
"epoch": 0.8987421952764456,
|
| 17468 |
+
"grad_norm": NaN,
|
| 17469 |
+
"learning_rate": 0.0003439403993484949,
|
| 17470 |
+
"loss": 3.2522,
|
| 17471 |
+
"step": 49660
|
| 17472 |
+
},
|
| 17473 |
+
{
|
| 17474 |
+
"epoch": 0.8991041534702742,
|
| 17475 |
+
"grad_norm": NaN,
|
| 17476 |
+
"learning_rate": 0.0003439403993484949,
|
| 17477 |
+
"loss": 2.7444,
|
| 17478 |
+
"step": 49680
|
| 17479 |
+
},
|
| 17480 |
+
{
|
| 17481 |
+
"epoch": 0.8994661116641028,
|
| 17482 |
+
"grad_norm": NaN,
|
| 17483 |
+
"learning_rate": 0.0003439403993484949,
|
| 17484 |
+
"loss": 4.5388,
|
| 17485 |
+
"step": 49700
|
| 17486 |
+
},
|
| 17487 |
+
{
|
| 17488 |
+
"epoch": 0.8998280698579314,
|
| 17489 |
+
"grad_norm": NaN,
|
| 17490 |
+
"learning_rate": 0.0003439403993484949,
|
| 17491 |
+
"loss": 1.8448,
|
| 17492 |
+
"step": 49720
|
| 17493 |
+
},
|
| 17494 |
+
{
|
| 17495 |
+
"epoch": 0.9001900280517601,
|
| 17496 |
+
"grad_norm": NaN,
|
| 17497 |
+
"learning_rate": 0.0003439403993484949,
|
| 17498 |
+
"loss": 1.514,
|
| 17499 |
+
"step": 49740
|
| 17500 |
+
},
|
| 17501 |
+
{
|
| 17502 |
+
"epoch": 0.9005519862455886,
|
| 17503 |
+
"grad_norm": NaN,
|
| 17504 |
+
"learning_rate": 0.0003439403993484949,
|
| 17505 |
+
"loss": 2.3255,
|
| 17506 |
+
"step": 49760
|
| 17507 |
+
},
|
| 17508 |
+
{
|
| 17509 |
+
"epoch": 0.9009139444394173,
|
| 17510 |
+
"grad_norm": NaN,
|
| 17511 |
+
"learning_rate": 0.0003439403993484949,
|
| 17512 |
+
"loss": 0.3116,
|
| 17513 |
+
"step": 49780
|
| 17514 |
+
},
|
| 17515 |
+
{
|
| 17516 |
+
"epoch": 0.9012759026332459,
|
| 17517 |
+
"grad_norm": NaN,
|
| 17518 |
+
"learning_rate": 0.0003439403993484949,
|
| 17519 |
+
"loss": 2.52,
|
| 17520 |
+
"step": 49800
|
| 17521 |
+
},
|
| 17522 |
+
{
|
| 17523 |
+
"epoch": 0.9016378608270744,
|
| 17524 |
+
"grad_norm": NaN,
|
| 17525 |
+
"learning_rate": 0.0003439403993484949,
|
| 17526 |
+
"loss": 3.8961,
|
| 17527 |
+
"step": 49820
|
| 17528 |
+
},
|
| 17529 |
+
{
|
| 17530 |
+
"epoch": 0.9019998190209031,
|
| 17531 |
+
"grad_norm": NaN,
|
| 17532 |
+
"learning_rate": 0.0003439403993484949,
|
| 17533 |
+
"loss": 2.786,
|
| 17534 |
+
"step": 49840
|
| 17535 |
+
},
|
| 17536 |
+
{
|
| 17537 |
+
"epoch": 0.9023617772147317,
|
| 17538 |
+
"grad_norm": NaN,
|
| 17539 |
+
"learning_rate": 0.0003439403993484949,
|
| 17540 |
+
"loss": 0.263,
|
| 17541 |
+
"step": 49860
|
| 17542 |
+
},
|
| 17543 |
+
{
|
| 17544 |
+
"epoch": 0.9027237354085603,
|
| 17545 |
+
"grad_norm": NaN,
|
| 17546 |
+
"learning_rate": 0.0003439403993484949,
|
| 17547 |
+
"loss": 1.4603,
|
| 17548 |
+
"step": 49880
|
| 17549 |
+
},
|
| 17550 |
+
{
|
| 17551 |
+
"epoch": 0.9030856936023889,
|
| 17552 |
+
"grad_norm": NaN,
|
| 17553 |
+
"learning_rate": 0.0003439403993484949,
|
| 17554 |
+
"loss": 2.8763,
|
| 17555 |
+
"step": 49900
|
| 17556 |
+
},
|
| 17557 |
+
{
|
| 17558 |
+
"epoch": 0.9034476517962176,
|
| 17559 |
+
"grad_norm": NaN,
|
| 17560 |
+
"learning_rate": 0.0003439403993484949,
|
| 17561 |
+
"loss": 2.9407,
|
| 17562 |
+
"step": 49920
|
| 17563 |
+
},
|
| 17564 |
+
{
|
| 17565 |
+
"epoch": 0.9038096099900461,
|
| 17566 |
+
"grad_norm": NaN,
|
| 17567 |
+
"learning_rate": 0.00034399469143994694,
|
| 17568 |
+
"loss": 3.1258,
|
| 17569 |
+
"step": 49940
|
| 17570 |
+
},
|
| 17571 |
+
{
|
| 17572 |
+
"epoch": 0.9041715681838748,
|
| 17573 |
+
"grad_norm": NaN,
|
| 17574 |
+
"learning_rate": 0.00034399469143994694,
|
| 17575 |
+
"loss": 0.8813,
|
| 17576 |
+
"step": 49960
|
| 17577 |
+
},
|
| 17578 |
+
{
|
| 17579 |
+
"epoch": 0.9045335263777033,
|
| 17580 |
+
"grad_norm": NaN,
|
| 17581 |
+
"learning_rate": 0.00034399469143994694,
|
| 17582 |
+
"loss": 1.0321,
|
| 17583 |
+
"step": 49980
|
| 17584 |
+
},
|
| 17585 |
+
{
|
| 17586 |
+
"epoch": 0.904895484571532,
|
| 17587 |
+
"grad_norm": NaN,
|
| 17588 |
+
"learning_rate": 0.00034399469143994694,
|
| 17589 |
+
"loss": 3.1083,
|
| 17590 |
+
"step": 50000
|
| 17591 |
+
},
|
| 17592 |
+
{
|
| 17593 |
+
"epoch": 0.904895484571532,
|
| 17594 |
+
"eval_accuracy": 4.551287535289588e-05,
|
| 17595 |
+
"eval_loss": NaN,
|
| 17596 |
+
"eval_runtime": 170.9007,
|
| 17597 |
+
"eval_samples_per_second": 3556.686,
|
| 17598 |
+
"eval_steps_per_second": 3.476,
|
| 17599 |
+
"step": 50000
|
| 17600 |
}
|
| 17601 |
],
|
| 17602 |
"logging_steps": 20,
|
|
|
|
| 17616 |
"attributes": {}
|
| 17617 |
}
|
| 17618 |
},
|
| 17619 |
+
"total_flos": 7.19181053952e+16,
|
| 17620 |
"train_batch_size": 512,
|
| 17621 |
"trial_name": null,
|
| 17622 |
"trial_params": null
|