Upload 10 files
Browse files- model.safetensors +1 -1
- optimizer.pt +1 -1
- rng_state.pth +1 -1
- scheduler.pt +1 -1
- trainer_state.json +1953 -3
- training_args.bin +1 -1
model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 598635032
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a710773cfd7f93749b548b4dc475790d75538b97475d047166dceb50704eb746
|
| 3 |
size 598635032
|
optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1197359627
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fedf1b4c8a508947f08f4a98315b58cd6a43e2a1adda4f18d9617c092f6a8844
|
| 3 |
size 1197359627
|
rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14645
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0412622810efe6fde95b3cfeff4557f637e942d79ee2fa68f136e7ee99e430b1
|
| 3 |
size 14645
|
scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1465
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f5967d2fde5e8af8b726d755ee2aea2a1a3996cd4db019463bea602f6a5c353f
|
| 3 |
size 1465
|
trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch":
|
| 6 |
"eval_steps": 1000,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -26769,6 +26769,1956 @@
|
|
| 26769 |
"eval_samples_per_second": 195.089,
|
| 26770 |
"eval_steps_per_second": 1.531,
|
| 26771 |
"step": 343000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26772 |
}
|
| 26773 |
],
|
| 26774 |
"logging_steps": 100,
|
|
@@ -26788,7 +28738,7 @@
|
|
| 26788 |
"attributes": {}
|
| 26789 |
}
|
| 26790 |
},
|
| 26791 |
-
"total_flos":
|
| 26792 |
"train_batch_size": 128,
|
| 26793 |
"trial_name": null,
|
| 26794 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 1.002796,
|
| 6 |
"eval_steps": 1000,
|
| 7 |
+
"global_step": 368000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 26769 |
"eval_samples_per_second": 195.089,
|
| 26770 |
"eval_steps_per_second": 1.531,
|
| 26771 |
"step": 343000
|
| 26772 |
+
},
|
| 26773 |
+
{
|
| 26774 |
+
"epoch": 0.0002,
|
| 26775 |
+
"grad_norm": 0.8649879693984985,
|
| 26776 |
+
"learning_rate": 1.1537548189140518e-05,
|
| 26777 |
+
"loss": 0.6746,
|
| 26778 |
+
"step": 343100
|
| 26779 |
+
},
|
| 26780 |
+
{
|
| 26781 |
+
"epoch": 0.0004,
|
| 26782 |
+
"grad_norm": 0.8530526161193848,
|
| 26783 |
+
"learning_rate": 1.1524181255002655e-05,
|
| 26784 |
+
"loss": 0.6714,
|
| 26785 |
+
"step": 343200
|
| 26786 |
+
},
|
| 26787 |
+
{
|
| 26788 |
+
"epoch": 0.0006,
|
| 26789 |
+
"grad_norm": 0.8391575813293457,
|
| 26790 |
+
"learning_rate": 1.1510819748922983e-05,
|
| 26791 |
+
"loss": 0.673,
|
| 26792 |
+
"step": 343300
|
| 26793 |
+
},
|
| 26794 |
+
{
|
| 26795 |
+
"epoch": 0.0008,
|
| 26796 |
+
"grad_norm": 0.8824005126953125,
|
| 26797 |
+
"learning_rate": 1.149746367628349e-05,
|
| 26798 |
+
"loss": 0.6745,
|
| 26799 |
+
"step": 343400
|
| 26800 |
+
},
|
| 26801 |
+
{
|
| 26802 |
+
"epoch": 0.001,
|
| 26803 |
+
"grad_norm": 0.9381487965583801,
|
| 26804 |
+
"learning_rate": 1.1484113042464018e-05,
|
| 26805 |
+
"loss": 0.6775,
|
| 26806 |
+
"step": 343500
|
| 26807 |
+
},
|
| 26808 |
+
{
|
| 26809 |
+
"epoch": 0.0012,
|
| 26810 |
+
"grad_norm": 0.8851874470710754,
|
| 26811 |
+
"learning_rate": 1.1470767852842192e-05,
|
| 26812 |
+
"loss": 0.6714,
|
| 26813 |
+
"step": 343600
|
| 26814 |
+
},
|
| 26815 |
+
{
|
| 26816 |
+
"epoch": 0.0014,
|
| 26817 |
+
"grad_norm": 0.8769415616989136,
|
| 26818 |
+
"learning_rate": 1.1457428112793467e-05,
|
| 26819 |
+
"loss": 0.6649,
|
| 26820 |
+
"step": 343700
|
| 26821 |
+
},
|
| 26822 |
+
{
|
| 26823 |
+
"epoch": 0.0016,
|
| 26824 |
+
"grad_norm": 0.8536527156829834,
|
| 26825 |
+
"learning_rate": 1.1444093827691072e-05,
|
| 26826 |
+
"loss": 0.6689,
|
| 26827 |
+
"step": 343800
|
| 26828 |
+
},
|
| 26829 |
+
{
|
| 26830 |
+
"epoch": 0.0018,
|
| 26831 |
+
"grad_norm": 0.8344665765762329,
|
| 26832 |
+
"learning_rate": 1.143076500290606e-05,
|
| 26833 |
+
"loss": 0.6714,
|
| 26834 |
+
"step": 343900
|
| 26835 |
+
},
|
| 26836 |
+
{
|
| 26837 |
+
"epoch": 0.002,
|
| 26838 |
+
"grad_norm": 0.857262372970581,
|
| 26839 |
+
"learning_rate": 1.141744164380728e-05,
|
| 26840 |
+
"loss": 0.668,
|
| 26841 |
+
"step": 344000
|
| 26842 |
+
},
|
| 26843 |
+
{
|
| 26844 |
+
"epoch": 0.002,
|
| 26845 |
+
"eval_loss": 2.0636377334594727,
|
| 26846 |
+
"eval_runtime": 52.1973,
|
| 26847 |
+
"eval_samples_per_second": 195.297,
|
| 26848 |
+
"eval_steps_per_second": 1.533,
|
| 26849 |
+
"step": 344000
|
| 26850 |
+
},
|
| 26851 |
+
{
|
| 26852 |
+
"epoch": 0.0022,
|
| 26853 |
+
"grad_norm": 0.9240826964378357,
|
| 26854 |
+
"learning_rate": 1.1404123755761394e-05,
|
| 26855 |
+
"loss": 0.6738,
|
| 26856 |
+
"step": 344100
|
| 26857 |
+
},
|
| 26858 |
+
{
|
| 26859 |
+
"epoch": 0.0024,
|
| 26860 |
+
"grad_norm": 0.864179790019989,
|
| 26861 |
+
"learning_rate": 1.1390811344132823e-05,
|
| 26862 |
+
"loss": 0.6675,
|
| 26863 |
+
"step": 344200
|
| 26864 |
+
},
|
| 26865 |
+
{
|
| 26866 |
+
"epoch": 0.0026,
|
| 26867 |
+
"grad_norm": 0.9233891367912292,
|
| 26868 |
+
"learning_rate": 1.1377504414283816e-05,
|
| 26869 |
+
"loss": 0.6683,
|
| 26870 |
+
"step": 344300
|
| 26871 |
+
},
|
| 26872 |
+
{
|
| 26873 |
+
"epoch": 0.0028,
|
| 26874 |
+
"grad_norm": 0.8253393769264221,
|
| 26875 |
+
"learning_rate": 1.13642029715744e-05,
|
| 26876 |
+
"loss": 0.6724,
|
| 26877 |
+
"step": 344400
|
| 26878 |
+
},
|
| 26879 |
+
{
|
| 26880 |
+
"epoch": 0.003,
|
| 26881 |
+
"grad_norm": 0.9402153491973877,
|
| 26882 |
+
"learning_rate": 1.1350907021362409e-05,
|
| 26883 |
+
"loss": 0.6686,
|
| 26884 |
+
"step": 344500
|
| 26885 |
+
},
|
| 26886 |
+
{
|
| 26887 |
+
"epoch": 0.0032,
|
| 26888 |
+
"grad_norm": 0.8452779054641724,
|
| 26889 |
+
"learning_rate": 1.1337616569003425e-05,
|
| 26890 |
+
"loss": 0.6776,
|
| 26891 |
+
"step": 344600
|
| 26892 |
+
},
|
| 26893 |
+
{
|
| 26894 |
+
"epoch": 0.0034,
|
| 26895 |
+
"grad_norm": 0.8500985503196716,
|
| 26896 |
+
"learning_rate": 1.1324331619850856e-05,
|
| 26897 |
+
"loss": 0.6654,
|
| 26898 |
+
"step": 344700
|
| 26899 |
+
},
|
| 26900 |
+
{
|
| 26901 |
+
"epoch": 0.0036,
|
| 26902 |
+
"grad_norm": 0.8803905248641968,
|
| 26903 |
+
"learning_rate": 1.1311052179255871e-05,
|
| 26904 |
+
"loss": 0.675,
|
| 26905 |
+
"step": 344800
|
| 26906 |
+
},
|
| 26907 |
+
{
|
| 26908 |
+
"epoch": 0.0038,
|
| 26909 |
+
"grad_norm": 0.9099257588386536,
|
| 26910 |
+
"learning_rate": 1.1297778252567443e-05,
|
| 26911 |
+
"loss": 0.6569,
|
| 26912 |
+
"step": 344900
|
| 26913 |
+
},
|
| 26914 |
+
{
|
| 26915 |
+
"epoch": 0.004,
|
| 26916 |
+
"grad_norm": 0.8804642558097839,
|
| 26917 |
+
"learning_rate": 1.1284509845132297e-05,
|
| 26918 |
+
"loss": 0.6655,
|
| 26919 |
+
"step": 345000
|
| 26920 |
+
},
|
| 26921 |
+
{
|
| 26922 |
+
"epoch": 0.004,
|
| 26923 |
+
"eval_loss": 2.05592942237854,
|
| 26924 |
+
"eval_runtime": 51.7883,
|
| 26925 |
+
"eval_samples_per_second": 196.84,
|
| 26926 |
+
"eval_steps_per_second": 1.545,
|
| 26927 |
+
"step": 345000
|
| 26928 |
+
},
|
| 26929 |
+
{
|
| 26930 |
+
"epoch": 0.0042,
|
| 26931 |
+
"grad_norm": 0.8482286930084229,
|
| 26932 |
+
"learning_rate": 1.1271246962294935e-05,
|
| 26933 |
+
"loss": 0.6641,
|
| 26934 |
+
"step": 345100
|
| 26935 |
+
},
|
| 26936 |
+
{
|
| 26937 |
+
"epoch": 0.0044,
|
| 26938 |
+
"grad_norm": 0.8636903166770935,
|
| 26939 |
+
"learning_rate": 1.1257989609397654e-05,
|
| 26940 |
+
"loss": 0.6632,
|
| 26941 |
+
"step": 345200
|
| 26942 |
+
},
|
| 26943 |
+
{
|
| 26944 |
+
"epoch": 0.0046,
|
| 26945 |
+
"grad_norm": 0.8937559723854065,
|
| 26946 |
+
"learning_rate": 1.1244737791780524e-05,
|
| 26947 |
+
"loss": 0.6634,
|
| 26948 |
+
"step": 345300
|
| 26949 |
+
},
|
| 26950 |
+
{
|
| 26951 |
+
"epoch": 0.0048,
|
| 26952 |
+
"grad_norm": 0.8914988040924072,
|
| 26953 |
+
"learning_rate": 1.123149151478136e-05,
|
| 26954 |
+
"loss": 0.6693,
|
| 26955 |
+
"step": 345400
|
| 26956 |
+
},
|
| 26957 |
+
{
|
| 26958 |
+
"epoch": 0.005,
|
| 26959 |
+
"grad_norm": 1.0172580480575562,
|
| 26960 |
+
"learning_rate": 1.1218250783735765e-05,
|
| 26961 |
+
"loss": 0.6605,
|
| 26962 |
+
"step": 345500
|
| 26963 |
+
},
|
| 26964 |
+
{
|
| 26965 |
+
"epoch": 0.0052,
|
| 26966 |
+
"grad_norm": 0.9080793857574463,
|
| 26967 |
+
"learning_rate": 1.1205015603977107e-05,
|
| 26968 |
+
"loss": 0.6706,
|
| 26969 |
+
"step": 345600
|
| 26970 |
+
},
|
| 26971 |
+
{
|
| 26972 |
+
"epoch": 0.0054,
|
| 26973 |
+
"grad_norm": 0.8460882306098938,
|
| 26974 |
+
"learning_rate": 1.1191785980836522e-05,
|
| 26975 |
+
"loss": 0.6701,
|
| 26976 |
+
"step": 345700
|
| 26977 |
+
},
|
| 26978 |
+
{
|
| 26979 |
+
"epoch": 0.0056,
|
| 26980 |
+
"grad_norm": 0.8949432373046875,
|
| 26981 |
+
"learning_rate": 1.1178561919642885e-05,
|
| 26982 |
+
"loss": 0.6571,
|
| 26983 |
+
"step": 345800
|
| 26984 |
+
},
|
| 26985 |
+
{
|
| 26986 |
+
"epoch": 0.0058,
|
| 26987 |
+
"grad_norm": 0.8934834599494934,
|
| 26988 |
+
"learning_rate": 1.1165343425722851e-05,
|
| 26989 |
+
"loss": 0.6621,
|
| 26990 |
+
"step": 345900
|
| 26991 |
+
},
|
| 26992 |
+
{
|
| 26993 |
+
"epoch": 0.006,
|
| 26994 |
+
"grad_norm": 0.8950237035751343,
|
| 26995 |
+
"learning_rate": 1.1152130504400834e-05,
|
| 26996 |
+
"loss": 0.6678,
|
| 26997 |
+
"step": 346000
|
| 26998 |
+
},
|
| 26999 |
+
{
|
| 27000 |
+
"epoch": 0.006,
|
| 27001 |
+
"eval_loss": 2.0553648471832275,
|
| 27002 |
+
"eval_runtime": 51.8108,
|
| 27003 |
+
"eval_samples_per_second": 196.754,
|
| 27004 |
+
"eval_steps_per_second": 1.544,
|
| 27005 |
+
"step": 346000
|
| 27006 |
+
},
|
| 27007 |
+
{
|
| 27008 |
+
"epoch": 0.0062,
|
| 27009 |
+
"grad_norm": 0.9523611068725586,
|
| 27010 |
+
"learning_rate": 1.1138923160999002e-05,
|
| 27011 |
+
"loss": 0.673,
|
| 27012 |
+
"step": 346100
|
| 27013 |
+
},
|
| 27014 |
+
{
|
| 27015 |
+
"epoch": 0.0064,
|
| 27016 |
+
"grad_norm": 0.874225914478302,
|
| 27017 |
+
"learning_rate": 1.1125721400837255e-05,
|
| 27018 |
+
"loss": 0.6609,
|
| 27019 |
+
"step": 346200
|
| 27020 |
+
},
|
| 27021 |
+
{
|
| 27022 |
+
"epoch": 0.0066,
|
| 27023 |
+
"grad_norm": 0.9157487750053406,
|
| 27024 |
+
"learning_rate": 1.1112525229233268e-05,
|
| 27025 |
+
"loss": 0.6622,
|
| 27026 |
+
"step": 346300
|
| 27027 |
+
},
|
| 27028 |
+
{
|
| 27029 |
+
"epoch": 0.0068,
|
| 27030 |
+
"grad_norm": 0.9365401864051819,
|
| 27031 |
+
"learning_rate": 1.1099334651502466e-05,
|
| 27032 |
+
"loss": 0.6603,
|
| 27033 |
+
"step": 346400
|
| 27034 |
+
},
|
| 27035 |
+
{
|
| 27036 |
+
"epoch": 0.007,
|
| 27037 |
+
"grad_norm": 0.9212621450424194,
|
| 27038 |
+
"learning_rate": 1.1086149672957993e-05,
|
| 27039 |
+
"loss": 0.6618,
|
| 27040 |
+
"step": 346500
|
| 27041 |
+
},
|
| 27042 |
+
{
|
| 27043 |
+
"epoch": 0.0072,
|
| 27044 |
+
"grad_norm": 0.9013537168502808,
|
| 27045 |
+
"learning_rate": 1.107297029891077e-05,
|
| 27046 |
+
"loss": 0.6665,
|
| 27047 |
+
"step": 346600
|
| 27048 |
+
},
|
| 27049 |
+
{
|
| 27050 |
+
"epoch": 0.0074,
|
| 27051 |
+
"grad_norm": 0.8723328709602356,
|
| 27052 |
+
"learning_rate": 1.1059796534669447e-05,
|
| 27053 |
+
"loss": 0.6548,
|
| 27054 |
+
"step": 346700
|
| 27055 |
+
},
|
| 27056 |
+
{
|
| 27057 |
+
"epoch": 0.0076,
|
| 27058 |
+
"grad_norm": 0.8133809566497803,
|
| 27059 |
+
"learning_rate": 1.1046628385540419e-05,
|
| 27060 |
+
"loss": 0.6352,
|
| 27061 |
+
"step": 346800
|
| 27062 |
+
},
|
| 27063 |
+
{
|
| 27064 |
+
"epoch": 0.0078,
|
| 27065 |
+
"grad_norm": 0.8866004347801208,
|
| 27066 |
+
"learning_rate": 1.1033465856827802e-05,
|
| 27067 |
+
"loss": 0.6679,
|
| 27068 |
+
"step": 346900
|
| 27069 |
+
},
|
| 27070 |
+
{
|
| 27071 |
+
"epoch": 0.008,
|
| 27072 |
+
"grad_norm": 0.9575750231742859,
|
| 27073 |
+
"learning_rate": 1.1020308953833467e-05,
|
| 27074 |
+
"loss": 0.6658,
|
| 27075 |
+
"step": 347000
|
| 27076 |
+
},
|
| 27077 |
+
{
|
| 27078 |
+
"epoch": 0.008,
|
| 27079 |
+
"eval_loss": 2.0689334869384766,
|
| 27080 |
+
"eval_runtime": 51.6857,
|
| 27081 |
+
"eval_samples_per_second": 197.231,
|
| 27082 |
+
"eval_steps_per_second": 1.548,
|
| 27083 |
+
"step": 347000
|
| 27084 |
+
},
|
| 27085 |
+
{
|
| 27086 |
+
"epoch": 0.0082,
|
| 27087 |
+
"grad_norm": 0.8472666144371033,
|
| 27088 |
+
"learning_rate": 1.100715768185701e-05,
|
| 27089 |
+
"loss": 0.6504,
|
| 27090 |
+
"step": 347100
|
| 27091 |
+
},
|
| 27092 |
+
{
|
| 27093 |
+
"epoch": 0.0084,
|
| 27094 |
+
"grad_norm": 0.8880901336669922,
|
| 27095 |
+
"learning_rate": 1.0994012046195779e-05,
|
| 27096 |
+
"loss": 0.6706,
|
| 27097 |
+
"step": 347200
|
| 27098 |
+
},
|
| 27099 |
+
{
|
| 27100 |
+
"epoch": 0.0086,
|
| 27101 |
+
"grad_norm": 0.8281514644622803,
|
| 27102 |
+
"learning_rate": 1.0980872052144809e-05,
|
| 27103 |
+
"loss": 0.6514,
|
| 27104 |
+
"step": 347300
|
| 27105 |
+
},
|
| 27106 |
+
{
|
| 27107 |
+
"epoch": 0.0088,
|
| 27108 |
+
"grad_norm": 0.8914335370063782,
|
| 27109 |
+
"learning_rate": 1.09677377049969e-05,
|
| 27110 |
+
"loss": 0.6526,
|
| 27111 |
+
"step": 347400
|
| 27112 |
+
},
|
| 27113 |
+
{
|
| 27114 |
+
"epoch": 0.009,
|
| 27115 |
+
"grad_norm": 0.9571097493171692,
|
| 27116 |
+
"learning_rate": 1.0954609010042568e-05,
|
| 27117 |
+
"loss": 0.6623,
|
| 27118 |
+
"step": 347500
|
| 27119 |
+
},
|
| 27120 |
+
{
|
| 27121 |
+
"epoch": 0.0092,
|
| 27122 |
+
"grad_norm": 0.9575111865997314,
|
| 27123 |
+
"learning_rate": 1.0941485972570053e-05,
|
| 27124 |
+
"loss": 0.6526,
|
| 27125 |
+
"step": 347600
|
| 27126 |
+
},
|
| 27127 |
+
{
|
| 27128 |
+
"epoch": 0.0094,
|
| 27129 |
+
"grad_norm": 0.7946931719779968,
|
| 27130 |
+
"learning_rate": 1.0928368597865298e-05,
|
| 27131 |
+
"loss": 0.6621,
|
| 27132 |
+
"step": 347700
|
| 27133 |
+
},
|
| 27134 |
+
{
|
| 27135 |
+
"epoch": 0.0096,
|
| 27136 |
+
"grad_norm": 0.901408851146698,
|
| 27137 |
+
"learning_rate": 1.0915256891211992e-05,
|
| 27138 |
+
"loss": 0.6575,
|
| 27139 |
+
"step": 347800
|
| 27140 |
+
},
|
| 27141 |
+
{
|
| 27142 |
+
"epoch": 0.0098,
|
| 27143 |
+
"grad_norm": 0.8669435977935791,
|
| 27144 |
+
"learning_rate": 1.0902150857891532e-05,
|
| 27145 |
+
"loss": 0.6603,
|
| 27146 |
+
"step": 347900
|
| 27147 |
+
},
|
| 27148 |
+
{
|
| 27149 |
+
"epoch": 0.01,
|
| 27150 |
+
"grad_norm": 0.8946738243103027,
|
| 27151 |
+
"learning_rate": 1.0889050503183016e-05,
|
| 27152 |
+
"loss": 0.6667,
|
| 27153 |
+
"step": 348000
|
| 27154 |
+
},
|
| 27155 |
+
{
|
| 27156 |
+
"epoch": 0.01,
|
| 27157 |
+
"eval_loss": 2.0592565536499023,
|
| 27158 |
+
"eval_runtime": 51.912,
|
| 27159 |
+
"eval_samples_per_second": 196.371,
|
| 27160 |
+
"eval_steps_per_second": 1.541,
|
| 27161 |
+
"step": 348000
|
| 27162 |
+
},
|
| 27163 |
+
{
|
| 27164 |
+
"epoch": 0.0102,
|
| 27165 |
+
"grad_norm": 0.8748307228088379,
|
| 27166 |
+
"learning_rate": 1.0875955832363266e-05,
|
| 27167 |
+
"loss": 0.6613,
|
| 27168 |
+
"step": 348100
|
| 27169 |
+
},
|
| 27170 |
+
{
|
| 27171 |
+
"epoch": 0.0104,
|
| 27172 |
+
"grad_norm": 0.846490740776062,
|
| 27173 |
+
"learning_rate": 1.0862866850706818e-05,
|
| 27174 |
+
"loss": 0.6577,
|
| 27175 |
+
"step": 348200
|
| 27176 |
+
},
|
| 27177 |
+
{
|
| 27178 |
+
"epoch": 0.0106,
|
| 27179 |
+
"grad_norm": 0.860930323600769,
|
| 27180 |
+
"learning_rate": 1.0849783563485921e-05,
|
| 27181 |
+
"loss": 0.6552,
|
| 27182 |
+
"step": 348300
|
| 27183 |
+
},
|
| 27184 |
+
{
|
| 27185 |
+
"epoch": 0.0108,
|
| 27186 |
+
"grad_norm": 0.8625341653823853,
|
| 27187 |
+
"learning_rate": 1.0836705975970504e-05,
|
| 27188 |
+
"loss": 0.6437,
|
| 27189 |
+
"step": 348400
|
| 27190 |
+
},
|
| 27191 |
+
{
|
| 27192 |
+
"epoch": 0.011,
|
| 27193 |
+
"grad_norm": 0.8479413986206055,
|
| 27194 |
+
"learning_rate": 1.0823634093428226e-05,
|
| 27195 |
+
"loss": 0.664,
|
| 27196 |
+
"step": 348500
|
| 27197 |
+
},
|
| 27198 |
+
{
|
| 27199 |
+
"epoch": 0.0112,
|
| 27200 |
+
"grad_norm": 0.9355835914611816,
|
| 27201 |
+
"learning_rate": 1.0810567921124436e-05,
|
| 27202 |
+
"loss": 0.6606,
|
| 27203 |
+
"step": 348600
|
| 27204 |
+
},
|
| 27205 |
+
{
|
| 27206 |
+
"epoch": 0.0114,
|
| 27207 |
+
"grad_norm": 0.9027217626571655,
|
| 27208 |
+
"learning_rate": 1.0797507464322203e-05,
|
| 27209 |
+
"loss": 0.6509,
|
| 27210 |
+
"step": 348700
|
| 27211 |
+
},
|
| 27212 |
+
{
|
| 27213 |
+
"epoch": 0.0116,
|
| 27214 |
+
"grad_norm": 0.8765237927436829,
|
| 27215 |
+
"learning_rate": 1.0784452728282257e-05,
|
| 27216 |
+
"loss": 0.6564,
|
| 27217 |
+
"step": 348800
|
| 27218 |
+
},
|
| 27219 |
+
{
|
| 27220 |
+
"epoch": 0.0118,
|
| 27221 |
+
"grad_norm": 0.9060245156288147,
|
| 27222 |
+
"learning_rate": 1.0771403718263051e-05,
|
| 27223 |
+
"loss": 0.6555,
|
| 27224 |
+
"step": 348900
|
| 27225 |
+
},
|
| 27226 |
+
{
|
| 27227 |
+
"epoch": 0.012,
|
| 27228 |
+
"grad_norm": 0.9202615022659302,
|
| 27229 |
+
"learning_rate": 1.0758360439520727e-05,
|
| 27230 |
+
"loss": 0.6522,
|
| 27231 |
+
"step": 349000
|
| 27232 |
+
},
|
| 27233 |
+
{
|
| 27234 |
+
"epoch": 0.012,
|
| 27235 |
+
"eval_loss": 2.057035207748413,
|
| 27236 |
+
"eval_runtime": 51.8702,
|
| 27237 |
+
"eval_samples_per_second": 196.529,
|
| 27238 |
+
"eval_steps_per_second": 1.542,
|
| 27239 |
+
"step": 349000
|
| 27240 |
+
},
|
| 27241 |
+
{
|
| 27242 |
+
"epoch": 0.0122,
|
| 27243 |
+
"grad_norm": 0.8476743102073669,
|
| 27244 |
+
"learning_rate": 1.0745322897309124e-05,
|
| 27245 |
+
"loss": 0.6623,
|
| 27246 |
+
"step": 349100
|
| 27247 |
+
},
|
| 27248 |
+
{
|
| 27249 |
+
"epoch": 0.0124,
|
| 27250 |
+
"grad_norm": 0.9493403434753418,
|
| 27251 |
+
"learning_rate": 1.073229109687974e-05,
|
| 27252 |
+
"loss": 0.6697,
|
| 27253 |
+
"step": 349200
|
| 27254 |
+
},
|
| 27255 |
+
{
|
| 27256 |
+
"epoch": 0.0126,
|
| 27257 |
+
"grad_norm": 0.8388432860374451,
|
| 27258 |
+
"learning_rate": 1.07192650434818e-05,
|
| 27259 |
+
"loss": 0.6494,
|
| 27260 |
+
"step": 349300
|
| 27261 |
+
},
|
| 27262 |
+
{
|
| 27263 |
+
"epoch": 0.0128,
|
| 27264 |
+
"grad_norm": 0.9042513966560364,
|
| 27265 |
+
"learning_rate": 1.0706244742362192e-05,
|
| 27266 |
+
"loss": 0.6473,
|
| 27267 |
+
"step": 349400
|
| 27268 |
+
},
|
| 27269 |
+
{
|
| 27270 |
+
"epoch": 0.013,
|
| 27271 |
+
"grad_norm": 0.8294413089752197,
|
| 27272 |
+
"learning_rate": 1.06932301987655e-05,
|
| 27273 |
+
"loss": 0.6652,
|
| 27274 |
+
"step": 349500
|
| 27275 |
+
},
|
| 27276 |
+
{
|
| 27277 |
+
"epoch": 0.0132,
|
| 27278 |
+
"grad_norm": 0.9279148578643799,
|
| 27279 |
+
"learning_rate": 1.0680221417933963e-05,
|
| 27280 |
+
"loss": 0.6506,
|
| 27281 |
+
"step": 349600
|
| 27282 |
+
},
|
| 27283 |
+
{
|
| 27284 |
+
"epoch": 0.0134,
|
| 27285 |
+
"grad_norm": 0.8778104782104492,
|
| 27286 |
+
"learning_rate": 1.066721840510753e-05,
|
| 27287 |
+
"loss": 0.663,
|
| 27288 |
+
"step": 349700
|
| 27289 |
+
},
|
| 27290 |
+
{
|
| 27291 |
+
"epoch": 0.0136,
|
| 27292 |
+
"grad_norm": 0.8701128959655762,
|
| 27293 |
+
"learning_rate": 1.0654221165523817e-05,
|
| 27294 |
+
"loss": 0.6605,
|
| 27295 |
+
"step": 349800
|
| 27296 |
+
},
|
| 27297 |
+
{
|
| 27298 |
+
"epoch": 0.0138,
|
| 27299 |
+
"grad_norm": 0.9396702647209167,
|
| 27300 |
+
"learning_rate": 1.0641229704418093e-05,
|
| 27301 |
+
"loss": 0.658,
|
| 27302 |
+
"step": 349900
|
| 27303 |
+
},
|
| 27304 |
+
{
|
| 27305 |
+
"epoch": 0.014,
|
| 27306 |
+
"grad_norm": 0.891123354434967,
|
| 27307 |
+
"learning_rate": 1.0628244027023329e-05,
|
| 27308 |
+
"loss": 0.6186,
|
| 27309 |
+
"step": 350000
|
| 27310 |
+
},
|
| 27311 |
+
{
|
| 27312 |
+
"epoch": 0.014,
|
| 27313 |
+
"eval_loss": 2.059767961502075,
|
| 27314 |
+
"eval_runtime": 51.9881,
|
| 27315 |
+
"eval_samples_per_second": 196.083,
|
| 27316 |
+
"eval_steps_per_second": 1.539,
|
| 27317 |
+
"step": 350000
|
| 27318 |
+
},
|
| 27319 |
+
{
|
| 27320 |
+
"epoch": 0.0142,
|
| 27321 |
+
"grad_norm": 0.8995864391326904,
|
| 27322 |
+
"learning_rate": 1.061526413857015e-05,
|
| 27323 |
+
"loss": 0.6545,
|
| 27324 |
+
"step": 350100
|
| 27325 |
+
},
|
| 27326 |
+
{
|
| 27327 |
+
"epoch": 0.0144,
|
| 27328 |
+
"grad_norm": 0.8432427048683167,
|
| 27329 |
+
"learning_rate": 1.0602290044286866e-05,
|
| 27330 |
+
"loss": 0.6527,
|
| 27331 |
+
"step": 350200
|
| 27332 |
+
},
|
| 27333 |
+
{
|
| 27334 |
+
"epoch": 0.0146,
|
| 27335 |
+
"grad_norm": 0.8539645671844482,
|
| 27336 |
+
"learning_rate": 1.058932174939942e-05,
|
| 27337 |
+
"loss": 0.66,
|
| 27338 |
+
"step": 350300
|
| 27339 |
+
},
|
| 27340 |
+
{
|
| 27341 |
+
"epoch": 0.0148,
|
| 27342 |
+
"grad_norm": 0.8698434233665466,
|
| 27343 |
+
"learning_rate": 1.0576359259131452e-05,
|
| 27344 |
+
"loss": 0.6686,
|
| 27345 |
+
"step": 350400
|
| 27346 |
+
},
|
| 27347 |
+
{
|
| 27348 |
+
"epoch": 0.015,
|
| 27349 |
+
"grad_norm": 0.8616706728935242,
|
| 27350 |
+
"learning_rate": 1.0563402578704248e-05,
|
| 27351 |
+
"loss": 0.6605,
|
| 27352 |
+
"step": 350500
|
| 27353 |
+
},
|
| 27354 |
+
{
|
| 27355 |
+
"epoch": 0.0152,
|
| 27356 |
+
"grad_norm": 0.891680121421814,
|
| 27357 |
+
"learning_rate": 1.0550451713336768e-05,
|
| 27358 |
+
"loss": 0.6471,
|
| 27359 |
+
"step": 350600
|
| 27360 |
+
},
|
| 27361 |
+
{
|
| 27362 |
+
"epoch": 0.0154,
|
| 27363 |
+
"grad_norm": 0.9290798306465149,
|
| 27364 |
+
"learning_rate": 1.05375066682456e-05,
|
| 27365 |
+
"loss": 0.6575,
|
| 27366 |
+
"step": 350700
|
| 27367 |
+
},
|
| 27368 |
+
{
|
| 27369 |
+
"epoch": 0.0156,
|
| 27370 |
+
"grad_norm": 0.8489027619361877,
|
| 27371 |
+
"learning_rate": 1.0524567448645018e-05,
|
| 27372 |
+
"loss": 0.6484,
|
| 27373 |
+
"step": 350800
|
| 27374 |
+
},
|
| 27375 |
+
{
|
| 27376 |
+
"epoch": 0.0158,
|
| 27377 |
+
"grad_norm": 0.8927240371704102,
|
| 27378 |
+
"learning_rate": 1.0511634059746935e-05,
|
| 27379 |
+
"loss": 0.6637,
|
| 27380 |
+
"step": 350900
|
| 27381 |
+
},
|
| 27382 |
+
{
|
| 27383 |
+
"epoch": 0.016,
|
| 27384 |
+
"grad_norm": 0.8975149393081665,
|
| 27385 |
+
"learning_rate": 1.0498706506760933e-05,
|
| 27386 |
+
"loss": 0.6729,
|
| 27387 |
+
"step": 351000
|
| 27388 |
+
},
|
| 27389 |
+
{
|
| 27390 |
+
"epoch": 0.016,
|
| 27391 |
+
"eval_loss": 2.0625927448272705,
|
| 27392 |
+
"eval_runtime": 52.1361,
|
| 27393 |
+
"eval_samples_per_second": 195.527,
|
| 27394 |
+
"eval_steps_per_second": 1.534,
|
| 27395 |
+
"step": 351000
|
| 27396 |
+
},
|
| 27397 |
+
{
|
| 27398 |
+
"epoch": 0.0162,
|
| 27399 |
+
"grad_norm": 0.8605362176895142,
|
| 27400 |
+
"learning_rate": 1.0485784794894205e-05,
|
| 27401 |
+
"loss": 0.6494,
|
| 27402 |
+
"step": 351100
|
| 27403 |
+
},
|
| 27404 |
+
{
|
| 27405 |
+
"epoch": 0.0164,
|
| 27406 |
+
"grad_norm": 0.9211152791976929,
|
| 27407 |
+
"learning_rate": 1.0472868929351622e-05,
|
| 27408 |
+
"loss": 0.6661,
|
| 27409 |
+
"step": 351200
|
| 27410 |
+
},
|
| 27411 |
+
{
|
| 27412 |
+
"epoch": 0.0166,
|
| 27413 |
+
"grad_norm": 0.9342173337936401,
|
| 27414 |
+
"learning_rate": 1.045995891533571e-05,
|
| 27415 |
+
"loss": 0.6567,
|
| 27416 |
+
"step": 351300
|
| 27417 |
+
},
|
| 27418 |
+
{
|
| 27419 |
+
"epoch": 0.0168,
|
| 27420 |
+
"grad_norm": 0.9137123227119446,
|
| 27421 |
+
"learning_rate": 1.0447054758046598e-05,
|
| 27422 |
+
"loss": 0.6396,
|
| 27423 |
+
"step": 351400
|
| 27424 |
+
},
|
| 27425 |
+
{
|
| 27426 |
+
"epoch": 0.017,
|
| 27427 |
+
"grad_norm": 0.9604211449623108,
|
| 27428 |
+
"learning_rate": 1.043415646268209e-05,
|
| 27429 |
+
"loss": 0.6496,
|
| 27430 |
+
"step": 351500
|
| 27431 |
+
},
|
| 27432 |
+
{
|
| 27433 |
+
"epoch": 0.0172,
|
| 27434 |
+
"grad_norm": 0.8666329979896545,
|
| 27435 |
+
"learning_rate": 1.0421264034437616e-05,
|
| 27436 |
+
"loss": 0.664,
|
| 27437 |
+
"step": 351600
|
| 27438 |
+
},
|
| 27439 |
+
{
|
| 27440 |
+
"epoch": 0.0174,
|
| 27441 |
+
"grad_norm": 0.86720871925354,
|
| 27442 |
+
"learning_rate": 1.0408377478506253e-05,
|
| 27443 |
+
"loss": 0.657,
|
| 27444 |
+
"step": 351700
|
| 27445 |
+
},
|
| 27446 |
+
{
|
| 27447 |
+
"epoch": 0.0176,
|
| 27448 |
+
"grad_norm": 0.9042288064956665,
|
| 27449 |
+
"learning_rate": 1.0395496800078692e-05,
|
| 27450 |
+
"loss": 0.6564,
|
| 27451 |
+
"step": 351800
|
| 27452 |
+
},
|
| 27453 |
+
{
|
| 27454 |
+
"epoch": 0.0178,
|
| 27455 |
+
"grad_norm": 0.9693347811698914,
|
| 27456 |
+
"learning_rate": 1.038262200434327e-05,
|
| 27457 |
+
"loss": 0.644,
|
| 27458 |
+
"step": 351900
|
| 27459 |
+
},
|
| 27460 |
+
{
|
| 27461 |
+
"epoch": 0.018,
|
| 27462 |
+
"grad_norm": 0.8999383449554443,
|
| 27463 |
+
"learning_rate": 1.0369753096485957e-05,
|
| 27464 |
+
"loss": 0.6534,
|
| 27465 |
+
"step": 352000
|
| 27466 |
+
},
|
| 27467 |
+
{
|
| 27468 |
+
"epoch": 0.018,
|
| 27469 |
+
"eval_loss": 2.0669960975646973,
|
| 27470 |
+
"eval_runtime": 52.2938,
|
| 27471 |
+
"eval_samples_per_second": 194.937,
|
| 27472 |
+
"eval_steps_per_second": 1.53,
|
| 27473 |
+
"step": 352000
|
| 27474 |
+
},
|
| 27475 |
+
{
|
| 27476 |
+
"epoch": 0.0182,
|
| 27477 |
+
"grad_norm": 0.907943844795227,
|
| 27478 |
+
"learning_rate": 1.0356890081690356e-05,
|
| 27479 |
+
"loss": 0.6459,
|
| 27480 |
+
"step": 352100
|
| 27481 |
+
},
|
| 27482 |
+
{
|
| 27483 |
+
"epoch": 0.0184,
|
| 27484 |
+
"grad_norm": 0.866569995880127,
|
| 27485 |
+
"learning_rate": 1.034403296513767e-05,
|
| 27486 |
+
"loss": 0.6519,
|
| 27487 |
+
"step": 352200
|
| 27488 |
+
},
|
| 27489 |
+
{
|
| 27490 |
+
"epoch": 0.0186,
|
| 27491 |
+
"grad_norm": 0.904236376285553,
|
| 27492 |
+
"learning_rate": 1.0331181752006755e-05,
|
| 27493 |
+
"loss": 0.6554,
|
| 27494 |
+
"step": 352300
|
| 27495 |
+
},
|
| 27496 |
+
{
|
| 27497 |
+
"epoch": 0.0188,
|
| 27498 |
+
"grad_norm": 0.9165827035903931,
|
| 27499 |
+
"learning_rate": 1.0318336447474075e-05,
|
| 27500 |
+
"loss": 0.6773,
|
| 27501 |
+
"step": 352400
|
| 27502 |
+
},
|
| 27503 |
+
{
|
| 27504 |
+
"epoch": 0.019,
|
| 27505 |
+
"grad_norm": 0.8540114164352417,
|
| 27506 |
+
"learning_rate": 1.0305497056713726e-05,
|
| 27507 |
+
"loss": 0.6529,
|
| 27508 |
+
"step": 352500
|
| 27509 |
+
},
|
| 27510 |
+
{
|
| 27511 |
+
"epoch": 0.0192,
|
| 27512 |
+
"grad_norm": 0.9309752583503723,
|
| 27513 |
+
"learning_rate": 1.0292663584897396e-05,
|
| 27514 |
+
"loss": 0.6535,
|
| 27515 |
+
"step": 352600
|
| 27516 |
+
},
|
| 27517 |
+
{
|
| 27518 |
+
"epoch": 0.0194,
|
| 27519 |
+
"grad_norm": 0.8861046433448792,
|
| 27520 |
+
"learning_rate": 1.0279836037194417e-05,
|
| 27521 |
+
"loss": 0.6607,
|
| 27522 |
+
"step": 352700
|
| 27523 |
+
},
|
| 27524 |
+
{
|
| 27525 |
+
"epoch": 0.0196,
|
| 27526 |
+
"grad_norm": 0.9103682637214661,
|
| 27527 |
+
"learning_rate": 1.026701441877173e-05,
|
| 27528 |
+
"loss": 0.6708,
|
| 27529 |
+
"step": 352800
|
| 27530 |
+
},
|
| 27531 |
+
{
|
| 27532 |
+
"epoch": 0.0198,
|
| 27533 |
+
"grad_norm": 0.9763253927230835,
|
| 27534 |
+
"learning_rate": 1.0254198734793865e-05,
|
| 27535 |
+
"loss": 0.6319,
|
| 27536 |
+
"step": 352900
|
| 27537 |
+
},
|
| 27538 |
+
{
|
| 27539 |
+
"epoch": 0.02,
|
| 27540 |
+
"grad_norm": 0.8923797011375427,
|
| 27541 |
+
"learning_rate": 1.0241388990422986e-05,
|
| 27542 |
+
"loss": 0.6605,
|
| 27543 |
+
"step": 353000
|
| 27544 |
+
},
|
| 27545 |
+
{
|
| 27546 |
+
"epoch": 0.02,
|
| 27547 |
+
"eval_loss": 2.066145658493042,
|
| 27548 |
+
"eval_runtime": 52.3003,
|
| 27549 |
+
"eval_samples_per_second": 194.913,
|
| 27550 |
+
"eval_steps_per_second": 1.53,
|
| 27551 |
+
"step": 353000
|
| 27552 |
+
},
|
| 27553 |
+
{
|
| 27554 |
+
"epoch": 0.0202,
|
| 27555 |
+
"grad_norm": 0.8869938850402832,
|
| 27556 |
+
"learning_rate": 1.0228585190818857e-05,
|
| 27557 |
+
"loss": 0.6594,
|
| 27558 |
+
"step": 353100
|
| 27559 |
+
},
|
| 27560 |
+
{
|
| 27561 |
+
"epoch": 0.0204,
|
| 27562 |
+
"grad_norm": 0.8605444431304932,
|
| 27563 |
+
"learning_rate": 1.0215787341138854e-05,
|
| 27564 |
+
"loss": 0.664,
|
| 27565 |
+
"step": 353200
|
| 27566 |
+
},
|
| 27567 |
+
{
|
| 27568 |
+
"epoch": 0.0206,
|
| 27569 |
+
"grad_norm": 1.001497745513916,
|
| 27570 |
+
"learning_rate": 1.0202995446537933e-05,
|
| 27571 |
+
"loss": 0.6574,
|
| 27572 |
+
"step": 353300
|
| 27573 |
+
},
|
| 27574 |
+
{
|
| 27575 |
+
"epoch": 0.0208,
|
| 27576 |
+
"grad_norm": 0.8902758359909058,
|
| 27577 |
+
"learning_rate": 1.0190209512168677e-05,
|
| 27578 |
+
"loss": 0.6536,
|
| 27579 |
+
"step": 353400
|
| 27580 |
+
},
|
| 27581 |
+
{
|
| 27582 |
+
"epoch": 0.021,
|
| 27583 |
+
"grad_norm": 0.9075655341148376,
|
| 27584 |
+
"learning_rate": 1.017742954318127e-05,
|
| 27585 |
+
"loss": 0.6545,
|
| 27586 |
+
"step": 353500
|
| 27587 |
+
},
|
| 27588 |
+
{
|
| 27589 |
+
"epoch": 0.0212,
|
| 27590 |
+
"grad_norm": 0.9329447746276855,
|
| 27591 |
+
"learning_rate": 1.016465554472346e-05,
|
| 27592 |
+
"loss": 0.6589,
|
| 27593 |
+
"step": 353600
|
| 27594 |
+
},
|
| 27595 |
+
{
|
| 27596 |
+
"epoch": 0.0214,
|
| 27597 |
+
"grad_norm": 0.8853082656860352,
|
| 27598 |
+
"learning_rate": 1.0151887521940628e-05,
|
| 27599 |
+
"loss": 0.6532,
|
| 27600 |
+
"step": 353700
|
| 27601 |
+
},
|
| 27602 |
+
{
|
| 27603 |
+
"epoch": 0.0216,
|
| 27604 |
+
"grad_norm": 0.8958137631416321,
|
| 27605 |
+
"learning_rate": 1.0139125479975722e-05,
|
| 27606 |
+
"loss": 0.6563,
|
| 27607 |
+
"step": 353800
|
| 27608 |
+
},
|
| 27609 |
+
{
|
| 27610 |
+
"epoch": 0.0218,
|
| 27611 |
+
"grad_norm": 0.865190863609314,
|
| 27612 |
+
"learning_rate": 1.0126369423969293e-05,
|
| 27613 |
+
"loss": 0.6585,
|
| 27614 |
+
"step": 353900
|
| 27615 |
+
},
|
| 27616 |
+
{
|
| 27617 |
+
"epoch": 0.022,
|
| 27618 |
+
"grad_norm": 0.9948294162750244,
|
| 27619 |
+
"learning_rate": 1.0113619359059482e-05,
|
| 27620 |
+
"loss": 0.65,
|
| 27621 |
+
"step": 354000
|
| 27622 |
+
},
|
| 27623 |
+
{
|
| 27624 |
+
"epoch": 0.022,
|
| 27625 |
+
"eval_loss": 2.085937976837158,
|
| 27626 |
+
"eval_runtime": 52.093,
|
| 27627 |
+
"eval_samples_per_second": 195.689,
|
| 27628 |
+
"eval_steps_per_second": 1.536,
|
| 27629 |
+
"step": 354000
|
| 27630 |
+
},
|
| 27631 |
+
{
|
| 27632 |
+
"epoch": 0.0222,
|
| 27633 |
+
"grad_norm": 0.9526733160018921,
|
| 27634 |
+
"learning_rate": 1.0100875290382022e-05,
|
| 27635 |
+
"loss": 0.6509,
|
| 27636 |
+
"step": 354100
|
| 27637 |
+
},
|
| 27638 |
+
{
|
| 27639 |
+
"epoch": 0.0224,
|
| 27640 |
+
"grad_norm": 0.8897534608840942,
|
| 27641 |
+
"learning_rate": 1.0088137223070205e-05,
|
| 27642 |
+
"loss": 0.6609,
|
| 27643 |
+
"step": 354200
|
| 27644 |
+
},
|
| 27645 |
+
{
|
| 27646 |
+
"epoch": 0.0226,
|
| 27647 |
+
"grad_norm": 0.8177494406700134,
|
| 27648 |
+
"learning_rate": 1.007540516225493e-05,
|
| 27649 |
+
"loss": 0.6531,
|
| 27650 |
+
"step": 354300
|
| 27651 |
+
},
|
| 27652 |
+
{
|
| 27653 |
+
"epoch": 0.0228,
|
| 27654 |
+
"grad_norm": 0.9328579306602478,
|
| 27655 |
+
"learning_rate": 1.006267911306468e-05,
|
| 27656 |
+
"loss": 0.7497,
|
| 27657 |
+
"step": 354400
|
| 27658 |
+
},
|
| 27659 |
+
{
|
| 27660 |
+
"epoch": 0.023,
|
| 27661 |
+
"grad_norm": 0.8657885193824768,
|
| 27662 |
+
"learning_rate": 1.004995908062549e-05,
|
| 27663 |
+
"loss": 0.7346,
|
| 27664 |
+
"step": 354500
|
| 27665 |
+
},
|
| 27666 |
+
{
|
| 27667 |
+
"epoch": 0.0232,
|
| 27668 |
+
"grad_norm": 0.8872801661491394,
|
| 27669 |
+
"learning_rate": 1.0037245070060991e-05,
|
| 27670 |
+
"loss": 0.7475,
|
| 27671 |
+
"step": 354600
|
| 27672 |
+
},
|
| 27673 |
+
{
|
| 27674 |
+
"epoch": 0.0234,
|
| 27675 |
+
"grad_norm": 0.8421425223350525,
|
| 27676 |
+
"learning_rate": 1.002453708649239e-05,
|
| 27677 |
+
"loss": 0.7338,
|
| 27678 |
+
"step": 354700
|
| 27679 |
+
},
|
| 27680 |
+
{
|
| 27681 |
+
"epoch": 0.0236,
|
| 27682 |
+
"grad_norm": 0.8456546068191528,
|
| 27683 |
+
"learning_rate": 1.0011835135038469e-05,
|
| 27684 |
+
"loss": 0.7163,
|
| 27685 |
+
"step": 354800
|
| 27686 |
+
},
|
| 27687 |
+
{
|
| 27688 |
+
"epoch": 0.0238,
|
| 27689 |
+
"grad_norm": 0.9232527613639832,
|
| 27690 |
+
"learning_rate": 9.999139220815554e-06,
|
| 27691 |
+
"loss": 0.715,
|
| 27692 |
+
"step": 354900
|
| 27693 |
+
},
|
| 27694 |
+
{
|
| 27695 |
+
"epoch": 0.024,
|
| 27696 |
+
"grad_norm": 0.8569039702415466,
|
| 27697 |
+
"learning_rate": 9.986449348937568e-06,
|
| 27698 |
+
"loss": 0.7392,
|
| 27699 |
+
"step": 355000
|
| 27700 |
+
},
|
| 27701 |
+
{
|
| 27702 |
+
"epoch": 0.024,
|
| 27703 |
+
"eval_loss": 2.056723117828369,
|
| 27704 |
+
"eval_runtime": 52.2992,
|
| 27705 |
+
"eval_samples_per_second": 194.917,
|
| 27706 |
+
"eval_steps_per_second": 1.53,
|
| 27707 |
+
"step": 355000
|
| 27708 |
+
},
|
| 27709 |
+
{
|
| 27710 |
+
"epoch": 0.0242,
|
| 27711 |
+
"grad_norm": 0.8463347554206848,
|
| 27712 |
+
"learning_rate": 9.973765524515988e-06,
|
| 27713 |
+
"loss": 0.719,
|
| 27714 |
+
"step": 355100
|
| 27715 |
+
},
|
| 27716 |
+
{
|
| 27717 |
+
"epoch": 0.0244,
|
| 27718 |
+
"grad_norm": 0.9859148263931274,
|
| 27719 |
+
"learning_rate": 9.961087752659866e-06,
|
| 27720 |
+
"loss": 0.7161,
|
| 27721 |
+
"step": 355200
|
| 27722 |
+
},
|
| 27723 |
+
{
|
| 27724 |
+
"epoch": 0.0246,
|
| 27725 |
+
"grad_norm": 0.8795856833457947,
|
| 27726 |
+
"learning_rate": 9.94841603847579e-06,
|
| 27727 |
+
"loss": 0.7211,
|
| 27728 |
+
"step": 355300
|
| 27729 |
+
},
|
| 27730 |
+
{
|
| 27731 |
+
"epoch": 0.0248,
|
| 27732 |
+
"grad_norm": 0.8623588681221008,
|
| 27733 |
+
"learning_rate": 9.935750387067935e-06,
|
| 27734 |
+
"loss": 0.7134,
|
| 27735 |
+
"step": 355400
|
| 27736 |
+
},
|
| 27737 |
+
{
|
| 27738 |
+
"epoch": 0.025,
|
| 27739 |
+
"grad_norm": 0.8915929794311523,
|
| 27740 |
+
"learning_rate": 9.923090803538021e-06,
|
| 27741 |
+
"loss": 0.718,
|
| 27742 |
+
"step": 355500
|
| 27743 |
+
},
|
| 27744 |
+
{
|
| 27745 |
+
"epoch": 0.0252,
|
| 27746 |
+
"grad_norm": 0.9230467081069946,
|
| 27747 |
+
"learning_rate": 9.91043729298534e-06,
|
| 27748 |
+
"loss": 0.7092,
|
| 27749 |
+
"step": 355600
|
| 27750 |
+
},
|
| 27751 |
+
{
|
| 27752 |
+
"epoch": 0.0254,
|
| 27753 |
+
"grad_norm": 0.9159933924674988,
|
| 27754 |
+
"learning_rate": 9.8977898605067e-06,
|
| 27755 |
+
"loss": 0.7139,
|
| 27756 |
+
"step": 355700
|
| 27757 |
+
},
|
| 27758 |
+
{
|
| 27759 |
+
"epoch": 0.0256,
|
| 27760 |
+
"grad_norm": 1.0485515594482422,
|
| 27761 |
+
"learning_rate": 9.885148511196502e-06,
|
| 27762 |
+
"loss": 0.7071,
|
| 27763 |
+
"step": 355800
|
| 27764 |
+
},
|
| 27765 |
+
{
|
| 27766 |
+
"epoch": 0.0258,
|
| 27767 |
+
"grad_norm": 0.8589327335357666,
|
| 27768 |
+
"learning_rate": 9.872513250146681e-06,
|
| 27769 |
+
"loss": 0.7102,
|
| 27770 |
+
"step": 355900
|
| 27771 |
+
},
|
| 27772 |
+
{
|
| 27773 |
+
"epoch": 0.026,
|
| 27774 |
+
"grad_norm": 0.9215981960296631,
|
| 27775 |
+
"learning_rate": 9.859884082446707e-06,
|
| 27776 |
+
"loss": 0.6789,
|
| 27777 |
+
"step": 356000
|
| 27778 |
+
},
|
| 27779 |
+
{
|
| 27780 |
+
"epoch": 0.026,
|
| 27781 |
+
"eval_loss": 2.081296920776367,
|
| 27782 |
+
"eval_runtime": 52.2111,
|
| 27783 |
+
"eval_samples_per_second": 195.246,
|
| 27784 |
+
"eval_steps_per_second": 1.532,
|
| 27785 |
+
"step": 356000
|
| 27786 |
+
},
|
| 27787 |
+
{
|
| 27788 |
+
"epoch": 0.0262,
|
| 27789 |
+
"grad_norm": 0.8868950605392456,
|
| 27790 |
+
"learning_rate": 9.847261013183615e-06,
|
| 27791 |
+
"loss": 0.6801,
|
| 27792 |
+
"step": 356100
|
| 27793 |
+
},
|
| 27794 |
+
{
|
| 27795 |
+
"epoch": 0.0264,
|
| 27796 |
+
"grad_norm": 0.9825394749641418,
|
| 27797 |
+
"learning_rate": 9.834644047441974e-06,
|
| 27798 |
+
"loss": 0.6582,
|
| 27799 |
+
"step": 356200
|
| 27800 |
+
},
|
| 27801 |
+
{
|
| 27802 |
+
"epoch": 0.0266,
|
| 27803 |
+
"grad_norm": 0.8572143316268921,
|
| 27804 |
+
"learning_rate": 9.822033190303906e-06,
|
| 27805 |
+
"loss": 0.6731,
|
| 27806 |
+
"step": 356300
|
| 27807 |
+
},
|
| 27808 |
+
{
|
| 27809 |
+
"epoch": 0.0268,
|
| 27810 |
+
"grad_norm": 0.8867204785346985,
|
| 27811 |
+
"learning_rate": 9.809428446849044e-06,
|
| 27812 |
+
"loss": 0.6634,
|
| 27813 |
+
"step": 356400
|
| 27814 |
+
},
|
| 27815 |
+
{
|
| 27816 |
+
"epoch": 0.027,
|
| 27817 |
+
"grad_norm": 0.8682609796524048,
|
| 27818 |
+
"learning_rate": 9.796829822154589e-06,
|
| 27819 |
+
"loss": 0.6678,
|
| 27820 |
+
"step": 356500
|
| 27821 |
+
},
|
| 27822 |
+
{
|
| 27823 |
+
"epoch": 0.0272,
|
| 27824 |
+
"grad_norm": 0.8932370543479919,
|
| 27825 |
+
"learning_rate": 9.784237321295262e-06,
|
| 27826 |
+
"loss": 0.6707,
|
| 27827 |
+
"step": 356600
|
| 27828 |
+
},
|
| 27829 |
+
{
|
| 27830 |
+
"epoch": 0.0274,
|
| 27831 |
+
"grad_norm": 0.860748291015625,
|
| 27832 |
+
"learning_rate": 9.771650949343331e-06,
|
| 27833 |
+
"loss": 0.6604,
|
| 27834 |
+
"step": 356700
|
| 27835 |
+
},
|
| 27836 |
+
{
|
| 27837 |
+
"epoch": 0.0276,
|
| 27838 |
+
"grad_norm": 0.8779944181442261,
|
| 27839 |
+
"learning_rate": 9.759070711368568e-06,
|
| 27840 |
+
"loss": 0.6639,
|
| 27841 |
+
"step": 356800
|
| 27842 |
+
},
|
| 27843 |
+
{
|
| 27844 |
+
"epoch": 0.0278,
|
| 27845 |
+
"grad_norm": 0.9277738928794861,
|
| 27846 |
+
"learning_rate": 9.746496612438299e-06,
|
| 27847 |
+
"loss": 0.6617,
|
| 27848 |
+
"step": 356900
|
| 27849 |
+
},
|
| 27850 |
+
{
|
| 27851 |
+
"epoch": 0.028,
|
| 27852 |
+
"grad_norm": 0.8405406475067139,
|
| 27853 |
+
"learning_rate": 9.733928657617373e-06,
|
| 27854 |
+
"loss": 0.6663,
|
| 27855 |
+
"step": 357000
|
| 27856 |
+
},
|
| 27857 |
+
{
|
| 27858 |
+
"epoch": 0.028,
|
| 27859 |
+
"eval_loss": 2.0634403228759766,
|
| 27860 |
+
"eval_runtime": 52.3193,
|
| 27861 |
+
"eval_samples_per_second": 194.842,
|
| 27862 |
+
"eval_steps_per_second": 1.529,
|
| 27863 |
+
"step": 357000
|
| 27864 |
+
},
|
| 27865 |
+
{
|
| 27866 |
+
"epoch": 0.0282,
|
| 27867 |
+
"grad_norm": 0.8827060461044312,
|
| 27868 |
+
"learning_rate": 9.721366851968165e-06,
|
| 27869 |
+
"loss": 0.6748,
|
| 27870 |
+
"step": 357100
|
| 27871 |
+
},
|
| 27872 |
+
{
|
| 27873 |
+
"epoch": 0.0284,
|
| 27874 |
+
"grad_norm": 0.908746063709259,
|
| 27875 |
+
"learning_rate": 9.708811200550552e-06,
|
| 27876 |
+
"loss": 0.6614,
|
| 27877 |
+
"step": 357200
|
| 27878 |
+
},
|
| 27879 |
+
{
|
| 27880 |
+
"epoch": 0.0286,
|
| 27881 |
+
"grad_norm": 0.8800754547119141,
|
| 27882 |
+
"learning_rate": 9.69626170842196e-06,
|
| 27883 |
+
"loss": 0.6661,
|
| 27884 |
+
"step": 357300
|
| 27885 |
+
},
|
| 27886 |
+
{
|
| 27887 |
+
"epoch": 0.0288,
|
| 27888 |
+
"grad_norm": 0.9010385870933533,
|
| 27889 |
+
"learning_rate": 9.68371838063733e-06,
|
| 27890 |
+
"loss": 0.6466,
|
| 27891 |
+
"step": 357400
|
| 27892 |
+
},
|
| 27893 |
+
{
|
| 27894 |
+
"epoch": 0.029,
|
| 27895 |
+
"grad_norm": 0.868073046207428,
|
| 27896 |
+
"learning_rate": 9.671181222249099e-06,
|
| 27897 |
+
"loss": 0.6561,
|
| 27898 |
+
"step": 357500
|
| 27899 |
+
},
|
| 27900 |
+
{
|
| 27901 |
+
"epoch": 0.0292,
|
| 27902 |
+
"grad_norm": 0.982118546962738,
|
| 27903 |
+
"learning_rate": 9.658650238307235e-06,
|
| 27904 |
+
"loss": 0.6696,
|
| 27905 |
+
"step": 357600
|
| 27906 |
+
},
|
| 27907 |
+
{
|
| 27908 |
+
"epoch": 0.0294,
|
| 27909 |
+
"grad_norm": 0.832084059715271,
|
| 27910 |
+
"learning_rate": 9.646125433859221e-06,
|
| 27911 |
+
"loss": 0.6513,
|
| 27912 |
+
"step": 357700
|
| 27913 |
+
},
|
| 27914 |
+
{
|
| 27915 |
+
"epoch": 0.0296,
|
| 27916 |
+
"grad_norm": 0.9348160028457642,
|
| 27917 |
+
"learning_rate": 9.633606813950055e-06,
|
| 27918 |
+
"loss": 0.6558,
|
| 27919 |
+
"step": 357800
|
| 27920 |
+
},
|
| 27921 |
+
{
|
| 27922 |
+
"epoch": 0.0298,
|
| 27923 |
+
"grad_norm": 0.8417104482650757,
|
| 27924 |
+
"learning_rate": 9.621094383622217e-06,
|
| 27925 |
+
"loss": 0.6621,
|
| 27926 |
+
"step": 357900
|
| 27927 |
+
},
|
| 27928 |
+
{
|
| 27929 |
+
"epoch": 0.03,
|
| 27930 |
+
"grad_norm": 0.8583792448043823,
|
| 27931 |
+
"learning_rate": 9.608588147915726e-06,
|
| 27932 |
+
"loss": 0.6572,
|
| 27933 |
+
"step": 358000
|
| 27934 |
+
},
|
| 27935 |
+
{
|
| 27936 |
+
"epoch": 0.03,
|
| 27937 |
+
"eval_loss": 2.086122512817383,
|
| 27938 |
+
"eval_runtime": 52.2197,
|
| 27939 |
+
"eval_samples_per_second": 195.214,
|
| 27940 |
+
"eval_steps_per_second": 1.532,
|
| 27941 |
+
"step": 358000
|
| 27942 |
+
},
|
| 27943 |
+
{
|
| 27944 |
+
"epoch": 0.0002,
|
| 27945 |
+
"grad_norm": 0.8814049959182739,
|
| 27946 |
+
"learning_rate": 9.596088111868085e-06,
|
| 27947 |
+
"loss": 0.653,
|
| 27948 |
+
"step": 358100
|
| 27949 |
+
},
|
| 27950 |
+
{
|
| 27951 |
+
"epoch": 0.0004,
|
| 27952 |
+
"grad_norm": 0.8665258288383484,
|
| 27953 |
+
"learning_rate": 9.583594280514318e-06,
|
| 27954 |
+
"loss": 0.6518,
|
| 27955 |
+
"step": 358200
|
| 27956 |
+
},
|
| 27957 |
+
{
|
| 27958 |
+
"epoch": 0.0006,
|
| 27959 |
+
"grad_norm": 0.9076094627380371,
|
| 27960 |
+
"learning_rate": 9.571106658886925e-06,
|
| 27961 |
+
"loss": 0.6583,
|
| 27962 |
+
"step": 358300
|
| 27963 |
+
},
|
| 27964 |
+
{
|
| 27965 |
+
"epoch": 0.0008,
|
| 27966 |
+
"grad_norm": 0.9470544457435608,
|
| 27967 |
+
"learning_rate": 9.558625252015924e-06,
|
| 27968 |
+
"loss": 0.6539,
|
| 27969 |
+
"step": 358400
|
| 27970 |
+
},
|
| 27971 |
+
{
|
| 27972 |
+
"epoch": 0.001,
|
| 27973 |
+
"grad_norm": 0.9310306310653687,
|
| 27974 |
+
"learning_rate": 9.546150064928824e-06,
|
| 27975 |
+
"loss": 0.661,
|
| 27976 |
+
"step": 358500
|
| 27977 |
+
},
|
| 27978 |
+
{
|
| 27979 |
+
"epoch": 0.0012,
|
| 27980 |
+
"grad_norm": 0.8882910013198853,
|
| 27981 |
+
"learning_rate": 9.53368110265064e-06,
|
| 27982 |
+
"loss": 0.6644,
|
| 27983 |
+
"step": 358600
|
| 27984 |
+
},
|
| 27985 |
+
{
|
| 27986 |
+
"epoch": 0.0014,
|
| 27987 |
+
"grad_norm": 0.912969172000885,
|
| 27988 |
+
"learning_rate": 9.52121837020385e-06,
|
| 27989 |
+
"loss": 0.6477,
|
| 27990 |
+
"step": 358700
|
| 27991 |
+
},
|
| 27992 |
+
{
|
| 27993 |
+
"epoch": 0.0016,
|
| 27994 |
+
"grad_norm": 0.9159826040267944,
|
| 27995 |
+
"learning_rate": 9.50876187260845e-06,
|
| 27996 |
+
"loss": 0.6581,
|
| 27997 |
+
"step": 358800
|
| 27998 |
+
},
|
| 27999 |
+
{
|
| 28000 |
+
"epoch": 0.0018,
|
| 28001 |
+
"grad_norm": 0.8334347605705261,
|
| 28002 |
+
"learning_rate": 9.49631161488192e-06,
|
| 28003 |
+
"loss": 0.6605,
|
| 28004 |
+
"step": 358900
|
| 28005 |
+
},
|
| 28006 |
+
{
|
| 28007 |
+
"epoch": 0.002,
|
| 28008 |
+
"grad_norm": 0.9216808676719666,
|
| 28009 |
+
"learning_rate": 9.483867602039212e-06,
|
| 28010 |
+
"loss": 0.6609,
|
| 28011 |
+
"step": 359000
|
| 28012 |
+
},
|
| 28013 |
+
{
|
| 28014 |
+
"epoch": 0.002,
|
| 28015 |
+
"eval_loss": 2.071388006210327,
|
| 28016 |
+
"eval_runtime": 52.0422,
|
| 28017 |
+
"eval_samples_per_second": 195.879,
|
| 28018 |
+
"eval_steps_per_second": 1.537,
|
| 28019 |
+
"step": 359000
|
| 28020 |
+
},
|
| 28021 |
+
{
|
| 28022 |
+
"epoch": 0.0022,
|
| 28023 |
+
"grad_norm": 0.9010413289070129,
|
| 28024 |
+
"learning_rate": 9.471429839092777e-06,
|
| 28025 |
+
"loss": 0.6428,
|
| 28026 |
+
"step": 359100
|
| 28027 |
+
},
|
| 28028 |
+
{
|
| 28029 |
+
"epoch": 0.0024,
|
| 28030 |
+
"grad_norm": 0.8659740686416626,
|
| 28031 |
+
"learning_rate": 9.458998331052546e-06,
|
| 28032 |
+
"loss": 0.6462,
|
| 28033 |
+
"step": 359200
|
| 28034 |
+
},
|
| 28035 |
+
{
|
| 28036 |
+
"epoch": 0.0026,
|
| 28037 |
+
"grad_norm": 0.9039402604103088,
|
| 28038 |
+
"learning_rate": 9.446573082925938e-06,
|
| 28039 |
+
"loss": 0.6413,
|
| 28040 |
+
"step": 359300
|
| 28041 |
+
},
|
| 28042 |
+
{
|
| 28043 |
+
"epoch": 0.0028,
|
| 28044 |
+
"grad_norm": 0.9015378952026367,
|
| 28045 |
+
"learning_rate": 9.434154099717824e-06,
|
| 28046 |
+
"loss": 0.6521,
|
| 28047 |
+
"step": 359400
|
| 28048 |
+
},
|
| 28049 |
+
{
|
| 28050 |
+
"epoch": 0.003,
|
| 28051 |
+
"grad_norm": 0.8885050415992737,
|
| 28052 |
+
"learning_rate": 9.421741386430575e-06,
|
| 28053 |
+
"loss": 0.647,
|
| 28054 |
+
"step": 359500
|
| 28055 |
+
},
|
| 28056 |
+
{
|
| 28057 |
+
"epoch": 0.0032,
|
| 28058 |
+
"grad_norm": 0.8669450879096985,
|
| 28059 |
+
"learning_rate": 9.409334948064033e-06,
|
| 28060 |
+
"loss": 0.6564,
|
| 28061 |
+
"step": 359600
|
| 28062 |
+
},
|
| 28063 |
+
{
|
| 28064 |
+
"epoch": 0.0034,
|
| 28065 |
+
"grad_norm": 0.9445268511772156,
|
| 28066 |
+
"learning_rate": 9.396934789615519e-06,
|
| 28067 |
+
"loss": 0.6683,
|
| 28068 |
+
"step": 359700
|
| 28069 |
+
},
|
| 28070 |
+
{
|
| 28071 |
+
"epoch": 0.0036,
|
| 28072 |
+
"grad_norm": 0.8911668062210083,
|
| 28073 |
+
"learning_rate": 9.384540916079798e-06,
|
| 28074 |
+
"loss": 0.6713,
|
| 28075 |
+
"step": 359800
|
| 28076 |
+
},
|
| 28077 |
+
{
|
| 28078 |
+
"epoch": 0.0038,
|
| 28079 |
+
"grad_norm": 0.8700185418128967,
|
| 28080 |
+
"learning_rate": 9.372153332449127e-06,
|
| 28081 |
+
"loss": 0.6621,
|
| 28082 |
+
"step": 359900
|
| 28083 |
+
},
|
| 28084 |
+
{
|
| 28085 |
+
"epoch": 0.004,
|
| 28086 |
+
"grad_norm": 0.8949635028839111,
|
| 28087 |
+
"learning_rate": 9.359772043713226e-06,
|
| 28088 |
+
"loss": 0.6468,
|
| 28089 |
+
"step": 360000
|
| 28090 |
+
},
|
| 28091 |
+
{
|
| 28092 |
+
"epoch": 0.004,
|
| 28093 |
+
"eval_loss": 2.0606133937835693,
|
| 28094 |
+
"eval_runtime": 51.5712,
|
| 28095 |
+
"eval_samples_per_second": 197.668,
|
| 28096 |
+
"eval_steps_per_second": 1.551,
|
| 28097 |
+
"step": 360000
|
| 28098 |
+
},
|
| 28099 |
+
{
|
| 28100 |
+
"epoch": 0.0042,
|
| 28101 |
+
"grad_norm": 0.875957190990448,
|
| 28102 |
+
"learning_rate": 9.347397054859283e-06,
|
| 28103 |
+
"loss": 0.6823,
|
| 28104 |
+
"step": 360100
|
| 28105 |
+
},
|
| 28106 |
+
{
|
| 28107 |
+
"epoch": 0.0044,
|
| 28108 |
+
"grad_norm": 0.8829663395881653,
|
| 28109 |
+
"learning_rate": 9.335028370871925e-06,
|
| 28110 |
+
"loss": 0.6758,
|
| 28111 |
+
"step": 360200
|
| 28112 |
+
},
|
| 28113 |
+
{
|
| 28114 |
+
"epoch": 0.0046,
|
| 28115 |
+
"grad_norm": 0.8770716786384583,
|
| 28116 |
+
"learning_rate": 9.322665996733268e-06,
|
| 28117 |
+
"loss": 0.6601,
|
| 28118 |
+
"step": 360300
|
| 28119 |
+
},
|
| 28120 |
+
{
|
| 28121 |
+
"epoch": 0.0048,
|
| 28122 |
+
"grad_norm": 0.9599934220314026,
|
| 28123 |
+
"learning_rate": 9.310309937422873e-06,
|
| 28124 |
+
"loss": 0.666,
|
| 28125 |
+
"step": 360400
|
| 28126 |
+
},
|
| 28127 |
+
{
|
| 28128 |
+
"epoch": 0.005,
|
| 28129 |
+
"grad_norm": 0.8904752135276794,
|
| 28130 |
+
"learning_rate": 9.297960197917766e-06,
|
| 28131 |
+
"loss": 0.662,
|
| 28132 |
+
"step": 360500
|
| 28133 |
+
},
|
| 28134 |
+
{
|
| 28135 |
+
"epoch": 0.0052,
|
| 28136 |
+
"grad_norm": 0.9215303659439087,
|
| 28137 |
+
"learning_rate": 9.285616783192404e-06,
|
| 28138 |
+
"loss": 0.6637,
|
| 28139 |
+
"step": 360600
|
| 28140 |
+
},
|
| 28141 |
+
{
|
| 28142 |
+
"epoch": 0.0054,
|
| 28143 |
+
"grad_norm": 0.9662516117095947,
|
| 28144 |
+
"learning_rate": 9.273279698218726e-06,
|
| 28145 |
+
"loss": 0.6735,
|
| 28146 |
+
"step": 360700
|
| 28147 |
+
},
|
| 28148 |
+
{
|
| 28149 |
+
"epoch": 0.0056,
|
| 28150 |
+
"grad_norm": 0.9039230346679688,
|
| 28151 |
+
"learning_rate": 9.260948947966111e-06,
|
| 28152 |
+
"loss": 0.682,
|
| 28153 |
+
"step": 360800
|
| 28154 |
+
},
|
| 28155 |
+
{
|
| 28156 |
+
"epoch": 0.0058,
|
| 28157 |
+
"grad_norm": 0.914978563785553,
|
| 28158 |
+
"learning_rate": 9.248624537401368e-06,
|
| 28159 |
+
"loss": 0.6691,
|
| 28160 |
+
"step": 360900
|
| 28161 |
+
},
|
| 28162 |
+
{
|
| 28163 |
+
"epoch": 0.006,
|
| 28164 |
+
"grad_norm": 0.8637982606887817,
|
| 28165 |
+
"learning_rate": 9.236306471488779e-06,
|
| 28166 |
+
"loss": 0.6775,
|
| 28167 |
+
"step": 361000
|
| 28168 |
+
},
|
| 28169 |
+
{
|
| 28170 |
+
"epoch": 0.006,
|
| 28171 |
+
"eval_loss": 2.0751538276672363,
|
| 28172 |
+
"eval_runtime": 51.7366,
|
| 28173 |
+
"eval_samples_per_second": 197.037,
|
| 28174 |
+
"eval_steps_per_second": 1.546,
|
| 28175 |
+
"step": 361000
|
| 28176 |
+
},
|
| 28177 |
+
{
|
| 28178 |
+
"epoch": 0.0062,
|
| 28179 |
+
"grad_norm": 0.8795140981674194,
|
| 28180 |
+
"learning_rate": 9.223994755190058e-06,
|
| 28181 |
+
"loss": 0.683,
|
| 28182 |
+
"step": 361100
|
| 28183 |
+
},
|
| 28184 |
+
{
|
| 28185 |
+
"epoch": 0.0064,
|
| 28186 |
+
"grad_norm": 0.9144249558448792,
|
| 28187 |
+
"learning_rate": 9.21168939346437e-06,
|
| 28188 |
+
"loss": 0.7081,
|
| 28189 |
+
"step": 361200
|
| 28190 |
+
},
|
| 28191 |
+
{
|
| 28192 |
+
"epoch": 0.0066,
|
| 28193 |
+
"grad_norm": 0.8885230422019958,
|
| 28194 |
+
"learning_rate": 9.199390391268301e-06,
|
| 28195 |
+
"loss": 0.6968,
|
| 28196 |
+
"step": 361300
|
| 28197 |
+
},
|
| 28198 |
+
{
|
| 28199 |
+
"epoch": 0.0068,
|
| 28200 |
+
"grad_norm": 0.8315828442573547,
|
| 28201 |
+
"learning_rate": 9.18709775355589e-06,
|
| 28202 |
+
"loss": 0.6809,
|
| 28203 |
+
"step": 361400
|
| 28204 |
+
},
|
| 28205 |
+
{
|
| 28206 |
+
"epoch": 0.007,
|
| 28207 |
+
"grad_norm": 0.8375496864318848,
|
| 28208 |
+
"learning_rate": 9.174811485278614e-06,
|
| 28209 |
+
"loss": 0.686,
|
| 28210 |
+
"step": 361500
|
| 28211 |
+
},
|
| 28212 |
+
{
|
| 28213 |
+
"epoch": 0.0072,
|
| 28214 |
+
"grad_norm": 0.9053453207015991,
|
| 28215 |
+
"learning_rate": 9.162531591385387e-06,
|
| 28216 |
+
"loss": 0.6921,
|
| 28217 |
+
"step": 361600
|
| 28218 |
+
},
|
| 28219 |
+
{
|
| 28220 |
+
"epoch": 0.0074,
|
| 28221 |
+
"grad_norm": 0.8914540410041809,
|
| 28222 |
+
"learning_rate": 9.150258076822535e-06,
|
| 28223 |
+
"loss": 0.6832,
|
| 28224 |
+
"step": 361700
|
| 28225 |
+
},
|
| 28226 |
+
{
|
| 28227 |
+
"epoch": 0.0076,
|
| 28228 |
+
"grad_norm": 0.8982157707214355,
|
| 28229 |
+
"learning_rate": 9.13799094653383e-06,
|
| 28230 |
+
"loss": 0.6969,
|
| 28231 |
+
"step": 361800
|
| 28232 |
+
},
|
| 28233 |
+
{
|
| 28234 |
+
"epoch": 0.0078,
|
| 28235 |
+
"grad_norm": 1.0123343467712402,
|
| 28236 |
+
"learning_rate": 9.125730205460478e-06,
|
| 28237 |
+
"loss": 0.6915,
|
| 28238 |
+
"step": 361900
|
| 28239 |
+
},
|
| 28240 |
+
{
|
| 28241 |
+
"epoch": 0.008,
|
| 28242 |
+
"grad_norm": 0.904523491859436,
|
| 28243 |
+
"learning_rate": 9.113475858541118e-06,
|
| 28244 |
+
"loss": 0.6884,
|
| 28245 |
+
"step": 362000
|
| 28246 |
+
},
|
| 28247 |
+
{
|
| 28248 |
+
"epoch": 0.008,
|
| 28249 |
+
"eval_loss": 2.0824785232543945,
|
| 28250 |
+
"eval_runtime": 51.6588,
|
| 28251 |
+
"eval_samples_per_second": 197.333,
|
| 28252 |
+
"eval_steps_per_second": 1.549,
|
| 28253 |
+
"step": 362000
|
| 28254 |
+
},
|
| 28255 |
+
{
|
| 28256 |
+
"epoch": 0.0082,
|
| 28257 |
+
"grad_norm": 0.8671389818191528,
|
| 28258 |
+
"learning_rate": 9.101227910711765e-06,
|
| 28259 |
+
"loss": 0.706,
|
| 28260 |
+
"step": 362100
|
| 28261 |
+
},
|
| 28262 |
+
{
|
| 28263 |
+
"epoch": 0.0084,
|
| 28264 |
+
"grad_norm": 0.8754188418388367,
|
| 28265 |
+
"learning_rate": 9.088986366905908e-06,
|
| 28266 |
+
"loss": 0.6918,
|
| 28267 |
+
"step": 362200
|
| 28268 |
+
},
|
| 28269 |
+
{
|
| 28270 |
+
"epoch": 0.0086,
|
| 28271 |
+
"grad_norm": 0.8821722865104675,
|
| 28272 |
+
"learning_rate": 9.076751232054439e-06,
|
| 28273 |
+
"loss": 0.6902,
|
| 28274 |
+
"step": 362300
|
| 28275 |
+
},
|
| 28276 |
+
{
|
| 28277 |
+
"epoch": 0.0088,
|
| 28278 |
+
"grad_norm": 0.8519936800003052,
|
| 28279 |
+
"learning_rate": 9.064522511085677e-06,
|
| 28280 |
+
"loss": 0.6897,
|
| 28281 |
+
"step": 362400
|
| 28282 |
+
},
|
| 28283 |
+
{
|
| 28284 |
+
"epoch": 0.009,
|
| 28285 |
+
"grad_norm": 0.9249884486198425,
|
| 28286 |
+
"learning_rate": 9.052300208925335e-06,
|
| 28287 |
+
"loss": 0.6762,
|
| 28288 |
+
"step": 362500
|
| 28289 |
+
},
|
| 28290 |
+
{
|
| 28291 |
+
"epoch": 0.0092,
|
| 28292 |
+
"grad_norm": 0.9254834651947021,
|
| 28293 |
+
"learning_rate": 9.040084330496562e-06,
|
| 28294 |
+
"loss": 0.6836,
|
| 28295 |
+
"step": 362600
|
| 28296 |
+
},
|
| 28297 |
+
{
|
| 28298 |
+
"epoch": 0.0094,
|
| 28299 |
+
"grad_norm": 0.907455325126648,
|
| 28300 |
+
"learning_rate": 9.027874880719911e-06,
|
| 28301 |
+
"loss": 0.6816,
|
| 28302 |
+
"step": 362700
|
| 28303 |
+
},
|
| 28304 |
+
{
|
| 28305 |
+
"epoch": 0.0096,
|
| 28306 |
+
"grad_norm": 0.8891639709472656,
|
| 28307 |
+
"learning_rate": 9.015671864513356e-06,
|
| 28308 |
+
"loss": 0.6493,
|
| 28309 |
+
"step": 362800
|
| 28310 |
+
},
|
| 28311 |
+
{
|
| 28312 |
+
"epoch": 0.0098,
|
| 28313 |
+
"grad_norm": 0.9093591570854187,
|
| 28314 |
+
"learning_rate": 9.003475286792257e-06,
|
| 28315 |
+
"loss": 0.659,
|
| 28316 |
+
"step": 362900
|
| 28317 |
+
},
|
| 28318 |
+
{
|
| 28319 |
+
"epoch": 0.01,
|
| 28320 |
+
"grad_norm": 0.8426594138145447,
|
| 28321 |
+
"learning_rate": 8.991285152469395e-06,
|
| 28322 |
+
"loss": 0.6498,
|
| 28323 |
+
"step": 363000
|
| 28324 |
+
},
|
| 28325 |
+
{
|
| 28326 |
+
"epoch": 0.01,
|
| 28327 |
+
"eval_loss": 2.0885329246520996,
|
| 28328 |
+
"eval_runtime": 51.6994,
|
| 28329 |
+
"eval_samples_per_second": 197.178,
|
| 28330 |
+
"eval_steps_per_second": 1.547,
|
| 28331 |
+
"step": 363000
|
| 28332 |
+
},
|
| 28333 |
+
{
|
| 28334 |
+
"epoch": 0.0102,
|
| 28335 |
+
"grad_norm": 0.9149935245513916,
|
| 28336 |
+
"learning_rate": 8.979101466454962e-06,
|
| 28337 |
+
"loss": 0.6595,
|
| 28338 |
+
"step": 363100
|
| 28339 |
+
},
|
| 28340 |
+
{
|
| 28341 |
+
"epoch": 0.0104,
|
| 28342 |
+
"grad_norm": 0.893366277217865,
|
| 28343 |
+
"learning_rate": 8.966924233656552e-06,
|
| 28344 |
+
"loss": 0.6622,
|
| 28345 |
+
"step": 363200
|
| 28346 |
+
},
|
| 28347 |
+
{
|
| 28348 |
+
"epoch": 0.0106,
|
| 28349 |
+
"grad_norm": 0.8946834206581116,
|
| 28350 |
+
"learning_rate": 8.954753458979132e-06,
|
| 28351 |
+
"loss": 0.6639,
|
| 28352 |
+
"step": 363300
|
| 28353 |
+
},
|
| 28354 |
+
{
|
| 28355 |
+
"epoch": 0.0108,
|
| 28356 |
+
"grad_norm": 0.8848134279251099,
|
| 28357 |
+
"learning_rate": 8.9425891473251e-06,
|
| 28358 |
+
"loss": 0.6623,
|
| 28359 |
+
"step": 363400
|
| 28360 |
+
},
|
| 28361 |
+
{
|
| 28362 |
+
"epoch": 0.011,
|
| 28363 |
+
"grad_norm": 0.8674115538597107,
|
| 28364 |
+
"learning_rate": 8.93043130359425e-06,
|
| 28365 |
+
"loss": 0.6483,
|
| 28366 |
+
"step": 363500
|
| 28367 |
+
},
|
| 28368 |
+
{
|
| 28369 |
+
"epoch": 0.0112,
|
| 28370 |
+
"grad_norm": 0.8136773109436035,
|
| 28371 |
+
"learning_rate": 8.91827993268374e-06,
|
| 28372 |
+
"loss": 0.6598,
|
| 28373 |
+
"step": 363600
|
| 28374 |
+
},
|
| 28375 |
+
{
|
| 28376 |
+
"epoch": 0.0114,
|
| 28377 |
+
"grad_norm": 0.9210416674613953,
|
| 28378 |
+
"learning_rate": 8.906135039488148e-06,
|
| 28379 |
+
"loss": 0.6427,
|
| 28380 |
+
"step": 363700
|
| 28381 |
+
},
|
| 28382 |
+
{
|
| 28383 |
+
"epoch": 0.0116,
|
| 28384 |
+
"grad_norm": 0.8708541393280029,
|
| 28385 |
+
"learning_rate": 8.89399662889944e-06,
|
| 28386 |
+
"loss": 0.6523,
|
| 28387 |
+
"step": 363800
|
| 28388 |
+
},
|
| 28389 |
+
{
|
| 28390 |
+
"epoch": 0.0118,
|
| 28391 |
+
"grad_norm": 0.8490440845489502,
|
| 28392 |
+
"learning_rate": 8.881864705806971e-06,
|
| 28393 |
+
"loss": 0.6571,
|
| 28394 |
+
"step": 363900
|
| 28395 |
+
},
|
| 28396 |
+
{
|
| 28397 |
+
"epoch": 0.012,
|
| 28398 |
+
"grad_norm": 0.8714786767959595,
|
| 28399 |
+
"learning_rate": 8.869739275097464e-06,
|
| 28400 |
+
"loss": 0.6535,
|
| 28401 |
+
"step": 364000
|
| 28402 |
+
},
|
| 28403 |
+
{
|
| 28404 |
+
"epoch": 0.012,
|
| 28405 |
+
"eval_loss": 2.0917515754699707,
|
| 28406 |
+
"eval_runtime": 51.7459,
|
| 28407 |
+
"eval_samples_per_second": 197.001,
|
| 28408 |
+
"eval_steps_per_second": 1.546,
|
| 28409 |
+
"step": 364000
|
| 28410 |
+
},
|
| 28411 |
+
{
|
| 28412 |
+
"epoch": 0.0122,
|
| 28413 |
+
"grad_norm": 0.8995687961578369,
|
| 28414 |
+
"learning_rate": 8.857620341655045e-06,
|
| 28415 |
+
"loss": 0.6561,
|
| 28416 |
+
"step": 364100
|
| 28417 |
+
},
|
| 28418 |
+
{
|
| 28419 |
+
"epoch": 0.0124,
|
| 28420 |
+
"grad_norm": 0.9087790846824646,
|
| 28421 |
+
"learning_rate": 8.845507910361223e-06,
|
| 28422 |
+
"loss": 0.6506,
|
| 28423 |
+
"step": 364200
|
| 28424 |
+
},
|
| 28425 |
+
{
|
| 28426 |
+
"epoch": 0.0126,
|
| 28427 |
+
"grad_norm": 0.9006063342094421,
|
| 28428 |
+
"learning_rate": 8.833401986094893e-06,
|
| 28429 |
+
"loss": 0.6628,
|
| 28430 |
+
"step": 364300
|
| 28431 |
+
},
|
| 28432 |
+
{
|
| 28433 |
+
"epoch": 0.0128,
|
| 28434 |
+
"grad_norm": 0.9575886726379395,
|
| 28435 |
+
"learning_rate": 8.821302573732302e-06,
|
| 28436 |
+
"loss": 0.6563,
|
| 28437 |
+
"step": 364400
|
| 28438 |
+
},
|
| 28439 |
+
{
|
| 28440 |
+
"epoch": 0.013,
|
| 28441 |
+
"grad_norm": 0.8845739960670471,
|
| 28442 |
+
"learning_rate": 8.809209678147095e-06,
|
| 28443 |
+
"loss": 0.649,
|
| 28444 |
+
"step": 364500
|
| 28445 |
+
},
|
| 28446 |
+
{
|
| 28447 |
+
"epoch": 0.0132,
|
| 28448 |
+
"grad_norm": 0.8682934641838074,
|
| 28449 |
+
"learning_rate": 8.797123304210298e-06,
|
| 28450 |
+
"loss": 0.6513,
|
| 28451 |
+
"step": 364600
|
| 28452 |
+
},
|
| 28453 |
+
{
|
| 28454 |
+
"epoch": 0.0134,
|
| 28455 |
+
"grad_norm": 0.8966580033302307,
|
| 28456 |
+
"learning_rate": 8.785043456790302e-06,
|
| 28457 |
+
"loss": 0.6443,
|
| 28458 |
+
"step": 364700
|
| 28459 |
+
},
|
| 28460 |
+
{
|
| 28461 |
+
"epoch": 0.0136,
|
| 28462 |
+
"grad_norm": 0.8867930769920349,
|
| 28463 |
+
"learning_rate": 8.772970140752854e-06,
|
| 28464 |
+
"loss": 0.6473,
|
| 28465 |
+
"step": 364800
|
| 28466 |
+
},
|
| 28467 |
+
{
|
| 28468 |
+
"epoch": 0.0138,
|
| 28469 |
+
"grad_norm": 0.8712829351425171,
|
| 28470 |
+
"learning_rate": 8.760903360961096e-06,
|
| 28471 |
+
"loss": 0.6428,
|
| 28472 |
+
"step": 364900
|
| 28473 |
+
},
|
| 28474 |
+
{
|
| 28475 |
+
"epoch": 0.014,
|
| 28476 |
+
"grad_norm": 0.8830559253692627,
|
| 28477 |
+
"learning_rate": 8.748843122275519e-06,
|
| 28478 |
+
"loss": 0.657,
|
| 28479 |
+
"step": 365000
|
| 28480 |
+
},
|
| 28481 |
+
{
|
| 28482 |
+
"epoch": 0.014,
|
| 28483 |
+
"eval_loss": 2.077829122543335,
|
| 28484 |
+
"eval_runtime": 51.6249,
|
| 28485 |
+
"eval_samples_per_second": 197.463,
|
| 28486 |
+
"eval_steps_per_second": 1.55,
|
| 28487 |
+
"step": 365000
|
| 28488 |
+
},
|
| 28489 |
+
{
|
| 28490 |
+
"epoch": 0.0142,
|
| 28491 |
+
"grad_norm": 0.9168245792388916,
|
| 28492 |
+
"learning_rate": 8.736789429553998e-06,
|
| 28493 |
+
"loss": 0.6542,
|
| 28494 |
+
"step": 365100
|
| 28495 |
+
},
|
| 28496 |
+
{
|
| 28497 |
+
"epoch": 0.0144,
|
| 28498 |
+
"grad_norm": 0.9041379690170288,
|
| 28499 |
+
"learning_rate": 8.724742287651741e-06,
|
| 28500 |
+
"loss": 0.6422,
|
| 28501 |
+
"step": 365200
|
| 28502 |
+
},
|
| 28503 |
+
{
|
| 28504 |
+
"epoch": 0.0146,
|
| 28505 |
+
"grad_norm": 0.8760838508605957,
|
| 28506 |
+
"learning_rate": 8.712701701421344e-06,
|
| 28507 |
+
"loss": 0.6532,
|
| 28508 |
+
"step": 365300
|
| 28509 |
+
},
|
| 28510 |
+
{
|
| 28511 |
+
"epoch": 0.0148,
|
| 28512 |
+
"grad_norm": 0.8739610910415649,
|
| 28513 |
+
"learning_rate": 8.700667675712764e-06,
|
| 28514 |
+
"loss": 0.6485,
|
| 28515 |
+
"step": 365400
|
| 28516 |
+
},
|
| 28517 |
+
{
|
| 28518 |
+
"epoch": 0.015,
|
| 28519 |
+
"grad_norm": 0.9175285696983337,
|
| 28520 |
+
"learning_rate": 8.688640215373287e-06,
|
| 28521 |
+
"loss": 0.6433,
|
| 28522 |
+
"step": 365500
|
| 28523 |
+
},
|
| 28524 |
+
{
|
| 28525 |
+
"epoch": 0.0152,
|
| 28526 |
+
"grad_norm": 0.8679957985877991,
|
| 28527 |
+
"learning_rate": 8.676619325247578e-06,
|
| 28528 |
+
"loss": 0.627,
|
| 28529 |
+
"step": 365600
|
| 28530 |
+
},
|
| 28531 |
+
{
|
| 28532 |
+
"epoch": 0.0154,
|
| 28533 |
+
"grad_norm": 0.9219822287559509,
|
| 28534 |
+
"learning_rate": 8.664605010177653e-06,
|
| 28535 |
+
"loss": 0.6342,
|
| 28536 |
+
"step": 365700
|
| 28537 |
+
},
|
| 28538 |
+
{
|
| 28539 |
+
"epoch": 0.0156,
|
| 28540 |
+
"grad_norm": 0.8707392811775208,
|
| 28541 |
+
"learning_rate": 8.652597275002888e-06,
|
| 28542 |
+
"loss": 0.6441,
|
| 28543 |
+
"step": 365800
|
| 28544 |
+
},
|
| 28545 |
+
{
|
| 28546 |
+
"epoch": 0.0158,
|
| 28547 |
+
"grad_norm": 0.8975892663002014,
|
| 28548 |
+
"learning_rate": 8.640596124559975e-06,
|
| 28549 |
+
"loss": 0.6119,
|
| 28550 |
+
"step": 365900
|
| 28551 |
+
},
|
| 28552 |
+
{
|
| 28553 |
+
"epoch": 0.016,
|
| 28554 |
+
"grad_norm": 0.8921619057655334,
|
| 28555 |
+
"learning_rate": 8.628601563682986e-06,
|
| 28556 |
+
"loss": 0.6493,
|
| 28557 |
+
"step": 366000
|
| 28558 |
+
},
|
| 28559 |
+
{
|
| 28560 |
+
"epoch": 0.016,
|
| 28561 |
+
"eval_loss": 2.0901429653167725,
|
| 28562 |
+
"eval_runtime": 51.9763,
|
| 28563 |
+
"eval_samples_per_second": 196.128,
|
| 28564 |
+
"eval_steps_per_second": 1.539,
|
| 28565 |
+
"step": 366000
|
| 28566 |
+
},
|
| 28567 |
+
{
|
| 28568 |
+
"epoch": 0.0162,
|
| 28569 |
+
"grad_norm": 0.9101726412773132,
|
| 28570 |
+
"learning_rate": 8.616613597203333e-06,
|
| 28571 |
+
"loss": 0.6456,
|
| 28572 |
+
"step": 366100
|
| 28573 |
+
},
|
| 28574 |
+
{
|
| 28575 |
+
"epoch": 0.0164,
|
| 28576 |
+
"grad_norm": 0.9642266035079956,
|
| 28577 |
+
"learning_rate": 8.604632229949768e-06,
|
| 28578 |
+
"loss": 0.6411,
|
| 28579 |
+
"step": 366200
|
| 28580 |
+
},
|
| 28581 |
+
{
|
| 28582 |
+
"epoch": 0.0166,
|
| 28583 |
+
"grad_norm": 0.8600582480430603,
|
| 28584 |
+
"learning_rate": 8.592657466748372e-06,
|
| 28585 |
+
"loss": 0.635,
|
| 28586 |
+
"step": 366300
|
| 28587 |
+
},
|
| 28588 |
+
{
|
| 28589 |
+
"epoch": 0.0168,
|
| 28590 |
+
"grad_norm": 0.9204874038696289,
|
| 28591 |
+
"learning_rate": 8.580689312422587e-06,
|
| 28592 |
+
"loss": 0.6456,
|
| 28593 |
+
"step": 366400
|
| 28594 |
+
},
|
| 28595 |
+
{
|
| 28596 |
+
"epoch": 0.017,
|
| 28597 |
+
"grad_norm": 0.857318103313446,
|
| 28598 |
+
"learning_rate": 8.568727771793186e-06,
|
| 28599 |
+
"loss": 0.6385,
|
| 28600 |
+
"step": 366500
|
| 28601 |
+
},
|
| 28602 |
+
{
|
| 28603 |
+
"epoch": 0.0172,
|
| 28604 |
+
"grad_norm": 0.9361177682876587,
|
| 28605 |
+
"learning_rate": 8.55677284967828e-06,
|
| 28606 |
+
"loss": 0.6299,
|
| 28607 |
+
"step": 366600
|
| 28608 |
+
},
|
| 28609 |
+
{
|
| 28610 |
+
"epoch": 1.000196,
|
| 28611 |
+
"grad_norm": 0.9187692999839783,
|
| 28612 |
+
"learning_rate": 8.544824550893294e-06,
|
| 28613 |
+
"loss": 0.6425,
|
| 28614 |
+
"step": 366700
|
| 28615 |
+
},
|
| 28616 |
+
{
|
| 28617 |
+
"epoch": 1.000396,
|
| 28618 |
+
"grad_norm": 0.8672967553138733,
|
| 28619 |
+
"learning_rate": 8.532882880251011e-06,
|
| 28620 |
+
"loss": 0.6341,
|
| 28621 |
+
"step": 366800
|
| 28622 |
+
},
|
| 28623 |
+
{
|
| 28624 |
+
"epoch": 1.000596,
|
| 28625 |
+
"grad_norm": 0.888131320476532,
|
| 28626 |
+
"learning_rate": 8.520947842561544e-06,
|
| 28627 |
+
"loss": 0.6451,
|
| 28628 |
+
"step": 366900
|
| 28629 |
+
},
|
| 28630 |
+
{
|
| 28631 |
+
"epoch": 1.000796,
|
| 28632 |
+
"grad_norm": 0.8518761992454529,
|
| 28633 |
+
"learning_rate": 8.509019442632308e-06,
|
| 28634 |
+
"loss": 0.637,
|
| 28635 |
+
"step": 367000
|
| 28636 |
+
},
|
| 28637 |
+
{
|
| 28638 |
+
"epoch": 1.000796,
|
| 28639 |
+
"eval_loss": 2.082726240158081,
|
| 28640 |
+
"eval_runtime": 51.6098,
|
| 28641 |
+
"eval_samples_per_second": 197.521,
|
| 28642 |
+
"eval_steps_per_second": 1.55,
|
| 28643 |
+
"step": 367000
|
| 28644 |
+
},
|
| 28645 |
+
{
|
| 28646 |
+
"epoch": 1.000996,
|
| 28647 |
+
"grad_norm": 0.9279243350028992,
|
| 28648 |
+
"learning_rate": 8.497097685268068e-06,
|
| 28649 |
+
"loss": 0.6471,
|
| 28650 |
+
"step": 367100
|
| 28651 |
+
},
|
| 28652 |
+
{
|
| 28653 |
+
"epoch": 1.001196,
|
| 28654 |
+
"grad_norm": 0.9042778611183167,
|
| 28655 |
+
"learning_rate": 8.485182575270905e-06,
|
| 28656 |
+
"loss": 0.6494,
|
| 28657 |
+
"step": 367200
|
| 28658 |
+
},
|
| 28659 |
+
{
|
| 28660 |
+
"epoch": 1.001396,
|
| 28661 |
+
"grad_norm": 0.9116953611373901,
|
| 28662 |
+
"learning_rate": 8.473274117440235e-06,
|
| 28663 |
+
"loss": 0.6333,
|
| 28664 |
+
"step": 367300
|
| 28665 |
+
},
|
| 28666 |
+
{
|
| 28667 |
+
"epoch": 1.001596,
|
| 28668 |
+
"grad_norm": 0.9247483611106873,
|
| 28669 |
+
"learning_rate": 8.461372316572765e-06,
|
| 28670 |
+
"loss": 0.6432,
|
| 28671 |
+
"step": 367400
|
| 28672 |
+
},
|
| 28673 |
+
{
|
| 28674 |
+
"epoch": 1.001796,
|
| 28675 |
+
"grad_norm": 0.8390426635742188,
|
| 28676 |
+
"learning_rate": 8.44947717746255e-06,
|
| 28677 |
+
"loss": 0.6492,
|
| 28678 |
+
"step": 367500
|
| 28679 |
+
},
|
| 28680 |
+
{
|
| 28681 |
+
"epoch": 1.001996,
|
| 28682 |
+
"grad_norm": 0.8003919720649719,
|
| 28683 |
+
"learning_rate": 8.437588704900948e-06,
|
| 28684 |
+
"loss": 0.6472,
|
| 28685 |
+
"step": 367600
|
| 28686 |
+
},
|
| 28687 |
+
{
|
| 28688 |
+
"epoch": 1.002196,
|
| 28689 |
+
"grad_norm": 0.8807201981544495,
|
| 28690 |
+
"learning_rate": 8.425706903676645e-06,
|
| 28691 |
+
"loss": 0.6338,
|
| 28692 |
+
"step": 367700
|
| 28693 |
+
},
|
| 28694 |
+
{
|
| 28695 |
+
"epoch": 1.002396,
|
| 28696 |
+
"grad_norm": 0.8409605622291565,
|
| 28697 |
+
"learning_rate": 8.41383177857561e-06,
|
| 28698 |
+
"loss": 0.6371,
|
| 28699 |
+
"step": 367800
|
| 28700 |
+
},
|
| 28701 |
+
{
|
| 28702 |
+
"epoch": 1.002596,
|
| 28703 |
+
"grad_norm": 0.8772279024124146,
|
| 28704 |
+
"learning_rate": 8.401963334381149e-06,
|
| 28705 |
+
"loss": 0.6305,
|
| 28706 |
+
"step": 367900
|
| 28707 |
+
},
|
| 28708 |
+
{
|
| 28709 |
+
"epoch": 1.002796,
|
| 28710 |
+
"grad_norm": 0.921270489692688,
|
| 28711 |
+
"learning_rate": 8.390101575873871e-06,
|
| 28712 |
+
"loss": 0.6414,
|
| 28713 |
+
"step": 368000
|
| 28714 |
+
},
|
| 28715 |
+
{
|
| 28716 |
+
"epoch": 1.002796,
|
| 28717 |
+
"eval_loss": 2.0858559608459473,
|
| 28718 |
+
"eval_runtime": 51.7813,
|
| 28719 |
+
"eval_samples_per_second": 196.867,
|
| 28720 |
+
"eval_steps_per_second": 1.545,
|
| 28721 |
+
"step": 368000
|
| 28722 |
}
|
| 28723 |
],
|
| 28724 |
"logging_steps": 100,
|
|
|
|
| 28738 |
"attributes": {}
|
| 28739 |
}
|
| 28740 |
},
|
| 28741 |
+
"total_flos": 3.211620496844764e+19,
|
| 28742 |
"train_batch_size": 128,
|
| 28743 |
"trial_name": null,
|
| 28744 |
"trial_params": null
|
training_args.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 5777
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:04f252a64f6373afbaec36fc31e345451d91b06580ee09a9823282cc3866516c
|
| 3 |
size 5777
|