Upload 10 files
Browse files- model.safetensors +1 -1
- optimizer.pt +1 -1
- rng_state.pth +1 -1
- scheduler.pt +1 -1
- trainer_state.json +2343 -3
- training_args.bin +1 -1
model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 598635032
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:724be5ec56c8cea0a6bccb0fb0bcec03b849814458eb8b51ff9f3d953d0ed14c
|
| 3 |
size 598635032
|
optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1197359627
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:aae36e7eb1c7e8d3c5cc3aa77fc98b6aae23dbfbb8ba5dbcfe46c7087de864d3
|
| 3 |
size 1197359627
|
rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14645
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d808ac48aeb2285a7d15fe96957631f4317dc7cd8cbbaa8b381b1638da837ef8
|
| 3 |
size 14645
|
scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1465
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2ef58d5b955824dfbbc6cf55d8b7019f163372cbafcda9d38b4c7e503714eff0
|
| 3 |
size 1465
|
trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 1000,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -24429,6 +24429,2346 @@
|
|
| 24429 |
"eval_samples_per_second": 195.892,
|
| 24430 |
"eval_steps_per_second": 1.537,
|
| 24431 |
"step": 313000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24432 |
}
|
| 24433 |
],
|
| 24434 |
"logging_steps": 100,
|
|
@@ -24448,7 +26788,7 @@
|
|
| 24448 |
"attributes": {}
|
| 24449 |
}
|
| 24450 |
},
|
| 24451 |
-
"total_flos": 2.
|
| 24452 |
"train_batch_size": 128,
|
| 24453 |
"trial_name": null,
|
| 24454 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.034,
|
| 6 |
"eval_steps": 1000,
|
| 7 |
+
"global_step": 343000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 24429 |
"eval_samples_per_second": 195.892,
|
| 24430 |
"eval_steps_per_second": 1.537,
|
| 24431 |
"step": 313000
|
| 24432 |
+
},
|
| 24433 |
+
{
|
| 24434 |
+
"epoch": 0.0202,
|
| 24435 |
+
"grad_norm": 0.8182185888290405,
|
| 24436 |
+
"learning_rate": 1.5767536216792224e-05,
|
| 24437 |
+
"loss": 1.1693,
|
| 24438 |
+
"step": 313100
|
| 24439 |
+
},
|
| 24440 |
+
{
|
| 24441 |
+
"epoch": 0.0204,
|
| 24442 |
+
"grad_norm": 0.8927570581436157,
|
| 24443 |
+
"learning_rate": 1.575279304684168e-05,
|
| 24444 |
+
"loss": 1.1373,
|
| 24445 |
+
"step": 313200
|
| 24446 |
+
},
|
| 24447 |
+
{
|
| 24448 |
+
"epoch": 0.0206,
|
| 24449 |
+
"grad_norm": 0.881147027015686,
|
| 24450 |
+
"learning_rate": 1.573805360166499e-05,
|
| 24451 |
+
"loss": 1.1504,
|
| 24452 |
+
"step": 313300
|
| 24453 |
+
},
|
| 24454 |
+
{
|
| 24455 |
+
"epoch": 0.0208,
|
| 24456 |
+
"grad_norm": 0.8833571672439575,
|
| 24457 |
+
"learning_rate": 1.572331788719921e-05,
|
| 24458 |
+
"loss": 1.1405,
|
| 24459 |
+
"step": 313400
|
| 24460 |
+
},
|
| 24461 |
+
{
|
| 24462 |
+
"epoch": 0.021,
|
| 24463 |
+
"grad_norm": 0.8598502278327942,
|
| 24464 |
+
"learning_rate": 1.5708585909379864e-05,
|
| 24465 |
+
"loss": 1.1365,
|
| 24466 |
+
"step": 313500
|
| 24467 |
+
},
|
| 24468 |
+
{
|
| 24469 |
+
"epoch": 0.0212,
|
| 24470 |
+
"grad_norm": 0.8392401337623596,
|
| 24471 |
+
"learning_rate": 1.5693857674141012e-05,
|
| 24472 |
+
"loss": 1.1331,
|
| 24473 |
+
"step": 313600
|
| 24474 |
+
},
|
| 24475 |
+
{
|
| 24476 |
+
"epoch": 0.0214,
|
| 24477 |
+
"grad_norm": 0.8870404958724976,
|
| 24478 |
+
"learning_rate": 1.5679133187415168e-05,
|
| 24479 |
+
"loss": 1.115,
|
| 24480 |
+
"step": 313700
|
| 24481 |
+
},
|
| 24482 |
+
{
|
| 24483 |
+
"epoch": 0.0216,
|
| 24484 |
+
"grad_norm": 0.9391428232192993,
|
| 24485 |
+
"learning_rate": 1.566441245513337e-05,
|
| 24486 |
+
"loss": 1.1178,
|
| 24487 |
+
"step": 313800
|
| 24488 |
+
},
|
| 24489 |
+
{
|
| 24490 |
+
"epoch": 0.0218,
|
| 24491 |
+
"grad_norm": 0.8332740664482117,
|
| 24492 |
+
"learning_rate": 1.5649695483225107e-05,
|
| 24493 |
+
"loss": 1.1335,
|
| 24494 |
+
"step": 313900
|
| 24495 |
+
},
|
| 24496 |
+
{
|
| 24497 |
+
"epoch": 0.022,
|
| 24498 |
+
"grad_norm": 0.8561016917228699,
|
| 24499 |
+
"learning_rate": 1.5634982277618392e-05,
|
| 24500 |
+
"loss": 1.126,
|
| 24501 |
+
"step": 314000
|
| 24502 |
+
},
|
| 24503 |
+
{
|
| 24504 |
+
"epoch": 0.022,
|
| 24505 |
+
"eval_loss": 2.42391037940979,
|
| 24506 |
+
"eval_runtime": 51.9237,
|
| 24507 |
+
"eval_samples_per_second": 196.327,
|
| 24508 |
+
"eval_steps_per_second": 1.541,
|
| 24509 |
+
"step": 314000
|
| 24510 |
+
},
|
| 24511 |
+
{
|
| 24512 |
+
"epoch": 0.0222,
|
| 24513 |
+
"grad_norm": 0.8221678137779236,
|
| 24514 |
+
"learning_rate": 1.5620272844239697e-05,
|
| 24515 |
+
"loss": 1.1344,
|
| 24516 |
+
"step": 314100
|
| 24517 |
+
},
|
| 24518 |
+
{
|
| 24519 |
+
"epoch": 0.0224,
|
| 24520 |
+
"grad_norm": 0.865084707736969,
|
| 24521 |
+
"learning_rate": 1.5605567189013977e-05,
|
| 24522 |
+
"loss": 1.1195,
|
| 24523 |
+
"step": 314200
|
| 24524 |
+
},
|
| 24525 |
+
{
|
| 24526 |
+
"epoch": 0.0226,
|
| 24527 |
+
"grad_norm": 0.8354145288467407,
|
| 24528 |
+
"learning_rate": 1.5590865317864666e-05,
|
| 24529 |
+
"loss": 1.1236,
|
| 24530 |
+
"step": 314300
|
| 24531 |
+
},
|
| 24532 |
+
{
|
| 24533 |
+
"epoch": 0.0228,
|
| 24534 |
+
"grad_norm": 0.8688293099403381,
|
| 24535 |
+
"learning_rate": 1.557616723671369e-05,
|
| 24536 |
+
"loss": 1.1169,
|
| 24537 |
+
"step": 314400
|
| 24538 |
+
},
|
| 24539 |
+
{
|
| 24540 |
+
"epoch": 0.023,
|
| 24541 |
+
"grad_norm": 0.8651818037033081,
|
| 24542 |
+
"learning_rate": 1.5561472951481414e-05,
|
| 24543 |
+
"loss": 1.1099,
|
| 24544 |
+
"step": 314500
|
| 24545 |
+
},
|
| 24546 |
+
{
|
| 24547 |
+
"epoch": 0.0232,
|
| 24548 |
+
"grad_norm": 0.8726403713226318,
|
| 24549 |
+
"learning_rate": 1.5546782468086706e-05,
|
| 24550 |
+
"loss": 1.1284,
|
| 24551 |
+
"step": 314600
|
| 24552 |
+
},
|
| 24553 |
+
{
|
| 24554 |
+
"epoch": 0.0234,
|
| 24555 |
+
"grad_norm": 0.8787026405334473,
|
| 24556 |
+
"learning_rate": 1.5532095792446894e-05,
|
| 24557 |
+
"loss": 1.1046,
|
| 24558 |
+
"step": 314700
|
| 24559 |
+
},
|
| 24560 |
+
{
|
| 24561 |
+
"epoch": 0.0236,
|
| 24562 |
+
"grad_norm": 0.8764083981513977,
|
| 24563 |
+
"learning_rate": 1.5517412930477762e-05,
|
| 24564 |
+
"loss": 1.0929,
|
| 24565 |
+
"step": 314800
|
| 24566 |
+
},
|
| 24567 |
+
{
|
| 24568 |
+
"epoch": 0.0238,
|
| 24569 |
+
"grad_norm": 0.8777551651000977,
|
| 24570 |
+
"learning_rate": 1.5502733888093564e-05,
|
| 24571 |
+
"loss": 1.1143,
|
| 24572 |
+
"step": 314900
|
| 24573 |
+
},
|
| 24574 |
+
{
|
| 24575 |
+
"epoch": 0.024,
|
| 24576 |
+
"grad_norm": 0.8219897150993347,
|
| 24577 |
+
"learning_rate": 1.5488058671207027e-05,
|
| 24578 |
+
"loss": 1.0936,
|
| 24579 |
+
"step": 315000
|
| 24580 |
+
},
|
| 24581 |
+
{
|
| 24582 |
+
"epoch": 0.024,
|
| 24583 |
+
"eval_loss": 2.4566423892974854,
|
| 24584 |
+
"eval_runtime": 52.0959,
|
| 24585 |
+
"eval_samples_per_second": 195.678,
|
| 24586 |
+
"eval_steps_per_second": 1.536,
|
| 24587 |
+
"step": 315000
|
| 24588 |
+
},
|
| 24589 |
+
{
|
| 24590 |
+
"epoch": 0.0242,
|
| 24591 |
+
"grad_norm": 0.8803728818893433,
|
| 24592 |
+
"learning_rate": 1.5473387285729317e-05,
|
| 24593 |
+
"loss": 1.1068,
|
| 24594 |
+
"step": 315100
|
| 24595 |
+
},
|
| 24596 |
+
{
|
| 24597 |
+
"epoch": 0.0244,
|
| 24598 |
+
"grad_norm": 0.9315307140350342,
|
| 24599 |
+
"learning_rate": 1.5458719737570067e-05,
|
| 24600 |
+
"loss": 1.0864,
|
| 24601 |
+
"step": 315200
|
| 24602 |
+
},
|
| 24603 |
+
{
|
| 24604 |
+
"epoch": 0.0246,
|
| 24605 |
+
"grad_norm": 0.9067742824554443,
|
| 24606 |
+
"learning_rate": 1.544405603263737e-05,
|
| 24607 |
+
"loss": 1.0905,
|
| 24608 |
+
"step": 315300
|
| 24609 |
+
},
|
| 24610 |
+
{
|
| 24611 |
+
"epoch": 0.0248,
|
| 24612 |
+
"grad_norm": 0.836283802986145,
|
| 24613 |
+
"learning_rate": 1.5429396176837756e-05,
|
| 24614 |
+
"loss": 1.0925,
|
| 24615 |
+
"step": 315400
|
| 24616 |
+
},
|
| 24617 |
+
{
|
| 24618 |
+
"epoch": 0.025,
|
| 24619 |
+
"grad_norm": 0.8385760188102722,
|
| 24620 |
+
"learning_rate": 1.541474017607622e-05,
|
| 24621 |
+
"loss": 1.0998,
|
| 24622 |
+
"step": 315500
|
| 24623 |
+
},
|
| 24624 |
+
{
|
| 24625 |
+
"epoch": 0.0252,
|
| 24626 |
+
"grad_norm": 0.820689857006073,
|
| 24627 |
+
"learning_rate": 1.5400088036256187e-05,
|
| 24628 |
+
"loss": 1.0826,
|
| 24629 |
+
"step": 315600
|
| 24630 |
+
},
|
| 24631 |
+
{
|
| 24632 |
+
"epoch": 0.0254,
|
| 24633 |
+
"grad_norm": 0.8749442100524902,
|
| 24634 |
+
"learning_rate": 1.5385439763279556e-05,
|
| 24635 |
+
"loss": 1.0923,
|
| 24636 |
+
"step": 315700
|
| 24637 |
+
},
|
| 24638 |
+
{
|
| 24639 |
+
"epoch": 0.0256,
|
| 24640 |
+
"grad_norm": 0.8703187704086304,
|
| 24641 |
+
"learning_rate": 1.537079536304663e-05,
|
| 24642 |
+
"loss": 1.0874,
|
| 24643 |
+
"step": 315800
|
| 24644 |
+
},
|
| 24645 |
+
{
|
| 24646 |
+
"epoch": 0.0258,
|
| 24647 |
+
"grad_norm": 0.8370440006256104,
|
| 24648 |
+
"learning_rate": 1.535615484145619e-05,
|
| 24649 |
+
"loss": 1.0905,
|
| 24650 |
+
"step": 315900
|
| 24651 |
+
},
|
| 24652 |
+
{
|
| 24653 |
+
"epoch": 0.026,
|
| 24654 |
+
"grad_norm": 0.8787978887557983,
|
| 24655 |
+
"learning_rate": 1.5341518204405416e-05,
|
| 24656 |
+
"loss": 1.0855,
|
| 24657 |
+
"step": 316000
|
| 24658 |
+
},
|
| 24659 |
+
{
|
| 24660 |
+
"epoch": 0.026,
|
| 24661 |
+
"eval_loss": 2.462463617324829,
|
| 24662 |
+
"eval_runtime": 51.7904,
|
| 24663 |
+
"eval_samples_per_second": 196.832,
|
| 24664 |
+
"eval_steps_per_second": 1.545,
|
| 24665 |
+
"step": 316000
|
| 24666 |
+
},
|
| 24667 |
+
{
|
| 24668 |
+
"epoch": 0.0262,
|
| 24669 |
+
"grad_norm": 0.8166690468788147,
|
| 24670 |
+
"learning_rate": 1.5326885457789964e-05,
|
| 24671 |
+
"loss": 1.0895,
|
| 24672 |
+
"step": 316100
|
| 24673 |
+
},
|
| 24674 |
+
{
|
| 24675 |
+
"epoch": 0.0264,
|
| 24676 |
+
"grad_norm": 0.8346712589263916,
|
| 24677 |
+
"learning_rate": 1.5312256607503884e-05,
|
| 24678 |
+
"loss": 1.0795,
|
| 24679 |
+
"step": 316200
|
| 24680 |
+
},
|
| 24681 |
+
{
|
| 24682 |
+
"epoch": 0.0266,
|
| 24683 |
+
"grad_norm": 0.8622503876686096,
|
| 24684 |
+
"learning_rate": 1.529763165943969e-05,
|
| 24685 |
+
"loss": 1.0682,
|
| 24686 |
+
"step": 316300
|
| 24687 |
+
},
|
| 24688 |
+
{
|
| 24689 |
+
"epoch": 0.0268,
|
| 24690 |
+
"grad_norm": 0.8298165798187256,
|
| 24691 |
+
"learning_rate": 1.5283010619488296e-05,
|
| 24692 |
+
"loss": 1.077,
|
| 24693 |
+
"step": 316400
|
| 24694 |
+
},
|
| 24695 |
+
{
|
| 24696 |
+
"epoch": 0.027,
|
| 24697 |
+
"grad_norm": 0.8516880869865417,
|
| 24698 |
+
"learning_rate": 1.5268393493539073e-05,
|
| 24699 |
+
"loss": 1.0686,
|
| 24700 |
+
"step": 316500
|
| 24701 |
+
},
|
| 24702 |
+
{
|
| 24703 |
+
"epoch": 0.0272,
|
| 24704 |
+
"grad_norm": 0.8550381660461426,
|
| 24705 |
+
"learning_rate": 1.5253780287479785e-05,
|
| 24706 |
+
"loss": 1.0696,
|
| 24707 |
+
"step": 316600
|
| 24708 |
+
},
|
| 24709 |
+
{
|
| 24710 |
+
"epoch": 0.0274,
|
| 24711 |
+
"grad_norm": 0.821546733379364,
|
| 24712 |
+
"learning_rate": 1.5239171007196623e-05,
|
| 24713 |
+
"loss": 1.0689,
|
| 24714 |
+
"step": 316700
|
| 24715 |
+
},
|
| 24716 |
+
{
|
| 24717 |
+
"epoch": 0.0276,
|
| 24718 |
+
"grad_norm": 0.8041675686836243,
|
| 24719 |
+
"learning_rate": 1.522456565857422e-05,
|
| 24720 |
+
"loss": 1.0649,
|
| 24721 |
+
"step": 316800
|
| 24722 |
+
},
|
| 24723 |
+
{
|
| 24724 |
+
"epoch": 0.0278,
|
| 24725 |
+
"grad_norm": 0.9088461995124817,
|
| 24726 |
+
"learning_rate": 1.5209964247495595e-05,
|
| 24727 |
+
"loss": 1.0751,
|
| 24728 |
+
"step": 316900
|
| 24729 |
+
},
|
| 24730 |
+
{
|
| 24731 |
+
"epoch": 0.028,
|
| 24732 |
+
"grad_norm": 0.8547507524490356,
|
| 24733 |
+
"learning_rate": 1.5195366779842207e-05,
|
| 24734 |
+
"loss": 1.0798,
|
| 24735 |
+
"step": 317000
|
| 24736 |
+
},
|
| 24737 |
+
{
|
| 24738 |
+
"epoch": 0.028,
|
| 24739 |
+
"eval_loss": 2.4812233448028564,
|
| 24740 |
+
"eval_runtime": 52.0302,
|
| 24741 |
+
"eval_samples_per_second": 195.925,
|
| 24742 |
+
"eval_steps_per_second": 1.538,
|
| 24743 |
+
"step": 317000
|
| 24744 |
+
},
|
| 24745 |
+
{
|
| 24746 |
+
"epoch": 0.0282,
|
| 24747 |
+
"grad_norm": 0.8872113823890686,
|
| 24748 |
+
"learning_rate": 1.5180773261493902e-05,
|
| 24749 |
+
"loss": 1.0652,
|
| 24750 |
+
"step": 317100
|
| 24751 |
+
},
|
| 24752 |
+
{
|
| 24753 |
+
"epoch": 0.0284,
|
| 24754 |
+
"grad_norm": 0.984126091003418,
|
| 24755 |
+
"learning_rate": 1.5166183698328957e-05,
|
| 24756 |
+
"loss": 1.0654,
|
| 24757 |
+
"step": 317200
|
| 24758 |
+
},
|
| 24759 |
+
{
|
| 24760 |
+
"epoch": 0.0286,
|
| 24761 |
+
"grad_norm": 0.8874821066856384,
|
| 24762 |
+
"learning_rate": 1.5151598096224037e-05,
|
| 24763 |
+
"loss": 1.0571,
|
| 24764 |
+
"step": 317300
|
| 24765 |
+
},
|
| 24766 |
+
{
|
| 24767 |
+
"epoch": 0.0288,
|
| 24768 |
+
"grad_norm": 0.8837223649024963,
|
| 24769 |
+
"learning_rate": 1.5137016461054233e-05,
|
| 24770 |
+
"loss": 1.066,
|
| 24771 |
+
"step": 317400
|
| 24772 |
+
},
|
| 24773 |
+
{
|
| 24774 |
+
"epoch": 0.029,
|
| 24775 |
+
"grad_norm": 0.879486083984375,
|
| 24776 |
+
"learning_rate": 1.512243879869301e-05,
|
| 24777 |
+
"loss": 1.0572,
|
| 24778 |
+
"step": 317500
|
| 24779 |
+
},
|
| 24780 |
+
{
|
| 24781 |
+
"epoch": 0.0292,
|
| 24782 |
+
"grad_norm": 0.8751283884048462,
|
| 24783 |
+
"learning_rate": 1.5107865115012265e-05,
|
| 24784 |
+
"loss": 1.0552,
|
| 24785 |
+
"step": 317600
|
| 24786 |
+
},
|
| 24787 |
+
{
|
| 24788 |
+
"epoch": 0.0294,
|
| 24789 |
+
"grad_norm": 0.8803706765174866,
|
| 24790 |
+
"learning_rate": 1.5093295415882267e-05,
|
| 24791 |
+
"loss": 1.0499,
|
| 24792 |
+
"step": 317700
|
| 24793 |
+
},
|
| 24794 |
+
{
|
| 24795 |
+
"epoch": 0.0296,
|
| 24796 |
+
"grad_norm": 0.8694496750831604,
|
| 24797 |
+
"learning_rate": 1.507872970717169e-05,
|
| 24798 |
+
"loss": 1.0608,
|
| 24799 |
+
"step": 317800
|
| 24800 |
+
},
|
| 24801 |
+
{
|
| 24802 |
+
"epoch": 0.0298,
|
| 24803 |
+
"grad_norm": 0.8200892806053162,
|
| 24804 |
+
"learning_rate": 1.5064167994747603e-05,
|
| 24805 |
+
"loss": 1.0415,
|
| 24806 |
+
"step": 317900
|
| 24807 |
+
},
|
| 24808 |
+
{
|
| 24809 |
+
"epoch": 0.03,
|
| 24810 |
+
"grad_norm": 0.8422415256500244,
|
| 24811 |
+
"learning_rate": 1.5049610284475458e-05,
|
| 24812 |
+
"loss": 1.0487,
|
| 24813 |
+
"step": 318000
|
| 24814 |
+
},
|
| 24815 |
+
{
|
| 24816 |
+
"epoch": 0.03,
|
| 24817 |
+
"eval_loss": 2.492359161376953,
|
| 24818 |
+
"eval_runtime": 51.9706,
|
| 24819 |
+
"eval_samples_per_second": 196.149,
|
| 24820 |
+
"eval_steps_per_second": 1.539,
|
| 24821 |
+
"step": 318000
|
| 24822 |
+
},
|
| 24823 |
+
{
|
| 24824 |
+
"epoch": 0.0002,
|
| 24825 |
+
"grad_norm": 0.8418950438499451,
|
| 24826 |
+
"learning_rate": 1.5035056582219098e-05,
|
| 24827 |
+
"loss": 1.0456,
|
| 24828 |
+
"step": 318100
|
| 24829 |
+
},
|
| 24830 |
+
{
|
| 24831 |
+
"epoch": 0.0004,
|
| 24832 |
+
"grad_norm": 0.8390074968338013,
|
| 24833 |
+
"learning_rate": 1.5020506893840758e-05,
|
| 24834 |
+
"loss": 1.0318,
|
| 24835 |
+
"step": 318200
|
| 24836 |
+
},
|
| 24837 |
+
{
|
| 24838 |
+
"epoch": 0.0006,
|
| 24839 |
+
"grad_norm": 0.8178459405899048,
|
| 24840 |
+
"learning_rate": 1.5005961225201048e-05,
|
| 24841 |
+
"loss": 1.0373,
|
| 24842 |
+
"step": 318300
|
| 24843 |
+
},
|
| 24844 |
+
{
|
| 24845 |
+
"epoch": 0.0008,
|
| 24846 |
+
"grad_norm": 0.8252522349357605,
|
| 24847 |
+
"learning_rate": 1.4991419582158959e-05,
|
| 24848 |
+
"loss": 1.0267,
|
| 24849 |
+
"step": 318400
|
| 24850 |
+
},
|
| 24851 |
+
{
|
| 24852 |
+
"epoch": 0.001,
|
| 24853 |
+
"grad_norm": 0.8596453070640564,
|
| 24854 |
+
"learning_rate": 1.4976881970571868e-05,
|
| 24855 |
+
"loss": 1.045,
|
| 24856 |
+
"step": 318500
|
| 24857 |
+
},
|
| 24858 |
+
{
|
| 24859 |
+
"epoch": 0.0012,
|
| 24860 |
+
"grad_norm": 0.9191332459449768,
|
| 24861 |
+
"learning_rate": 1.4962348396295517e-05,
|
| 24862 |
+
"loss": 1.0201,
|
| 24863 |
+
"step": 318600
|
| 24864 |
+
},
|
| 24865 |
+
{
|
| 24866 |
+
"epoch": 0.0014,
|
| 24867 |
+
"grad_norm": 0.8910384774208069,
|
| 24868 |
+
"learning_rate": 1.4947818865184035e-05,
|
| 24869 |
+
"loss": 1.0176,
|
| 24870 |
+
"step": 318700
|
| 24871 |
+
},
|
| 24872 |
+
{
|
| 24873 |
+
"epoch": 0.0016,
|
| 24874 |
+
"grad_norm": 0.8146995902061462,
|
| 24875 |
+
"learning_rate": 1.4933293383089908e-05,
|
| 24876 |
+
"loss": 1.0263,
|
| 24877 |
+
"step": 318800
|
| 24878 |
+
},
|
| 24879 |
+
{
|
| 24880 |
+
"epoch": 0.0018,
|
| 24881 |
+
"grad_norm": 0.8134050965309143,
|
| 24882 |
+
"learning_rate": 1.4918771955864009e-05,
|
| 24883 |
+
"loss": 1.0085,
|
| 24884 |
+
"step": 318900
|
| 24885 |
+
},
|
| 24886 |
+
{
|
| 24887 |
+
"epoch": 0.002,
|
| 24888 |
+
"grad_norm": 0.8413226008415222,
|
| 24889 |
+
"learning_rate": 1.4904254589355555e-05,
|
| 24890 |
+
"loss": 1.0336,
|
| 24891 |
+
"step": 319000
|
| 24892 |
+
},
|
| 24893 |
+
{
|
| 24894 |
+
"epoch": 0.002,
|
| 24895 |
+
"eval_loss": 2.506340742111206,
|
| 24896 |
+
"eval_runtime": 52.1719,
|
| 24897 |
+
"eval_samples_per_second": 195.393,
|
| 24898 |
+
"eval_steps_per_second": 1.533,
|
| 24899 |
+
"step": 319000
|
| 24900 |
+
},
|
| 24901 |
+
{
|
| 24902 |
+
"epoch": 0.0022,
|
| 24903 |
+
"grad_norm": 0.8138185739517212,
|
| 24904 |
+
"learning_rate": 1.4889741289412145e-05,
|
| 24905 |
+
"loss": 1.023,
|
| 24906 |
+
"step": 319100
|
| 24907 |
+
},
|
| 24908 |
+
{
|
| 24909 |
+
"epoch": 0.0024,
|
| 24910 |
+
"grad_norm": 0.8572819232940674,
|
| 24911 |
+
"learning_rate": 1.4875232061879735e-05,
|
| 24912 |
+
"loss": 1.0055,
|
| 24913 |
+
"step": 319200
|
| 24914 |
+
},
|
| 24915 |
+
{
|
| 24916 |
+
"epoch": 0.0026,
|
| 24917 |
+
"grad_norm": 0.8657738566398621,
|
| 24918 |
+
"learning_rate": 1.4860726912602643e-05,
|
| 24919 |
+
"loss": 1.009,
|
| 24920 |
+
"step": 319300
|
| 24921 |
+
},
|
| 24922 |
+
{
|
| 24923 |
+
"epoch": 0.0028,
|
| 24924 |
+
"grad_norm": 0.8982349634170532,
|
| 24925 |
+
"learning_rate": 1.4846225847423545e-05,
|
| 24926 |
+
"loss": 1.021,
|
| 24927 |
+
"step": 319400
|
| 24928 |
+
},
|
| 24929 |
+
{
|
| 24930 |
+
"epoch": 0.003,
|
| 24931 |
+
"grad_norm": 0.8425928354263306,
|
| 24932 |
+
"learning_rate": 1.4831728872183448e-05,
|
| 24933 |
+
"loss": 1.0206,
|
| 24934 |
+
"step": 319500
|
| 24935 |
+
},
|
| 24936 |
+
{
|
| 24937 |
+
"epoch": 0.0032,
|
| 24938 |
+
"grad_norm": 0.8392213582992554,
|
| 24939 |
+
"learning_rate": 1.481723599272175e-05,
|
| 24940 |
+
"loss": 1.0088,
|
| 24941 |
+
"step": 319600
|
| 24942 |
+
},
|
| 24943 |
+
{
|
| 24944 |
+
"epoch": 0.0034,
|
| 24945 |
+
"grad_norm": 0.8505594730377197,
|
| 24946 |
+
"learning_rate": 1.480274721487618e-05,
|
| 24947 |
+
"loss": 0.9964,
|
| 24948 |
+
"step": 319700
|
| 24949 |
+
},
|
| 24950 |
+
{
|
| 24951 |
+
"epoch": 0.0036,
|
| 24952 |
+
"grad_norm": 0.7965133190155029,
|
| 24953 |
+
"learning_rate": 1.4788262544482805e-05,
|
| 24954 |
+
"loss": 1.0288,
|
| 24955 |
+
"step": 319800
|
| 24956 |
+
},
|
| 24957 |
+
{
|
| 24958 |
+
"epoch": 0.0038,
|
| 24959 |
+
"grad_norm": 0.8193480372428894,
|
| 24960 |
+
"learning_rate": 1.4773781987376061e-05,
|
| 24961 |
+
"loss": 0.9985,
|
| 24962 |
+
"step": 319900
|
| 24963 |
+
},
|
| 24964 |
+
{
|
| 24965 |
+
"epoch": 0.004,
|
| 24966 |
+
"grad_norm": 0.8430262207984924,
|
| 24967 |
+
"learning_rate": 1.4759305549388708e-05,
|
| 24968 |
+
"loss": 1.0053,
|
| 24969 |
+
"step": 320000
|
| 24970 |
+
},
|
| 24971 |
+
{
|
| 24972 |
+
"epoch": 0.004,
|
| 24973 |
+
"eval_loss": 2.515505790710449,
|
| 24974 |
+
"eval_runtime": 51.658,
|
| 24975 |
+
"eval_samples_per_second": 197.337,
|
| 24976 |
+
"eval_steps_per_second": 1.549,
|
| 24977 |
+
"step": 320000
|
| 24978 |
+
},
|
| 24979 |
+
{
|
| 24980 |
+
"epoch": 0.0042,
|
| 24981 |
+
"grad_norm": 0.8491013050079346,
|
| 24982 |
+
"learning_rate": 1.4744833236351857e-05,
|
| 24983 |
+
"loss": 1.0021,
|
| 24984 |
+
"step": 320100
|
| 24985 |
+
},
|
| 24986 |
+
{
|
| 24987 |
+
"epoch": 0.0044,
|
| 24988 |
+
"grad_norm": 0.8557093739509583,
|
| 24989 |
+
"learning_rate": 1.4730365054094947e-05,
|
| 24990 |
+
"loss": 0.9974,
|
| 24991 |
+
"step": 320200
|
| 24992 |
+
},
|
| 24993 |
+
{
|
| 24994 |
+
"epoch": 0.0046,
|
| 24995 |
+
"grad_norm": 0.8552497625350952,
|
| 24996 |
+
"learning_rate": 1.471590100844577e-05,
|
| 24997 |
+
"loss": 0.9937,
|
| 24998 |
+
"step": 320300
|
| 24999 |
+
},
|
| 25000 |
+
{
|
| 25001 |
+
"epoch": 0.0048,
|
| 25002 |
+
"grad_norm": 0.7959555983543396,
|
| 25003 |
+
"learning_rate": 1.4701441105230435e-05,
|
| 25004 |
+
"loss": 1.0001,
|
| 25005 |
+
"step": 320400
|
| 25006 |
+
},
|
| 25007 |
+
{
|
| 25008 |
+
"epoch": 0.005,
|
| 25009 |
+
"grad_norm": 0.8395636081695557,
|
| 25010 |
+
"learning_rate": 1.4686985350273391e-05,
|
| 25011 |
+
"loss": 0.9984,
|
| 25012 |
+
"step": 320500
|
| 25013 |
+
},
|
| 25014 |
+
{
|
| 25015 |
+
"epoch": 0.0052,
|
| 25016 |
+
"grad_norm": 0.8316648602485657,
|
| 25017 |
+
"learning_rate": 1.4672533749397414e-05,
|
| 25018 |
+
"loss": 0.988,
|
| 25019 |
+
"step": 320600
|
| 25020 |
+
},
|
| 25021 |
+
{
|
| 25022 |
+
"epoch": 0.0054,
|
| 25023 |
+
"grad_norm": 0.8290709853172302,
|
| 25024 |
+
"learning_rate": 1.4658086308423608e-05,
|
| 25025 |
+
"loss": 0.9984,
|
| 25026 |
+
"step": 320700
|
| 25027 |
+
},
|
| 25028 |
+
{
|
| 25029 |
+
"epoch": 0.0056,
|
| 25030 |
+
"grad_norm": 0.8538153767585754,
|
| 25031 |
+
"learning_rate": 1.46436430331714e-05,
|
| 25032 |
+
"loss": 1.0038,
|
| 25033 |
+
"step": 320800
|
| 25034 |
+
},
|
| 25035 |
+
{
|
| 25036 |
+
"epoch": 0.0058,
|
| 25037 |
+
"grad_norm": 0.828048586845398,
|
| 25038 |
+
"learning_rate": 1.462920392945854e-05,
|
| 25039 |
+
"loss": 0.9952,
|
| 25040 |
+
"step": 320900
|
| 25041 |
+
},
|
| 25042 |
+
{
|
| 25043 |
+
"epoch": 0.006,
|
| 25044 |
+
"grad_norm": 0.8509120941162109,
|
| 25045 |
+
"learning_rate": 1.4614769003101097e-05,
|
| 25046 |
+
"loss": 1.0151,
|
| 25047 |
+
"step": 321000
|
| 25048 |
+
},
|
| 25049 |
+
{
|
| 25050 |
+
"epoch": 0.006,
|
| 25051 |
+
"eval_loss": 2.529923677444458,
|
| 25052 |
+
"eval_runtime": 51.641,
|
| 25053 |
+
"eval_samples_per_second": 197.401,
|
| 25054 |
+
"eval_steps_per_second": 1.549,
|
| 25055 |
+
"step": 321000
|
| 25056 |
+
},
|
| 25057 |
+
{
|
| 25058 |
+
"epoch": 0.0062,
|
| 25059 |
+
"grad_norm": 0.8277125358581543,
|
| 25060 |
+
"learning_rate": 1.460033825991346e-05,
|
| 25061 |
+
"loss": 1.0018,
|
| 25062 |
+
"step": 321100
|
| 25063 |
+
},
|
| 25064 |
+
{
|
| 25065 |
+
"epoch": 0.0064,
|
| 25066 |
+
"grad_norm": 0.8201048374176025,
|
| 25067 |
+
"learning_rate": 1.4585911705708325e-05,
|
| 25068 |
+
"loss": 1.0042,
|
| 25069 |
+
"step": 321200
|
| 25070 |
+
},
|
| 25071 |
+
{
|
| 25072 |
+
"epoch": 0.0066,
|
| 25073 |
+
"grad_norm": 0.8629177212715149,
|
| 25074 |
+
"learning_rate": 1.4571489346296718e-05,
|
| 25075 |
+
"loss": 1.0076,
|
| 25076 |
+
"step": 321300
|
| 25077 |
+
},
|
| 25078 |
+
{
|
| 25079 |
+
"epoch": 0.0068,
|
| 25080 |
+
"grad_norm": 0.8436629176139832,
|
| 25081 |
+
"learning_rate": 1.4557071187487945e-05,
|
| 25082 |
+
"loss": 1.0137,
|
| 25083 |
+
"step": 321400
|
| 25084 |
+
},
|
| 25085 |
+
{
|
| 25086 |
+
"epoch": 0.007,
|
| 25087 |
+
"grad_norm": 0.9035348892211914,
|
| 25088 |
+
"learning_rate": 1.4542657235089649e-05,
|
| 25089 |
+
"loss": 0.9959,
|
| 25090 |
+
"step": 321500
|
| 25091 |
+
},
|
| 25092 |
+
{
|
| 25093 |
+
"epoch": 0.0072,
|
| 25094 |
+
"grad_norm": 0.8393178582191467,
|
| 25095 |
+
"learning_rate": 1.4528247494907768e-05,
|
| 25096 |
+
"loss": 1.0055,
|
| 25097 |
+
"step": 321600
|
| 25098 |
+
},
|
| 25099 |
+
{
|
| 25100 |
+
"epoch": 0.0074,
|
| 25101 |
+
"grad_norm": 0.8507473468780518,
|
| 25102 |
+
"learning_rate": 1.4513841972746555e-05,
|
| 25103 |
+
"loss": 1.0039,
|
| 25104 |
+
"step": 321700
|
| 25105 |
+
},
|
| 25106 |
+
{
|
| 25107 |
+
"epoch": 0.0076,
|
| 25108 |
+
"grad_norm": 0.8492685556411743,
|
| 25109 |
+
"learning_rate": 1.4499440674408529e-05,
|
| 25110 |
+
"loss": 1.0109,
|
| 25111 |
+
"step": 321800
|
| 25112 |
+
},
|
| 25113 |
+
{
|
| 25114 |
+
"epoch": 0.0078,
|
| 25115 |
+
"grad_norm": 0.8794492483139038,
|
| 25116 |
+
"learning_rate": 1.4485043605694545e-05,
|
| 25117 |
+
"loss": 0.9981,
|
| 25118 |
+
"step": 321900
|
| 25119 |
+
},
|
| 25120 |
+
{
|
| 25121 |
+
"epoch": 0.008,
|
| 25122 |
+
"grad_norm": 0.9299744963645935,
|
| 25123 |
+
"learning_rate": 1.447065077240374e-05,
|
| 25124 |
+
"loss": 0.999,
|
| 25125 |
+
"step": 322000
|
| 25126 |
+
},
|
| 25127 |
+
{
|
| 25128 |
+
"epoch": 0.008,
|
| 25129 |
+
"eval_loss": 2.534123659133911,
|
| 25130 |
+
"eval_runtime": 51.7664,
|
| 25131 |
+
"eval_samples_per_second": 196.923,
|
| 25132 |
+
"eval_steps_per_second": 1.545,
|
| 25133 |
+
"step": 322000
|
| 25134 |
+
},
|
| 25135 |
+
{
|
| 25136 |
+
"epoch": 0.0082,
|
| 25137 |
+
"grad_norm": 0.8244746923446655,
|
| 25138 |
+
"learning_rate": 1.4456262180333552e-05,
|
| 25139 |
+
"loss": 0.9991,
|
| 25140 |
+
"step": 322100
|
| 25141 |
+
},
|
| 25142 |
+
{
|
| 25143 |
+
"epoch": 0.0084,
|
| 25144 |
+
"grad_norm": 0.8086799383163452,
|
| 25145 |
+
"learning_rate": 1.4441877835279691e-05,
|
| 25146 |
+
"loss": 0.9995,
|
| 25147 |
+
"step": 322200
|
| 25148 |
+
},
|
| 25149 |
+
{
|
| 25150 |
+
"epoch": 0.0086,
|
| 25151 |
+
"grad_norm": 0.8285476565361023,
|
| 25152 |
+
"learning_rate": 1.4427497743036172e-05,
|
| 25153 |
+
"loss": 1.0018,
|
| 25154 |
+
"step": 322300
|
| 25155 |
+
},
|
| 25156 |
+
{
|
| 25157 |
+
"epoch": 0.0088,
|
| 25158 |
+
"grad_norm": 0.8461373448371887,
|
| 25159 |
+
"learning_rate": 1.4413121909395299e-05,
|
| 25160 |
+
"loss": 0.9767,
|
| 25161 |
+
"step": 322400
|
| 25162 |
+
},
|
| 25163 |
+
{
|
| 25164 |
+
"epoch": 0.009,
|
| 25165 |
+
"grad_norm": 0.864859938621521,
|
| 25166 |
+
"learning_rate": 1.4398750340147666e-05,
|
| 25167 |
+
"loss": 1.001,
|
| 25168 |
+
"step": 322500
|
| 25169 |
+
},
|
| 25170 |
+
{
|
| 25171 |
+
"epoch": 0.0092,
|
| 25172 |
+
"grad_norm": 0.8466659784317017,
|
| 25173 |
+
"learning_rate": 1.4384383041082117e-05,
|
| 25174 |
+
"loss": 0.9958,
|
| 25175 |
+
"step": 322600
|
| 25176 |
+
},
|
| 25177 |
+
{
|
| 25178 |
+
"epoch": 0.0094,
|
| 25179 |
+
"grad_norm": 0.8037152290344238,
|
| 25180 |
+
"learning_rate": 1.4370020017985807e-05,
|
| 25181 |
+
"loss": 0.9959,
|
| 25182 |
+
"step": 322700
|
| 25183 |
+
},
|
| 25184 |
+
{
|
| 25185 |
+
"epoch": 0.0096,
|
| 25186 |
+
"grad_norm": 0.8187578320503235,
|
| 25187 |
+
"learning_rate": 1.4355661276644178e-05,
|
| 25188 |
+
"loss": 0.9955,
|
| 25189 |
+
"step": 322800
|
| 25190 |
+
},
|
| 25191 |
+
{
|
| 25192 |
+
"epoch": 0.0098,
|
| 25193 |
+
"grad_norm": 0.8383049368858337,
|
| 25194 |
+
"learning_rate": 1.43413068228409e-05,
|
| 25195 |
+
"loss": 0.9861,
|
| 25196 |
+
"step": 322900
|
| 25197 |
+
},
|
| 25198 |
+
{
|
| 25199 |
+
"epoch": 0.01,
|
| 25200 |
+
"grad_norm": 0.8338568210601807,
|
| 25201 |
+
"learning_rate": 1.432695666235796e-05,
|
| 25202 |
+
"loss": 0.9907,
|
| 25203 |
+
"step": 323000
|
| 25204 |
+
},
|
| 25205 |
+
{
|
| 25206 |
+
"epoch": 0.01,
|
| 25207 |
+
"eval_loss": 2.5478382110595703,
|
| 25208 |
+
"eval_runtime": 51.7181,
|
| 25209 |
+
"eval_samples_per_second": 197.107,
|
| 25210 |
+
"eval_steps_per_second": 1.547,
|
| 25211 |
+
"step": 323000
|
| 25212 |
+
},
|
| 25213 |
+
{
|
| 25214 |
+
"epoch": 0.0102,
|
| 25215 |
+
"grad_norm": 0.9476732611656189,
|
| 25216 |
+
"learning_rate": 1.4312610800975602e-05,
|
| 25217 |
+
"loss": 0.9817,
|
| 25218 |
+
"step": 323100
|
| 25219 |
+
},
|
| 25220 |
+
{
|
| 25221 |
+
"epoch": 0.0104,
|
| 25222 |
+
"grad_norm": 0.8296193480491638,
|
| 25223 |
+
"learning_rate": 1.429826924447234e-05,
|
| 25224 |
+
"loss": 0.9883,
|
| 25225 |
+
"step": 323200
|
| 25226 |
+
},
|
| 25227 |
+
{
|
| 25228 |
+
"epoch": 0.0106,
|
| 25229 |
+
"grad_norm": 0.8237991333007812,
|
| 25230 |
+
"learning_rate": 1.4283931998624938e-05,
|
| 25231 |
+
"loss": 0.9966,
|
| 25232 |
+
"step": 323300
|
| 25233 |
+
},
|
| 25234 |
+
{
|
| 25235 |
+
"epoch": 0.0108,
|
| 25236 |
+
"grad_norm": 0.8200727701187134,
|
| 25237 |
+
"learning_rate": 1.426959906920845e-05,
|
| 25238 |
+
"loss": 0.9925,
|
| 25239 |
+
"step": 323400
|
| 25240 |
+
},
|
| 25241 |
+
{
|
| 25242 |
+
"epoch": 0.011,
|
| 25243 |
+
"grad_norm": 0.7869872450828552,
|
| 25244 |
+
"learning_rate": 1.4255270461996171e-05,
|
| 25245 |
+
"loss": 0.9913,
|
| 25246 |
+
"step": 323500
|
| 25247 |
+
},
|
| 25248 |
+
{
|
| 25249 |
+
"epoch": 0.0112,
|
| 25250 |
+
"grad_norm": 0.8540888428688049,
|
| 25251 |
+
"learning_rate": 1.4240946182759673e-05,
|
| 25252 |
+
"loss": 0.9851,
|
| 25253 |
+
"step": 323600
|
| 25254 |
+
},
|
| 25255 |
+
{
|
| 25256 |
+
"epoch": 0.0114,
|
| 25257 |
+
"grad_norm": 0.9450783729553223,
|
| 25258 |
+
"learning_rate": 1.4226626237268758e-05,
|
| 25259 |
+
"loss": 0.9841,
|
| 25260 |
+
"step": 323700
|
| 25261 |
+
},
|
| 25262 |
+
{
|
| 25263 |
+
"epoch": 0.0116,
|
| 25264 |
+
"grad_norm": 0.8994350433349609,
|
| 25265 |
+
"learning_rate": 1.421231063129151e-05,
|
| 25266 |
+
"loss": 0.9751,
|
| 25267 |
+
"step": 323800
|
| 25268 |
+
},
|
| 25269 |
+
{
|
| 25270 |
+
"epoch": 0.0118,
|
| 25271 |
+
"grad_norm": 0.9152923822402954,
|
| 25272 |
+
"learning_rate": 1.4197999370594246e-05,
|
| 25273 |
+
"loss": 0.9788,
|
| 25274 |
+
"step": 323900
|
| 25275 |
+
},
|
| 25276 |
+
{
|
| 25277 |
+
"epoch": 0.012,
|
| 25278 |
+
"grad_norm": 0.8692894577980042,
|
| 25279 |
+
"learning_rate": 1.418369246094155e-05,
|
| 25280 |
+
"loss": 0.9692,
|
| 25281 |
+
"step": 324000
|
| 25282 |
+
},
|
| 25283 |
+
{
|
| 25284 |
+
"epoch": 0.012,
|
| 25285 |
+
"eval_loss": 2.554483652114868,
|
| 25286 |
+
"eval_runtime": 51.714,
|
| 25287 |
+
"eval_samples_per_second": 197.123,
|
| 25288 |
+
"eval_steps_per_second": 1.547,
|
| 25289 |
+
"step": 324000
|
| 25290 |
+
},
|
| 25291 |
+
{
|
| 25292 |
+
"epoch": 0.0122,
|
| 25293 |
+
"grad_norm": 0.8307340145111084,
|
| 25294 |
+
"learning_rate": 1.4169389908096232e-05,
|
| 25295 |
+
"loss": 0.9791,
|
| 25296 |
+
"step": 324100
|
| 25297 |
+
},
|
| 25298 |
+
{
|
| 25299 |
+
"epoch": 0.0124,
|
| 25300 |
+
"grad_norm": 0.8067870736122131,
|
| 25301 |
+
"learning_rate": 1.4155091717819363e-05,
|
| 25302 |
+
"loss": 0.977,
|
| 25303 |
+
"step": 324200
|
| 25304 |
+
},
|
| 25305 |
+
{
|
| 25306 |
+
"epoch": 0.0126,
|
| 25307 |
+
"grad_norm": 0.904922604560852,
|
| 25308 |
+
"learning_rate": 1.414079789587025e-05,
|
| 25309 |
+
"loss": 0.9615,
|
| 25310 |
+
"step": 324300
|
| 25311 |
+
},
|
| 25312 |
+
{
|
| 25313 |
+
"epoch": 0.0128,
|
| 25314 |
+
"grad_norm": 0.8454153537750244,
|
| 25315 |
+
"learning_rate": 1.4126508448006459e-05,
|
| 25316 |
+
"loss": 0.9681,
|
| 25317 |
+
"step": 324400
|
| 25318 |
+
},
|
| 25319 |
+
{
|
| 25320 |
+
"epoch": 0.013,
|
| 25321 |
+
"grad_norm": 0.8959038257598877,
|
| 25322 |
+
"learning_rate": 1.4112223379983755e-05,
|
| 25323 |
+
"loss": 0.9746,
|
| 25324 |
+
"step": 324500
|
| 25325 |
+
},
|
| 25326 |
+
{
|
| 25327 |
+
"epoch": 0.0132,
|
| 25328 |
+
"grad_norm": 0.9153333306312561,
|
| 25329 |
+
"learning_rate": 1.4097942697556172e-05,
|
| 25330 |
+
"loss": 0.9728,
|
| 25331 |
+
"step": 324600
|
| 25332 |
+
},
|
| 25333 |
+
{
|
| 25334 |
+
"epoch": 0.0134,
|
| 25335 |
+
"grad_norm": 0.809781551361084,
|
| 25336 |
+
"learning_rate": 1.4083666406475976e-05,
|
| 25337 |
+
"loss": 0.964,
|
| 25338 |
+
"step": 324700
|
| 25339 |
+
},
|
| 25340 |
+
{
|
| 25341 |
+
"epoch": 0.0136,
|
| 25342 |
+
"grad_norm": 0.8854051232337952,
|
| 25343 |
+
"learning_rate": 1.4069394512493634e-05,
|
| 25344 |
+
"loss": 0.9826,
|
| 25345 |
+
"step": 324800
|
| 25346 |
+
},
|
| 25347 |
+
{
|
| 25348 |
+
"epoch": 0.0138,
|
| 25349 |
+
"grad_norm": 0.8811824917793274,
|
| 25350 |
+
"learning_rate": 1.4055127021357877e-05,
|
| 25351 |
+
"loss": 0.9809,
|
| 25352 |
+
"step": 324900
|
| 25353 |
+
},
|
| 25354 |
+
{
|
| 25355 |
+
"epoch": 0.014,
|
| 25356 |
+
"grad_norm": 0.8924720883369446,
|
| 25357 |
+
"learning_rate": 1.4040863938815645e-05,
|
| 25358 |
+
"loss": 0.9611,
|
| 25359 |
+
"step": 325000
|
| 25360 |
+
},
|
| 25361 |
+
{
|
| 25362 |
+
"epoch": 0.014,
|
| 25363 |
+
"eval_loss": 2.559173583984375,
|
| 25364 |
+
"eval_runtime": 51.7882,
|
| 25365 |
+
"eval_samples_per_second": 196.84,
|
| 25366 |
+
"eval_steps_per_second": 1.545,
|
| 25367 |
+
"step": 325000
|
| 25368 |
+
},
|
| 25369 |
+
{
|
| 25370 |
+
"epoch": 0.0142,
|
| 25371 |
+
"grad_norm": 0.8205790519714355,
|
| 25372 |
+
"learning_rate": 1.402660527061212e-05,
|
| 25373 |
+
"loss": 0.9903,
|
| 25374 |
+
"step": 325100
|
| 25375 |
+
},
|
| 25376 |
+
{
|
| 25377 |
+
"epoch": 0.0144,
|
| 25378 |
+
"grad_norm": 0.8341870903968811,
|
| 25379 |
+
"learning_rate": 1.4012351022490672e-05,
|
| 25380 |
+
"loss": 0.9615,
|
| 25381 |
+
"step": 325200
|
| 25382 |
+
},
|
| 25383 |
+
{
|
| 25384 |
+
"epoch": 0.0146,
|
| 25385 |
+
"grad_norm": 0.8305156230926514,
|
| 25386 |
+
"learning_rate": 1.3998101200192915e-05,
|
| 25387 |
+
"loss": 0.9627,
|
| 25388 |
+
"step": 325300
|
| 25389 |
+
},
|
| 25390 |
+
{
|
| 25391 |
+
"epoch": 0.0148,
|
| 25392 |
+
"grad_norm": 0.9122214317321777,
|
| 25393 |
+
"learning_rate": 1.398385580945868e-05,
|
| 25394 |
+
"loss": 0.9129,
|
| 25395 |
+
"step": 325400
|
| 25396 |
+
},
|
| 25397 |
+
{
|
| 25398 |
+
"epoch": 0.015,
|
| 25399 |
+
"grad_norm": 0.868425190448761,
|
| 25400 |
+
"learning_rate": 1.3969614856026014e-05,
|
| 25401 |
+
"loss": 0.968,
|
| 25402 |
+
"step": 325500
|
| 25403 |
+
},
|
| 25404 |
+
{
|
| 25405 |
+
"epoch": 0.0152,
|
| 25406 |
+
"grad_norm": 0.8120792508125305,
|
| 25407 |
+
"learning_rate": 1.3955378345631159e-05,
|
| 25408 |
+
"loss": 0.9689,
|
| 25409 |
+
"step": 325600
|
| 25410 |
+
},
|
| 25411 |
+
{
|
| 25412 |
+
"epoch": 0.0154,
|
| 25413 |
+
"grad_norm": 0.8308644890785217,
|
| 25414 |
+
"learning_rate": 1.3941146284008582e-05,
|
| 25415 |
+
"loss": 0.9404,
|
| 25416 |
+
"step": 325700
|
| 25417 |
+
},
|
| 25418 |
+
{
|
| 25419 |
+
"epoch": 0.0156,
|
| 25420 |
+
"grad_norm": 0.7607423663139343,
|
| 25421 |
+
"learning_rate": 1.3926918676890965e-05,
|
| 25422 |
+
"loss": 0.9587,
|
| 25423 |
+
"step": 325800
|
| 25424 |
+
},
|
| 25425 |
+
{
|
| 25426 |
+
"epoch": 0.0158,
|
| 25427 |
+
"grad_norm": 0.8530341386795044,
|
| 25428 |
+
"learning_rate": 1.3912695530009184e-05,
|
| 25429 |
+
"loss": 0.9584,
|
| 25430 |
+
"step": 325900
|
| 25431 |
+
},
|
| 25432 |
+
{
|
| 25433 |
+
"epoch": 0.016,
|
| 25434 |
+
"grad_norm": 0.8315464854240417,
|
| 25435 |
+
"learning_rate": 1.3898476849092312e-05,
|
| 25436 |
+
"loss": 0.9507,
|
| 25437 |
+
"step": 326000
|
| 25438 |
+
},
|
| 25439 |
+
{
|
| 25440 |
+
"epoch": 0.016,
|
| 25441 |
+
"eval_loss": 2.574967861175537,
|
| 25442 |
+
"eval_runtime": 52.1092,
|
| 25443 |
+
"eval_samples_per_second": 195.628,
|
| 25444 |
+
"eval_steps_per_second": 1.535,
|
| 25445 |
+
"step": 326000
|
| 25446 |
+
},
|
| 25447 |
+
{
|
| 25448 |
+
"epoch": 0.0002,
|
| 25449 |
+
"grad_norm": 0.87019944190979,
|
| 25450 |
+
"learning_rate": 1.3884262639867638e-05,
|
| 25451 |
+
"loss": 0.7316,
|
| 25452 |
+
"step": 326100
|
| 25453 |
+
},
|
| 25454 |
+
{
|
| 25455 |
+
"epoch": 0.0004,
|
| 25456 |
+
"grad_norm": 0.8352780342102051,
|
| 25457 |
+
"learning_rate": 1.3870052908060651e-05,
|
| 25458 |
+
"loss": 0.7268,
|
| 25459 |
+
"step": 326200
|
| 25460 |
+
},
|
| 25461 |
+
{
|
| 25462 |
+
"epoch": 0.0006,
|
| 25463 |
+
"grad_norm": 0.9428650736808777,
|
| 25464 |
+
"learning_rate": 1.3855847659395013e-05,
|
| 25465 |
+
"loss": 0.717,
|
| 25466 |
+
"step": 326300
|
| 25467 |
+
},
|
| 25468 |
+
{
|
| 25469 |
+
"epoch": 0.0008,
|
| 25470 |
+
"grad_norm": 1.0137333869934082,
|
| 25471 |
+
"learning_rate": 1.3841646899592603e-05,
|
| 25472 |
+
"loss": 0.7362,
|
| 25473 |
+
"step": 326400
|
| 25474 |
+
},
|
| 25475 |
+
{
|
| 25476 |
+
"epoch": 0.001,
|
| 25477 |
+
"grad_norm": 0.9063905477523804,
|
| 25478 |
+
"learning_rate": 1.382745063437349e-05,
|
| 25479 |
+
"loss": 0.7192,
|
| 25480 |
+
"step": 326500
|
| 25481 |
+
},
|
| 25482 |
+
{
|
| 25483 |
+
"epoch": 0.0012,
|
| 25484 |
+
"grad_norm": 0.8576821088790894,
|
| 25485 |
+
"learning_rate": 1.3813258869455936e-05,
|
| 25486 |
+
"loss": 0.72,
|
| 25487 |
+
"step": 326600
|
| 25488 |
+
},
|
| 25489 |
+
{
|
| 25490 |
+
"epoch": 0.0014,
|
| 25491 |
+
"grad_norm": 0.8997663259506226,
|
| 25492 |
+
"learning_rate": 1.3799071610556358e-05,
|
| 25493 |
+
"loss": 0.7216,
|
| 25494 |
+
"step": 326700
|
| 25495 |
+
},
|
| 25496 |
+
{
|
| 25497 |
+
"epoch": 0.0016,
|
| 25498 |
+
"grad_norm": 0.8130722641944885,
|
| 25499 |
+
"learning_rate": 1.37848888633894e-05,
|
| 25500 |
+
"loss": 0.7251,
|
| 25501 |
+
"step": 326800
|
| 25502 |
+
},
|
| 25503 |
+
{
|
| 25504 |
+
"epoch": 0.0018,
|
| 25505 |
+
"grad_norm": 0.9513541460037231,
|
| 25506 |
+
"learning_rate": 1.3770710633667863e-05,
|
| 25507 |
+
"loss": 0.7245,
|
| 25508 |
+
"step": 326900
|
| 25509 |
+
},
|
| 25510 |
+
{
|
| 25511 |
+
"epoch": 0.002,
|
| 25512 |
+
"grad_norm": 0.8725600838661194,
|
| 25513 |
+
"learning_rate": 1.3756536927102753e-05,
|
| 25514 |
+
"loss": 0.7186,
|
| 25515 |
+
"step": 327000
|
| 25516 |
+
},
|
| 25517 |
+
{
|
| 25518 |
+
"epoch": 0.002,
|
| 25519 |
+
"eval_loss": 2.0350961685180664,
|
| 25520 |
+
"eval_runtime": 51.8928,
|
| 25521 |
+
"eval_samples_per_second": 196.444,
|
| 25522 |
+
"eval_steps_per_second": 1.542,
|
| 25523 |
+
"step": 327000
|
| 25524 |
+
},
|
| 25525 |
+
{
|
| 25526 |
+
"epoch": 0.0022,
|
| 25527 |
+
"grad_norm": 0.9190706610679626,
|
| 25528 |
+
"learning_rate": 1.3742367749403212e-05,
|
| 25529 |
+
"loss": 0.7326,
|
| 25530 |
+
"step": 327100
|
| 25531 |
+
},
|
| 25532 |
+
{
|
| 25533 |
+
"epoch": 0.0024,
|
| 25534 |
+
"grad_norm": 0.8598017692565918,
|
| 25535 |
+
"learning_rate": 1.3728203106276594e-05,
|
| 25536 |
+
"loss": 0.7282,
|
| 25537 |
+
"step": 327200
|
| 25538 |
+
},
|
| 25539 |
+
{
|
| 25540 |
+
"epoch": 0.0026,
|
| 25541 |
+
"grad_norm": 0.833091139793396,
|
| 25542 |
+
"learning_rate": 1.371404300342842e-05,
|
| 25543 |
+
"loss": 0.7183,
|
| 25544 |
+
"step": 327300
|
| 25545 |
+
},
|
| 25546 |
+
{
|
| 25547 |
+
"epoch": 0.0028,
|
| 25548 |
+
"grad_norm": 0.8222286105155945,
|
| 25549 |
+
"learning_rate": 1.3699887446562382e-05,
|
| 25550 |
+
"loss": 0.7139,
|
| 25551 |
+
"step": 327400
|
| 25552 |
+
},
|
| 25553 |
+
{
|
| 25554 |
+
"epoch": 0.003,
|
| 25555 |
+
"grad_norm": 0.8653368353843689,
|
| 25556 |
+
"learning_rate": 1.368573644138032e-05,
|
| 25557 |
+
"loss": 0.7237,
|
| 25558 |
+
"step": 327500
|
| 25559 |
+
},
|
| 25560 |
+
{
|
| 25561 |
+
"epoch": 0.0032,
|
| 25562 |
+
"grad_norm": 0.9050326943397522,
|
| 25563 |
+
"learning_rate": 1.3671589993582268e-05,
|
| 25564 |
+
"loss": 0.7282,
|
| 25565 |
+
"step": 327600
|
| 25566 |
+
},
|
| 25567 |
+
{
|
| 25568 |
+
"epoch": 0.0034,
|
| 25569 |
+
"grad_norm": 0.9215336441993713,
|
| 25570 |
+
"learning_rate": 1.3657448108866423e-05,
|
| 25571 |
+
"loss": 0.7107,
|
| 25572 |
+
"step": 327700
|
| 25573 |
+
},
|
| 25574 |
+
{
|
| 25575 |
+
"epoch": 0.0036,
|
| 25576 |
+
"grad_norm": 0.8540416359901428,
|
| 25577 |
+
"learning_rate": 1.364331079292911e-05,
|
| 25578 |
+
"loss": 0.7176,
|
| 25579 |
+
"step": 327800
|
| 25580 |
+
},
|
| 25581 |
+
{
|
| 25582 |
+
"epoch": 0.0038,
|
| 25583 |
+
"grad_norm": 0.8809969425201416,
|
| 25584 |
+
"learning_rate": 1.3629178051464858e-05,
|
| 25585 |
+
"loss": 0.7223,
|
| 25586 |
+
"step": 327900
|
| 25587 |
+
},
|
| 25588 |
+
{
|
| 25589 |
+
"epoch": 0.004,
|
| 25590 |
+
"grad_norm": 0.8728992342948914,
|
| 25591 |
+
"learning_rate": 1.3615049890166323e-05,
|
| 25592 |
+
"loss": 0.7169,
|
| 25593 |
+
"step": 328000
|
| 25594 |
+
},
|
| 25595 |
+
{
|
| 25596 |
+
"epoch": 0.004,
|
| 25597 |
+
"eval_loss": 2.0252230167388916,
|
| 25598 |
+
"eval_runtime": 51.5937,
|
| 25599 |
+
"eval_samples_per_second": 197.582,
|
| 25600 |
+
"eval_steps_per_second": 1.551,
|
| 25601 |
+
"step": 328000
|
| 25602 |
+
},
|
| 25603 |
+
{
|
| 25604 |
+
"epoch": 0.0042,
|
| 25605 |
+
"grad_norm": 1.0202641487121582,
|
| 25606 |
+
"learning_rate": 1.360092631472433e-05,
|
| 25607 |
+
"loss": 0.7341,
|
| 25608 |
+
"step": 328100
|
| 25609 |
+
},
|
| 25610 |
+
{
|
| 25611 |
+
"epoch": 0.0044,
|
| 25612 |
+
"grad_norm": 0.8477998375892639,
|
| 25613 |
+
"learning_rate": 1.3586807330827861e-05,
|
| 25614 |
+
"loss": 0.7145,
|
| 25615 |
+
"step": 328200
|
| 25616 |
+
},
|
| 25617 |
+
{
|
| 25618 |
+
"epoch": 0.0046,
|
| 25619 |
+
"grad_norm": 0.8075670599937439,
|
| 25620 |
+
"learning_rate": 1.3572692944164029e-05,
|
| 25621 |
+
"loss": 0.7198,
|
| 25622 |
+
"step": 328300
|
| 25623 |
+
},
|
| 25624 |
+
{
|
| 25625 |
+
"epoch": 0.0048,
|
| 25626 |
+
"grad_norm": 0.8715834021568298,
|
| 25627 |
+
"learning_rate": 1.3558583160418109e-05,
|
| 25628 |
+
"loss": 0.7202,
|
| 25629 |
+
"step": 328400
|
| 25630 |
+
},
|
| 25631 |
+
{
|
| 25632 |
+
"epoch": 0.005,
|
| 25633 |
+
"grad_norm": 0.8973333239555359,
|
| 25634 |
+
"learning_rate": 1.3544477985273524e-05,
|
| 25635 |
+
"loss": 0.7165,
|
| 25636 |
+
"step": 328500
|
| 25637 |
+
},
|
| 25638 |
+
{
|
| 25639 |
+
"epoch": 0.0052,
|
| 25640 |
+
"grad_norm": 0.923931360244751,
|
| 25641 |
+
"learning_rate": 1.3530377424411849e-05,
|
| 25642 |
+
"loss": 0.7214,
|
| 25643 |
+
"step": 328600
|
| 25644 |
+
},
|
| 25645 |
+
{
|
| 25646 |
+
"epoch": 0.0054,
|
| 25647 |
+
"grad_norm": 0.9258859753608704,
|
| 25648 |
+
"learning_rate": 1.3516281483512765e-05,
|
| 25649 |
+
"loss": 0.7255,
|
| 25650 |
+
"step": 328700
|
| 25651 |
+
},
|
| 25652 |
+
{
|
| 25653 |
+
"epoch": 0.0056,
|
| 25654 |
+
"grad_norm": 0.8883686661720276,
|
| 25655 |
+
"learning_rate": 1.3502190168254125e-05,
|
| 25656 |
+
"loss": 0.713,
|
| 25657 |
+
"step": 328800
|
| 25658 |
+
},
|
| 25659 |
+
{
|
| 25660 |
+
"epoch": 0.0058,
|
| 25661 |
+
"grad_norm": 0.8454500436782837,
|
| 25662 |
+
"learning_rate": 1.348810348431191e-05,
|
| 25663 |
+
"loss": 0.7117,
|
| 25664 |
+
"step": 328900
|
| 25665 |
+
},
|
| 25666 |
+
{
|
| 25667 |
+
"epoch": 0.006,
|
| 25668 |
+
"grad_norm": 0.9518053531646729,
|
| 25669 |
+
"learning_rate": 1.3474021437360245e-05,
|
| 25670 |
+
"loss": 0.7189,
|
| 25671 |
+
"step": 329000
|
| 25672 |
+
},
|
| 25673 |
+
{
|
| 25674 |
+
"epoch": 0.006,
|
| 25675 |
+
"eval_loss": 2.032439708709717,
|
| 25676 |
+
"eval_runtime": 51.733,
|
| 25677 |
+
"eval_samples_per_second": 197.05,
|
| 25678 |
+
"eval_steps_per_second": 1.546,
|
| 25679 |
+
"step": 329000
|
| 25680 |
+
},
|
| 25681 |
+
{
|
| 25682 |
+
"epoch": 0.0062,
|
| 25683 |
+
"grad_norm": 0.878307044506073,
|
| 25684 |
+
"learning_rate": 1.345994403307136e-05,
|
| 25685 |
+
"loss": 0.7136,
|
| 25686 |
+
"step": 329100
|
| 25687 |
+
},
|
| 25688 |
+
{
|
| 25689 |
+
"epoch": 0.0064,
|
| 25690 |
+
"grad_norm": 0.8827186226844788,
|
| 25691 |
+
"learning_rate": 1.3445871277115635e-05,
|
| 25692 |
+
"loss": 0.7237,
|
| 25693 |
+
"step": 329200
|
| 25694 |
+
},
|
| 25695 |
+
{
|
| 25696 |
+
"epoch": 0.0066,
|
| 25697 |
+
"grad_norm": 0.8805004954338074,
|
| 25698 |
+
"learning_rate": 1.3431803175161586e-05,
|
| 25699 |
+
"loss": 0.7024,
|
| 25700 |
+
"step": 329300
|
| 25701 |
+
},
|
| 25702 |
+
{
|
| 25703 |
+
"epoch": 0.0068,
|
| 25704 |
+
"grad_norm": 0.8745920062065125,
|
| 25705 |
+
"learning_rate": 1.3417739732875829e-05,
|
| 25706 |
+
"loss": 0.7175,
|
| 25707 |
+
"step": 329400
|
| 25708 |
+
},
|
| 25709 |
+
{
|
| 25710 |
+
"epoch": 0.007,
|
| 25711 |
+
"grad_norm": 0.8587835431098938,
|
| 25712 |
+
"learning_rate": 1.340368095592312e-05,
|
| 25713 |
+
"loss": 0.7054,
|
| 25714 |
+
"step": 329500
|
| 25715 |
+
},
|
| 25716 |
+
{
|
| 25717 |
+
"epoch": 0.0072,
|
| 25718 |
+
"grad_norm": 0.8374196290969849,
|
| 25719 |
+
"learning_rate": 1.3389626849966335e-05,
|
| 25720 |
+
"loss": 0.7107,
|
| 25721 |
+
"step": 329600
|
| 25722 |
+
},
|
| 25723 |
+
{
|
| 25724 |
+
"epoch": 0.0074,
|
| 25725 |
+
"grad_norm": 0.929682731628418,
|
| 25726 |
+
"learning_rate": 1.3375577420666477e-05,
|
| 25727 |
+
"loss": 0.7183,
|
| 25728 |
+
"step": 329700
|
| 25729 |
+
},
|
| 25730 |
+
{
|
| 25731 |
+
"epoch": 0.0076,
|
| 25732 |
+
"grad_norm": 0.8738675713539124,
|
| 25733 |
+
"learning_rate": 1.3361532673682633e-05,
|
| 25734 |
+
"loss": 0.7236,
|
| 25735 |
+
"step": 329800
|
| 25736 |
+
},
|
| 25737 |
+
{
|
| 25738 |
+
"epoch": 0.0078,
|
| 25739 |
+
"grad_norm": 0.8550043106079102,
|
| 25740 |
+
"learning_rate": 1.3347492614672039e-05,
|
| 25741 |
+
"loss": 0.7107,
|
| 25742 |
+
"step": 329900
|
| 25743 |
+
},
|
| 25744 |
+
{
|
| 25745 |
+
"epoch": 0.008,
|
| 25746 |
+
"grad_norm": 0.9196627736091614,
|
| 25747 |
+
"learning_rate": 1.3333457249290024e-05,
|
| 25748 |
+
"loss": 0.716,
|
| 25749 |
+
"step": 330000
|
| 25750 |
+
},
|
| 25751 |
+
{
|
| 25752 |
+
"epoch": 0.008,
|
| 25753 |
+
"eval_loss": 2.035661220550537,
|
| 25754 |
+
"eval_runtime": 51.7487,
|
| 25755 |
+
"eval_samples_per_second": 196.99,
|
| 25756 |
+
"eval_steps_per_second": 1.546,
|
| 25757 |
+
"step": 330000
|
| 25758 |
+
},
|
| 25759 |
+
{
|
| 25760 |
+
"epoch": 0.0082,
|
| 25761 |
+
"grad_norm": 0.8340585231781006,
|
| 25762 |
+
"learning_rate": 1.3319426583190042e-05,
|
| 25763 |
+
"loss": 0.7279,
|
| 25764 |
+
"step": 330100
|
| 25765 |
+
},
|
| 25766 |
+
{
|
| 25767 |
+
"epoch": 0.0084,
|
| 25768 |
+
"grad_norm": 0.858969509601593,
|
| 25769 |
+
"learning_rate": 1.3305400622023628e-05,
|
| 25770 |
+
"loss": 0.716,
|
| 25771 |
+
"step": 330200
|
| 25772 |
+
},
|
| 25773 |
+
{
|
| 25774 |
+
"epoch": 0.0086,
|
| 25775 |
+
"grad_norm": 0.9872186183929443,
|
| 25776 |
+
"learning_rate": 1.3291379371440446e-05,
|
| 25777 |
+
"loss": 0.7278,
|
| 25778 |
+
"step": 330300
|
| 25779 |
+
},
|
| 25780 |
+
{
|
| 25781 |
+
"epoch": 0.0088,
|
| 25782 |
+
"grad_norm": 0.8357021808624268,
|
| 25783 |
+
"learning_rate": 1.3277362837088252e-05,
|
| 25784 |
+
"loss": 0.7057,
|
| 25785 |
+
"step": 330400
|
| 25786 |
+
},
|
| 25787 |
+
{
|
| 25788 |
+
"epoch": 0.009,
|
| 25789 |
+
"grad_norm": 0.8592823147773743,
|
| 25790 |
+
"learning_rate": 1.3263351024612914e-05,
|
| 25791 |
+
"loss": 0.7107,
|
| 25792 |
+
"step": 330500
|
| 25793 |
+
},
|
| 25794 |
+
{
|
| 25795 |
+
"epoch": 0.0092,
|
| 25796 |
+
"grad_norm": 0.8655655384063721,
|
| 25797 |
+
"learning_rate": 1.3249343939658371e-05,
|
| 25798 |
+
"loss": 0.7093,
|
| 25799 |
+
"step": 330600
|
| 25800 |
+
},
|
| 25801 |
+
{
|
| 25802 |
+
"epoch": 0.0094,
|
| 25803 |
+
"grad_norm": 0.8590738773345947,
|
| 25804 |
+
"learning_rate": 1.3235341587866684e-05,
|
| 25805 |
+
"loss": 0.7073,
|
| 25806 |
+
"step": 330700
|
| 25807 |
+
},
|
| 25808 |
+
{
|
| 25809 |
+
"epoch": 0.0096,
|
| 25810 |
+
"grad_norm": 0.8633531332015991,
|
| 25811 |
+
"learning_rate": 1.322134397487801e-05,
|
| 25812 |
+
"loss": 0.7129,
|
| 25813 |
+
"step": 330800
|
| 25814 |
+
},
|
| 25815 |
+
{
|
| 25816 |
+
"epoch": 0.0098,
|
| 25817 |
+
"grad_norm": 0.8816627264022827,
|
| 25818 |
+
"learning_rate": 1.3207351106330559e-05,
|
| 25819 |
+
"loss": 0.7114,
|
| 25820 |
+
"step": 330900
|
| 25821 |
+
},
|
| 25822 |
+
{
|
| 25823 |
+
"epoch": 0.01,
|
| 25824 |
+
"grad_norm": 0.9330505132675171,
|
| 25825 |
+
"learning_rate": 1.3193362987860675e-05,
|
| 25826 |
+
"loss": 0.7059,
|
| 25827 |
+
"step": 331000
|
| 25828 |
+
},
|
| 25829 |
+
{
|
| 25830 |
+
"epoch": 0.01,
|
| 25831 |
+
"eval_loss": 2.0230836868286133,
|
| 25832 |
+
"eval_runtime": 51.7504,
|
| 25833 |
+
"eval_samples_per_second": 196.984,
|
| 25834 |
+
"eval_steps_per_second": 1.546,
|
| 25835 |
+
"step": 331000
|
| 25836 |
+
},
|
| 25837 |
+
{
|
| 25838 |
+
"epoch": 0.0102,
|
| 25839 |
+
"grad_norm": 0.8758464455604553,
|
| 25840 |
+
"learning_rate": 1.317937962510277e-05,
|
| 25841 |
+
"loss": 0.7078,
|
| 25842 |
+
"step": 331100
|
| 25843 |
+
},
|
| 25844 |
+
{
|
| 25845 |
+
"epoch": 0.0104,
|
| 25846 |
+
"grad_norm": 0.9444248676300049,
|
| 25847 |
+
"learning_rate": 1.3165401023689344e-05,
|
| 25848 |
+
"loss": 0.7174,
|
| 25849 |
+
"step": 331200
|
| 25850 |
+
},
|
| 25851 |
+
{
|
| 25852 |
+
"epoch": 0.0106,
|
| 25853 |
+
"grad_norm": 0.8706777095794678,
|
| 25854 |
+
"learning_rate": 1.3151427189250965e-05,
|
| 25855 |
+
"loss": 0.7058,
|
| 25856 |
+
"step": 331300
|
| 25857 |
+
},
|
| 25858 |
+
{
|
| 25859 |
+
"epoch": 0.0108,
|
| 25860 |
+
"grad_norm": 0.8867092132568359,
|
| 25861 |
+
"learning_rate": 1.3137458127416297e-05,
|
| 25862 |
+
"loss": 0.7058,
|
| 25863 |
+
"step": 331400
|
| 25864 |
+
},
|
| 25865 |
+
{
|
| 25866 |
+
"epoch": 0.011,
|
| 25867 |
+
"grad_norm": 0.968101978302002,
|
| 25868 |
+
"learning_rate": 1.3123493843812074e-05,
|
| 25869 |
+
"loss": 0.7212,
|
| 25870 |
+
"step": 331500
|
| 25871 |
+
},
|
| 25872 |
+
{
|
| 25873 |
+
"epoch": 0.0112,
|
| 25874 |
+
"grad_norm": 0.8708505630493164,
|
| 25875 |
+
"learning_rate": 1.3109534344063118e-05,
|
| 25876 |
+
"loss": 0.7175,
|
| 25877 |
+
"step": 331600
|
| 25878 |
+
},
|
| 25879 |
+
{
|
| 25880 |
+
"epoch": 0.0114,
|
| 25881 |
+
"grad_norm": 0.910325288772583,
|
| 25882 |
+
"learning_rate": 1.30955796337923e-05,
|
| 25883 |
+
"loss": 0.7078,
|
| 25884 |
+
"step": 331700
|
| 25885 |
+
},
|
| 25886 |
+
{
|
| 25887 |
+
"epoch": 0.0116,
|
| 25888 |
+
"grad_norm": 0.8591578006744385,
|
| 25889 |
+
"learning_rate": 1.308162971862058e-05,
|
| 25890 |
+
"loss": 0.7101,
|
| 25891 |
+
"step": 331800
|
| 25892 |
+
},
|
| 25893 |
+
{
|
| 25894 |
+
"epoch": 0.0118,
|
| 25895 |
+
"grad_norm": 0.9007583260536194,
|
| 25896 |
+
"learning_rate": 1.3067684604166988e-05,
|
| 25897 |
+
"loss": 0.7157,
|
| 25898 |
+
"step": 331900
|
| 25899 |
+
},
|
| 25900 |
+
{
|
| 25901 |
+
"epoch": 0.012,
|
| 25902 |
+
"grad_norm": 0.9580846428871155,
|
| 25903 |
+
"learning_rate": 1.3053744296048617e-05,
|
| 25904 |
+
"loss": 0.7102,
|
| 25905 |
+
"step": 332000
|
| 25906 |
+
},
|
| 25907 |
+
{
|
| 25908 |
+
"epoch": 0.012,
|
| 25909 |
+
"eval_loss": 2.037156581878662,
|
| 25910 |
+
"eval_runtime": 51.5881,
|
| 25911 |
+
"eval_samples_per_second": 197.604,
|
| 25912 |
+
"eval_steps_per_second": 1.551,
|
| 25913 |
+
"step": 332000
|
| 25914 |
+
},
|
| 25915 |
+
{
|
| 25916 |
+
"epoch": 0.0122,
|
| 25917 |
+
"grad_norm": 0.8679760098457336,
|
| 25918 |
+
"learning_rate": 1.3039808799880604e-05,
|
| 25919 |
+
"loss": 0.7144,
|
| 25920 |
+
"step": 332100
|
| 25921 |
+
},
|
| 25922 |
+
{
|
| 25923 |
+
"epoch": 0.0124,
|
| 25924 |
+
"grad_norm": 0.8794786334037781,
|
| 25925 |
+
"learning_rate": 1.302587812127618e-05,
|
| 25926 |
+
"loss": 0.7089,
|
| 25927 |
+
"step": 332200
|
| 25928 |
+
},
|
| 25929 |
+
{
|
| 25930 |
+
"epoch": 0.0126,
|
| 25931 |
+
"grad_norm": 0.855987548828125,
|
| 25932 |
+
"learning_rate": 1.3011952265846626e-05,
|
| 25933 |
+
"loss": 0.7164,
|
| 25934 |
+
"step": 332300
|
| 25935 |
+
},
|
| 25936 |
+
{
|
| 25937 |
+
"epoch": 0.0128,
|
| 25938 |
+
"grad_norm": 0.8838660717010498,
|
| 25939 |
+
"learning_rate": 1.2998031239201252e-05,
|
| 25940 |
+
"loss": 0.7166,
|
| 25941 |
+
"step": 332400
|
| 25942 |
+
},
|
| 25943 |
+
{
|
| 25944 |
+
"epoch": 0.013,
|
| 25945 |
+
"grad_norm": 0.8379763960838318,
|
| 25946 |
+
"learning_rate": 1.2984115046947463e-05,
|
| 25947 |
+
"loss": 0.7168,
|
| 25948 |
+
"step": 332500
|
| 25949 |
+
},
|
| 25950 |
+
{
|
| 25951 |
+
"epoch": 0.0132,
|
| 25952 |
+
"grad_norm": 0.8760377764701843,
|
| 25953 |
+
"learning_rate": 1.2970203694690694e-05,
|
| 25954 |
+
"loss": 0.7106,
|
| 25955 |
+
"step": 332600
|
| 25956 |
+
},
|
| 25957 |
+
{
|
| 25958 |
+
"epoch": 0.0134,
|
| 25959 |
+
"grad_norm": 0.8472399711608887,
|
| 25960 |
+
"learning_rate": 1.295629718803445e-05,
|
| 25961 |
+
"loss": 0.7118,
|
| 25962 |
+
"step": 332700
|
| 25963 |
+
},
|
| 25964 |
+
{
|
| 25965 |
+
"epoch": 0.0136,
|
| 25966 |
+
"grad_norm": 0.8849984407424927,
|
| 25967 |
+
"learning_rate": 1.2942395532580247e-05,
|
| 25968 |
+
"loss": 0.7207,
|
| 25969 |
+
"step": 332800
|
| 25970 |
+
},
|
| 25971 |
+
{
|
| 25972 |
+
"epoch": 0.0138,
|
| 25973 |
+
"grad_norm": 0.8308677077293396,
|
| 25974 |
+
"learning_rate": 1.2928498733927682e-05,
|
| 25975 |
+
"loss": 0.7004,
|
| 25976 |
+
"step": 332900
|
| 25977 |
+
},
|
| 25978 |
+
{
|
| 25979 |
+
"epoch": 0.014,
|
| 25980 |
+
"grad_norm": 0.9149287343025208,
|
| 25981 |
+
"learning_rate": 1.2914606797674384e-05,
|
| 25982 |
+
"loss": 0.7088,
|
| 25983 |
+
"step": 333000
|
| 25984 |
+
},
|
| 25985 |
+
{
|
| 25986 |
+
"epoch": 0.014,
|
| 25987 |
+
"eval_loss": 2.029548168182373,
|
| 25988 |
+
"eval_runtime": 51.6647,
|
| 25989 |
+
"eval_samples_per_second": 197.311,
|
| 25990 |
+
"eval_steps_per_second": 1.548,
|
| 25991 |
+
"step": 333000
|
| 25992 |
+
},
|
| 25993 |
+
{
|
| 25994 |
+
"epoch": 0.0142,
|
| 25995 |
+
"grad_norm": 0.8902376890182495,
|
| 25996 |
+
"learning_rate": 1.2900719729416033e-05,
|
| 25997 |
+
"loss": 0.7095,
|
| 25998 |
+
"step": 333100
|
| 25999 |
+
},
|
| 26000 |
+
{
|
| 26001 |
+
"epoch": 0.0144,
|
| 26002 |
+
"grad_norm": 0.9412351250648499,
|
| 26003 |
+
"learning_rate": 1.2886837534746316e-05,
|
| 26004 |
+
"loss": 0.7186,
|
| 26005 |
+
"step": 333200
|
| 26006 |
+
},
|
| 26007 |
+
{
|
| 26008 |
+
"epoch": 0.0146,
|
| 26009 |
+
"grad_norm": 0.8445390462875366,
|
| 26010 |
+
"learning_rate": 1.2872960219256992e-05,
|
| 26011 |
+
"loss": 0.7093,
|
| 26012 |
+
"step": 333300
|
| 26013 |
+
},
|
| 26014 |
+
{
|
| 26015 |
+
"epoch": 0.0148,
|
| 26016 |
+
"grad_norm": 0.8830252289772034,
|
| 26017 |
+
"learning_rate": 1.2859087788537844e-05,
|
| 26018 |
+
"loss": 0.7074,
|
| 26019 |
+
"step": 333400
|
| 26020 |
+
},
|
| 26021 |
+
{
|
| 26022 |
+
"epoch": 0.015,
|
| 26023 |
+
"grad_norm": 0.8642695546150208,
|
| 26024 |
+
"learning_rate": 1.284522024817669e-05,
|
| 26025 |
+
"loss": 0.7146,
|
| 26026 |
+
"step": 333500
|
| 26027 |
+
},
|
| 26028 |
+
{
|
| 26029 |
+
"epoch": 0.0152,
|
| 26030 |
+
"grad_norm": 0.9142852425575256,
|
| 26031 |
+
"learning_rate": 1.2831357603759358e-05,
|
| 26032 |
+
"loss": 0.7126,
|
| 26033 |
+
"step": 333600
|
| 26034 |
+
},
|
| 26035 |
+
{
|
| 26036 |
+
"epoch": 0.0154,
|
| 26037 |
+
"grad_norm": 0.9412261247634888,
|
| 26038 |
+
"learning_rate": 1.2817499860869725e-05,
|
| 26039 |
+
"loss": 0.7105,
|
| 26040 |
+
"step": 333700
|
| 26041 |
+
},
|
| 26042 |
+
{
|
| 26043 |
+
"epoch": 0.0156,
|
| 26044 |
+
"grad_norm": 0.8529816269874573,
|
| 26045 |
+
"learning_rate": 1.2803647025089705e-05,
|
| 26046 |
+
"loss": 0.7086,
|
| 26047 |
+
"step": 333800
|
| 26048 |
+
},
|
| 26049 |
+
{
|
| 26050 |
+
"epoch": 0.0158,
|
| 26051 |
+
"grad_norm": 0.8930657505989075,
|
| 26052 |
+
"learning_rate": 1.2789799101999194e-05,
|
| 26053 |
+
"loss": 0.7148,
|
| 26054 |
+
"step": 333900
|
| 26055 |
+
},
|
| 26056 |
+
{
|
| 26057 |
+
"epoch": 0.016,
|
| 26058 |
+
"grad_norm": 0.9034160375595093,
|
| 26059 |
+
"learning_rate": 1.2775956097176142e-05,
|
| 26060 |
+
"loss": 0.7138,
|
| 26061 |
+
"step": 334000
|
| 26062 |
+
},
|
| 26063 |
+
{
|
| 26064 |
+
"epoch": 0.016,
|
| 26065 |
+
"eval_loss": 2.034317970275879,
|
| 26066 |
+
"eval_runtime": 52.1314,
|
| 26067 |
+
"eval_samples_per_second": 195.544,
|
| 26068 |
+
"eval_steps_per_second": 1.535,
|
| 26069 |
+
"step": 334000
|
| 26070 |
+
},
|
| 26071 |
+
{
|
| 26072 |
+
"epoch": 0.0162,
|
| 26073 |
+
"grad_norm": 0.7935868501663208,
|
| 26074 |
+
"learning_rate": 1.2762118016196514e-05,
|
| 26075 |
+
"loss": 0.7061,
|
| 26076 |
+
"step": 334100
|
| 26077 |
+
},
|
| 26078 |
+
{
|
| 26079 |
+
"epoch": 0.0164,
|
| 26080 |
+
"grad_norm": 0.8745686411857605,
|
| 26081 |
+
"learning_rate": 1.2748284864634296e-05,
|
| 26082 |
+
"loss": 0.7079,
|
| 26083 |
+
"step": 334200
|
| 26084 |
+
},
|
| 26085 |
+
{
|
| 26086 |
+
"epoch": 0.0166,
|
| 26087 |
+
"grad_norm": 0.8833600878715515,
|
| 26088 |
+
"learning_rate": 1.273445664806146e-05,
|
| 26089 |
+
"loss": 0.7103,
|
| 26090 |
+
"step": 334300
|
| 26091 |
+
},
|
| 26092 |
+
{
|
| 26093 |
+
"epoch": 0.0168,
|
| 26094 |
+
"grad_norm": 0.9068960547447205,
|
| 26095 |
+
"learning_rate": 1.272063337204802e-05,
|
| 26096 |
+
"loss": 0.7001,
|
| 26097 |
+
"step": 334400
|
| 26098 |
+
},
|
| 26099 |
+
{
|
| 26100 |
+
"epoch": 0.017,
|
| 26101 |
+
"grad_norm": 0.8197974562644958,
|
| 26102 |
+
"learning_rate": 1.2706815042161984e-05,
|
| 26103 |
+
"loss": 0.7052,
|
| 26104 |
+
"step": 334500
|
| 26105 |
+
},
|
| 26106 |
+
{
|
| 26107 |
+
"epoch": 0.0172,
|
| 26108 |
+
"grad_norm": 0.8796073794364929,
|
| 26109 |
+
"learning_rate": 1.2693001663969395e-05,
|
| 26110 |
+
"loss": 0.7123,
|
| 26111 |
+
"step": 334600
|
| 26112 |
+
},
|
| 26113 |
+
{
|
| 26114 |
+
"epoch": 0.0174,
|
| 26115 |
+
"grad_norm": 0.883787989616394,
|
| 26116 |
+
"learning_rate": 1.2679193243034249e-05,
|
| 26117 |
+
"loss": 0.7028,
|
| 26118 |
+
"step": 334700
|
| 26119 |
+
},
|
| 26120 |
+
{
|
| 26121 |
+
"epoch": 0.0176,
|
| 26122 |
+
"grad_norm": 0.885678768157959,
|
| 26123 |
+
"learning_rate": 1.2665389784918597e-05,
|
| 26124 |
+
"loss": 0.696,
|
| 26125 |
+
"step": 334800
|
| 26126 |
+
},
|
| 26127 |
+
{
|
| 26128 |
+
"epoch": 0.0178,
|
| 26129 |
+
"grad_norm": 0.895122766494751,
|
| 26130 |
+
"learning_rate": 1.2651591295182457e-05,
|
| 26131 |
+
"loss": 0.7095,
|
| 26132 |
+
"step": 334900
|
| 26133 |
+
},
|
| 26134 |
+
{
|
| 26135 |
+
"epoch": 0.018,
|
| 26136 |
+
"grad_norm": 0.8656454086303711,
|
| 26137 |
+
"learning_rate": 1.2637797779383881e-05,
|
| 26138 |
+
"loss": 0.7098,
|
| 26139 |
+
"step": 335000
|
| 26140 |
+
},
|
| 26141 |
+
{
|
| 26142 |
+
"epoch": 0.018,
|
| 26143 |
+
"eval_loss": 2.041609764099121,
|
| 26144 |
+
"eval_runtime": 51.8364,
|
| 26145 |
+
"eval_samples_per_second": 196.657,
|
| 26146 |
+
"eval_steps_per_second": 1.543,
|
| 26147 |
+
"step": 335000
|
| 26148 |
+
},
|
| 26149 |
+
{
|
| 26150 |
+
"epoch": 0.0182,
|
| 26151 |
+
"grad_norm": 0.8860552906990051,
|
| 26152 |
+
"learning_rate": 1.2624009243078872e-05,
|
| 26153 |
+
"loss": 0.7323,
|
| 26154 |
+
"step": 335100
|
| 26155 |
+
},
|
| 26156 |
+
{
|
| 26157 |
+
"epoch": 0.0184,
|
| 26158 |
+
"grad_norm": 0.9041178226470947,
|
| 26159 |
+
"learning_rate": 1.261022569182146e-05,
|
| 26160 |
+
"loss": 0.7102,
|
| 26161 |
+
"step": 335200
|
| 26162 |
+
},
|
| 26163 |
+
{
|
| 26164 |
+
"epoch": 0.0186,
|
| 26165 |
+
"grad_norm": 0.8467496037483215,
|
| 26166 |
+
"learning_rate": 1.2596447131163657e-05,
|
| 26167 |
+
"loss": 0.7061,
|
| 26168 |
+
"step": 335300
|
| 26169 |
+
},
|
| 26170 |
+
{
|
| 26171 |
+
"epoch": 0.0188,
|
| 26172 |
+
"grad_norm": 0.8838053941726685,
|
| 26173 |
+
"learning_rate": 1.2582673566655474e-05,
|
| 26174 |
+
"loss": 0.7032,
|
| 26175 |
+
"step": 335400
|
| 26176 |
+
},
|
| 26177 |
+
{
|
| 26178 |
+
"epoch": 0.019,
|
| 26179 |
+
"grad_norm": 0.8892683982849121,
|
| 26180 |
+
"learning_rate": 1.2568905003844885e-05,
|
| 26181 |
+
"loss": 0.7032,
|
| 26182 |
+
"step": 335500
|
| 26183 |
+
},
|
| 26184 |
+
{
|
| 26185 |
+
"epoch": 0.0192,
|
| 26186 |
+
"grad_norm": 0.8915057182312012,
|
| 26187 |
+
"learning_rate": 1.2555141448277874e-05,
|
| 26188 |
+
"loss": 0.7162,
|
| 26189 |
+
"step": 335600
|
| 26190 |
+
},
|
| 26191 |
+
{
|
| 26192 |
+
"epoch": 0.0194,
|
| 26193 |
+
"grad_norm": 0.8544843196868896,
|
| 26194 |
+
"learning_rate": 1.2541382905498411e-05,
|
| 26195 |
+
"loss": 0.6972,
|
| 26196 |
+
"step": 335700
|
| 26197 |
+
},
|
| 26198 |
+
{
|
| 26199 |
+
"epoch": 0.0196,
|
| 26200 |
+
"grad_norm": 0.9270769953727722,
|
| 26201 |
+
"learning_rate": 1.2527629381048411e-05,
|
| 26202 |
+
"loss": 0.6981,
|
| 26203 |
+
"step": 335800
|
| 26204 |
+
},
|
| 26205 |
+
{
|
| 26206 |
+
"epoch": 0.0198,
|
| 26207 |
+
"grad_norm": 1.0345507860183716,
|
| 26208 |
+
"learning_rate": 1.2513880880467807e-05,
|
| 26209 |
+
"loss": 0.6987,
|
| 26210 |
+
"step": 335900
|
| 26211 |
+
},
|
| 26212 |
+
{
|
| 26213 |
+
"epoch": 0.02,
|
| 26214 |
+
"grad_norm": 0.8447591662406921,
|
| 26215 |
+
"learning_rate": 1.2500137409294488e-05,
|
| 26216 |
+
"loss": 0.7021,
|
| 26217 |
+
"step": 336000
|
| 26218 |
+
},
|
| 26219 |
+
{
|
| 26220 |
+
"epoch": 0.02,
|
| 26221 |
+
"eval_loss": 2.0394201278686523,
|
| 26222 |
+
"eval_runtime": 52.0024,
|
| 26223 |
+
"eval_samples_per_second": 196.029,
|
| 26224 |
+
"eval_steps_per_second": 1.538,
|
| 26225 |
+
"step": 336000
|
| 26226 |
+
},
|
| 26227 |
+
{
|
| 26228 |
+
"epoch": 0.0202,
|
| 26229 |
+
"grad_norm": 0.871530294418335,
|
| 26230 |
+
"learning_rate": 1.2486398973064339e-05,
|
| 26231 |
+
"loss": 0.7097,
|
| 26232 |
+
"step": 336100
|
| 26233 |
+
},
|
| 26234 |
+
{
|
| 26235 |
+
"epoch": 0.0204,
|
| 26236 |
+
"grad_norm": 0.8515340089797974,
|
| 26237 |
+
"learning_rate": 1.2472665577311176e-05,
|
| 26238 |
+
"loss": 0.705,
|
| 26239 |
+
"step": 336200
|
| 26240 |
+
},
|
| 26241 |
+
{
|
| 26242 |
+
"epoch": 0.0206,
|
| 26243 |
+
"grad_norm": 0.8740963339805603,
|
| 26244 |
+
"learning_rate": 1.2458937227566819e-05,
|
| 26245 |
+
"loss": 0.7004,
|
| 26246 |
+
"step": 336300
|
| 26247 |
+
},
|
| 26248 |
+
{
|
| 26249 |
+
"epoch": 0.0208,
|
| 26250 |
+
"grad_norm": 0.8944967985153198,
|
| 26251 |
+
"learning_rate": 1.244521392936106e-05,
|
| 26252 |
+
"loss": 0.6948,
|
| 26253 |
+
"step": 336400
|
| 26254 |
+
},
|
| 26255 |
+
{
|
| 26256 |
+
"epoch": 0.021,
|
| 26257 |
+
"grad_norm": 0.8867557644844055,
|
| 26258 |
+
"learning_rate": 1.2431495688221618e-05,
|
| 26259 |
+
"loss": 0.7037,
|
| 26260 |
+
"step": 336500
|
| 26261 |
+
},
|
| 26262 |
+
{
|
| 26263 |
+
"epoch": 0.0212,
|
| 26264 |
+
"grad_norm": 0.925564706325531,
|
| 26265 |
+
"learning_rate": 1.2417782509674216e-05,
|
| 26266 |
+
"loss": 0.6971,
|
| 26267 |
+
"step": 336600
|
| 26268 |
+
},
|
| 26269 |
+
{
|
| 26270 |
+
"epoch": 0.0214,
|
| 26271 |
+
"grad_norm": 0.8457061052322388,
|
| 26272 |
+
"learning_rate": 1.240407439924251e-05,
|
| 26273 |
+
"loss": 0.7007,
|
| 26274 |
+
"step": 336700
|
| 26275 |
+
},
|
| 26276 |
+
{
|
| 26277 |
+
"epoch": 0.0216,
|
| 26278 |
+
"grad_norm": 0.8768745064735413,
|
| 26279 |
+
"learning_rate": 1.2390371362448125e-05,
|
| 26280 |
+
"loss": 0.7015,
|
| 26281 |
+
"step": 336800
|
| 26282 |
+
},
|
| 26283 |
+
{
|
| 26284 |
+
"epoch": 0.0218,
|
| 26285 |
+
"grad_norm": 0.8154018521308899,
|
| 26286 |
+
"learning_rate": 1.237667340481066e-05,
|
| 26287 |
+
"loss": 0.6984,
|
| 26288 |
+
"step": 336900
|
| 26289 |
+
},
|
| 26290 |
+
{
|
| 26291 |
+
"epoch": 0.022,
|
| 26292 |
+
"grad_norm": 0.8525890707969666,
|
| 26293 |
+
"learning_rate": 1.2362980531847626e-05,
|
| 26294 |
+
"loss": 0.6991,
|
| 26295 |
+
"step": 337000
|
| 26296 |
+
},
|
| 26297 |
+
{
|
| 26298 |
+
"epoch": 0.022,
|
| 26299 |
+
"eval_loss": 2.052788496017456,
|
| 26300 |
+
"eval_runtime": 52.066,
|
| 26301 |
+
"eval_samples_per_second": 195.79,
|
| 26302 |
+
"eval_steps_per_second": 1.537,
|
| 26303 |
+
"step": 337000
|
| 26304 |
+
},
|
| 26305 |
+
{
|
| 26306 |
+
"epoch": 0.0222,
|
| 26307 |
+
"grad_norm": 0.8477676510810852,
|
| 26308 |
+
"learning_rate": 1.2349292749074526e-05,
|
| 26309 |
+
"loss": 0.6756,
|
| 26310 |
+
"step": 337100
|
| 26311 |
+
},
|
| 26312 |
+
{
|
| 26313 |
+
"epoch": 0.0224,
|
| 26314 |
+
"grad_norm": 0.8637740612030029,
|
| 26315 |
+
"learning_rate": 1.233561006200479e-05,
|
| 26316 |
+
"loss": 0.7043,
|
| 26317 |
+
"step": 337200
|
| 26318 |
+
},
|
| 26319 |
+
{
|
| 26320 |
+
"epoch": 0.0226,
|
| 26321 |
+
"grad_norm": 0.9340733885765076,
|
| 26322 |
+
"learning_rate": 1.232193247614982e-05,
|
| 26323 |
+
"loss": 0.697,
|
| 26324 |
+
"step": 337300
|
| 26325 |
+
},
|
| 26326 |
+
{
|
| 26327 |
+
"epoch": 0.0228,
|
| 26328 |
+
"grad_norm": 0.8994996547698975,
|
| 26329 |
+
"learning_rate": 1.230825999701892e-05,
|
| 26330 |
+
"loss": 0.6975,
|
| 26331 |
+
"step": 337400
|
| 26332 |
+
},
|
| 26333 |
+
{
|
| 26334 |
+
"epoch": 0.023,
|
| 26335 |
+
"grad_norm": 0.9119468331336975,
|
| 26336 |
+
"learning_rate": 1.2294592630119375e-05,
|
| 26337 |
+
"loss": 0.695,
|
| 26338 |
+
"step": 337500
|
| 26339 |
+
},
|
| 26340 |
+
{
|
| 26341 |
+
"epoch": 0.0232,
|
| 26342 |
+
"grad_norm": 0.8722793459892273,
|
| 26343 |
+
"learning_rate": 1.2280930380956402e-05,
|
| 26344 |
+
"loss": 0.694,
|
| 26345 |
+
"step": 337600
|
| 26346 |
+
},
|
| 26347 |
+
{
|
| 26348 |
+
"epoch": 0.0234,
|
| 26349 |
+
"grad_norm": 0.9214362502098083,
|
| 26350 |
+
"learning_rate": 1.2267273255033157e-05,
|
| 26351 |
+
"loss": 0.7004,
|
| 26352 |
+
"step": 337700
|
| 26353 |
+
},
|
| 26354 |
+
{
|
| 26355 |
+
"epoch": 0.0236,
|
| 26356 |
+
"grad_norm": 0.928554892539978,
|
| 26357 |
+
"learning_rate": 1.2253621257850714e-05,
|
| 26358 |
+
"loss": 0.6978,
|
| 26359 |
+
"step": 337800
|
| 26360 |
+
},
|
| 26361 |
+
{
|
| 26362 |
+
"epoch": 0.0238,
|
| 26363 |
+
"grad_norm": 0.952670693397522,
|
| 26364 |
+
"learning_rate": 1.2239974394908102e-05,
|
| 26365 |
+
"loss": 0.7041,
|
| 26366 |
+
"step": 337900
|
| 26367 |
+
},
|
| 26368 |
+
{
|
| 26369 |
+
"epoch": 0.024,
|
| 26370 |
+
"grad_norm": 0.8799007534980774,
|
| 26371 |
+
"learning_rate": 1.2226332671702282e-05,
|
| 26372 |
+
"loss": 0.689,
|
| 26373 |
+
"step": 338000
|
| 26374 |
+
},
|
| 26375 |
+
{
|
| 26376 |
+
"epoch": 0.024,
|
| 26377 |
+
"eval_loss": 2.0413691997528076,
|
| 26378 |
+
"eval_runtime": 52.1051,
|
| 26379 |
+
"eval_samples_per_second": 195.643,
|
| 26380 |
+
"eval_steps_per_second": 1.535,
|
| 26381 |
+
"step": 338000
|
| 26382 |
+
},
|
| 26383 |
+
{
|
| 26384 |
+
"epoch": 0.0242,
|
| 26385 |
+
"grad_norm": 1.0080713033676147,
|
| 26386 |
+
"learning_rate": 1.2212696093728141e-05,
|
| 26387 |
+
"loss": 0.7069,
|
| 26388 |
+
"step": 338100
|
| 26389 |
+
},
|
| 26390 |
+
{
|
| 26391 |
+
"epoch": 0.0244,
|
| 26392 |
+
"grad_norm": 0.9208382964134216,
|
| 26393 |
+
"learning_rate": 1.2199064666478474e-05,
|
| 26394 |
+
"loss": 0.7086,
|
| 26395 |
+
"step": 338200
|
| 26396 |
+
},
|
| 26397 |
+
{
|
| 26398 |
+
"epoch": 0.0246,
|
| 26399 |
+
"grad_norm": 0.9424040913581848,
|
| 26400 |
+
"learning_rate": 1.2185438395444029e-05,
|
| 26401 |
+
"loss": 0.699,
|
| 26402 |
+
"step": 338300
|
| 26403 |
+
},
|
| 26404 |
+
{
|
| 26405 |
+
"epoch": 0.0248,
|
| 26406 |
+
"grad_norm": 0.9521956443786621,
|
| 26407 |
+
"learning_rate": 1.2171817286113476e-05,
|
| 26408 |
+
"loss": 0.6972,
|
| 26409 |
+
"step": 338400
|
| 26410 |
+
},
|
| 26411 |
+
{
|
| 26412 |
+
"epoch": 0.025,
|
| 26413 |
+
"grad_norm": 0.9072422385215759,
|
| 26414 |
+
"learning_rate": 1.2158201343973377e-05,
|
| 26415 |
+
"loss": 0.686,
|
| 26416 |
+
"step": 338500
|
| 26417 |
+
},
|
| 26418 |
+
{
|
| 26419 |
+
"epoch": 0.0252,
|
| 26420 |
+
"grad_norm": 0.8915525078773499,
|
| 26421 |
+
"learning_rate": 1.2144590574508241e-05,
|
| 26422 |
+
"loss": 0.6992,
|
| 26423 |
+
"step": 338600
|
| 26424 |
+
},
|
| 26425 |
+
{
|
| 26426 |
+
"epoch": 0.0254,
|
| 26427 |
+
"grad_norm": 0.8471651673316956,
|
| 26428 |
+
"learning_rate": 1.2130984983200486e-05,
|
| 26429 |
+
"loss": 0.6933,
|
| 26430 |
+
"step": 338700
|
| 26431 |
+
},
|
| 26432 |
+
{
|
| 26433 |
+
"epoch": 0.0256,
|
| 26434 |
+
"grad_norm": 0.8865765929222107,
|
| 26435 |
+
"learning_rate": 1.2117384575530446e-05,
|
| 26436 |
+
"loss": 0.6899,
|
| 26437 |
+
"step": 338800
|
| 26438 |
+
},
|
| 26439 |
+
{
|
| 26440 |
+
"epoch": 0.0258,
|
| 26441 |
+
"grad_norm": 0.8501929640769958,
|
| 26442 |
+
"learning_rate": 1.2103789356976353e-05,
|
| 26443 |
+
"loss": 0.6942,
|
| 26444 |
+
"step": 338900
|
| 26445 |
+
},
|
| 26446 |
+
{
|
| 26447 |
+
"epoch": 0.026,
|
| 26448 |
+
"grad_norm": 0.8459005951881409,
|
| 26449 |
+
"learning_rate": 1.2090199333014363e-05,
|
| 26450 |
+
"loss": 0.6883,
|
| 26451 |
+
"step": 339000
|
| 26452 |
+
},
|
| 26453 |
+
{
|
| 26454 |
+
"epoch": 0.026,
|
| 26455 |
+
"eval_loss": 2.0509862899780273,
|
| 26456 |
+
"eval_runtime": 52.0225,
|
| 26457 |
+
"eval_samples_per_second": 195.954,
|
| 26458 |
+
"eval_steps_per_second": 1.538,
|
| 26459 |
+
"step": 339000
|
| 26460 |
+
},
|
| 26461 |
+
{
|
| 26462 |
+
"epoch": 0.0262,
|
| 26463 |
+
"grad_norm": 0.8724320530891418,
|
| 26464 |
+
"learning_rate": 1.2076614509118537e-05,
|
| 26465 |
+
"loss": 0.6903,
|
| 26466 |
+
"step": 339100
|
| 26467 |
+
},
|
| 26468 |
+
{
|
| 26469 |
+
"epoch": 0.0264,
|
| 26470 |
+
"grad_norm": 0.8801888227462769,
|
| 26471 |
+
"learning_rate": 1.206303489076085e-05,
|
| 26472 |
+
"loss": 0.6934,
|
| 26473 |
+
"step": 339200
|
| 26474 |
+
},
|
| 26475 |
+
{
|
| 26476 |
+
"epoch": 0.0266,
|
| 26477 |
+
"grad_norm": 0.9318116903305054,
|
| 26478 |
+
"learning_rate": 1.2049460483411154e-05,
|
| 26479 |
+
"loss": 0.6909,
|
| 26480 |
+
"step": 339300
|
| 26481 |
+
},
|
| 26482 |
+
{
|
| 26483 |
+
"epoch": 0.0268,
|
| 26484 |
+
"grad_norm": 0.892590343952179,
|
| 26485 |
+
"learning_rate": 1.2035891292537228e-05,
|
| 26486 |
+
"loss": 0.6931,
|
| 26487 |
+
"step": 339400
|
| 26488 |
+
},
|
| 26489 |
+
{
|
| 26490 |
+
"epoch": 0.027,
|
| 26491 |
+
"grad_norm": 0.8987374901771545,
|
| 26492 |
+
"learning_rate": 1.2022327323604735e-05,
|
| 26493 |
+
"loss": 0.682,
|
| 26494 |
+
"step": 339500
|
| 26495 |
+
},
|
| 26496 |
+
{
|
| 26497 |
+
"epoch": 0.0272,
|
| 26498 |
+
"grad_norm": 0.9365059733390808,
|
| 26499 |
+
"learning_rate": 1.2008768582077257e-05,
|
| 26500 |
+
"loss": 0.6849,
|
| 26501 |
+
"step": 339600
|
| 26502 |
+
},
|
| 26503 |
+
{
|
| 26504 |
+
"epoch": 0.0274,
|
| 26505 |
+
"grad_norm": 0.908043384552002,
|
| 26506 |
+
"learning_rate": 1.199521507341623e-05,
|
| 26507 |
+
"loss": 0.6959,
|
| 26508 |
+
"step": 339700
|
| 26509 |
+
},
|
| 26510 |
+
{
|
| 26511 |
+
"epoch": 0.0276,
|
| 26512 |
+
"grad_norm": 0.9550427794456482,
|
| 26513 |
+
"learning_rate": 1.1981666803081015e-05,
|
| 26514 |
+
"loss": 0.6928,
|
| 26515 |
+
"step": 339800
|
| 26516 |
+
},
|
| 26517 |
+
{
|
| 26518 |
+
"epoch": 0.0278,
|
| 26519 |
+
"grad_norm": 0.9074381589889526,
|
| 26520 |
+
"learning_rate": 1.1968123776528855e-05,
|
| 26521 |
+
"loss": 0.6907,
|
| 26522 |
+
"step": 339900
|
| 26523 |
+
},
|
| 26524 |
+
{
|
| 26525 |
+
"epoch": 0.028,
|
| 26526 |
+
"grad_norm": 0.8685894012451172,
|
| 26527 |
+
"learning_rate": 1.195458599921489e-05,
|
| 26528 |
+
"loss": 0.6792,
|
| 26529 |
+
"step": 340000
|
| 26530 |
+
},
|
| 26531 |
+
{
|
| 26532 |
+
"epoch": 0.028,
|
| 26533 |
+
"eval_loss": 2.0385003089904785,
|
| 26534 |
+
"eval_runtime": 51.8057,
|
| 26535 |
+
"eval_samples_per_second": 196.774,
|
| 26536 |
+
"eval_steps_per_second": 1.544,
|
| 26537 |
+
"step": 340000
|
| 26538 |
+
},
|
| 26539 |
+
{
|
| 26540 |
+
"epoch": 0.0282,
|
| 26541 |
+
"grad_norm": 0.8610774874687195,
|
| 26542 |
+
"learning_rate": 1.1941053476592115e-05,
|
| 26543 |
+
"loss": 0.6883,
|
| 26544 |
+
"step": 340100
|
| 26545 |
+
},
|
| 26546 |
+
{
|
| 26547 |
+
"epoch": 0.0284,
|
| 26548 |
+
"grad_norm": 0.8860583901405334,
|
| 26549 |
+
"learning_rate": 1.192752621411144e-05,
|
| 26550 |
+
"loss": 0.6962,
|
| 26551 |
+
"step": 340200
|
| 26552 |
+
},
|
| 26553 |
+
{
|
| 26554 |
+
"epoch": 0.0286,
|
| 26555 |
+
"grad_norm": 0.872675895690918,
|
| 26556 |
+
"learning_rate": 1.191400421722165e-05,
|
| 26557 |
+
"loss": 0.6941,
|
| 26558 |
+
"step": 340300
|
| 26559 |
+
},
|
| 26560 |
+
{
|
| 26561 |
+
"epoch": 0.0288,
|
| 26562 |
+
"grad_norm": 0.9522199630737305,
|
| 26563 |
+
"learning_rate": 1.1900487491369386e-05,
|
| 26564 |
+
"loss": 0.6885,
|
| 26565 |
+
"step": 340400
|
| 26566 |
+
},
|
| 26567 |
+
{
|
| 26568 |
+
"epoch": 0.029,
|
| 26569 |
+
"grad_norm": 0.8697762489318848,
|
| 26570 |
+
"learning_rate": 1.1886976041999196e-05,
|
| 26571 |
+
"loss": 0.688,
|
| 26572 |
+
"step": 340500
|
| 26573 |
+
},
|
| 26574 |
+
{
|
| 26575 |
+
"epoch": 0.0292,
|
| 26576 |
+
"grad_norm": 0.939105749130249,
|
| 26577 |
+
"learning_rate": 1.1873469874553486e-05,
|
| 26578 |
+
"loss": 0.677,
|
| 26579 |
+
"step": 340600
|
| 26580 |
+
},
|
| 26581 |
+
{
|
| 26582 |
+
"epoch": 0.0294,
|
| 26583 |
+
"grad_norm": 0.8797380328178406,
|
| 26584 |
+
"learning_rate": 1.1859968994472551e-05,
|
| 26585 |
+
"loss": 0.681,
|
| 26586 |
+
"step": 340700
|
| 26587 |
+
},
|
| 26588 |
+
{
|
| 26589 |
+
"epoch": 0.0296,
|
| 26590 |
+
"grad_norm": 0.9089605212211609,
|
| 26591 |
+
"learning_rate": 1.1846473407194522e-05,
|
| 26592 |
+
"loss": 0.6916,
|
| 26593 |
+
"step": 340800
|
| 26594 |
+
},
|
| 26595 |
+
{
|
| 26596 |
+
"epoch": 0.0298,
|
| 26597 |
+
"grad_norm": 0.8749380111694336,
|
| 26598 |
+
"learning_rate": 1.1832983118155436e-05,
|
| 26599 |
+
"loss": 0.6855,
|
| 26600 |
+
"step": 340900
|
| 26601 |
+
},
|
| 26602 |
+
{
|
| 26603 |
+
"epoch": 0.03,
|
| 26604 |
+
"grad_norm": 0.8690641522407532,
|
| 26605 |
+
"learning_rate": 1.1819498132789173e-05,
|
| 26606 |
+
"loss": 0.6923,
|
| 26607 |
+
"step": 341000
|
| 26608 |
+
},
|
| 26609 |
+
{
|
| 26610 |
+
"epoch": 0.03,
|
| 26611 |
+
"eval_loss": 2.083442449569702,
|
| 26612 |
+
"eval_runtime": 52.125,
|
| 26613 |
+
"eval_samples_per_second": 195.568,
|
| 26614 |
+
"eval_steps_per_second": 1.535,
|
| 26615 |
+
"step": 341000
|
| 26616 |
+
},
|
| 26617 |
+
{
|
| 26618 |
+
"epoch": 0.0302,
|
| 26619 |
+
"grad_norm": 0.8813545107841492,
|
| 26620 |
+
"learning_rate": 1.1806018456527495e-05,
|
| 26621 |
+
"loss": 0.679,
|
| 26622 |
+
"step": 341100
|
| 26623 |
+
},
|
| 26624 |
+
{
|
| 26625 |
+
"epoch": 0.0304,
|
| 26626 |
+
"grad_norm": 0.8487153649330139,
|
| 26627 |
+
"learning_rate": 1.1792544094799995e-05,
|
| 26628 |
+
"loss": 0.6851,
|
| 26629 |
+
"step": 341200
|
| 26630 |
+
},
|
| 26631 |
+
{
|
| 26632 |
+
"epoch": 0.0306,
|
| 26633 |
+
"grad_norm": 0.8273248672485352,
|
| 26634 |
+
"learning_rate": 1.1779075053034155e-05,
|
| 26635 |
+
"loss": 0.6807,
|
| 26636 |
+
"step": 341300
|
| 26637 |
+
},
|
| 26638 |
+
{
|
| 26639 |
+
"epoch": 0.0308,
|
| 26640 |
+
"grad_norm": 0.879425585269928,
|
| 26641 |
+
"learning_rate": 1.1765611336655305e-05,
|
| 26642 |
+
"loss": 0.6816,
|
| 26643 |
+
"step": 341400
|
| 26644 |
+
},
|
| 26645 |
+
{
|
| 26646 |
+
"epoch": 0.031,
|
| 26647 |
+
"grad_norm": 0.8405166268348694,
|
| 26648 |
+
"learning_rate": 1.1752152951086631e-05,
|
| 26649 |
+
"loss": 0.6762,
|
| 26650 |
+
"step": 341500
|
| 26651 |
+
},
|
| 26652 |
+
{
|
| 26653 |
+
"epoch": 0.0312,
|
| 26654 |
+
"grad_norm": 0.8572484254837036,
|
| 26655 |
+
"learning_rate": 1.1738699901749157e-05,
|
| 26656 |
+
"loss": 0.692,
|
| 26657 |
+
"step": 341600
|
| 26658 |
+
},
|
| 26659 |
+
{
|
| 26660 |
+
"epoch": 0.0314,
|
| 26661 |
+
"grad_norm": 0.9151760935783386,
|
| 26662 |
+
"learning_rate": 1.1725252194061775e-05,
|
| 26663 |
+
"loss": 0.683,
|
| 26664 |
+
"step": 341700
|
| 26665 |
+
},
|
| 26666 |
+
{
|
| 26667 |
+
"epoch": 0.0316,
|
| 26668 |
+
"grad_norm": 0.9075136780738831,
|
| 26669 |
+
"learning_rate": 1.1711809833441235e-05,
|
| 26670 |
+
"loss": 0.6859,
|
| 26671 |
+
"step": 341800
|
| 26672 |
+
},
|
| 26673 |
+
{
|
| 26674 |
+
"epoch": 0.0318,
|
| 26675 |
+
"grad_norm": 0.9236798882484436,
|
| 26676 |
+
"learning_rate": 1.1698372825302093e-05,
|
| 26677 |
+
"loss": 0.6901,
|
| 26678 |
+
"step": 341900
|
| 26679 |
+
},
|
| 26680 |
+
{
|
| 26681 |
+
"epoch": 0.032,
|
| 26682 |
+
"grad_norm": 0.8434112071990967,
|
| 26683 |
+
"learning_rate": 1.1684941175056785e-05,
|
| 26684 |
+
"loss": 0.6844,
|
| 26685 |
+
"step": 342000
|
| 26686 |
+
},
|
| 26687 |
+
{
|
| 26688 |
+
"epoch": 0.032,
|
| 26689 |
+
"eval_loss": 2.0505099296569824,
|
| 26690 |
+
"eval_runtime": 52.0084,
|
| 26691 |
+
"eval_samples_per_second": 196.007,
|
| 26692 |
+
"eval_steps_per_second": 1.538,
|
| 26693 |
+
"step": 342000
|
| 26694 |
+
},
|
| 26695 |
+
{
|
| 26696 |
+
"epoch": 0.0322,
|
| 26697 |
+
"grad_norm": 0.9039320349693298,
|
| 26698 |
+
"learning_rate": 1.1671514888115582e-05,
|
| 26699 |
+
"loss": 0.6859,
|
| 26700 |
+
"step": 342100
|
| 26701 |
+
},
|
| 26702 |
+
{
|
| 26703 |
+
"epoch": 0.0324,
|
| 26704 |
+
"grad_norm": 0.8539577126502991,
|
| 26705 |
+
"learning_rate": 1.1658093969886596e-05,
|
| 26706 |
+
"loss": 0.6734,
|
| 26707 |
+
"step": 342200
|
| 26708 |
+
},
|
| 26709 |
+
{
|
| 26710 |
+
"epoch": 0.0326,
|
| 26711 |
+
"grad_norm": 0.8575844168663025,
|
| 26712 |
+
"learning_rate": 1.1644678425775755e-05,
|
| 26713 |
+
"loss": 0.6762,
|
| 26714 |
+
"step": 342300
|
| 26715 |
+
},
|
| 26716 |
+
{
|
| 26717 |
+
"epoch": 0.0328,
|
| 26718 |
+
"grad_norm": 0.9679238200187683,
|
| 26719 |
+
"learning_rate": 1.1631268261186845e-05,
|
| 26720 |
+
"loss": 0.676,
|
| 26721 |
+
"step": 342400
|
| 26722 |
+
},
|
| 26723 |
+
{
|
| 26724 |
+
"epoch": 0.033,
|
| 26725 |
+
"grad_norm": 0.8782627582550049,
|
| 26726 |
+
"learning_rate": 1.1617863481521483e-05,
|
| 26727 |
+
"loss": 0.6758,
|
| 26728 |
+
"step": 342500
|
| 26729 |
+
},
|
| 26730 |
+
{
|
| 26731 |
+
"epoch": 0.0332,
|
| 26732 |
+
"grad_norm": 0.9136931300163269,
|
| 26733 |
+
"learning_rate": 1.1604464092179118e-05,
|
| 26734 |
+
"loss": 0.6818,
|
| 26735 |
+
"step": 342600
|
| 26736 |
+
},
|
| 26737 |
+
{
|
| 26738 |
+
"epoch": 0.0334,
|
| 26739 |
+
"grad_norm": 0.8847256302833557,
|
| 26740 |
+
"learning_rate": 1.1591070098557006e-05,
|
| 26741 |
+
"loss": 0.6728,
|
| 26742 |
+
"step": 342700
|
| 26743 |
+
},
|
| 26744 |
+
{
|
| 26745 |
+
"epoch": 0.0336,
|
| 26746 |
+
"grad_norm": 0.8676889538764954,
|
| 26747 |
+
"learning_rate": 1.1577681506050253e-05,
|
| 26748 |
+
"loss": 0.682,
|
| 26749 |
+
"step": 342800
|
| 26750 |
+
},
|
| 26751 |
+
{
|
| 26752 |
+
"epoch": 0.0338,
|
| 26753 |
+
"grad_norm": 0.8871778845787048,
|
| 26754 |
+
"learning_rate": 1.1564298320051787e-05,
|
| 26755 |
+
"loss": 0.6748,
|
| 26756 |
+
"step": 342900
|
| 26757 |
+
},
|
| 26758 |
+
{
|
| 26759 |
+
"epoch": 0.034,
|
| 26760 |
+
"grad_norm": 0.9254991412162781,
|
| 26761 |
+
"learning_rate": 1.155092054595236e-05,
|
| 26762 |
+
"loss": 0.6791,
|
| 26763 |
+
"step": 343000
|
| 26764 |
+
},
|
| 26765 |
+
{
|
| 26766 |
+
"epoch": 0.034,
|
| 26767 |
+
"eval_loss": 2.0616466999053955,
|
| 26768 |
+
"eval_runtime": 52.253,
|
| 26769 |
+
"eval_samples_per_second": 195.089,
|
| 26770 |
+
"eval_steps_per_second": 1.531,
|
| 26771 |
+
"step": 343000
|
| 26772 |
}
|
| 26773 |
],
|
| 26774 |
"logging_steps": 100,
|
|
|
|
| 26788 |
"attributes": {}
|
| 26789 |
}
|
| 26790 |
},
|
| 26791 |
+
"total_flos": 2.993443664874701e+19,
|
| 26792 |
"train_batch_size": 128,
|
| 26793 |
"trial_name": null,
|
| 26794 |
"trial_params": null
|
training_args.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 5777
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:58ce66db74e88b1f68194d485c23157f7d0c8a9d6b255f56a99102bd66b1a145
|
| 3 |
size 5777
|