Upload 10 files
Browse files- model.safetensors +1 -1
- optimizer.pt +1 -1
- rng_state.pth +1 -1
- scheduler.pt +1 -1
- trainer_state.json +1173 -3
- training_args.bin +1 -1
model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 598635032
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ff261834fa34536f963b44d61629d171e8297d50ec29c9ecd77e55f8f4e30a75
|
| 3 |
size 598635032
|
optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1197359627
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0f2cf42e7a86053bde9a697bcec92154da3f0357dc3b6970a4a5c01522d0c4e6
|
| 3 |
size 1197359627
|
rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14645
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:159e82523ca477221cb6ee71e6e1fe789822217510366cfeda983df59cb19ad5
|
| 3 |
size 14645
|
scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1465
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c6edc5e7ebf57018d51595ed4fff24582a6a8bfe9d84e42ed6a378983c113ffb
|
| 3 |
size 1465
|
trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 1000,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -16707,6 +16707,1176 @@
|
|
| 16707 |
"eval_samples_per_second": 197.021,
|
| 16708 |
"eval_steps_per_second": 1.546,
|
| 16709 |
"step": 214000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16710 |
}
|
| 16711 |
],
|
| 16712 |
"logging_steps": 100,
|
|
@@ -16726,7 +17896,7 @@
|
|
| 16726 |
"attributes": {}
|
| 16727 |
}
|
| 16728 |
},
|
| 16729 |
-
"total_flos": 1.
|
| 16730 |
"train_batch_size": 128,
|
| 16731 |
"trial_name": null,
|
| 16732 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.0390714393360088,
|
| 6 |
"eval_steps": 1000,
|
| 7 |
+
"global_step": 229000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 16707 |
"eval_samples_per_second": 197.021,
|
| 16708 |
"eval_steps_per_second": 1.546,
|
| 16709 |
"step": 214000
|
| 16710 |
+
},
|
| 16711 |
+
{
|
| 16712 |
+
"epoch": 0.042141338140980915,
|
| 16713 |
+
"grad_norm": 2.011029005050659,
|
| 16714 |
+
"learning_rate": 1.7978048379468322e-05,
|
| 16715 |
+
"loss": 2.1068,
|
| 16716 |
+
"step": 214100
|
| 16717 |
+
},
|
| 16718 |
+
{
|
| 16719 |
+
"epoch": 0.04242041985052384,
|
| 16720 |
+
"grad_norm": 2.035914897918701,
|
| 16721 |
+
"learning_rate": 1.7956801953428e-05,
|
| 16722 |
+
"loss": 2.1174,
|
| 16723 |
+
"step": 214200
|
| 16724 |
+
},
|
| 16725 |
+
{
|
| 16726 |
+
"epoch": 0.04269950156006676,
|
| 16727 |
+
"grad_norm": 2.129701852798462,
|
| 16728 |
+
"learning_rate": 1.7935561051518883e-05,
|
| 16729 |
+
"loss": 2.1197,
|
| 16730 |
+
"step": 214300
|
| 16731 |
+
},
|
| 16732 |
+
{
|
| 16733 |
+
"epoch": 0.042978583269609676,
|
| 16734 |
+
"grad_norm": 2.043063163757324,
|
| 16735 |
+
"learning_rate": 1.791432569040068e-05,
|
| 16736 |
+
"loss": 2.1106,
|
| 16737 |
+
"step": 214400
|
| 16738 |
+
},
|
| 16739 |
+
{
|
| 16740 |
+
"epoch": 0.043257664979152594,
|
| 16741 |
+
"grad_norm": 2.03788161277771,
|
| 16742 |
+
"learning_rate": 1.7893095886728716e-05,
|
| 16743 |
+
"loss": 2.1055,
|
| 16744 |
+
"step": 214500
|
| 16745 |
+
},
|
| 16746 |
+
{
|
| 16747 |
+
"epoch": 0.04353674668869552,
|
| 16748 |
+
"grad_norm": 1.9218449592590332,
|
| 16749 |
+
"learning_rate": 1.7871871657153993e-05,
|
| 16750 |
+
"loss": 2.1038,
|
| 16751 |
+
"step": 214600
|
| 16752 |
+
},
|
| 16753 |
+
{
|
| 16754 |
+
"epoch": 0.043815828398238436,
|
| 16755 |
+
"grad_norm": 2.175419807434082,
|
| 16756 |
+
"learning_rate": 1.7850653018323132e-05,
|
| 16757 |
+
"loss": 2.1049,
|
| 16758 |
+
"step": 214700
|
| 16759 |
+
},
|
| 16760 |
+
{
|
| 16761 |
+
"epoch": 0.044094910107781354,
|
| 16762 |
+
"grad_norm": 2.14815616607666,
|
| 16763 |
+
"learning_rate": 1.7829439986878374e-05,
|
| 16764 |
+
"loss": 2.1158,
|
| 16765 |
+
"step": 214800
|
| 16766 |
+
},
|
| 16767 |
+
{
|
| 16768 |
+
"epoch": 0.04437399181732428,
|
| 16769 |
+
"grad_norm": 1.9514108896255493,
|
| 16770 |
+
"learning_rate": 1.7808232579457534e-05,
|
| 16771 |
+
"loss": 2.092,
|
| 16772 |
+
"step": 214900
|
| 16773 |
+
},
|
| 16774 |
+
{
|
| 16775 |
+
"epoch": 0.0446530735268672,
|
| 16776 |
+
"grad_norm": 2.0511226654052734,
|
| 16777 |
+
"learning_rate": 1.778703081269405e-05,
|
| 16778 |
+
"loss": 2.0992,
|
| 16779 |
+
"step": 215000
|
| 16780 |
+
},
|
| 16781 |
+
{
|
| 16782 |
+
"epoch": 0.0446530735268672,
|
| 16783 |
+
"eval_loss": 2.183467388153076,
|
| 16784 |
+
"eval_runtime": 51.5396,
|
| 16785 |
+
"eval_samples_per_second": 197.79,
|
| 16786 |
+
"eval_steps_per_second": 1.552,
|
| 16787 |
+
"step": 215000
|
| 16788 |
+
},
|
| 16789 |
+
{
|
| 16790 |
+
"epoch": 0.00027908170954291995,
|
| 16791 |
+
"grad_norm": 2.159756660461426,
|
| 16792 |
+
"learning_rate": 1.776583470321692e-05,
|
| 16793 |
+
"loss": 2.0955,
|
| 16794 |
+
"step": 215100
|
| 16795 |
+
},
|
| 16796 |
+
{
|
| 16797 |
+
"epoch": 0.0005581634190858399,
|
| 16798 |
+
"grad_norm": 2.170898675918579,
|
| 16799 |
+
"learning_rate": 1.7744644267650712e-05,
|
| 16800 |
+
"loss": 2.1049,
|
| 16801 |
+
"step": 215200
|
| 16802 |
+
},
|
| 16803 |
+
{
|
| 16804 |
+
"epoch": 0.0008372451286287599,
|
| 16805 |
+
"grad_norm": 1.9969067573547363,
|
| 16806 |
+
"learning_rate": 1.7723459522615522e-05,
|
| 16807 |
+
"loss": 2.092,
|
| 16808 |
+
"step": 215300
|
| 16809 |
+
},
|
| 16810 |
+
{
|
| 16811 |
+
"epoch": 0.0011163268381716798,
|
| 16812 |
+
"grad_norm": 1.9468703269958496,
|
| 16813 |
+
"learning_rate": 1.770228048472701e-05,
|
| 16814 |
+
"loss": 2.1021,
|
| 16815 |
+
"step": 215400
|
| 16816 |
+
},
|
| 16817 |
+
{
|
| 16818 |
+
"epoch": 0.0013954085477146,
|
| 16819 |
+
"grad_norm": 2.082648992538452,
|
| 16820 |
+
"learning_rate": 1.7681107170596357e-05,
|
| 16821 |
+
"loss": 2.0915,
|
| 16822 |
+
"step": 215500
|
| 16823 |
+
},
|
| 16824 |
+
{
|
| 16825 |
+
"epoch": 0.0016744902572575198,
|
| 16826 |
+
"grad_norm": 2.049349546432495,
|
| 16827 |
+
"learning_rate": 1.7659939596830243e-05,
|
| 16828 |
+
"loss": 2.0962,
|
| 16829 |
+
"step": 215600
|
| 16830 |
+
},
|
| 16831 |
+
{
|
| 16832 |
+
"epoch": 0.00195357196680044,
|
| 16833 |
+
"grad_norm": 2.176790952682495,
|
| 16834 |
+
"learning_rate": 1.7638777780030844e-05,
|
| 16835 |
+
"loss": 2.0892,
|
| 16836 |
+
"step": 215700
|
| 16837 |
+
},
|
| 16838 |
+
{
|
| 16839 |
+
"epoch": 0.0022326536763433596,
|
| 16840 |
+
"grad_norm": 2.1624631881713867,
|
| 16841 |
+
"learning_rate": 1.7617621736795824e-05,
|
| 16842 |
+
"loss": 2.0963,
|
| 16843 |
+
"step": 215800
|
| 16844 |
+
},
|
| 16845 |
+
{
|
| 16846 |
+
"epoch": 0.0025117353858862797,
|
| 16847 |
+
"grad_norm": 2.1935231685638428,
|
| 16848 |
+
"learning_rate": 1.7596471483718328e-05,
|
| 16849 |
+
"loss": 2.0814,
|
| 16850 |
+
"step": 215900
|
| 16851 |
+
},
|
| 16852 |
+
{
|
| 16853 |
+
"epoch": 0.0027908170954292,
|
| 16854 |
+
"grad_norm": 2.091728925704956,
|
| 16855 |
+
"learning_rate": 1.757532703738695e-05,
|
| 16856 |
+
"loss": 2.0956,
|
| 16857 |
+
"step": 216000
|
| 16858 |
+
},
|
| 16859 |
+
{
|
| 16860 |
+
"epoch": 0.0027908170954292,
|
| 16861 |
+
"eval_loss": 2.1795222759246826,
|
| 16862 |
+
"eval_runtime": 51.863,
|
| 16863 |
+
"eval_samples_per_second": 196.556,
|
| 16864 |
+
"eval_steps_per_second": 1.543,
|
| 16865 |
+
"step": 216000
|
| 16866 |
+
},
|
| 16867 |
+
{
|
| 16868 |
+
"epoch": 0.00306989880497212,
|
| 16869 |
+
"grad_norm": 1.9175347089767456,
|
| 16870 |
+
"learning_rate": 1.7554188414385746e-05,
|
| 16871 |
+
"loss": 2.083,
|
| 16872 |
+
"step": 216100
|
| 16873 |
+
},
|
| 16874 |
+
{
|
| 16875 |
+
"epoch": 0.0033489805145150396,
|
| 16876 |
+
"grad_norm": 2.0839240550994873,
|
| 16877 |
+
"learning_rate": 1.753305563129417e-05,
|
| 16878 |
+
"loss": 2.0849,
|
| 16879 |
+
"step": 216200
|
| 16880 |
+
},
|
| 16881 |
+
{
|
| 16882 |
+
"epoch": 0.0036280622240579597,
|
| 16883 |
+
"grad_norm": 2.2987542152404785,
|
| 16884 |
+
"learning_rate": 1.751192870468713e-05,
|
| 16885 |
+
"loss": 2.107,
|
| 16886 |
+
"step": 216300
|
| 16887 |
+
},
|
| 16888 |
+
{
|
| 16889 |
+
"epoch": 0.00390714393360088,
|
| 16890 |
+
"grad_norm": 2.0684635639190674,
|
| 16891 |
+
"learning_rate": 1.7490807651134916e-05,
|
| 16892 |
+
"loss": 2.0833,
|
| 16893 |
+
"step": 216400
|
| 16894 |
+
},
|
| 16895 |
+
{
|
| 16896 |
+
"epoch": 0.0041862256431437995,
|
| 16897 |
+
"grad_norm": 2.094618558883667,
|
| 16898 |
+
"learning_rate": 1.7469692487203242e-05,
|
| 16899 |
+
"loss": 2.1003,
|
| 16900 |
+
"step": 216500
|
| 16901 |
+
},
|
| 16902 |
+
{
|
| 16903 |
+
"epoch": 0.004465307352686719,
|
| 16904 |
+
"grad_norm": 2.0774834156036377,
|
| 16905 |
+
"learning_rate": 1.7448583229453163e-05,
|
| 16906 |
+
"loss": 2.0854,
|
| 16907 |
+
"step": 216600
|
| 16908 |
+
},
|
| 16909 |
+
{
|
| 16910 |
+
"epoch": 0.00474438906222964,
|
| 16911 |
+
"grad_norm": 2.2240655422210693,
|
| 16912 |
+
"learning_rate": 1.7427479894441135e-05,
|
| 16913 |
+
"loss": 2.0914,
|
| 16914 |
+
"step": 216700
|
| 16915 |
+
},
|
| 16916 |
+
{
|
| 16917 |
+
"epoch": 0.005023470771772559,
|
| 16918 |
+
"grad_norm": 2.094910144805908,
|
| 16919 |
+
"learning_rate": 1.740638249871895e-05,
|
| 16920 |
+
"loss": 2.0913,
|
| 16921 |
+
"step": 216800
|
| 16922 |
+
},
|
| 16923 |
+
{
|
| 16924 |
+
"epoch": 0.00530255248131548,
|
| 16925 |
+
"grad_norm": 2.0924530029296875,
|
| 16926 |
+
"learning_rate": 1.738529105883376e-05,
|
| 16927 |
+
"loss": 2.0825,
|
| 16928 |
+
"step": 216900
|
| 16929 |
+
},
|
| 16930 |
+
{
|
| 16931 |
+
"epoch": 0.0055816341908584,
|
| 16932 |
+
"grad_norm": 2.0093395709991455,
|
| 16933 |
+
"learning_rate": 1.7364205591328018e-05,
|
| 16934 |
+
"loss": 2.0782,
|
| 16935 |
+
"step": 217000
|
| 16936 |
+
},
|
| 16937 |
+
{
|
| 16938 |
+
"epoch": 0.0055816341908584,
|
| 16939 |
+
"eval_loss": 2.17291259765625,
|
| 16940 |
+
"eval_runtime": 51.4439,
|
| 16941 |
+
"eval_samples_per_second": 198.157,
|
| 16942 |
+
"eval_steps_per_second": 1.555,
|
| 16943 |
+
"step": 217000
|
| 16944 |
+
},
|
| 16945 |
+
{
|
| 16946 |
+
"epoch": 0.005860715900401319,
|
| 16947 |
+
"grad_norm": 2.0085370540618896,
|
| 16948 |
+
"learning_rate": 1.734312611273951e-05,
|
| 16949 |
+
"loss": 2.0714,
|
| 16950 |
+
"step": 217100
|
| 16951 |
+
},
|
| 16952 |
+
{
|
| 16953 |
+
"epoch": 0.00613979760994424,
|
| 16954 |
+
"grad_norm": 2.3136491775512695,
|
| 16955 |
+
"learning_rate": 1.7322052639601328e-05,
|
| 16956 |
+
"loss": 2.0794,
|
| 16957 |
+
"step": 217200
|
| 16958 |
+
},
|
| 16959 |
+
{
|
| 16960 |
+
"epoch": 0.0064188793194871595,
|
| 16961 |
+
"grad_norm": 2.062134265899658,
|
| 16962 |
+
"learning_rate": 1.7300985188441854e-05,
|
| 16963 |
+
"loss": 2.0822,
|
| 16964 |
+
"step": 217300
|
| 16965 |
+
},
|
| 16966 |
+
{
|
| 16967 |
+
"epoch": 0.006697961029030079,
|
| 16968 |
+
"grad_norm": 2.0435168743133545,
|
| 16969 |
+
"learning_rate": 1.727992377578473e-05,
|
| 16970 |
+
"loss": 2.0763,
|
| 16971 |
+
"step": 217400
|
| 16972 |
+
},
|
| 16973 |
+
{
|
| 16974 |
+
"epoch": 0.006977042738573,
|
| 16975 |
+
"grad_norm": 2.1942365169525146,
|
| 16976 |
+
"learning_rate": 1.7258868418148874e-05,
|
| 16977 |
+
"loss": 2.0876,
|
| 16978 |
+
"step": 217500
|
| 16979 |
+
},
|
| 16980 |
+
{
|
| 16981 |
+
"epoch": 0.0072561244481159195,
|
| 16982 |
+
"grad_norm": 2.1672890186309814,
|
| 16983 |
+
"learning_rate": 1.7237819132048467e-05,
|
| 16984 |
+
"loss": 2.0832,
|
| 16985 |
+
"step": 217600
|
| 16986 |
+
},
|
| 16987 |
+
{
|
| 16988 |
+
"epoch": 0.007535206157658839,
|
| 16989 |
+
"grad_norm": 1.8856595754623413,
|
| 16990 |
+
"learning_rate": 1.7216775933992906e-05,
|
| 16991 |
+
"loss": 2.0706,
|
| 16992 |
+
"step": 217700
|
| 16993 |
+
},
|
| 16994 |
+
{
|
| 16995 |
+
"epoch": 0.00781428786720176,
|
| 16996 |
+
"grad_norm": 2.1063289642333984,
|
| 16997 |
+
"learning_rate": 1.7195738840486825e-05,
|
| 16998 |
+
"loss": 2.2249,
|
| 16999 |
+
"step": 217800
|
| 17000 |
+
},
|
| 17001 |
+
{
|
| 17002 |
+
"epoch": 0.00809336957674468,
|
| 17003 |
+
"grad_norm": 2.09557843208313,
|
| 17004 |
+
"learning_rate": 1.717470786803006e-05,
|
| 17005 |
+
"loss": 2.2446,
|
| 17006 |
+
"step": 217900
|
| 17007 |
+
},
|
| 17008 |
+
{
|
| 17009 |
+
"epoch": 0.008372451286287599,
|
| 17010 |
+
"grad_norm": 2.1334340572357178,
|
| 17011 |
+
"learning_rate": 1.715368303311766e-05,
|
| 17012 |
+
"loss": 2.2297,
|
| 17013 |
+
"step": 218000
|
| 17014 |
+
},
|
| 17015 |
+
{
|
| 17016 |
+
"epoch": 0.008372451286287599,
|
| 17017 |
+
"eval_loss": 2.1775035858154297,
|
| 17018 |
+
"eval_runtime": 51.4889,
|
| 17019 |
+
"eval_samples_per_second": 197.984,
|
| 17020 |
+
"eval_steps_per_second": 1.554,
|
| 17021 |
+
"step": 218000
|
| 17022 |
+
},
|
| 17023 |
+
{
|
| 17024 |
+
"epoch": 0.008651532995830519,
|
| 17025 |
+
"grad_norm": 2.201794385910034,
|
| 17026 |
+
"learning_rate": 1.713266435223986e-05,
|
| 17027 |
+
"loss": 2.2351,
|
| 17028 |
+
"step": 218100
|
| 17029 |
+
},
|
| 17030 |
+
{
|
| 17031 |
+
"epoch": 0.008930614705373438,
|
| 17032 |
+
"grad_norm": 2.2592103481292725,
|
| 17033 |
+
"learning_rate": 1.711165184188205e-05,
|
| 17034 |
+
"loss": 2.223,
|
| 17035 |
+
"step": 218200
|
| 17036 |
+
},
|
| 17037 |
+
{
|
| 17038 |
+
"epoch": 0.00920969641491636,
|
| 17039 |
+
"grad_norm": 2.382873773574829,
|
| 17040 |
+
"learning_rate": 1.7090645518524797e-05,
|
| 17041 |
+
"loss": 2.2283,
|
| 17042 |
+
"step": 218300
|
| 17043 |
+
},
|
| 17044 |
+
{
|
| 17045 |
+
"epoch": 0.00948877812445928,
|
| 17046 |
+
"grad_norm": 2.2751810550689697,
|
| 17047 |
+
"learning_rate": 1.706964539864381e-05,
|
| 17048 |
+
"loss": 2.2369,
|
| 17049 |
+
"step": 218400
|
| 17050 |
+
},
|
| 17051 |
+
{
|
| 17052 |
+
"epoch": 0.0097678598340022,
|
| 17053 |
+
"grad_norm": 2.439268112182617,
|
| 17054 |
+
"learning_rate": 1.7048651498709944e-05,
|
| 17055 |
+
"loss": 2.227,
|
| 17056 |
+
"step": 218500
|
| 17057 |
+
},
|
| 17058 |
+
{
|
| 17059 |
+
"epoch": 0.010046941543545119,
|
| 17060 |
+
"grad_norm": 2.244767665863037,
|
| 17061 |
+
"learning_rate": 1.7027663835189145e-05,
|
| 17062 |
+
"loss": 2.2235,
|
| 17063 |
+
"step": 218600
|
| 17064 |
+
},
|
| 17065 |
+
{
|
| 17066 |
+
"epoch": 0.010326023253088039,
|
| 17067 |
+
"grad_norm": 2.1761574745178223,
|
| 17068 |
+
"learning_rate": 1.7006682424542497e-05,
|
| 17069 |
+
"loss": 2.2172,
|
| 17070 |
+
"step": 218700
|
| 17071 |
+
},
|
| 17072 |
+
{
|
| 17073 |
+
"epoch": 0.01060510496263096,
|
| 17074 |
+
"grad_norm": 2.32922101020813,
|
| 17075 |
+
"learning_rate": 1.6985707283226172e-05,
|
| 17076 |
+
"loss": 2.2169,
|
| 17077 |
+
"step": 218800
|
| 17078 |
+
},
|
| 17079 |
+
{
|
| 17080 |
+
"epoch": 0.01088418667217388,
|
| 17081 |
+
"grad_norm": 2.1702868938446045,
|
| 17082 |
+
"learning_rate": 1.6964738427691426e-05,
|
| 17083 |
+
"loss": 2.2243,
|
| 17084 |
+
"step": 218900
|
| 17085 |
+
},
|
| 17086 |
+
{
|
| 17087 |
+
"epoch": 0.0111632683817168,
|
| 17088 |
+
"grad_norm": 2.0979557037353516,
|
| 17089 |
+
"learning_rate": 1.6943775874384583e-05,
|
| 17090 |
+
"loss": 2.2045,
|
| 17091 |
+
"step": 219000
|
| 17092 |
+
},
|
| 17093 |
+
{
|
| 17094 |
+
"epoch": 0.0111632683817168,
|
| 17095 |
+
"eval_loss": 2.1724750995635986,
|
| 17096 |
+
"eval_runtime": 51.344,
|
| 17097 |
+
"eval_samples_per_second": 198.543,
|
| 17098 |
+
"eval_steps_per_second": 1.558,
|
| 17099 |
+
"step": 219000
|
| 17100 |
+
},
|
| 17101 |
+
{
|
| 17102 |
+
"epoch": 0.011442350091259719,
|
| 17103 |
+
"grad_norm": 2.1244499683380127,
|
| 17104 |
+
"learning_rate": 1.6922819639747006e-05,
|
| 17105 |
+
"loss": 2.2174,
|
| 17106 |
+
"step": 219100
|
| 17107 |
+
},
|
| 17108 |
+
{
|
| 17109 |
+
"epoch": 0.011721431800802639,
|
| 17110 |
+
"grad_norm": 2.18345046043396,
|
| 17111 |
+
"learning_rate": 1.690186974021513e-05,
|
| 17112 |
+
"loss": 2.2265,
|
| 17113 |
+
"step": 219200
|
| 17114 |
+
},
|
| 17115 |
+
{
|
| 17116 |
+
"epoch": 0.012000513510345558,
|
| 17117 |
+
"grad_norm": 2.2020881175994873,
|
| 17118 |
+
"learning_rate": 1.6880926192220413e-05,
|
| 17119 |
+
"loss": 2.2272,
|
| 17120 |
+
"step": 219300
|
| 17121 |
+
},
|
| 17122 |
+
{
|
| 17123 |
+
"epoch": 0.01227959521988848,
|
| 17124 |
+
"grad_norm": 2.2746477127075195,
|
| 17125 |
+
"learning_rate": 1.6859989012189337e-05,
|
| 17126 |
+
"loss": 2.2184,
|
| 17127 |
+
"step": 219400
|
| 17128 |
+
},
|
| 17129 |
+
{
|
| 17130 |
+
"epoch": 0.0125586769294314,
|
| 17131 |
+
"grad_norm": 2.2917847633361816,
|
| 17132 |
+
"learning_rate": 1.6839058216543358e-05,
|
| 17133 |
+
"loss": 2.2267,
|
| 17134 |
+
"step": 219500
|
| 17135 |
+
},
|
| 17136 |
+
{
|
| 17137 |
+
"epoch": 0.012837758638974319,
|
| 17138 |
+
"grad_norm": 2.2045438289642334,
|
| 17139 |
+
"learning_rate": 1.6818133821698965e-05,
|
| 17140 |
+
"loss": 2.2119,
|
| 17141 |
+
"step": 219600
|
| 17142 |
+
},
|
| 17143 |
+
{
|
| 17144 |
+
"epoch": 0.013116840348517239,
|
| 17145 |
+
"grad_norm": 2.218310594558716,
|
| 17146 |
+
"learning_rate": 1.6797215844067604e-05,
|
| 17147 |
+
"loss": 2.2216,
|
| 17148 |
+
"step": 219700
|
| 17149 |
+
},
|
| 17150 |
+
{
|
| 17151 |
+
"epoch": 0.013395922058060158,
|
| 17152 |
+
"grad_norm": 2.124152898788452,
|
| 17153 |
+
"learning_rate": 1.67763043000557e-05,
|
| 17154 |
+
"loss": 2.2065,
|
| 17155 |
+
"step": 219800
|
| 17156 |
+
},
|
| 17157 |
+
{
|
| 17158 |
+
"epoch": 0.013675003767603078,
|
| 17159 |
+
"grad_norm": 2.10780930519104,
|
| 17160 |
+
"learning_rate": 1.675539920606461e-05,
|
| 17161 |
+
"loss": 2.2149,
|
| 17162 |
+
"step": 219900
|
| 17163 |
+
},
|
| 17164 |
+
{
|
| 17165 |
+
"epoch": 0.013954085477146,
|
| 17166 |
+
"grad_norm": 2.210146903991699,
|
| 17167 |
+
"learning_rate": 1.673450057849066e-05,
|
| 17168 |
+
"loss": 2.2149,
|
| 17169 |
+
"step": 220000
|
| 17170 |
+
},
|
| 17171 |
+
{
|
| 17172 |
+
"epoch": 0.013954085477146,
|
| 17173 |
+
"eval_loss": 2.164307117462158,
|
| 17174 |
+
"eval_runtime": 51.3547,
|
| 17175 |
+
"eval_samples_per_second": 198.502,
|
| 17176 |
+
"eval_steps_per_second": 1.558,
|
| 17177 |
+
"step": 220000
|
| 17178 |
+
},
|
| 17179 |
+
{
|
| 17180 |
+
"epoch": 0.01423316718668892,
|
| 17181 |
+
"grad_norm": 2.1689798831939697,
|
| 17182 |
+
"learning_rate": 1.671360843372508e-05,
|
| 17183 |
+
"loss": 2.2174,
|
| 17184 |
+
"step": 220100
|
| 17185 |
+
},
|
| 17186 |
+
{
|
| 17187 |
+
"epoch": 0.014512248896231839,
|
| 17188 |
+
"grad_norm": 2.2905499935150146,
|
| 17189 |
+
"learning_rate": 1.669272278815405e-05,
|
| 17190 |
+
"loss": 2.2041,
|
| 17191 |
+
"step": 220200
|
| 17192 |
+
},
|
| 17193 |
+
{
|
| 17194 |
+
"epoch": 0.014791330605774759,
|
| 17195 |
+
"grad_norm": 2.155677080154419,
|
| 17196 |
+
"learning_rate": 1.6671843658158613e-05,
|
| 17197 |
+
"loss": 2.2197,
|
| 17198 |
+
"step": 220300
|
| 17199 |
+
},
|
| 17200 |
+
{
|
| 17201 |
+
"epoch": 0.015070412315317678,
|
| 17202 |
+
"grad_norm": 2.2219150066375732,
|
| 17203 |
+
"learning_rate": 1.665097106011471e-05,
|
| 17204 |
+
"loss": 2.2173,
|
| 17205 |
+
"step": 220400
|
| 17206 |
+
},
|
| 17207 |
+
{
|
| 17208 |
+
"epoch": 0.015349494024860598,
|
| 17209 |
+
"grad_norm": 2.145770311355591,
|
| 17210 |
+
"learning_rate": 1.6630105010393178e-05,
|
| 17211 |
+
"loss": 2.1991,
|
| 17212 |
+
"step": 220500
|
| 17213 |
+
},
|
| 17214 |
+
{
|
| 17215 |
+
"epoch": 0.01562857573440352,
|
| 17216 |
+
"grad_norm": 2.2329516410827637,
|
| 17217 |
+
"learning_rate": 1.6609245525359717e-05,
|
| 17218 |
+
"loss": 2.222,
|
| 17219 |
+
"step": 220600
|
| 17220 |
+
},
|
| 17221 |
+
{
|
| 17222 |
+
"epoch": 0.015907657443946437,
|
| 17223 |
+
"grad_norm": 2.230044364929199,
|
| 17224 |
+
"learning_rate": 1.6588392621374846e-05,
|
| 17225 |
+
"loss": 2.2124,
|
| 17226 |
+
"step": 220700
|
| 17227 |
+
},
|
| 17228 |
+
{
|
| 17229 |
+
"epoch": 0.01618673915348936,
|
| 17230 |
+
"grad_norm": 2.2386929988861084,
|
| 17231 |
+
"learning_rate": 1.6567546314793956e-05,
|
| 17232 |
+
"loss": 2.1982,
|
| 17233 |
+
"step": 220800
|
| 17234 |
+
},
|
| 17235 |
+
{
|
| 17236 |
+
"epoch": 0.01646582086303228,
|
| 17237 |
+
"grad_norm": 2.178781747817993,
|
| 17238 |
+
"learning_rate": 1.6546706621967255e-05,
|
| 17239 |
+
"loss": 2.2056,
|
| 17240 |
+
"step": 220900
|
| 17241 |
+
},
|
| 17242 |
+
{
|
| 17243 |
+
"epoch": 0.016744902572575198,
|
| 17244 |
+
"grad_norm": 2.2631821632385254,
|
| 17245 |
+
"learning_rate": 1.6525873559239764e-05,
|
| 17246 |
+
"loss": 2.1995,
|
| 17247 |
+
"step": 221000
|
| 17248 |
+
},
|
| 17249 |
+
{
|
| 17250 |
+
"epoch": 0.016744902572575198,
|
| 17251 |
+
"eval_loss": 2.167518138885498,
|
| 17252 |
+
"eval_runtime": 51.2411,
|
| 17253 |
+
"eval_samples_per_second": 198.942,
|
| 17254 |
+
"eval_steps_per_second": 1.561,
|
| 17255 |
+
"step": 221000
|
| 17256 |
+
},
|
| 17257 |
+
{
|
| 17258 |
+
"epoch": 0.01702398428211812,
|
| 17259 |
+
"grad_norm": 2.186282157897949,
|
| 17260 |
+
"learning_rate": 1.650504714295129e-05,
|
| 17261 |
+
"loss": 2.2005,
|
| 17262 |
+
"step": 221100
|
| 17263 |
+
},
|
| 17264 |
+
{
|
| 17265 |
+
"epoch": 0.017303065991661037,
|
| 17266 |
+
"grad_norm": 2.2361273765563965,
|
| 17267 |
+
"learning_rate": 1.648422738943644e-05,
|
| 17268 |
+
"loss": 2.2034,
|
| 17269 |
+
"step": 221200
|
| 17270 |
+
},
|
| 17271 |
+
{
|
| 17272 |
+
"epoch": 0.01758214770120396,
|
| 17273 |
+
"grad_norm": 2.1385703086853027,
|
| 17274 |
+
"learning_rate": 1.646341431502459e-05,
|
| 17275 |
+
"loss": 2.2073,
|
| 17276 |
+
"step": 221300
|
| 17277 |
+
},
|
| 17278 |
+
{
|
| 17279 |
+
"epoch": 0.017861229410746877,
|
| 17280 |
+
"grad_norm": 2.232243299484253,
|
| 17281 |
+
"learning_rate": 1.64426079360399e-05,
|
| 17282 |
+
"loss": 2.2008,
|
| 17283 |
+
"step": 221400
|
| 17284 |
+
},
|
| 17285 |
+
{
|
| 17286 |
+
"epoch": 0.018140311120289798,
|
| 17287 |
+
"grad_norm": 2.30553936958313,
|
| 17288 |
+
"learning_rate": 1.6421808268801235e-05,
|
| 17289 |
+
"loss": 2.2029,
|
| 17290 |
+
"step": 221500
|
| 17291 |
+
},
|
| 17292 |
+
{
|
| 17293 |
+
"epoch": 0.01841939282983272,
|
| 17294 |
+
"grad_norm": 2.1158080101013184,
|
| 17295 |
+
"learning_rate": 1.6401015329622233e-05,
|
| 17296 |
+
"loss": 2.1912,
|
| 17297 |
+
"step": 221600
|
| 17298 |
+
},
|
| 17299 |
+
{
|
| 17300 |
+
"epoch": 0.018698474539375638,
|
| 17301 |
+
"grad_norm": 2.136540412902832,
|
| 17302 |
+
"learning_rate": 1.6380229134811232e-05,
|
| 17303 |
+
"loss": 2.2066,
|
| 17304 |
+
"step": 221700
|
| 17305 |
+
},
|
| 17306 |
+
{
|
| 17307 |
+
"epoch": 0.01897755624891856,
|
| 17308 |
+
"grad_norm": 2.0367746353149414,
|
| 17309 |
+
"learning_rate": 1.6359449700671307e-05,
|
| 17310 |
+
"loss": 2.2027,
|
| 17311 |
+
"step": 221800
|
| 17312 |
+
},
|
| 17313 |
+
{
|
| 17314 |
+
"epoch": 0.019256637958461477,
|
| 17315 |
+
"grad_norm": 2.1502268314361572,
|
| 17316 |
+
"learning_rate": 1.6338677043500197e-05,
|
| 17317 |
+
"loss": 2.2027,
|
| 17318 |
+
"step": 221900
|
| 17319 |
+
},
|
| 17320 |
+
{
|
| 17321 |
+
"epoch": 0.0195357196680044,
|
| 17322 |
+
"grad_norm": 2.2150540351867676,
|
| 17323 |
+
"learning_rate": 1.6317911179590346e-05,
|
| 17324 |
+
"loss": 2.207,
|
| 17325 |
+
"step": 222000
|
| 17326 |
+
},
|
| 17327 |
+
{
|
| 17328 |
+
"epoch": 0.0195357196680044,
|
| 17329 |
+
"eval_loss": 2.16145920753479,
|
| 17330 |
+
"eval_runtime": 51.444,
|
| 17331 |
+
"eval_samples_per_second": 198.157,
|
| 17332 |
+
"eval_steps_per_second": 1.555,
|
| 17333 |
+
"step": 222000
|
| 17334 |
+
},
|
| 17335 |
+
{
|
| 17336 |
+
"epoch": 0.01981480137754732,
|
| 17337 |
+
"grad_norm": 2.327277183532715,
|
| 17338 |
+
"learning_rate": 1.629715212522887e-05,
|
| 17339 |
+
"loss": 2.2025,
|
| 17340 |
+
"step": 222100
|
| 17341 |
+
},
|
| 17342 |
+
{
|
| 17343 |
+
"epoch": 0.020093883087090238,
|
| 17344 |
+
"grad_norm": 2.240081548690796,
|
| 17345 |
+
"learning_rate": 1.627639989669754e-05,
|
| 17346 |
+
"loss": 2.2018,
|
| 17347 |
+
"step": 222200
|
| 17348 |
+
},
|
| 17349 |
+
{
|
| 17350 |
+
"epoch": 0.02037296479663316,
|
| 17351 |
+
"grad_norm": 2.3731963634490967,
|
| 17352 |
+
"learning_rate": 1.6255654510272778e-05,
|
| 17353 |
+
"loss": 2.2009,
|
| 17354 |
+
"step": 222300
|
| 17355 |
+
},
|
| 17356 |
+
{
|
| 17357 |
+
"epoch": 0.020652046506176077,
|
| 17358 |
+
"grad_norm": 2.1497604846954346,
|
| 17359 |
+
"learning_rate": 1.623491598222563e-05,
|
| 17360 |
+
"loss": 2.1973,
|
| 17361 |
+
"step": 222400
|
| 17362 |
+
},
|
| 17363 |
+
{
|
| 17364 |
+
"epoch": 0.020931128215719,
|
| 17365 |
+
"grad_norm": 2.194458246231079,
|
| 17366 |
+
"learning_rate": 1.621418432882176e-05,
|
| 17367 |
+
"loss": 2.2045,
|
| 17368 |
+
"step": 222500
|
| 17369 |
+
},
|
| 17370 |
+
{
|
| 17371 |
+
"epoch": 0.02121020992526192,
|
| 17372 |
+
"grad_norm": 2.1718227863311768,
|
| 17373 |
+
"learning_rate": 1.6193459566321456e-05,
|
| 17374 |
+
"loss": 2.1977,
|
| 17375 |
+
"step": 222600
|
| 17376 |
+
},
|
| 17377 |
+
{
|
| 17378 |
+
"epoch": 0.021489291634804838,
|
| 17379 |
+
"grad_norm": 2.2664620876312256,
|
| 17380 |
+
"learning_rate": 1.6172741710979606e-05,
|
| 17381 |
+
"loss": 2.2011,
|
| 17382 |
+
"step": 222700
|
| 17383 |
+
},
|
| 17384 |
+
{
|
| 17385 |
+
"epoch": 0.02176837334434776,
|
| 17386 |
+
"grad_norm": 2.388573169708252,
|
| 17387 |
+
"learning_rate": 1.6152030779045647e-05,
|
| 17388 |
+
"loss": 2.1984,
|
| 17389 |
+
"step": 222800
|
| 17390 |
+
},
|
| 17391 |
+
{
|
| 17392 |
+
"epoch": 0.022047455053890677,
|
| 17393 |
+
"grad_norm": 2.1636369228363037,
|
| 17394 |
+
"learning_rate": 1.6131326786763616e-05,
|
| 17395 |
+
"loss": 2.2017,
|
| 17396 |
+
"step": 222900
|
| 17397 |
+
},
|
| 17398 |
+
{
|
| 17399 |
+
"epoch": 0.0223265367634336,
|
| 17400 |
+
"grad_norm": 2.3732447624206543,
|
| 17401 |
+
"learning_rate": 1.6110629750372096e-05,
|
| 17402 |
+
"loss": 2.1938,
|
| 17403 |
+
"step": 223000
|
| 17404 |
+
},
|
| 17405 |
+
{
|
| 17406 |
+
"epoch": 0.0223265367634336,
|
| 17407 |
+
"eval_loss": 2.170623779296875,
|
| 17408 |
+
"eval_runtime": 51.4801,
|
| 17409 |
+
"eval_samples_per_second": 198.018,
|
| 17410 |
+
"eval_steps_per_second": 1.554,
|
| 17411 |
+
"step": 223000
|
| 17412 |
+
},
|
| 17413 |
+
{
|
| 17414 |
+
"epoch": 0.022605618472976517,
|
| 17415 |
+
"grad_norm": 2.167587995529175,
|
| 17416 |
+
"learning_rate": 1.608993968610423e-05,
|
| 17417 |
+
"loss": 2.191,
|
| 17418 |
+
"step": 223100
|
| 17419 |
+
},
|
| 17420 |
+
{
|
| 17421 |
+
"epoch": 0.022884700182519438,
|
| 17422 |
+
"grad_norm": 2.159860849380493,
|
| 17423 |
+
"learning_rate": 1.6069256610187656e-05,
|
| 17424 |
+
"loss": 2.2105,
|
| 17425 |
+
"step": 223200
|
| 17426 |
+
},
|
| 17427 |
+
{
|
| 17428 |
+
"epoch": 0.02316378189206236,
|
| 17429 |
+
"grad_norm": 2.154714822769165,
|
| 17430 |
+
"learning_rate": 1.6048580538844566e-05,
|
| 17431 |
+
"loss": 2.1955,
|
| 17432 |
+
"step": 223300
|
| 17433 |
+
},
|
| 17434 |
+
{
|
| 17435 |
+
"epoch": 0.023442863601605277,
|
| 17436 |
+
"grad_norm": 2.1291658878326416,
|
| 17437 |
+
"learning_rate": 1.602791148829164e-05,
|
| 17438 |
+
"loss": 2.2017,
|
| 17439 |
+
"step": 223400
|
| 17440 |
+
},
|
| 17441 |
+
{
|
| 17442 |
+
"epoch": 0.0237219453111482,
|
| 17443 |
+
"grad_norm": 2.1027395725250244,
|
| 17444 |
+
"learning_rate": 1.600724947474008e-05,
|
| 17445 |
+
"loss": 2.1981,
|
| 17446 |
+
"step": 223500
|
| 17447 |
+
},
|
| 17448 |
+
{
|
| 17449 |
+
"epoch": 0.024001027020691117,
|
| 17450 |
+
"grad_norm": 2.206848621368408,
|
| 17451 |
+
"learning_rate": 1.5986594514395513e-05,
|
| 17452 |
+
"loss": 2.1952,
|
| 17453 |
+
"step": 223600
|
| 17454 |
+
},
|
| 17455 |
+
{
|
| 17456 |
+
"epoch": 0.024280108730234038,
|
| 17457 |
+
"grad_norm": 2.2017011642456055,
|
| 17458 |
+
"learning_rate": 1.5965946623458084e-05,
|
| 17459 |
+
"loss": 2.2008,
|
| 17460 |
+
"step": 223700
|
| 17461 |
+
},
|
| 17462 |
+
{
|
| 17463 |
+
"epoch": 0.02455919043977696,
|
| 17464 |
+
"grad_norm": 2.31180477142334,
|
| 17465 |
+
"learning_rate": 1.5945305818122376e-05,
|
| 17466 |
+
"loss": 2.1875,
|
| 17467 |
+
"step": 223800
|
| 17468 |
+
},
|
| 17469 |
+
{
|
| 17470 |
+
"epoch": 0.024838272149319877,
|
| 17471 |
+
"grad_norm": 2.226900577545166,
|
| 17472 |
+
"learning_rate": 1.5924672114577422e-05,
|
| 17473 |
+
"loss": 2.1909,
|
| 17474 |
+
"step": 223900
|
| 17475 |
+
},
|
| 17476 |
+
{
|
| 17477 |
+
"epoch": 0.0251173538588628,
|
| 17478 |
+
"grad_norm": 2.177281618118286,
|
| 17479 |
+
"learning_rate": 1.5904045529006657e-05,
|
| 17480 |
+
"loss": 2.1933,
|
| 17481 |
+
"step": 224000
|
| 17482 |
+
},
|
| 17483 |
+
{
|
| 17484 |
+
"epoch": 0.0251173538588628,
|
| 17485 |
+
"eval_loss": 2.158267021179199,
|
| 17486 |
+
"eval_runtime": 51.4171,
|
| 17487 |
+
"eval_samples_per_second": 198.261,
|
| 17488 |
+
"eval_steps_per_second": 1.556,
|
| 17489 |
+
"step": 224000
|
| 17490 |
+
},
|
| 17491 |
+
{
|
| 17492 |
+
"epoch": 0.025396435568405717,
|
| 17493 |
+
"grad_norm": 2.1759471893310547,
|
| 17494 |
+
"learning_rate": 1.588342607758797e-05,
|
| 17495 |
+
"loss": 2.1969,
|
| 17496 |
+
"step": 224100
|
| 17497 |
+
},
|
| 17498 |
+
{
|
| 17499 |
+
"epoch": 0.025675517277948638,
|
| 17500 |
+
"grad_norm": 2.1845242977142334,
|
| 17501 |
+
"learning_rate": 1.586281377649364e-05,
|
| 17502 |
+
"loss": 2.2041,
|
| 17503 |
+
"step": 224200
|
| 17504 |
+
},
|
| 17505 |
+
{
|
| 17506 |
+
"epoch": 0.025954598987491556,
|
| 17507 |
+
"grad_norm": 2.3617475032806396,
|
| 17508 |
+
"learning_rate": 1.5842208641890337e-05,
|
| 17509 |
+
"loss": 2.1873,
|
| 17510 |
+
"step": 224300
|
| 17511 |
+
},
|
| 17512 |
+
{
|
| 17513 |
+
"epoch": 0.026233680697034478,
|
| 17514 |
+
"grad_norm": 2.091614007949829,
|
| 17515 |
+
"learning_rate": 1.5821610689939105e-05,
|
| 17516 |
+
"loss": 2.1918,
|
| 17517 |
+
"step": 224400
|
| 17518 |
+
},
|
| 17519 |
+
{
|
| 17520 |
+
"epoch": 0.0265127624065774,
|
| 17521 |
+
"grad_norm": 2.2906229496002197,
|
| 17522 |
+
"learning_rate": 1.580101993679535e-05,
|
| 17523 |
+
"loss": 2.1975,
|
| 17524 |
+
"step": 224500
|
| 17525 |
+
},
|
| 17526 |
+
{
|
| 17527 |
+
"epoch": 0.026791844116120317,
|
| 17528 |
+
"grad_norm": 2.089142084121704,
|
| 17529 |
+
"learning_rate": 1.5780436398608854e-05,
|
| 17530 |
+
"loss": 2.2017,
|
| 17531 |
+
"step": 224600
|
| 17532 |
+
},
|
| 17533 |
+
{
|
| 17534 |
+
"epoch": 0.02707092582566324,
|
| 17535 |
+
"grad_norm": 2.2736806869506836,
|
| 17536 |
+
"learning_rate": 1.575986009152373e-05,
|
| 17537 |
+
"loss": 2.1857,
|
| 17538 |
+
"step": 224700
|
| 17539 |
+
},
|
| 17540 |
+
{
|
| 17541 |
+
"epoch": 0.027350007535206156,
|
| 17542 |
+
"grad_norm": 2.1917905807495117,
|
| 17543 |
+
"learning_rate": 1.5739291031678404e-05,
|
| 17544 |
+
"loss": 2.1903,
|
| 17545 |
+
"step": 224800
|
| 17546 |
+
},
|
| 17547 |
+
{
|
| 17548 |
+
"epoch": 0.027629089244749078,
|
| 17549 |
+
"grad_norm": 2.207611322402954,
|
| 17550 |
+
"learning_rate": 1.5718729235205642e-05,
|
| 17551 |
+
"loss": 2.1948,
|
| 17552 |
+
"step": 224900
|
| 17553 |
+
},
|
| 17554 |
+
{
|
| 17555 |
+
"epoch": 0.027908170954292,
|
| 17556 |
+
"grad_norm": 2.3215441703796387,
|
| 17557 |
+
"learning_rate": 1.5698174718232494e-05,
|
| 17558 |
+
"loss": 2.192,
|
| 17559 |
+
"step": 225000
|
| 17560 |
+
},
|
| 17561 |
+
{
|
| 17562 |
+
"epoch": 0.027908170954292,
|
| 17563 |
+
"eval_loss": 2.1532270908355713,
|
| 17564 |
+
"eval_runtime": 51.4641,
|
| 17565 |
+
"eval_samples_per_second": 198.08,
|
| 17566 |
+
"eval_steps_per_second": 1.554,
|
| 17567 |
+
"step": 225000
|
| 17568 |
+
},
|
| 17569 |
+
{
|
| 17570 |
+
"epoch": 0.028187252663834917,
|
| 17571 |
+
"grad_norm": 2.1780614852905273,
|
| 17572 |
+
"learning_rate": 1.567762749688031e-05,
|
| 17573 |
+
"loss": 2.1826,
|
| 17574 |
+
"step": 225100
|
| 17575 |
+
},
|
| 17576 |
+
{
|
| 17577 |
+
"epoch": 0.02846633437337784,
|
| 17578 |
+
"grad_norm": 2.1773393154144287,
|
| 17579 |
+
"learning_rate": 1.5657087587264724e-05,
|
| 17580 |
+
"loss": 2.187,
|
| 17581 |
+
"step": 225200
|
| 17582 |
+
},
|
| 17583 |
+
{
|
| 17584 |
+
"epoch": 0.028745416082920756,
|
| 17585 |
+
"grad_norm": 2.1740593910217285,
|
| 17586 |
+
"learning_rate": 1.5636555005495616e-05,
|
| 17587 |
+
"loss": 2.186,
|
| 17588 |
+
"step": 225300
|
| 17589 |
+
},
|
| 17590 |
+
{
|
| 17591 |
+
"epoch": 0.029024497792463678,
|
| 17592 |
+
"grad_norm": 2.338139295578003,
|
| 17593 |
+
"learning_rate": 1.561602976767713e-05,
|
| 17594 |
+
"loss": 2.1901,
|
| 17595 |
+
"step": 225400
|
| 17596 |
+
},
|
| 17597 |
+
{
|
| 17598 |
+
"epoch": 0.0293035795020066,
|
| 17599 |
+
"grad_norm": 2.3076512813568115,
|
| 17600 |
+
"learning_rate": 1.5595511889907665e-05,
|
| 17601 |
+
"loss": 2.1911,
|
| 17602 |
+
"step": 225500
|
| 17603 |
+
},
|
| 17604 |
+
{
|
| 17605 |
+
"epoch": 0.029582661211549517,
|
| 17606 |
+
"grad_norm": 2.286112070083618,
|
| 17607 |
+
"learning_rate": 1.557500138827982e-05,
|
| 17608 |
+
"loss": 2.1823,
|
| 17609 |
+
"step": 225600
|
| 17610 |
+
},
|
| 17611 |
+
{
|
| 17612 |
+
"epoch": 0.02986174292109244,
|
| 17613 |
+
"grad_norm": 2.1310651302337646,
|
| 17614 |
+
"learning_rate": 1.5554498278880424e-05,
|
| 17615 |
+
"loss": 2.1904,
|
| 17616 |
+
"step": 225700
|
| 17617 |
+
},
|
| 17618 |
+
{
|
| 17619 |
+
"epoch": 0.030140824630635357,
|
| 17620 |
+
"grad_norm": 2.149794578552246,
|
| 17621 |
+
"learning_rate": 1.5534002577790497e-05,
|
| 17622 |
+
"loss": 2.1857,
|
| 17623 |
+
"step": 225800
|
| 17624 |
+
},
|
| 17625 |
+
{
|
| 17626 |
+
"epoch": 0.030419906340178278,
|
| 17627 |
+
"grad_norm": 2.250833511352539,
|
| 17628 |
+
"learning_rate": 1.5513514301085266e-05,
|
| 17629 |
+
"loss": 2.1748,
|
| 17630 |
+
"step": 225900
|
| 17631 |
+
},
|
| 17632 |
+
{
|
| 17633 |
+
"epoch": 0.030698988049721196,
|
| 17634 |
+
"grad_norm": 2.2140324115753174,
|
| 17635 |
+
"learning_rate": 1.5493033464834133e-05,
|
| 17636 |
+
"loss": 2.1891,
|
| 17637 |
+
"step": 226000
|
| 17638 |
+
},
|
| 17639 |
+
{
|
| 17640 |
+
"epoch": 0.030698988049721196,
|
| 17641 |
+
"eval_loss": 2.149634838104248,
|
| 17642 |
+
"eval_runtime": 51.5665,
|
| 17643 |
+
"eval_samples_per_second": 197.687,
|
| 17644 |
+
"eval_steps_per_second": 1.551,
|
| 17645 |
+
"step": 226000
|
| 17646 |
+
},
|
| 17647 |
+
{
|
| 17648 |
+
"epoch": 0.030978069759264117,
|
| 17649 |
+
"grad_norm": 2.228729009628296,
|
| 17650 |
+
"learning_rate": 1.547256008510064e-05,
|
| 17651 |
+
"loss": 2.1815,
|
| 17652 |
+
"step": 226100
|
| 17653 |
+
},
|
| 17654 |
+
{
|
| 17655 |
+
"epoch": 0.03125715146880704,
|
| 17656 |
+
"grad_norm": 2.263529062271118,
|
| 17657 |
+
"learning_rate": 1.545209417794251e-05,
|
| 17658 |
+
"loss": 2.2412,
|
| 17659 |
+
"step": 226200
|
| 17660 |
+
},
|
| 17661 |
+
{
|
| 17662 |
+
"epoch": 0.03153623317834996,
|
| 17663 |
+
"grad_norm": 2.239266872406006,
|
| 17664 |
+
"learning_rate": 1.5431635759411582e-05,
|
| 17665 |
+
"loss": 2.3094,
|
| 17666 |
+
"step": 226300
|
| 17667 |
+
},
|
| 17668 |
+
{
|
| 17669 |
+
"epoch": 0.031815314887892875,
|
| 17670 |
+
"grad_norm": 2.179316997528076,
|
| 17671 |
+
"learning_rate": 1.541118484555385e-05,
|
| 17672 |
+
"loss": 2.2971,
|
| 17673 |
+
"step": 226400
|
| 17674 |
+
},
|
| 17675 |
+
{
|
| 17676 |
+
"epoch": 0.0320943965974358,
|
| 17677 |
+
"grad_norm": 2.152000665664673,
|
| 17678 |
+
"learning_rate": 1.539074145240938e-05,
|
| 17679 |
+
"loss": 2.3019,
|
| 17680 |
+
"step": 226500
|
| 17681 |
+
},
|
| 17682 |
+
{
|
| 17683 |
+
"epoch": 0.03237347830697872,
|
| 17684 |
+
"grad_norm": 2.2889840602874756,
|
| 17685 |
+
"learning_rate": 1.5370305596012376e-05,
|
| 17686 |
+
"loss": 2.284,
|
| 17687 |
+
"step": 226600
|
| 17688 |
+
},
|
| 17689 |
+
{
|
| 17690 |
+
"epoch": 0.032652560016521635,
|
| 17691 |
+
"grad_norm": 2.195444345474243,
|
| 17692 |
+
"learning_rate": 1.5349877292391122e-05,
|
| 17693 |
+
"loss": 2.2919,
|
| 17694 |
+
"step": 226700
|
| 17695 |
+
},
|
| 17696 |
+
{
|
| 17697 |
+
"epoch": 0.03293164172606456,
|
| 17698 |
+
"grad_norm": 2.3559839725494385,
|
| 17699 |
+
"learning_rate": 1.5329456557567978e-05,
|
| 17700 |
+
"loss": 2.2882,
|
| 17701 |
+
"step": 226800
|
| 17702 |
+
},
|
| 17703 |
+
{
|
| 17704 |
+
"epoch": 0.03321072343560748,
|
| 17705 |
+
"grad_norm": 2.2163028717041016,
|
| 17706 |
+
"learning_rate": 1.5309043407559345e-05,
|
| 17707 |
+
"loss": 2.2731,
|
| 17708 |
+
"step": 226900
|
| 17709 |
+
},
|
| 17710 |
+
{
|
| 17711 |
+
"epoch": 0.033489805145150396,
|
| 17712 |
+
"grad_norm": 2.3102822303771973,
|
| 17713 |
+
"learning_rate": 1.5288637858375714e-05,
|
| 17714 |
+
"loss": 2.2873,
|
| 17715 |
+
"step": 227000
|
| 17716 |
+
},
|
| 17717 |
+
{
|
| 17718 |
+
"epoch": 0.033489805145150396,
|
| 17719 |
+
"eval_loss": 2.1502978801727295,
|
| 17720 |
+
"eval_runtime": 51.5237,
|
| 17721 |
+
"eval_samples_per_second": 197.851,
|
| 17722 |
+
"eval_steps_per_second": 1.553,
|
| 17723 |
+
"step": 227000
|
| 17724 |
+
},
|
| 17725 |
+
{
|
| 17726 |
+
"epoch": 0.033768886854693314,
|
| 17727 |
+
"grad_norm": 2.150144577026367,
|
| 17728 |
+
"learning_rate": 1.5268239926021576e-05,
|
| 17729 |
+
"loss": 2.2731,
|
| 17730 |
+
"step": 227100
|
| 17731 |
+
},
|
| 17732 |
+
{
|
| 17733 |
+
"epoch": 0.03404796856423624,
|
| 17734 |
+
"grad_norm": 2.355604410171509,
|
| 17735 |
+
"learning_rate": 1.5247849626495492e-05,
|
| 17736 |
+
"loss": 2.2814,
|
| 17737 |
+
"step": 227200
|
| 17738 |
+
},
|
| 17739 |
+
{
|
| 17740 |
+
"epoch": 0.03432705027377916,
|
| 17741 |
+
"grad_norm": 2.2507338523864746,
|
| 17742 |
+
"learning_rate": 1.5227466975789987e-05,
|
| 17743 |
+
"loss": 2.2773,
|
| 17744 |
+
"step": 227300
|
| 17745 |
+
},
|
| 17746 |
+
{
|
| 17747 |
+
"epoch": 0.034606131983322075,
|
| 17748 |
+
"grad_norm": 2.3993356227874756,
|
| 17749 |
+
"learning_rate": 1.5207091989891617e-05,
|
| 17750 |
+
"loss": 2.275,
|
| 17751 |
+
"step": 227400
|
| 17752 |
+
},
|
| 17753 |
+
{
|
| 17754 |
+
"epoch": 0.034885213692865,
|
| 17755 |
+
"grad_norm": 2.2218728065490723,
|
| 17756 |
+
"learning_rate": 1.5186724684780929e-05,
|
| 17757 |
+
"loss": 2.29,
|
| 17758 |
+
"step": 227500
|
| 17759 |
+
},
|
| 17760 |
+
{
|
| 17761 |
+
"epoch": 0.03516429540240792,
|
| 17762 |
+
"grad_norm": 2.109447717666626,
|
| 17763 |
+
"learning_rate": 1.5166365076432432e-05,
|
| 17764 |
+
"loss": 2.2635,
|
| 17765 |
+
"step": 227600
|
| 17766 |
+
},
|
| 17767 |
+
{
|
| 17768 |
+
"epoch": 0.035443377111950836,
|
| 17769 |
+
"grad_norm": 2.2415287494659424,
|
| 17770 |
+
"learning_rate": 1.51460131808146e-05,
|
| 17771 |
+
"loss": 2.2773,
|
| 17772 |
+
"step": 227700
|
| 17773 |
+
},
|
| 17774 |
+
{
|
| 17775 |
+
"epoch": 0.035722458821493754,
|
| 17776 |
+
"grad_norm": 2.3350560665130615,
|
| 17777 |
+
"learning_rate": 1.5125669013889861e-05,
|
| 17778 |
+
"loss": 2.2789,
|
| 17779 |
+
"step": 227800
|
| 17780 |
+
},
|
| 17781 |
+
{
|
| 17782 |
+
"epoch": 0.03600154053103668,
|
| 17783 |
+
"grad_norm": 2.2049736976623535,
|
| 17784 |
+
"learning_rate": 1.5105332591614585e-05,
|
| 17785 |
+
"loss": 2.2747,
|
| 17786 |
+
"step": 227900
|
| 17787 |
+
},
|
| 17788 |
+
{
|
| 17789 |
+
"epoch": 0.036280622240579596,
|
| 17790 |
+
"grad_norm": 2.2645366191864014,
|
| 17791 |
+
"learning_rate": 1.5085003929939067e-05,
|
| 17792 |
+
"loss": 2.2662,
|
| 17793 |
+
"step": 228000
|
| 17794 |
+
},
|
| 17795 |
+
{
|
| 17796 |
+
"epoch": 0.036280622240579596,
|
| 17797 |
+
"eval_loss": 2.140353202819824,
|
| 17798 |
+
"eval_runtime": 51.6063,
|
| 17799 |
+
"eval_samples_per_second": 197.534,
|
| 17800 |
+
"eval_steps_per_second": 1.55,
|
| 17801 |
+
"step": 228000
|
| 17802 |
+
},
|
| 17803 |
+
{
|
| 17804 |
+
"epoch": 0.036559703950122514,
|
| 17805 |
+
"grad_norm": 2.245758295059204,
|
| 17806 |
+
"learning_rate": 1.5064683044807504e-05,
|
| 17807 |
+
"loss": 2.2559,
|
| 17808 |
+
"step": 228100
|
| 17809 |
+
},
|
| 17810 |
+
{
|
| 17811 |
+
"epoch": 0.03683878565966544,
|
| 17812 |
+
"grad_norm": 2.1644320487976074,
|
| 17813 |
+
"learning_rate": 1.5044369952158e-05,
|
| 17814 |
+
"loss": 2.2621,
|
| 17815 |
+
"step": 228200
|
| 17816 |
+
},
|
| 17817 |
+
{
|
| 17818 |
+
"epoch": 0.03711786736920836,
|
| 17819 |
+
"grad_norm": 2.24301815032959,
|
| 17820 |
+
"learning_rate": 1.5024064667922563e-05,
|
| 17821 |
+
"loss": 2.2643,
|
| 17822 |
+
"step": 228300
|
| 17823 |
+
},
|
| 17824 |
+
{
|
| 17825 |
+
"epoch": 0.037396949078751275,
|
| 17826 |
+
"grad_norm": 2.1599223613739014,
|
| 17827 |
+
"learning_rate": 1.5003767208027048e-05,
|
| 17828 |
+
"loss": 2.2675,
|
| 17829 |
+
"step": 228400
|
| 17830 |
+
},
|
| 17831 |
+
{
|
| 17832 |
+
"epoch": 0.0376760307882942,
|
| 17833 |
+
"grad_norm": 2.279449701309204,
|
| 17834 |
+
"learning_rate": 1.4983477588391203e-05,
|
| 17835 |
+
"loss": 2.2637,
|
| 17836 |
+
"step": 228500
|
| 17837 |
+
},
|
| 17838 |
+
{
|
| 17839 |
+
"epoch": 0.03795511249783712,
|
| 17840 |
+
"grad_norm": 2.155567169189453,
|
| 17841 |
+
"learning_rate": 1.4963195824928595e-05,
|
| 17842 |
+
"loss": 2.2511,
|
| 17843 |
+
"step": 228600
|
| 17844 |
+
},
|
| 17845 |
+
{
|
| 17846 |
+
"epoch": 0.038234194207380036,
|
| 17847 |
+
"grad_norm": 2.1678829193115234,
|
| 17848 |
+
"learning_rate": 1.4942921933546653e-05,
|
| 17849 |
+
"loss": 2.2637,
|
| 17850 |
+
"step": 228700
|
| 17851 |
+
},
|
| 17852 |
+
{
|
| 17853 |
+
"epoch": 0.038513275916922954,
|
| 17854 |
+
"grad_norm": 2.173006772994995,
|
| 17855 |
+
"learning_rate": 1.4922655930146628e-05,
|
| 17856 |
+
"loss": 2.2565,
|
| 17857 |
+
"step": 228800
|
| 17858 |
+
},
|
| 17859 |
+
{
|
| 17860 |
+
"epoch": 0.03879235762646588,
|
| 17861 |
+
"grad_norm": 2.268568992614746,
|
| 17862 |
+
"learning_rate": 1.4902397830623583e-05,
|
| 17863 |
+
"loss": 2.267,
|
| 17864 |
+
"step": 228900
|
| 17865 |
+
},
|
| 17866 |
+
{
|
| 17867 |
+
"epoch": 0.0390714393360088,
|
| 17868 |
+
"grad_norm": 2.140665292739868,
|
| 17869 |
+
"learning_rate": 1.488214765086637e-05,
|
| 17870 |
+
"loss": 2.2609,
|
| 17871 |
+
"step": 229000
|
| 17872 |
+
},
|
| 17873 |
+
{
|
| 17874 |
+
"epoch": 0.0390714393360088,
|
| 17875 |
+
"eval_loss": 2.1331050395965576,
|
| 17876 |
+
"eval_runtime": 51.4755,
|
| 17877 |
+
"eval_samples_per_second": 198.036,
|
| 17878 |
+
"eval_steps_per_second": 1.554,
|
| 17879 |
+
"step": 229000
|
| 17880 |
}
|
| 17881 |
],
|
| 17882 |
"logging_steps": 100,
|
|
|
|
| 17896 |
"attributes": {}
|
| 17897 |
}
|
| 17898 |
},
|
| 17899 |
+
"total_flos": 1.9985381902516224e+19,
|
| 17900 |
"train_batch_size": 128,
|
| 17901 |
"trial_name": null,
|
| 17902 |
"trial_params": null
|
training_args.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 5777
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8f7b845168445732fd0c73bfeaca5509fec78a0bea7de873006a9dc759b752ca
|
| 3 |
size 5777
|