Upload 10 files
Browse files- model.safetensors +1 -1
- optimizer.pt +1 -1
- rng_state.pth +1 -1
- scheduler.pt +1 -1
- trainer_state.json +2187 -3
- training_args.bin +1 -1
model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 598635032
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:60ccf0628b701b3fbdbd8e47c124929d09ca765f44e1db4de84ca146c4892cb2
|
| 3 |
size 598635032
|
optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1197359627
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:90969ce2677fe59ebce6103f3db23c468384c1c32a2de10256b3b5076385d4ff
|
| 3 |
size 1197359627
|
rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14645
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b19a9b53a8ffcdf83e2c27bdb7c9e264673baa2e50d42027e774b79d1973943e
|
| 3 |
size 14645
|
scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1465
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8ca7233d8acabb4ee394de5e172d0b6096e38585b946640bcf133642f5f83579
|
| 3 |
size 1465
|
trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 1000,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -14523,6 +14523,2190 @@
|
|
| 14523 |
"eval_samples_per_second": 197.87,
|
| 14524 |
"eval_steps_per_second": 1.553,
|
| 14525 |
"step": 186000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14526 |
}
|
| 14527 |
],
|
| 14528 |
"logging_steps": 100,
|
|
@@ -14542,7 +16726,7 @@
|
|
| 14542 |
"attributes": {}
|
| 14543 |
}
|
| 14544 |
},
|
| 14545 |
-
"total_flos": 1.
|
| 14546 |
"train_batch_size": 128,
|
| 14547 |
"trial_name": null,
|
| 14548 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.041862256431438,
|
| 6 |
"eval_steps": 1000,
|
| 7 |
+
"global_step": 214000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 14523 |
"eval_samples_per_second": 197.87,
|
| 14524 |
"eval_steps_per_second": 1.553,
|
| 14525 |
"step": 186000
|
| 14526 |
+
},
|
| 14527 |
+
{
|
| 14528 |
+
"epoch": 0.005860715900401319,
|
| 14529 |
+
"grad_norm": 1.608132004737854,
|
| 14530 |
+
"learning_rate": 2.40818193652843e-05,
|
| 14531 |
+
"loss": 1.837,
|
| 14532 |
+
"step": 186100
|
| 14533 |
+
},
|
| 14534 |
+
{
|
| 14535 |
+
"epoch": 0.00613979760994424,
|
| 14536 |
+
"grad_norm": 1.5261002779006958,
|
| 14537 |
+
"learning_rate": 2.4059694184120883e-05,
|
| 14538 |
+
"loss": 1.827,
|
| 14539 |
+
"step": 186200
|
| 14540 |
+
},
|
| 14541 |
+
{
|
| 14542 |
+
"epoch": 0.0064188793194871595,
|
| 14543 |
+
"grad_norm": 1.604973316192627,
|
| 14544 |
+
"learning_rate": 2.4037569740459486e-05,
|
| 14545 |
+
"loss": 1.8157,
|
| 14546 |
+
"step": 186300
|
| 14547 |
+
},
|
| 14548 |
+
{
|
| 14549 |
+
"epoch": 0.006697961029030079,
|
| 14550 |
+
"grad_norm": 1.6349529027938843,
|
| 14551 |
+
"learning_rate": 2.401544605165276e-05,
|
| 14552 |
+
"loss": 1.8381,
|
| 14553 |
+
"step": 186400
|
| 14554 |
+
},
|
| 14555 |
+
{
|
| 14556 |
+
"epoch": 0.006977042738573,
|
| 14557 |
+
"grad_norm": 1.5540446043014526,
|
| 14558 |
+
"learning_rate": 2.3993323135052806e-05,
|
| 14559 |
+
"loss": 1.8383,
|
| 14560 |
+
"step": 186500
|
| 14561 |
+
},
|
| 14562 |
+
{
|
| 14563 |
+
"epoch": 0.0072561244481159195,
|
| 14564 |
+
"grad_norm": 1.6200664043426514,
|
| 14565 |
+
"learning_rate": 2.3971201008011093e-05,
|
| 14566 |
+
"loss": 1.828,
|
| 14567 |
+
"step": 186600
|
| 14568 |
+
},
|
| 14569 |
+
{
|
| 14570 |
+
"epoch": 0.007535206157658839,
|
| 14571 |
+
"grad_norm": 1.750746726989746,
|
| 14572 |
+
"learning_rate": 2.3949079687878492e-05,
|
| 14573 |
+
"loss": 1.8302,
|
| 14574 |
+
"step": 186700
|
| 14575 |
+
},
|
| 14576 |
+
{
|
| 14577 |
+
"epoch": 0.00781428786720176,
|
| 14578 |
+
"grad_norm": 1.6309112310409546,
|
| 14579 |
+
"learning_rate": 2.392695919200521e-05,
|
| 14580 |
+
"loss": 1.8118,
|
| 14581 |
+
"step": 186800
|
| 14582 |
+
},
|
| 14583 |
+
{
|
| 14584 |
+
"epoch": 0.00809336957674468,
|
| 14585 |
+
"grad_norm": 1.5920358896255493,
|
| 14586 |
+
"learning_rate": 2.3904839537740837e-05,
|
| 14587 |
+
"loss": 1.8226,
|
| 14588 |
+
"step": 186900
|
| 14589 |
+
},
|
| 14590 |
+
{
|
| 14591 |
+
"epoch": 0.008372451286287599,
|
| 14592 |
+
"grad_norm": 1.7713048458099365,
|
| 14593 |
+
"learning_rate": 2.3882720742434294e-05,
|
| 14594 |
+
"loss": 1.8197,
|
| 14595 |
+
"step": 187000
|
| 14596 |
+
},
|
| 14597 |
+
{
|
| 14598 |
+
"epoch": 0.008372451286287599,
|
| 14599 |
+
"eval_loss": 2.121570348739624,
|
| 14600 |
+
"eval_runtime": 51.4105,
|
| 14601 |
+
"eval_samples_per_second": 198.286,
|
| 14602 |
+
"eval_steps_per_second": 1.556,
|
| 14603 |
+
"step": 187000
|
| 14604 |
+
},
|
| 14605 |
+
{
|
| 14606 |
+
"epoch": 0.008651532995830519,
|
| 14607 |
+
"grad_norm": 1.674100637435913,
|
| 14608 |
+
"learning_rate": 2.3860602823433825e-05,
|
| 14609 |
+
"loss": 1.8338,
|
| 14610 |
+
"step": 187100
|
| 14611 |
+
},
|
| 14612 |
+
{
|
| 14613 |
+
"epoch": 0.008930614705373438,
|
| 14614 |
+
"grad_norm": 1.6260745525360107,
|
| 14615 |
+
"learning_rate": 2.3838485798086984e-05,
|
| 14616 |
+
"loss": 1.8209,
|
| 14617 |
+
"step": 187200
|
| 14618 |
+
},
|
| 14619 |
+
{
|
| 14620 |
+
"epoch": 0.00920969641491636,
|
| 14621 |
+
"grad_norm": 1.786022663116455,
|
| 14622 |
+
"learning_rate": 2.3816369683740624e-05,
|
| 14623 |
+
"loss": 1.8298,
|
| 14624 |
+
"step": 187300
|
| 14625 |
+
},
|
| 14626 |
+
{
|
| 14627 |
+
"epoch": 0.00948877812445928,
|
| 14628 |
+
"grad_norm": 1.521037220954895,
|
| 14629 |
+
"learning_rate": 2.3794254497740898e-05,
|
| 14630 |
+
"loss": 1.8353,
|
| 14631 |
+
"step": 187400
|
| 14632 |
+
},
|
| 14633 |
+
{
|
| 14634 |
+
"epoch": 0.0097678598340022,
|
| 14635 |
+
"grad_norm": 1.5519471168518066,
|
| 14636 |
+
"learning_rate": 2.3772140257433223e-05,
|
| 14637 |
+
"loss": 1.8361,
|
| 14638 |
+
"step": 187500
|
| 14639 |
+
},
|
| 14640 |
+
{
|
| 14641 |
+
"epoch": 0.010046941543545119,
|
| 14642 |
+
"grad_norm": 1.5187164545059204,
|
| 14643 |
+
"learning_rate": 2.3750026980162256e-05,
|
| 14644 |
+
"loss": 1.8326,
|
| 14645 |
+
"step": 187600
|
| 14646 |
+
},
|
| 14647 |
+
{
|
| 14648 |
+
"epoch": 0.010326023253088039,
|
| 14649 |
+
"grad_norm": 1.7430784702301025,
|
| 14650 |
+
"learning_rate": 2.3727914683271922e-05,
|
| 14651 |
+
"loss": 1.8308,
|
| 14652 |
+
"step": 187700
|
| 14653 |
+
},
|
| 14654 |
+
{
|
| 14655 |
+
"epoch": 0.01060510496263096,
|
| 14656 |
+
"grad_norm": 1.6210083961486816,
|
| 14657 |
+
"learning_rate": 2.3705803384105377e-05,
|
| 14658 |
+
"loss": 1.8252,
|
| 14659 |
+
"step": 187800
|
| 14660 |
+
},
|
| 14661 |
+
{
|
| 14662 |
+
"epoch": 0.01088418667217388,
|
| 14663 |
+
"grad_norm": 1.6390823125839233,
|
| 14664 |
+
"learning_rate": 2.3683693100004985e-05,
|
| 14665 |
+
"loss": 1.8287,
|
| 14666 |
+
"step": 187900
|
| 14667 |
+
},
|
| 14668 |
+
{
|
| 14669 |
+
"epoch": 0.0111632683817168,
|
| 14670 |
+
"grad_norm": 2.0330820083618164,
|
| 14671 |
+
"learning_rate": 2.3661583848312303e-05,
|
| 14672 |
+
"loss": 1.8347,
|
| 14673 |
+
"step": 188000
|
| 14674 |
+
},
|
| 14675 |
+
{
|
| 14676 |
+
"epoch": 0.0111632683817168,
|
| 14677 |
+
"eval_loss": 2.131164073944092,
|
| 14678 |
+
"eval_runtime": 51.4325,
|
| 14679 |
+
"eval_samples_per_second": 198.202,
|
| 14680 |
+
"eval_steps_per_second": 1.555,
|
| 14681 |
+
"step": 188000
|
| 14682 |
+
},
|
| 14683 |
+
{
|
| 14684 |
+
"epoch": 0.011442350091259719,
|
| 14685 |
+
"grad_norm": 1.5582841634750366,
|
| 14686 |
+
"learning_rate": 2.36394756463681e-05,
|
| 14687 |
+
"loss": 1.8215,
|
| 14688 |
+
"step": 188100
|
| 14689 |
+
},
|
| 14690 |
+
{
|
| 14691 |
+
"epoch": 0.011721431800802639,
|
| 14692 |
+
"grad_norm": 1.5832375288009644,
|
| 14693 |
+
"learning_rate": 2.361736851151231e-05,
|
| 14694 |
+
"loss": 1.8316,
|
| 14695 |
+
"step": 188200
|
| 14696 |
+
},
|
| 14697 |
+
{
|
| 14698 |
+
"epoch": 0.012000513510345558,
|
| 14699 |
+
"grad_norm": 1.578747272491455,
|
| 14700 |
+
"learning_rate": 2.359526246108404e-05,
|
| 14701 |
+
"loss": 1.828,
|
| 14702 |
+
"step": 188300
|
| 14703 |
+
},
|
| 14704 |
+
{
|
| 14705 |
+
"epoch": 0.01227959521988848,
|
| 14706 |
+
"grad_norm": 1.6343365907669067,
|
| 14707 |
+
"learning_rate": 2.3573157512421535e-05,
|
| 14708 |
+
"loss": 1.8348,
|
| 14709 |
+
"step": 188400
|
| 14710 |
+
},
|
| 14711 |
+
{
|
| 14712 |
+
"epoch": 0.0125586769294314,
|
| 14713 |
+
"grad_norm": 1.5738635063171387,
|
| 14714 |
+
"learning_rate": 2.3551053682862177e-05,
|
| 14715 |
+
"loss": 1.8271,
|
| 14716 |
+
"step": 188500
|
| 14717 |
+
},
|
| 14718 |
+
{
|
| 14719 |
+
"epoch": 0.012837758638974319,
|
| 14720 |
+
"grad_norm": 1.6531946659088135,
|
| 14721 |
+
"learning_rate": 2.3528950989742472e-05,
|
| 14722 |
+
"loss": 1.8168,
|
| 14723 |
+
"step": 188600
|
| 14724 |
+
},
|
| 14725 |
+
{
|
| 14726 |
+
"epoch": 0.013116840348517239,
|
| 14727 |
+
"grad_norm": 2.098233699798584,
|
| 14728 |
+
"learning_rate": 2.350684945039804e-05,
|
| 14729 |
+
"loss": 1.8323,
|
| 14730 |
+
"step": 188700
|
| 14731 |
+
},
|
| 14732 |
+
{
|
| 14733 |
+
"epoch": 0.013395922058060158,
|
| 14734 |
+
"grad_norm": 1.6470394134521484,
|
| 14735 |
+
"learning_rate": 2.3484749082163605e-05,
|
| 14736 |
+
"loss": 1.8353,
|
| 14737 |
+
"step": 188800
|
| 14738 |
+
},
|
| 14739 |
+
{
|
| 14740 |
+
"epoch": 0.013675003767603078,
|
| 14741 |
+
"grad_norm": 1.6183503866195679,
|
| 14742 |
+
"learning_rate": 2.346264990237293e-05,
|
| 14743 |
+
"loss": 1.8204,
|
| 14744 |
+
"step": 188900
|
| 14745 |
+
},
|
| 14746 |
+
{
|
| 14747 |
+
"epoch": 0.013954085477146,
|
| 14748 |
+
"grad_norm": 1.60996675491333,
|
| 14749 |
+
"learning_rate": 2.3440551928358894e-05,
|
| 14750 |
+
"loss": 1.8291,
|
| 14751 |
+
"step": 189000
|
| 14752 |
+
},
|
| 14753 |
+
{
|
| 14754 |
+
"epoch": 0.013954085477146,
|
| 14755 |
+
"eval_loss": 2.130070924758911,
|
| 14756 |
+
"eval_runtime": 51.3411,
|
| 14757 |
+
"eval_samples_per_second": 198.554,
|
| 14758 |
+
"eval_steps_per_second": 1.558,
|
| 14759 |
+
"step": 189000
|
| 14760 |
+
},
|
| 14761 |
+
{
|
| 14762 |
+
"epoch": 0.01423316718668892,
|
| 14763 |
+
"grad_norm": 1.5722655057907104,
|
| 14764 |
+
"learning_rate": 2.3418455177453416e-05,
|
| 14765 |
+
"loss": 1.8258,
|
| 14766 |
+
"step": 189100
|
| 14767 |
+
},
|
| 14768 |
+
{
|
| 14769 |
+
"epoch": 0.014512248896231839,
|
| 14770 |
+
"grad_norm": 2.121628999710083,
|
| 14771 |
+
"learning_rate": 2.339635966698745e-05,
|
| 14772 |
+
"loss": 1.8324,
|
| 14773 |
+
"step": 189200
|
| 14774 |
+
},
|
| 14775 |
+
{
|
| 14776 |
+
"epoch": 0.014791330605774759,
|
| 14777 |
+
"grad_norm": 1.6077678203582764,
|
| 14778 |
+
"learning_rate": 2.3374265414290962e-05,
|
| 14779 |
+
"loss": 1.8243,
|
| 14780 |
+
"step": 189300
|
| 14781 |
+
},
|
| 14782 |
+
{
|
| 14783 |
+
"epoch": 0.015070412315317678,
|
| 14784 |
+
"grad_norm": 1.5904488563537598,
|
| 14785 |
+
"learning_rate": 2.335217243669296e-05,
|
| 14786 |
+
"loss": 1.825,
|
| 14787 |
+
"step": 189400
|
| 14788 |
+
},
|
| 14789 |
+
{
|
| 14790 |
+
"epoch": 0.015349494024860598,
|
| 14791 |
+
"grad_norm": 1.536439061164856,
|
| 14792 |
+
"learning_rate": 2.333008075152144e-05,
|
| 14793 |
+
"loss": 1.8242,
|
| 14794 |
+
"step": 189500
|
| 14795 |
+
},
|
| 14796 |
+
{
|
| 14797 |
+
"epoch": 0.01562857573440352,
|
| 14798 |
+
"grad_norm": 2.195769786834717,
|
| 14799 |
+
"learning_rate": 2.3307990376103388e-05,
|
| 14800 |
+
"loss": 1.8365,
|
| 14801 |
+
"step": 189600
|
| 14802 |
+
},
|
| 14803 |
+
{
|
| 14804 |
+
"epoch": 0.015907657443946437,
|
| 14805 |
+
"grad_norm": 1.533521294593811,
|
| 14806 |
+
"learning_rate": 2.328590132776475e-05,
|
| 14807 |
+
"loss": 1.8266,
|
| 14808 |
+
"step": 189700
|
| 14809 |
+
},
|
| 14810 |
+
{
|
| 14811 |
+
"epoch": 0.01618673915348936,
|
| 14812 |
+
"grad_norm": 1.5849336385726929,
|
| 14813 |
+
"learning_rate": 2.326381362383045e-05,
|
| 14814 |
+
"loss": 1.8206,
|
| 14815 |
+
"step": 189800
|
| 14816 |
+
},
|
| 14817 |
+
{
|
| 14818 |
+
"epoch": 0.01646582086303228,
|
| 14819 |
+
"grad_norm": 1.5556162595748901,
|
| 14820 |
+
"learning_rate": 2.3241727281624335e-05,
|
| 14821 |
+
"loss": 1.8272,
|
| 14822 |
+
"step": 189900
|
| 14823 |
+
},
|
| 14824 |
+
{
|
| 14825 |
+
"epoch": 0.016744902572575198,
|
| 14826 |
+
"grad_norm": 1.6486213207244873,
|
| 14827 |
+
"learning_rate": 2.3219642318469215e-05,
|
| 14828 |
+
"loss": 1.8333,
|
| 14829 |
+
"step": 190000
|
| 14830 |
+
},
|
| 14831 |
+
{
|
| 14832 |
+
"epoch": 0.016744902572575198,
|
| 14833 |
+
"eval_loss": 2.1350369453430176,
|
| 14834 |
+
"eval_runtime": 51.4386,
|
| 14835 |
+
"eval_samples_per_second": 198.178,
|
| 14836 |
+
"eval_steps_per_second": 1.555,
|
| 14837 |
+
"step": 190000
|
| 14838 |
+
},
|
| 14839 |
+
{
|
| 14840 |
+
"epoch": 0.01702398428211812,
|
| 14841 |
+
"grad_norm": 1.6402443647384644,
|
| 14842 |
+
"learning_rate": 2.3197558751686776e-05,
|
| 14843 |
+
"loss": 1.83,
|
| 14844 |
+
"step": 190100
|
| 14845 |
+
},
|
| 14846 |
+
{
|
| 14847 |
+
"epoch": 0.017303065991661037,
|
| 14848 |
+
"grad_norm": 1.5592520236968994,
|
| 14849 |
+
"learning_rate": 2.3175476598597648e-05,
|
| 14850 |
+
"loss": 1.8244,
|
| 14851 |
+
"step": 190200
|
| 14852 |
+
},
|
| 14853 |
+
{
|
| 14854 |
+
"epoch": 0.01758214770120396,
|
| 14855 |
+
"grad_norm": 2.0347630977630615,
|
| 14856 |
+
"learning_rate": 2.3153395876521336e-05,
|
| 14857 |
+
"loss": 1.8385,
|
| 14858 |
+
"step": 190300
|
| 14859 |
+
},
|
| 14860 |
+
{
|
| 14861 |
+
"epoch": 0.017861229410746877,
|
| 14862 |
+
"grad_norm": 1.547045350074768,
|
| 14863 |
+
"learning_rate": 2.3131316602776232e-05,
|
| 14864 |
+
"loss": 1.8216,
|
| 14865 |
+
"step": 190400
|
| 14866 |
+
},
|
| 14867 |
+
{
|
| 14868 |
+
"epoch": 0.018140311120289798,
|
| 14869 |
+
"grad_norm": 1.564841628074646,
|
| 14870 |
+
"learning_rate": 2.3109238794679568e-05,
|
| 14871 |
+
"loss": 1.8232,
|
| 14872 |
+
"step": 190500
|
| 14873 |
+
},
|
| 14874 |
+
{
|
| 14875 |
+
"epoch": 0.01841939282983272,
|
| 14876 |
+
"grad_norm": 1.8858461380004883,
|
| 14877 |
+
"learning_rate": 2.3087162469547443e-05,
|
| 14878 |
+
"loss": 1.8319,
|
| 14879 |
+
"step": 190600
|
| 14880 |
+
},
|
| 14881 |
+
{
|
| 14882 |
+
"epoch": 0.018698474539375638,
|
| 14883 |
+
"grad_norm": 1.7047299146652222,
|
| 14884 |
+
"learning_rate": 2.30650876446948e-05,
|
| 14885 |
+
"loss": 1.8391,
|
| 14886 |
+
"step": 190700
|
| 14887 |
+
},
|
| 14888 |
+
{
|
| 14889 |
+
"epoch": 0.01897755624891856,
|
| 14890 |
+
"grad_norm": 1.510563850402832,
|
| 14891 |
+
"learning_rate": 2.30430143374354e-05,
|
| 14892 |
+
"loss": 1.8226,
|
| 14893 |
+
"step": 190800
|
| 14894 |
+
},
|
| 14895 |
+
{
|
| 14896 |
+
"epoch": 0.019256637958461477,
|
| 14897 |
+
"grad_norm": 2.209728956222534,
|
| 14898 |
+
"learning_rate": 2.3020942565081798e-05,
|
| 14899 |
+
"loss": 1.8307,
|
| 14900 |
+
"step": 190900
|
| 14901 |
+
},
|
| 14902 |
+
{
|
| 14903 |
+
"epoch": 0.0195357196680044,
|
| 14904 |
+
"grad_norm": 1.6156638860702515,
|
| 14905 |
+
"learning_rate": 2.299887234494537e-05,
|
| 14906 |
+
"loss": 1.8208,
|
| 14907 |
+
"step": 191000
|
| 14908 |
+
},
|
| 14909 |
+
{
|
| 14910 |
+
"epoch": 0.0195357196680044,
|
| 14911 |
+
"eval_loss": 2.121595621109009,
|
| 14912 |
+
"eval_runtime": 51.6838,
|
| 14913 |
+
"eval_samples_per_second": 197.238,
|
| 14914 |
+
"eval_steps_per_second": 1.548,
|
| 14915 |
+
"step": 191000
|
| 14916 |
+
},
|
| 14917 |
+
{
|
| 14918 |
+
"epoch": 0.01981480137754732,
|
| 14919 |
+
"grad_norm": 1.5259544849395752,
|
| 14920 |
+
"learning_rate": 2.2976803694336256e-05,
|
| 14921 |
+
"loss": 1.8279,
|
| 14922 |
+
"step": 191100
|
| 14923 |
+
},
|
| 14924 |
+
{
|
| 14925 |
+
"epoch": 0.020093883087090238,
|
| 14926 |
+
"grad_norm": 1.6435580253601074,
|
| 14927 |
+
"learning_rate": 2.2954736630563375e-05,
|
| 14928 |
+
"loss": 1.8291,
|
| 14929 |
+
"step": 191200
|
| 14930 |
+
},
|
| 14931 |
+
{
|
| 14932 |
+
"epoch": 0.02037296479663316,
|
| 14933 |
+
"grad_norm": 1.6680907011032104,
|
| 14934 |
+
"learning_rate": 2.2932671170934405e-05,
|
| 14935 |
+
"loss": 1.834,
|
| 14936 |
+
"step": 191300
|
| 14937 |
+
},
|
| 14938 |
+
{
|
| 14939 |
+
"epoch": 0.020652046506176077,
|
| 14940 |
+
"grad_norm": 1.6637004613876343,
|
| 14941 |
+
"learning_rate": 2.2910607332755744e-05,
|
| 14942 |
+
"loss": 1.8067,
|
| 14943 |
+
"step": 191400
|
| 14944 |
+
},
|
| 14945 |
+
{
|
| 14946 |
+
"epoch": 0.020931128215719,
|
| 14947 |
+
"grad_norm": 1.5594576597213745,
|
| 14948 |
+
"learning_rate": 2.288854513333254e-05,
|
| 14949 |
+
"loss": 1.8132,
|
| 14950 |
+
"step": 191500
|
| 14951 |
+
},
|
| 14952 |
+
{
|
| 14953 |
+
"epoch": 0.02121020992526192,
|
| 14954 |
+
"grad_norm": 1.502920389175415,
|
| 14955 |
+
"learning_rate": 2.2866484589968654e-05,
|
| 14956 |
+
"loss": 1.8337,
|
| 14957 |
+
"step": 191600
|
| 14958 |
+
},
|
| 14959 |
+
{
|
| 14960 |
+
"epoch": 0.021489291634804838,
|
| 14961 |
+
"grad_norm": 1.566256046295166,
|
| 14962 |
+
"learning_rate": 2.2844425719966637e-05,
|
| 14963 |
+
"loss": 1.8216,
|
| 14964 |
+
"step": 191700
|
| 14965 |
+
},
|
| 14966 |
+
{
|
| 14967 |
+
"epoch": 0.02176837334434776,
|
| 14968 |
+
"grad_norm": 1.882520079612732,
|
| 14969 |
+
"learning_rate": 2.2822368540627736e-05,
|
| 14970 |
+
"loss": 1.8178,
|
| 14971 |
+
"step": 191800
|
| 14972 |
+
},
|
| 14973 |
+
{
|
| 14974 |
+
"epoch": 0.022047455053890677,
|
| 14975 |
+
"grad_norm": 1.5686990022659302,
|
| 14976 |
+
"learning_rate": 2.2800313069251867e-05,
|
| 14977 |
+
"loss": 1.831,
|
| 14978 |
+
"step": 191900
|
| 14979 |
+
},
|
| 14980 |
+
{
|
| 14981 |
+
"epoch": 0.0223265367634336,
|
| 14982 |
+
"grad_norm": 1.6161882877349854,
|
| 14983 |
+
"learning_rate": 2.2778259323137607e-05,
|
| 14984 |
+
"loss": 1.8236,
|
| 14985 |
+
"step": 192000
|
| 14986 |
+
},
|
| 14987 |
+
{
|
| 14988 |
+
"epoch": 0.0223265367634336,
|
| 14989 |
+
"eval_loss": 2.1250271797180176,
|
| 14990 |
+
"eval_runtime": 51.7856,
|
| 14991 |
+
"eval_samples_per_second": 196.85,
|
| 14992 |
+
"eval_steps_per_second": 1.545,
|
| 14993 |
+
"step": 192000
|
| 14994 |
+
},
|
| 14995 |
+
{
|
| 14996 |
+
"epoch": 0.022605618472976517,
|
| 14997 |
+
"grad_norm": 1.9454728364944458,
|
| 14998 |
+
"learning_rate": 2.27562073195822e-05,
|
| 14999 |
+
"loss": 1.8262,
|
| 15000 |
+
"step": 192100
|
| 15001 |
+
},
|
| 15002 |
+
{
|
| 15003 |
+
"epoch": 0.022884700182519438,
|
| 15004 |
+
"grad_norm": 1.568524956703186,
|
| 15005 |
+
"learning_rate": 2.273415707588148e-05,
|
| 15006 |
+
"loss": 1.8111,
|
| 15007 |
+
"step": 192200
|
| 15008 |
+
},
|
| 15009 |
+
{
|
| 15010 |
+
"epoch": 0.02316378189206236,
|
| 15011 |
+
"grad_norm": 1.6108800172805786,
|
| 15012 |
+
"learning_rate": 2.2712108609329933e-05,
|
| 15013 |
+
"loss": 1.8097,
|
| 15014 |
+
"step": 192300
|
| 15015 |
+
},
|
| 15016 |
+
{
|
| 15017 |
+
"epoch": 0.023442863601605277,
|
| 15018 |
+
"grad_norm": 1.5785143375396729,
|
| 15019 |
+
"learning_rate": 2.2690061937220656e-05,
|
| 15020 |
+
"loss": 1.8223,
|
| 15021 |
+
"step": 192400
|
| 15022 |
+
},
|
| 15023 |
+
{
|
| 15024 |
+
"epoch": 0.0237219453111482,
|
| 15025 |
+
"grad_norm": 2.498911142349243,
|
| 15026 |
+
"learning_rate": 2.2668017076845323e-05,
|
| 15027 |
+
"loss": 2.0084,
|
| 15028 |
+
"step": 192500
|
| 15029 |
+
},
|
| 15030 |
+
{
|
| 15031 |
+
"epoch": 0.024001027020691117,
|
| 15032 |
+
"grad_norm": 2.186514377593994,
|
| 15033 |
+
"learning_rate": 2.2645974045494175e-05,
|
| 15034 |
+
"loss": 2.48,
|
| 15035 |
+
"step": 192600
|
| 15036 |
+
},
|
| 15037 |
+
{
|
| 15038 |
+
"epoch": 0.024280108730234038,
|
| 15039 |
+
"grad_norm": 2.3486995697021484,
|
| 15040 |
+
"learning_rate": 2.2623932860456044e-05,
|
| 15041 |
+
"loss": 2.4545,
|
| 15042 |
+
"step": 192700
|
| 15043 |
+
},
|
| 15044 |
+
{
|
| 15045 |
+
"epoch": 0.02455919043977696,
|
| 15046 |
+
"grad_norm": 2.1500723361968994,
|
| 15047 |
+
"learning_rate": 2.2601893539018305e-05,
|
| 15048 |
+
"loss": 2.4442,
|
| 15049 |
+
"step": 192800
|
| 15050 |
+
},
|
| 15051 |
+
{
|
| 15052 |
+
"epoch": 0.024838272149319877,
|
| 15053 |
+
"grad_norm": 2.1858279705047607,
|
| 15054 |
+
"learning_rate": 2.2579856098466882e-05,
|
| 15055 |
+
"loss": 2.4291,
|
| 15056 |
+
"step": 192900
|
| 15057 |
+
},
|
| 15058 |
+
{
|
| 15059 |
+
"epoch": 0.0251173538588628,
|
| 15060 |
+
"grad_norm": 2.4530797004699707,
|
| 15061 |
+
"learning_rate": 2.2557820556086187e-05,
|
| 15062 |
+
"loss": 2.4252,
|
| 15063 |
+
"step": 193000
|
| 15064 |
+
},
|
| 15065 |
+
{
|
| 15066 |
+
"epoch": 0.0251173538588628,
|
| 15067 |
+
"eval_loss": 2.1376500129699707,
|
| 15068 |
+
"eval_runtime": 51.9091,
|
| 15069 |
+
"eval_samples_per_second": 196.382,
|
| 15070 |
+
"eval_steps_per_second": 1.541,
|
| 15071 |
+
"step": 193000
|
| 15072 |
+
},
|
| 15073 |
+
{
|
| 15074 |
+
"epoch": 0.025396435568405717,
|
| 15075 |
+
"grad_norm": 2.192619562149048,
|
| 15076 |
+
"learning_rate": 2.253578692915919e-05,
|
| 15077 |
+
"loss": 2.4244,
|
| 15078 |
+
"step": 193100
|
| 15079 |
+
},
|
| 15080 |
+
{
|
| 15081 |
+
"epoch": 0.025675517277948638,
|
| 15082 |
+
"grad_norm": 2.2540953159332275,
|
| 15083 |
+
"learning_rate": 2.2513755234967317e-05,
|
| 15084 |
+
"loss": 2.4187,
|
| 15085 |
+
"step": 193200
|
| 15086 |
+
},
|
| 15087 |
+
{
|
| 15088 |
+
"epoch": 0.025954598987491556,
|
| 15089 |
+
"grad_norm": 2.1056604385375977,
|
| 15090 |
+
"learning_rate": 2.2491725490790526e-05,
|
| 15091 |
+
"loss": 2.4017,
|
| 15092 |
+
"step": 193300
|
| 15093 |
+
},
|
| 15094 |
+
{
|
| 15095 |
+
"epoch": 0.026233680697034478,
|
| 15096 |
+
"grad_norm": 2.1589183807373047,
|
| 15097 |
+
"learning_rate": 2.2469697713907186e-05,
|
| 15098 |
+
"loss": 2.4083,
|
| 15099 |
+
"step": 193400
|
| 15100 |
+
},
|
| 15101 |
+
{
|
| 15102 |
+
"epoch": 0.0265127624065774,
|
| 15103 |
+
"grad_norm": 2.1547043323516846,
|
| 15104 |
+
"learning_rate": 2.244767192159417e-05,
|
| 15105 |
+
"loss": 2.4065,
|
| 15106 |
+
"step": 193500
|
| 15107 |
+
},
|
| 15108 |
+
{
|
| 15109 |
+
"epoch": 0.026791844116120317,
|
| 15110 |
+
"grad_norm": 2.057020425796509,
|
| 15111 |
+
"learning_rate": 2.2425648131126777e-05,
|
| 15112 |
+
"loss": 2.3981,
|
| 15113 |
+
"step": 193600
|
| 15114 |
+
},
|
| 15115 |
+
{
|
| 15116 |
+
"epoch": 0.02707092582566324,
|
| 15117 |
+
"grad_norm": 2.380244255065918,
|
| 15118 |
+
"learning_rate": 2.2403626359778753e-05,
|
| 15119 |
+
"loss": 2.404,
|
| 15120 |
+
"step": 193700
|
| 15121 |
+
},
|
| 15122 |
+
{
|
| 15123 |
+
"epoch": 0.027350007535206156,
|
| 15124 |
+
"grad_norm": 2.1975646018981934,
|
| 15125 |
+
"learning_rate": 2.2381606624822228e-05,
|
| 15126 |
+
"loss": 2.3931,
|
| 15127 |
+
"step": 193800
|
| 15128 |
+
},
|
| 15129 |
+
{
|
| 15130 |
+
"epoch": 0.027629089244749078,
|
| 15131 |
+
"grad_norm": 2.0740909576416016,
|
| 15132 |
+
"learning_rate": 2.2359588943527746e-05,
|
| 15133 |
+
"loss": 2.4027,
|
| 15134 |
+
"step": 193900
|
| 15135 |
+
},
|
| 15136 |
+
{
|
| 15137 |
+
"epoch": 0.027908170954292,
|
| 15138 |
+
"grad_norm": 2.2962470054626465,
|
| 15139 |
+
"learning_rate": 2.233757333316426e-05,
|
| 15140 |
+
"loss": 2.3949,
|
| 15141 |
+
"step": 194000
|
| 15142 |
+
},
|
| 15143 |
+
{
|
| 15144 |
+
"epoch": 0.027908170954292,
|
| 15145 |
+
"eval_loss": 2.148186206817627,
|
| 15146 |
+
"eval_runtime": 51.9071,
|
| 15147 |
+
"eval_samples_per_second": 196.389,
|
| 15148 |
+
"eval_steps_per_second": 1.541,
|
| 15149 |
+
"step": 194000
|
| 15150 |
+
},
|
| 15151 |
+
{
|
| 15152 |
+
"epoch": 0.028187252663834917,
|
| 15153 |
+
"grad_norm": 2.1983277797698975,
|
| 15154 |
+
"learning_rate": 2.2315559810999086e-05,
|
| 15155 |
+
"loss": 2.3911,
|
| 15156 |
+
"step": 194100
|
| 15157 |
+
},
|
| 15158 |
+
{
|
| 15159 |
+
"epoch": 0.02846633437337784,
|
| 15160 |
+
"grad_norm": 2.1726229190826416,
|
| 15161 |
+
"learning_rate": 2.2293548394297893e-05,
|
| 15162 |
+
"loss": 2.3763,
|
| 15163 |
+
"step": 194200
|
| 15164 |
+
},
|
| 15165 |
+
{
|
| 15166 |
+
"epoch": 0.028745416082920756,
|
| 15167 |
+
"grad_norm": 2.190869092941284,
|
| 15168 |
+
"learning_rate": 2.2271539100324705e-05,
|
| 15169 |
+
"loss": 2.3822,
|
| 15170 |
+
"step": 194300
|
| 15171 |
+
},
|
| 15172 |
+
{
|
| 15173 |
+
"epoch": 0.029024497792463678,
|
| 15174 |
+
"grad_norm": 2.150756359100342,
|
| 15175 |
+
"learning_rate": 2.22495319463419e-05,
|
| 15176 |
+
"loss": 2.383,
|
| 15177 |
+
"step": 194400
|
| 15178 |
+
},
|
| 15179 |
+
{
|
| 15180 |
+
"epoch": 0.0293035795020066,
|
| 15181 |
+
"grad_norm": 2.159919500350952,
|
| 15182 |
+
"learning_rate": 2.222752694961014e-05,
|
| 15183 |
+
"loss": 2.3799,
|
| 15184 |
+
"step": 194500
|
| 15185 |
+
},
|
| 15186 |
+
{
|
| 15187 |
+
"epoch": 0.029582661211549517,
|
| 15188 |
+
"grad_norm": 2.1796655654907227,
|
| 15189 |
+
"learning_rate": 2.2205524127388438e-05,
|
| 15190 |
+
"loss": 2.3804,
|
| 15191 |
+
"step": 194600
|
| 15192 |
+
},
|
| 15193 |
+
{
|
| 15194 |
+
"epoch": 0.02986174292109244,
|
| 15195 |
+
"grad_norm": 2.313180446624756,
|
| 15196 |
+
"learning_rate": 2.2183523496934052e-05,
|
| 15197 |
+
"loss": 2.3574,
|
| 15198 |
+
"step": 194700
|
| 15199 |
+
},
|
| 15200 |
+
{
|
| 15201 |
+
"epoch": 0.030140824630635357,
|
| 15202 |
+
"grad_norm": 2.2141001224517822,
|
| 15203 |
+
"learning_rate": 2.2161525075502565e-05,
|
| 15204 |
+
"loss": 2.3726,
|
| 15205 |
+
"step": 194800
|
| 15206 |
+
},
|
| 15207 |
+
{
|
| 15208 |
+
"epoch": 0.030419906340178278,
|
| 15209 |
+
"grad_norm": 2.145921468734741,
|
| 15210 |
+
"learning_rate": 2.2139528880347807e-05,
|
| 15211 |
+
"loss": 2.3633,
|
| 15212 |
+
"step": 194900
|
| 15213 |
+
},
|
| 15214 |
+
{
|
| 15215 |
+
"epoch": 0.030698988049721196,
|
| 15216 |
+
"grad_norm": 2.279843330383301,
|
| 15217 |
+
"learning_rate": 2.2117534928721878e-05,
|
| 15218 |
+
"loss": 2.3747,
|
| 15219 |
+
"step": 195000
|
| 15220 |
+
},
|
| 15221 |
+
{
|
| 15222 |
+
"epoch": 0.030698988049721196,
|
| 15223 |
+
"eval_loss": 2.156066417694092,
|
| 15224 |
+
"eval_runtime": 51.6132,
|
| 15225 |
+
"eval_samples_per_second": 197.508,
|
| 15226 |
+
"eval_steps_per_second": 1.55,
|
| 15227 |
+
"step": 195000
|
| 15228 |
+
},
|
| 15229 |
+
{
|
| 15230 |
+
"epoch": 0.030978069759264117,
|
| 15231 |
+
"grad_norm": 2.107222557067871,
|
| 15232 |
+
"learning_rate": 2.2095543237875088e-05,
|
| 15233 |
+
"loss": 2.3612,
|
| 15234 |
+
"step": 195100
|
| 15235 |
+
},
|
| 15236 |
+
{
|
| 15237 |
+
"epoch": 0.03125715146880704,
|
| 15238 |
+
"grad_norm": 2.1660873889923096,
|
| 15239 |
+
"learning_rate": 2.207355382505599e-05,
|
| 15240 |
+
"loss": 2.3562,
|
| 15241 |
+
"step": 195200
|
| 15242 |
+
},
|
| 15243 |
+
{
|
| 15244 |
+
"epoch": 0.03153623317834996,
|
| 15245 |
+
"grad_norm": 2.206403970718384,
|
| 15246 |
+
"learning_rate": 2.2051566707511362e-05,
|
| 15247 |
+
"loss": 2.371,
|
| 15248 |
+
"step": 195300
|
| 15249 |
+
},
|
| 15250 |
+
{
|
| 15251 |
+
"epoch": 0.031815314887892875,
|
| 15252 |
+
"grad_norm": 2.277531147003174,
|
| 15253 |
+
"learning_rate": 2.2029581902486176e-05,
|
| 15254 |
+
"loss": 2.3571,
|
| 15255 |
+
"step": 195400
|
| 15256 |
+
},
|
| 15257 |
+
{
|
| 15258 |
+
"epoch": 0.0320943965974358,
|
| 15259 |
+
"grad_norm": 2.041177749633789,
|
| 15260 |
+
"learning_rate": 2.200759942722357e-05,
|
| 15261 |
+
"loss": 2.3658,
|
| 15262 |
+
"step": 195500
|
| 15263 |
+
},
|
| 15264 |
+
{
|
| 15265 |
+
"epoch": 0.03237347830697872,
|
| 15266 |
+
"grad_norm": 2.2721259593963623,
|
| 15267 |
+
"learning_rate": 2.1985619298964884e-05,
|
| 15268 |
+
"loss": 2.3531,
|
| 15269 |
+
"step": 195600
|
| 15270 |
+
},
|
| 15271 |
+
{
|
| 15272 |
+
"epoch": 0.032652560016521635,
|
| 15273 |
+
"grad_norm": 2.2664246559143066,
|
| 15274 |
+
"learning_rate": 2.1963641534949597e-05,
|
| 15275 |
+
"loss": 2.3522,
|
| 15276 |
+
"step": 195700
|
| 15277 |
+
},
|
| 15278 |
+
{
|
| 15279 |
+
"epoch": 0.03293164172606456,
|
| 15280 |
+
"grad_norm": 2.323575258255005,
|
| 15281 |
+
"learning_rate": 2.1941666152415343e-05,
|
| 15282 |
+
"loss": 2.342,
|
| 15283 |
+
"step": 195800
|
| 15284 |
+
},
|
| 15285 |
+
{
|
| 15286 |
+
"epoch": 0.03321072343560748,
|
| 15287 |
+
"grad_norm": 2.1575205326080322,
|
| 15288 |
+
"learning_rate": 2.1919693168597887e-05,
|
| 15289 |
+
"loss": 2.3505,
|
| 15290 |
+
"step": 195900
|
| 15291 |
+
},
|
| 15292 |
+
{
|
| 15293 |
+
"epoch": 0.033489805145150396,
|
| 15294 |
+
"grad_norm": 2.265693187713623,
|
| 15295 |
+
"learning_rate": 2.1897722600731107e-05,
|
| 15296 |
+
"loss": 2.3428,
|
| 15297 |
+
"step": 196000
|
| 15298 |
+
},
|
| 15299 |
+
{
|
| 15300 |
+
"epoch": 0.033489805145150396,
|
| 15301 |
+
"eval_loss": 2.1471190452575684,
|
| 15302 |
+
"eval_runtime": 51.714,
|
| 15303 |
+
"eval_samples_per_second": 197.123,
|
| 15304 |
+
"eval_steps_per_second": 1.547,
|
| 15305 |
+
"step": 196000
|
| 15306 |
+
},
|
| 15307 |
+
{
|
| 15308 |
+
"epoch": 0.033768886854693314,
|
| 15309 |
+
"grad_norm": 2.1313159465789795,
|
| 15310 |
+
"learning_rate": 2.187575446604699e-05,
|
| 15311 |
+
"loss": 2.344,
|
| 15312 |
+
"step": 196100
|
| 15313 |
+
},
|
| 15314 |
+
{
|
| 15315 |
+
"epoch": 0.03404796856423624,
|
| 15316 |
+
"grad_norm": 2.165553569793701,
|
| 15317 |
+
"learning_rate": 2.1853788781775626e-05,
|
| 15318 |
+
"loss": 2.3369,
|
| 15319 |
+
"step": 196200
|
| 15320 |
+
},
|
| 15321 |
+
{
|
| 15322 |
+
"epoch": 0.03432705027377916,
|
| 15323 |
+
"grad_norm": 2.348489999771118,
|
| 15324 |
+
"learning_rate": 2.1831825565145155e-05,
|
| 15325 |
+
"loss": 2.3325,
|
| 15326 |
+
"step": 196300
|
| 15327 |
+
},
|
| 15328 |
+
{
|
| 15329 |
+
"epoch": 0.034606131983322075,
|
| 15330 |
+
"grad_norm": 2.2844085693359375,
|
| 15331 |
+
"learning_rate": 2.1809864833381816e-05,
|
| 15332 |
+
"loss": 2.3458,
|
| 15333 |
+
"step": 196400
|
| 15334 |
+
},
|
| 15335 |
+
{
|
| 15336 |
+
"epoch": 0.034885213692865,
|
| 15337 |
+
"grad_norm": 2.1077094078063965,
|
| 15338 |
+
"learning_rate": 2.1787906603709863e-05,
|
| 15339 |
+
"loss": 2.3301,
|
| 15340 |
+
"step": 196500
|
| 15341 |
+
},
|
| 15342 |
+
{
|
| 15343 |
+
"epoch": 0.03516429540240792,
|
| 15344 |
+
"grad_norm": 2.2360150814056396,
|
| 15345 |
+
"learning_rate": 2.1765950893351627e-05,
|
| 15346 |
+
"loss": 2.3357,
|
| 15347 |
+
"step": 196600
|
| 15348 |
+
},
|
| 15349 |
+
{
|
| 15350 |
+
"epoch": 0.035443377111950836,
|
| 15351 |
+
"grad_norm": 2.1342897415161133,
|
| 15352 |
+
"learning_rate": 2.1743997719527423e-05,
|
| 15353 |
+
"loss": 2.3309,
|
| 15354 |
+
"step": 196700
|
| 15355 |
+
},
|
| 15356 |
+
{
|
| 15357 |
+
"epoch": 0.035722458821493754,
|
| 15358 |
+
"grad_norm": 2.3143725395202637,
|
| 15359 |
+
"learning_rate": 2.17220470994556e-05,
|
| 15360 |
+
"loss": 2.3497,
|
| 15361 |
+
"step": 196800
|
| 15362 |
+
},
|
| 15363 |
+
{
|
| 15364 |
+
"epoch": 0.03600154053103668,
|
| 15365 |
+
"grad_norm": 2.207287549972534,
|
| 15366 |
+
"learning_rate": 2.170009905035251e-05,
|
| 15367 |
+
"loss": 2.3268,
|
| 15368 |
+
"step": 196900
|
| 15369 |
+
},
|
| 15370 |
+
{
|
| 15371 |
+
"epoch": 0.036280622240579596,
|
| 15372 |
+
"grad_norm": 2.1440131664276123,
|
| 15373 |
+
"learning_rate": 2.167815358943248e-05,
|
| 15374 |
+
"loss": 2.3535,
|
| 15375 |
+
"step": 197000
|
| 15376 |
+
},
|
| 15377 |
+
{
|
| 15378 |
+
"epoch": 0.036280622240579596,
|
| 15379 |
+
"eval_loss": 2.1492607593536377,
|
| 15380 |
+
"eval_runtime": 51.6958,
|
| 15381 |
+
"eval_samples_per_second": 197.192,
|
| 15382 |
+
"eval_steps_per_second": 1.548,
|
| 15383 |
+
"step": 197000
|
| 15384 |
+
},
|
| 15385 |
+
{
|
| 15386 |
+
"epoch": 0.036559703950122514,
|
| 15387 |
+
"grad_norm": 2.189824342727661,
|
| 15388 |
+
"learning_rate": 2.165621073390779e-05,
|
| 15389 |
+
"loss": 2.3368,
|
| 15390 |
+
"step": 197100
|
| 15391 |
+
},
|
| 15392 |
+
{
|
| 15393 |
+
"epoch": 0.03683878565966544,
|
| 15394 |
+
"grad_norm": 2.1990151405334473,
|
| 15395 |
+
"learning_rate": 2.16342705009887e-05,
|
| 15396 |
+
"loss": 2.3344,
|
| 15397 |
+
"step": 197200
|
| 15398 |
+
},
|
| 15399 |
+
{
|
| 15400 |
+
"epoch": 0.03711786736920836,
|
| 15401 |
+
"grad_norm": 2.077488899230957,
|
| 15402 |
+
"learning_rate": 2.1612332907883405e-05,
|
| 15403 |
+
"loss": 2.3267,
|
| 15404 |
+
"step": 197300
|
| 15405 |
+
},
|
| 15406 |
+
{
|
| 15407 |
+
"epoch": 0.037396949078751275,
|
| 15408 |
+
"grad_norm": 2.2698981761932373,
|
| 15409 |
+
"learning_rate": 2.1590397971798025e-05,
|
| 15410 |
+
"loss": 2.3285,
|
| 15411 |
+
"step": 197400
|
| 15412 |
+
},
|
| 15413 |
+
{
|
| 15414 |
+
"epoch": 0.0376760307882942,
|
| 15415 |
+
"grad_norm": 2.2117862701416016,
|
| 15416 |
+
"learning_rate": 2.1568465709936615e-05,
|
| 15417 |
+
"loss": 2.322,
|
| 15418 |
+
"step": 197500
|
| 15419 |
+
},
|
| 15420 |
+
{
|
| 15421 |
+
"epoch": 0.03795511249783712,
|
| 15422 |
+
"grad_norm": 2.194138288497925,
|
| 15423 |
+
"learning_rate": 2.15465361395011e-05,
|
| 15424 |
+
"loss": 2.3228,
|
| 15425 |
+
"step": 197600
|
| 15426 |
+
},
|
| 15427 |
+
{
|
| 15428 |
+
"epoch": 0.038234194207380036,
|
| 15429 |
+
"grad_norm": 2.151017665863037,
|
| 15430 |
+
"learning_rate": 2.1524609277691327e-05,
|
| 15431 |
+
"loss": 2.3376,
|
| 15432 |
+
"step": 197700
|
| 15433 |
+
},
|
| 15434 |
+
{
|
| 15435 |
+
"epoch": 0.038513275916922954,
|
| 15436 |
+
"grad_norm": 2.273414373397827,
|
| 15437 |
+
"learning_rate": 2.1502685141704992e-05,
|
| 15438 |
+
"loss": 2.3298,
|
| 15439 |
+
"step": 197800
|
| 15440 |
+
},
|
| 15441 |
+
{
|
| 15442 |
+
"epoch": 0.03879235762646588,
|
| 15443 |
+
"grad_norm": 2.2569565773010254,
|
| 15444 |
+
"learning_rate": 2.148076374873768e-05,
|
| 15445 |
+
"loss": 2.3371,
|
| 15446 |
+
"step": 197900
|
| 15447 |
+
},
|
| 15448 |
+
{
|
| 15449 |
+
"epoch": 0.0390714393360088,
|
| 15450 |
+
"grad_norm": 2.109938621520996,
|
| 15451 |
+
"learning_rate": 2.1458845115982783e-05,
|
| 15452 |
+
"loss": 2.3074,
|
| 15453 |
+
"step": 198000
|
| 15454 |
+
},
|
| 15455 |
+
{
|
| 15456 |
+
"epoch": 0.0390714393360088,
|
| 15457 |
+
"eval_loss": 2.156459331512451,
|
| 15458 |
+
"eval_runtime": 51.6968,
|
| 15459 |
+
"eval_samples_per_second": 197.188,
|
| 15460 |
+
"eval_steps_per_second": 1.547,
|
| 15461 |
+
"step": 198000
|
| 15462 |
+
},
|
| 15463 |
+
{
|
| 15464 |
+
"epoch": 0.039350521045551715,
|
| 15465 |
+
"grad_norm": 2.1745998859405518,
|
| 15466 |
+
"learning_rate": 2.1436929260631578e-05,
|
| 15467 |
+
"loss": 2.3337,
|
| 15468 |
+
"step": 198100
|
| 15469 |
+
},
|
| 15470 |
+
{
|
| 15471 |
+
"epoch": 0.03962960275509464,
|
| 15472 |
+
"grad_norm": 2.120976448059082,
|
| 15473 |
+
"learning_rate": 2.141501619987313e-05,
|
| 15474 |
+
"loss": 2.3231,
|
| 15475 |
+
"step": 198200
|
| 15476 |
+
},
|
| 15477 |
+
{
|
| 15478 |
+
"epoch": 0.03990868446463756,
|
| 15479 |
+
"grad_norm": 2.1885461807250977,
|
| 15480 |
+
"learning_rate": 2.139310595089434e-05,
|
| 15481 |
+
"loss": 2.3277,
|
| 15482 |
+
"step": 198300
|
| 15483 |
+
},
|
| 15484 |
+
{
|
| 15485 |
+
"epoch": 0.040187766174180475,
|
| 15486 |
+
"grad_norm": 2.2620506286621094,
|
| 15487 |
+
"learning_rate": 2.137119853087986e-05,
|
| 15488 |
+
"loss": 2.3335,
|
| 15489 |
+
"step": 198400
|
| 15490 |
+
},
|
| 15491 |
+
{
|
| 15492 |
+
"epoch": 0.04046684788372339,
|
| 15493 |
+
"grad_norm": 2.1864798069000244,
|
| 15494 |
+
"learning_rate": 2.1349293957012156e-05,
|
| 15495 |
+
"loss": 2.3239,
|
| 15496 |
+
"step": 198500
|
| 15497 |
+
},
|
| 15498 |
+
{
|
| 15499 |
+
"epoch": 0.04074592959326632,
|
| 15500 |
+
"grad_norm": 2.1792876720428467,
|
| 15501 |
+
"learning_rate": 2.1327392246471463e-05,
|
| 15502 |
+
"loss": 2.3166,
|
| 15503 |
+
"step": 198600
|
| 15504 |
+
},
|
| 15505 |
+
{
|
| 15506 |
+
"epoch": 0.041025011302809236,
|
| 15507 |
+
"grad_norm": 2.264899730682373,
|
| 15508 |
+
"learning_rate": 2.1305493416435765e-05,
|
| 15509 |
+
"loss": 2.3171,
|
| 15510 |
+
"step": 198700
|
| 15511 |
+
},
|
| 15512 |
+
{
|
| 15513 |
+
"epoch": 0.041304093012352154,
|
| 15514 |
+
"grad_norm": 1.9806472063064575,
|
| 15515 |
+
"learning_rate": 2.1283597484080765e-05,
|
| 15516 |
+
"loss": 2.3247,
|
| 15517 |
+
"step": 198800
|
| 15518 |
+
},
|
| 15519 |
+
{
|
| 15520 |
+
"epoch": 0.04158317472189508,
|
| 15521 |
+
"grad_norm": 2.1849722862243652,
|
| 15522 |
+
"learning_rate": 2.1261704466579928e-05,
|
| 15523 |
+
"loss": 2.3158,
|
| 15524 |
+
"step": 198900
|
| 15525 |
+
},
|
| 15526 |
+
{
|
| 15527 |
+
"epoch": 0.041862256431438,
|
| 15528 |
+
"grad_norm": 2.025466203689575,
|
| 15529 |
+
"learning_rate": 2.1239814381104417e-05,
|
| 15530 |
+
"loss": 2.3061,
|
| 15531 |
+
"step": 199000
|
| 15532 |
+
},
|
| 15533 |
+
{
|
| 15534 |
+
"epoch": 0.041862256431438,
|
| 15535 |
+
"eval_loss": 2.146428108215332,
|
| 15536 |
+
"eval_runtime": 51.7966,
|
| 15537 |
+
"eval_samples_per_second": 196.808,
|
| 15538 |
+
"eval_steps_per_second": 1.545,
|
| 15539 |
+
"step": 199000
|
| 15540 |
+
},
|
| 15541 |
+
{
|
| 15542 |
+
"epoch": 0.00027908170954291995,
|
| 15543 |
+
"grad_norm": 2.0780715942382812,
|
| 15544 |
+
"learning_rate": 2.1217927244823092e-05,
|
| 15545 |
+
"loss": 2.3137,
|
| 15546 |
+
"step": 199100
|
| 15547 |
+
},
|
| 15548 |
+
{
|
| 15549 |
+
"epoch": 0.0005581634190858399,
|
| 15550 |
+
"grad_norm": 2.1887388229370117,
|
| 15551 |
+
"learning_rate": 2.1196043074902503e-05,
|
| 15552 |
+
"loss": 2.311,
|
| 15553 |
+
"step": 199200
|
| 15554 |
+
},
|
| 15555 |
+
{
|
| 15556 |
+
"epoch": 0.0008372451286287599,
|
| 15557 |
+
"grad_norm": 2.110805034637451,
|
| 15558 |
+
"learning_rate": 2.1174161888506867e-05,
|
| 15559 |
+
"loss": 2.3166,
|
| 15560 |
+
"step": 199300
|
| 15561 |
+
},
|
| 15562 |
+
{
|
| 15563 |
+
"epoch": 0.0011163268381716798,
|
| 15564 |
+
"grad_norm": 2.1829800605773926,
|
| 15565 |
+
"learning_rate": 2.1152283702798077e-05,
|
| 15566 |
+
"loss": 2.3035,
|
| 15567 |
+
"step": 199400
|
| 15568 |
+
},
|
| 15569 |
+
{
|
| 15570 |
+
"epoch": 0.0013954085477146,
|
| 15571 |
+
"grad_norm": 2.2523720264434814,
|
| 15572 |
+
"learning_rate": 2.1130408534935664e-05,
|
| 15573 |
+
"loss": 2.3104,
|
| 15574 |
+
"step": 199500
|
| 15575 |
+
},
|
| 15576 |
+
{
|
| 15577 |
+
"epoch": 0.0016744902572575198,
|
| 15578 |
+
"grad_norm": 2.268869400024414,
|
| 15579 |
+
"learning_rate": 2.1108536402076777e-05,
|
| 15580 |
+
"loss": 2.3095,
|
| 15581 |
+
"step": 199600
|
| 15582 |
+
},
|
| 15583 |
+
{
|
| 15584 |
+
"epoch": 0.00195357196680044,
|
| 15585 |
+
"grad_norm": 2.352266788482666,
|
| 15586 |
+
"learning_rate": 2.108666732137622e-05,
|
| 15587 |
+
"loss": 2.3235,
|
| 15588 |
+
"step": 199700
|
| 15589 |
+
},
|
| 15590 |
+
{
|
| 15591 |
+
"epoch": 0.0022326536763433596,
|
| 15592 |
+
"grad_norm": 2.0702219009399414,
|
| 15593 |
+
"learning_rate": 2.106480130998636e-05,
|
| 15594 |
+
"loss": 2.301,
|
| 15595 |
+
"step": 199800
|
| 15596 |
+
},
|
| 15597 |
+
{
|
| 15598 |
+
"epoch": 0.0025117353858862797,
|
| 15599 |
+
"grad_norm": 2.219024896621704,
|
| 15600 |
+
"learning_rate": 2.1042938385057202e-05,
|
| 15601 |
+
"loss": 2.2952,
|
| 15602 |
+
"step": 199900
|
| 15603 |
+
},
|
| 15604 |
+
{
|
| 15605 |
+
"epoch": 0.0027908170954292,
|
| 15606 |
+
"grad_norm": 2.2651102542877197,
|
| 15607 |
+
"learning_rate": 2.102107856373628e-05,
|
| 15608 |
+
"loss": 2.302,
|
| 15609 |
+
"step": 200000
|
| 15610 |
+
},
|
| 15611 |
+
{
|
| 15612 |
+
"epoch": 0.0027908170954292,
|
| 15613 |
+
"eval_loss": 2.1563775539398193,
|
| 15614 |
+
"eval_runtime": 52.0565,
|
| 15615 |
+
"eval_samples_per_second": 195.826,
|
| 15616 |
+
"eval_steps_per_second": 1.537,
|
| 15617 |
+
"step": 200000
|
| 15618 |
+
},
|
| 15619 |
+
{
|
| 15620 |
+
"epoch": 0.00306989880497212,
|
| 15621 |
+
"grad_norm": 2.2001454830169678,
|
| 15622 |
+
"learning_rate": 2.0999221863168736e-05,
|
| 15623 |
+
"loss": 2.3131,
|
| 15624 |
+
"step": 200100
|
| 15625 |
+
},
|
| 15626 |
+
{
|
| 15627 |
+
"epoch": 0.0033489805145150396,
|
| 15628 |
+
"grad_norm": 2.1782033443450928,
|
| 15629 |
+
"learning_rate": 2.0977368300497246e-05,
|
| 15630 |
+
"loss": 2.3084,
|
| 15631 |
+
"step": 200200
|
| 15632 |
+
},
|
| 15633 |
+
{
|
| 15634 |
+
"epoch": 0.0036280622240579597,
|
| 15635 |
+
"grad_norm": 2.282090663909912,
|
| 15636 |
+
"learning_rate": 2.095551789286204e-05,
|
| 15637 |
+
"loss": 2.2983,
|
| 15638 |
+
"step": 200300
|
| 15639 |
+
},
|
| 15640 |
+
{
|
| 15641 |
+
"epoch": 0.00390714393360088,
|
| 15642 |
+
"grad_norm": 2.1379668712615967,
|
| 15643 |
+
"learning_rate": 2.0933670657400838e-05,
|
| 15644 |
+
"loss": 2.2989,
|
| 15645 |
+
"step": 200400
|
| 15646 |
+
},
|
| 15647 |
+
{
|
| 15648 |
+
"epoch": 0.0041862256431437995,
|
| 15649 |
+
"grad_norm": 2.3254175186157227,
|
| 15650 |
+
"learning_rate": 2.091182661124891e-05,
|
| 15651 |
+
"loss": 2.3211,
|
| 15652 |
+
"step": 200500
|
| 15653 |
+
},
|
| 15654 |
+
{
|
| 15655 |
+
"epoch": 0.004465307352686719,
|
| 15656 |
+
"grad_norm": 2.112151622772217,
|
| 15657 |
+
"learning_rate": 2.0889985771539002e-05,
|
| 15658 |
+
"loss": 2.288,
|
| 15659 |
+
"step": 200600
|
| 15660 |
+
},
|
| 15661 |
+
{
|
| 15662 |
+
"epoch": 0.00474438906222964,
|
| 15663 |
+
"grad_norm": 2.2548341751098633,
|
| 15664 |
+
"learning_rate": 2.0868148155401356e-05,
|
| 15665 |
+
"loss": 2.3027,
|
| 15666 |
+
"step": 200700
|
| 15667 |
+
},
|
| 15668 |
+
{
|
| 15669 |
+
"epoch": 0.005023470771772559,
|
| 15670 |
+
"grad_norm": 2.3280234336853027,
|
| 15671 |
+
"learning_rate": 2.0846313779963696e-05,
|
| 15672 |
+
"loss": 2.3049,
|
| 15673 |
+
"step": 200800
|
| 15674 |
+
},
|
| 15675 |
+
{
|
| 15676 |
+
"epoch": 0.00530255248131548,
|
| 15677 |
+
"grad_norm": 2.256028175354004,
|
| 15678 |
+
"learning_rate": 2.0824482662351167e-05,
|
| 15679 |
+
"loss": 2.3023,
|
| 15680 |
+
"step": 200900
|
| 15681 |
+
},
|
| 15682 |
+
{
|
| 15683 |
+
"epoch": 0.0055816341908584,
|
| 15684 |
+
"grad_norm": 2.195711851119995,
|
| 15685 |
+
"learning_rate": 2.0802654819686398e-05,
|
| 15686 |
+
"loss": 2.2887,
|
| 15687 |
+
"step": 201000
|
| 15688 |
+
},
|
| 15689 |
+
{
|
| 15690 |
+
"epoch": 0.0055816341908584,
|
| 15691 |
+
"eval_loss": 2.14955997467041,
|
| 15692 |
+
"eval_runtime": 51.6475,
|
| 15693 |
+
"eval_samples_per_second": 197.376,
|
| 15694 |
+
"eval_steps_per_second": 1.549,
|
| 15695 |
+
"step": 201000
|
| 15696 |
+
},
|
| 15697 |
+
{
|
| 15698 |
+
"epoch": 0.005860715900401319,
|
| 15699 |
+
"grad_norm": 2.0905580520629883,
|
| 15700 |
+
"learning_rate": 2.0780830269089423e-05,
|
| 15701 |
+
"loss": 2.2914,
|
| 15702 |
+
"step": 201100
|
| 15703 |
+
},
|
| 15704 |
+
{
|
| 15705 |
+
"epoch": 0.00613979760994424,
|
| 15706 |
+
"grad_norm": 2.1279406547546387,
|
| 15707 |
+
"learning_rate": 2.0759009027677727e-05,
|
| 15708 |
+
"loss": 2.3037,
|
| 15709 |
+
"step": 201200
|
| 15710 |
+
},
|
| 15711 |
+
{
|
| 15712 |
+
"epoch": 0.0064188793194871595,
|
| 15713 |
+
"grad_norm": 2.178835868835449,
|
| 15714 |
+
"learning_rate": 2.0737191112566146e-05,
|
| 15715 |
+
"loss": 2.2989,
|
| 15716 |
+
"step": 201300
|
| 15717 |
+
},
|
| 15718 |
+
{
|
| 15719 |
+
"epoch": 0.006697961029030079,
|
| 15720 |
+
"grad_norm": 2.2267632484436035,
|
| 15721 |
+
"learning_rate": 2.071537654086696e-05,
|
| 15722 |
+
"loss": 2.2928,
|
| 15723 |
+
"step": 201400
|
| 15724 |
+
},
|
| 15725 |
+
{
|
| 15726 |
+
"epoch": 0.006977042738573,
|
| 15727 |
+
"grad_norm": 2.310661792755127,
|
| 15728 |
+
"learning_rate": 2.0693565329689793e-05,
|
| 15729 |
+
"loss": 2.3337,
|
| 15730 |
+
"step": 201500
|
| 15731 |
+
},
|
| 15732 |
+
{
|
| 15733 |
+
"epoch": 0.0072561244481159195,
|
| 15734 |
+
"grad_norm": 2.2507314682006836,
|
| 15735 |
+
"learning_rate": 2.0671757496141665e-05,
|
| 15736 |
+
"loss": 2.3269,
|
| 15737 |
+
"step": 201600
|
| 15738 |
+
},
|
| 15739 |
+
{
|
| 15740 |
+
"epoch": 0.007535206157658839,
|
| 15741 |
+
"grad_norm": 2.161654472351074,
|
| 15742 |
+
"learning_rate": 2.0649953057326904e-05,
|
| 15743 |
+
"loss": 2.3191,
|
| 15744 |
+
"step": 201700
|
| 15745 |
+
},
|
| 15746 |
+
{
|
| 15747 |
+
"epoch": 0.00781428786720176,
|
| 15748 |
+
"grad_norm": 2.2663004398345947,
|
| 15749 |
+
"learning_rate": 2.0628152030347214e-05,
|
| 15750 |
+
"loss": 2.3153,
|
| 15751 |
+
"step": 201800
|
| 15752 |
+
},
|
| 15753 |
+
{
|
| 15754 |
+
"epoch": 0.00809336957674468,
|
| 15755 |
+
"grad_norm": 2.2835566997528076,
|
| 15756 |
+
"learning_rate": 2.06063544323016e-05,
|
| 15757 |
+
"loss": 2.3127,
|
| 15758 |
+
"step": 201900
|
| 15759 |
+
},
|
| 15760 |
+
{
|
| 15761 |
+
"epoch": 0.008372451286287599,
|
| 15762 |
+
"grad_norm": 2.1445398330688477,
|
| 15763 |
+
"learning_rate": 2.0584560280286397e-05,
|
| 15764 |
+
"loss": 2.2974,
|
| 15765 |
+
"step": 202000
|
| 15766 |
+
},
|
| 15767 |
+
{
|
| 15768 |
+
"epoch": 0.008372451286287599,
|
| 15769 |
+
"eval_loss": 2.17156720161438,
|
| 15770 |
+
"eval_runtime": 51.3282,
|
| 15771 |
+
"eval_samples_per_second": 198.604,
|
| 15772 |
+
"eval_steps_per_second": 1.559,
|
| 15773 |
+
"step": 202000
|
| 15774 |
+
},
|
| 15775 |
+
{
|
| 15776 |
+
"epoch": 0.008651532995830519,
|
| 15777 |
+
"grad_norm": 2.2205893993377686,
|
| 15778 |
+
"learning_rate": 2.0562769591395203e-05,
|
| 15779 |
+
"loss": 2.3078,
|
| 15780 |
+
"step": 202100
|
| 15781 |
+
},
|
| 15782 |
+
{
|
| 15783 |
+
"epoch": 0.008930614705373438,
|
| 15784 |
+
"grad_norm": 2.205244541168213,
|
| 15785 |
+
"learning_rate": 2.054098238271894e-05,
|
| 15786 |
+
"loss": 2.2938,
|
| 15787 |
+
"step": 202200
|
| 15788 |
+
},
|
| 15789 |
+
{
|
| 15790 |
+
"epoch": 0.00920969641491636,
|
| 15791 |
+
"grad_norm": 2.2526943683624268,
|
| 15792 |
+
"learning_rate": 2.0519198671345784e-05,
|
| 15793 |
+
"loss": 2.2967,
|
| 15794 |
+
"step": 202300
|
| 15795 |
+
},
|
| 15796 |
+
{
|
| 15797 |
+
"epoch": 0.00948877812445928,
|
| 15798 |
+
"grad_norm": 2.3271262645721436,
|
| 15799 |
+
"learning_rate": 2.049741847436116e-05,
|
| 15800 |
+
"loss": 2.2701,
|
| 15801 |
+
"step": 202400
|
| 15802 |
+
},
|
| 15803 |
+
{
|
| 15804 |
+
"epoch": 0.0097678598340022,
|
| 15805 |
+
"grad_norm": 2.225120782852173,
|
| 15806 |
+
"learning_rate": 2.047564180884775e-05,
|
| 15807 |
+
"loss": 2.3035,
|
| 15808 |
+
"step": 202500
|
| 15809 |
+
},
|
| 15810 |
+
{
|
| 15811 |
+
"epoch": 0.010046941543545119,
|
| 15812 |
+
"grad_norm": 2.1193225383758545,
|
| 15813 |
+
"learning_rate": 2.0453868691885446e-05,
|
| 15814 |
+
"loss": 2.287,
|
| 15815 |
+
"step": 202600
|
| 15816 |
+
},
|
| 15817 |
+
{
|
| 15818 |
+
"epoch": 0.010326023253088039,
|
| 15819 |
+
"grad_norm": 2.305154800415039,
|
| 15820 |
+
"learning_rate": 2.043209914055138e-05,
|
| 15821 |
+
"loss": 2.2997,
|
| 15822 |
+
"step": 202700
|
| 15823 |
+
},
|
| 15824 |
+
{
|
| 15825 |
+
"epoch": 0.01060510496263096,
|
| 15826 |
+
"grad_norm": 2.1183183193206787,
|
| 15827 |
+
"learning_rate": 2.041033317191989e-05,
|
| 15828 |
+
"loss": 2.3005,
|
| 15829 |
+
"step": 202800
|
| 15830 |
+
},
|
| 15831 |
+
{
|
| 15832 |
+
"epoch": 0.01088418667217388,
|
| 15833 |
+
"grad_norm": 2.175776481628418,
|
| 15834 |
+
"learning_rate": 2.0388570803062465e-05,
|
| 15835 |
+
"loss": 2.2992,
|
| 15836 |
+
"step": 202900
|
| 15837 |
+
},
|
| 15838 |
+
{
|
| 15839 |
+
"epoch": 0.0111632683817168,
|
| 15840 |
+
"grad_norm": 2.2266392707824707,
|
| 15841 |
+
"learning_rate": 2.036681205104782e-05,
|
| 15842 |
+
"loss": 2.2959,
|
| 15843 |
+
"step": 203000
|
| 15844 |
+
},
|
| 15845 |
+
{
|
| 15846 |
+
"epoch": 0.0111632683817168,
|
| 15847 |
+
"eval_loss": 2.167436122894287,
|
| 15848 |
+
"eval_runtime": 51.341,
|
| 15849 |
+
"eval_samples_per_second": 198.555,
|
| 15850 |
+
"eval_steps_per_second": 1.558,
|
| 15851 |
+
"step": 203000
|
| 15852 |
+
},
|
| 15853 |
+
{
|
| 15854 |
+
"epoch": 0.011442350091259719,
|
| 15855 |
+
"grad_norm": 2.2803258895874023,
|
| 15856 |
+
"learning_rate": 2.0345056932941793e-05,
|
| 15857 |
+
"loss": 2.2866,
|
| 15858 |
+
"step": 203100
|
| 15859 |
+
},
|
| 15860 |
+
{
|
| 15861 |
+
"epoch": 0.011721431800802639,
|
| 15862 |
+
"grad_norm": 2.1691677570343018,
|
| 15863 |
+
"learning_rate": 2.032330546580741e-05,
|
| 15864 |
+
"loss": 2.2798,
|
| 15865 |
+
"step": 203200
|
| 15866 |
+
},
|
| 15867 |
+
{
|
| 15868 |
+
"epoch": 0.012000513510345558,
|
| 15869 |
+
"grad_norm": 2.0682337284088135,
|
| 15870 |
+
"learning_rate": 2.0301557666704787e-05,
|
| 15871 |
+
"loss": 2.2847,
|
| 15872 |
+
"step": 203300
|
| 15873 |
+
},
|
| 15874 |
+
{
|
| 15875 |
+
"epoch": 0.01227959521988848,
|
| 15876 |
+
"grad_norm": 2.41520357131958,
|
| 15877 |
+
"learning_rate": 2.0279813552691208e-05,
|
| 15878 |
+
"loss": 2.2897,
|
| 15879 |
+
"step": 203400
|
| 15880 |
+
},
|
| 15881 |
+
{
|
| 15882 |
+
"epoch": 0.0125586769294314,
|
| 15883 |
+
"grad_norm": 2.2019283771514893,
|
| 15884 |
+
"learning_rate": 2.025807314082104e-05,
|
| 15885 |
+
"loss": 2.2855,
|
| 15886 |
+
"step": 203500
|
| 15887 |
+
},
|
| 15888 |
+
{
|
| 15889 |
+
"epoch": 0.012837758638974319,
|
| 15890 |
+
"grad_norm": 2.154576539993286,
|
| 15891 |
+
"learning_rate": 2.0236336448145766e-05,
|
| 15892 |
+
"loss": 2.2726,
|
| 15893 |
+
"step": 203600
|
| 15894 |
+
},
|
| 15895 |
+
{
|
| 15896 |
+
"epoch": 0.013116840348517239,
|
| 15897 |
+
"grad_norm": 2.2569046020507812,
|
| 15898 |
+
"learning_rate": 2.0214603491713928e-05,
|
| 15899 |
+
"loss": 2.2666,
|
| 15900 |
+
"step": 203700
|
| 15901 |
+
},
|
| 15902 |
+
{
|
| 15903 |
+
"epoch": 0.013395922058060158,
|
| 15904 |
+
"grad_norm": 2.306614875793457,
|
| 15905 |
+
"learning_rate": 2.0192874288571152e-05,
|
| 15906 |
+
"loss": 2.2826,
|
| 15907 |
+
"step": 203800
|
| 15908 |
+
},
|
| 15909 |
+
{
|
| 15910 |
+
"epoch": 0.013675003767603078,
|
| 15911 |
+
"grad_norm": 2.2659449577331543,
|
| 15912 |
+
"learning_rate": 2.017114885576012e-05,
|
| 15913 |
+
"loss": 2.288,
|
| 15914 |
+
"step": 203900
|
| 15915 |
+
},
|
| 15916 |
+
{
|
| 15917 |
+
"epoch": 0.013954085477146,
|
| 15918 |
+
"grad_norm": 2.14077091217041,
|
| 15919 |
+
"learning_rate": 2.0149427210320545e-05,
|
| 15920 |
+
"loss": 2.2729,
|
| 15921 |
+
"step": 204000
|
| 15922 |
+
},
|
| 15923 |
+
{
|
| 15924 |
+
"epoch": 0.013954085477146,
|
| 15925 |
+
"eval_loss": 2.164825916290283,
|
| 15926 |
+
"eval_runtime": 51.3793,
|
| 15927 |
+
"eval_samples_per_second": 198.407,
|
| 15928 |
+
"eval_steps_per_second": 1.557,
|
| 15929 |
+
"step": 204000
|
| 15930 |
+
},
|
| 15931 |
+
{
|
| 15932 |
+
"epoch": 0.01423316718668892,
|
| 15933 |
+
"grad_norm": 2.265152931213379,
|
| 15934 |
+
"learning_rate": 2.0127709369289202e-05,
|
| 15935 |
+
"loss": 2.2654,
|
| 15936 |
+
"step": 204100
|
| 15937 |
+
},
|
| 15938 |
+
{
|
| 15939 |
+
"epoch": 0.014512248896231839,
|
| 15940 |
+
"grad_norm": 2.0833661556243896,
|
| 15941 |
+
"learning_rate": 2.0105995349699832e-05,
|
| 15942 |
+
"loss": 2.2863,
|
| 15943 |
+
"step": 204200
|
| 15944 |
+
},
|
| 15945 |
+
{
|
| 15946 |
+
"epoch": 0.014791330605774759,
|
| 15947 |
+
"grad_norm": 2.2797181606292725,
|
| 15948 |
+
"learning_rate": 2.008428516858323e-05,
|
| 15949 |
+
"loss": 2.2702,
|
| 15950 |
+
"step": 204300
|
| 15951 |
+
},
|
| 15952 |
+
{
|
| 15953 |
+
"epoch": 0.015070412315317678,
|
| 15954 |
+
"grad_norm": 2.2614681720733643,
|
| 15955 |
+
"learning_rate": 2.006257884296713e-05,
|
| 15956 |
+
"loss": 2.2846,
|
| 15957 |
+
"step": 204400
|
| 15958 |
+
},
|
| 15959 |
+
{
|
| 15960 |
+
"epoch": 0.015349494024860598,
|
| 15961 |
+
"grad_norm": 2.1245336532592773,
|
| 15962 |
+
"learning_rate": 2.00408763898763e-05,
|
| 15963 |
+
"loss": 2.2758,
|
| 15964 |
+
"step": 204500
|
| 15965 |
+
},
|
| 15966 |
+
{
|
| 15967 |
+
"epoch": 0.01562857573440352,
|
| 15968 |
+
"grad_norm": 2.14581298828125,
|
| 15969 |
+
"learning_rate": 2.001917782633241e-05,
|
| 15970 |
+
"loss": 2.2624,
|
| 15971 |
+
"step": 204600
|
| 15972 |
+
},
|
| 15973 |
+
{
|
| 15974 |
+
"epoch": 0.015907657443946437,
|
| 15975 |
+
"grad_norm": 2.240208864212036,
|
| 15976 |
+
"learning_rate": 1.9997483169354124e-05,
|
| 15977 |
+
"loss": 2.2563,
|
| 15978 |
+
"step": 204700
|
| 15979 |
+
},
|
| 15980 |
+
{
|
| 15981 |
+
"epoch": 0.01618673915348936,
|
| 15982 |
+
"grad_norm": 2.290208578109741,
|
| 15983 |
+
"learning_rate": 1.9975792435957024e-05,
|
| 15984 |
+
"loss": 2.2733,
|
| 15985 |
+
"step": 204800
|
| 15986 |
+
},
|
| 15987 |
+
{
|
| 15988 |
+
"epoch": 0.01646582086303228,
|
| 15989 |
+
"grad_norm": 2.309551954269409,
|
| 15990 |
+
"learning_rate": 1.9954105643153624e-05,
|
| 15991 |
+
"loss": 2.2575,
|
| 15992 |
+
"step": 204900
|
| 15993 |
+
},
|
| 15994 |
+
{
|
| 15995 |
+
"epoch": 0.016744902572575198,
|
| 15996 |
+
"grad_norm": 2.183645009994507,
|
| 15997 |
+
"learning_rate": 1.9932422807953323e-05,
|
| 15998 |
+
"loss": 2.2796,
|
| 15999 |
+
"step": 205000
|
| 16000 |
+
},
|
| 16001 |
+
{
|
| 16002 |
+
"epoch": 0.016744902572575198,
|
| 16003 |
+
"eval_loss": 2.1678764820098877,
|
| 16004 |
+
"eval_runtime": 51.2196,
|
| 16005 |
+
"eval_samples_per_second": 199.025,
|
| 16006 |
+
"eval_steps_per_second": 1.562,
|
| 16007 |
+
"step": 205000
|
| 16008 |
+
},
|
| 16009 |
+
{
|
| 16010 |
+
"epoch": 0.01702398428211812,
|
| 16011 |
+
"grad_norm": 2.1871604919433594,
|
| 16012 |
+
"learning_rate": 1.9910743947362455e-05,
|
| 16013 |
+
"loss": 2.2631,
|
| 16014 |
+
"step": 205100
|
| 16015 |
+
},
|
| 16016 |
+
{
|
| 16017 |
+
"epoch": 0.017303065991661037,
|
| 16018 |
+
"grad_norm": 2.1617250442504883,
|
| 16019 |
+
"learning_rate": 1.9889069078384193e-05,
|
| 16020 |
+
"loss": 2.2609,
|
| 16021 |
+
"step": 205200
|
| 16022 |
+
},
|
| 16023 |
+
{
|
| 16024 |
+
"epoch": 0.01758214770120396,
|
| 16025 |
+
"grad_norm": 2.183656692504883,
|
| 16026 |
+
"learning_rate": 1.9867398218018624e-05,
|
| 16027 |
+
"loss": 2.2568,
|
| 16028 |
+
"step": 205300
|
| 16029 |
+
},
|
| 16030 |
+
{
|
| 16031 |
+
"epoch": 0.017861229410746877,
|
| 16032 |
+
"grad_norm": 2.2372233867645264,
|
| 16033 |
+
"learning_rate": 1.9845731383262646e-05,
|
| 16034 |
+
"loss": 2.2663,
|
| 16035 |
+
"step": 205400
|
| 16036 |
+
},
|
| 16037 |
+
{
|
| 16038 |
+
"epoch": 0.018140311120289798,
|
| 16039 |
+
"grad_norm": 2.200566053390503,
|
| 16040 |
+
"learning_rate": 1.9824068591110034e-05,
|
| 16041 |
+
"loss": 2.2511,
|
| 16042 |
+
"step": 205500
|
| 16043 |
+
},
|
| 16044 |
+
{
|
| 16045 |
+
"epoch": 0.01841939282983272,
|
| 16046 |
+
"grad_norm": 2.1325571537017822,
|
| 16047 |
+
"learning_rate": 1.9802409858551382e-05,
|
| 16048 |
+
"loss": 2.2628,
|
| 16049 |
+
"step": 205600
|
| 16050 |
+
},
|
| 16051 |
+
{
|
| 16052 |
+
"epoch": 0.018698474539375638,
|
| 16053 |
+
"grad_norm": 2.1458706855773926,
|
| 16054 |
+
"learning_rate": 1.9780755202574098e-05,
|
| 16055 |
+
"loss": 2.2565,
|
| 16056 |
+
"step": 205700
|
| 16057 |
+
},
|
| 16058 |
+
{
|
| 16059 |
+
"epoch": 0.01897755624891856,
|
| 16060 |
+
"grad_norm": 2.397474527359009,
|
| 16061 |
+
"learning_rate": 1.9759104640162388e-05,
|
| 16062 |
+
"loss": 2.2582,
|
| 16063 |
+
"step": 205800
|
| 16064 |
+
},
|
| 16065 |
+
{
|
| 16066 |
+
"epoch": 0.019256637958461477,
|
| 16067 |
+
"grad_norm": 2.239386558532715,
|
| 16068 |
+
"learning_rate": 1.9737458188297247e-05,
|
| 16069 |
+
"loss": 2.2484,
|
| 16070 |
+
"step": 205900
|
| 16071 |
+
},
|
| 16072 |
+
{
|
| 16073 |
+
"epoch": 0.0195357196680044,
|
| 16074 |
+
"grad_norm": 2.17461895942688,
|
| 16075 |
+
"learning_rate": 1.9715815863956462e-05,
|
| 16076 |
+
"loss": 2.2536,
|
| 16077 |
+
"step": 206000
|
| 16078 |
+
},
|
| 16079 |
+
{
|
| 16080 |
+
"epoch": 0.0195357196680044,
|
| 16081 |
+
"eval_loss": 2.1656434535980225,
|
| 16082 |
+
"eval_runtime": 51.4827,
|
| 16083 |
+
"eval_samples_per_second": 198.008,
|
| 16084 |
+
"eval_steps_per_second": 1.554,
|
| 16085 |
+
"step": 206000
|
| 16086 |
+
},
|
| 16087 |
+
{
|
| 16088 |
+
"epoch": 0.01981480137754732,
|
| 16089 |
+
"grad_norm": 2.1970889568328857,
|
| 16090 |
+
"learning_rate": 1.969417768411458e-05,
|
| 16091 |
+
"loss": 2.269,
|
| 16092 |
+
"step": 206100
|
| 16093 |
+
},
|
| 16094 |
+
{
|
| 16095 |
+
"epoch": 0.020093883087090238,
|
| 16096 |
+
"grad_norm": 2.151305913925171,
|
| 16097 |
+
"learning_rate": 1.967254366574286e-05,
|
| 16098 |
+
"loss": 2.2609,
|
| 16099 |
+
"step": 206200
|
| 16100 |
+
},
|
| 16101 |
+
{
|
| 16102 |
+
"epoch": 0.02037296479663316,
|
| 16103 |
+
"grad_norm": 2.164149045944214,
|
| 16104 |
+
"learning_rate": 1.965091382580935e-05,
|
| 16105 |
+
"loss": 2.2608,
|
| 16106 |
+
"step": 206300
|
| 16107 |
+
},
|
| 16108 |
+
{
|
| 16109 |
+
"epoch": 0.020652046506176077,
|
| 16110 |
+
"grad_norm": 2.203151226043701,
|
| 16111 |
+
"learning_rate": 1.9629288181278795e-05,
|
| 16112 |
+
"loss": 2.2616,
|
| 16113 |
+
"step": 206400
|
| 16114 |
+
},
|
| 16115 |
+
{
|
| 16116 |
+
"epoch": 0.020931128215719,
|
| 16117 |
+
"grad_norm": 2.1855273246765137,
|
| 16118 |
+
"learning_rate": 1.960766674911264e-05,
|
| 16119 |
+
"loss": 2.2614,
|
| 16120 |
+
"step": 206500
|
| 16121 |
+
},
|
| 16122 |
+
{
|
| 16123 |
+
"epoch": 0.02121020992526192,
|
| 16124 |
+
"grad_norm": 2.124351978302002,
|
| 16125 |
+
"learning_rate": 1.958604954626906e-05,
|
| 16126 |
+
"loss": 2.2448,
|
| 16127 |
+
"step": 206600
|
| 16128 |
+
},
|
| 16129 |
+
{
|
| 16130 |
+
"epoch": 0.021489291634804838,
|
| 16131 |
+
"grad_norm": 2.177095890045166,
|
| 16132 |
+
"learning_rate": 1.9564436589702864e-05,
|
| 16133 |
+
"loss": 2.2519,
|
| 16134 |
+
"step": 206700
|
| 16135 |
+
},
|
| 16136 |
+
{
|
| 16137 |
+
"epoch": 0.02176837334434776,
|
| 16138 |
+
"grad_norm": 2.1898281574249268,
|
| 16139 |
+
"learning_rate": 1.9542827896365568e-05,
|
| 16140 |
+
"loss": 2.2608,
|
| 16141 |
+
"step": 206800
|
| 16142 |
+
},
|
| 16143 |
+
{
|
| 16144 |
+
"epoch": 0.022047455053890677,
|
| 16145 |
+
"grad_norm": 2.2773730754852295,
|
| 16146 |
+
"learning_rate": 1.9521223483205342e-05,
|
| 16147 |
+
"loss": 2.262,
|
| 16148 |
+
"step": 206900
|
| 16149 |
+
},
|
| 16150 |
+
{
|
| 16151 |
+
"epoch": 0.0223265367634336,
|
| 16152 |
+
"grad_norm": 2.2109436988830566,
|
| 16153 |
+
"learning_rate": 1.9499623367166982e-05,
|
| 16154 |
+
"loss": 2.2448,
|
| 16155 |
+
"step": 207000
|
| 16156 |
+
},
|
| 16157 |
+
{
|
| 16158 |
+
"epoch": 0.0223265367634336,
|
| 16159 |
+
"eval_loss": 2.164100408554077,
|
| 16160 |
+
"eval_runtime": 51.5664,
|
| 16161 |
+
"eval_samples_per_second": 197.687,
|
| 16162 |
+
"eval_steps_per_second": 1.551,
|
| 16163 |
+
"step": 207000
|
| 16164 |
+
},
|
| 16165 |
+
{
|
| 16166 |
+
"epoch": 0.022605618472976517,
|
| 16167 |
+
"grad_norm": 2.2141733169555664,
|
| 16168 |
+
"learning_rate": 1.9478027565191922e-05,
|
| 16169 |
+
"loss": 2.2537,
|
| 16170 |
+
"step": 207100
|
| 16171 |
+
},
|
| 16172 |
+
{
|
| 16173 |
+
"epoch": 0.022884700182519438,
|
| 16174 |
+
"grad_norm": 2.2592718601226807,
|
| 16175 |
+
"learning_rate": 1.945643609421821e-05,
|
| 16176 |
+
"loss": 2.2441,
|
| 16177 |
+
"step": 207200
|
| 16178 |
+
},
|
| 16179 |
+
{
|
| 16180 |
+
"epoch": 0.02316378189206236,
|
| 16181 |
+
"grad_norm": 2.2082977294921875,
|
| 16182 |
+
"learning_rate": 1.94348489711805e-05,
|
| 16183 |
+
"loss": 2.2529,
|
| 16184 |
+
"step": 207300
|
| 16185 |
+
},
|
| 16186 |
+
{
|
| 16187 |
+
"epoch": 0.023442863601605277,
|
| 16188 |
+
"grad_norm": 2.2095062732696533,
|
| 16189 |
+
"learning_rate": 1.941326621301005e-05,
|
| 16190 |
+
"loss": 2.2597,
|
| 16191 |
+
"step": 207400
|
| 16192 |
+
},
|
| 16193 |
+
{
|
| 16194 |
+
"epoch": 0.0237219453111482,
|
| 16195 |
+
"grad_norm": 2.189436674118042,
|
| 16196 |
+
"learning_rate": 1.939168783663466e-05,
|
| 16197 |
+
"loss": 2.2455,
|
| 16198 |
+
"step": 207500
|
| 16199 |
+
},
|
| 16200 |
+
{
|
| 16201 |
+
"epoch": 0.024001027020691117,
|
| 16202 |
+
"grad_norm": 2.218168258666992,
|
| 16203 |
+
"learning_rate": 1.9370113858978722e-05,
|
| 16204 |
+
"loss": 2.2485,
|
| 16205 |
+
"step": 207600
|
| 16206 |
+
},
|
| 16207 |
+
{
|
| 16208 |
+
"epoch": 0.024280108730234038,
|
| 16209 |
+
"grad_norm": 2.1648590564727783,
|
| 16210 |
+
"learning_rate": 1.9348544296963165e-05,
|
| 16211 |
+
"loss": 2.2456,
|
| 16212 |
+
"step": 207700
|
| 16213 |
+
},
|
| 16214 |
+
{
|
| 16215 |
+
"epoch": 0.02455919043977696,
|
| 16216 |
+
"grad_norm": 2.121211051940918,
|
| 16217 |
+
"learning_rate": 1.9326979167505474e-05,
|
| 16218 |
+
"loss": 2.2364,
|
| 16219 |
+
"step": 207800
|
| 16220 |
+
},
|
| 16221 |
+
{
|
| 16222 |
+
"epoch": 0.024838272149319877,
|
| 16223 |
+
"grad_norm": 2.271167039871216,
|
| 16224 |
+
"learning_rate": 1.9305418487519617e-05,
|
| 16225 |
+
"loss": 2.2561,
|
| 16226 |
+
"step": 207900
|
| 16227 |
+
},
|
| 16228 |
+
{
|
| 16229 |
+
"epoch": 0.0251173538588628,
|
| 16230 |
+
"grad_norm": 2.3215372562408447,
|
| 16231 |
+
"learning_rate": 1.9283862273916116e-05,
|
| 16232 |
+
"loss": 2.2397,
|
| 16233 |
+
"step": 208000
|
| 16234 |
+
},
|
| 16235 |
+
{
|
| 16236 |
+
"epoch": 0.0251173538588628,
|
| 16237 |
+
"eval_loss": 2.164187431335449,
|
| 16238 |
+
"eval_runtime": 51.5373,
|
| 16239 |
+
"eval_samples_per_second": 197.799,
|
| 16240 |
+
"eval_steps_per_second": 1.552,
|
| 16241 |
+
"step": 208000
|
| 16242 |
+
},
|
| 16243 |
+
{
|
| 16244 |
+
"epoch": 0.025396435568405717,
|
| 16245 |
+
"grad_norm": 2.174811363220215,
|
| 16246 |
+
"learning_rate": 1.9262310543601962e-05,
|
| 16247 |
+
"loss": 2.2412,
|
| 16248 |
+
"step": 208100
|
| 16249 |
+
},
|
| 16250 |
+
{
|
| 16251 |
+
"epoch": 0.025675517277948638,
|
| 16252 |
+
"grad_norm": 2.1047627925872803,
|
| 16253 |
+
"learning_rate": 1.9240763313480655e-05,
|
| 16254 |
+
"loss": 2.2363,
|
| 16255 |
+
"step": 208200
|
| 16256 |
+
},
|
| 16257 |
+
{
|
| 16258 |
+
"epoch": 0.025954598987491556,
|
| 16259 |
+
"grad_norm": 2.2328543663024902,
|
| 16260 |
+
"learning_rate": 1.9219220600452127e-05,
|
| 16261 |
+
"loss": 2.2537,
|
| 16262 |
+
"step": 208300
|
| 16263 |
+
},
|
| 16264 |
+
{
|
| 16265 |
+
"epoch": 0.026233680697034478,
|
| 16266 |
+
"grad_norm": 2.1852455139160156,
|
| 16267 |
+
"learning_rate": 1.919768242141281e-05,
|
| 16268 |
+
"loss": 2.2472,
|
| 16269 |
+
"step": 208400
|
| 16270 |
+
},
|
| 16271 |
+
{
|
| 16272 |
+
"epoch": 0.0265127624065774,
|
| 16273 |
+
"grad_norm": 2.23559832572937,
|
| 16274 |
+
"learning_rate": 1.9176148793255543e-05,
|
| 16275 |
+
"loss": 2.243,
|
| 16276 |
+
"step": 208500
|
| 16277 |
+
},
|
| 16278 |
+
{
|
| 16279 |
+
"epoch": 0.026791844116120317,
|
| 16280 |
+
"grad_norm": 2.195355176925659,
|
| 16281 |
+
"learning_rate": 1.9154619732869626e-05,
|
| 16282 |
+
"loss": 2.2463,
|
| 16283 |
+
"step": 208600
|
| 16284 |
+
},
|
| 16285 |
+
{
|
| 16286 |
+
"epoch": 0.02707092582566324,
|
| 16287 |
+
"grad_norm": 2.295536994934082,
|
| 16288 |
+
"learning_rate": 1.913309525714075e-05,
|
| 16289 |
+
"loss": 2.2413,
|
| 16290 |
+
"step": 208700
|
| 16291 |
+
},
|
| 16292 |
+
{
|
| 16293 |
+
"epoch": 0.027350007535206156,
|
| 16294 |
+
"grad_norm": 2.373781681060791,
|
| 16295 |
+
"learning_rate": 1.9111575382951026e-05,
|
| 16296 |
+
"loss": 2.2385,
|
| 16297 |
+
"step": 208800
|
| 16298 |
+
},
|
| 16299 |
+
{
|
| 16300 |
+
"epoch": 0.027629089244749078,
|
| 16301 |
+
"grad_norm": 2.3178882598876953,
|
| 16302 |
+
"learning_rate": 1.909006012717896e-05,
|
| 16303 |
+
"loss": 2.2454,
|
| 16304 |
+
"step": 208900
|
| 16305 |
+
},
|
| 16306 |
+
{
|
| 16307 |
+
"epoch": 0.027908170954292,
|
| 16308 |
+
"grad_norm": 2.2002763748168945,
|
| 16309 |
+
"learning_rate": 1.9068549506699425e-05,
|
| 16310 |
+
"loss": 2.236,
|
| 16311 |
+
"step": 209000
|
| 16312 |
+
},
|
| 16313 |
+
{
|
| 16314 |
+
"epoch": 0.027908170954292,
|
| 16315 |
+
"eval_loss": 2.1654672622680664,
|
| 16316 |
+
"eval_runtime": 51.5727,
|
| 16317 |
+
"eval_samples_per_second": 197.663,
|
| 16318 |
+
"eval_steps_per_second": 1.551,
|
| 16319 |
+
"step": 209000
|
| 16320 |
+
},
|
| 16321 |
+
{
|
| 16322 |
+
"epoch": 0.028187252663834917,
|
| 16323 |
+
"grad_norm": 2.2618346214294434,
|
| 16324 |
+
"learning_rate": 1.9047043538383662e-05,
|
| 16325 |
+
"loss": 2.2211,
|
| 16326 |
+
"step": 209100
|
| 16327 |
+
},
|
| 16328 |
+
{
|
| 16329 |
+
"epoch": 0.02846633437337784,
|
| 16330 |
+
"grad_norm": 2.2079176902770996,
|
| 16331 |
+
"learning_rate": 1.9025542239099252e-05,
|
| 16332 |
+
"loss": 2.2456,
|
| 16333 |
+
"step": 209200
|
| 16334 |
+
},
|
| 16335 |
+
{
|
| 16336 |
+
"epoch": 0.028745416082920756,
|
| 16337 |
+
"grad_norm": 2.119337797164917,
|
| 16338 |
+
"learning_rate": 1.9004045625710136e-05,
|
| 16339 |
+
"loss": 2.2356,
|
| 16340 |
+
"step": 209300
|
| 16341 |
+
},
|
| 16342 |
+
{
|
| 16343 |
+
"epoch": 0.029024497792463678,
|
| 16344 |
+
"grad_norm": 2.2664501667022705,
|
| 16345 |
+
"learning_rate": 1.8982553715076583e-05,
|
| 16346 |
+
"loss": 2.2403,
|
| 16347 |
+
"step": 209400
|
| 16348 |
+
},
|
| 16349 |
+
{
|
| 16350 |
+
"epoch": 0.0293035795020066,
|
| 16351 |
+
"grad_norm": 2.2333970069885254,
|
| 16352 |
+
"learning_rate": 1.8961066524055128e-05,
|
| 16353 |
+
"loss": 2.2522,
|
| 16354 |
+
"step": 209500
|
| 16355 |
+
},
|
| 16356 |
+
{
|
| 16357 |
+
"epoch": 0.029582661211549517,
|
| 16358 |
+
"grad_norm": 2.1713504791259766,
|
| 16359 |
+
"learning_rate": 1.8939584069498647e-05,
|
| 16360 |
+
"loss": 2.2488,
|
| 16361 |
+
"step": 209600
|
| 16362 |
+
},
|
| 16363 |
+
{
|
| 16364 |
+
"epoch": 0.02986174292109244,
|
| 16365 |
+
"grad_norm": 2.1721699237823486,
|
| 16366 |
+
"learning_rate": 1.8918106368256302e-05,
|
| 16367 |
+
"loss": 2.2418,
|
| 16368 |
+
"step": 209700
|
| 16369 |
+
},
|
| 16370 |
+
{
|
| 16371 |
+
"epoch": 0.030140824630635357,
|
| 16372 |
+
"grad_norm": 2.102562189102173,
|
| 16373 |
+
"learning_rate": 1.88966334371735e-05,
|
| 16374 |
+
"loss": 2.2346,
|
| 16375 |
+
"step": 209800
|
| 16376 |
+
},
|
| 16377 |
+
{
|
| 16378 |
+
"epoch": 0.030419906340178278,
|
| 16379 |
+
"grad_norm": 2.1796703338623047,
|
| 16380 |
+
"learning_rate": 1.8875165293091936e-05,
|
| 16381 |
+
"loss": 2.2445,
|
| 16382 |
+
"step": 209900
|
| 16383 |
+
},
|
| 16384 |
+
{
|
| 16385 |
+
"epoch": 0.030698988049721196,
|
| 16386 |
+
"grad_norm": 2.25935697555542,
|
| 16387 |
+
"learning_rate": 1.885370195284952e-05,
|
| 16388 |
+
"loss": 2.2407,
|
| 16389 |
+
"step": 210000
|
| 16390 |
+
},
|
| 16391 |
+
{
|
| 16392 |
+
"epoch": 0.030698988049721196,
|
| 16393 |
+
"eval_loss": 2.174961566925049,
|
| 16394 |
+
"eval_runtime": 51.6538,
|
| 16395 |
+
"eval_samples_per_second": 197.352,
|
| 16396 |
+
"eval_steps_per_second": 1.549,
|
| 16397 |
+
"step": 210000
|
| 16398 |
+
},
|
| 16399 |
+
{
|
| 16400 |
+
"epoch": 0.030978069759264117,
|
| 16401 |
+
"grad_norm": 2.1532399654388428,
|
| 16402 |
+
"learning_rate": 1.8832243433280412e-05,
|
| 16403 |
+
"loss": 2.2312,
|
| 16404 |
+
"step": 210100
|
| 16405 |
+
},
|
| 16406 |
+
{
|
| 16407 |
+
"epoch": 0.03125715146880704,
|
| 16408 |
+
"grad_norm": 2.322571277618408,
|
| 16409 |
+
"learning_rate": 1.8810789751215e-05,
|
| 16410 |
+
"loss": 2.235,
|
| 16411 |
+
"step": 210200
|
| 16412 |
+
},
|
| 16413 |
+
{
|
| 16414 |
+
"epoch": 0.03153623317834996,
|
| 16415 |
+
"grad_norm": 2.1225528717041016,
|
| 16416 |
+
"learning_rate": 1.8789340923479862e-05,
|
| 16417 |
+
"loss": 2.2175,
|
| 16418 |
+
"step": 210300
|
| 16419 |
+
},
|
| 16420 |
+
{
|
| 16421 |
+
"epoch": 0.031815314887892875,
|
| 16422 |
+
"grad_norm": 2.2108681201934814,
|
| 16423 |
+
"learning_rate": 1.8767896966897768e-05,
|
| 16424 |
+
"loss": 2.239,
|
| 16425 |
+
"step": 210400
|
| 16426 |
+
},
|
| 16427 |
+
{
|
| 16428 |
+
"epoch": 0.0320943965974358,
|
| 16429 |
+
"grad_norm": 2.227198839187622,
|
| 16430 |
+
"learning_rate": 1.8746457898287673e-05,
|
| 16431 |
+
"loss": 2.2274,
|
| 16432 |
+
"step": 210500
|
| 16433 |
+
},
|
| 16434 |
+
{
|
| 16435 |
+
"epoch": 0.03237347830697872,
|
| 16436 |
+
"grad_norm": 2.250565528869629,
|
| 16437 |
+
"learning_rate": 1.8725023734464702e-05,
|
| 16438 |
+
"loss": 2.2318,
|
| 16439 |
+
"step": 210600
|
| 16440 |
+
},
|
| 16441 |
+
{
|
| 16442 |
+
"epoch": 0.032652560016521635,
|
| 16443 |
+
"grad_norm": 2.1811561584472656,
|
| 16444 |
+
"learning_rate": 1.8703594492240138e-05,
|
| 16445 |
+
"loss": 2.2033,
|
| 16446 |
+
"step": 210700
|
| 16447 |
+
},
|
| 16448 |
+
{
|
| 16449 |
+
"epoch": 0.03293164172606456,
|
| 16450 |
+
"grad_norm": 2.1336236000061035,
|
| 16451 |
+
"learning_rate": 1.8682170188421375e-05,
|
| 16452 |
+
"loss": 2.1952,
|
| 16453 |
+
"step": 210800
|
| 16454 |
+
},
|
| 16455 |
+
{
|
| 16456 |
+
"epoch": 0.03321072343560748,
|
| 16457 |
+
"grad_norm": 2.2047863006591797,
|
| 16458 |
+
"learning_rate": 1.8660750839811963e-05,
|
| 16459 |
+
"loss": 2.1909,
|
| 16460 |
+
"step": 210900
|
| 16461 |
+
},
|
| 16462 |
+
{
|
| 16463 |
+
"epoch": 0.033489805145150396,
|
| 16464 |
+
"grad_norm": 2.003309965133667,
|
| 16465 |
+
"learning_rate": 1.8639336463211566e-05,
|
| 16466 |
+
"loss": 2.1693,
|
| 16467 |
+
"step": 211000
|
| 16468 |
+
},
|
| 16469 |
+
{
|
| 16470 |
+
"epoch": 0.033489805145150396,
|
| 16471 |
+
"eval_loss": 2.171804189682007,
|
| 16472 |
+
"eval_runtime": 51.5992,
|
| 16473 |
+
"eval_samples_per_second": 197.561,
|
| 16474 |
+
"eval_steps_per_second": 1.55,
|
| 16475 |
+
"step": 211000
|
| 16476 |
+
},
|
| 16477 |
+
{
|
| 16478 |
+
"epoch": 0.033768886854693314,
|
| 16479 |
+
"grad_norm": 2.105639934539795,
|
| 16480 |
+
"learning_rate": 1.861792707541593e-05,
|
| 16481 |
+
"loss": 2.1683,
|
| 16482 |
+
"step": 211100
|
| 16483 |
+
},
|
| 16484 |
+
{
|
| 16485 |
+
"epoch": 0.03404796856423624,
|
| 16486 |
+
"grad_norm": 2.2332839965820312,
|
| 16487 |
+
"learning_rate": 1.8596522693216888e-05,
|
| 16488 |
+
"loss": 2.1594,
|
| 16489 |
+
"step": 211200
|
| 16490 |
+
},
|
| 16491 |
+
{
|
| 16492 |
+
"epoch": 0.03432705027377916,
|
| 16493 |
+
"grad_norm": 2.2061290740966797,
|
| 16494 |
+
"learning_rate": 1.8575123333402367e-05,
|
| 16495 |
+
"loss": 2.1593,
|
| 16496 |
+
"step": 211300
|
| 16497 |
+
},
|
| 16498 |
+
{
|
| 16499 |
+
"epoch": 0.034606131983322075,
|
| 16500 |
+
"grad_norm": 2.0589332580566406,
|
| 16501 |
+
"learning_rate": 1.855372901275634e-05,
|
| 16502 |
+
"loss": 2.1437,
|
| 16503 |
+
"step": 211400
|
| 16504 |
+
},
|
| 16505 |
+
{
|
| 16506 |
+
"epoch": 0.034885213692865,
|
| 16507 |
+
"grad_norm": 2.1569809913635254,
|
| 16508 |
+
"learning_rate": 1.8532339748058844e-05,
|
| 16509 |
+
"loss": 2.1533,
|
| 16510 |
+
"step": 211500
|
| 16511 |
+
},
|
| 16512 |
+
{
|
| 16513 |
+
"epoch": 0.03516429540240792,
|
| 16514 |
+
"grad_norm": 2.1025686264038086,
|
| 16515 |
+
"learning_rate": 1.8510955556085915e-05,
|
| 16516 |
+
"loss": 2.1525,
|
| 16517 |
+
"step": 211600
|
| 16518 |
+
},
|
| 16519 |
+
{
|
| 16520 |
+
"epoch": 0.035443377111950836,
|
| 16521 |
+
"grad_norm": 2.19555926322937,
|
| 16522 |
+
"learning_rate": 1.848957645360965e-05,
|
| 16523 |
+
"loss": 2.1447,
|
| 16524 |
+
"step": 211700
|
| 16525 |
+
},
|
| 16526 |
+
{
|
| 16527 |
+
"epoch": 0.035722458821493754,
|
| 16528 |
+
"grad_norm": 2.095914840698242,
|
| 16529 |
+
"learning_rate": 1.8468202457398126e-05,
|
| 16530 |
+
"loss": 2.1421,
|
| 16531 |
+
"step": 211800
|
| 16532 |
+
},
|
| 16533 |
+
{
|
| 16534 |
+
"epoch": 0.03600154053103668,
|
| 16535 |
+
"grad_norm": 2.1924917697906494,
|
| 16536 |
+
"learning_rate": 1.8446833584215444e-05,
|
| 16537 |
+
"loss": 2.1416,
|
| 16538 |
+
"step": 211900
|
| 16539 |
+
},
|
| 16540 |
+
{
|
| 16541 |
+
"epoch": 0.036280622240579596,
|
| 16542 |
+
"grad_norm": 2.123359203338623,
|
| 16543 |
+
"learning_rate": 1.8425469850821648e-05,
|
| 16544 |
+
"loss": 2.1465,
|
| 16545 |
+
"step": 212000
|
| 16546 |
+
},
|
| 16547 |
+
{
|
| 16548 |
+
"epoch": 0.036280622240579596,
|
| 16549 |
+
"eval_loss": 2.1811015605926514,
|
| 16550 |
+
"eval_runtime": 51.5948,
|
| 16551 |
+
"eval_samples_per_second": 197.578,
|
| 16552 |
+
"eval_steps_per_second": 1.551,
|
| 16553 |
+
"step": 212000
|
| 16554 |
+
},
|
| 16555 |
+
{
|
| 16556 |
+
"epoch": 0.036559703950122514,
|
| 16557 |
+
"grad_norm": 2.065702438354492,
|
| 16558 |
+
"learning_rate": 1.840411127397278e-05,
|
| 16559 |
+
"loss": 2.1352,
|
| 16560 |
+
"step": 212100
|
| 16561 |
+
},
|
| 16562 |
+
{
|
| 16563 |
+
"epoch": 0.03683878565966544,
|
| 16564 |
+
"grad_norm": 2.0806708335876465,
|
| 16565 |
+
"learning_rate": 1.838275787042083e-05,
|
| 16566 |
+
"loss": 2.1432,
|
| 16567 |
+
"step": 212200
|
| 16568 |
+
},
|
| 16569 |
+
{
|
| 16570 |
+
"epoch": 0.03711786736920836,
|
| 16571 |
+
"grad_norm": 2.1028740406036377,
|
| 16572 |
+
"learning_rate": 1.8361409656913744e-05,
|
| 16573 |
+
"loss": 2.1349,
|
| 16574 |
+
"step": 212300
|
| 16575 |
+
},
|
| 16576 |
+
{
|
| 16577 |
+
"epoch": 0.037396949078751275,
|
| 16578 |
+
"grad_norm": 2.1603927612304688,
|
| 16579 |
+
"learning_rate": 1.8340066650195363e-05,
|
| 16580 |
+
"loss": 2.1307,
|
| 16581 |
+
"step": 212400
|
| 16582 |
+
},
|
| 16583 |
+
{
|
| 16584 |
+
"epoch": 0.0376760307882942,
|
| 16585 |
+
"grad_norm": 2.016268014907837,
|
| 16586 |
+
"learning_rate": 1.831872886700547e-05,
|
| 16587 |
+
"loss": 2.129,
|
| 16588 |
+
"step": 212500
|
| 16589 |
+
},
|
| 16590 |
+
{
|
| 16591 |
+
"epoch": 0.03795511249783712,
|
| 16592 |
+
"grad_norm": 1.9362486600875854,
|
| 16593 |
+
"learning_rate": 1.829739632407975e-05,
|
| 16594 |
+
"loss": 2.1187,
|
| 16595 |
+
"step": 212600
|
| 16596 |
+
},
|
| 16597 |
+
{
|
| 16598 |
+
"epoch": 0.038234194207380036,
|
| 16599 |
+
"grad_norm": 2.1569607257843018,
|
| 16600 |
+
"learning_rate": 1.827606903814977e-05,
|
| 16601 |
+
"loss": 2.1314,
|
| 16602 |
+
"step": 212700
|
| 16603 |
+
},
|
| 16604 |
+
{
|
| 16605 |
+
"epoch": 0.038513275916922954,
|
| 16606 |
+
"grad_norm": 2.0166728496551514,
|
| 16607 |
+
"learning_rate": 1.825474702594299e-05,
|
| 16608 |
+
"loss": 2.1274,
|
| 16609 |
+
"step": 212800
|
| 16610 |
+
},
|
| 16611 |
+
{
|
| 16612 |
+
"epoch": 0.03879235762646588,
|
| 16613 |
+
"grad_norm": 2.1779658794403076,
|
| 16614 |
+
"learning_rate": 1.8233430304182704e-05,
|
| 16615 |
+
"loss": 2.1183,
|
| 16616 |
+
"step": 212900
|
| 16617 |
+
},
|
| 16618 |
+
{
|
| 16619 |
+
"epoch": 0.0390714393360088,
|
| 16620 |
+
"grad_norm": 2.1090939044952393,
|
| 16621 |
+
"learning_rate": 1.821211888958808e-05,
|
| 16622 |
+
"loss": 2.126,
|
| 16623 |
+
"step": 213000
|
| 16624 |
+
},
|
| 16625 |
+
{
|
| 16626 |
+
"epoch": 0.0390714393360088,
|
| 16627 |
+
"eval_loss": 2.1809489727020264,
|
| 16628 |
+
"eval_runtime": 51.5547,
|
| 16629 |
+
"eval_samples_per_second": 197.732,
|
| 16630 |
+
"eval_steps_per_second": 1.552,
|
| 16631 |
+
"step": 213000
|
| 16632 |
+
},
|
| 16633 |
+
{
|
| 16634 |
+
"epoch": 0.039350521045551715,
|
| 16635 |
+
"grad_norm": 2.2175374031066895,
|
| 16636 |
+
"learning_rate": 1.819081279887411e-05,
|
| 16637 |
+
"loss": 2.1201,
|
| 16638 |
+
"step": 213100
|
| 16639 |
+
},
|
| 16640 |
+
{
|
| 16641 |
+
"epoch": 0.03962960275509464,
|
| 16642 |
+
"grad_norm": 2.0139071941375732,
|
| 16643 |
+
"learning_rate": 1.8169512048751648e-05,
|
| 16644 |
+
"loss": 2.1207,
|
| 16645 |
+
"step": 213200
|
| 16646 |
+
},
|
| 16647 |
+
{
|
| 16648 |
+
"epoch": 0.03990868446463756,
|
| 16649 |
+
"grad_norm": 2.101840019226074,
|
| 16650 |
+
"learning_rate": 1.814821665592729e-05,
|
| 16651 |
+
"loss": 2.1145,
|
| 16652 |
+
"step": 213300
|
| 16653 |
+
},
|
| 16654 |
+
{
|
| 16655 |
+
"epoch": 0.040187766174180475,
|
| 16656 |
+
"grad_norm": 2.199965238571167,
|
| 16657 |
+
"learning_rate": 1.8126926637103484e-05,
|
| 16658 |
+
"loss": 2.1256,
|
| 16659 |
+
"step": 213400
|
| 16660 |
+
},
|
| 16661 |
+
{
|
| 16662 |
+
"epoch": 0.04046684788372339,
|
| 16663 |
+
"grad_norm": 2.042839288711548,
|
| 16664 |
+
"learning_rate": 1.8105642008978458e-05,
|
| 16665 |
+
"loss": 2.1096,
|
| 16666 |
+
"step": 213500
|
| 16667 |
+
},
|
| 16668 |
+
{
|
| 16669 |
+
"epoch": 0.04074592959326632,
|
| 16670 |
+
"grad_norm": 2.233668804168701,
|
| 16671 |
+
"learning_rate": 1.808436278824619e-05,
|
| 16672 |
+
"loss": 2.1099,
|
| 16673 |
+
"step": 213600
|
| 16674 |
+
},
|
| 16675 |
+
{
|
| 16676 |
+
"epoch": 0.041025011302809236,
|
| 16677 |
+
"grad_norm": 2.0933728218078613,
|
| 16678 |
+
"learning_rate": 1.8063088991596437e-05,
|
| 16679 |
+
"loss": 2.1014,
|
| 16680 |
+
"step": 213700
|
| 16681 |
+
},
|
| 16682 |
+
{
|
| 16683 |
+
"epoch": 0.041304093012352154,
|
| 16684 |
+
"grad_norm": 2.1422884464263916,
|
| 16685 |
+
"learning_rate": 1.8041820635714682e-05,
|
| 16686 |
+
"loss": 2.1034,
|
| 16687 |
+
"step": 213800
|
| 16688 |
+
},
|
| 16689 |
+
{
|
| 16690 |
+
"epoch": 0.04158317472189508,
|
| 16691 |
+
"grad_norm": 2.0475480556488037,
|
| 16692 |
+
"learning_rate": 1.802055773728216e-05,
|
| 16693 |
+
"loss": 2.1116,
|
| 16694 |
+
"step": 213900
|
| 16695 |
+
},
|
| 16696 |
+
{
|
| 16697 |
+
"epoch": 0.041862256431438,
|
| 16698 |
+
"grad_norm": 2.0574936866760254,
|
| 16699 |
+
"learning_rate": 1.799930031297583e-05,
|
| 16700 |
+
"loss": 2.1181,
|
| 16701 |
+
"step": 214000
|
| 16702 |
+
},
|
| 16703 |
+
{
|
| 16704 |
+
"epoch": 0.041862256431438,
|
| 16705 |
+
"eval_loss": 2.1702778339385986,
|
| 16706 |
+
"eval_runtime": 51.7407,
|
| 16707 |
+
"eval_samples_per_second": 197.021,
|
| 16708 |
+
"eval_steps_per_second": 1.546,
|
| 16709 |
+
"step": 214000
|
| 16710 |
}
|
| 16711 |
],
|
| 16712 |
"logging_steps": 100,
|
|
|
|
| 16726 |
"attributes": {}
|
| 16727 |
}
|
| 16728 |
},
|
| 16729 |
+
"total_flos": 1.8676295751696384e+19,
|
| 16730 |
"train_batch_size": 128,
|
| 16731 |
"trial_name": null,
|
| 16732 |
"trial_params": null
|
training_args.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 5777
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8b369d4c284193104629459ff70a317184ca3f350753d5cc563977de982dd1e9
|
| 3 |
size 5777
|