Upload folder using huggingface_hub
Browse files- adapter_model.safetensors +1 -1
- optimizer.pt +1 -1
- rng_state.pth +1 -1
- scheduler.pt +1 -1
- trainer_state.json +2473 -3
adapter_model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 262406656
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:74a7fbfd1065c7efe650cbe07ca6888be4c9c4026201b6e3d687e19008471a74
|
| 3 |
size 262406656
|
optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 122872331
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d13ac8ea643c9fa2eb6e074fedbff66a8ad842ec5de19c941bf75cc87d544fb7
|
| 3 |
size 122872331
|
rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14645
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fa961fa9e506668d35c6cfd8cf85f9299717888b30062742b0fd9e2da10b1c98
|
| 3 |
size 14645
|
scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1465
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:df3eab020225d79fdb69396f30e0ff17b6980870fce2cd29482a57a0b5aad692
|
| 3 |
size 1465
|
trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -10928,11 +10928,2481 @@
|
|
| 10928 |
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 10929 |
"rewards/quality_reward_func/std": 0.0,
|
| 10930 |
"step": 4200
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10931 |
}
|
| 10932 |
],
|
| 10933 |
"logging_steps": 10,
|
| 10934 |
"max_steps": 14544,
|
| 10935 |
-
"num_input_tokens_seen":
|
| 10936 |
"num_train_epochs": 1,
|
| 10937 |
"save_steps": 50,
|
| 10938 |
"stateful_callbacks": {
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.3540979097909791,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 5150,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 10928 |
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 10929 |
"rewards/quality_reward_func/std": 0.0,
|
| 10930 |
"step": 4200
|
| 10931 |
+
},
|
| 10932 |
+
{
|
| 10933 |
+
"completion_length": 19.263157894736842,
|
| 10934 |
+
"completions/clipped_ratio": 0.0,
|
| 10935 |
+
"completions/max_length": 19.263157894736842,
|
| 10936 |
+
"completions/max_terminated_length": 19.263157894736842,
|
| 10937 |
+
"completions/mean_length": 16.842105263157894,
|
| 10938 |
+
"completions/mean_terminated_length": 16.842105263157894,
|
| 10939 |
+
"completions/min_length": 15.0,
|
| 10940 |
+
"completions/min_terminated_length": 15.0,
|
| 10941 |
+
"epoch": 0.2894664466446645,
|
| 10942 |
+
"frac_reward_zero_std": 1.0,
|
| 10943 |
+
"grad_norm": 0.0,
|
| 10944 |
+
"kl": 1.231401851302699,
|
| 10945 |
+
"learning_rate": 4.473433060627356e-06,
|
| 10946 |
+
"loss": 0.0,
|
| 10947 |
+
"num_tokens": 6010277.0,
|
| 10948 |
+
"reward": 4.099999904632568,
|
| 10949 |
+
"reward_std": 0.0,
|
| 10950 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 10951 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 10952 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 10953 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 10954 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 10955 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 10956 |
+
"step": 4210
|
| 10957 |
+
},
|
| 10958 |
+
{
|
| 10959 |
+
"completion_length": 19.4,
|
| 10960 |
+
"completions/clipped_ratio": 0.0,
|
| 10961 |
+
"completions/max_length": 19.4,
|
| 10962 |
+
"completions/max_terminated_length": 19.4,
|
| 10963 |
+
"completions/mean_length": 16.675,
|
| 10964 |
+
"completions/mean_terminated_length": 16.675,
|
| 10965 |
+
"completions/min_length": 15.4,
|
| 10966 |
+
"completions/min_terminated_length": 15.4,
|
| 10967 |
+
"epoch": 0.2901540154015402,
|
| 10968 |
+
"frac_reward_zero_std": 1.0,
|
| 10969 |
+
"grad_norm": 0.0,
|
| 10970 |
+
"kl": 1.3109652653336525,
|
| 10971 |
+
"learning_rate": 4.4697436254116876e-06,
|
| 10972 |
+
"loss": 0.0,
|
| 10973 |
+
"num_tokens": 6024628.0,
|
| 10974 |
+
"reward": 4.099999904632568,
|
| 10975 |
+
"reward_std": 0.0,
|
| 10976 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 10977 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 10978 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 10979 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 10980 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 10981 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 10982 |
+
"step": 4220
|
| 10983 |
+
},
|
| 10984 |
+
{
|
| 10985 |
+
"completion_length": 17.3,
|
| 10986 |
+
"completions/clipped_ratio": 0.0,
|
| 10987 |
+
"completions/max_length": 17.3,
|
| 10988 |
+
"completions/max_terminated_length": 17.3,
|
| 10989 |
+
"completions/mean_length": 15.65,
|
| 10990 |
+
"completions/mean_terminated_length": 15.65,
|
| 10991 |
+
"completions/min_length": 14.9,
|
| 10992 |
+
"completions/min_terminated_length": 14.9,
|
| 10993 |
+
"epoch": 0.2908415841584158,
|
| 10994 |
+
"frac_reward_zero_std": 1.0,
|
| 10995 |
+
"grad_norm": 0.0,
|
| 10996 |
+
"kl": 1.1730713717639447,
|
| 10997 |
+
"learning_rate": 4.4660428427975614e-06,
|
| 10998 |
+
"loss": 0.0,
|
| 10999 |
+
"num_tokens": 6039174.0,
|
| 11000 |
+
"reward": 4.099999904632568,
|
| 11001 |
+
"reward_std": 0.0,
|
| 11002 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 11003 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 11004 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 11005 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 11006 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 11007 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 11008 |
+
"step": 4230
|
| 11009 |
+
},
|
| 11010 |
+
{
|
| 11011 |
+
"completion_length": 19.4,
|
| 11012 |
+
"completions/clipped_ratio": 0.0,
|
| 11013 |
+
"completions/max_length": 19.4,
|
| 11014 |
+
"completions/max_terminated_length": 19.4,
|
| 11015 |
+
"completions/mean_length": 16.725,
|
| 11016 |
+
"completions/mean_terminated_length": 16.725,
|
| 11017 |
+
"completions/min_length": 15.2,
|
| 11018 |
+
"completions/min_terminated_length": 15.2,
|
| 11019 |
+
"epoch": 0.2915291529152915,
|
| 11020 |
+
"frac_reward_zero_std": 1.0,
|
| 11021 |
+
"grad_norm": 0.0,
|
| 11022 |
+
"kl": 1.115500158071518,
|
| 11023 |
+
"learning_rate": 4.462330734104633e-06,
|
| 11024 |
+
"loss": 0.0,
|
| 11025 |
+
"num_tokens": 6052447.0,
|
| 11026 |
+
"reward": 4.099999904632568,
|
| 11027 |
+
"reward_std": 0.0,
|
| 11028 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 11029 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 11030 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 11031 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 11032 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 11033 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 11034 |
+
"step": 4240
|
| 11035 |
+
},
|
| 11036 |
+
{
|
| 11037 |
+
"completion_length": 20.7,
|
| 11038 |
+
"completions/clipped_ratio": 0.0,
|
| 11039 |
+
"completions/max_length": 20.7,
|
| 11040 |
+
"completions/max_terminated_length": 20.7,
|
| 11041 |
+
"completions/mean_length": 17.95,
|
| 11042 |
+
"completions/mean_terminated_length": 17.95,
|
| 11043 |
+
"completions/min_length": 16.0,
|
| 11044 |
+
"completions/min_terminated_length": 16.0,
|
| 11045 |
+
"epoch": 0.2922167216721672,
|
| 11046 |
+
"frac_reward_zero_std": 1.0,
|
| 11047 |
+
"grad_norm": 0.0,
|
| 11048 |
+
"kl": 1.2464444026350976,
|
| 11049 |
+
"learning_rate": 4.458607320717805e-06,
|
| 11050 |
+
"loss": 0.0,
|
| 11051 |
+
"num_tokens": 6066089.0,
|
| 11052 |
+
"reward": 4.099999904632568,
|
| 11053 |
+
"reward_std": 0.0,
|
| 11054 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 11055 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 11056 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 11057 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 11058 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 11059 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 11060 |
+
"step": 4250
|
| 11061 |
+
},
|
| 11062 |
+
{
|
| 11063 |
+
"completion_length": 19.6,
|
| 11064 |
+
"completions/clipped_ratio": 0.0,
|
| 11065 |
+
"completions/max_length": 19.6,
|
| 11066 |
+
"completions/max_terminated_length": 19.6,
|
| 11067 |
+
"completions/mean_length": 17.95,
|
| 11068 |
+
"completions/mean_terminated_length": 17.95,
|
| 11069 |
+
"completions/min_length": 15.7,
|
| 11070 |
+
"completions/min_terminated_length": 15.7,
|
| 11071 |
+
"epoch": 0.2929042904290429,
|
| 11072 |
+
"frac_reward_zero_std": 1.0,
|
| 11073 |
+
"grad_norm": 0.0,
|
| 11074 |
+
"kl": 1.0488379423040897,
|
| 11075 |
+
"learning_rate": 4.454872624087105e-06,
|
| 11076 |
+
"loss": 0.0,
|
| 11077 |
+
"num_tokens": 6079355.0,
|
| 11078 |
+
"reward": 4.099999904632568,
|
| 11079 |
+
"reward_std": 0.0,
|
| 11080 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 11081 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 11082 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 11083 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 11084 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 11085 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 11086 |
+
"step": 4260
|
| 11087 |
+
},
|
| 11088 |
+
{
|
| 11089 |
+
"completion_length": 19.2,
|
| 11090 |
+
"completions/clipped_ratio": 0.0,
|
| 11091 |
+
"completions/max_length": 19.2,
|
| 11092 |
+
"completions/max_terminated_length": 19.2,
|
| 11093 |
+
"completions/mean_length": 17.55,
|
| 11094 |
+
"completions/mean_terminated_length": 17.55,
|
| 11095 |
+
"completions/min_length": 15.9,
|
| 11096 |
+
"completions/min_terminated_length": 15.9,
|
| 11097 |
+
"epoch": 0.29359185918591857,
|
| 11098 |
+
"frac_reward_zero_std": 1.0,
|
| 11099 |
+
"grad_norm": 0.0,
|
| 11100 |
+
"kl": 1.088923167437315,
|
| 11101 |
+
"learning_rate": 4.4511266657275624e-06,
|
| 11102 |
+
"loss": 0.0,
|
| 11103 |
+
"num_tokens": 6094181.0,
|
| 11104 |
+
"reward": 4.099999904632568,
|
| 11105 |
+
"reward_std": 0.0,
|
| 11106 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 11107 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 11108 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 11109 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 11110 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 11111 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 11112 |
+
"step": 4270
|
| 11113 |
+
},
|
| 11114 |
+
{
|
| 11115 |
+
"completion_length": 20.0,
|
| 11116 |
+
"completions/clipped_ratio": 0.0,
|
| 11117 |
+
"completions/max_length": 20.0,
|
| 11118 |
+
"completions/max_terminated_length": 20.0,
|
| 11119 |
+
"completions/mean_length": 18.475,
|
| 11120 |
+
"completions/mean_terminated_length": 18.475,
|
| 11121 |
+
"completions/min_length": 17.2,
|
| 11122 |
+
"completions/min_terminated_length": 17.2,
|
| 11123 |
+
"epoch": 0.29427942794279427,
|
| 11124 |
+
"frac_reward_zero_std": 1.0,
|
| 11125 |
+
"grad_norm": 0.0,
|
| 11126 |
+
"kl": 1.1366772107779979,
|
| 11127 |
+
"learning_rate": 4.447369467219081e-06,
|
| 11128 |
+
"loss": 0.0,
|
| 11129 |
+
"num_tokens": 6107348.0,
|
| 11130 |
+
"reward": 4.099999904632568,
|
| 11131 |
+
"reward_std": 0.0,
|
| 11132 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 11133 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 11134 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 11135 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 11136 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 11137 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 11138 |
+
"step": 4280
|
| 11139 |
+
},
|
| 11140 |
+
{
|
| 11141 |
+
"completion_length": 18.1,
|
| 11142 |
+
"completions/clipped_ratio": 0.0,
|
| 11143 |
+
"completions/max_length": 18.1,
|
| 11144 |
+
"completions/max_terminated_length": 18.1,
|
| 11145 |
+
"completions/mean_length": 16.125,
|
| 11146 |
+
"completions/mean_terminated_length": 16.125,
|
| 11147 |
+
"completions/min_length": 14.6,
|
| 11148 |
+
"completions/min_terminated_length": 14.6,
|
| 11149 |
+
"epoch": 0.29496699669966997,
|
| 11150 |
+
"frac_reward_zero_std": 1.0,
|
| 11151 |
+
"grad_norm": 0.0,
|
| 11152 |
+
"kl": 1.187676628679037,
|
| 11153 |
+
"learning_rate": 4.443601050206322e-06,
|
| 11154 |
+
"loss": 0.0,
|
| 11155 |
+
"num_tokens": 6120793.0,
|
| 11156 |
+
"reward": 4.099999904632568,
|
| 11157 |
+
"reward_std": 0.0,
|
| 11158 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 11159 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 11160 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 11161 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 11162 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 11163 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 11164 |
+
"step": 4290
|
| 11165 |
+
},
|
| 11166 |
+
{
|
| 11167 |
+
"completion_length": 16.8,
|
| 11168 |
+
"completions/clipped_ratio": 0.0,
|
| 11169 |
+
"completions/max_length": 16.8,
|
| 11170 |
+
"completions/max_terminated_length": 16.8,
|
| 11171 |
+
"completions/mean_length": 15.1,
|
| 11172 |
+
"completions/mean_terminated_length": 15.1,
|
| 11173 |
+
"completions/min_length": 14.1,
|
| 11174 |
+
"completions/min_terminated_length": 14.1,
|
| 11175 |
+
"epoch": 0.29565456545654567,
|
| 11176 |
+
"frac_reward_zero_std": 1.0,
|
| 11177 |
+
"grad_norm": 0.0,
|
| 11178 |
+
"kl": 1.3925855614244937,
|
| 11179 |
+
"learning_rate": 4.439821436398573e-06,
|
| 11180 |
+
"loss": 0.0,
|
| 11181 |
+
"num_tokens": 6132273.0,
|
| 11182 |
+
"reward": 4.099999904632568,
|
| 11183 |
+
"reward_std": 0.0,
|
| 11184 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 11185 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 11186 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 11187 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 11188 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 11189 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 11190 |
+
"step": 4300
|
| 11191 |
+
},
|
| 11192 |
+
{
|
| 11193 |
+
"completion_length": 17.7,
|
| 11194 |
+
"completions/clipped_ratio": 0.0,
|
| 11195 |
+
"completions/max_length": 17.7,
|
| 11196 |
+
"completions/max_terminated_length": 17.7,
|
| 11197 |
+
"completions/mean_length": 16.8,
|
| 11198 |
+
"completions/mean_terminated_length": 16.8,
|
| 11199 |
+
"completions/min_length": 15.7,
|
| 11200 |
+
"completions/min_terminated_length": 15.7,
|
| 11201 |
+
"epoch": 0.2963421342134213,
|
| 11202 |
+
"frac_reward_zero_std": 1.0,
|
| 11203 |
+
"grad_norm": 0.0,
|
| 11204 |
+
"kl": 1.3798075836151837,
|
| 11205 |
+
"learning_rate": 4.436030647569621e-06,
|
| 11206 |
+
"loss": 0.0,
|
| 11207 |
+
"num_tokens": 6147289.0,
|
| 11208 |
+
"reward": 4.099999904632568,
|
| 11209 |
+
"reward_std": 0.0,
|
| 11210 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 11211 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 11212 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 11213 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 11214 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 11215 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 11216 |
+
"step": 4310
|
| 11217 |
+
},
|
| 11218 |
+
{
|
| 11219 |
+
"completion_length": 19.7,
|
| 11220 |
+
"completions/clipped_ratio": 0.0,
|
| 11221 |
+
"completions/max_length": 19.7,
|
| 11222 |
+
"completions/max_terminated_length": 19.7,
|
| 11223 |
+
"completions/mean_length": 17.325,
|
| 11224 |
+
"completions/mean_terminated_length": 17.325,
|
| 11225 |
+
"completions/min_length": 15.9,
|
| 11226 |
+
"completions/min_terminated_length": 15.9,
|
| 11227 |
+
"epoch": 0.297029702970297,
|
| 11228 |
+
"frac_reward_zero_std": 1.0,
|
| 11229 |
+
"grad_norm": 0.0,
|
| 11230 |
+
"kl": 1.1426802188158036,
|
| 11231 |
+
"learning_rate": 4.432228705557634e-06,
|
| 11232 |
+
"loss": 0.0,
|
| 11233 |
+
"num_tokens": 6161754.0,
|
| 11234 |
+
"reward": 4.099999904632568,
|
| 11235 |
+
"reward_std": 0.0,
|
| 11236 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 11237 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 11238 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 11239 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 11240 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 11241 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 11242 |
+
"step": 4320
|
| 11243 |
+
},
|
| 11244 |
+
{
|
| 11245 |
+
"completion_length": 19.7,
|
| 11246 |
+
"completions/clipped_ratio": 0.0,
|
| 11247 |
+
"completions/max_length": 19.7,
|
| 11248 |
+
"completions/max_terminated_length": 19.7,
|
| 11249 |
+
"completions/mean_length": 17.5,
|
| 11250 |
+
"completions/mean_terminated_length": 17.5,
|
| 11251 |
+
"completions/min_length": 16.2,
|
| 11252 |
+
"completions/min_terminated_length": 16.2,
|
| 11253 |
+
"epoch": 0.2977172717271727,
|
| 11254 |
+
"frac_reward_zero_std": 1.0,
|
| 11255 |
+
"grad_norm": 0.0,
|
| 11256 |
+
"kl": 1.0807349354028701,
|
| 11257 |
+
"learning_rate": 4.428415632265033e-06,
|
| 11258 |
+
"loss": 0.0,
|
| 11259 |
+
"num_tokens": 6174450.0,
|
| 11260 |
+
"reward": 4.099999904632568,
|
| 11261 |
+
"reward_std": 0.0,
|
| 11262 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 11263 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 11264 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 11265 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 11266 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 11267 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 11268 |
+
"step": 4330
|
| 11269 |
+
},
|
| 11270 |
+
{
|
| 11271 |
+
"completion_length": 17.7,
|
| 11272 |
+
"completions/clipped_ratio": 0.0,
|
| 11273 |
+
"completions/max_length": 17.7,
|
| 11274 |
+
"completions/max_terminated_length": 17.7,
|
| 11275 |
+
"completions/mean_length": 16.4,
|
| 11276 |
+
"completions/mean_terminated_length": 16.4,
|
| 11277 |
+
"completions/min_length": 15.0,
|
| 11278 |
+
"completions/min_terminated_length": 15.0,
|
| 11279 |
+
"epoch": 0.2984048404840484,
|
| 11280 |
+
"frac_reward_zero_std": 1.0,
|
| 11281 |
+
"grad_norm": 0.0,
|
| 11282 |
+
"kl": 1.0977762714028358,
|
| 11283 |
+
"learning_rate": 4.424591449658362e-06,
|
| 11284 |
+
"loss": 0.0,
|
| 11285 |
+
"num_tokens": 6188958.0,
|
| 11286 |
+
"reward": 4.099999904632568,
|
| 11287 |
+
"reward_std": 0.0,
|
| 11288 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 11289 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 11290 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 11291 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 11292 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 11293 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 11294 |
+
"step": 4340
|
| 11295 |
+
},
|
| 11296 |
+
{
|
| 11297 |
+
"completion_length": 18.9,
|
| 11298 |
+
"completions/clipped_ratio": 0.0,
|
| 11299 |
+
"completions/max_length": 18.9,
|
| 11300 |
+
"completions/max_terminated_length": 18.9,
|
| 11301 |
+
"completions/mean_length": 17.325,
|
| 11302 |
+
"completions/mean_terminated_length": 17.325,
|
| 11303 |
+
"completions/min_length": 15.9,
|
| 11304 |
+
"completions/min_terminated_length": 15.9,
|
| 11305 |
+
"epoch": 0.2990924092409241,
|
| 11306 |
+
"frac_reward_zero_std": 1.0,
|
| 11307 |
+
"grad_norm": 0.0,
|
| 11308 |
+
"kl": 1.0881227478384972,
|
| 11309 |
+
"learning_rate": 4.420756179768165e-06,
|
| 11310 |
+
"loss": 0.0,
|
| 11311 |
+
"num_tokens": 6204739.0,
|
| 11312 |
+
"reward": 4.099999904632568,
|
| 11313 |
+
"reward_std": 0.0,
|
| 11314 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 11315 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 11316 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 11317 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 11318 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 11319 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 11320 |
+
"step": 4350
|
| 11321 |
+
},
|
| 11322 |
+
{
|
| 11323 |
+
"completion_length": 18.7,
|
| 11324 |
+
"completions/clipped_ratio": 0.0,
|
| 11325 |
+
"completions/max_length": 18.7,
|
| 11326 |
+
"completions/max_terminated_length": 18.7,
|
| 11327 |
+
"completions/mean_length": 18.075,
|
| 11328 |
+
"completions/mean_terminated_length": 18.075,
|
| 11329 |
+
"completions/min_length": 17.5,
|
| 11330 |
+
"completions/min_terminated_length": 17.5,
|
| 11331 |
+
"epoch": 0.29977997799779976,
|
| 11332 |
+
"frac_reward_zero_std": 1.0,
|
| 11333 |
+
"grad_norm": 0.0,
|
| 11334 |
+
"kl": 1.1955605536699294,
|
| 11335 |
+
"learning_rate": 4.4169098446888594e-06,
|
| 11336 |
+
"loss": 0.0,
|
| 11337 |
+
"num_tokens": 6220802.0,
|
| 11338 |
+
"reward": 4.099999904632568,
|
| 11339 |
+
"reward_std": 0.0,
|
| 11340 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 11341 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 11342 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 11343 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 11344 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 11345 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 11346 |
+
"step": 4360
|
| 11347 |
+
},
|
| 11348 |
+
{
|
| 11349 |
+
"completion_length": 19.5,
|
| 11350 |
+
"completions/clipped_ratio": 0.0,
|
| 11351 |
+
"completions/max_length": 19.5,
|
| 11352 |
+
"completions/max_terminated_length": 19.5,
|
| 11353 |
+
"completions/mean_length": 16.85,
|
| 11354 |
+
"completions/mean_terminated_length": 16.85,
|
| 11355 |
+
"completions/min_length": 15.2,
|
| 11356 |
+
"completions/min_terminated_length": 15.2,
|
| 11357 |
+
"epoch": 0.30046754675467546,
|
| 11358 |
+
"frac_reward_zero_std": 1.0,
|
| 11359 |
+
"grad_norm": 0.0,
|
| 11360 |
+
"kl": 1.0562300879508257,
|
| 11361 |
+
"learning_rate": 4.413052466578605e-06,
|
| 11362 |
+
"loss": 0.0,
|
| 11363 |
+
"num_tokens": 6235288.0,
|
| 11364 |
+
"reward": 4.099999904632568,
|
| 11365 |
+
"reward_std": 0.0,
|
| 11366 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 11367 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 11368 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 11369 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 11370 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 11371 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 11372 |
+
"step": 4370
|
| 11373 |
+
},
|
| 11374 |
+
{
|
| 11375 |
+
"completion_length": 18.6,
|
| 11376 |
+
"completions/clipped_ratio": 0.0,
|
| 11377 |
+
"completions/max_length": 18.6,
|
| 11378 |
+
"completions/max_terminated_length": 18.6,
|
| 11379 |
+
"completions/mean_length": 16.625,
|
| 11380 |
+
"completions/mean_terminated_length": 16.625,
|
| 11381 |
+
"completions/min_length": 15.4,
|
| 11382 |
+
"completions/min_terminated_length": 15.4,
|
| 11383 |
+
"epoch": 0.30115511551155116,
|
| 11384 |
+
"frac_reward_zero_std": 1.0,
|
| 11385 |
+
"grad_norm": 0.0,
|
| 11386 |
+
"kl": 1.2873560920357705,
|
| 11387 |
+
"learning_rate": 4.409184067659181e-06,
|
| 11388 |
+
"loss": 0.0,
|
| 11389 |
+
"num_tokens": 6251829.0,
|
| 11390 |
+
"reward": 4.099999904632568,
|
| 11391 |
+
"reward_std": 0.0,
|
| 11392 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 11393 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 11394 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 11395 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 11396 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 11397 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 11398 |
+
"step": 4380
|
| 11399 |
+
},
|
| 11400 |
+
{
|
| 11401 |
+
"completion_length": 17.3,
|
| 11402 |
+
"completions/clipped_ratio": 0.0,
|
| 11403 |
+
"completions/max_length": 17.3,
|
| 11404 |
+
"completions/max_terminated_length": 17.3,
|
| 11405 |
+
"completions/mean_length": 16.25,
|
| 11406 |
+
"completions/mean_terminated_length": 16.25,
|
| 11407 |
+
"completions/min_length": 15.3,
|
| 11408 |
+
"completions/min_terminated_length": 15.3,
|
| 11409 |
+
"epoch": 0.30184268426842686,
|
| 11410 |
+
"frac_reward_zero_std": 1.0,
|
| 11411 |
+
"grad_norm": 0.0,
|
| 11412 |
+
"kl": 1.285911639779806,
|
| 11413 |
+
"learning_rate": 4.4053046702158555e-06,
|
| 11414 |
+
"loss": 0.0,
|
| 11415 |
+
"num_tokens": 6267491.0,
|
| 11416 |
+
"reward": 4.099999904632568,
|
| 11417 |
+
"reward_std": 0.0,
|
| 11418 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 11419 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 11420 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 11421 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 11422 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 11423 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 11424 |
+
"step": 4390
|
| 11425 |
+
},
|
| 11426 |
+
{
|
| 11427 |
+
"completion_length": 22.0,
|
| 11428 |
+
"completions/clipped_ratio": 0.0,
|
| 11429 |
+
"completions/max_length": 22.0,
|
| 11430 |
+
"completions/max_terminated_length": 22.0,
|
| 11431 |
+
"completions/mean_length": 20.075,
|
| 11432 |
+
"completions/mean_terminated_length": 20.075,
|
| 11433 |
+
"completions/min_length": 17.9,
|
| 11434 |
+
"completions/min_terminated_length": 17.9,
|
| 11435 |
+
"epoch": 0.3025302530253025,
|
| 11436 |
+
"frac_reward_zero_std": 1.0,
|
| 11437 |
+
"grad_norm": 0.0,
|
| 11438 |
+
"kl": 0.6993859726935625,
|
| 11439 |
+
"learning_rate": 4.401414296597256e-06,
|
| 11440 |
+
"loss": 0.0,
|
| 11441 |
+
"num_tokens": 6280378.0,
|
| 11442 |
+
"reward": 4.099999904632568,
|
| 11443 |
+
"reward_std": 0.0,
|
| 11444 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 11445 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 11446 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 11447 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 11448 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 11449 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 11450 |
+
"step": 4400
|
| 11451 |
+
},
|
| 11452 |
+
{
|
| 11453 |
+
"completion_length": 22.1,
|
| 11454 |
+
"completions/clipped_ratio": 0.0,
|
| 11455 |
+
"completions/max_length": 22.1,
|
| 11456 |
+
"completions/max_terminated_length": 22.1,
|
| 11457 |
+
"completions/mean_length": 18.675,
|
| 11458 |
+
"completions/mean_terminated_length": 18.675,
|
| 11459 |
+
"completions/min_length": 15.9,
|
| 11460 |
+
"completions/min_terminated_length": 15.9,
|
| 11461 |
+
"epoch": 0.3032178217821782,
|
| 11462 |
+
"frac_reward_zero_std": 1.0,
|
| 11463 |
+
"grad_norm": 0.0,
|
| 11464 |
+
"kl": 1.113627065718174,
|
| 11465 |
+
"learning_rate": 4.397512969215243e-06,
|
| 11466 |
+
"loss": 0.0,
|
| 11467 |
+
"num_tokens": 6295541.0,
|
| 11468 |
+
"reward": 4.099999904632568,
|
| 11469 |
+
"reward_std": 0.0,
|
| 11470 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 11471 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 11472 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 11473 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 11474 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 11475 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 11476 |
+
"step": 4410
|
| 11477 |
+
},
|
| 11478 |
+
{
|
| 11479 |
+
"completion_length": 17.1,
|
| 11480 |
+
"completions/clipped_ratio": 0.0,
|
| 11481 |
+
"completions/max_length": 17.1,
|
| 11482 |
+
"completions/max_terminated_length": 17.1,
|
| 11483 |
+
"completions/mean_length": 15.95,
|
| 11484 |
+
"completions/mean_terminated_length": 15.95,
|
| 11485 |
+
"completions/min_length": 14.7,
|
| 11486 |
+
"completions/min_terminated_length": 14.7,
|
| 11487 |
+
"epoch": 0.3039053905390539,
|
| 11488 |
+
"frac_reward_zero_std": 1.0,
|
| 11489 |
+
"grad_norm": 0.0,
|
| 11490 |
+
"kl": 1.4492276966571809,
|
| 11491 |
+
"learning_rate": 4.393600710544781e-06,
|
| 11492 |
+
"loss": 0.0,
|
| 11493 |
+
"num_tokens": 6311123.0,
|
| 11494 |
+
"reward": 4.099999904632568,
|
| 11495 |
+
"reward_std": 0.0,
|
| 11496 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 11497 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 11498 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 11499 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 11500 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 11501 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 11502 |
+
"step": 4420
|
| 11503 |
+
},
|
| 11504 |
+
{
|
| 11505 |
+
"completion_length": 19.7,
|
| 11506 |
+
"completions/clipped_ratio": 0.0,
|
| 11507 |
+
"completions/max_length": 19.7,
|
| 11508 |
+
"completions/max_terminated_length": 19.7,
|
| 11509 |
+
"completions/mean_length": 17.675,
|
| 11510 |
+
"completions/mean_terminated_length": 17.675,
|
| 11511 |
+
"completions/min_length": 15.9,
|
| 11512 |
+
"completions/min_terminated_length": 15.9,
|
| 11513 |
+
"epoch": 0.3045929592959296,
|
| 11514 |
+
"frac_reward_zero_std": 1.0,
|
| 11515 |
+
"grad_norm": 0.0,
|
| 11516 |
+
"kl": 1.1876488611102105,
|
| 11517 |
+
"learning_rate": 4.389677543123807e-06,
|
| 11518 |
+
"loss": 0.0,
|
| 11519 |
+
"num_tokens": 6325982.0,
|
| 11520 |
+
"reward": 4.099999904632568,
|
| 11521 |
+
"reward_std": 0.0,
|
| 11522 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 11523 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 11524 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 11525 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 11526 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 11527 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 11528 |
+
"step": 4430
|
| 11529 |
+
},
|
| 11530 |
+
{
|
| 11531 |
+
"completion_length": 19.8,
|
| 11532 |
+
"completions/clipped_ratio": 0.0,
|
| 11533 |
+
"completions/max_length": 19.8,
|
| 11534 |
+
"completions/max_terminated_length": 19.8,
|
| 11535 |
+
"completions/mean_length": 18.125,
|
| 11536 |
+
"completions/mean_terminated_length": 18.125,
|
| 11537 |
+
"completions/min_length": 15.7,
|
| 11538 |
+
"completions/min_terminated_length": 15.7,
|
| 11539 |
+
"epoch": 0.30528052805280526,
|
| 11540 |
+
"frac_reward_zero_std": 1.0,
|
| 11541 |
+
"grad_norm": 0.0,
|
| 11542 |
+
"kl": 1.5164424151182174,
|
| 11543 |
+
"learning_rate": 4.385743489553101e-06,
|
| 11544 |
+
"loss": 0.0001,
|
| 11545 |
+
"num_tokens": 6340319.0,
|
| 11546 |
+
"reward": 4.099999904632568,
|
| 11547 |
+
"reward_std": 0.0,
|
| 11548 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 11549 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 11550 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 11551 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 11552 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 11553 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 11554 |
+
"step": 4440
|
| 11555 |
+
},
|
| 11556 |
+
{
|
| 11557 |
+
"completion_length": 21.0,
|
| 11558 |
+
"completions/clipped_ratio": 0.0,
|
| 11559 |
+
"completions/max_length": 21.0,
|
| 11560 |
+
"completions/max_terminated_length": 21.0,
|
| 11561 |
+
"completions/mean_length": 18.3,
|
| 11562 |
+
"completions/mean_terminated_length": 18.3,
|
| 11563 |
+
"completions/min_length": 17.3,
|
| 11564 |
+
"completions/min_terminated_length": 17.3,
|
| 11565 |
+
"epoch": 0.30596809680968096,
|
| 11566 |
+
"frac_reward_zero_std": 1.0,
|
| 11567 |
+
"grad_norm": 0.0,
|
| 11568 |
+
"kl": 1.173893976211548,
|
| 11569 |
+
"learning_rate": 4.3817985724961585e-06,
|
| 11570 |
+
"loss": 0.0,
|
| 11571 |
+
"num_tokens": 6356051.0,
|
| 11572 |
+
"reward": 4.099999904632568,
|
| 11573 |
+
"reward_std": 0.0,
|
| 11574 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 11575 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 11576 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 11577 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 11578 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 11579 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 11580 |
+
"step": 4450
|
| 11581 |
+
},
|
| 11582 |
+
{
|
| 11583 |
+
"completion_length": 18.4,
|
| 11584 |
+
"completions/clipped_ratio": 0.0,
|
| 11585 |
+
"completions/max_length": 18.4,
|
| 11586 |
+
"completions/max_terminated_length": 18.4,
|
| 11587 |
+
"completions/mean_length": 16.725,
|
| 11588 |
+
"completions/mean_terminated_length": 16.725,
|
| 11589 |
+
"completions/min_length": 15.3,
|
| 11590 |
+
"completions/min_terminated_length": 15.3,
|
| 11591 |
+
"epoch": 0.30665566556655666,
|
| 11592 |
+
"frac_reward_zero_std": 1.0,
|
| 11593 |
+
"grad_norm": 0.0,
|
| 11594 |
+
"kl": 1.21855476051569,
|
| 11595 |
+
"learning_rate": 4.3778428146790565e-06,
|
| 11596 |
+
"loss": 0.0,
|
| 11597 |
+
"num_tokens": 6368848.0,
|
| 11598 |
+
"reward": 4.099999904632568,
|
| 11599 |
+
"reward_std": 0.0,
|
| 11600 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 11601 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 11602 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 11603 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 11604 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 11605 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 11606 |
+
"step": 4460
|
| 11607 |
+
},
|
| 11608 |
+
{
|
| 11609 |
+
"completion_length": 17.8,
|
| 11610 |
+
"completions/clipped_ratio": 0.0,
|
| 11611 |
+
"completions/max_length": 17.8,
|
| 11612 |
+
"completions/max_terminated_length": 17.8,
|
| 11613 |
+
"completions/mean_length": 16.675,
|
| 11614 |
+
"completions/mean_terminated_length": 16.675,
|
| 11615 |
+
"completions/min_length": 15.7,
|
| 11616 |
+
"completions/min_terminated_length": 15.7,
|
| 11617 |
+
"epoch": 0.30734323432343236,
|
| 11618 |
+
"frac_reward_zero_std": 1.0,
|
| 11619 |
+
"grad_norm": 0.0,
|
| 11620 |
+
"kl": 1.3003081649541854,
|
| 11621 |
+
"learning_rate": 4.373876238890322e-06,
|
| 11622 |
+
"loss": 0.0,
|
| 11623 |
+
"num_tokens": 6385515.0,
|
| 11624 |
+
"reward": 4.099999904632568,
|
| 11625 |
+
"reward_std": 0.0,
|
| 11626 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 11627 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 11628 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 11629 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 11630 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 11631 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 11632 |
+
"step": 4470
|
| 11633 |
+
},
|
| 11634 |
+
{
|
| 11635 |
+
"completion_length": 18.7,
|
| 11636 |
+
"completions/clipped_ratio": 0.0,
|
| 11637 |
+
"completions/max_length": 18.7,
|
| 11638 |
+
"completions/max_terminated_length": 18.7,
|
| 11639 |
+
"completions/mean_length": 16.4,
|
| 11640 |
+
"completions/mean_terminated_length": 16.4,
|
| 11641 |
+
"completions/min_length": 14.7,
|
| 11642 |
+
"completions/min_terminated_length": 14.7,
|
| 11643 |
+
"epoch": 0.30803080308030806,
|
| 11644 |
+
"frac_reward_zero_std": 1.0,
|
| 11645 |
+
"grad_norm": 0.0,
|
| 11646 |
+
"kl": 1.3788954310119153,
|
| 11647 |
+
"learning_rate": 4.369898867980809e-06,
|
| 11648 |
+
"loss": 0.0,
|
| 11649 |
+
"num_tokens": 6400035.0,
|
| 11650 |
+
"reward": 4.099999904632568,
|
| 11651 |
+
"reward_std": 0.0,
|
| 11652 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 11653 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 11654 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 11655 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 11656 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 11657 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 11658 |
+
"step": 4480
|
| 11659 |
+
},
|
| 11660 |
+
{
|
| 11661 |
+
"completion_length": 19.5,
|
| 11662 |
+
"completions/clipped_ratio": 0.0,
|
| 11663 |
+
"completions/max_length": 19.5,
|
| 11664 |
+
"completions/max_terminated_length": 19.5,
|
| 11665 |
+
"completions/mean_length": 16.75,
|
| 11666 |
+
"completions/mean_terminated_length": 16.75,
|
| 11667 |
+
"completions/min_length": 15.0,
|
| 11668 |
+
"completions/min_terminated_length": 15.0,
|
| 11669 |
+
"epoch": 0.3087183718371837,
|
| 11670 |
+
"frac_reward_zero_std": 1.0,
|
| 11671 |
+
"grad_norm": 0.0,
|
| 11672 |
+
"kl": 1.0902920335531234,
|
| 11673 |
+
"learning_rate": 4.365910724863554e-06,
|
| 11674 |
+
"loss": 0.0,
|
| 11675 |
+
"num_tokens": 6415169.0,
|
| 11676 |
+
"reward": 4.099999904632568,
|
| 11677 |
+
"reward_std": 0.0,
|
| 11678 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 11679 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 11680 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 11681 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 11682 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 11683 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 11684 |
+
"step": 4490
|
| 11685 |
+
},
|
| 11686 |
+
{
|
| 11687 |
+
"completion_length": 18.5,
|
| 11688 |
+
"completions/clipped_ratio": 0.0,
|
| 11689 |
+
"completions/max_length": 18.5,
|
| 11690 |
+
"completions/max_terminated_length": 18.5,
|
| 11691 |
+
"completions/mean_length": 16.9,
|
| 11692 |
+
"completions/mean_terminated_length": 16.9,
|
| 11693 |
+
"completions/min_length": 15.3,
|
| 11694 |
+
"completions/min_terminated_length": 15.3,
|
| 11695 |
+
"epoch": 0.3094059405940594,
|
| 11696 |
+
"frac_reward_zero_std": 1.0,
|
| 11697 |
+
"grad_norm": 0.0,
|
| 11698 |
+
"kl": 1.2483339451253415,
|
| 11699 |
+
"learning_rate": 4.361911832513652e-06,
|
| 11700 |
+
"loss": 0.0,
|
| 11701 |
+
"num_tokens": 6427085.0,
|
| 11702 |
+
"reward": 4.099999904632568,
|
| 11703 |
+
"reward_std": 0.0,
|
| 11704 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 11705 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 11706 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 11707 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 11708 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 11709 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 11710 |
+
"step": 4500
|
| 11711 |
+
},
|
| 11712 |
+
{
|
| 11713 |
+
"completion_length": 19.3,
|
| 11714 |
+
"completions/clipped_ratio": 0.0,
|
| 11715 |
+
"completions/max_length": 19.3,
|
| 11716 |
+
"completions/max_terminated_length": 19.3,
|
| 11717 |
+
"completions/mean_length": 17.725,
|
| 11718 |
+
"completions/mean_terminated_length": 17.725,
|
| 11719 |
+
"completions/min_length": 15.9,
|
| 11720 |
+
"completions/min_terminated_length": 15.9,
|
| 11721 |
+
"epoch": 0.3100935093509351,
|
| 11722 |
+
"frac_reward_zero_std": 1.0,
|
| 11723 |
+
"grad_norm": 0.0,
|
| 11724 |
+
"kl": 1.2941459499299526,
|
| 11725 |
+
"learning_rate": 4.357902213968126e-06,
|
| 11726 |
+
"loss": 0.0,
|
| 11727 |
+
"num_tokens": 6442638.0,
|
| 11728 |
+
"reward": 4.099999904632568,
|
| 11729 |
+
"reward_std": 0.0,
|
| 11730 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 11731 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 11732 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 11733 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 11734 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 11735 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 11736 |
+
"step": 4510
|
| 11737 |
+
},
|
| 11738 |
+
{
|
| 11739 |
+
"completion_length": 18.5,
|
| 11740 |
+
"completions/clipped_ratio": 0.0,
|
| 11741 |
+
"completions/max_length": 18.5,
|
| 11742 |
+
"completions/max_terminated_length": 18.5,
|
| 11743 |
+
"completions/mean_length": 17.2,
|
| 11744 |
+
"completions/mean_terminated_length": 17.2,
|
| 11745 |
+
"completions/min_length": 16.1,
|
| 11746 |
+
"completions/min_terminated_length": 16.1,
|
| 11747 |
+
"epoch": 0.3107810781078108,
|
| 11748 |
+
"frac_reward_zero_std": 1.0,
|
| 11749 |
+
"grad_norm": 0.0,
|
| 11750 |
+
"kl": 1.2603263229131698,
|
| 11751 |
+
"learning_rate": 4.353881892325787e-06,
|
| 11752 |
+
"loss": 0.0,
|
| 11753 |
+
"num_tokens": 6456242.0,
|
| 11754 |
+
"reward": 4.099999904632568,
|
| 11755 |
+
"reward_std": 0.0,
|
| 11756 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 11757 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 11758 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 11759 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 11760 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 11761 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 11762 |
+
"step": 4520
|
| 11763 |
+
},
|
| 11764 |
+
{
|
| 11765 |
+
"completion_length": 19.4,
|
| 11766 |
+
"completions/clipped_ratio": 0.0,
|
| 11767 |
+
"completions/max_length": 19.4,
|
| 11768 |
+
"completions/max_terminated_length": 19.4,
|
| 11769 |
+
"completions/mean_length": 17.425,
|
| 11770 |
+
"completions/mean_terminated_length": 17.425,
|
| 11771 |
+
"completions/min_length": 15.7,
|
| 11772 |
+
"completions/min_terminated_length": 15.7,
|
| 11773 |
+
"epoch": 0.31146864686468645,
|
| 11774 |
+
"frac_reward_zero_std": 1.0,
|
| 11775 |
+
"grad_norm": 0.0,
|
| 11776 |
+
"kl": 0.9698817508295179,
|
| 11777 |
+
"learning_rate": 4.349850890747109e-06,
|
| 11778 |
+
"loss": 0.0,
|
| 11779 |
+
"num_tokens": 6470143.0,
|
| 11780 |
+
"reward": 4.099999904632568,
|
| 11781 |
+
"reward_std": 0.0,
|
| 11782 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 11783 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 11784 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 11785 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 11786 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 11787 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 11788 |
+
"step": 4530
|
| 11789 |
+
},
|
| 11790 |
+
{
|
| 11791 |
+
"completion_length": 18.9,
|
| 11792 |
+
"completions/clipped_ratio": 0.0,
|
| 11793 |
+
"completions/max_length": 18.9,
|
| 11794 |
+
"completions/max_terminated_length": 18.9,
|
| 11795 |
+
"completions/mean_length": 16.95,
|
| 11796 |
+
"completions/mean_terminated_length": 16.95,
|
| 11797 |
+
"completions/min_length": 15.7,
|
| 11798 |
+
"completions/min_terminated_length": 15.7,
|
| 11799 |
+
"epoch": 0.31215621562156215,
|
| 11800 |
+
"frac_reward_zero_std": 1.0,
|
| 11801 |
+
"grad_norm": 0.0,
|
| 11802 |
+
"kl": 1.1165247913450003,
|
| 11803 |
+
"learning_rate": 4.345809232454088e-06,
|
| 11804 |
+
"loss": 0.0,
|
| 11805 |
+
"num_tokens": 6487277.0,
|
| 11806 |
+
"reward": 4.099999904632568,
|
| 11807 |
+
"reward_std": 0.0,
|
| 11808 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 11809 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 11810 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 11811 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 11812 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 11813 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 11814 |
+
"step": 4540
|
| 11815 |
+
},
|
| 11816 |
+
{
|
| 11817 |
+
"completion_length": 19.3,
|
| 11818 |
+
"completions/clipped_ratio": 0.0,
|
| 11819 |
+
"completions/max_length": 19.3,
|
| 11820 |
+
"completions/max_terminated_length": 19.3,
|
| 11821 |
+
"completions/mean_length": 17.325,
|
| 11822 |
+
"completions/mean_terminated_length": 17.325,
|
| 11823 |
+
"completions/min_length": 15.5,
|
| 11824 |
+
"completions/min_terminated_length": 15.5,
|
| 11825 |
+
"epoch": 0.31284378437843785,
|
| 11826 |
+
"frac_reward_zero_std": 1.0,
|
| 11827 |
+
"grad_norm": 0.0,
|
| 11828 |
+
"kl": 1.2503239408135414,
|
| 11829 |
+
"learning_rate": 4.341756940730113e-06,
|
| 11830 |
+
"loss": 0.0,
|
| 11831 |
+
"num_tokens": 6501746.0,
|
| 11832 |
+
"reward": 4.099999904632568,
|
| 11833 |
+
"reward_std": 0.0,
|
| 11834 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 11835 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 11836 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 11837 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 11838 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 11839 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 11840 |
+
"step": 4550
|
| 11841 |
+
},
|
| 11842 |
+
{
|
| 11843 |
+
"completion_length": 19.2,
|
| 11844 |
+
"completions/clipped_ratio": 0.0,
|
| 11845 |
+
"completions/max_length": 19.2,
|
| 11846 |
+
"completions/max_terminated_length": 19.2,
|
| 11847 |
+
"completions/mean_length": 17.325,
|
| 11848 |
+
"completions/mean_terminated_length": 17.325,
|
| 11849 |
+
"completions/min_length": 16.1,
|
| 11850 |
+
"completions/min_terminated_length": 16.1,
|
| 11851 |
+
"epoch": 0.31353135313531355,
|
| 11852 |
+
"frac_reward_zero_std": 1.0,
|
| 11853 |
+
"grad_norm": 0.0,
|
| 11854 |
+
"kl": 1.2415374740958214,
|
| 11855 |
+
"learning_rate": 4.33769403891983e-06,
|
| 11856 |
+
"loss": 0.0,
|
| 11857 |
+
"num_tokens": 6515103.0,
|
| 11858 |
+
"reward": 4.099999904632568,
|
| 11859 |
+
"reward_std": 0.0,
|
| 11860 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 11861 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 11862 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 11863 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 11864 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 11865 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 11866 |
+
"step": 4560
|
| 11867 |
+
},
|
| 11868 |
+
{
|
| 11869 |
+
"completion_length": 17.6,
|
| 11870 |
+
"completions/clipped_ratio": 0.0,
|
| 11871 |
+
"completions/max_length": 17.6,
|
| 11872 |
+
"completions/max_terminated_length": 17.6,
|
| 11873 |
+
"completions/mean_length": 16.425,
|
| 11874 |
+
"completions/mean_terminated_length": 16.425,
|
| 11875 |
+
"completions/min_length": 15.2,
|
| 11876 |
+
"completions/min_terminated_length": 15.2,
|
| 11877 |
+
"epoch": 0.3142189218921892,
|
| 11878 |
+
"frac_reward_zero_std": 1.0,
|
| 11879 |
+
"grad_norm": 0.0,
|
| 11880 |
+
"kl": 1.1866839185357094,
|
| 11881 |
+
"learning_rate": 4.33362055042901e-06,
|
| 11882 |
+
"loss": 0.0,
|
| 11883 |
+
"num_tokens": 6527800.0,
|
| 11884 |
+
"reward": 4.099999904632568,
|
| 11885 |
+
"reward_std": 0.0,
|
| 11886 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 11887 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 11888 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 11889 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 11890 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 11891 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 11892 |
+
"step": 4570
|
| 11893 |
+
},
|
| 11894 |
+
{
|
| 11895 |
+
"completion_length": 17.9,
|
| 11896 |
+
"completions/clipped_ratio": 0.0,
|
| 11897 |
+
"completions/max_length": 17.9,
|
| 11898 |
+
"completions/max_terminated_length": 17.9,
|
| 11899 |
+
"completions/mean_length": 17.0,
|
| 11900 |
+
"completions/mean_terminated_length": 17.0,
|
| 11901 |
+
"completions/min_length": 15.8,
|
| 11902 |
+
"completions/min_terminated_length": 15.8,
|
| 11903 |
+
"epoch": 0.3149064906490649,
|
| 11904 |
+
"frac_reward_zero_std": 1.0,
|
| 11905 |
+
"grad_norm": 0.0,
|
| 11906 |
+
"kl": 1.348712769150734,
|
| 11907 |
+
"learning_rate": 4.32953649872441e-06,
|
| 11908 |
+
"loss": 0.0,
|
| 11909 |
+
"num_tokens": 6542468.0,
|
| 11910 |
+
"reward": 4.099999904632568,
|
| 11911 |
+
"reward_std": 0.0,
|
| 11912 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 11913 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 11914 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 11915 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 11916 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 11917 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 11918 |
+
"step": 4580
|
| 11919 |
+
},
|
| 11920 |
+
{
|
| 11921 |
+
"completion_length": 20.1,
|
| 11922 |
+
"completions/clipped_ratio": 0.0,
|
| 11923 |
+
"completions/max_length": 20.1,
|
| 11924 |
+
"completions/max_terminated_length": 20.1,
|
| 11925 |
+
"completions/mean_length": 17.575,
|
| 11926 |
+
"completions/mean_terminated_length": 17.575,
|
| 11927 |
+
"completions/min_length": 15.3,
|
| 11928 |
+
"completions/min_terminated_length": 15.3,
|
| 11929 |
+
"epoch": 0.3155940594059406,
|
| 11930 |
+
"frac_reward_zero_std": 1.0,
|
| 11931 |
+
"grad_norm": 0.0,
|
| 11932 |
+
"kl": 1.2476600848138333,
|
| 11933 |
+
"learning_rate": 4.325441907333642e-06,
|
| 11934 |
+
"loss": 0.0,
|
| 11935 |
+
"num_tokens": 6556839.0,
|
| 11936 |
+
"reward": 4.099999904632568,
|
| 11937 |
+
"reward_std": 0.0,
|
| 11938 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 11939 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 11940 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 11941 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 11942 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 11943 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 11944 |
+
"step": 4590
|
| 11945 |
+
},
|
| 11946 |
+
{
|
| 11947 |
+
"completion_length": 18.9,
|
| 11948 |
+
"completions/clipped_ratio": 0.0,
|
| 11949 |
+
"completions/max_length": 18.9,
|
| 11950 |
+
"completions/max_terminated_length": 18.9,
|
| 11951 |
+
"completions/mean_length": 17.275,
|
| 11952 |
+
"completions/mean_terminated_length": 17.275,
|
| 11953 |
+
"completions/min_length": 15.8,
|
| 11954 |
+
"completions/min_terminated_length": 15.8,
|
| 11955 |
+
"epoch": 0.3162816281628163,
|
| 11956 |
+
"frac_reward_zero_std": 1.0,
|
| 11957 |
+
"grad_norm": 0.0,
|
| 11958 |
+
"kl": 1.2813825011253357,
|
| 11959 |
+
"learning_rate": 4.321336799845034e-06,
|
| 11960 |
+
"loss": 0.0,
|
| 11961 |
+
"num_tokens": 6570610.0,
|
| 11962 |
+
"reward": 4.099999904632568,
|
| 11963 |
+
"reward_std": 0.0,
|
| 11964 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 11965 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 11966 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 11967 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 11968 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 11969 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 11970 |
+
"step": 4600
|
| 11971 |
+
},
|
| 11972 |
+
{
|
| 11973 |
+
"completion_length": 19.5,
|
| 11974 |
+
"completions/clipped_ratio": 0.0,
|
| 11975 |
+
"completions/max_length": 19.5,
|
| 11976 |
+
"completions/max_terminated_length": 19.5,
|
| 11977 |
+
"completions/mean_length": 17.325,
|
| 11978 |
+
"completions/mean_terminated_length": 17.325,
|
| 11979 |
+
"completions/min_length": 15.8,
|
| 11980 |
+
"completions/min_terminated_length": 15.8,
|
| 11981 |
+
"epoch": 0.31696919691969194,
|
| 11982 |
+
"frac_reward_zero_std": 1.0,
|
| 11983 |
+
"grad_norm": 0.0,
|
| 11984 |
+
"kl": 1.3337368354201318,
|
| 11985 |
+
"learning_rate": 4.317221199907496e-06,
|
| 11986 |
+
"loss": 0.0,
|
| 11987 |
+
"num_tokens": 6583899.0,
|
| 11988 |
+
"reward": 4.099999904632568,
|
| 11989 |
+
"reward_std": 0.0,
|
| 11990 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 11991 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 11992 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 11993 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 11994 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 11995 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 11996 |
+
"step": 4610
|
| 11997 |
+
},
|
| 11998 |
+
{
|
| 11999 |
+
"completion_length": 18.6,
|
| 12000 |
+
"completions/clipped_ratio": 0.0,
|
| 12001 |
+
"completions/max_length": 18.6,
|
| 12002 |
+
"completions/max_terminated_length": 18.6,
|
| 12003 |
+
"completions/mean_length": 16.525,
|
| 12004 |
+
"completions/mean_terminated_length": 16.525,
|
| 12005 |
+
"completions/min_length": 15.5,
|
| 12006 |
+
"completions/min_terminated_length": 15.5,
|
| 12007 |
+
"epoch": 0.31765676567656764,
|
| 12008 |
+
"frac_reward_zero_std": 1.0,
|
| 12009 |
+
"grad_norm": 0.0,
|
| 12010 |
+
"kl": 1.158595709502697,
|
| 12011 |
+
"learning_rate": 4.313095131230385e-06,
|
| 12012 |
+
"loss": 0.0,
|
| 12013 |
+
"num_tokens": 6598792.0,
|
| 12014 |
+
"reward": 4.099999904632568,
|
| 12015 |
+
"reward_std": 0.0,
|
| 12016 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 12017 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 12018 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 12019 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 12020 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 12021 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 12022 |
+
"step": 4620
|
| 12023 |
+
},
|
| 12024 |
+
{
|
| 12025 |
+
"completion_length": 18.6,
|
| 12026 |
+
"completions/clipped_ratio": 0.0,
|
| 12027 |
+
"completions/max_length": 18.6,
|
| 12028 |
+
"completions/max_terminated_length": 18.6,
|
| 12029 |
+
"completions/mean_length": 17.2,
|
| 12030 |
+
"completions/mean_terminated_length": 17.2,
|
| 12031 |
+
"completions/min_length": 15.9,
|
| 12032 |
+
"completions/min_terminated_length": 15.9,
|
| 12033 |
+
"epoch": 0.31834433443344334,
|
| 12034 |
+
"frac_reward_zero_std": 1.0,
|
| 12035 |
+
"grad_norm": 0.0,
|
| 12036 |
+
"kl": 1.4270890690386295,
|
| 12037 |
+
"learning_rate": 4.308958617583364e-06,
|
| 12038 |
+
"loss": 0.0001,
|
| 12039 |
+
"num_tokens": 6614748.0,
|
| 12040 |
+
"reward": 4.099999904632568,
|
| 12041 |
+
"reward_std": 0.0,
|
| 12042 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 12043 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 12044 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 12045 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 12046 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 12047 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 12048 |
+
"step": 4630
|
| 12049 |
+
},
|
| 12050 |
+
{
|
| 12051 |
+
"completion_length": 18.1,
|
| 12052 |
+
"completions/clipped_ratio": 0.0,
|
| 12053 |
+
"completions/max_length": 18.1,
|
| 12054 |
+
"completions/max_terminated_length": 18.1,
|
| 12055 |
+
"completions/mean_length": 16.8,
|
| 12056 |
+
"completions/mean_terminated_length": 16.8,
|
| 12057 |
+
"completions/min_length": 16.2,
|
| 12058 |
+
"completions/min_terminated_length": 16.2,
|
| 12059 |
+
"epoch": 0.31903190319031904,
|
| 12060 |
+
"frac_reward_zero_std": 1.0,
|
| 12061 |
+
"grad_norm": 0.0,
|
| 12062 |
+
"kl": 0.9190359987318516,
|
| 12063 |
+
"learning_rate": 4.304811682796271e-06,
|
| 12064 |
+
"loss": 0.0,
|
| 12065 |
+
"num_tokens": 6628996.0,
|
| 12066 |
+
"reward": 4.099999904632568,
|
| 12067 |
+
"reward_std": 0.0,
|
| 12068 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 12069 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 12070 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 12071 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 12072 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 12073 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 12074 |
+
"step": 4640
|
| 12075 |
+
},
|
| 12076 |
+
{
|
| 12077 |
+
"completion_length": 18.6,
|
| 12078 |
+
"completions/clipped_ratio": 0.0,
|
| 12079 |
+
"completions/max_length": 18.6,
|
| 12080 |
+
"completions/max_terminated_length": 18.6,
|
| 12081 |
+
"completions/mean_length": 16.775,
|
| 12082 |
+
"completions/mean_terminated_length": 16.775,
|
| 12083 |
+
"completions/min_length": 14.9,
|
| 12084 |
+
"completions/min_terminated_length": 14.9,
|
| 12085 |
+
"epoch": 0.31971947194719474,
|
| 12086 |
+
"frac_reward_zero_std": 1.0,
|
| 12087 |
+
"grad_norm": 0.0,
|
| 12088 |
+
"kl": 1.1849085062742233,
|
| 12089 |
+
"learning_rate": 4.300654350758977e-06,
|
| 12090 |
+
"loss": 0.0,
|
| 12091 |
+
"num_tokens": 6645647.0,
|
| 12092 |
+
"reward": 4.099999904632568,
|
| 12093 |
+
"reward_std": 0.0,
|
| 12094 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 12095 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 12096 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 12097 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 12098 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 12099 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 12100 |
+
"step": 4650
|
| 12101 |
+
},
|
| 12102 |
+
{
|
| 12103 |
+
"completion_length": 17.8,
|
| 12104 |
+
"completions/clipped_ratio": 0.0,
|
| 12105 |
+
"completions/max_length": 17.8,
|
| 12106 |
+
"completions/max_terminated_length": 17.8,
|
| 12107 |
+
"completions/mean_length": 16.55,
|
| 12108 |
+
"completions/mean_terminated_length": 16.55,
|
| 12109 |
+
"completions/min_length": 15.7,
|
| 12110 |
+
"completions/min_terminated_length": 15.7,
|
| 12111 |
+
"epoch": 0.3204070407040704,
|
| 12112 |
+
"frac_reward_zero_std": 1.0,
|
| 12113 |
+
"grad_norm": 0.0,
|
| 12114 |
+
"kl": 1.494172091037035,
|
| 12115 |
+
"learning_rate": 4.296486645421249e-06,
|
| 12116 |
+
"loss": 0.0,
|
| 12117 |
+
"num_tokens": 6659029.0,
|
| 12118 |
+
"reward": 4.099999904632568,
|
| 12119 |
+
"reward_std": 0.0,
|
| 12120 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 12121 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 12122 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 12123 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 12124 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 12125 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 12126 |
+
"step": 4660
|
| 12127 |
+
},
|
| 12128 |
+
{
|
| 12129 |
+
"completion_length": 20.3,
|
| 12130 |
+
"completions/clipped_ratio": 0.0,
|
| 12131 |
+
"completions/max_length": 20.3,
|
| 12132 |
+
"completions/max_terminated_length": 20.3,
|
| 12133 |
+
"completions/mean_length": 17.575,
|
| 12134 |
+
"completions/mean_terminated_length": 17.575,
|
| 12135 |
+
"completions/min_length": 15.9,
|
| 12136 |
+
"completions/min_terminated_length": 15.9,
|
| 12137 |
+
"epoch": 0.3210946094609461,
|
| 12138 |
+
"frac_reward_zero_std": 1.0,
|
| 12139 |
+
"grad_norm": 0.0,
|
| 12140 |
+
"kl": 0.8454300031065941,
|
| 12141 |
+
"learning_rate": 4.292308590792616e-06,
|
| 12142 |
+
"loss": 0.0,
|
| 12143 |
+
"num_tokens": 6675132.0,
|
| 12144 |
+
"reward": 4.099999904632568,
|
| 12145 |
+
"reward_std": 0.0,
|
| 12146 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 12147 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 12148 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 12149 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 12150 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 12151 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 12152 |
+
"step": 4670
|
| 12153 |
+
},
|
| 12154 |
+
{
|
| 12155 |
+
"completion_length": 19.9,
|
| 12156 |
+
"completions/clipped_ratio": 0.0,
|
| 12157 |
+
"completions/max_length": 19.9,
|
| 12158 |
+
"completions/max_terminated_length": 19.9,
|
| 12159 |
+
"completions/mean_length": 17.95,
|
| 12160 |
+
"completions/mean_terminated_length": 17.95,
|
| 12161 |
+
"completions/min_length": 16.3,
|
| 12162 |
+
"completions/min_terminated_length": 16.3,
|
| 12163 |
+
"epoch": 0.3217821782178218,
|
| 12164 |
+
"frac_reward_zero_std": 1.0,
|
| 12165 |
+
"grad_norm": 0.0,
|
| 12166 |
+
"kl": 1.3840799629688263,
|
| 12167 |
+
"learning_rate": 4.288120210942223e-06,
|
| 12168 |
+
"loss": 0.0001,
|
| 12169 |
+
"num_tokens": 6690810.0,
|
| 12170 |
+
"reward": 4.099999904632568,
|
| 12171 |
+
"reward_std": 0.0,
|
| 12172 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 12173 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 12174 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 12175 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 12176 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 12177 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 12178 |
+
"step": 4680
|
| 12179 |
+
},
|
| 12180 |
+
{
|
| 12181 |
+
"completion_length": 17.4,
|
| 12182 |
+
"completions/clipped_ratio": 0.0,
|
| 12183 |
+
"completions/max_length": 17.4,
|
| 12184 |
+
"completions/max_terminated_length": 17.4,
|
| 12185 |
+
"completions/mean_length": 17.025,
|
| 12186 |
+
"completions/mean_terminated_length": 17.025,
|
| 12187 |
+
"completions/min_length": 16.6,
|
| 12188 |
+
"completions/min_terminated_length": 16.6,
|
| 12189 |
+
"epoch": 0.3224697469746975,
|
| 12190 |
+
"frac_reward_zero_std": 1.0,
|
| 12191 |
+
"grad_norm": 0.0,
|
| 12192 |
+
"kl": 1.2411757558584213,
|
| 12193 |
+
"learning_rate": 4.283921529998702e-06,
|
| 12194 |
+
"loss": 0.0,
|
| 12195 |
+
"num_tokens": 6708547.0,
|
| 12196 |
+
"reward": 4.099999904632568,
|
| 12197 |
+
"reward_std": 0.0,
|
| 12198 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 12199 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 12200 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 12201 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 12202 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 12203 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 12204 |
+
"step": 4690
|
| 12205 |
+
},
|
| 12206 |
+
{
|
| 12207 |
+
"completion_length": 18.8,
|
| 12208 |
+
"completions/clipped_ratio": 0.0,
|
| 12209 |
+
"completions/max_length": 18.8,
|
| 12210 |
+
"completions/max_terminated_length": 18.8,
|
| 12211 |
+
"completions/mean_length": 16.7,
|
| 12212 |
+
"completions/mean_terminated_length": 16.7,
|
| 12213 |
+
"completions/min_length": 15.2,
|
| 12214 |
+
"completions/min_terminated_length": 15.2,
|
| 12215 |
+
"epoch": 0.32315731573157314,
|
| 12216 |
+
"frac_reward_zero_std": 1.0,
|
| 12217 |
+
"grad_norm": 0.00014543857832904905,
|
| 12218 |
+
"kl": 1.0443045005202294,
|
| 12219 |
+
"learning_rate": 4.2797125721500275e-06,
|
| 12220 |
+
"loss": 0.0,
|
| 12221 |
+
"num_tokens": 6722499.0,
|
| 12222 |
+
"reward": 4.099999904632568,
|
| 12223 |
+
"reward_std": 0.0,
|
| 12224 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 12225 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 12226 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 12227 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 12228 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 12229 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 12230 |
+
"step": 4700
|
| 12231 |
+
},
|
| 12232 |
+
{
|
| 12233 |
+
"completion_length": 18.6,
|
| 12234 |
+
"completions/clipped_ratio": 0.0,
|
| 12235 |
+
"completions/max_length": 18.6,
|
| 12236 |
+
"completions/max_terminated_length": 18.6,
|
| 12237 |
+
"completions/mean_length": 17.1,
|
| 12238 |
+
"completions/mean_terminated_length": 17.1,
|
| 12239 |
+
"completions/min_length": 15.9,
|
| 12240 |
+
"completions/min_terminated_length": 15.9,
|
| 12241 |
+
"epoch": 0.32384488448844884,
|
| 12242 |
+
"frac_reward_zero_std": 1.0,
|
| 12243 |
+
"grad_norm": 0.0,
|
| 12244 |
+
"kl": 1.3368266090750693,
|
| 12245 |
+
"learning_rate": 4.275493361643374e-06,
|
| 12246 |
+
"loss": 0.0,
|
| 12247 |
+
"num_tokens": 6737003.0,
|
| 12248 |
+
"reward": 4.099999904632568,
|
| 12249 |
+
"reward_std": 0.0,
|
| 12250 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 12251 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 12252 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 12253 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 12254 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 12255 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 12256 |
+
"step": 4710
|
| 12257 |
+
},
|
| 12258 |
+
{
|
| 12259 |
+
"completion_length": 18.8,
|
| 12260 |
+
"completions/clipped_ratio": 0.0,
|
| 12261 |
+
"completions/max_length": 18.8,
|
| 12262 |
+
"completions/max_terminated_length": 18.8,
|
| 12263 |
+
"completions/mean_length": 17.375,
|
| 12264 |
+
"completions/mean_terminated_length": 17.375,
|
| 12265 |
+
"completions/min_length": 15.9,
|
| 12266 |
+
"completions/min_terminated_length": 15.9,
|
| 12267 |
+
"epoch": 0.32453245324532454,
|
| 12268 |
+
"frac_reward_zero_std": 1.0,
|
| 12269 |
+
"grad_norm": 0.0,
|
| 12270 |
+
"kl": 1.0547638040734455,
|
| 12271 |
+
"learning_rate": 4.271263922784981e-06,
|
| 12272 |
+
"loss": 0.0,
|
| 12273 |
+
"num_tokens": 6750866.0,
|
| 12274 |
+
"reward": 4.099999904632568,
|
| 12275 |
+
"reward_std": 0.0,
|
| 12276 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 12277 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 12278 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 12279 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 12280 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 12281 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 12282 |
+
"step": 4720
|
| 12283 |
+
},
|
| 12284 |
+
{
|
| 12285 |
+
"completion_length": 19.2,
|
| 12286 |
+
"completions/clipped_ratio": 0.0,
|
| 12287 |
+
"completions/max_length": 19.2,
|
| 12288 |
+
"completions/max_terminated_length": 19.2,
|
| 12289 |
+
"completions/mean_length": 17.25,
|
| 12290 |
+
"completions/mean_terminated_length": 17.25,
|
| 12291 |
+
"completions/min_length": 15.6,
|
| 12292 |
+
"completions/min_terminated_length": 15.6,
|
| 12293 |
+
"epoch": 0.32522002200220024,
|
| 12294 |
+
"frac_reward_zero_std": 1.0,
|
| 12295 |
+
"grad_norm": 0.0,
|
| 12296 |
+
"kl": 1.3946014061570167,
|
| 12297 |
+
"learning_rate": 4.267024279940017e-06,
|
| 12298 |
+
"loss": 0.0001,
|
| 12299 |
+
"num_tokens": 6765004.0,
|
| 12300 |
+
"reward": 4.099999904632568,
|
| 12301 |
+
"reward_std": 0.0,
|
| 12302 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 12303 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 12304 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 12305 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 12306 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 12307 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 12308 |
+
"step": 4730
|
| 12309 |
+
},
|
| 12310 |
+
{
|
| 12311 |
+
"completion_length": 20.5,
|
| 12312 |
+
"completions/clipped_ratio": 0.0,
|
| 12313 |
+
"completions/max_length": 20.5,
|
| 12314 |
+
"completions/max_terminated_length": 20.5,
|
| 12315 |
+
"completions/mean_length": 18.075,
|
| 12316 |
+
"completions/mean_terminated_length": 18.075,
|
| 12317 |
+
"completions/min_length": 15.9,
|
| 12318 |
+
"completions/min_terminated_length": 15.9,
|
| 12319 |
+
"epoch": 0.3259075907590759,
|
| 12320 |
+
"frac_reward_zero_std": 1.0,
|
| 12321 |
+
"grad_norm": 0.0,
|
| 12322 |
+
"kl": 1.2133473329246045,
|
| 12323 |
+
"learning_rate": 4.262774457532428e-06,
|
| 12324 |
+
"loss": 0.0,
|
| 12325 |
+
"num_tokens": 6780903.0,
|
| 12326 |
+
"reward": 4.099999904632568,
|
| 12327 |
+
"reward_std": 0.0,
|
| 12328 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 12329 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 12330 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 12331 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 12332 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 12333 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 12334 |
+
"step": 4740
|
| 12335 |
+
},
|
| 12336 |
+
{
|
| 12337 |
+
"completion_length": 19.2,
|
| 12338 |
+
"completions/clipped_ratio": 0.0,
|
| 12339 |
+
"completions/max_length": 19.2,
|
| 12340 |
+
"completions/max_terminated_length": 19.2,
|
| 12341 |
+
"completions/mean_length": 16.9,
|
| 12342 |
+
"completions/mean_terminated_length": 16.9,
|
| 12343 |
+
"completions/min_length": 15.5,
|
| 12344 |
+
"completions/min_terminated_length": 15.5,
|
| 12345 |
+
"epoch": 0.3265951595159516,
|
| 12346 |
+
"frac_reward_zero_std": 1.0,
|
| 12347 |
+
"grad_norm": 0.0,
|
| 12348 |
+
"kl": 0.9682459566742182,
|
| 12349 |
+
"learning_rate": 4.2585144800448055e-06,
|
| 12350 |
+
"loss": 0.0,
|
| 12351 |
+
"num_tokens": 6794935.0,
|
| 12352 |
+
"reward": 4.099999904632568,
|
| 12353 |
+
"reward_std": 0.0,
|
| 12354 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 12355 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 12356 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 12357 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 12358 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 12359 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 12360 |
+
"step": 4750
|
| 12361 |
+
},
|
| 12362 |
+
{
|
| 12363 |
+
"completion_length": 19.4,
|
| 12364 |
+
"completions/clipped_ratio": 0.0,
|
| 12365 |
+
"completions/max_length": 19.4,
|
| 12366 |
+
"completions/max_terminated_length": 19.4,
|
| 12367 |
+
"completions/mean_length": 16.65,
|
| 12368 |
+
"completions/mean_terminated_length": 16.65,
|
| 12369 |
+
"completions/min_length": 15.2,
|
| 12370 |
+
"completions/min_terminated_length": 15.2,
|
| 12371 |
+
"epoch": 0.3272827282728273,
|
| 12372 |
+
"frac_reward_zero_std": 1.0,
|
| 12373 |
+
"grad_norm": 0.0,
|
| 12374 |
+
"kl": 1.4514311589300632,
|
| 12375 |
+
"learning_rate": 4.254244372018244e-06,
|
| 12376 |
+
"loss": 0.0001,
|
| 12377 |
+
"num_tokens": 6808745.0,
|
| 12378 |
+
"reward": 4.099999904632568,
|
| 12379 |
+
"reward_std": 0.0,
|
| 12380 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 12381 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 12382 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 12383 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 12384 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 12385 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 12386 |
+
"step": 4760
|
| 12387 |
+
},
|
| 12388 |
+
{
|
| 12389 |
+
"completion_length": 19.1,
|
| 12390 |
+
"completions/clipped_ratio": 0.0,
|
| 12391 |
+
"completions/max_length": 19.1,
|
| 12392 |
+
"completions/max_terminated_length": 19.1,
|
| 12393 |
+
"completions/mean_length": 16.875,
|
| 12394 |
+
"completions/mean_terminated_length": 16.875,
|
| 12395 |
+
"completions/min_length": 15.1,
|
| 12396 |
+
"completions/min_terminated_length": 15.1,
|
| 12397 |
+
"epoch": 0.327970297029703,
|
| 12398 |
+
"frac_reward_zero_std": 1.0,
|
| 12399 |
+
"grad_norm": 0.0,
|
| 12400 |
+
"kl": 1.065644410997629,
|
| 12401 |
+
"learning_rate": 4.249964158052195e-06,
|
| 12402 |
+
"loss": 0.0,
|
| 12403 |
+
"num_tokens": 6822524.0,
|
| 12404 |
+
"reward": 4.099999904632568,
|
| 12405 |
+
"reward_std": 0.0,
|
| 12406 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 12407 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 12408 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 12409 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 12410 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 12411 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 12412 |
+
"step": 4770
|
| 12413 |
+
},
|
| 12414 |
+
{
|
| 12415 |
+
"completion_length": 19.0,
|
| 12416 |
+
"completions/clipped_ratio": 0.0,
|
| 12417 |
+
"completions/max_length": 19.0,
|
| 12418 |
+
"completions/max_terminated_length": 19.0,
|
| 12419 |
+
"completions/mean_length": 17.625,
|
| 12420 |
+
"completions/mean_terminated_length": 17.625,
|
| 12421 |
+
"completions/min_length": 16.5,
|
| 12422 |
+
"completions/min_terminated_length": 16.5,
|
| 12423 |
+
"epoch": 0.3286578657865787,
|
| 12424 |
+
"frac_reward_zero_std": 1.0,
|
| 12425 |
+
"grad_norm": 0.0,
|
| 12426 |
+
"kl": 1.3550350315868855,
|
| 12427 |
+
"learning_rate": 4.2456738628043324e-06,
|
| 12428 |
+
"loss": 0.0,
|
| 12429 |
+
"num_tokens": 6838473.0,
|
| 12430 |
+
"reward": 4.099999904632568,
|
| 12431 |
+
"reward_std": 0.0,
|
| 12432 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 12433 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 12434 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 12435 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 12436 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 12437 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 12438 |
+
"step": 4780
|
| 12439 |
+
},
|
| 12440 |
+
{
|
| 12441 |
+
"completion_length": 19.9,
|
| 12442 |
+
"completions/clipped_ratio": 0.0,
|
| 12443 |
+
"completions/max_length": 19.9,
|
| 12444 |
+
"completions/max_terminated_length": 19.9,
|
| 12445 |
+
"completions/mean_length": 17.7,
|
| 12446 |
+
"completions/mean_terminated_length": 17.7,
|
| 12447 |
+
"completions/min_length": 15.4,
|
| 12448 |
+
"completions/min_terminated_length": 15.4,
|
| 12449 |
+
"epoch": 0.32934543454345433,
|
| 12450 |
+
"frac_reward_zero_std": 1.0,
|
| 12451 |
+
"grad_norm": 0.0,
|
| 12452 |
+
"kl": 1.3020609110593795,
|
| 12453 |
+
"learning_rate": 4.241373510990406e-06,
|
| 12454 |
+
"loss": 0.0,
|
| 12455 |
+
"num_tokens": 6852517.0,
|
| 12456 |
+
"reward": 4.099999904632568,
|
| 12457 |
+
"reward_std": 0.0,
|
| 12458 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 12459 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 12460 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 12461 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 12462 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 12463 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 12464 |
+
"step": 4790
|
| 12465 |
+
},
|
| 12466 |
+
{
|
| 12467 |
+
"completion_length": 18.8,
|
| 12468 |
+
"completions/clipped_ratio": 0.0,
|
| 12469 |
+
"completions/max_length": 18.8,
|
| 12470 |
+
"completions/max_terminated_length": 18.8,
|
| 12471 |
+
"completions/mean_length": 17.325,
|
| 12472 |
+
"completions/mean_terminated_length": 17.325,
|
| 12473 |
+
"completions/min_length": 16.4,
|
| 12474 |
+
"completions/min_terminated_length": 16.4,
|
| 12475 |
+
"epoch": 0.33003300330033003,
|
| 12476 |
+
"frac_reward_zero_std": 1.0,
|
| 12477 |
+
"grad_norm": 0.0,
|
| 12478 |
+
"kl": 1.2051956176757812,
|
| 12479 |
+
"learning_rate": 4.237063127384099e-06,
|
| 12480 |
+
"loss": 0.0,
|
| 12481 |
+
"num_tokens": 6866874.0,
|
| 12482 |
+
"reward": 4.099999904632568,
|
| 12483 |
+
"reward_std": 0.0,
|
| 12484 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 12485 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 12486 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 12487 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 12488 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 12489 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 12490 |
+
"step": 4800
|
| 12491 |
+
},
|
| 12492 |
+
{
|
| 12493 |
+
"completion_length": 19.6,
|
| 12494 |
+
"completions/clipped_ratio": 0.0,
|
| 12495 |
+
"completions/max_length": 19.6,
|
| 12496 |
+
"completions/max_terminated_length": 19.6,
|
| 12497 |
+
"completions/mean_length": 18.325,
|
| 12498 |
+
"completions/mean_terminated_length": 18.325,
|
| 12499 |
+
"completions/min_length": 17.2,
|
| 12500 |
+
"completions/min_terminated_length": 17.2,
|
| 12501 |
+
"epoch": 0.33072057205720573,
|
| 12502 |
+
"frac_reward_zero_std": 1.0,
|
| 12503 |
+
"grad_norm": 0.0,
|
| 12504 |
+
"kl": 1.2347914427518845,
|
| 12505 |
+
"learning_rate": 4.232742736816887e-06,
|
| 12506 |
+
"loss": 0.0,
|
| 12507 |
+
"num_tokens": 6883619.0,
|
| 12508 |
+
"reward": 4.099999904632568,
|
| 12509 |
+
"reward_std": 0.0,
|
| 12510 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 12511 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 12512 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 12513 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 12514 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 12515 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 12516 |
+
"step": 4810
|
| 12517 |
+
},
|
| 12518 |
+
{
|
| 12519 |
+
"completion_length": 20.5,
|
| 12520 |
+
"completions/clipped_ratio": 0.0,
|
| 12521 |
+
"completions/max_length": 20.5,
|
| 12522 |
+
"completions/max_terminated_length": 20.5,
|
| 12523 |
+
"completions/mean_length": 19.25,
|
| 12524 |
+
"completions/mean_terminated_length": 19.25,
|
| 12525 |
+
"completions/min_length": 17.8,
|
| 12526 |
+
"completions/min_terminated_length": 17.8,
|
| 12527 |
+
"epoch": 0.33140814081408143,
|
| 12528 |
+
"frac_reward_zero_std": 1.0,
|
| 12529 |
+
"grad_norm": 0.0,
|
| 12530 |
+
"kl": 0.8823791073635221,
|
| 12531 |
+
"learning_rate": 4.228412364177893e-06,
|
| 12532 |
+
"loss": 0.0,
|
| 12533 |
+
"num_tokens": 6897733.0,
|
| 12534 |
+
"reward": 4.099999904632568,
|
| 12535 |
+
"reward_std": 0.0,
|
| 12536 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 12537 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 12538 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 12539 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 12540 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 12541 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 12542 |
+
"step": 4820
|
| 12543 |
+
},
|
| 12544 |
+
{
|
| 12545 |
+
"completion_length": 19.2,
|
| 12546 |
+
"completions/clipped_ratio": 0.0,
|
| 12547 |
+
"completions/max_length": 19.2,
|
| 12548 |
+
"completions/max_terminated_length": 19.2,
|
| 12549 |
+
"completions/mean_length": 17.65,
|
| 12550 |
+
"completions/mean_terminated_length": 17.65,
|
| 12551 |
+
"completions/min_length": 15.7,
|
| 12552 |
+
"completions/min_terminated_length": 15.7,
|
| 12553 |
+
"epoch": 0.3320957095709571,
|
| 12554 |
+
"frac_reward_zero_std": 1.0,
|
| 12555 |
+
"grad_norm": 0.0,
|
| 12556 |
+
"kl": 1.1724223725497722,
|
| 12557 |
+
"learning_rate": 4.2240720344137476e-06,
|
| 12558 |
+
"loss": 0.0,
|
| 12559 |
+
"num_tokens": 6911055.0,
|
| 12560 |
+
"reward": 4.099999904632568,
|
| 12561 |
+
"reward_std": 0.0,
|
| 12562 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 12563 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 12564 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 12565 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 12566 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 12567 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 12568 |
+
"step": 4830
|
| 12569 |
+
},
|
| 12570 |
+
{
|
| 12571 |
+
"completion_length": 19.3,
|
| 12572 |
+
"completions/clipped_ratio": 0.0,
|
| 12573 |
+
"completions/max_length": 19.3,
|
| 12574 |
+
"completions/max_terminated_length": 19.3,
|
| 12575 |
+
"completions/mean_length": 17.575,
|
| 12576 |
+
"completions/mean_terminated_length": 17.575,
|
| 12577 |
+
"completions/min_length": 16.1,
|
| 12578 |
+
"completions/min_terminated_length": 16.1,
|
| 12579 |
+
"epoch": 0.3327832783278328,
|
| 12580 |
+
"frac_reward_zero_std": 1.0,
|
| 12581 |
+
"grad_norm": 0.0,
|
| 12582 |
+
"kl": 0.8990753037855029,
|
| 12583 |
+
"learning_rate": 4.21972177252844e-06,
|
| 12584 |
+
"loss": 0.0,
|
| 12585 |
+
"num_tokens": 6923818.0,
|
| 12586 |
+
"reward": 4.099999904632568,
|
| 12587 |
+
"reward_std": 0.0,
|
| 12588 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 12589 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 12590 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 12591 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 12592 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 12593 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 12594 |
+
"step": 4840
|
| 12595 |
+
},
|
| 12596 |
+
{
|
| 12597 |
+
"completion_length": 21.4,
|
| 12598 |
+
"completions/clipped_ratio": 0.0,
|
| 12599 |
+
"completions/max_length": 21.4,
|
| 12600 |
+
"completions/max_terminated_length": 21.4,
|
| 12601 |
+
"completions/mean_length": 19.175,
|
| 12602 |
+
"completions/mean_terminated_length": 19.175,
|
| 12603 |
+
"completions/min_length": 17.4,
|
| 12604 |
+
"completions/min_terminated_length": 17.4,
|
| 12605 |
+
"epoch": 0.3334708470847085,
|
| 12606 |
+
"frac_reward_zero_std": 1.0,
|
| 12607 |
+
"grad_norm": 0.0,
|
| 12608 |
+
"kl": 0.9704873599112034,
|
| 12609 |
+
"learning_rate": 4.2153616035831806e-06,
|
| 12610 |
+
"loss": 0.0,
|
| 12611 |
+
"num_tokens": 6940993.0,
|
| 12612 |
+
"reward": 4.099999904632568,
|
| 12613 |
+
"reward_std": 0.0,
|
| 12614 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 12615 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 12616 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 12617 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 12618 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 12619 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 12620 |
+
"step": 4850
|
| 12621 |
+
},
|
| 12622 |
+
{
|
| 12623 |
+
"completion_length": 18.6,
|
| 12624 |
+
"completions/clipped_ratio": 0.0,
|
| 12625 |
+
"completions/max_length": 18.6,
|
| 12626 |
+
"completions/max_terminated_length": 18.6,
|
| 12627 |
+
"completions/mean_length": 17.375,
|
| 12628 |
+
"completions/mean_terminated_length": 17.375,
|
| 12629 |
+
"completions/min_length": 15.6,
|
| 12630 |
+
"completions/min_terminated_length": 15.6,
|
| 12631 |
+
"epoch": 0.3341584158415842,
|
| 12632 |
+
"frac_reward_zero_std": 1.0,
|
| 12633 |
+
"grad_norm": 0.0,
|
| 12634 |
+
"kl": 1.0528290897607804,
|
| 12635 |
+
"learning_rate": 4.210991552696247e-06,
|
| 12636 |
+
"loss": 0.0,
|
| 12637 |
+
"num_tokens": 6957752.0,
|
| 12638 |
+
"reward": 4.099999904632568,
|
| 12639 |
+
"reward_std": 0.0,
|
| 12640 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 12641 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 12642 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 12643 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 12644 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 12645 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 12646 |
+
"step": 4860
|
| 12647 |
+
},
|
| 12648 |
+
{
|
| 12649 |
+
"completion_length": 18.7,
|
| 12650 |
+
"completions/clipped_ratio": 0.0,
|
| 12651 |
+
"completions/max_length": 18.7,
|
| 12652 |
+
"completions/max_terminated_length": 18.7,
|
| 12653 |
+
"completions/mean_length": 16.925,
|
| 12654 |
+
"completions/mean_terminated_length": 16.925,
|
| 12655 |
+
"completions/min_length": 15.7,
|
| 12656 |
+
"completions/min_terminated_length": 15.7,
|
| 12657 |
+
"epoch": 0.3348459845984598,
|
| 12658 |
+
"frac_reward_zero_std": 1.0,
|
| 12659 |
+
"grad_norm": 0.0,
|
| 12660 |
+
"kl": 0.9078183400444686,
|
| 12661 |
+
"learning_rate": 4.2066116450428525e-06,
|
| 12662 |
+
"loss": 0.0,
|
| 12663 |
+
"num_tokens": 6973089.0,
|
| 12664 |
+
"reward": 4.099999904632568,
|
| 12665 |
+
"reward_std": 0.0,
|
| 12666 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 12667 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 12668 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 12669 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 12670 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 12671 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 12672 |
+
"step": 4870
|
| 12673 |
+
},
|
| 12674 |
+
{
|
| 12675 |
+
"completion_length": 18.1,
|
| 12676 |
+
"completions/clipped_ratio": 0.0,
|
| 12677 |
+
"completions/max_length": 18.1,
|
| 12678 |
+
"completions/max_terminated_length": 18.1,
|
| 12679 |
+
"completions/mean_length": 16.75,
|
| 12680 |
+
"completions/mean_terminated_length": 16.75,
|
| 12681 |
+
"completions/min_length": 15.5,
|
| 12682 |
+
"completions/min_terminated_length": 15.5,
|
| 12683 |
+
"epoch": 0.3355335533553355,
|
| 12684 |
+
"frac_reward_zero_std": 1.0,
|
| 12685 |
+
"grad_norm": 2.7470434361021034e-05,
|
| 12686 |
+
"kl": 1.577232411503792,
|
| 12687 |
+
"learning_rate": 4.202221905854989e-06,
|
| 12688 |
+
"loss": 0.0001,
|
| 12689 |
+
"num_tokens": 6989239.0,
|
| 12690 |
+
"reward": 4.099999904632568,
|
| 12691 |
+
"reward_std": 0.0,
|
| 12692 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 12693 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 12694 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 12695 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 12696 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 12697 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 12698 |
+
"step": 4880
|
| 12699 |
+
},
|
| 12700 |
+
{
|
| 12701 |
+
"completion_length": 16.0,
|
| 12702 |
+
"completions/clipped_ratio": 0.0,
|
| 12703 |
+
"completions/max_length": 16.0,
|
| 12704 |
+
"completions/max_terminated_length": 16.0,
|
| 12705 |
+
"completions/mean_length": 15.625,
|
| 12706 |
+
"completions/mean_terminated_length": 15.625,
|
| 12707 |
+
"completions/min_length": 15.3,
|
| 12708 |
+
"completions/min_terminated_length": 15.3,
|
| 12709 |
+
"epoch": 0.3362211221122112,
|
| 12710 |
+
"frac_reward_zero_std": 1.0,
|
| 12711 |
+
"grad_norm": 0.0,
|
| 12712 |
+
"kl": 1.1079448973294348,
|
| 12713 |
+
"learning_rate": 4.197822360421286e-06,
|
| 12714 |
+
"loss": 0.0,
|
| 12715 |
+
"num_tokens": 7003600.0,
|
| 12716 |
+
"reward": 4.099999904632568,
|
| 12717 |
+
"reward_std": 0.0,
|
| 12718 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 12719 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 12720 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 12721 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 12722 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 12723 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 12724 |
+
"step": 4890
|
| 12725 |
+
},
|
| 12726 |
+
{
|
| 12727 |
+
"completion_length": 18.9,
|
| 12728 |
+
"completions/clipped_ratio": 0.0,
|
| 12729 |
+
"completions/max_length": 18.9,
|
| 12730 |
+
"completions/max_terminated_length": 18.9,
|
| 12731 |
+
"completions/mean_length": 16.425,
|
| 12732 |
+
"completions/mean_terminated_length": 16.425,
|
| 12733 |
+
"completions/min_length": 14.9,
|
| 12734 |
+
"completions/min_terminated_length": 14.9,
|
| 12735 |
+
"epoch": 0.3369086908690869,
|
| 12736 |
+
"frac_reward_zero_std": 1.0,
|
| 12737 |
+
"grad_norm": 0.0,
|
| 12738 |
+
"kl": 1.4668400838971138,
|
| 12739 |
+
"learning_rate": 4.193413034086868e-06,
|
| 12740 |
+
"loss": 0.0001,
|
| 12741 |
+
"num_tokens": 7018585.0,
|
| 12742 |
+
"reward": 4.099999904632568,
|
| 12743 |
+
"reward_std": 0.0,
|
| 12744 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 12745 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 12746 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 12747 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 12748 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 12749 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 12750 |
+
"step": 4900
|
| 12751 |
+
},
|
| 12752 |
+
{
|
| 12753 |
+
"completion_length": 18.5,
|
| 12754 |
+
"completions/clipped_ratio": 0.0,
|
| 12755 |
+
"completions/max_length": 18.5,
|
| 12756 |
+
"completions/max_terminated_length": 18.5,
|
| 12757 |
+
"completions/mean_length": 17.475,
|
| 12758 |
+
"completions/mean_terminated_length": 17.475,
|
| 12759 |
+
"completions/min_length": 16.7,
|
| 12760 |
+
"completions/min_terminated_length": 16.7,
|
| 12761 |
+
"epoch": 0.33759625962596257,
|
| 12762 |
+
"frac_reward_zero_std": 1.0,
|
| 12763 |
+
"grad_norm": 0.0,
|
| 12764 |
+
"kl": 1.306707089813426,
|
| 12765 |
+
"learning_rate": 4.188993952253205e-06,
|
| 12766 |
+
"loss": 0.0,
|
| 12767 |
+
"num_tokens": 7033004.0,
|
| 12768 |
+
"reward": 4.099999904632568,
|
| 12769 |
+
"reward_std": 0.0,
|
| 12770 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 12771 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 12772 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 12773 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 12774 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 12775 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 12776 |
+
"step": 4910
|
| 12777 |
+
},
|
| 12778 |
+
{
|
| 12779 |
+
"completion_length": 20.0,
|
| 12780 |
+
"completions/clipped_ratio": 0.0,
|
| 12781 |
+
"completions/max_length": 20.0,
|
| 12782 |
+
"completions/max_terminated_length": 20.0,
|
| 12783 |
+
"completions/mean_length": 17.975,
|
| 12784 |
+
"completions/mean_terminated_length": 17.975,
|
| 12785 |
+
"completions/min_length": 16.5,
|
| 12786 |
+
"completions/min_terminated_length": 16.5,
|
| 12787 |
+
"epoch": 0.33828382838283827,
|
| 12788 |
+
"frac_reward_zero_std": 1.0,
|
| 12789 |
+
"grad_norm": 0.0,
|
| 12790 |
+
"kl": 0.9754183698445559,
|
| 12791 |
+
"learning_rate": 4.1845651403779655e-06,
|
| 12792 |
+
"loss": 0.0,
|
| 12793 |
+
"num_tokens": 7046931.0,
|
| 12794 |
+
"reward": 4.099999904632568,
|
| 12795 |
+
"reward_std": 0.0,
|
| 12796 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 12797 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 12798 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 12799 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 12800 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 12801 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 12802 |
+
"step": 4920
|
| 12803 |
+
},
|
| 12804 |
+
{
|
| 12805 |
+
"completion_length": 19.9,
|
| 12806 |
+
"completions/clipped_ratio": 0.0,
|
| 12807 |
+
"completions/max_length": 19.9,
|
| 12808 |
+
"completions/max_terminated_length": 19.9,
|
| 12809 |
+
"completions/mean_length": 17.925,
|
| 12810 |
+
"completions/mean_terminated_length": 17.925,
|
| 12811 |
+
"completions/min_length": 15.6,
|
| 12812 |
+
"completions/min_terminated_length": 15.6,
|
| 12813 |
+
"epoch": 0.33897139713971397,
|
| 12814 |
+
"frac_reward_zero_std": 1.0,
|
| 12815 |
+
"grad_norm": 0.0,
|
| 12816 |
+
"kl": 1.38096314817667,
|
| 12817 |
+
"learning_rate": 4.180126623974874e-06,
|
| 12818 |
+
"loss": 0.0001,
|
| 12819 |
+
"num_tokens": 7061420.0,
|
| 12820 |
+
"reward": 4.099999904632568,
|
| 12821 |
+
"reward_std": 0.0,
|
| 12822 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 12823 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 12824 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 12825 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 12826 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 12827 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 12828 |
+
"step": 4930
|
| 12829 |
+
},
|
| 12830 |
+
{
|
| 12831 |
+
"completion_length": 18.3,
|
| 12832 |
+
"completions/clipped_ratio": 0.0,
|
| 12833 |
+
"completions/max_length": 18.3,
|
| 12834 |
+
"completions/max_terminated_length": 18.3,
|
| 12835 |
+
"completions/mean_length": 17.25,
|
| 12836 |
+
"completions/mean_terminated_length": 17.25,
|
| 12837 |
+
"completions/min_length": 16.2,
|
| 12838 |
+
"completions/min_terminated_length": 16.2,
|
| 12839 |
+
"epoch": 0.33965896589658967,
|
| 12840 |
+
"frac_reward_zero_std": 1.0,
|
| 12841 |
+
"grad_norm": 0.0,
|
| 12842 |
+
"kl": 0.9460880151760648,
|
| 12843 |
+
"learning_rate": 4.175678428613557e-06,
|
| 12844 |
+
"loss": 0.0,
|
| 12845 |
+
"num_tokens": 7076598.0,
|
| 12846 |
+
"reward": 4.099999904632568,
|
| 12847 |
+
"reward_std": 0.0,
|
| 12848 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 12849 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 12850 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 12851 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 12852 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 12853 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 12854 |
+
"step": 4940
|
| 12855 |
+
},
|
| 12856 |
+
{
|
| 12857 |
+
"completion_length": 20.3,
|
| 12858 |
+
"completions/clipped_ratio": 0.0,
|
| 12859 |
+
"completions/max_length": 20.3,
|
| 12860 |
+
"completions/max_terminated_length": 20.3,
|
| 12861 |
+
"completions/mean_length": 17.275,
|
| 12862 |
+
"completions/mean_terminated_length": 17.275,
|
| 12863 |
+
"completions/min_length": 15.2,
|
| 12864 |
+
"completions/min_terminated_length": 15.2,
|
| 12865 |
+
"epoch": 0.34034653465346537,
|
| 12866 |
+
"frac_reward_zero_std": 1.0,
|
| 12867 |
+
"grad_norm": 0.0,
|
| 12868 |
+
"kl": 1.1463233292102815,
|
| 12869 |
+
"learning_rate": 4.171220579919406e-06,
|
| 12870 |
+
"loss": 0.0,
|
| 12871 |
+
"num_tokens": 7091077.0,
|
| 12872 |
+
"reward": 4.099999904632568,
|
| 12873 |
+
"reward_std": 0.0,
|
| 12874 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 12875 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 12876 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 12877 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 12878 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 12879 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 12880 |
+
"step": 4950
|
| 12881 |
+
},
|
| 12882 |
+
{
|
| 12883 |
+
"completion_length": 16.1,
|
| 12884 |
+
"completions/clipped_ratio": 0.0,
|
| 12885 |
+
"completions/max_length": 16.1,
|
| 12886 |
+
"completions/max_terminated_length": 16.1,
|
| 12887 |
+
"completions/mean_length": 15.5,
|
| 12888 |
+
"completions/mean_terminated_length": 15.5,
|
| 12889 |
+
"completions/min_length": 14.8,
|
| 12890 |
+
"completions/min_terminated_length": 14.8,
|
| 12891 |
+
"epoch": 0.341034103410341,
|
| 12892 |
+
"frac_reward_zero_std": 1.0,
|
| 12893 |
+
"grad_norm": 0.0,
|
| 12894 |
+
"kl": 0.9677249977365137,
|
| 12895 |
+
"learning_rate": 4.16675310357342e-06,
|
| 12896 |
+
"loss": 0.0,
|
| 12897 |
+
"num_tokens": 7101725.0,
|
| 12898 |
+
"reward": 4.099999904632568,
|
| 12899 |
+
"reward_std": 0.0,
|
| 12900 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 12901 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 12902 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 12903 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 12904 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 12905 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 12906 |
+
"step": 4960
|
| 12907 |
+
},
|
| 12908 |
+
{
|
| 12909 |
+
"completion_length": 18.8,
|
| 12910 |
+
"completions/clipped_ratio": 0.0,
|
| 12911 |
+
"completions/max_length": 18.8,
|
| 12912 |
+
"completions/max_terminated_length": 18.8,
|
| 12913 |
+
"completions/mean_length": 17.5,
|
| 12914 |
+
"completions/mean_terminated_length": 17.5,
|
| 12915 |
+
"completions/min_length": 15.3,
|
| 12916 |
+
"completions/min_terminated_length": 15.3,
|
| 12917 |
+
"epoch": 0.3417216721672167,
|
| 12918 |
+
"frac_reward_zero_std": 1.0,
|
| 12919 |
+
"grad_norm": 0.0,
|
| 12920 |
+
"kl": 1.3026058718562126,
|
| 12921 |
+
"learning_rate": 4.162276025312059e-06,
|
| 12922 |
+
"loss": 0.0,
|
| 12923 |
+
"num_tokens": 7117885.0,
|
| 12924 |
+
"reward": 4.099999904632568,
|
| 12925 |
+
"reward_std": 0.0,
|
| 12926 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 12927 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 12928 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 12929 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 12930 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 12931 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 12932 |
+
"step": 4970
|
| 12933 |
+
},
|
| 12934 |
+
{
|
| 12935 |
+
"completion_length": 21.8,
|
| 12936 |
+
"completions/clipped_ratio": 0.0,
|
| 12937 |
+
"completions/max_length": 21.8,
|
| 12938 |
+
"completions/max_terminated_length": 21.8,
|
| 12939 |
+
"completions/mean_length": 19.9,
|
| 12940 |
+
"completions/mean_terminated_length": 19.9,
|
| 12941 |
+
"completions/min_length": 17.7,
|
| 12942 |
+
"completions/min_terminated_length": 17.7,
|
| 12943 |
+
"epoch": 0.3424092409240924,
|
| 12944 |
+
"frac_reward_zero_std": 1.0,
|
| 12945 |
+
"grad_norm": 0.0,
|
| 12946 |
+
"kl": 1.2132138408720494,
|
| 12947 |
+
"learning_rate": 4.157789370927104e-06,
|
| 12948 |
+
"loss": 0.0,
|
| 12949 |
+
"num_tokens": 7132993.0,
|
| 12950 |
+
"reward": 4.099999904632568,
|
| 12951 |
+
"reward_std": 0.0,
|
| 12952 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 12953 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 12954 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 12955 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 12956 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 12957 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 12958 |
+
"step": 4980
|
| 12959 |
+
},
|
| 12960 |
+
{
|
| 12961 |
+
"completion_length": 17.8,
|
| 12962 |
+
"completions/clipped_ratio": 0.0,
|
| 12963 |
+
"completions/max_length": 17.8,
|
| 12964 |
+
"completions/max_terminated_length": 17.8,
|
| 12965 |
+
"completions/mean_length": 16.625,
|
| 12966 |
+
"completions/mean_terminated_length": 16.625,
|
| 12967 |
+
"completions/min_length": 15.8,
|
| 12968 |
+
"completions/min_terminated_length": 15.8,
|
| 12969 |
+
"epoch": 0.3430968096809681,
|
| 12970 |
+
"frac_reward_zero_std": 1.0,
|
| 12971 |
+
"grad_norm": 0.0,
|
| 12972 |
+
"kl": 1.3136692702770234,
|
| 12973 |
+
"learning_rate": 4.153293166265502e-06,
|
| 12974 |
+
"loss": 0.0,
|
| 12975 |
+
"num_tokens": 7148478.0,
|
| 12976 |
+
"reward": 4.099999904632568,
|
| 12977 |
+
"reward_std": 0.0,
|
| 12978 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 12979 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 12980 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 12981 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 12982 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 12983 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 12984 |
+
"step": 4990
|
| 12985 |
+
},
|
| 12986 |
+
{
|
| 12987 |
+
"completion_length": 18.0,
|
| 12988 |
+
"completions/clipped_ratio": 0.0,
|
| 12989 |
+
"completions/max_length": 18.0,
|
| 12990 |
+
"completions/max_terminated_length": 18.0,
|
| 12991 |
+
"completions/mean_length": 16.7,
|
| 12992 |
+
"completions/mean_terminated_length": 16.7,
|
| 12993 |
+
"completions/min_length": 15.6,
|
| 12994 |
+
"completions/min_terminated_length": 15.6,
|
| 12995 |
+
"epoch": 0.34378437843784376,
|
| 12996 |
+
"frac_reward_zero_std": 1.0,
|
| 12997 |
+
"grad_norm": 0.0,
|
| 12998 |
+
"kl": 1.5792409382760524,
|
| 12999 |
+
"learning_rate": 4.1487874372292106e-06,
|
| 13000 |
+
"loss": 0.0001,
|
| 13001 |
+
"num_tokens": 7163946.0,
|
| 13002 |
+
"reward": 4.099999904632568,
|
| 13003 |
+
"reward_std": 0.0,
|
| 13004 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 13005 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 13006 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 13007 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 13008 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 13009 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 13010 |
+
"step": 5000
|
| 13011 |
+
},
|
| 13012 |
+
{
|
| 13013 |
+
"completion_length": 21.3,
|
| 13014 |
+
"completions/clipped_ratio": 0.0,
|
| 13015 |
+
"completions/max_length": 21.3,
|
| 13016 |
+
"completions/max_terminated_length": 21.3,
|
| 13017 |
+
"completions/mean_length": 19.075,
|
| 13018 |
+
"completions/mean_terminated_length": 19.075,
|
| 13019 |
+
"completions/min_length": 17.1,
|
| 13020 |
+
"completions/min_terminated_length": 17.1,
|
| 13021 |
+
"epoch": 0.34447194719471946,
|
| 13022 |
+
"frac_reward_zero_std": 1.0,
|
| 13023 |
+
"grad_norm": 0.0,
|
| 13024 |
+
"kl": 1.0890948809683323,
|
| 13025 |
+
"learning_rate": 4.1442722097750645e-06,
|
| 13026 |
+
"loss": 0.0,
|
| 13027 |
+
"num_tokens": 7177981.0,
|
| 13028 |
+
"reward": 4.099999904632568,
|
| 13029 |
+
"reward_std": 0.0,
|
| 13030 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 13031 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 13032 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 13033 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 13034 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 13035 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 13036 |
+
"step": 5010
|
| 13037 |
+
},
|
| 13038 |
+
{
|
| 13039 |
+
"completion_length": 18.0,
|
| 13040 |
+
"completions/clipped_ratio": 0.0,
|
| 13041 |
+
"completions/max_length": 18.0,
|
| 13042 |
+
"completions/max_terminated_length": 18.0,
|
| 13043 |
+
"completions/mean_length": 16.075,
|
| 13044 |
+
"completions/mean_terminated_length": 16.075,
|
| 13045 |
+
"completions/min_length": 14.1,
|
| 13046 |
+
"completions/min_terminated_length": 14.1,
|
| 13047 |
+
"epoch": 0.34515951595159516,
|
| 13048 |
+
"frac_reward_zero_std": 1.0,
|
| 13049 |
+
"grad_norm": 0.0,
|
| 13050 |
+
"kl": 1.2117942936718464,
|
| 13051 |
+
"learning_rate": 4.139747509914613e-06,
|
| 13052 |
+
"loss": 0.0,
|
| 13053 |
+
"num_tokens": 7191344.0,
|
| 13054 |
+
"reward": 4.099999904632568,
|
| 13055 |
+
"reward_std": 0.0,
|
| 13056 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 13057 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 13058 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 13059 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 13060 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 13061 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 13062 |
+
"step": 5020
|
| 13063 |
+
},
|
| 13064 |
+
{
|
| 13065 |
+
"completion_length": 18.5,
|
| 13066 |
+
"completions/clipped_ratio": 0.0,
|
| 13067 |
+
"completions/max_length": 18.5,
|
| 13068 |
+
"completions/max_terminated_length": 18.5,
|
| 13069 |
+
"completions/mean_length": 16.925,
|
| 13070 |
+
"completions/mean_terminated_length": 16.925,
|
| 13071 |
+
"completions/min_length": 15.8,
|
| 13072 |
+
"completions/min_terminated_length": 15.8,
|
| 13073 |
+
"epoch": 0.34584708470847086,
|
| 13074 |
+
"frac_reward_zero_std": 1.0,
|
| 13075 |
+
"grad_norm": 0.0,
|
| 13076 |
+
"kl": 1.3905832149088382,
|
| 13077 |
+
"learning_rate": 4.135213363713976e-06,
|
| 13078 |
+
"loss": 0.0,
|
| 13079 |
+
"num_tokens": 7207557.0,
|
| 13080 |
+
"reward": 4.099999904632568,
|
| 13081 |
+
"reward_std": 0.0,
|
| 13082 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 13083 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 13084 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 13085 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 13086 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 13087 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 13088 |
+
"step": 5030
|
| 13089 |
+
},
|
| 13090 |
+
{
|
| 13091 |
+
"completion_length": 20.5,
|
| 13092 |
+
"completions/clipped_ratio": 0.0,
|
| 13093 |
+
"completions/max_length": 20.5,
|
| 13094 |
+
"completions/max_terminated_length": 20.5,
|
| 13095 |
+
"completions/mean_length": 17.575,
|
| 13096 |
+
"completions/mean_terminated_length": 17.575,
|
| 13097 |
+
"completions/min_length": 15.5,
|
| 13098 |
+
"completions/min_terminated_length": 15.5,
|
| 13099 |
+
"epoch": 0.3465346534653465,
|
| 13100 |
+
"frac_reward_zero_std": 1.0,
|
| 13101 |
+
"grad_norm": 0.0,
|
| 13102 |
+
"kl": 1.4240806803107262,
|
| 13103 |
+
"learning_rate": 4.13066979729369e-06,
|
| 13104 |
+
"loss": 0.0001,
|
| 13105 |
+
"num_tokens": 7221180.0,
|
| 13106 |
+
"reward": 4.099999904632568,
|
| 13107 |
+
"reward_std": 0.0,
|
| 13108 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 13109 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 13110 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 13111 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 13112 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 13113 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 13114 |
+
"step": 5040
|
| 13115 |
+
},
|
| 13116 |
+
{
|
| 13117 |
+
"completion_length": 18.8,
|
| 13118 |
+
"completions/clipped_ratio": 0.0,
|
| 13119 |
+
"completions/max_length": 18.8,
|
| 13120 |
+
"completions/max_terminated_length": 18.8,
|
| 13121 |
+
"completions/mean_length": 17.125,
|
| 13122 |
+
"completions/mean_terminated_length": 17.125,
|
| 13123 |
+
"completions/min_length": 16.2,
|
| 13124 |
+
"completions/min_terminated_length": 16.2,
|
| 13125 |
+
"epoch": 0.3472222222222222,
|
| 13126 |
+
"frac_reward_zero_std": 1.0,
|
| 13127 |
+
"grad_norm": 0.0,
|
| 13128 |
+
"kl": 1.0334409718617508,
|
| 13129 |
+
"learning_rate": 4.126116836828563e-06,
|
| 13130 |
+
"loss": 0.0,
|
| 13131 |
+
"num_tokens": 7235709.0,
|
| 13132 |
+
"reward": 4.099999904632568,
|
| 13133 |
+
"reward_std": 0.0,
|
| 13134 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 13135 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 13136 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 13137 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 13138 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 13139 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 13140 |
+
"step": 5050
|
| 13141 |
+
},
|
| 13142 |
+
{
|
| 13143 |
+
"completion_length": 18.8,
|
| 13144 |
+
"completions/clipped_ratio": 0.0,
|
| 13145 |
+
"completions/max_length": 18.8,
|
| 13146 |
+
"completions/max_terminated_length": 18.8,
|
| 13147 |
+
"completions/mean_length": 17.5,
|
| 13148 |
+
"completions/mean_terminated_length": 17.5,
|
| 13149 |
+
"completions/min_length": 16.1,
|
| 13150 |
+
"completions/min_terminated_length": 16.1,
|
| 13151 |
+
"epoch": 0.3479097909790979,
|
| 13152 |
+
"frac_reward_zero_std": 1.0,
|
| 13153 |
+
"grad_norm": 0.0,
|
| 13154 |
+
"kl": 1.3188940420746804,
|
| 13155 |
+
"learning_rate": 4.121554508547518e-06,
|
| 13156 |
+
"loss": 0.0,
|
| 13157 |
+
"num_tokens": 7248869.0,
|
| 13158 |
+
"reward": 4.099999904632568,
|
| 13159 |
+
"reward_std": 0.0,
|
| 13160 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 13161 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 13162 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 13163 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 13164 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 13165 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 13166 |
+
"step": 5060
|
| 13167 |
+
},
|
| 13168 |
+
{
|
| 13169 |
+
"completion_length": 17.8,
|
| 13170 |
+
"completions/clipped_ratio": 0.0,
|
| 13171 |
+
"completions/max_length": 17.8,
|
| 13172 |
+
"completions/max_terminated_length": 17.8,
|
| 13173 |
+
"completions/mean_length": 16.575,
|
| 13174 |
+
"completions/mean_terminated_length": 16.575,
|
| 13175 |
+
"completions/min_length": 15.5,
|
| 13176 |
+
"completions/min_terminated_length": 15.5,
|
| 13177 |
+
"epoch": 0.3485973597359736,
|
| 13178 |
+
"frac_reward_zero_std": 0.9,
|
| 13179 |
+
"grad_norm": 0.0,
|
| 13180 |
+
"kl": 1.4126264125108718,
|
| 13181 |
+
"learning_rate": 4.116982838733449e-06,
|
| 13182 |
+
"loss": 0.0001,
|
| 13183 |
+
"num_tokens": 7260688.0,
|
| 13184 |
+
"reward": 4.074999904632568,
|
| 13185 |
+
"reward_std": 0.028867512941360474,
|
| 13186 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 13187 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 13188 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 13189 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 13190 |
+
"rewards/quality_reward_func/mean": 0.7750000119209289,
|
| 13191 |
+
"rewards/quality_reward_func/std": 0.028867512941360474,
|
| 13192 |
+
"step": 5070
|
| 13193 |
+
},
|
| 13194 |
+
{
|
| 13195 |
+
"completion_length": 20.5,
|
| 13196 |
+
"completions/clipped_ratio": 0.0,
|
| 13197 |
+
"completions/max_length": 20.5,
|
| 13198 |
+
"completions/max_terminated_length": 20.5,
|
| 13199 |
+
"completions/mean_length": 18.65,
|
| 13200 |
+
"completions/mean_terminated_length": 18.65,
|
| 13201 |
+
"completions/min_length": 16.8,
|
| 13202 |
+
"completions/min_terminated_length": 16.8,
|
| 13203 |
+
"epoch": 0.3492849284928493,
|
| 13204 |
+
"frac_reward_zero_std": 1.0,
|
| 13205 |
+
"grad_norm": 0.0,
|
| 13206 |
+
"kl": 1.5038474194705487,
|
| 13207 |
+
"learning_rate": 4.112401853723058e-06,
|
| 13208 |
+
"loss": 0.0001,
|
| 13209 |
+
"num_tokens": 7274190.0,
|
| 13210 |
+
"reward": 4.099999904632568,
|
| 13211 |
+
"reward_std": 0.0,
|
| 13212 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 13213 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 13214 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 13215 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 13216 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 13217 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 13218 |
+
"step": 5080
|
| 13219 |
+
},
|
| 13220 |
+
{
|
| 13221 |
+
"completion_length": 18.6,
|
| 13222 |
+
"completions/clipped_ratio": 0.0,
|
| 13223 |
+
"completions/max_length": 18.6,
|
| 13224 |
+
"completions/max_terminated_length": 18.6,
|
| 13225 |
+
"completions/mean_length": 16.325,
|
| 13226 |
+
"completions/mean_terminated_length": 16.325,
|
| 13227 |
+
"completions/min_length": 15.4,
|
| 13228 |
+
"completions/min_terminated_length": 15.4,
|
| 13229 |
+
"epoch": 0.34997249724972496,
|
| 13230 |
+
"frac_reward_zero_std": 1.0,
|
| 13231 |
+
"grad_norm": 0.0,
|
| 13232 |
+
"kl": 1.1819006368517875,
|
| 13233 |
+
"learning_rate": 4.107811579906718e-06,
|
| 13234 |
+
"loss": 0.0,
|
| 13235 |
+
"num_tokens": 7289051.0,
|
| 13236 |
+
"reward": 4.099999904632568,
|
| 13237 |
+
"reward_std": 0.0,
|
| 13238 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 13239 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 13240 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 13241 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 13242 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 13243 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 13244 |
+
"step": 5090
|
| 13245 |
+
},
|
| 13246 |
+
{
|
| 13247 |
+
"completion_length": 18.3,
|
| 13248 |
+
"completions/clipped_ratio": 0.0,
|
| 13249 |
+
"completions/max_length": 18.3,
|
| 13250 |
+
"completions/max_terminated_length": 18.3,
|
| 13251 |
+
"completions/mean_length": 16.375,
|
| 13252 |
+
"completions/mean_terminated_length": 16.375,
|
| 13253 |
+
"completions/min_length": 15.3,
|
| 13254 |
+
"completions/min_terminated_length": 15.3,
|
| 13255 |
+
"epoch": 0.35066006600660066,
|
| 13256 |
+
"frac_reward_zero_std": 1.0,
|
| 13257 |
+
"grad_norm": 0.0,
|
| 13258 |
+
"kl": 1.1194199629127979,
|
| 13259 |
+
"learning_rate": 4.103212043728308e-06,
|
| 13260 |
+
"loss": 0.0,
|
| 13261 |
+
"num_tokens": 7305174.0,
|
| 13262 |
+
"reward": 4.099999904632568,
|
| 13263 |
+
"reward_std": 0.0,
|
| 13264 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 13265 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 13266 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 13267 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 13268 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 13269 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 13270 |
+
"step": 5100
|
| 13271 |
+
},
|
| 13272 |
+
{
|
| 13273 |
+
"completion_length": 19.4,
|
| 13274 |
+
"completions/clipped_ratio": 0.0,
|
| 13275 |
+
"completions/max_length": 19.4,
|
| 13276 |
+
"completions/max_terminated_length": 19.4,
|
| 13277 |
+
"completions/mean_length": 17.225,
|
| 13278 |
+
"completions/mean_terminated_length": 17.225,
|
| 13279 |
+
"completions/min_length": 15.2,
|
| 13280 |
+
"completions/min_terminated_length": 15.2,
|
| 13281 |
+
"epoch": 0.35134763476347636,
|
| 13282 |
+
"frac_reward_zero_std": 1.0,
|
| 13283 |
+
"grad_norm": 0.0,
|
| 13284 |
+
"kl": 0.9471234813332557,
|
| 13285 |
+
"learning_rate": 4.09860327168507e-06,
|
| 13286 |
+
"loss": 0.0,
|
| 13287 |
+
"num_tokens": 7317111.0,
|
| 13288 |
+
"reward": 4.099999904632568,
|
| 13289 |
+
"reward_std": 0.0,
|
| 13290 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 13291 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 13292 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 13293 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 13294 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 13295 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 13296 |
+
"step": 5110
|
| 13297 |
+
},
|
| 13298 |
+
{
|
| 13299 |
+
"completion_length": 19.8,
|
| 13300 |
+
"completions/clipped_ratio": 0.0,
|
| 13301 |
+
"completions/max_length": 19.8,
|
| 13302 |
+
"completions/max_terminated_length": 19.8,
|
| 13303 |
+
"completions/mean_length": 17.45,
|
| 13304 |
+
"completions/mean_terminated_length": 17.45,
|
| 13305 |
+
"completions/min_length": 15.4,
|
| 13306 |
+
"completions/min_terminated_length": 15.4,
|
| 13307 |
+
"epoch": 0.35203520352035206,
|
| 13308 |
+
"frac_reward_zero_std": 1.0,
|
| 13309 |
+
"grad_norm": 0.0,
|
| 13310 |
+
"kl": 1.301464532315731,
|
| 13311 |
+
"learning_rate": 4.093985290327448e-06,
|
| 13312 |
+
"loss": 0.0,
|
| 13313 |
+
"num_tokens": 7332653.0,
|
| 13314 |
+
"reward": 4.099999904632568,
|
| 13315 |
+
"reward_std": 0.0,
|
| 13316 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 13317 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 13318 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 13319 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 13320 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 13321 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 13322 |
+
"step": 5120
|
| 13323 |
+
},
|
| 13324 |
+
{
|
| 13325 |
+
"completion_length": 17.8,
|
| 13326 |
+
"completions/clipped_ratio": 0.0,
|
| 13327 |
+
"completions/max_length": 17.8,
|
| 13328 |
+
"completions/max_terminated_length": 17.8,
|
| 13329 |
+
"completions/mean_length": 16.325,
|
| 13330 |
+
"completions/mean_terminated_length": 16.325,
|
| 13331 |
+
"completions/min_length": 14.9,
|
| 13332 |
+
"completions/min_terminated_length": 14.9,
|
| 13333 |
+
"epoch": 0.3527227722772277,
|
| 13334 |
+
"frac_reward_zero_std": 1.0,
|
| 13335 |
+
"grad_norm": 0.0,
|
| 13336 |
+
"kl": 1.5016500294208526,
|
| 13337 |
+
"learning_rate": 4.089358126258943e-06,
|
| 13338 |
+
"loss": 0.0001,
|
| 13339 |
+
"num_tokens": 7347698.0,
|
| 13340 |
+
"reward": 4.099999904632568,
|
| 13341 |
+
"reward_std": 0.0,
|
| 13342 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 13343 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 13344 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 13345 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 13346 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 13347 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 13348 |
+
"step": 5130
|
| 13349 |
+
},
|
| 13350 |
+
{
|
| 13351 |
+
"completion_length": 19.6,
|
| 13352 |
+
"completions/clipped_ratio": 0.0,
|
| 13353 |
+
"completions/max_length": 19.6,
|
| 13354 |
+
"completions/max_terminated_length": 19.6,
|
| 13355 |
+
"completions/mean_length": 17.275,
|
| 13356 |
+
"completions/mean_terminated_length": 17.275,
|
| 13357 |
+
"completions/min_length": 15.4,
|
| 13358 |
+
"completions/min_terminated_length": 15.4,
|
| 13359 |
+
"epoch": 0.3534103410341034,
|
| 13360 |
+
"frac_reward_zero_std": 1.0,
|
| 13361 |
+
"grad_norm": 0.0,
|
| 13362 |
+
"kl": 1.2618799805641174,
|
| 13363 |
+
"learning_rate": 4.084721806135956e-06,
|
| 13364 |
+
"loss": 0.0,
|
| 13365 |
+
"num_tokens": 7362377.0,
|
| 13366 |
+
"reward": 4.099999904632568,
|
| 13367 |
+
"reward_std": 0.0,
|
| 13368 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 13369 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 13370 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 13371 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 13372 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 13373 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 13374 |
+
"step": 5140
|
| 13375 |
+
},
|
| 13376 |
+
{
|
| 13377 |
+
"completion_length": 18.3,
|
| 13378 |
+
"completions/clipped_ratio": 0.0,
|
| 13379 |
+
"completions/max_length": 18.3,
|
| 13380 |
+
"completions/max_terminated_length": 18.3,
|
| 13381 |
+
"completions/mean_length": 16.775,
|
| 13382 |
+
"completions/mean_terminated_length": 16.775,
|
| 13383 |
+
"completions/min_length": 15.8,
|
| 13384 |
+
"completions/min_terminated_length": 15.8,
|
| 13385 |
+
"epoch": 0.3540979097909791,
|
| 13386 |
+
"frac_reward_zero_std": 1.0,
|
| 13387 |
+
"grad_norm": 0.0,
|
| 13388 |
+
"kl": 1.2617546994239093,
|
| 13389 |
+
"learning_rate": 4.080076356667633e-06,
|
| 13390 |
+
"loss": 0.0,
|
| 13391 |
+
"num_tokens": 7376940.0,
|
| 13392 |
+
"reward": 4.099999904632568,
|
| 13393 |
+
"reward_std": 0.0,
|
| 13394 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 13395 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 13396 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 13397 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 13398 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 13399 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 13400 |
+
"step": 5150
|
| 13401 |
}
|
| 13402 |
],
|
| 13403 |
"logging_steps": 10,
|
| 13404 |
"max_steps": 14544,
|
| 13405 |
+
"num_input_tokens_seen": 7376940,
|
| 13406 |
"num_train_epochs": 1,
|
| 13407 |
"save_steps": 50,
|
| 13408 |
"stateful_callbacks": {
|