Upload folder using huggingface_hub
Browse files- adapter_model.safetensors +1 -1
- optimizer.pt +1 -1
- rng_state.pth +1 -1
- scheduler.pt +1 -1
- trainer_state.json +1173 -3
adapter_model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 262406656
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cd8422397958e38dfc54623833b9c42fbf84c2192234f78716993373edeb9c08
|
| 3 |
size 262406656
|
optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 122872331
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f6acdf097a44425d0cb4aa2435e670892fe147410ce2c6c5fefed2de4c9ef796
|
| 3 |
size 122872331
|
rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14645
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b2e37a8994ae61da6b0a5cbf1dc8a1a1e4ca374128d672206c8b82cbdf6e4192
|
| 3 |
size 14645
|
scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1465
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6e0184609e0a634a7a19eed294044d17cbbacf15554dec1788c985d57897ec9e
|
| 3 |
size 1465
|
trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -13658,11 +13658,1181 @@
|
|
| 13658 |
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 13659 |
"rewards/quality_reward_func/std": 0.0,
|
| 13660 |
"step": 5250
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13661 |
}
|
| 13662 |
],
|
| 13663 |
"logging_steps": 10,
|
| 13664 |
"max_steps": 14544,
|
| 13665 |
-
"num_input_tokens_seen":
|
| 13666 |
"num_train_epochs": 1,
|
| 13667 |
"save_steps": 50,
|
| 13668 |
"stateful_callbacks": {
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.3919141914191419,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 5700,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 13658 |
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 13659 |
"rewards/quality_reward_func/std": 0.0,
|
| 13660 |
"step": 5250
|
| 13661 |
+
},
|
| 13662 |
+
{
|
| 13663 |
+
"completion_length": 18.08955223880597,
|
| 13664 |
+
"completions/clipped_ratio": 0.0,
|
| 13665 |
+
"completions/max_length": 18.11764705882353,
|
| 13666 |
+
"completions/max_terminated_length": 18.11764705882353,
|
| 13667 |
+
"completions/mean_length": 16.794117647058822,
|
| 13668 |
+
"completions/mean_terminated_length": 16.794117647058822,
|
| 13669 |
+
"completions/min_length": 15.411764705882353,
|
| 13670 |
+
"completions/min_terminated_length": 15.411764705882353,
|
| 13671 |
+
"epoch": 0.36166116611661164,
|
| 13672 |
+
"frac_reward_zero_std": 1.0,
|
| 13673 |
+
"grad_norm": 0.0,
|
| 13674 |
+
"kl": 1.0812231904979963,
|
| 13675 |
+
"learning_rate": 4.028381566875773e-06,
|
| 13676 |
+
"loss": 0.0,
|
| 13677 |
+
"num_tokens": 7540044.0,
|
| 13678 |
+
"reward": 4.099999904632568,
|
| 13679 |
+
"reward_std": 0.0,
|
| 13680 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 13681 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 13682 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 13683 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 13684 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 13685 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 13686 |
+
"step": 5260
|
| 13687 |
+
},
|
| 13688 |
+
{
|
| 13689 |
+
"completion_length": 20.15,
|
| 13690 |
+
"completions/clipped_ratio": 0.0,
|
| 13691 |
+
"completions/max_length": 20.1,
|
| 13692 |
+
"completions/max_terminated_length": 20.1,
|
| 13693 |
+
"completions/mean_length": 18.375,
|
| 13694 |
+
"completions/mean_terminated_length": 18.375,
|
| 13695 |
+
"completions/min_length": 16.6,
|
| 13696 |
+
"completions/min_terminated_length": 16.6,
|
| 13697 |
+
"epoch": 0.36234873487348734,
|
| 13698 |
+
"frac_reward_zero_std": 1.0,
|
| 13699 |
+
"grad_norm": 0.0,
|
| 13700 |
+
"kl": 0.9661604385823012,
|
| 13701 |
+
"learning_rate": 4.02362866756573e-06,
|
| 13702 |
+
"loss": 0.0,
|
| 13703 |
+
"num_tokens": 7554587.0,
|
| 13704 |
+
"reward": 4.099999904632568,
|
| 13705 |
+
"reward_std": 0.0,
|
| 13706 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 13707 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 13708 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 13709 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 13710 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 13711 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 13712 |
+
"step": 5270
|
| 13713 |
+
},
|
| 13714 |
+
{
|
| 13715 |
+
"completion_length": 16.775,
|
| 13716 |
+
"completions/clipped_ratio": 0.0,
|
| 13717 |
+
"completions/max_length": 16.7,
|
| 13718 |
+
"completions/max_terminated_length": 16.7,
|
| 13719 |
+
"completions/mean_length": 16.075,
|
| 13720 |
+
"completions/mean_terminated_length": 16.075,
|
| 13721 |
+
"completions/min_length": 15.4,
|
| 13722 |
+
"completions/min_terminated_length": 15.4,
|
| 13723 |
+
"epoch": 0.36303630363036304,
|
| 13724 |
+
"frac_reward_zero_std": 1.0,
|
| 13725 |
+
"grad_norm": 0.0,
|
| 13726 |
+
"kl": 1.3903781726956368,
|
| 13727 |
+
"learning_rate": 4.018866990858785e-06,
|
| 13728 |
+
"loss": 0.0,
|
| 13729 |
+
"num_tokens": 7569714.0,
|
| 13730 |
+
"reward": 4.099999904632568,
|
| 13731 |
+
"reward_std": 0.0,
|
| 13732 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 13733 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 13734 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 13735 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 13736 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 13737 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 13738 |
+
"step": 5280
|
| 13739 |
+
},
|
| 13740 |
+
{
|
| 13741 |
+
"completion_length": 19.675,
|
| 13742 |
+
"completions/clipped_ratio": 0.0,
|
| 13743 |
+
"completions/max_length": 19.8,
|
| 13744 |
+
"completions/max_terminated_length": 19.8,
|
| 13745 |
+
"completions/mean_length": 18.15,
|
| 13746 |
+
"completions/mean_terminated_length": 18.15,
|
| 13747 |
+
"completions/min_length": 16.6,
|
| 13748 |
+
"completions/min_terminated_length": 16.6,
|
| 13749 |
+
"epoch": 0.36372387238723874,
|
| 13750 |
+
"frac_reward_zero_std": 1.0,
|
| 13751 |
+
"grad_norm": 0.0,
|
| 13752 |
+
"kl": 1.3349122866988181,
|
| 13753 |
+
"learning_rate": 4.014096564186248e-06,
|
| 13754 |
+
"loss": 0.0,
|
| 13755 |
+
"num_tokens": 7582848.0,
|
| 13756 |
+
"reward": 4.099999904632568,
|
| 13757 |
+
"reward_std": 0.0,
|
| 13758 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 13759 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 13760 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 13761 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 13762 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 13763 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 13764 |
+
"step": 5290
|
| 13765 |
+
},
|
| 13766 |
+
{
|
| 13767 |
+
"completion_length": 19.425,
|
| 13768 |
+
"completions/clipped_ratio": 0.0,
|
| 13769 |
+
"completions/max_length": 19.4,
|
| 13770 |
+
"completions/max_terminated_length": 19.4,
|
| 13771 |
+
"completions/mean_length": 17.575,
|
| 13772 |
+
"completions/mean_terminated_length": 17.575,
|
| 13773 |
+
"completions/min_length": 16.3,
|
| 13774 |
+
"completions/min_terminated_length": 16.3,
|
| 13775 |
+
"epoch": 0.3644114411441144,
|
| 13776 |
+
"frac_reward_zero_std": 1.0,
|
| 13777 |
+
"grad_norm": 0.0,
|
| 13778 |
+
"kl": 0.8336154259741306,
|
| 13779 |
+
"learning_rate": 4.009317415029832e-06,
|
| 13780 |
+
"loss": 0.0,
|
| 13781 |
+
"num_tokens": 7597619.0,
|
| 13782 |
+
"reward": 4.099999904632568,
|
| 13783 |
+
"reward_std": 0.0,
|
| 13784 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 13785 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 13786 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 13787 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 13788 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 13789 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 13790 |
+
"step": 5300
|
| 13791 |
+
},
|
| 13792 |
+
{
|
| 13793 |
+
"completion_length": 18.65,
|
| 13794 |
+
"completions/clipped_ratio": 0.0,
|
| 13795 |
+
"completions/max_length": 18.5,
|
| 13796 |
+
"completions/max_terminated_length": 18.5,
|
| 13797 |
+
"completions/mean_length": 17.85,
|
| 13798 |
+
"completions/mean_terminated_length": 17.85,
|
| 13799 |
+
"completions/min_length": 17.1,
|
| 13800 |
+
"completions/min_terminated_length": 17.1,
|
| 13801 |
+
"epoch": 0.3650990099009901,
|
| 13802 |
+
"frac_reward_zero_std": 1.0,
|
| 13803 |
+
"grad_norm": 0.0,
|
| 13804 |
+
"kl": 1.0214567624032498,
|
| 13805 |
+
"learning_rate": 4.004529570921501e-06,
|
| 13806 |
+
"loss": 0.0,
|
| 13807 |
+
"num_tokens": 7612549.0,
|
| 13808 |
+
"reward": 4.099999904632568,
|
| 13809 |
+
"reward_std": 0.0,
|
| 13810 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 13811 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 13812 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 13813 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 13814 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 13815 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 13816 |
+
"step": 5310
|
| 13817 |
+
},
|
| 13818 |
+
{
|
| 13819 |
+
"completion_length": 17.975,
|
| 13820 |
+
"completions/clipped_ratio": 0.0,
|
| 13821 |
+
"completions/max_length": 18.1,
|
| 13822 |
+
"completions/max_terminated_length": 18.1,
|
| 13823 |
+
"completions/mean_length": 17.15,
|
| 13824 |
+
"completions/mean_terminated_length": 17.15,
|
| 13825 |
+
"completions/min_length": 16.2,
|
| 13826 |
+
"completions/min_terminated_length": 16.2,
|
| 13827 |
+
"epoch": 0.3657865786578658,
|
| 13828 |
+
"frac_reward_zero_std": 1.0,
|
| 13829 |
+
"grad_norm": 0.0,
|
| 13830 |
+
"kl": 1.2231212853454054,
|
| 13831 |
+
"learning_rate": 3.99973305944331e-06,
|
| 13832 |
+
"loss": 0.0,
|
| 13833 |
+
"num_tokens": 7627539.0,
|
| 13834 |
+
"reward": 4.099999904632568,
|
| 13835 |
+
"reward_std": 0.0,
|
| 13836 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 13837 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 13838 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 13839 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 13840 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 13841 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 13842 |
+
"step": 5320
|
| 13843 |
+
},
|
| 13844 |
+
{
|
| 13845 |
+
"completion_length": 17.775,
|
| 13846 |
+
"completions/clipped_ratio": 0.0,
|
| 13847 |
+
"completions/max_length": 17.8,
|
| 13848 |
+
"completions/max_terminated_length": 17.8,
|
| 13849 |
+
"completions/mean_length": 17.175,
|
| 13850 |
+
"completions/mean_terminated_length": 17.175,
|
| 13851 |
+
"completions/min_length": 16.3,
|
| 13852 |
+
"completions/min_terminated_length": 16.3,
|
| 13853 |
+
"epoch": 0.3664741474147415,
|
| 13854 |
+
"frac_reward_zero_std": 1.0,
|
| 13855 |
+
"grad_norm": 0.0,
|
| 13856 |
+
"kl": 1.197108805179596,
|
| 13857 |
+
"learning_rate": 3.9949279082272425e-06,
|
| 13858 |
+
"loss": 0.0,
|
| 13859 |
+
"num_tokens": 7643738.0,
|
| 13860 |
+
"reward": 4.099999904632568,
|
| 13861 |
+
"reward_std": 0.0,
|
| 13862 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 13863 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 13864 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 13865 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 13866 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 13867 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 13868 |
+
"step": 5330
|
| 13869 |
+
},
|
| 13870 |
+
{
|
| 13871 |
+
"completion_length": 17.375,
|
| 13872 |
+
"completions/clipped_ratio": 0.0,
|
| 13873 |
+
"completions/max_length": 17.4,
|
| 13874 |
+
"completions/max_terminated_length": 17.4,
|
| 13875 |
+
"completions/mean_length": 15.925,
|
| 13876 |
+
"completions/mean_terminated_length": 15.925,
|
| 13877 |
+
"completions/min_length": 13.9,
|
| 13878 |
+
"completions/min_terminated_length": 13.9,
|
| 13879 |
+
"epoch": 0.36716171617161714,
|
| 13880 |
+
"frac_reward_zero_std": 1.0,
|
| 13881 |
+
"grad_norm": 0.0,
|
| 13882 |
+
"kl": 1.1159055039286614,
|
| 13883 |
+
"learning_rate": 3.9901141449550565e-06,
|
| 13884 |
+
"loss": 0.0,
|
| 13885 |
+
"num_tokens": 7658551.0,
|
| 13886 |
+
"reward": 4.099999904632568,
|
| 13887 |
+
"reward_std": 0.0,
|
| 13888 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 13889 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 13890 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 13891 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 13892 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 13893 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 13894 |
+
"step": 5340
|
| 13895 |
+
},
|
| 13896 |
+
{
|
| 13897 |
+
"completion_length": 17.075,
|
| 13898 |
+
"completions/clipped_ratio": 0.0,
|
| 13899 |
+
"completions/max_length": 16.9,
|
| 13900 |
+
"completions/max_terminated_length": 16.9,
|
| 13901 |
+
"completions/mean_length": 16.075,
|
| 13902 |
+
"completions/mean_terminated_length": 16.075,
|
| 13903 |
+
"completions/min_length": 15.6,
|
| 13904 |
+
"completions/min_terminated_length": 15.6,
|
| 13905 |
+
"epoch": 0.36784928492849284,
|
| 13906 |
+
"frac_reward_zero_std": 1.0,
|
| 13907 |
+
"grad_norm": 0.0,
|
| 13908 |
+
"kl": 1.0155922904610635,
|
| 13909 |
+
"learning_rate": 3.985291797358123e-06,
|
| 13910 |
+
"loss": 0.0,
|
| 13911 |
+
"num_tokens": 7671674.0,
|
| 13912 |
+
"reward": 4.099999904632568,
|
| 13913 |
+
"reward_std": 0.0,
|
| 13914 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 13915 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 13916 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 13917 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 13918 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 13919 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 13920 |
+
"step": 5350
|
| 13921 |
+
},
|
| 13922 |
+
{
|
| 13923 |
+
"completion_length": 16.825,
|
| 13924 |
+
"completions/clipped_ratio": 0.0,
|
| 13925 |
+
"completions/max_length": 17.0,
|
| 13926 |
+
"completions/max_terminated_length": 17.0,
|
| 13927 |
+
"completions/mean_length": 16.575,
|
| 13928 |
+
"completions/mean_terminated_length": 16.575,
|
| 13929 |
+
"completions/min_length": 16.1,
|
| 13930 |
+
"completions/min_terminated_length": 16.1,
|
| 13931 |
+
"epoch": 0.36853685368536854,
|
| 13932 |
+
"frac_reward_zero_std": 1.0,
|
| 13933 |
+
"grad_norm": 0.0,
|
| 13934 |
+
"kl": 1.2678054243326187,
|
| 13935 |
+
"learning_rate": 3.980460893217265e-06,
|
| 13936 |
+
"loss": 0.0,
|
| 13937 |
+
"num_tokens": 7684565.0,
|
| 13938 |
+
"reward": 4.099999904632568,
|
| 13939 |
+
"reward_std": 0.0,
|
| 13940 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 13941 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 13942 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 13943 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 13944 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 13945 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 13946 |
+
"step": 5360
|
| 13947 |
+
},
|
| 13948 |
+
{
|
| 13949 |
+
"completion_length": 17.975,
|
| 13950 |
+
"completions/clipped_ratio": 0.0,
|
| 13951 |
+
"completions/max_length": 17.8,
|
| 13952 |
+
"completions/max_terminated_length": 17.8,
|
| 13953 |
+
"completions/mean_length": 17.0,
|
| 13954 |
+
"completions/mean_terminated_length": 17.0,
|
| 13955 |
+
"completions/min_length": 16.1,
|
| 13956 |
+
"completions/min_terminated_length": 16.1,
|
| 13957 |
+
"epoch": 0.36922442244224424,
|
| 13958 |
+
"frac_reward_zero_std": 1.0,
|
| 13959 |
+
"grad_norm": 0.0,
|
| 13960 |
+
"kl": 1.2318198367953301,
|
| 13961 |
+
"learning_rate": 3.9756214603626e-06,
|
| 13962 |
+
"loss": 0.0,
|
| 13963 |
+
"num_tokens": 7698909.0,
|
| 13964 |
+
"reward": 4.099999904632568,
|
| 13965 |
+
"reward_std": 0.0,
|
| 13966 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 13967 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 13968 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 13969 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 13970 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 13971 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 13972 |
+
"step": 5370
|
| 13973 |
+
},
|
| 13974 |
+
{
|
| 13975 |
+
"completion_length": 18.95,
|
| 13976 |
+
"completions/clipped_ratio": 0.0,
|
| 13977 |
+
"completions/max_length": 19.0,
|
| 13978 |
+
"completions/max_terminated_length": 19.0,
|
| 13979 |
+
"completions/mean_length": 17.2,
|
| 13980 |
+
"completions/mean_terminated_length": 17.2,
|
| 13981 |
+
"completions/min_length": 15.4,
|
| 13982 |
+
"completions/min_terminated_length": 15.4,
|
| 13983 |
+
"epoch": 0.36991199119911994,
|
| 13984 |
+
"frac_reward_zero_std": 1.0,
|
| 13985 |
+
"grad_norm": 0.0,
|
| 13986 |
+
"kl": 1.3333981722593307,
|
| 13987 |
+
"learning_rate": 3.9707735266733735e-06,
|
| 13988 |
+
"loss": 0.0,
|
| 13989 |
+
"num_tokens": 7715133.0,
|
| 13990 |
+
"reward": 4.099999904632568,
|
| 13991 |
+
"reward_std": 0.0,
|
| 13992 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 13993 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 13994 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 13995 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 13996 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 13997 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 13998 |
+
"step": 5380
|
| 13999 |
+
},
|
| 14000 |
+
{
|
| 14001 |
+
"completion_length": 17.425,
|
| 14002 |
+
"completions/clipped_ratio": 0.0,
|
| 14003 |
+
"completions/max_length": 17.7,
|
| 14004 |
+
"completions/max_terminated_length": 17.7,
|
| 14005 |
+
"completions/mean_length": 16.8,
|
| 14006 |
+
"completions/mean_terminated_length": 16.8,
|
| 14007 |
+
"completions/min_length": 16.0,
|
| 14008 |
+
"completions/min_terminated_length": 16.0,
|
| 14009 |
+
"epoch": 0.3705995599559956,
|
| 14010 |
+
"frac_reward_zero_std": 1.0,
|
| 14011 |
+
"grad_norm": 0.0,
|
| 14012 |
+
"kl": 1.3391637369990348,
|
| 14013 |
+
"learning_rate": 3.965917120077811e-06,
|
| 14014 |
+
"loss": 0.0,
|
| 14015 |
+
"num_tokens": 7727317.0,
|
| 14016 |
+
"reward": 4.099999904632568,
|
| 14017 |
+
"reward_std": 0.0,
|
| 14018 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 14019 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 14020 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 14021 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 14022 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 14023 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 14024 |
+
"step": 5390
|
| 14025 |
+
},
|
| 14026 |
+
{
|
| 14027 |
+
"completion_length": 19.15,
|
| 14028 |
+
"completions/clipped_ratio": 0.0,
|
| 14029 |
+
"completions/max_length": 19.0,
|
| 14030 |
+
"completions/max_terminated_length": 19.0,
|
| 14031 |
+
"completions/mean_length": 16.925,
|
| 14032 |
+
"completions/mean_terminated_length": 16.925,
|
| 14033 |
+
"completions/min_length": 15.6,
|
| 14034 |
+
"completions/min_terminated_length": 15.6,
|
| 14035 |
+
"epoch": 0.3712871287128713,
|
| 14036 |
+
"frac_reward_zero_std": 1.0,
|
| 14037 |
+
"grad_norm": 0.0,
|
| 14038 |
+
"kl": 1.1109920389950276,
|
| 14039 |
+
"learning_rate": 3.961052268552941e-06,
|
| 14040 |
+
"loss": 0.0,
|
| 14041 |
+
"num_tokens": 7743642.0,
|
| 14042 |
+
"reward": 4.099999904632568,
|
| 14043 |
+
"reward_std": 0.0,
|
| 14044 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 14045 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 14046 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 14047 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 14048 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 14049 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 14050 |
+
"step": 5400
|
| 14051 |
+
},
|
| 14052 |
+
{
|
| 14053 |
+
"completion_length": 16.95,
|
| 14054 |
+
"completions/clipped_ratio": 0.0,
|
| 14055 |
+
"completions/max_length": 16.9,
|
| 14056 |
+
"completions/max_terminated_length": 16.9,
|
| 14057 |
+
"completions/mean_length": 15.875,
|
| 14058 |
+
"completions/mean_terminated_length": 15.875,
|
| 14059 |
+
"completions/min_length": 14.9,
|
| 14060 |
+
"completions/min_terminated_length": 14.9,
|
| 14061 |
+
"epoch": 0.371974697469747,
|
| 14062 |
+
"frac_reward_zero_std": 1.0,
|
| 14063 |
+
"grad_norm": 5.6203894928330556e-05,
|
| 14064 |
+
"kl": 1.363871442526579,
|
| 14065 |
+
"learning_rate": 3.956179000124447e-06,
|
| 14066 |
+
"loss": 0.0,
|
| 14067 |
+
"num_tokens": 7758365.0,
|
| 14068 |
+
"reward": 4.099999904632568,
|
| 14069 |
+
"reward_std": 0.0,
|
| 14070 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 14071 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 14072 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 14073 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 14074 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 14075 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 14076 |
+
"step": 5410
|
| 14077 |
+
},
|
| 14078 |
+
{
|
| 14079 |
+
"completion_length": 17.825,
|
| 14080 |
+
"completions/clipped_ratio": 0.0,
|
| 14081 |
+
"completions/max_length": 17.7,
|
| 14082 |
+
"completions/max_terminated_length": 17.7,
|
| 14083 |
+
"completions/mean_length": 16.15,
|
| 14084 |
+
"completions/mean_terminated_length": 16.15,
|
| 14085 |
+
"completions/min_length": 15.2,
|
| 14086 |
+
"completions/min_terminated_length": 15.2,
|
| 14087 |
+
"epoch": 0.3726622662266227,
|
| 14088 |
+
"frac_reward_zero_std": 1.0,
|
| 14089 |
+
"grad_norm": 0.0,
|
| 14090 |
+
"kl": 0.9931762866675854,
|
| 14091 |
+
"learning_rate": 3.9512973428665e-06,
|
| 14092 |
+
"loss": 0.0,
|
| 14093 |
+
"num_tokens": 7772323.0,
|
| 14094 |
+
"reward": 4.099999904632568,
|
| 14095 |
+
"reward_std": 0.0,
|
| 14096 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 14097 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 14098 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 14099 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 14100 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 14101 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 14102 |
+
"step": 5420
|
| 14103 |
+
},
|
| 14104 |
+
{
|
| 14105 |
+
"completion_length": 21.425,
|
| 14106 |
+
"completions/clipped_ratio": 0.0,
|
| 14107 |
+
"completions/max_length": 21.6,
|
| 14108 |
+
"completions/max_terminated_length": 21.6,
|
| 14109 |
+
"completions/mean_length": 18.825,
|
| 14110 |
+
"completions/mean_terminated_length": 18.825,
|
| 14111 |
+
"completions/min_length": 16.1,
|
| 14112 |
+
"completions/min_terminated_length": 16.1,
|
| 14113 |
+
"epoch": 0.37334983498349833,
|
| 14114 |
+
"frac_reward_zero_std": 1.0,
|
| 14115 |
+
"grad_norm": 0.0,
|
| 14116 |
+
"kl": 1.0012955855578185,
|
| 14117 |
+
"learning_rate": 3.946407324901598e-06,
|
| 14118 |
+
"loss": 0.0,
|
| 14119 |
+
"num_tokens": 7785692.0,
|
| 14120 |
+
"reward": 4.099999904632568,
|
| 14121 |
+
"reward_std": 0.0,
|
| 14122 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 14123 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 14124 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 14125 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 14126 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 14127 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 14128 |
+
"step": 5430
|
| 14129 |
+
},
|
| 14130 |
+
{
|
| 14131 |
+
"completion_length": 21.0,
|
| 14132 |
+
"completions/clipped_ratio": 0.0,
|
| 14133 |
+
"completions/max_length": 21.2,
|
| 14134 |
+
"completions/max_terminated_length": 21.2,
|
| 14135 |
+
"completions/mean_length": 17.95,
|
| 14136 |
+
"completions/mean_terminated_length": 17.95,
|
| 14137 |
+
"completions/min_length": 16.1,
|
| 14138 |
+
"completions/min_terminated_length": 16.1,
|
| 14139 |
+
"epoch": 0.37403740374037403,
|
| 14140 |
+
"frac_reward_zero_std": 1.0,
|
| 14141 |
+
"grad_norm": 0.0,
|
| 14142 |
+
"kl": 1.0526311319321393,
|
| 14143 |
+
"learning_rate": 3.941508974400401e-06,
|
| 14144 |
+
"loss": 0.0,
|
| 14145 |
+
"num_tokens": 7802662.0,
|
| 14146 |
+
"reward": 4.099999904632568,
|
| 14147 |
+
"reward_std": 0.0,
|
| 14148 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 14149 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 14150 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 14151 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 14152 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 14153 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 14154 |
+
"step": 5440
|
| 14155 |
+
},
|
| 14156 |
+
{
|
| 14157 |
+
"completion_length": 17.1,
|
| 14158 |
+
"completions/clipped_ratio": 0.0,
|
| 14159 |
+
"completions/max_length": 16.9,
|
| 14160 |
+
"completions/max_terminated_length": 16.9,
|
| 14161 |
+
"completions/mean_length": 16.575,
|
| 14162 |
+
"completions/mean_terminated_length": 16.575,
|
| 14163 |
+
"completions/min_length": 16.3,
|
| 14164 |
+
"completions/min_terminated_length": 16.3,
|
| 14165 |
+
"epoch": 0.37472497249724973,
|
| 14166 |
+
"frac_reward_zero_std": 1.0,
|
| 14167 |
+
"grad_norm": 0.0,
|
| 14168 |
+
"kl": 1.1798742283135653,
|
| 14169 |
+
"learning_rate": 3.9366023195815755e-06,
|
| 14170 |
+
"loss": 0.0,
|
| 14171 |
+
"num_tokens": 7817133.0,
|
| 14172 |
+
"reward": 4.099999904632568,
|
| 14173 |
+
"reward_std": 0.0,
|
| 14174 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 14175 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 14176 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 14177 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 14178 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 14179 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 14180 |
+
"step": 5450
|
| 14181 |
+
},
|
| 14182 |
+
{
|
| 14183 |
+
"completion_length": 19.2,
|
| 14184 |
+
"completions/clipped_ratio": 0.0,
|
| 14185 |
+
"completions/max_length": 19.3,
|
| 14186 |
+
"completions/max_terminated_length": 19.3,
|
| 14187 |
+
"completions/mean_length": 18.225,
|
| 14188 |
+
"completions/mean_terminated_length": 18.225,
|
| 14189 |
+
"completions/min_length": 16.6,
|
| 14190 |
+
"completions/min_terminated_length": 16.6,
|
| 14191 |
+
"epoch": 0.37541254125412543,
|
| 14192 |
+
"frac_reward_zero_std": 1.0,
|
| 14193 |
+
"grad_norm": 0.0,
|
| 14194 |
+
"kl": 1.3727002948522569,
|
| 14195 |
+
"learning_rate": 3.931687388711626e-06,
|
| 14196 |
+
"loss": 0.0001,
|
| 14197 |
+
"num_tokens": 7833654.0,
|
| 14198 |
+
"reward": 4.099999904632568,
|
| 14199 |
+
"reward_std": 0.0,
|
| 14200 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 14201 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 14202 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 14203 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 14204 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 14205 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 14206 |
+
"step": 5460
|
| 14207 |
+
},
|
| 14208 |
+
{
|
| 14209 |
+
"completion_length": 20.15,
|
| 14210 |
+
"completions/clipped_ratio": 0.0,
|
| 14211 |
+
"completions/max_length": 20.0,
|
| 14212 |
+
"completions/max_terminated_length": 20.0,
|
| 14213 |
+
"completions/mean_length": 17.3,
|
| 14214 |
+
"completions/mean_terminated_length": 17.3,
|
| 14215 |
+
"completions/min_length": 15.6,
|
| 14216 |
+
"completions/min_terminated_length": 15.6,
|
| 14217 |
+
"epoch": 0.3761001100110011,
|
| 14218 |
+
"frac_reward_zero_std": 1.0,
|
| 14219 |
+
"grad_norm": 0.0,
|
| 14220 |
+
"kl": 1.0240365587174893,
|
| 14221 |
+
"learning_rate": 3.926764210104733e-06,
|
| 14222 |
+
"loss": 0.0,
|
| 14223 |
+
"num_tokens": 7851086.0,
|
| 14224 |
+
"reward": 4.099999904632568,
|
| 14225 |
+
"reward_std": 0.0,
|
| 14226 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 14227 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 14228 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 14229 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 14230 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 14231 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 14232 |
+
"step": 5470
|
| 14233 |
+
},
|
| 14234 |
+
{
|
| 14235 |
+
"completion_length": 19.075,
|
| 14236 |
+
"completions/clipped_ratio": 0.0,
|
| 14237 |
+
"completions/max_length": 19.2,
|
| 14238 |
+
"completions/max_terminated_length": 19.2,
|
| 14239 |
+
"completions/mean_length": 17.2,
|
| 14240 |
+
"completions/mean_terminated_length": 17.2,
|
| 14241 |
+
"completions/min_length": 15.3,
|
| 14242 |
+
"completions/min_terminated_length": 15.3,
|
| 14243 |
+
"epoch": 0.3767876787678768,
|
| 14244 |
+
"frac_reward_zero_std": 1.0,
|
| 14245 |
+
"grad_norm": 0.0,
|
| 14246 |
+
"kl": 1.1240653157234193,
|
| 14247 |
+
"learning_rate": 3.921832812122593e-06,
|
| 14248 |
+
"loss": 0.0,
|
| 14249 |
+
"num_tokens": 7867270.0,
|
| 14250 |
+
"reward": 4.099999904632568,
|
| 14251 |
+
"reward_std": 0.0,
|
| 14252 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 14253 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 14254 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 14255 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 14256 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 14257 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 14258 |
+
"step": 5480
|
| 14259 |
+
},
|
| 14260 |
+
{
|
| 14261 |
+
"completion_length": 17.975,
|
| 14262 |
+
"completions/clipped_ratio": 0.0,
|
| 14263 |
+
"completions/max_length": 17.8,
|
| 14264 |
+
"completions/max_terminated_length": 17.8,
|
| 14265 |
+
"completions/mean_length": 16.55,
|
| 14266 |
+
"completions/mean_terminated_length": 16.55,
|
| 14267 |
+
"completions/min_length": 15.8,
|
| 14268 |
+
"completions/min_terminated_length": 15.8,
|
| 14269 |
+
"epoch": 0.3774752475247525,
|
| 14270 |
+
"frac_reward_zero_std": 1.0,
|
| 14271 |
+
"grad_norm": 0.0,
|
| 14272 |
+
"kl": 1.4767700091004372,
|
| 14273 |
+
"learning_rate": 3.916893223174254e-06,
|
| 14274 |
+
"loss": 0.0001,
|
| 14275 |
+
"num_tokens": 7882340.0,
|
| 14276 |
+
"reward": 4.099999904632568,
|
| 14277 |
+
"reward_std": 0.0,
|
| 14278 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 14279 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 14280 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 14281 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 14282 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 14283 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 14284 |
+
"step": 5490
|
| 14285 |
+
},
|
| 14286 |
+
{
|
| 14287 |
+
"completion_length": 18.725,
|
| 14288 |
+
"completions/clipped_ratio": 0.0,
|
| 14289 |
+
"completions/max_length": 18.8,
|
| 14290 |
+
"completions/max_terminated_length": 18.8,
|
| 14291 |
+
"completions/mean_length": 16.35,
|
| 14292 |
+
"completions/mean_terminated_length": 16.35,
|
| 14293 |
+
"completions/min_length": 14.8,
|
| 14294 |
+
"completions/min_terminated_length": 14.8,
|
| 14295 |
+
"epoch": 0.3781628162816282,
|
| 14296 |
+
"frac_reward_zero_std": 1.0,
|
| 14297 |
+
"grad_norm": 0.0,
|
| 14298 |
+
"kl": 1.476631324738264,
|
| 14299 |
+
"learning_rate": 3.911945471715947e-06,
|
| 14300 |
+
"loss": 0.0001,
|
| 14301 |
+
"num_tokens": 7897518.0,
|
| 14302 |
+
"reward": 4.099999904632568,
|
| 14303 |
+
"reward_std": 0.0,
|
| 14304 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 14305 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 14306 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 14307 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 14308 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 14309 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 14310 |
+
"step": 5500
|
| 14311 |
+
},
|
| 14312 |
+
{
|
| 14313 |
+
"completion_length": 19.825,
|
| 14314 |
+
"completions/clipped_ratio": 0.0,
|
| 14315 |
+
"completions/max_length": 19.8,
|
| 14316 |
+
"completions/max_terminated_length": 19.8,
|
| 14317 |
+
"completions/mean_length": 18.3,
|
| 14318 |
+
"completions/mean_terminated_length": 18.3,
|
| 14319 |
+
"completions/min_length": 16.8,
|
| 14320 |
+
"completions/min_terminated_length": 16.8,
|
| 14321 |
+
"epoch": 0.3788503850385038,
|
| 14322 |
+
"frac_reward_zero_std": 1.0,
|
| 14323 |
+
"grad_norm": 0.0,
|
| 14324 |
+
"kl": 1.1571273379027844,
|
| 14325 |
+
"learning_rate": 3.906989586250928e-06,
|
| 14326 |
+
"loss": 0.0,
|
| 14327 |
+
"num_tokens": 7911386.0,
|
| 14328 |
+
"reward": 4.099999904632568,
|
| 14329 |
+
"reward_std": 0.0,
|
| 14330 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 14331 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 14332 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 14333 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 14334 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 14335 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 14336 |
+
"step": 5510
|
| 14337 |
+
},
|
| 14338 |
+
{
|
| 14339 |
+
"completion_length": 17.3,
|
| 14340 |
+
"completions/clipped_ratio": 0.0,
|
| 14341 |
+
"completions/max_length": 17.3,
|
| 14342 |
+
"completions/max_terminated_length": 17.3,
|
| 14343 |
+
"completions/mean_length": 15.575,
|
| 14344 |
+
"completions/mean_terminated_length": 15.575,
|
| 14345 |
+
"completions/min_length": 14.7,
|
| 14346 |
+
"completions/min_terminated_length": 14.7,
|
| 14347 |
+
"epoch": 0.3795379537953795,
|
| 14348 |
+
"frac_reward_zero_std": 1.0,
|
| 14349 |
+
"grad_norm": 0.0,
|
| 14350 |
+
"kl": 1.3854421511292458,
|
| 14351 |
+
"learning_rate": 3.902025595329314e-06,
|
| 14352 |
+
"loss": 0.0,
|
| 14353 |
+
"num_tokens": 7923165.0,
|
| 14354 |
+
"reward": 4.099999904632568,
|
| 14355 |
+
"reward_std": 0.0,
|
| 14356 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 14357 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 14358 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 14359 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 14360 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 14361 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 14362 |
+
"step": 5520
|
| 14363 |
+
},
|
| 14364 |
+
{
|
| 14365 |
+
"completion_length": 18.2,
|
| 14366 |
+
"completions/clipped_ratio": 0.0,
|
| 14367 |
+
"completions/max_length": 18.2,
|
| 14368 |
+
"completions/max_terminated_length": 18.2,
|
| 14369 |
+
"completions/mean_length": 16.75,
|
| 14370 |
+
"completions/mean_terminated_length": 16.75,
|
| 14371 |
+
"completions/min_length": 15.5,
|
| 14372 |
+
"completions/min_terminated_length": 15.5,
|
| 14373 |
+
"epoch": 0.3802255225522552,
|
| 14374 |
+
"frac_reward_zero_std": 1.0,
|
| 14375 |
+
"grad_norm": 0.0,
|
| 14376 |
+
"kl": 1.2270353332161903,
|
| 14377 |
+
"learning_rate": 3.897053527547912e-06,
|
| 14378 |
+
"loss": 0.0,
|
| 14379 |
+
"num_tokens": 7937471.0,
|
| 14380 |
+
"reward": 4.099999904632568,
|
| 14381 |
+
"reward_std": 0.0,
|
| 14382 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 14383 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 14384 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 14385 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 14386 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 14387 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 14388 |
+
"step": 5530
|
| 14389 |
+
},
|
| 14390 |
+
{
|
| 14391 |
+
"completion_length": 17.6,
|
| 14392 |
+
"completions/clipped_ratio": 0.0,
|
| 14393 |
+
"completions/max_length": 17.7,
|
| 14394 |
+
"completions/max_terminated_length": 17.7,
|
| 14395 |
+
"completions/mean_length": 16.75,
|
| 14396 |
+
"completions/mean_terminated_length": 16.75,
|
| 14397 |
+
"completions/min_length": 16.1,
|
| 14398 |
+
"completions/min_terminated_length": 16.1,
|
| 14399 |
+
"epoch": 0.3809130913091309,
|
| 14400 |
+
"frac_reward_zero_std": 1.0,
|
| 14401 |
+
"grad_norm": 0.0,
|
| 14402 |
+
"kl": 1.0666535507887602,
|
| 14403 |
+
"learning_rate": 3.892073411550062e-06,
|
| 14404 |
+
"loss": 0.0,
|
| 14405 |
+
"num_tokens": 7951813.0,
|
| 14406 |
+
"reward": 4.099999904632568,
|
| 14407 |
+
"reward_std": 0.0,
|
| 14408 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 14409 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 14410 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 14411 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 14412 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 14413 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 14414 |
+
"step": 5540
|
| 14415 |
+
},
|
| 14416 |
+
{
|
| 14417 |
+
"completion_length": 17.7,
|
| 14418 |
+
"completions/clipped_ratio": 0.0,
|
| 14419 |
+
"completions/max_length": 17.9,
|
| 14420 |
+
"completions/max_terminated_length": 17.9,
|
| 14421 |
+
"completions/mean_length": 16.9,
|
| 14422 |
+
"completions/mean_terminated_length": 16.9,
|
| 14423 |
+
"completions/min_length": 15.5,
|
| 14424 |
+
"completions/min_terminated_length": 15.5,
|
| 14425 |
+
"epoch": 0.3816006600660066,
|
| 14426 |
+
"frac_reward_zero_std": 1.0,
|
| 14427 |
+
"grad_norm": 0.0,
|
| 14428 |
+
"kl": 1.2041775345802308,
|
| 14429 |
+
"learning_rate": 3.887085276025469e-06,
|
| 14430 |
+
"loss": 0.0,
|
| 14431 |
+
"num_tokens": 7968181.0,
|
| 14432 |
+
"reward": 4.099999904632568,
|
| 14433 |
+
"reward_std": 0.0,
|
| 14434 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 14435 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 14436 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 14437 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 14438 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 14439 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 14440 |
+
"step": 5550
|
| 14441 |
+
},
|
| 14442 |
+
{
|
| 14443 |
+
"completion_length": 16.7,
|
| 14444 |
+
"completions/clipped_ratio": 0.0,
|
| 14445 |
+
"completions/max_length": 16.4,
|
| 14446 |
+
"completions/max_terminated_length": 16.4,
|
| 14447 |
+
"completions/mean_length": 15.75,
|
| 14448 |
+
"completions/mean_terminated_length": 15.75,
|
| 14449 |
+
"completions/min_length": 14.8,
|
| 14450 |
+
"completions/min_terminated_length": 14.8,
|
| 14451 |
+
"epoch": 0.38228822882288227,
|
| 14452 |
+
"frac_reward_zero_std": 1.0,
|
| 14453 |
+
"grad_norm": 0.0,
|
| 14454 |
+
"kl": 1.1235090486705304,
|
| 14455 |
+
"learning_rate": 3.882089149710035e-06,
|
| 14456 |
+
"loss": 0.0,
|
| 14457 |
+
"num_tokens": 7984055.0,
|
| 14458 |
+
"reward": 4.099999904632568,
|
| 14459 |
+
"reward_std": 0.0,
|
| 14460 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 14461 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 14462 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 14463 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 14464 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 14465 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 14466 |
+
"step": 5560
|
| 14467 |
+
},
|
| 14468 |
+
{
|
| 14469 |
+
"completion_length": 18.725,
|
| 14470 |
+
"completions/clipped_ratio": 0.0,
|
| 14471 |
+
"completions/max_length": 18.7,
|
| 14472 |
+
"completions/max_terminated_length": 18.7,
|
| 14473 |
+
"completions/mean_length": 17.3,
|
| 14474 |
+
"completions/mean_terminated_length": 17.3,
|
| 14475 |
+
"completions/min_length": 16.3,
|
| 14476 |
+
"completions/min_terminated_length": 16.3,
|
| 14477 |
+
"epoch": 0.38297579757975797,
|
| 14478 |
+
"frac_reward_zero_std": 1.0,
|
| 14479 |
+
"grad_norm": 2.725888043642044e-05,
|
| 14480 |
+
"kl": 1.1920234143733979,
|
| 14481 |
+
"learning_rate": 3.877085061385694e-06,
|
| 14482 |
+
"loss": 0.0,
|
| 14483 |
+
"num_tokens": 7997675.0,
|
| 14484 |
+
"reward": 4.099999904632568,
|
| 14485 |
+
"reward_std": 0.0,
|
| 14486 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 14487 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 14488 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 14489 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 14490 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 14491 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 14492 |
+
"step": 5570
|
| 14493 |
+
},
|
| 14494 |
+
{
|
| 14495 |
+
"completion_length": 18.275,
|
| 14496 |
+
"completions/clipped_ratio": 0.0,
|
| 14497 |
+
"completions/max_length": 18.4,
|
| 14498 |
+
"completions/max_terminated_length": 18.4,
|
| 14499 |
+
"completions/mean_length": 16.6,
|
| 14500 |
+
"completions/mean_terminated_length": 16.6,
|
| 14501 |
+
"completions/min_length": 14.9,
|
| 14502 |
+
"completions/min_terminated_length": 14.9,
|
| 14503 |
+
"epoch": 0.38366336633663367,
|
| 14504 |
+
"frac_reward_zero_std": 1.0,
|
| 14505 |
+
"grad_norm": 0.0,
|
| 14506 |
+
"kl": 0.9494880434125662,
|
| 14507 |
+
"learning_rate": 3.872073039880254e-06,
|
| 14508 |
+
"loss": 0.0,
|
| 14509 |
+
"num_tokens": 8011851.0,
|
| 14510 |
+
"reward": 4.099999904632568,
|
| 14511 |
+
"reward_std": 0.0,
|
| 14512 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 14513 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 14514 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 14515 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 14516 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 14517 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 14518 |
+
"step": 5580
|
| 14519 |
+
},
|
| 14520 |
+
{
|
| 14521 |
+
"completion_length": 23.25,
|
| 14522 |
+
"completions/clipped_ratio": 0.0,
|
| 14523 |
+
"completions/max_length": 23.2,
|
| 14524 |
+
"completions/max_terminated_length": 23.2,
|
| 14525 |
+
"completions/mean_length": 19.575,
|
| 14526 |
+
"completions/mean_terminated_length": 19.575,
|
| 14527 |
+
"completions/min_length": 16.7,
|
| 14528 |
+
"completions/min_terminated_length": 16.7,
|
| 14529 |
+
"epoch": 0.38435093509350937,
|
| 14530 |
+
"frac_reward_zero_std": 1.0,
|
| 14531 |
+
"grad_norm": 0.0,
|
| 14532 |
+
"kl": 1.0359878040850163,
|
| 14533 |
+
"learning_rate": 3.8670531140672194e-06,
|
| 14534 |
+
"loss": 0.0,
|
| 14535 |
+
"num_tokens": 8024570.0,
|
| 14536 |
+
"reward": 4.099999904632568,
|
| 14537 |
+
"reward_std": 0.0,
|
| 14538 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 14539 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 14540 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 14541 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 14542 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 14543 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 14544 |
+
"step": 5590
|
| 14545 |
+
},
|
| 14546 |
+
{
|
| 14547 |
+
"completion_length": 18.3,
|
| 14548 |
+
"completions/clipped_ratio": 0.0,
|
| 14549 |
+
"completions/max_length": 18.3,
|
| 14550 |
+
"completions/max_terminated_length": 18.3,
|
| 14551 |
+
"completions/mean_length": 17.25,
|
| 14552 |
+
"completions/mean_terminated_length": 17.25,
|
| 14553 |
+
"completions/min_length": 16.2,
|
| 14554 |
+
"completions/min_terminated_length": 16.2,
|
| 14555 |
+
"epoch": 0.385038503850385,
|
| 14556 |
+
"frac_reward_zero_std": 1.0,
|
| 14557 |
+
"grad_norm": 0.0,
|
| 14558 |
+
"kl": 1.4615533858537675,
|
| 14559 |
+
"learning_rate": 3.862025312865633e-06,
|
| 14560 |
+
"loss": 0.0001,
|
| 14561 |
+
"num_tokens": 8039680.0,
|
| 14562 |
+
"reward": 4.099999904632568,
|
| 14563 |
+
"reward_std": 0.0,
|
| 14564 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 14565 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 14566 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 14567 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 14568 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 14569 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 14570 |
+
"step": 5600
|
| 14571 |
+
},
|
| 14572 |
+
{
|
| 14573 |
+
"completion_length": 17.725,
|
| 14574 |
+
"completions/clipped_ratio": 0.0,
|
| 14575 |
+
"completions/max_length": 17.6,
|
| 14576 |
+
"completions/max_terminated_length": 17.6,
|
| 14577 |
+
"completions/mean_length": 16.6,
|
| 14578 |
+
"completions/mean_terminated_length": 16.6,
|
| 14579 |
+
"completions/min_length": 16.0,
|
| 14580 |
+
"completions/min_terminated_length": 16.0,
|
| 14581 |
+
"epoch": 0.3857260726072607,
|
| 14582 |
+
"frac_reward_zero_std": 1.0,
|
| 14583 |
+
"grad_norm": 0.0,
|
| 14584 |
+
"kl": 1.1007069438695907,
|
| 14585 |
+
"learning_rate": 3.856989665239904e-06,
|
| 14586 |
+
"loss": 0.0,
|
| 14587 |
+
"num_tokens": 8054900.0,
|
| 14588 |
+
"reward": 4.099999904632568,
|
| 14589 |
+
"reward_std": 0.0,
|
| 14590 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 14591 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 14592 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 14593 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 14594 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 14595 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 14596 |
+
"step": 5610
|
| 14597 |
+
},
|
| 14598 |
+
{
|
| 14599 |
+
"completion_length": 19.275,
|
| 14600 |
+
"completions/clipped_ratio": 0.0,
|
| 14601 |
+
"completions/max_length": 19.4,
|
| 14602 |
+
"completions/max_terminated_length": 19.4,
|
| 14603 |
+
"completions/mean_length": 18.075,
|
| 14604 |
+
"completions/mean_terminated_length": 18.075,
|
| 14605 |
+
"completions/min_length": 16.2,
|
| 14606 |
+
"completions/min_terminated_length": 16.2,
|
| 14607 |
+
"epoch": 0.3864136413641364,
|
| 14608 |
+
"frac_reward_zero_std": 1.0,
|
| 14609 |
+
"grad_norm": 0.0,
|
| 14610 |
+
"kl": 1.1791205305606127,
|
| 14611 |
+
"learning_rate": 3.851946200199648e-06,
|
| 14612 |
+
"loss": 0.0,
|
| 14613 |
+
"num_tokens": 8070555.0,
|
| 14614 |
+
"reward": 4.099999904632568,
|
| 14615 |
+
"reward_std": 0.0,
|
| 14616 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 14617 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 14618 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 14619 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 14620 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 14621 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 14622 |
+
"step": 5620
|
| 14623 |
+
},
|
| 14624 |
+
{
|
| 14625 |
+
"completion_length": 19.1,
|
| 14626 |
+
"completions/clipped_ratio": 0.0,
|
| 14627 |
+
"completions/max_length": 19.3,
|
| 14628 |
+
"completions/max_terminated_length": 19.3,
|
| 14629 |
+
"completions/mean_length": 16.925,
|
| 14630 |
+
"completions/mean_terminated_length": 16.925,
|
| 14631 |
+
"completions/min_length": 15.3,
|
| 14632 |
+
"completions/min_terminated_length": 15.3,
|
| 14633 |
+
"epoch": 0.3871012101210121,
|
| 14634 |
+
"frac_reward_zero_std": 1.0,
|
| 14635 |
+
"grad_norm": 0.0,
|
| 14636 |
+
"kl": 1.1043142512440682,
|
| 14637 |
+
"learning_rate": 3.846894946799511e-06,
|
| 14638 |
+
"loss": 0.0,
|
| 14639 |
+
"num_tokens": 8083116.0,
|
| 14640 |
+
"reward": 4.099999904632568,
|
| 14641 |
+
"reward_std": 0.0,
|
| 14642 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 14643 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 14644 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 14645 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 14646 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 14647 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 14648 |
+
"step": 5630
|
| 14649 |
+
},
|
| 14650 |
+
{
|
| 14651 |
+
"completion_length": 17.65,
|
| 14652 |
+
"completions/clipped_ratio": 0.0,
|
| 14653 |
+
"completions/max_length": 17.4,
|
| 14654 |
+
"completions/max_terminated_length": 17.4,
|
| 14655 |
+
"completions/mean_length": 16.025,
|
| 14656 |
+
"completions/mean_terminated_length": 16.025,
|
| 14657 |
+
"completions/min_length": 15.3,
|
| 14658 |
+
"completions/min_terminated_length": 15.3,
|
| 14659 |
+
"epoch": 0.38778877887788776,
|
| 14660 |
+
"frac_reward_zero_std": 1.0,
|
| 14661 |
+
"grad_norm": 0.0,
|
| 14662 |
+
"kl": 1.4466410249471664,
|
| 14663 |
+
"learning_rate": 3.841835934139008e-06,
|
| 14664 |
+
"loss": 0.0,
|
| 14665 |
+
"num_tokens": 8097373.0,
|
| 14666 |
+
"reward": 4.099999904632568,
|
| 14667 |
+
"reward_std": 0.0,
|
| 14668 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 14669 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 14670 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 14671 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 14672 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 14673 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 14674 |
+
"step": 5640
|
| 14675 |
+
},
|
| 14676 |
+
{
|
| 14677 |
+
"completion_length": 17.175,
|
| 14678 |
+
"completions/clipped_ratio": 0.0,
|
| 14679 |
+
"completions/max_length": 17.1,
|
| 14680 |
+
"completions/max_terminated_length": 17.1,
|
| 14681 |
+
"completions/mean_length": 15.975,
|
| 14682 |
+
"completions/mean_terminated_length": 15.975,
|
| 14683 |
+
"completions/min_length": 15.2,
|
| 14684 |
+
"completions/min_terminated_length": 15.2,
|
| 14685 |
+
"epoch": 0.38847634763476346,
|
| 14686 |
+
"frac_reward_zero_std": 1.0,
|
| 14687 |
+
"grad_norm": 0.0,
|
| 14688 |
+
"kl": 1.3932079687714576,
|
| 14689 |
+
"learning_rate": 3.8367691913623565e-06,
|
| 14690 |
+
"loss": 0.0,
|
| 14691 |
+
"num_tokens": 8108796.0,
|
| 14692 |
+
"reward": 4.099999904632568,
|
| 14693 |
+
"reward_std": 0.0,
|
| 14694 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 14695 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 14696 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 14697 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 14698 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 14699 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 14700 |
+
"step": 5650
|
| 14701 |
+
},
|
| 14702 |
+
{
|
| 14703 |
+
"completion_length": 20.275,
|
| 14704 |
+
"completions/clipped_ratio": 0.0,
|
| 14705 |
+
"completions/max_length": 20.4,
|
| 14706 |
+
"completions/max_terminated_length": 20.4,
|
| 14707 |
+
"completions/mean_length": 17.525,
|
| 14708 |
+
"completions/mean_terminated_length": 17.525,
|
| 14709 |
+
"completions/min_length": 15.8,
|
| 14710 |
+
"completions/min_terminated_length": 15.8,
|
| 14711 |
+
"epoch": 0.38916391639163916,
|
| 14712 |
+
"frac_reward_zero_std": 1.0,
|
| 14713 |
+
"grad_norm": 0.0,
|
| 14714 |
+
"kl": 1.1165172673761845,
|
| 14715 |
+
"learning_rate": 3.831694747658301e-06,
|
| 14716 |
+
"loss": 0.0,
|
| 14717 |
+
"num_tokens": 8123245.0,
|
| 14718 |
+
"reward": 4.099999904632568,
|
| 14719 |
+
"reward_std": 0.0,
|
| 14720 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 14721 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 14722 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 14723 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 14724 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 14725 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 14726 |
+
"step": 5660
|
| 14727 |
+
},
|
| 14728 |
+
{
|
| 14729 |
+
"completion_length": 17.175,
|
| 14730 |
+
"completions/clipped_ratio": 0.0,
|
| 14731 |
+
"completions/max_length": 17.0,
|
| 14732 |
+
"completions/max_terminated_length": 17.0,
|
| 14733 |
+
"completions/mean_length": 16.2,
|
| 14734 |
+
"completions/mean_terminated_length": 16.2,
|
| 14735 |
+
"completions/min_length": 15.4,
|
| 14736 |
+
"completions/min_terminated_length": 15.4,
|
| 14737 |
+
"epoch": 0.38985148514851486,
|
| 14738 |
+
"frac_reward_zero_std": 1.0,
|
| 14739 |
+
"grad_norm": 0.0,
|
| 14740 |
+
"kl": 1.037246273458004,
|
| 14741 |
+
"learning_rate": 3.826612632259955e-06,
|
| 14742 |
+
"loss": 0.0,
|
| 14743 |
+
"num_tokens": 8137105.0,
|
| 14744 |
+
"reward": 4.099999904632568,
|
| 14745 |
+
"reward_std": 0.0,
|
| 14746 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 14747 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 14748 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 14749 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 14750 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 14751 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 14752 |
+
"step": 5670
|
| 14753 |
+
},
|
| 14754 |
+
{
|
| 14755 |
+
"completion_length": 19.9,
|
| 14756 |
+
"completions/clipped_ratio": 0.0,
|
| 14757 |
+
"completions/max_length": 19.9,
|
| 14758 |
+
"completions/max_terminated_length": 19.9,
|
| 14759 |
+
"completions/mean_length": 17.65,
|
| 14760 |
+
"completions/mean_terminated_length": 17.65,
|
| 14761 |
+
"completions/min_length": 15.8,
|
| 14762 |
+
"completions/min_terminated_length": 15.8,
|
| 14763 |
+
"epoch": 0.39053905390539057,
|
| 14764 |
+
"frac_reward_zero_std": 1.0,
|
| 14765 |
+
"grad_norm": 0.0,
|
| 14766 |
+
"kl": 1.185601119697094,
|
| 14767 |
+
"learning_rate": 3.821522874444626e-06,
|
| 14768 |
+
"loss": 0.0,
|
| 14769 |
+
"num_tokens": 8151835.0,
|
| 14770 |
+
"reward": 4.099999904632568,
|
| 14771 |
+
"reward_std": 0.0,
|
| 14772 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 14773 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 14774 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 14775 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 14776 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 14777 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 14778 |
+
"step": 5680
|
| 14779 |
+
},
|
| 14780 |
+
{
|
| 14781 |
+
"completion_length": 20.15,
|
| 14782 |
+
"completions/clipped_ratio": 0.0,
|
| 14783 |
+
"completions/max_length": 20.4,
|
| 14784 |
+
"completions/max_terminated_length": 20.4,
|
| 14785 |
+
"completions/mean_length": 18.2,
|
| 14786 |
+
"completions/mean_terminated_length": 18.2,
|
| 14787 |
+
"completions/min_length": 16.7,
|
| 14788 |
+
"completions/min_terminated_length": 16.7,
|
| 14789 |
+
"epoch": 0.3912266226622662,
|
| 14790 |
+
"frac_reward_zero_std": 1.0,
|
| 14791 |
+
"grad_norm": 0.0,
|
| 14792 |
+
"kl": 0.9930311039090156,
|
| 14793 |
+
"learning_rate": 3.8164255035336454e-06,
|
| 14794 |
+
"loss": 0.0,
|
| 14795 |
+
"num_tokens": 8165839.0,
|
| 14796 |
+
"reward": 4.099999904632568,
|
| 14797 |
+
"reward_std": 0.0,
|
| 14798 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 14799 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 14800 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 14801 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 14802 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 14803 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 14804 |
+
"step": 5690
|
| 14805 |
+
},
|
| 14806 |
+
{
|
| 14807 |
+
"completion_length": 18.85,
|
| 14808 |
+
"completions/clipped_ratio": 0.0,
|
| 14809 |
+
"completions/max_length": 18.6,
|
| 14810 |
+
"completions/max_terminated_length": 18.6,
|
| 14811 |
+
"completions/mean_length": 16.775,
|
| 14812 |
+
"completions/mean_terminated_length": 16.775,
|
| 14813 |
+
"completions/min_length": 15.5,
|
| 14814 |
+
"completions/min_terminated_length": 15.5,
|
| 14815 |
+
"epoch": 0.3919141914191419,
|
| 14816 |
+
"frac_reward_zero_std": 1.0,
|
| 14817 |
+
"grad_norm": 0.0,
|
| 14818 |
+
"kl": 1.4449263490736484,
|
| 14819 |
+
"learning_rate": 3.811320548892205e-06,
|
| 14820 |
+
"loss": 0.0001,
|
| 14821 |
+
"num_tokens": 8177630.0,
|
| 14822 |
+
"reward": 4.099999904632568,
|
| 14823 |
+
"reward_std": 0.0,
|
| 14824 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 14825 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 14826 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 14827 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 14828 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 14829 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 14830 |
+
"step": 5700
|
| 14831 |
}
|
| 14832 |
],
|
| 14833 |
"logging_steps": 10,
|
| 14834 |
"max_steps": 14544,
|
| 14835 |
+
"num_input_tokens_seen": 8177630,
|
| 14836 |
"num_train_epochs": 1,
|
| 14837 |
"save_steps": 50,
|
| 14838 |
"stateful_callbacks": {
|