Upload folder using huggingface_hub
Browse files- adapter_config.json +5 -5
- adapter_model.safetensors +1 -1
- optimizer.pt +1 -1
- rng_state.pth +1 -1
- scheduler.pt +1 -1
- trainer_state.json +1563 -3
- training_args.bin +1 -1
adapter_config.json
CHANGED
|
@@ -29,13 +29,13 @@
|
|
| 29 |
"rank_pattern": {},
|
| 30 |
"revision": null,
|
| 31 |
"target_modules": [
|
| 32 |
-
"
|
| 33 |
-
"up_proj",
|
| 34 |
-
"o_proj",
|
| 35 |
"q_proj",
|
| 36 |
-
"down_proj",
|
| 37 |
"k_proj",
|
| 38 |
-
"
|
|
|
|
|
|
|
|
|
|
| 39 |
],
|
| 40 |
"task_type": "CAUSAL_LM",
|
| 41 |
"trainable_token_indices": null,
|
|
|
|
| 29 |
"rank_pattern": {},
|
| 30 |
"revision": null,
|
| 31 |
"target_modules": [
|
| 32 |
+
"v_proj",
|
|
|
|
|
|
|
| 33 |
"q_proj",
|
|
|
|
| 34 |
"k_proj",
|
| 35 |
+
"down_proj",
|
| 36 |
+
"gate_proj",
|
| 37 |
+
"up_proj",
|
| 38 |
+
"o_proj"
|
| 39 |
],
|
| 40 |
"task_type": "CAUSAL_LM",
|
| 41 |
"trainable_token_indices": null,
|
adapter_model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 262406656
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f2738bd78dd9fdddc1b66df0f6ec4635109af536dbc363aafed6d322450beb8b
|
| 3 |
size 262406656
|
optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 122872331
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ba0292821427c451536eef21c12bb5f7bc0a97ab8f847350a83eaae9819255c5
|
| 3 |
size 122872331
|
rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14645
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9c0a5f633aef81c2c2385c3ac3006b6453cc048a5d296f67bd0c5df19b617956
|
| 3 |
size 14645
|
scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1465
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c0917f83675af402a7519163c507a3887460b43acf17e8357c7b8ced53c5a092
|
| 3 |
size 1465
|
trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -8458,11 +8458,1571 @@
|
|
| 8458 |
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 8459 |
"rewards/quality_reward_func/std": 0.0,
|
| 8460 |
"step": 3250
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8461 |
}
|
| 8462 |
],
|
| 8463 |
"logging_steps": 10,
|
| 8464 |
"max_steps": 14544,
|
| 8465 |
-
"num_input_tokens_seen":
|
| 8466 |
"num_train_epochs": 1,
|
| 8467 |
"save_steps": 50,
|
| 8468 |
"stateful_callbacks": {
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.2647139713971397,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 3850,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 8458 |
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 8459 |
"rewards/quality_reward_func/std": 0.0,
|
| 8460 |
"step": 3250
|
| 8461 |
+
},
|
| 8462 |
+
{
|
| 8463 |
+
"completion_length": 20.7,
|
| 8464 |
+
"completions/clipped_ratio": 0.0,
|
| 8465 |
+
"completions/max_length": 20.7,
|
| 8466 |
+
"completions/max_terminated_length": 20.7,
|
| 8467 |
+
"completions/mean_length": 18.425,
|
| 8468 |
+
"completions/mean_terminated_length": 18.425,
|
| 8469 |
+
"completions/min_length": 16.6,
|
| 8470 |
+
"completions/min_terminated_length": 16.6,
|
| 8471 |
+
"epoch": 0.22414741474147415,
|
| 8472 |
+
"frac_reward_zero_std": 1.0,
|
| 8473 |
+
"grad_norm": 0.0,
|
| 8474 |
+
"kl": 1.1709686018526555,
|
| 8475 |
+
"learning_rate": 4.769285944997953e-06,
|
| 8476 |
+
"loss": 0.0,
|
| 8477 |
+
"num_tokens": 4664606.0,
|
| 8478 |
+
"reward": 4.099999904632568,
|
| 8479 |
+
"reward_std": 0.0,
|
| 8480 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 8481 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 8482 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 8483 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 8484 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 8485 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 8486 |
+
"step": 3260
|
| 8487 |
+
},
|
| 8488 |
+
{
|
| 8489 |
+
"completion_length": 18.4,
|
| 8490 |
+
"completions/clipped_ratio": 0.0,
|
| 8491 |
+
"completions/max_length": 18.4,
|
| 8492 |
+
"completions/max_terminated_length": 18.4,
|
| 8493 |
+
"completions/mean_length": 17.45,
|
| 8494 |
+
"completions/mean_terminated_length": 17.45,
|
| 8495 |
+
"completions/min_length": 16.3,
|
| 8496 |
+
"completions/min_terminated_length": 16.3,
|
| 8497 |
+
"epoch": 0.22483498349834982,
|
| 8498 |
+
"frac_reward_zero_std": 1.0,
|
| 8499 |
+
"grad_norm": 0.0,
|
| 8500 |
+
"kl": 1.3686413869261742,
|
| 8501 |
+
"learning_rate": 4.766761692749586e-06,
|
| 8502 |
+
"loss": 0.0,
|
| 8503 |
+
"num_tokens": 4679528.0,
|
| 8504 |
+
"reward": 4.099999904632568,
|
| 8505 |
+
"reward_std": 0.0,
|
| 8506 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 8507 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 8508 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 8509 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 8510 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 8511 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 8512 |
+
"step": 3270
|
| 8513 |
+
},
|
| 8514 |
+
{
|
| 8515 |
+
"completion_length": 20.3,
|
| 8516 |
+
"completions/clipped_ratio": 0.0,
|
| 8517 |
+
"completions/max_length": 20.3,
|
| 8518 |
+
"completions/max_terminated_length": 20.3,
|
| 8519 |
+
"completions/mean_length": 17.275,
|
| 8520 |
+
"completions/mean_terminated_length": 17.275,
|
| 8521 |
+
"completions/min_length": 15.4,
|
| 8522 |
+
"completions/min_terminated_length": 15.4,
|
| 8523 |
+
"epoch": 0.22552255225522552,
|
| 8524 |
+
"frac_reward_zero_std": 1.0,
|
| 8525 |
+
"grad_norm": 0.0,
|
| 8526 |
+
"kl": 0.9682805396616458,
|
| 8527 |
+
"learning_rate": 4.764224382026094e-06,
|
| 8528 |
+
"loss": 0.0,
|
| 8529 |
+
"num_tokens": 4692875.0,
|
| 8530 |
+
"reward": 4.099999904632568,
|
| 8531 |
+
"reward_std": 0.0,
|
| 8532 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 8533 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 8534 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 8535 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 8536 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 8537 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 8538 |
+
"step": 3280
|
| 8539 |
+
},
|
| 8540 |
+
{
|
| 8541 |
+
"completion_length": 21.0,
|
| 8542 |
+
"completions/clipped_ratio": 0.0,
|
| 8543 |
+
"completions/max_length": 21.0,
|
| 8544 |
+
"completions/max_terminated_length": 21.0,
|
| 8545 |
+
"completions/mean_length": 18.825,
|
| 8546 |
+
"completions/mean_terminated_length": 18.825,
|
| 8547 |
+
"completions/min_length": 17.0,
|
| 8548 |
+
"completions/min_terminated_length": 17.0,
|
| 8549 |
+
"epoch": 0.22621012101210122,
|
| 8550 |
+
"frac_reward_zero_std": 1.0,
|
| 8551 |
+
"grad_norm": 0.0,
|
| 8552 |
+
"kl": 1.0991791110485791,
|
| 8553 |
+
"learning_rate": 4.761674027444544e-06,
|
| 8554 |
+
"loss": 0.0,
|
| 8555 |
+
"num_tokens": 4708156.0,
|
| 8556 |
+
"reward": 4.099999904632568,
|
| 8557 |
+
"reward_std": 0.0,
|
| 8558 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 8559 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 8560 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 8561 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 8562 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 8563 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 8564 |
+
"step": 3290
|
| 8565 |
+
},
|
| 8566 |
+
{
|
| 8567 |
+
"completion_length": 20.2,
|
| 8568 |
+
"completions/clipped_ratio": 0.0,
|
| 8569 |
+
"completions/max_length": 20.2,
|
| 8570 |
+
"completions/max_terminated_length": 20.2,
|
| 8571 |
+
"completions/mean_length": 18.65,
|
| 8572 |
+
"completions/mean_terminated_length": 18.65,
|
| 8573 |
+
"completions/min_length": 17.0,
|
| 8574 |
+
"completions/min_terminated_length": 17.0,
|
| 8575 |
+
"epoch": 0.2268976897689769,
|
| 8576 |
+
"frac_reward_zero_std": 1.0,
|
| 8577 |
+
"grad_norm": 0.0,
|
| 8578 |
+
"kl": 1.0455322712659836,
|
| 8579 |
+
"learning_rate": 4.759110643697146e-06,
|
| 8580 |
+
"loss": 0.0,
|
| 8581 |
+
"num_tokens": 4722014.0,
|
| 8582 |
+
"reward": 4.099999904632568,
|
| 8583 |
+
"reward_std": 0.0,
|
| 8584 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 8585 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 8586 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 8587 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 8588 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 8589 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 8590 |
+
"step": 3300
|
| 8591 |
+
},
|
| 8592 |
+
{
|
| 8593 |
+
"completion_length": 20.0,
|
| 8594 |
+
"completions/clipped_ratio": 0.0,
|
| 8595 |
+
"completions/max_length": 20.0,
|
| 8596 |
+
"completions/max_terminated_length": 20.0,
|
| 8597 |
+
"completions/mean_length": 17.925,
|
| 8598 |
+
"completions/mean_terminated_length": 17.925,
|
| 8599 |
+
"completions/min_length": 16.2,
|
| 8600 |
+
"completions/min_terminated_length": 16.2,
|
| 8601 |
+
"epoch": 0.2275852585258526,
|
| 8602 |
+
"frac_reward_zero_std": 1.0,
|
| 8603 |
+
"grad_norm": 0.0,
|
| 8604 |
+
"kl": 1.2300671976059676,
|
| 8605 |
+
"learning_rate": 4.756534245551172e-06,
|
| 8606 |
+
"loss": 0.0,
|
| 8607 |
+
"num_tokens": 4735443.0,
|
| 8608 |
+
"reward": 4.099999904632568,
|
| 8609 |
+
"reward_std": 0.0,
|
| 8610 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 8611 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 8612 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 8613 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 8614 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 8615 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 8616 |
+
"step": 3310
|
| 8617 |
+
},
|
| 8618 |
+
{
|
| 8619 |
+
"completion_length": 16.4,
|
| 8620 |
+
"completions/clipped_ratio": 0.0,
|
| 8621 |
+
"completions/max_length": 16.4,
|
| 8622 |
+
"completions/max_terminated_length": 16.4,
|
| 8623 |
+
"completions/mean_length": 15.575,
|
| 8624 |
+
"completions/mean_terminated_length": 15.575,
|
| 8625 |
+
"completions/min_length": 14.8,
|
| 8626 |
+
"completions/min_terminated_length": 14.8,
|
| 8627 |
+
"epoch": 0.22827282728272827,
|
| 8628 |
+
"frac_reward_zero_std": 1.0,
|
| 8629 |
+
"grad_norm": 0.0,
|
| 8630 |
+
"kl": 1.2536677211523055,
|
| 8631 |
+
"learning_rate": 4.753944847848867e-06,
|
| 8632 |
+
"loss": 0.0,
|
| 8633 |
+
"num_tokens": 4748098.0,
|
| 8634 |
+
"reward": 4.099999904632568,
|
| 8635 |
+
"reward_std": 0.0,
|
| 8636 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 8637 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 8638 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 8639 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 8640 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 8641 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 8642 |
+
"step": 3320
|
| 8643 |
+
},
|
| 8644 |
+
{
|
| 8645 |
+
"completion_length": 20.3,
|
| 8646 |
+
"completions/clipped_ratio": 0.0,
|
| 8647 |
+
"completions/max_length": 20.3,
|
| 8648 |
+
"completions/max_terminated_length": 20.3,
|
| 8649 |
+
"completions/mean_length": 18.8,
|
| 8650 |
+
"completions/mean_terminated_length": 18.8,
|
| 8651 |
+
"completions/min_length": 17.7,
|
| 8652 |
+
"completions/min_terminated_length": 17.7,
|
| 8653 |
+
"epoch": 0.22896039603960397,
|
| 8654 |
+
"frac_reward_zero_std": 1.0,
|
| 8655 |
+
"grad_norm": 0.0,
|
| 8656 |
+
"kl": 1.3008077703416348,
|
| 8657 |
+
"learning_rate": 4.751342465507362e-06,
|
| 8658 |
+
"loss": 0.0,
|
| 8659 |
+
"num_tokens": 4761274.0,
|
| 8660 |
+
"reward": 4.099999904632568,
|
| 8661 |
+
"reward_std": 0.0,
|
| 8662 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 8663 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 8664 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 8665 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 8666 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 8667 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 8668 |
+
"step": 3330
|
| 8669 |
+
},
|
| 8670 |
+
{
|
| 8671 |
+
"completion_length": 22.3,
|
| 8672 |
+
"completions/clipped_ratio": 0.0,
|
| 8673 |
+
"completions/max_length": 22.3,
|
| 8674 |
+
"completions/max_terminated_length": 22.3,
|
| 8675 |
+
"completions/mean_length": 19.325,
|
| 8676 |
+
"completions/mean_terminated_length": 19.325,
|
| 8677 |
+
"completions/min_length": 16.7,
|
| 8678 |
+
"completions/min_terminated_length": 16.7,
|
| 8679 |
+
"epoch": 0.22964796479647964,
|
| 8680 |
+
"frac_reward_zero_std": 1.0,
|
| 8681 |
+
"grad_norm": 0.0,
|
| 8682 |
+
"kl": 1.0198756888508798,
|
| 8683 |
+
"learning_rate": 4.748727113518594e-06,
|
| 8684 |
+
"loss": 0.0,
|
| 8685 |
+
"num_tokens": 4773463.0,
|
| 8686 |
+
"reward": 4.099999904632568,
|
| 8687 |
+
"reward_std": 0.0,
|
| 8688 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 8689 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 8690 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 8691 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 8692 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 8693 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 8694 |
+
"step": 3340
|
| 8695 |
+
},
|
| 8696 |
+
{
|
| 8697 |
+
"completion_length": 18.2,
|
| 8698 |
+
"completions/clipped_ratio": 0.0,
|
| 8699 |
+
"completions/max_length": 18.2,
|
| 8700 |
+
"completions/max_terminated_length": 18.2,
|
| 8701 |
+
"completions/mean_length": 17.35,
|
| 8702 |
+
"completions/mean_terminated_length": 17.35,
|
| 8703 |
+
"completions/min_length": 16.5,
|
| 8704 |
+
"completions/min_terminated_length": 16.5,
|
| 8705 |
+
"epoch": 0.23033553355335534,
|
| 8706 |
+
"frac_reward_zero_std": 1.0,
|
| 8707 |
+
"grad_norm": 0.0,
|
| 8708 |
+
"kl": 1.0109242379665375,
|
| 8709 |
+
"learning_rate": 4.746098806949213e-06,
|
| 8710 |
+
"loss": 0.0,
|
| 8711 |
+
"num_tokens": 4787017.0,
|
| 8712 |
+
"reward": 4.099999904632568,
|
| 8713 |
+
"reward_std": 0.0,
|
| 8714 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 8715 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 8716 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 8717 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 8718 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 8719 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 8720 |
+
"step": 3350
|
| 8721 |
+
},
|
| 8722 |
+
{
|
| 8723 |
+
"completion_length": 17.8,
|
| 8724 |
+
"completions/clipped_ratio": 0.0,
|
| 8725 |
+
"completions/max_length": 17.8,
|
| 8726 |
+
"completions/max_terminated_length": 17.8,
|
| 8727 |
+
"completions/mean_length": 17.025,
|
| 8728 |
+
"completions/mean_terminated_length": 17.025,
|
| 8729 |
+
"completions/min_length": 15.8,
|
| 8730 |
+
"completions/min_terminated_length": 15.8,
|
| 8731 |
+
"epoch": 0.23102310231023102,
|
| 8732 |
+
"frac_reward_zero_std": 1.0,
|
| 8733 |
+
"grad_norm": 0.0,
|
| 8734 |
+
"kl": 1.1944296956062317,
|
| 8735 |
+
"learning_rate": 4.743457560940503e-06,
|
| 8736 |
+
"loss": 0.0,
|
| 8737 |
+
"num_tokens": 4800622.0,
|
| 8738 |
+
"reward": 4.099999904632568,
|
| 8739 |
+
"reward_std": 0.0,
|
| 8740 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 8741 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 8742 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 8743 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 8744 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 8745 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 8746 |
+
"step": 3360
|
| 8747 |
+
},
|
| 8748 |
+
{
|
| 8749 |
+
"completion_length": 21.3,
|
| 8750 |
+
"completions/clipped_ratio": 0.0,
|
| 8751 |
+
"completions/max_length": 21.3,
|
| 8752 |
+
"completions/max_terminated_length": 21.3,
|
| 8753 |
+
"completions/mean_length": 17.55,
|
| 8754 |
+
"completions/mean_terminated_length": 17.55,
|
| 8755 |
+
"completions/min_length": 15.2,
|
| 8756 |
+
"completions/min_terminated_length": 15.2,
|
| 8757 |
+
"epoch": 0.23171067106710672,
|
| 8758 |
+
"frac_reward_zero_std": 1.0,
|
| 8759 |
+
"grad_norm": 0.0,
|
| 8760 |
+
"kl": 1.1894298686645925,
|
| 8761 |
+
"learning_rate": 4.740803390708284e-06,
|
| 8762 |
+
"loss": 0.0,
|
| 8763 |
+
"num_tokens": 4815392.0,
|
| 8764 |
+
"reward": 4.099999904632568,
|
| 8765 |
+
"reward_std": 0.0,
|
| 8766 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 8767 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 8768 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 8769 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 8770 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 8771 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 8772 |
+
"step": 3370
|
| 8773 |
+
},
|
| 8774 |
+
{
|
| 8775 |
+
"completion_length": 17.5,
|
| 8776 |
+
"completions/clipped_ratio": 0.0,
|
| 8777 |
+
"completions/max_length": 17.5,
|
| 8778 |
+
"completions/max_terminated_length": 17.5,
|
| 8779 |
+
"completions/mean_length": 16.4,
|
| 8780 |
+
"completions/mean_terminated_length": 16.4,
|
| 8781 |
+
"completions/min_length": 15.3,
|
| 8782 |
+
"completions/min_terminated_length": 15.3,
|
| 8783 |
+
"epoch": 0.2323982398239824,
|
| 8784 |
+
"frac_reward_zero_std": 1.0,
|
| 8785 |
+
"grad_norm": 0.0,
|
| 8786 |
+
"kl": 1.3694863229990006,
|
| 8787 |
+
"learning_rate": 4.738136311542836e-06,
|
| 8788 |
+
"loss": 0.0,
|
| 8789 |
+
"num_tokens": 4831268.0,
|
| 8790 |
+
"reward": 4.099999904632568,
|
| 8791 |
+
"reward_std": 0.0,
|
| 8792 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 8793 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 8794 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 8795 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 8796 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 8797 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 8798 |
+
"step": 3380
|
| 8799 |
+
},
|
| 8800 |
+
{
|
| 8801 |
+
"completion_length": 19.6,
|
| 8802 |
+
"completions/clipped_ratio": 0.0,
|
| 8803 |
+
"completions/max_length": 19.6,
|
| 8804 |
+
"completions/max_terminated_length": 19.6,
|
| 8805 |
+
"completions/mean_length": 18.425,
|
| 8806 |
+
"completions/mean_terminated_length": 18.425,
|
| 8807 |
+
"completions/min_length": 16.7,
|
| 8808 |
+
"completions/min_terminated_length": 16.7,
|
| 8809 |
+
"epoch": 0.2330858085808581,
|
| 8810 |
+
"frac_reward_zero_std": 1.0,
|
| 8811 |
+
"grad_norm": 0.0,
|
| 8812 |
+
"kl": 1.2427097693085671,
|
| 8813 |
+
"learning_rate": 4.7354563388088026e-06,
|
| 8814 |
+
"loss": 0.0,
|
| 8815 |
+
"num_tokens": 4846697.0,
|
| 8816 |
+
"reward": 4.099999904632568,
|
| 8817 |
+
"reward_std": 0.0,
|
| 8818 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 8819 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 8820 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 8821 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 8822 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 8823 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 8824 |
+
"step": 3390
|
| 8825 |
+
},
|
| 8826 |
+
{
|
| 8827 |
+
"completion_length": 21.6,
|
| 8828 |
+
"completions/clipped_ratio": 0.0,
|
| 8829 |
+
"completions/max_length": 21.6,
|
| 8830 |
+
"completions/max_terminated_length": 21.6,
|
| 8831 |
+
"completions/mean_length": 19.075,
|
| 8832 |
+
"completions/mean_terminated_length": 19.075,
|
| 8833 |
+
"completions/min_length": 16.2,
|
| 8834 |
+
"completions/min_terminated_length": 16.2,
|
| 8835 |
+
"epoch": 0.23377337733773376,
|
| 8836 |
+
"frac_reward_zero_std": 1.0,
|
| 8837 |
+
"grad_norm": 0.0,
|
| 8838 |
+
"kl": 1.0708073504269122,
|
| 8839 |
+
"learning_rate": 4.732763487945106e-06,
|
| 8840 |
+
"loss": 0.0,
|
| 8841 |
+
"num_tokens": 4861028.0,
|
| 8842 |
+
"reward": 4.099999904632568,
|
| 8843 |
+
"reward_std": 0.0,
|
| 8844 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 8845 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 8846 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 8847 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 8848 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 8849 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 8850 |
+
"step": 3400
|
| 8851 |
+
},
|
| 8852 |
+
{
|
| 8853 |
+
"completion_length": 29.7,
|
| 8854 |
+
"completions/clipped_ratio": 0.0,
|
| 8855 |
+
"completions/max_length": 29.7,
|
| 8856 |
+
"completions/max_terminated_length": 29.7,
|
| 8857 |
+
"completions/mean_length": 19.5,
|
| 8858 |
+
"completions/mean_terminated_length": 19.5,
|
| 8859 |
+
"completions/min_length": 15.5,
|
| 8860 |
+
"completions/min_terminated_length": 15.5,
|
| 8861 |
+
"epoch": 0.23446094609460946,
|
| 8862 |
+
"frac_reward_zero_std": 0.9,
|
| 8863 |
+
"grad_norm": 0.3156428635120392,
|
| 8864 |
+
"kl": 0.9965578641742467,
|
| 8865 |
+
"learning_rate": 4.730057774464856e-06,
|
| 8866 |
+
"loss": 0.0,
|
| 8867 |
+
"num_tokens": 4877352.0,
|
| 8868 |
+
"reward": 4.092499876022339,
|
| 8869 |
+
"reward_std": 0.015000002086162567,
|
| 8870 |
+
"rewards/coherence_reward_func/mean": 1.2924999475479126,
|
| 8871 |
+
"rewards/coherence_reward_func/std": 0.01499999761581421,
|
| 8872 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 8873 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 8874 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 8875 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 8876 |
+
"step": 3410
|
| 8877 |
+
},
|
| 8878 |
+
{
|
| 8879 |
+
"completion_length": 23.0,
|
| 8880 |
+
"completions/clipped_ratio": 0.0,
|
| 8881 |
+
"completions/max_length": 23.0,
|
| 8882 |
+
"completions/max_terminated_length": 23.0,
|
| 8883 |
+
"completions/mean_length": 18.875,
|
| 8884 |
+
"completions/mean_terminated_length": 18.875,
|
| 8885 |
+
"completions/min_length": 15.7,
|
| 8886 |
+
"completions/min_terminated_length": 15.7,
|
| 8887 |
+
"epoch": 0.23514851485148514,
|
| 8888 |
+
"frac_reward_zero_std": 1.0,
|
| 8889 |
+
"grad_norm": 0.0,
|
| 8890 |
+
"kl": 1.0397824190557003,
|
| 8891 |
+
"learning_rate": 4.727339213955265e-06,
|
| 8892 |
+
"loss": 0.0,
|
| 8893 |
+
"num_tokens": 4889631.0,
|
| 8894 |
+
"reward": 4.099999904632568,
|
| 8895 |
+
"reward_std": 0.0,
|
| 8896 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 8897 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 8898 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 8899 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 8900 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 8901 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 8902 |
+
"step": 3420
|
| 8903 |
+
},
|
| 8904 |
+
{
|
| 8905 |
+
"completion_length": 21.8,
|
| 8906 |
+
"completions/clipped_ratio": 0.0,
|
| 8907 |
+
"completions/max_length": 21.8,
|
| 8908 |
+
"completions/max_terminated_length": 21.8,
|
| 8909 |
+
"completions/mean_length": 20.225,
|
| 8910 |
+
"completions/mean_terminated_length": 20.225,
|
| 8911 |
+
"completions/min_length": 18.4,
|
| 8912 |
+
"completions/min_terminated_length": 18.4,
|
| 8913 |
+
"epoch": 0.23583608360836084,
|
| 8914 |
+
"frac_reward_zero_std": 1.0,
|
| 8915 |
+
"grad_norm": 0.0,
|
| 8916 |
+
"kl": 1.215433156117797,
|
| 8917 |
+
"learning_rate": 4.724607822077554e-06,
|
| 8918 |
+
"loss": 0.0,
|
| 8919 |
+
"num_tokens": 4902888.0,
|
| 8920 |
+
"reward": 4.099999904632568,
|
| 8921 |
+
"reward_std": 0.0,
|
| 8922 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 8923 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 8924 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 8925 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 8926 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 8927 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 8928 |
+
"step": 3430
|
| 8929 |
+
},
|
| 8930 |
+
{
|
| 8931 |
+
"completion_length": 20.7,
|
| 8932 |
+
"completions/clipped_ratio": 0.0,
|
| 8933 |
+
"completions/max_length": 20.7,
|
| 8934 |
+
"completions/max_terminated_length": 20.7,
|
| 8935 |
+
"completions/mean_length": 19.05,
|
| 8936 |
+
"completions/mean_terminated_length": 19.05,
|
| 8937 |
+
"completions/min_length": 17.2,
|
| 8938 |
+
"completions/min_terminated_length": 17.2,
|
| 8939 |
+
"epoch": 0.23652365236523654,
|
| 8940 |
+
"frac_reward_zero_std": 1.0,
|
| 8941 |
+
"grad_norm": 0.0,
|
| 8942 |
+
"kl": 1.1691066682338715,
|
| 8943 |
+
"learning_rate": 4.7218636145668615e-06,
|
| 8944 |
+
"loss": 0.0,
|
| 8945 |
+
"num_tokens": 4916974.0,
|
| 8946 |
+
"reward": 4.099999904632568,
|
| 8947 |
+
"reward_std": 0.0,
|
| 8948 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 8949 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 8950 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 8951 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 8952 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 8953 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 8954 |
+
"step": 3440
|
| 8955 |
+
},
|
| 8956 |
+
{
|
| 8957 |
+
"completion_length": 19.1,
|
| 8958 |
+
"completions/clipped_ratio": 0.0,
|
| 8959 |
+
"completions/max_length": 19.1,
|
| 8960 |
+
"completions/max_terminated_length": 19.1,
|
| 8961 |
+
"completions/mean_length": 16.2,
|
| 8962 |
+
"completions/mean_terminated_length": 16.2,
|
| 8963 |
+
"completions/min_length": 14.4,
|
| 8964 |
+
"completions/min_terminated_length": 14.4,
|
| 8965 |
+
"epoch": 0.2372112211221122,
|
| 8966 |
+
"frac_reward_zero_std": 1.0,
|
| 8967 |
+
"grad_norm": 0.0,
|
| 8968 |
+
"kl": 1.1441729221493007,
|
| 8969 |
+
"learning_rate": 4.7191066072321575e-06,
|
| 8970 |
+
"loss": 0.0,
|
| 8971 |
+
"num_tokens": 4933274.0,
|
| 8972 |
+
"reward": 4.099999904632568,
|
| 8973 |
+
"reward_std": 0.0,
|
| 8974 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 8975 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 8976 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 8977 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 8978 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 8979 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 8980 |
+
"step": 3450
|
| 8981 |
+
},
|
| 8982 |
+
{
|
| 8983 |
+
"completion_length": 22.4,
|
| 8984 |
+
"completions/clipped_ratio": 0.0,
|
| 8985 |
+
"completions/max_length": 22.4,
|
| 8986 |
+
"completions/max_terminated_length": 22.4,
|
| 8987 |
+
"completions/mean_length": 19.525,
|
| 8988 |
+
"completions/mean_terminated_length": 19.525,
|
| 8989 |
+
"completions/min_length": 16.6,
|
| 8990 |
+
"completions/min_terminated_length": 16.6,
|
| 8991 |
+
"epoch": 0.2378987898789879,
|
| 8992 |
+
"frac_reward_zero_std": 1.0,
|
| 8993 |
+
"grad_norm": 0.0,
|
| 8994 |
+
"kl": 1.0465805977582932,
|
| 8995 |
+
"learning_rate": 4.716336815956148e-06,
|
| 8996 |
+
"loss": 0.0,
|
| 8997 |
+
"num_tokens": 4946543.0,
|
| 8998 |
+
"reward": 4.099999904632568,
|
| 8999 |
+
"reward_std": 0.0,
|
| 9000 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 9001 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 9002 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 9003 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 9004 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 9005 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 9006 |
+
"step": 3460
|
| 9007 |
+
},
|
| 9008 |
+
{
|
| 9009 |
+
"completion_length": 21.6,
|
| 9010 |
+
"completions/clipped_ratio": 0.0,
|
| 9011 |
+
"completions/max_length": 21.6,
|
| 9012 |
+
"completions/max_terminated_length": 21.6,
|
| 9013 |
+
"completions/mean_length": 19.35,
|
| 9014 |
+
"completions/mean_terminated_length": 19.35,
|
| 9015 |
+
"completions/min_length": 18.0,
|
| 9016 |
+
"completions/min_terminated_length": 18.0,
|
| 9017 |
+
"epoch": 0.23858635863586358,
|
| 9018 |
+
"frac_reward_zero_std": 1.0,
|
| 9019 |
+
"grad_norm": 0.0,
|
| 9020 |
+
"kl": 1.2777705937623978,
|
| 9021 |
+
"learning_rate": 4.713554256695188e-06,
|
| 9022 |
+
"loss": 0.0001,
|
| 9023 |
+
"num_tokens": 4959301.0,
|
| 9024 |
+
"reward": 4.099999904632568,
|
| 9025 |
+
"reward_std": 0.0,
|
| 9026 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 9027 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 9028 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 9029 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 9030 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 9031 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 9032 |
+
"step": 3470
|
| 9033 |
+
},
|
| 9034 |
+
{
|
| 9035 |
+
"completion_length": 20.9,
|
| 9036 |
+
"completions/clipped_ratio": 0.0,
|
| 9037 |
+
"completions/max_length": 20.9,
|
| 9038 |
+
"completions/max_terminated_length": 20.9,
|
| 9039 |
+
"completions/mean_length": 20.0,
|
| 9040 |
+
"completions/mean_terminated_length": 20.0,
|
| 9041 |
+
"completions/min_length": 19.0,
|
| 9042 |
+
"completions/min_terminated_length": 19.0,
|
| 9043 |
+
"epoch": 0.23927392739273928,
|
| 9044 |
+
"frac_reward_zero_std": 1.0,
|
| 9045 |
+
"grad_norm": 0.0,
|
| 9046 |
+
"kl": 1.1782601185142993,
|
| 9047 |
+
"learning_rate": 4.710758945479184e-06,
|
| 9048 |
+
"loss": 0.0,
|
| 9049 |
+
"num_tokens": 4973385.0,
|
| 9050 |
+
"reward": 4.099999904632568,
|
| 9051 |
+
"reward_std": 0.0,
|
| 9052 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 9053 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 9054 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 9055 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 9056 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 9057 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 9058 |
+
"step": 3480
|
| 9059 |
+
},
|
| 9060 |
+
{
|
| 9061 |
+
"completion_length": 21.4,
|
| 9062 |
+
"completions/clipped_ratio": 0.0,
|
| 9063 |
+
"completions/max_length": 21.4,
|
| 9064 |
+
"completions/max_terminated_length": 21.4,
|
| 9065 |
+
"completions/mean_length": 18.025,
|
| 9066 |
+
"completions/mean_terminated_length": 18.025,
|
| 9067 |
+
"completions/min_length": 16.3,
|
| 9068 |
+
"completions/min_terminated_length": 16.3,
|
| 9069 |
+
"epoch": 0.23996149614961496,
|
| 9070 |
+
"frac_reward_zero_std": 1.0,
|
| 9071 |
+
"grad_norm": 0.0,
|
| 9072 |
+
"kl": 1.0537080638110639,
|
| 9073 |
+
"learning_rate": 4.7079508984115064e-06,
|
| 9074 |
+
"loss": 0.0,
|
| 9075 |
+
"num_tokens": 4986858.0,
|
| 9076 |
+
"reward": 4.099999904632568,
|
| 9077 |
+
"reward_std": 0.0,
|
| 9078 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 9079 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 9080 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 9081 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 9082 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 9083 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 9084 |
+
"step": 3490
|
| 9085 |
+
},
|
| 9086 |
+
{
|
| 9087 |
+
"completion_length": 18.2,
|
| 9088 |
+
"completions/clipped_ratio": 0.0,
|
| 9089 |
+
"completions/max_length": 18.2,
|
| 9090 |
+
"completions/max_terminated_length": 18.2,
|
| 9091 |
+
"completions/mean_length": 16.15,
|
| 9092 |
+
"completions/mean_terminated_length": 16.15,
|
| 9093 |
+
"completions/min_length": 15.0,
|
| 9094 |
+
"completions/min_terminated_length": 15.0,
|
| 9095 |
+
"epoch": 0.24064906490649066,
|
| 9096 |
+
"frac_reward_zero_std": 1.0,
|
| 9097 |
+
"grad_norm": 0.0,
|
| 9098 |
+
"kl": 1.129155667871237,
|
| 9099 |
+
"learning_rate": 4.705130131668894e-06,
|
| 9100 |
+
"loss": 0.0,
|
| 9101 |
+
"num_tokens": 5003140.0,
|
| 9102 |
+
"reward": 4.099999904632568,
|
| 9103 |
+
"reward_std": 0.0,
|
| 9104 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 9105 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 9106 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 9107 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 9108 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 9109 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 9110 |
+
"step": 3500
|
| 9111 |
+
},
|
| 9112 |
+
{
|
| 9113 |
+
"completion_length": 22.9,
|
| 9114 |
+
"completions/clipped_ratio": 0.0,
|
| 9115 |
+
"completions/max_length": 22.9,
|
| 9116 |
+
"completions/max_terminated_length": 22.9,
|
| 9117 |
+
"completions/mean_length": 19.525,
|
| 9118 |
+
"completions/mean_terminated_length": 19.525,
|
| 9119 |
+
"completions/min_length": 16.7,
|
| 9120 |
+
"completions/min_terminated_length": 16.7,
|
| 9121 |
+
"epoch": 0.24133663366336633,
|
| 9122 |
+
"frac_reward_zero_std": 1.0,
|
| 9123 |
+
"grad_norm": 0.0,
|
| 9124 |
+
"kl": 1.2283895801752807,
|
| 9125 |
+
"learning_rate": 4.702296661501362e-06,
|
| 9126 |
+
"loss": 0.0001,
|
| 9127 |
+
"num_tokens": 5018057.0,
|
| 9128 |
+
"reward": 4.099999904632568,
|
| 9129 |
+
"reward_std": 0.0,
|
| 9130 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 9131 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 9132 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 9133 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 9134 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 9135 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 9136 |
+
"step": 3510
|
| 9137 |
+
},
|
| 9138 |
+
{
|
| 9139 |
+
"completion_length": 19.5,
|
| 9140 |
+
"completions/clipped_ratio": 0.0,
|
| 9141 |
+
"completions/max_length": 19.5,
|
| 9142 |
+
"completions/max_terminated_length": 19.5,
|
| 9143 |
+
"completions/mean_length": 18.275,
|
| 9144 |
+
"completions/mean_terminated_length": 18.275,
|
| 9145 |
+
"completions/min_length": 17.1,
|
| 9146 |
+
"completions/min_terminated_length": 17.1,
|
| 9147 |
+
"epoch": 0.24202420242024203,
|
| 9148 |
+
"frac_reward_zero_std": 1.0,
|
| 9149 |
+
"grad_norm": 0.0,
|
| 9150 |
+
"kl": 1.1926358938217163,
|
| 9151 |
+
"learning_rate": 4.6994505042321096e-06,
|
| 9152 |
+
"loss": 0.0,
|
| 9153 |
+
"num_tokens": 5031064.0,
|
| 9154 |
+
"reward": 4.099999904632568,
|
| 9155 |
+
"reward_std": 0.0,
|
| 9156 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 9157 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 9158 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 9159 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 9160 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 9161 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 9162 |
+
"step": 3520
|
| 9163 |
+
},
|
| 9164 |
+
{
|
| 9165 |
+
"completion_length": 17.8,
|
| 9166 |
+
"completions/clipped_ratio": 0.0,
|
| 9167 |
+
"completions/max_length": 17.8,
|
| 9168 |
+
"completions/max_terminated_length": 17.8,
|
| 9169 |
+
"completions/mean_length": 16.6,
|
| 9170 |
+
"completions/mean_terminated_length": 16.6,
|
| 9171 |
+
"completions/min_length": 15.9,
|
| 9172 |
+
"completions/min_terminated_length": 15.9,
|
| 9173 |
+
"epoch": 0.2427117711771177,
|
| 9174 |
+
"frac_reward_zero_std": 1.0,
|
| 9175 |
+
"grad_norm": 0.0,
|
| 9176 |
+
"kl": 1.5111532375216483,
|
| 9177 |
+
"learning_rate": 4.696591676257422e-06,
|
| 9178 |
+
"loss": 0.0,
|
| 9179 |
+
"num_tokens": 5044100.0,
|
| 9180 |
+
"reward": 4.099999904632568,
|
| 9181 |
+
"reward_std": 0.0,
|
| 9182 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 9183 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 9184 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 9185 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 9186 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 9187 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 9188 |
+
"step": 3530
|
| 9189 |
+
},
|
| 9190 |
+
{
|
| 9191 |
+
"completion_length": 21.3,
|
| 9192 |
+
"completions/clipped_ratio": 0.0,
|
| 9193 |
+
"completions/max_length": 21.3,
|
| 9194 |
+
"completions/max_terminated_length": 21.3,
|
| 9195 |
+
"completions/mean_length": 19.4,
|
| 9196 |
+
"completions/mean_terminated_length": 19.4,
|
| 9197 |
+
"completions/min_length": 17.8,
|
| 9198 |
+
"completions/min_terminated_length": 17.8,
|
| 9199 |
+
"epoch": 0.2433993399339934,
|
| 9200 |
+
"frac_reward_zero_std": 1.0,
|
| 9201 |
+
"grad_norm": 0.0,
|
| 9202 |
+
"kl": 1.169459306448698,
|
| 9203 |
+
"learning_rate": 4.693720194046579e-06,
|
| 9204 |
+
"loss": 0.0,
|
| 9205 |
+
"num_tokens": 5058988.0,
|
| 9206 |
+
"reward": 4.099999904632568,
|
| 9207 |
+
"reward_std": 0.0,
|
| 9208 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 9209 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 9210 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 9211 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 9212 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 9213 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 9214 |
+
"step": 3540
|
| 9215 |
+
},
|
| 9216 |
+
{
|
| 9217 |
+
"completion_length": 17.0,
|
| 9218 |
+
"completions/clipped_ratio": 0.0,
|
| 9219 |
+
"completions/max_length": 17.0,
|
| 9220 |
+
"completions/max_terminated_length": 17.0,
|
| 9221 |
+
"completions/mean_length": 16.35,
|
| 9222 |
+
"completions/mean_terminated_length": 16.35,
|
| 9223 |
+
"completions/min_length": 16.0,
|
| 9224 |
+
"completions/min_terminated_length": 16.0,
|
| 9225 |
+
"epoch": 0.24408690869086908,
|
| 9226 |
+
"frac_reward_zero_std": 1.0,
|
| 9227 |
+
"grad_norm": 0.0,
|
| 9228 |
+
"kl": 1.476221612840891,
|
| 9229 |
+
"learning_rate": 4.690836074141762e-06,
|
| 9230 |
+
"loss": 0.0,
|
| 9231 |
+
"num_tokens": 5075874.0,
|
| 9232 |
+
"reward": 4.099999904632568,
|
| 9233 |
+
"reward_std": 0.0,
|
| 9234 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 9235 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 9236 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 9237 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 9238 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 9239 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 9240 |
+
"step": 3550
|
| 9241 |
+
},
|
| 9242 |
+
{
|
| 9243 |
+
"completion_length": 19.3,
|
| 9244 |
+
"completions/clipped_ratio": 0.0,
|
| 9245 |
+
"completions/max_length": 19.3,
|
| 9246 |
+
"completions/max_terminated_length": 19.3,
|
| 9247 |
+
"completions/mean_length": 17.775,
|
| 9248 |
+
"completions/mean_terminated_length": 17.775,
|
| 9249 |
+
"completions/min_length": 16.2,
|
| 9250 |
+
"completions/min_terminated_length": 16.2,
|
| 9251 |
+
"epoch": 0.24477447744774478,
|
| 9252 |
+
"frac_reward_zero_std": 1.0,
|
| 9253 |
+
"grad_norm": 0.0,
|
| 9254 |
+
"kl": 1.4168094083666802,
|
| 9255 |
+
"learning_rate": 4.687939333157954e-06,
|
| 9256 |
+
"loss": 0.0001,
|
| 9257 |
+
"num_tokens": 5089925.0,
|
| 9258 |
+
"reward": 4.099999904632568,
|
| 9259 |
+
"reward_std": 0.0,
|
| 9260 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 9261 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 9262 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 9263 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 9264 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 9265 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 9266 |
+
"step": 3560
|
| 9267 |
+
},
|
| 9268 |
+
{
|
| 9269 |
+
"completion_length": 20.3,
|
| 9270 |
+
"completions/clipped_ratio": 0.0,
|
| 9271 |
+
"completions/max_length": 20.3,
|
| 9272 |
+
"completions/max_terminated_length": 20.3,
|
| 9273 |
+
"completions/mean_length": 18.75,
|
| 9274 |
+
"completions/mean_terminated_length": 18.75,
|
| 9275 |
+
"completions/min_length": 17.0,
|
| 9276 |
+
"completions/min_terminated_length": 17.0,
|
| 9277 |
+
"epoch": 0.24546204620462045,
|
| 9278 |
+
"frac_reward_zero_std": 1.0,
|
| 9279 |
+
"grad_norm": 0.0,
|
| 9280 |
+
"kl": 1.2915988519787789,
|
| 9281 |
+
"learning_rate": 4.685029987782845e-06,
|
| 9282 |
+
"loss": 0.0,
|
| 9283 |
+
"num_tokens": 5104875.0,
|
| 9284 |
+
"reward": 4.099999904632568,
|
| 9285 |
+
"reward_std": 0.0,
|
| 9286 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 9287 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 9288 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 9289 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 9290 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 9291 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 9292 |
+
"step": 3570
|
| 9293 |
+
},
|
| 9294 |
+
{
|
| 9295 |
+
"completion_length": 20.6,
|
| 9296 |
+
"completions/clipped_ratio": 0.0,
|
| 9297 |
+
"completions/max_length": 20.6,
|
| 9298 |
+
"completions/max_terminated_length": 20.6,
|
| 9299 |
+
"completions/mean_length": 17.4,
|
| 9300 |
+
"completions/mean_terminated_length": 17.4,
|
| 9301 |
+
"completions/min_length": 15.7,
|
| 9302 |
+
"completions/min_terminated_length": 15.7,
|
| 9303 |
+
"epoch": 0.24614961496149615,
|
| 9304 |
+
"frac_reward_zero_std": 1.0,
|
| 9305 |
+
"grad_norm": 0.0,
|
| 9306 |
+
"kl": 1.1931571021676064,
|
| 9307 |
+
"learning_rate": 4.682108054776741e-06,
|
| 9308 |
+
"loss": 0.0,
|
| 9309 |
+
"num_tokens": 5118863.0,
|
| 9310 |
+
"reward": 4.099999904632568,
|
| 9311 |
+
"reward_std": 0.0,
|
| 9312 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 9313 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 9314 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 9315 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 9316 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 9317 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 9318 |
+
"step": 3580
|
| 9319 |
+
},
|
| 9320 |
+
{
|
| 9321 |
+
"completion_length": 20.5,
|
| 9322 |
+
"completions/clipped_ratio": 0.0,
|
| 9323 |
+
"completions/max_length": 20.5,
|
| 9324 |
+
"completions/max_terminated_length": 20.5,
|
| 9325 |
+
"completions/mean_length": 18.25,
|
| 9326 |
+
"completions/mean_terminated_length": 18.25,
|
| 9327 |
+
"completions/min_length": 16.4,
|
| 9328 |
+
"completions/min_terminated_length": 16.4,
|
| 9329 |
+
"epoch": 0.24683718371837185,
|
| 9330 |
+
"frac_reward_zero_std": 1.0,
|
| 9331 |
+
"grad_norm": 0.0,
|
| 9332 |
+
"kl": 1.1943508870899677,
|
| 9333 |
+
"learning_rate": 4.67917355097246e-06,
|
| 9334 |
+
"loss": 0.0,
|
| 9335 |
+
"num_tokens": 5132977.0,
|
| 9336 |
+
"reward": 4.099999904632568,
|
| 9337 |
+
"reward_std": 0.0,
|
| 9338 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 9339 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 9340 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 9341 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 9342 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 9343 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 9344 |
+
"step": 3590
|
| 9345 |
+
},
|
| 9346 |
+
{
|
| 9347 |
+
"completion_length": 20.2,
|
| 9348 |
+
"completions/clipped_ratio": 0.0,
|
| 9349 |
+
"completions/max_length": 20.2,
|
| 9350 |
+
"completions/max_terminated_length": 20.2,
|
| 9351 |
+
"completions/mean_length": 17.9,
|
| 9352 |
+
"completions/mean_terminated_length": 17.9,
|
| 9353 |
+
"completions/min_length": 16.0,
|
| 9354 |
+
"completions/min_terminated_length": 16.0,
|
| 9355 |
+
"epoch": 0.24752475247524752,
|
| 9356 |
+
"frac_reward_zero_std": 1.0,
|
| 9357 |
+
"grad_norm": 0.0,
|
| 9358 |
+
"kl": 1.3834453955292703,
|
| 9359 |
+
"learning_rate": 4.676226493275239e-06,
|
| 9360 |
+
"loss": 0.0001,
|
| 9361 |
+
"num_tokens": 5146825.0,
|
| 9362 |
+
"reward": 4.099999904632568,
|
| 9363 |
+
"reward_std": 0.0,
|
| 9364 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 9365 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 9366 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 9367 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 9368 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 9369 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 9370 |
+
"step": 3600
|
| 9371 |
+
},
|
| 9372 |
+
{
|
| 9373 |
+
"completion_length": 21.6,
|
| 9374 |
+
"completions/clipped_ratio": 0.0,
|
| 9375 |
+
"completions/max_length": 21.6,
|
| 9376 |
+
"completions/max_terminated_length": 21.6,
|
| 9377 |
+
"completions/mean_length": 18.975,
|
| 9378 |
+
"completions/mean_terminated_length": 18.975,
|
| 9379 |
+
"completions/min_length": 16.3,
|
| 9380 |
+
"completions/min_terminated_length": 16.3,
|
| 9381 |
+
"epoch": 0.24821232123212322,
|
| 9382 |
+
"frac_reward_zero_std": 0.9,
|
| 9383 |
+
"grad_norm": 0.0,
|
| 9384 |
+
"kl": 1.411560659110546,
|
| 9385 |
+
"learning_rate": 4.673266898662637e-06,
|
| 9386 |
+
"loss": 0.0001,
|
| 9387 |
+
"num_tokens": 5161888.0,
|
| 9388 |
+
"reward": 4.092499876022339,
|
| 9389 |
+
"reward_std": 0.015000002086162567,
|
| 9390 |
+
"rewards/coherence_reward_func/mean": 1.2924999475479126,
|
| 9391 |
+
"rewards/coherence_reward_func/std": 0.01499999761581421,
|
| 9392 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 9393 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 9394 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 9395 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 9396 |
+
"step": 3610
|
| 9397 |
+
},
|
| 9398 |
+
{
|
| 9399 |
+
"completion_length": 18.6,
|
| 9400 |
+
"completions/clipped_ratio": 0.0,
|
| 9401 |
+
"completions/max_length": 18.6,
|
| 9402 |
+
"completions/max_terminated_length": 18.6,
|
| 9403 |
+
"completions/mean_length": 16.7,
|
| 9404 |
+
"completions/mean_terminated_length": 16.7,
|
| 9405 |
+
"completions/min_length": 15.6,
|
| 9406 |
+
"completions/min_terminated_length": 15.6,
|
| 9407 |
+
"epoch": 0.2488998899889989,
|
| 9408 |
+
"frac_reward_zero_std": 1.0,
|
| 9409 |
+
"grad_norm": 0.0,
|
| 9410 |
+
"kl": 1.4795636057853698,
|
| 9411 |
+
"learning_rate": 4.670294784184436e-06,
|
| 9412 |
+
"loss": 0.0,
|
| 9413 |
+
"num_tokens": 5176032.0,
|
| 9414 |
+
"reward": 4.099999904632568,
|
| 9415 |
+
"reward_std": 0.0,
|
| 9416 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 9417 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 9418 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 9419 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 9420 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 9421 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 9422 |
+
"step": 3620
|
| 9423 |
+
},
|
| 9424 |
+
{
|
| 9425 |
+
"completion_length": 18.7,
|
| 9426 |
+
"completions/clipped_ratio": 0.0,
|
| 9427 |
+
"completions/max_length": 18.7,
|
| 9428 |
+
"completions/max_terminated_length": 18.7,
|
| 9429 |
+
"completions/mean_length": 16.625,
|
| 9430 |
+
"completions/mean_terminated_length": 16.625,
|
| 9431 |
+
"completions/min_length": 15.3,
|
| 9432 |
+
"completions/min_terminated_length": 15.3,
|
| 9433 |
+
"epoch": 0.2495874587458746,
|
| 9434 |
+
"frac_reward_zero_std": 1.0,
|
| 9435 |
+
"grad_norm": 0.0,
|
| 9436 |
+
"kl": 1.2247574172914029,
|
| 9437 |
+
"learning_rate": 4.6673101669625445e-06,
|
| 9438 |
+
"loss": 0.0,
|
| 9439 |
+
"num_tokens": 5190661.0,
|
| 9440 |
+
"reward": 4.099999904632568,
|
| 9441 |
+
"reward_std": 0.0,
|
| 9442 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 9443 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 9444 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 9445 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 9446 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 9447 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 9448 |
+
"step": 3630
|
| 9449 |
+
},
|
| 9450 |
+
{
|
| 9451 |
+
"completion_length": 19.5,
|
| 9452 |
+
"completions/clipped_ratio": 0.0,
|
| 9453 |
+
"completions/max_length": 19.5,
|
| 9454 |
+
"completions/max_terminated_length": 19.5,
|
| 9455 |
+
"completions/mean_length": 17.95,
|
| 9456 |
+
"completions/mean_terminated_length": 17.95,
|
| 9457 |
+
"completions/min_length": 15.9,
|
| 9458 |
+
"completions/min_terminated_length": 15.9,
|
| 9459 |
+
"epoch": 0.25027502750275027,
|
| 9460 |
+
"frac_reward_zero_std": 1.0,
|
| 9461 |
+
"grad_norm": 0.0,
|
| 9462 |
+
"kl": 1.3278845094144345,
|
| 9463 |
+
"learning_rate": 4.664313064190893e-06,
|
| 9464 |
+
"loss": 0.0,
|
| 9465 |
+
"num_tokens": 5206219.0,
|
| 9466 |
+
"reward": 4.099999904632568,
|
| 9467 |
+
"reward_std": 0.0,
|
| 9468 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 9469 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 9470 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 9471 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 9472 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 9473 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 9474 |
+
"step": 3640
|
| 9475 |
+
},
|
| 9476 |
+
{
|
| 9477 |
+
"completion_length": 17.8,
|
| 9478 |
+
"completions/clipped_ratio": 0.0,
|
| 9479 |
+
"completions/max_length": 17.8,
|
| 9480 |
+
"completions/max_terminated_length": 17.8,
|
| 9481 |
+
"completions/mean_length": 16.575,
|
| 9482 |
+
"completions/mean_terminated_length": 16.575,
|
| 9483 |
+
"completions/min_length": 15.8,
|
| 9484 |
+
"completions/min_terminated_length": 15.8,
|
| 9485 |
+
"epoch": 0.25096259625962597,
|
| 9486 |
+
"frac_reward_zero_std": 1.0,
|
| 9487 |
+
"grad_norm": 0.0,
|
| 9488 |
+
"kl": 1.3263515307568015,
|
| 9489 |
+
"learning_rate": 4.6613034931353445e-06,
|
| 9490 |
+
"loss": 0.0,
|
| 9491 |
+
"num_tokens": 5217886.0,
|
| 9492 |
+
"reward": 4.099999904632568,
|
| 9493 |
+
"reward_std": 0.0,
|
| 9494 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 9495 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 9496 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 9497 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 9498 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 9499 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 9500 |
+
"step": 3650
|
| 9501 |
+
},
|
| 9502 |
+
{
|
| 9503 |
+
"completion_length": 17.7,
|
| 9504 |
+
"completions/clipped_ratio": 0.0,
|
| 9505 |
+
"completions/max_length": 17.7,
|
| 9506 |
+
"completions/max_terminated_length": 17.7,
|
| 9507 |
+
"completions/mean_length": 16.8,
|
| 9508 |
+
"completions/mean_terminated_length": 16.8,
|
| 9509 |
+
"completions/min_length": 16.1,
|
| 9510 |
+
"completions/min_terminated_length": 16.1,
|
| 9511 |
+
"epoch": 0.25165016501650167,
|
| 9512 |
+
"frac_reward_zero_std": 1.0,
|
| 9513 |
+
"grad_norm": 0.0,
|
| 9514 |
+
"kl": 1.2702551379799842,
|
| 9515 |
+
"learning_rate": 4.6582814711335874e-06,
|
| 9516 |
+
"loss": 0.0,
|
| 9517 |
+
"num_tokens": 5229738.0,
|
| 9518 |
+
"reward": 4.099999904632568,
|
| 9519 |
+
"reward_std": 0.0,
|
| 9520 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 9521 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 9522 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 9523 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 9524 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 9525 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 9526 |
+
"step": 3660
|
| 9527 |
+
},
|
| 9528 |
+
{
|
| 9529 |
+
"completion_length": 24.7,
|
| 9530 |
+
"completions/clipped_ratio": 0.0,
|
| 9531 |
+
"completions/max_length": 24.7,
|
| 9532 |
+
"completions/max_terminated_length": 24.7,
|
| 9533 |
+
"completions/mean_length": 21.6,
|
| 9534 |
+
"completions/mean_terminated_length": 21.6,
|
| 9535 |
+
"completions/min_length": 19.0,
|
| 9536 |
+
"completions/min_terminated_length": 19.0,
|
| 9537 |
+
"epoch": 0.2523377337733773,
|
| 9538 |
+
"frac_reward_zero_std": 1.0,
|
| 9539 |
+
"grad_norm": 0.0,
|
| 9540 |
+
"kl": 1.0552857838571073,
|
| 9541 |
+
"learning_rate": 4.655247015595039e-06,
|
| 9542 |
+
"loss": 0.0,
|
| 9543 |
+
"num_tokens": 5244126.0,
|
| 9544 |
+
"reward": 4.099999904632568,
|
| 9545 |
+
"reward_std": 0.0,
|
| 9546 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 9547 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 9548 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 9549 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 9550 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 9551 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 9552 |
+
"step": 3670
|
| 9553 |
+
},
|
| 9554 |
+
{
|
| 9555 |
+
"completion_length": 18.2,
|
| 9556 |
+
"completions/clipped_ratio": 0.0,
|
| 9557 |
+
"completions/max_length": 18.2,
|
| 9558 |
+
"completions/max_terminated_length": 18.2,
|
| 9559 |
+
"completions/mean_length": 15.95,
|
| 9560 |
+
"completions/mean_terminated_length": 15.95,
|
| 9561 |
+
"completions/min_length": 13.5,
|
| 9562 |
+
"completions/min_terminated_length": 13.5,
|
| 9563 |
+
"epoch": 0.253025302530253,
|
| 9564 |
+
"frac_reward_zero_std": 0.9,
|
| 9565 |
+
"grad_norm": 0.0,
|
| 9566 |
+
"kl": 41.09145687818527,
|
| 9567 |
+
"learning_rate": 4.652200144000743e-06,
|
| 9568 |
+
"loss": 0.0017,
|
| 9569 |
+
"num_tokens": 5258988.0,
|
| 9570 |
+
"reward": 3.8949999094009398,
|
| 9571 |
+
"reward_std": 0.23671360015869142,
|
| 9572 |
+
"rewards/coherence_reward_func/mean": 1.23499995470047,
|
| 9573 |
+
"rewards/coherence_reward_func/std": 0.07505553364753723,
|
| 9574 |
+
"rewards/formatting_reward_func/mean": 1.9,
|
| 9575 |
+
"rewards/formatting_reward_func/std": 0.1154700517654419,
|
| 9576 |
+
"rewards/quality_reward_func/mean": 0.7600000113248825,
|
| 9577 |
+
"rewards/quality_reward_func/std": 0.046188023686408994,
|
| 9578 |
+
"step": 3680
|
| 9579 |
+
},
|
| 9580 |
+
{
|
| 9581 |
+
"completion_length": 21.3,
|
| 9582 |
+
"completions/clipped_ratio": 0.0,
|
| 9583 |
+
"completions/max_length": 21.3,
|
| 9584 |
+
"completions/max_terminated_length": 21.3,
|
| 9585 |
+
"completions/mean_length": 19.175,
|
| 9586 |
+
"completions/mean_terminated_length": 19.175,
|
| 9587 |
+
"completions/min_length": 17.4,
|
| 9588 |
+
"completions/min_terminated_length": 17.4,
|
| 9589 |
+
"epoch": 0.2537128712871287,
|
| 9590 |
+
"frac_reward_zero_std": 1.0,
|
| 9591 |
+
"grad_norm": 0.0,
|
| 9592 |
+
"kl": 1.4594970896840096,
|
| 9593 |
+
"learning_rate": 4.6491408739032705e-06,
|
| 9594 |
+
"loss": 0.0001,
|
| 9595 |
+
"num_tokens": 5273603.0,
|
| 9596 |
+
"reward": 4.099999904632568,
|
| 9597 |
+
"reward_std": 0.0,
|
| 9598 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 9599 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 9600 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 9601 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 9602 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 9603 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 9604 |
+
"step": 3690
|
| 9605 |
+
},
|
| 9606 |
+
{
|
| 9607 |
+
"completion_length": 22.5,
|
| 9608 |
+
"completions/clipped_ratio": 0.0,
|
| 9609 |
+
"completions/max_length": 22.5,
|
| 9610 |
+
"completions/max_terminated_length": 22.5,
|
| 9611 |
+
"completions/mean_length": 19.25,
|
| 9612 |
+
"completions/mean_terminated_length": 19.25,
|
| 9613 |
+
"completions/min_length": 17.3,
|
| 9614 |
+
"completions/min_terminated_length": 17.3,
|
| 9615 |
+
"epoch": 0.2544004400440044,
|
| 9616 |
+
"frac_reward_zero_std": 1.0,
|
| 9617 |
+
"grad_norm": 0.0,
|
| 9618 |
+
"kl": 1.1064071744680404,
|
| 9619 |
+
"learning_rate": 4.64606922292662e-06,
|
| 9620 |
+
"loss": 0.0,
|
| 9621 |
+
"num_tokens": 5288777.0,
|
| 9622 |
+
"reward": 4.099999904632568,
|
| 9623 |
+
"reward_std": 0.0,
|
| 9624 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 9625 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 9626 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 9627 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 9628 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 9629 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 9630 |
+
"step": 3700
|
| 9631 |
+
},
|
| 9632 |
+
{
|
| 9633 |
+
"completion_length": 17.1,
|
| 9634 |
+
"completions/clipped_ratio": 0.0,
|
| 9635 |
+
"completions/max_length": 17.1,
|
| 9636 |
+
"completions/max_terminated_length": 17.1,
|
| 9637 |
+
"completions/mean_length": 16.55,
|
| 9638 |
+
"completions/mean_terminated_length": 16.55,
|
| 9639 |
+
"completions/min_length": 15.9,
|
| 9640 |
+
"completions/min_terminated_length": 15.9,
|
| 9641 |
+
"epoch": 0.25508800880088006,
|
| 9642 |
+
"frac_reward_zero_std": 1.0,
|
| 9643 |
+
"grad_norm": 0.0,
|
| 9644 |
+
"kl": 1.3959959626197815,
|
| 9645 |
+
"learning_rate": 4.642985208766113e-06,
|
| 9646 |
+
"loss": 0.0,
|
| 9647 |
+
"num_tokens": 5300959.0,
|
| 9648 |
+
"reward": 4.099999904632568,
|
| 9649 |
+
"reward_std": 0.0,
|
| 9650 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 9651 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 9652 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 9653 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 9654 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 9655 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 9656 |
+
"step": 3710
|
| 9657 |
+
},
|
| 9658 |
+
{
|
| 9659 |
+
"completion_length": 17.6,
|
| 9660 |
+
"completions/clipped_ratio": 0.0,
|
| 9661 |
+
"completions/max_length": 17.6,
|
| 9662 |
+
"completions/max_terminated_length": 17.6,
|
| 9663 |
+
"completions/mean_length": 16.525,
|
| 9664 |
+
"completions/mean_terminated_length": 16.525,
|
| 9665 |
+
"completions/min_length": 15.4,
|
| 9666 |
+
"completions/min_terminated_length": 15.4,
|
| 9667 |
+
"epoch": 0.25577557755775576,
|
| 9668 |
+
"frac_reward_zero_std": 1.0,
|
| 9669 |
+
"grad_norm": 0.0,
|
| 9670 |
+
"kl": 1.3369979746639729,
|
| 9671 |
+
"learning_rate": 4.639888849188295e-06,
|
| 9672 |
+
"loss": 0.0,
|
| 9673 |
+
"num_tokens": 5314908.0,
|
| 9674 |
+
"reward": 4.099999904632568,
|
| 9675 |
+
"reward_std": 0.0,
|
| 9676 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 9677 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 9678 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 9679 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 9680 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 9681 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 9682 |
+
"step": 3720
|
| 9683 |
+
},
|
| 9684 |
+
{
|
| 9685 |
+
"completion_length": 17.5,
|
| 9686 |
+
"completions/clipped_ratio": 0.0,
|
| 9687 |
+
"completions/max_length": 17.5,
|
| 9688 |
+
"completions/max_terminated_length": 17.5,
|
| 9689 |
+
"completions/mean_length": 16.725,
|
| 9690 |
+
"completions/mean_terminated_length": 16.725,
|
| 9691 |
+
"completions/min_length": 16.2,
|
| 9692 |
+
"completions/min_terminated_length": 16.2,
|
| 9693 |
+
"epoch": 0.25646314631463146,
|
| 9694 |
+
"frac_reward_zero_std": 1.0,
|
| 9695 |
+
"grad_norm": 0.0,
|
| 9696 |
+
"kl": 1.4185206890106201,
|
| 9697 |
+
"learning_rate": 4.6367801620308295e-06,
|
| 9698 |
+
"loss": 0.0,
|
| 9699 |
+
"num_tokens": 5327609.0,
|
| 9700 |
+
"reward": 4.099999904632568,
|
| 9701 |
+
"reward_std": 0.0,
|
| 9702 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 9703 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 9704 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 9705 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 9706 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 9707 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 9708 |
+
"step": 3730
|
| 9709 |
+
},
|
| 9710 |
+
{
|
| 9711 |
+
"completion_length": 19.5,
|
| 9712 |
+
"completions/clipped_ratio": 0.0,
|
| 9713 |
+
"completions/max_length": 19.5,
|
| 9714 |
+
"completions/max_terminated_length": 19.5,
|
| 9715 |
+
"completions/mean_length": 17.675,
|
| 9716 |
+
"completions/mean_terminated_length": 17.675,
|
| 9717 |
+
"completions/min_length": 15.8,
|
| 9718 |
+
"completions/min_terminated_length": 15.8,
|
| 9719 |
+
"epoch": 0.25715071507150716,
|
| 9720 |
+
"frac_reward_zero_std": 1.0,
|
| 9721 |
+
"grad_norm": 0.0,
|
| 9722 |
+
"kl": 1.2852225728332995,
|
| 9723 |
+
"learning_rate": 4.633659165202398e-06,
|
| 9724 |
+
"loss": 0.0,
|
| 9725 |
+
"num_tokens": 5341592.0,
|
| 9726 |
+
"reward": 4.099999904632568,
|
| 9727 |
+
"reward_std": 0.0,
|
| 9728 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 9729 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 9730 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 9731 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 9732 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 9733 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 9734 |
+
"step": 3740
|
| 9735 |
+
},
|
| 9736 |
+
{
|
| 9737 |
+
"completion_length": 19.8,
|
| 9738 |
+
"completions/clipped_ratio": 0.0,
|
| 9739 |
+
"completions/max_length": 19.8,
|
| 9740 |
+
"completions/max_terminated_length": 19.8,
|
| 9741 |
+
"completions/mean_length": 16.9,
|
| 9742 |
+
"completions/mean_terminated_length": 16.9,
|
| 9743 |
+
"completions/min_length": 14.8,
|
| 9744 |
+
"completions/min_terminated_length": 14.8,
|
| 9745 |
+
"epoch": 0.25783828382838286,
|
| 9746 |
+
"frac_reward_zero_std": 1.0,
|
| 9747 |
+
"grad_norm": 0.0,
|
| 9748 |
+
"kl": 1.2239439487457275,
|
| 9749 |
+
"learning_rate": 4.630525876682597e-06,
|
| 9750 |
+
"loss": 0.0,
|
| 9751 |
+
"num_tokens": 5353784.0,
|
| 9752 |
+
"reward": 4.099999904632568,
|
| 9753 |
+
"reward_std": 0.0,
|
| 9754 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 9755 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 9756 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 9757 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 9758 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 9759 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 9760 |
+
"step": 3750
|
| 9761 |
+
},
|
| 9762 |
+
{
|
| 9763 |
+
"completion_length": 18.2,
|
| 9764 |
+
"completions/clipped_ratio": 0.0,
|
| 9765 |
+
"completions/max_length": 18.2,
|
| 9766 |
+
"completions/max_terminated_length": 18.2,
|
| 9767 |
+
"completions/mean_length": 16.825,
|
| 9768 |
+
"completions/mean_terminated_length": 16.825,
|
| 9769 |
+
"completions/min_length": 15.7,
|
| 9770 |
+
"completions/min_terminated_length": 15.7,
|
| 9771 |
+
"epoch": 0.2585258525852585,
|
| 9772 |
+
"frac_reward_zero_std": 1.0,
|
| 9773 |
+
"grad_norm": 0.0,
|
| 9774 |
+
"kl": 1.3233575984835624,
|
| 9775 |
+
"learning_rate": 4.627380314521833e-06,
|
| 9776 |
+
"loss": 0.0,
|
| 9777 |
+
"num_tokens": 5366529.0,
|
| 9778 |
+
"reward": 4.099999904632568,
|
| 9779 |
+
"reward_std": 0.0,
|
| 9780 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 9781 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 9782 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 9783 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 9784 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 9785 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 9786 |
+
"step": 3760
|
| 9787 |
+
},
|
| 9788 |
+
{
|
| 9789 |
+
"completion_length": 19.3,
|
| 9790 |
+
"completions/clipped_ratio": 0.0,
|
| 9791 |
+
"completions/max_length": 19.3,
|
| 9792 |
+
"completions/max_terminated_length": 19.3,
|
| 9793 |
+
"completions/mean_length": 17.6,
|
| 9794 |
+
"completions/mean_terminated_length": 17.6,
|
| 9795 |
+
"completions/min_length": 16.1,
|
| 9796 |
+
"completions/min_terminated_length": 16.1,
|
| 9797 |
+
"epoch": 0.2592134213421342,
|
| 9798 |
+
"frac_reward_zero_std": 1.0,
|
| 9799 |
+
"grad_norm": 0.0,
|
| 9800 |
+
"kl": 1.4207659110426902,
|
| 9801 |
+
"learning_rate": 4.624222496841219e-06,
|
| 9802 |
+
"loss": 0.0001,
|
| 9803 |
+
"num_tokens": 5380945.0,
|
| 9804 |
+
"reward": 4.099999904632568,
|
| 9805 |
+
"reward_std": 0.0,
|
| 9806 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 9807 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 9808 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 9809 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 9810 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 9811 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 9812 |
+
"step": 3770
|
| 9813 |
+
},
|
| 9814 |
+
{
|
| 9815 |
+
"completion_length": 19.8,
|
| 9816 |
+
"completions/clipped_ratio": 0.0,
|
| 9817 |
+
"completions/max_length": 19.8,
|
| 9818 |
+
"completions/max_terminated_length": 19.8,
|
| 9819 |
+
"completions/mean_length": 17.35,
|
| 9820 |
+
"completions/mean_terminated_length": 17.35,
|
| 9821 |
+
"completions/min_length": 15.0,
|
| 9822 |
+
"completions/min_terminated_length": 15.0,
|
| 9823 |
+
"epoch": 0.2599009900990099,
|
| 9824 |
+
"frac_reward_zero_std": 1.0,
|
| 9825 |
+
"grad_norm": 0.0,
|
| 9826 |
+
"kl": 1.4342102020978929,
|
| 9827 |
+
"learning_rate": 4.621052441832471e-06,
|
| 9828 |
+
"loss": 0.0001,
|
| 9829 |
+
"num_tokens": 5395375.0,
|
| 9830 |
+
"reward": 4.099999904632568,
|
| 9831 |
+
"reward_std": 0.0,
|
| 9832 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 9833 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 9834 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 9835 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 9836 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 9837 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 9838 |
+
"step": 3780
|
| 9839 |
+
},
|
| 9840 |
+
{
|
| 9841 |
+
"completion_length": 17.8,
|
| 9842 |
+
"completions/clipped_ratio": 0.0,
|
| 9843 |
+
"completions/max_length": 17.8,
|
| 9844 |
+
"completions/max_terminated_length": 17.8,
|
| 9845 |
+
"completions/mean_length": 16.8,
|
| 9846 |
+
"completions/mean_terminated_length": 16.8,
|
| 9847 |
+
"completions/min_length": 15.2,
|
| 9848 |
+
"completions/min_terminated_length": 15.2,
|
| 9849 |
+
"epoch": 0.2605885588558856,
|
| 9850 |
+
"frac_reward_zero_std": 1.0,
|
| 9851 |
+
"grad_norm": 0.0,
|
| 9852 |
+
"kl": 1.3788485825061798,
|
| 9853 |
+
"learning_rate": 4.617870167757801e-06,
|
| 9854 |
+
"loss": 0.0,
|
| 9855 |
+
"num_tokens": 5410043.0,
|
| 9856 |
+
"reward": 4.099999904632568,
|
| 9857 |
+
"reward_std": 0.0,
|
| 9858 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 9859 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 9860 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 9861 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 9862 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 9863 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 9864 |
+
"step": 3790
|
| 9865 |
+
},
|
| 9866 |
+
{
|
| 9867 |
+
"completion_length": 20.0,
|
| 9868 |
+
"completions/clipped_ratio": 0.0,
|
| 9869 |
+
"completions/max_length": 20.0,
|
| 9870 |
+
"completions/max_terminated_length": 20.0,
|
| 9871 |
+
"completions/mean_length": 17.825,
|
| 9872 |
+
"completions/mean_terminated_length": 17.825,
|
| 9873 |
+
"completions/min_length": 16.4,
|
| 9874 |
+
"completions/min_terminated_length": 16.4,
|
| 9875 |
+
"epoch": 0.26127612761276126,
|
| 9876 |
+
"frac_reward_zero_std": 1.0,
|
| 9877 |
+
"grad_norm": 0.0,
|
| 9878 |
+
"kl": 1.3054631665349006,
|
| 9879 |
+
"learning_rate": 4.614675692949815e-06,
|
| 9880 |
+
"loss": 0.0001,
|
| 9881 |
+
"num_tokens": 5423164.0,
|
| 9882 |
+
"reward": 4.099999904632568,
|
| 9883 |
+
"reward_std": 0.0,
|
| 9884 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 9885 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 9886 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 9887 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 9888 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 9889 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 9890 |
+
"step": 3800
|
| 9891 |
+
},
|
| 9892 |
+
{
|
| 9893 |
+
"completion_length": 19.6,
|
| 9894 |
+
"completions/clipped_ratio": 0.0,
|
| 9895 |
+
"completions/max_length": 19.6,
|
| 9896 |
+
"completions/max_terminated_length": 19.6,
|
| 9897 |
+
"completions/mean_length": 16.875,
|
| 9898 |
+
"completions/mean_terminated_length": 16.875,
|
| 9899 |
+
"completions/min_length": 15.2,
|
| 9900 |
+
"completions/min_terminated_length": 15.2,
|
| 9901 |
+
"epoch": 0.26196369636963696,
|
| 9902 |
+
"frac_reward_zero_std": 0.9,
|
| 9903 |
+
"grad_norm": 0.0,
|
| 9904 |
+
"kl": 1.2541000019758939,
|
| 9905 |
+
"learning_rate": 4.611469035811404e-06,
|
| 9906 |
+
"loss": 0.0,
|
| 9907 |
+
"num_tokens": 5437159.0,
|
| 9908 |
+
"reward": 3.792499911785126,
|
| 9909 |
+
"reward_std": 0.20499999523162843,
|
| 9910 |
+
"rewards/coherence_reward_func/mean": 1.2024999558925629,
|
| 9911 |
+
"rewards/coherence_reward_func/std": 0.06499999761581421,
|
| 9912 |
+
"rewards/formatting_reward_func/mean": 1.85,
|
| 9913 |
+
"rewards/formatting_reward_func/std": 0.1,
|
| 9914 |
+
"rewards/quality_reward_func/mean": 0.7400000110268593,
|
| 9915 |
+
"rewards/quality_reward_func/std": 0.04000000059604645,
|
| 9916 |
+
"step": 3810
|
| 9917 |
+
},
|
| 9918 |
+
{
|
| 9919 |
+
"completion_length": 19.6,
|
| 9920 |
+
"completions/clipped_ratio": 0.0,
|
| 9921 |
+
"completions/max_length": 19.6,
|
| 9922 |
+
"completions/max_terminated_length": 19.6,
|
| 9923 |
+
"completions/mean_length": 17.125,
|
| 9924 |
+
"completions/mean_terminated_length": 17.125,
|
| 9925 |
+
"completions/min_length": 15.7,
|
| 9926 |
+
"completions/min_terminated_length": 15.7,
|
| 9927 |
+
"epoch": 0.26265126512651266,
|
| 9928 |
+
"frac_reward_zero_std": 1.0,
|
| 9929 |
+
"grad_norm": 0.0,
|
| 9930 |
+
"kl": 1.126739951223135,
|
| 9931 |
+
"learning_rate": 4.60825021481564e-06,
|
| 9932 |
+
"loss": 0.0,
|
| 9933 |
+
"num_tokens": 5451712.0,
|
| 9934 |
+
"reward": 4.099999904632568,
|
| 9935 |
+
"reward_std": 0.0,
|
| 9936 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 9937 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 9938 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 9939 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 9940 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 9941 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 9942 |
+
"step": 3820
|
| 9943 |
+
},
|
| 9944 |
+
{
|
| 9945 |
+
"completion_length": 19.6,
|
| 9946 |
+
"completions/clipped_ratio": 0.0,
|
| 9947 |
+
"completions/max_length": 19.6,
|
| 9948 |
+
"completions/max_terminated_length": 19.6,
|
| 9949 |
+
"completions/mean_length": 17.25,
|
| 9950 |
+
"completions/mean_terminated_length": 17.25,
|
| 9951 |
+
"completions/min_length": 16.2,
|
| 9952 |
+
"completions/min_terminated_length": 16.2,
|
| 9953 |
+
"epoch": 0.26333883388338836,
|
| 9954 |
+
"frac_reward_zero_std": 1.0,
|
| 9955 |
+
"grad_norm": 0.0,
|
| 9956 |
+
"kl": 1.021160862594843,
|
| 9957 |
+
"learning_rate": 4.60501924850567e-06,
|
| 9958 |
+
"loss": 0.0,
|
| 9959 |
+
"num_tokens": 5464550.0,
|
| 9960 |
+
"reward": 4.099999904632568,
|
| 9961 |
+
"reward_std": 0.0,
|
| 9962 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 9963 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 9964 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 9965 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 9966 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 9967 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 9968 |
+
"step": 3830
|
| 9969 |
+
},
|
| 9970 |
+
{
|
| 9971 |
+
"completion_length": 15.9,
|
| 9972 |
+
"completions/clipped_ratio": 0.0,
|
| 9973 |
+
"completions/max_length": 15.9,
|
| 9974 |
+
"completions/max_terminated_length": 15.9,
|
| 9975 |
+
"completions/mean_length": 15.15,
|
| 9976 |
+
"completions/mean_terminated_length": 15.15,
|
| 9977 |
+
"completions/min_length": 14.7,
|
| 9978 |
+
"completions/min_terminated_length": 14.7,
|
| 9979 |
+
"epoch": 0.264026402640264,
|
| 9980 |
+
"frac_reward_zero_std": 1.0,
|
| 9981 |
+
"grad_norm": 0.0,
|
| 9982 |
+
"kl": 1.45318810492754,
|
| 9983 |
+
"learning_rate": 4.601776155494607e-06,
|
| 9984 |
+
"loss": 0.0,
|
| 9985 |
+
"num_tokens": 5477840.0,
|
| 9986 |
+
"reward": 4.099999904632568,
|
| 9987 |
+
"reward_std": 0.0,
|
| 9988 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 9989 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 9990 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 9991 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 9992 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 9993 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 9994 |
+
"step": 3840
|
| 9995 |
+
},
|
| 9996 |
+
{
|
| 9997 |
+
"completion_length": 20.6,
|
| 9998 |
+
"completions/clipped_ratio": 0.0,
|
| 9999 |
+
"completions/max_length": 20.6,
|
| 10000 |
+
"completions/max_terminated_length": 20.6,
|
| 10001 |
+
"completions/mean_length": 17.3,
|
| 10002 |
+
"completions/mean_terminated_length": 17.3,
|
| 10003 |
+
"completions/min_length": 15.3,
|
| 10004 |
+
"completions/min_terminated_length": 15.3,
|
| 10005 |
+
"epoch": 0.2647139713971397,
|
| 10006 |
+
"frac_reward_zero_std": 1.0,
|
| 10007 |
+
"grad_norm": 0.0,
|
| 10008 |
+
"kl": 1.0578694000840188,
|
| 10009 |
+
"learning_rate": 4.5985209544654265e-06,
|
| 10010 |
+
"loss": 0.0,
|
| 10011 |
+
"num_tokens": 5491052.0,
|
| 10012 |
+
"reward": 4.099999904632568,
|
| 10013 |
+
"reward_std": 0.0,
|
| 10014 |
+
"rewards/coherence_reward_func/mean": 1.2999999523162842,
|
| 10015 |
+
"rewards/coherence_reward_func/std": 0.0,
|
| 10016 |
+
"rewards/formatting_reward_func/mean": 2.0,
|
| 10017 |
+
"rewards/formatting_reward_func/std": 0.0,
|
| 10018 |
+
"rewards/quality_reward_func/mean": 0.800000011920929,
|
| 10019 |
+
"rewards/quality_reward_func/std": 0.0,
|
| 10020 |
+
"step": 3850
|
| 10021 |
}
|
| 10022 |
],
|
| 10023 |
"logging_steps": 10,
|
| 10024 |
"max_steps": 14544,
|
| 10025 |
+
"num_input_tokens_seen": 5491052,
|
| 10026 |
"num_train_epochs": 1,
|
| 10027 |
"save_steps": 50,
|
| 10028 |
"stateful_callbacks": {
|
training_args.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 7057
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:42b42a64fa29ca47bc2e0aa39c0a6a5f4997b48e715b9026d691d0c0901ff35f
|
| 3 |
size 7057
|