Arittro2 commited on
Commit
a324af7
·
verified ·
1 Parent(s): 7998885

Upload folder using huggingface_hub

Browse files
adapter_config.json CHANGED
@@ -29,13 +29,13 @@
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
- "gate_proj",
33
- "up_proj",
34
- "o_proj",
35
  "q_proj",
36
- "down_proj",
37
  "k_proj",
38
- "v_proj"
 
 
 
39
  ],
40
  "task_type": "CAUSAL_LM",
41
  "trainable_token_indices": null,
 
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
+ "v_proj",
 
 
33
  "q_proj",
 
34
  "k_proj",
35
+ "down_proj",
36
+ "gate_proj",
37
+ "up_proj",
38
+ "o_proj"
39
  ],
40
  "task_type": "CAUSAL_LM",
41
  "trainable_token_indices": null,
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:152fa5f56cb1824db7e11ba86a3524bd6fa52e5e390f62cccf3cb65d251aa2f8
3
  size 262406656
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f2738bd78dd9fdddc1b66df0f6ec4635109af536dbc363aafed6d322450beb8b
3
  size 262406656
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3bdd2a8d55ade8743234ae1eb53e4bb2735dd0f20952f8c68c188babe3956e2b
3
  size 122872331
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ba0292821427c451536eef21c12bb5f7bc0a97ab8f847350a83eaae9819255c5
3
  size 122872331
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fade270bd5aea03ad2ee21fc77c487d756842297b94b550e453d6f16d8ae5879
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c0a5f633aef81c2c2385c3ac3006b6453cc048a5d296f67bd0c5df19b617956
3
  size 14645
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:30591131e8e6f66410c593e632ee259cbfda76932a60127de665fe5e62f8b3c5
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c0917f83675af402a7519163c507a3887460b43acf17e8357c7b8ced53c5a092
3
  size 1465
trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.22345984598459845,
6
  "eval_steps": 500,
7
- "global_step": 3250,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -8458,11 +8458,1571 @@
8458
  "rewards/quality_reward_func/mean": 0.800000011920929,
8459
  "rewards/quality_reward_func/std": 0.0,
8460
  "step": 3250
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8461
  }
8462
  ],
8463
  "logging_steps": 10,
8464
  "max_steps": 14544,
8465
- "num_input_tokens_seen": 4648281,
8466
  "num_train_epochs": 1,
8467
  "save_steps": 50,
8468
  "stateful_callbacks": {
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.2647139713971397,
6
  "eval_steps": 500,
7
+ "global_step": 3850,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
8458
  "rewards/quality_reward_func/mean": 0.800000011920929,
8459
  "rewards/quality_reward_func/std": 0.0,
8460
  "step": 3250
8461
+ },
8462
+ {
8463
+ "completion_length": 20.7,
8464
+ "completions/clipped_ratio": 0.0,
8465
+ "completions/max_length": 20.7,
8466
+ "completions/max_terminated_length": 20.7,
8467
+ "completions/mean_length": 18.425,
8468
+ "completions/mean_terminated_length": 18.425,
8469
+ "completions/min_length": 16.6,
8470
+ "completions/min_terminated_length": 16.6,
8471
+ "epoch": 0.22414741474147415,
8472
+ "frac_reward_zero_std": 1.0,
8473
+ "grad_norm": 0.0,
8474
+ "kl": 1.1709686018526555,
8475
+ "learning_rate": 4.769285944997953e-06,
8476
+ "loss": 0.0,
8477
+ "num_tokens": 4664606.0,
8478
+ "reward": 4.099999904632568,
8479
+ "reward_std": 0.0,
8480
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
8481
+ "rewards/coherence_reward_func/std": 0.0,
8482
+ "rewards/formatting_reward_func/mean": 2.0,
8483
+ "rewards/formatting_reward_func/std": 0.0,
8484
+ "rewards/quality_reward_func/mean": 0.800000011920929,
8485
+ "rewards/quality_reward_func/std": 0.0,
8486
+ "step": 3260
8487
+ },
8488
+ {
8489
+ "completion_length": 18.4,
8490
+ "completions/clipped_ratio": 0.0,
8491
+ "completions/max_length": 18.4,
8492
+ "completions/max_terminated_length": 18.4,
8493
+ "completions/mean_length": 17.45,
8494
+ "completions/mean_terminated_length": 17.45,
8495
+ "completions/min_length": 16.3,
8496
+ "completions/min_terminated_length": 16.3,
8497
+ "epoch": 0.22483498349834982,
8498
+ "frac_reward_zero_std": 1.0,
8499
+ "grad_norm": 0.0,
8500
+ "kl": 1.3686413869261742,
8501
+ "learning_rate": 4.766761692749586e-06,
8502
+ "loss": 0.0,
8503
+ "num_tokens": 4679528.0,
8504
+ "reward": 4.099999904632568,
8505
+ "reward_std": 0.0,
8506
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
8507
+ "rewards/coherence_reward_func/std": 0.0,
8508
+ "rewards/formatting_reward_func/mean": 2.0,
8509
+ "rewards/formatting_reward_func/std": 0.0,
8510
+ "rewards/quality_reward_func/mean": 0.800000011920929,
8511
+ "rewards/quality_reward_func/std": 0.0,
8512
+ "step": 3270
8513
+ },
8514
+ {
8515
+ "completion_length": 20.3,
8516
+ "completions/clipped_ratio": 0.0,
8517
+ "completions/max_length": 20.3,
8518
+ "completions/max_terminated_length": 20.3,
8519
+ "completions/mean_length": 17.275,
8520
+ "completions/mean_terminated_length": 17.275,
8521
+ "completions/min_length": 15.4,
8522
+ "completions/min_terminated_length": 15.4,
8523
+ "epoch": 0.22552255225522552,
8524
+ "frac_reward_zero_std": 1.0,
8525
+ "grad_norm": 0.0,
8526
+ "kl": 0.9682805396616458,
8527
+ "learning_rate": 4.764224382026094e-06,
8528
+ "loss": 0.0,
8529
+ "num_tokens": 4692875.0,
8530
+ "reward": 4.099999904632568,
8531
+ "reward_std": 0.0,
8532
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
8533
+ "rewards/coherence_reward_func/std": 0.0,
8534
+ "rewards/formatting_reward_func/mean": 2.0,
8535
+ "rewards/formatting_reward_func/std": 0.0,
8536
+ "rewards/quality_reward_func/mean": 0.800000011920929,
8537
+ "rewards/quality_reward_func/std": 0.0,
8538
+ "step": 3280
8539
+ },
8540
+ {
8541
+ "completion_length": 21.0,
8542
+ "completions/clipped_ratio": 0.0,
8543
+ "completions/max_length": 21.0,
8544
+ "completions/max_terminated_length": 21.0,
8545
+ "completions/mean_length": 18.825,
8546
+ "completions/mean_terminated_length": 18.825,
8547
+ "completions/min_length": 17.0,
8548
+ "completions/min_terminated_length": 17.0,
8549
+ "epoch": 0.22621012101210122,
8550
+ "frac_reward_zero_std": 1.0,
8551
+ "grad_norm": 0.0,
8552
+ "kl": 1.0991791110485791,
8553
+ "learning_rate": 4.761674027444544e-06,
8554
+ "loss": 0.0,
8555
+ "num_tokens": 4708156.0,
8556
+ "reward": 4.099999904632568,
8557
+ "reward_std": 0.0,
8558
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
8559
+ "rewards/coherence_reward_func/std": 0.0,
8560
+ "rewards/formatting_reward_func/mean": 2.0,
8561
+ "rewards/formatting_reward_func/std": 0.0,
8562
+ "rewards/quality_reward_func/mean": 0.800000011920929,
8563
+ "rewards/quality_reward_func/std": 0.0,
8564
+ "step": 3290
8565
+ },
8566
+ {
8567
+ "completion_length": 20.2,
8568
+ "completions/clipped_ratio": 0.0,
8569
+ "completions/max_length": 20.2,
8570
+ "completions/max_terminated_length": 20.2,
8571
+ "completions/mean_length": 18.65,
8572
+ "completions/mean_terminated_length": 18.65,
8573
+ "completions/min_length": 17.0,
8574
+ "completions/min_terminated_length": 17.0,
8575
+ "epoch": 0.2268976897689769,
8576
+ "frac_reward_zero_std": 1.0,
8577
+ "grad_norm": 0.0,
8578
+ "kl": 1.0455322712659836,
8579
+ "learning_rate": 4.759110643697146e-06,
8580
+ "loss": 0.0,
8581
+ "num_tokens": 4722014.0,
8582
+ "reward": 4.099999904632568,
8583
+ "reward_std": 0.0,
8584
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
8585
+ "rewards/coherence_reward_func/std": 0.0,
8586
+ "rewards/formatting_reward_func/mean": 2.0,
8587
+ "rewards/formatting_reward_func/std": 0.0,
8588
+ "rewards/quality_reward_func/mean": 0.800000011920929,
8589
+ "rewards/quality_reward_func/std": 0.0,
8590
+ "step": 3300
8591
+ },
8592
+ {
8593
+ "completion_length": 20.0,
8594
+ "completions/clipped_ratio": 0.0,
8595
+ "completions/max_length": 20.0,
8596
+ "completions/max_terminated_length": 20.0,
8597
+ "completions/mean_length": 17.925,
8598
+ "completions/mean_terminated_length": 17.925,
8599
+ "completions/min_length": 16.2,
8600
+ "completions/min_terminated_length": 16.2,
8601
+ "epoch": 0.2275852585258526,
8602
+ "frac_reward_zero_std": 1.0,
8603
+ "grad_norm": 0.0,
8604
+ "kl": 1.2300671976059676,
8605
+ "learning_rate": 4.756534245551172e-06,
8606
+ "loss": 0.0,
8607
+ "num_tokens": 4735443.0,
8608
+ "reward": 4.099999904632568,
8609
+ "reward_std": 0.0,
8610
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
8611
+ "rewards/coherence_reward_func/std": 0.0,
8612
+ "rewards/formatting_reward_func/mean": 2.0,
8613
+ "rewards/formatting_reward_func/std": 0.0,
8614
+ "rewards/quality_reward_func/mean": 0.800000011920929,
8615
+ "rewards/quality_reward_func/std": 0.0,
8616
+ "step": 3310
8617
+ },
8618
+ {
8619
+ "completion_length": 16.4,
8620
+ "completions/clipped_ratio": 0.0,
8621
+ "completions/max_length": 16.4,
8622
+ "completions/max_terminated_length": 16.4,
8623
+ "completions/mean_length": 15.575,
8624
+ "completions/mean_terminated_length": 15.575,
8625
+ "completions/min_length": 14.8,
8626
+ "completions/min_terminated_length": 14.8,
8627
+ "epoch": 0.22827282728272827,
8628
+ "frac_reward_zero_std": 1.0,
8629
+ "grad_norm": 0.0,
8630
+ "kl": 1.2536677211523055,
8631
+ "learning_rate": 4.753944847848867e-06,
8632
+ "loss": 0.0,
8633
+ "num_tokens": 4748098.0,
8634
+ "reward": 4.099999904632568,
8635
+ "reward_std": 0.0,
8636
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
8637
+ "rewards/coherence_reward_func/std": 0.0,
8638
+ "rewards/formatting_reward_func/mean": 2.0,
8639
+ "rewards/formatting_reward_func/std": 0.0,
8640
+ "rewards/quality_reward_func/mean": 0.800000011920929,
8641
+ "rewards/quality_reward_func/std": 0.0,
8642
+ "step": 3320
8643
+ },
8644
+ {
8645
+ "completion_length": 20.3,
8646
+ "completions/clipped_ratio": 0.0,
8647
+ "completions/max_length": 20.3,
8648
+ "completions/max_terminated_length": 20.3,
8649
+ "completions/mean_length": 18.8,
8650
+ "completions/mean_terminated_length": 18.8,
8651
+ "completions/min_length": 17.7,
8652
+ "completions/min_terminated_length": 17.7,
8653
+ "epoch": 0.22896039603960397,
8654
+ "frac_reward_zero_std": 1.0,
8655
+ "grad_norm": 0.0,
8656
+ "kl": 1.3008077703416348,
8657
+ "learning_rate": 4.751342465507362e-06,
8658
+ "loss": 0.0,
8659
+ "num_tokens": 4761274.0,
8660
+ "reward": 4.099999904632568,
8661
+ "reward_std": 0.0,
8662
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
8663
+ "rewards/coherence_reward_func/std": 0.0,
8664
+ "rewards/formatting_reward_func/mean": 2.0,
8665
+ "rewards/formatting_reward_func/std": 0.0,
8666
+ "rewards/quality_reward_func/mean": 0.800000011920929,
8667
+ "rewards/quality_reward_func/std": 0.0,
8668
+ "step": 3330
8669
+ },
8670
+ {
8671
+ "completion_length": 22.3,
8672
+ "completions/clipped_ratio": 0.0,
8673
+ "completions/max_length": 22.3,
8674
+ "completions/max_terminated_length": 22.3,
8675
+ "completions/mean_length": 19.325,
8676
+ "completions/mean_terminated_length": 19.325,
8677
+ "completions/min_length": 16.7,
8678
+ "completions/min_terminated_length": 16.7,
8679
+ "epoch": 0.22964796479647964,
8680
+ "frac_reward_zero_std": 1.0,
8681
+ "grad_norm": 0.0,
8682
+ "kl": 1.0198756888508798,
8683
+ "learning_rate": 4.748727113518594e-06,
8684
+ "loss": 0.0,
8685
+ "num_tokens": 4773463.0,
8686
+ "reward": 4.099999904632568,
8687
+ "reward_std": 0.0,
8688
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
8689
+ "rewards/coherence_reward_func/std": 0.0,
8690
+ "rewards/formatting_reward_func/mean": 2.0,
8691
+ "rewards/formatting_reward_func/std": 0.0,
8692
+ "rewards/quality_reward_func/mean": 0.800000011920929,
8693
+ "rewards/quality_reward_func/std": 0.0,
8694
+ "step": 3340
8695
+ },
8696
+ {
8697
+ "completion_length": 18.2,
8698
+ "completions/clipped_ratio": 0.0,
8699
+ "completions/max_length": 18.2,
8700
+ "completions/max_terminated_length": 18.2,
8701
+ "completions/mean_length": 17.35,
8702
+ "completions/mean_terminated_length": 17.35,
8703
+ "completions/min_length": 16.5,
8704
+ "completions/min_terminated_length": 16.5,
8705
+ "epoch": 0.23033553355335534,
8706
+ "frac_reward_zero_std": 1.0,
8707
+ "grad_norm": 0.0,
8708
+ "kl": 1.0109242379665375,
8709
+ "learning_rate": 4.746098806949213e-06,
8710
+ "loss": 0.0,
8711
+ "num_tokens": 4787017.0,
8712
+ "reward": 4.099999904632568,
8713
+ "reward_std": 0.0,
8714
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
8715
+ "rewards/coherence_reward_func/std": 0.0,
8716
+ "rewards/formatting_reward_func/mean": 2.0,
8717
+ "rewards/formatting_reward_func/std": 0.0,
8718
+ "rewards/quality_reward_func/mean": 0.800000011920929,
8719
+ "rewards/quality_reward_func/std": 0.0,
8720
+ "step": 3350
8721
+ },
8722
+ {
8723
+ "completion_length": 17.8,
8724
+ "completions/clipped_ratio": 0.0,
8725
+ "completions/max_length": 17.8,
8726
+ "completions/max_terminated_length": 17.8,
8727
+ "completions/mean_length": 17.025,
8728
+ "completions/mean_terminated_length": 17.025,
8729
+ "completions/min_length": 15.8,
8730
+ "completions/min_terminated_length": 15.8,
8731
+ "epoch": 0.23102310231023102,
8732
+ "frac_reward_zero_std": 1.0,
8733
+ "grad_norm": 0.0,
8734
+ "kl": 1.1944296956062317,
8735
+ "learning_rate": 4.743457560940503e-06,
8736
+ "loss": 0.0,
8737
+ "num_tokens": 4800622.0,
8738
+ "reward": 4.099999904632568,
8739
+ "reward_std": 0.0,
8740
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
8741
+ "rewards/coherence_reward_func/std": 0.0,
8742
+ "rewards/formatting_reward_func/mean": 2.0,
8743
+ "rewards/formatting_reward_func/std": 0.0,
8744
+ "rewards/quality_reward_func/mean": 0.800000011920929,
8745
+ "rewards/quality_reward_func/std": 0.0,
8746
+ "step": 3360
8747
+ },
8748
+ {
8749
+ "completion_length": 21.3,
8750
+ "completions/clipped_ratio": 0.0,
8751
+ "completions/max_length": 21.3,
8752
+ "completions/max_terminated_length": 21.3,
8753
+ "completions/mean_length": 17.55,
8754
+ "completions/mean_terminated_length": 17.55,
8755
+ "completions/min_length": 15.2,
8756
+ "completions/min_terminated_length": 15.2,
8757
+ "epoch": 0.23171067106710672,
8758
+ "frac_reward_zero_std": 1.0,
8759
+ "grad_norm": 0.0,
8760
+ "kl": 1.1894298686645925,
8761
+ "learning_rate": 4.740803390708284e-06,
8762
+ "loss": 0.0,
8763
+ "num_tokens": 4815392.0,
8764
+ "reward": 4.099999904632568,
8765
+ "reward_std": 0.0,
8766
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
8767
+ "rewards/coherence_reward_func/std": 0.0,
8768
+ "rewards/formatting_reward_func/mean": 2.0,
8769
+ "rewards/formatting_reward_func/std": 0.0,
8770
+ "rewards/quality_reward_func/mean": 0.800000011920929,
8771
+ "rewards/quality_reward_func/std": 0.0,
8772
+ "step": 3370
8773
+ },
8774
+ {
8775
+ "completion_length": 17.5,
8776
+ "completions/clipped_ratio": 0.0,
8777
+ "completions/max_length": 17.5,
8778
+ "completions/max_terminated_length": 17.5,
8779
+ "completions/mean_length": 16.4,
8780
+ "completions/mean_terminated_length": 16.4,
8781
+ "completions/min_length": 15.3,
8782
+ "completions/min_terminated_length": 15.3,
8783
+ "epoch": 0.2323982398239824,
8784
+ "frac_reward_zero_std": 1.0,
8785
+ "grad_norm": 0.0,
8786
+ "kl": 1.3694863229990006,
8787
+ "learning_rate": 4.738136311542836e-06,
8788
+ "loss": 0.0,
8789
+ "num_tokens": 4831268.0,
8790
+ "reward": 4.099999904632568,
8791
+ "reward_std": 0.0,
8792
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
8793
+ "rewards/coherence_reward_func/std": 0.0,
8794
+ "rewards/formatting_reward_func/mean": 2.0,
8795
+ "rewards/formatting_reward_func/std": 0.0,
8796
+ "rewards/quality_reward_func/mean": 0.800000011920929,
8797
+ "rewards/quality_reward_func/std": 0.0,
8798
+ "step": 3380
8799
+ },
8800
+ {
8801
+ "completion_length": 19.6,
8802
+ "completions/clipped_ratio": 0.0,
8803
+ "completions/max_length": 19.6,
8804
+ "completions/max_terminated_length": 19.6,
8805
+ "completions/mean_length": 18.425,
8806
+ "completions/mean_terminated_length": 18.425,
8807
+ "completions/min_length": 16.7,
8808
+ "completions/min_terminated_length": 16.7,
8809
+ "epoch": 0.2330858085808581,
8810
+ "frac_reward_zero_std": 1.0,
8811
+ "grad_norm": 0.0,
8812
+ "kl": 1.2427097693085671,
8813
+ "learning_rate": 4.7354563388088026e-06,
8814
+ "loss": 0.0,
8815
+ "num_tokens": 4846697.0,
8816
+ "reward": 4.099999904632568,
8817
+ "reward_std": 0.0,
8818
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
8819
+ "rewards/coherence_reward_func/std": 0.0,
8820
+ "rewards/formatting_reward_func/mean": 2.0,
8821
+ "rewards/formatting_reward_func/std": 0.0,
8822
+ "rewards/quality_reward_func/mean": 0.800000011920929,
8823
+ "rewards/quality_reward_func/std": 0.0,
8824
+ "step": 3390
8825
+ },
8826
+ {
8827
+ "completion_length": 21.6,
8828
+ "completions/clipped_ratio": 0.0,
8829
+ "completions/max_length": 21.6,
8830
+ "completions/max_terminated_length": 21.6,
8831
+ "completions/mean_length": 19.075,
8832
+ "completions/mean_terminated_length": 19.075,
8833
+ "completions/min_length": 16.2,
8834
+ "completions/min_terminated_length": 16.2,
8835
+ "epoch": 0.23377337733773376,
8836
+ "frac_reward_zero_std": 1.0,
8837
+ "grad_norm": 0.0,
8838
+ "kl": 1.0708073504269122,
8839
+ "learning_rate": 4.732763487945106e-06,
8840
+ "loss": 0.0,
8841
+ "num_tokens": 4861028.0,
8842
+ "reward": 4.099999904632568,
8843
+ "reward_std": 0.0,
8844
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
8845
+ "rewards/coherence_reward_func/std": 0.0,
8846
+ "rewards/formatting_reward_func/mean": 2.0,
8847
+ "rewards/formatting_reward_func/std": 0.0,
8848
+ "rewards/quality_reward_func/mean": 0.800000011920929,
8849
+ "rewards/quality_reward_func/std": 0.0,
8850
+ "step": 3400
8851
+ },
8852
+ {
8853
+ "completion_length": 29.7,
8854
+ "completions/clipped_ratio": 0.0,
8855
+ "completions/max_length": 29.7,
8856
+ "completions/max_terminated_length": 29.7,
8857
+ "completions/mean_length": 19.5,
8858
+ "completions/mean_terminated_length": 19.5,
8859
+ "completions/min_length": 15.5,
8860
+ "completions/min_terminated_length": 15.5,
8861
+ "epoch": 0.23446094609460946,
8862
+ "frac_reward_zero_std": 0.9,
8863
+ "grad_norm": 0.3156428635120392,
8864
+ "kl": 0.9965578641742467,
8865
+ "learning_rate": 4.730057774464856e-06,
8866
+ "loss": 0.0,
8867
+ "num_tokens": 4877352.0,
8868
+ "reward": 4.092499876022339,
8869
+ "reward_std": 0.015000002086162567,
8870
+ "rewards/coherence_reward_func/mean": 1.2924999475479126,
8871
+ "rewards/coherence_reward_func/std": 0.01499999761581421,
8872
+ "rewards/formatting_reward_func/mean": 2.0,
8873
+ "rewards/formatting_reward_func/std": 0.0,
8874
+ "rewards/quality_reward_func/mean": 0.800000011920929,
8875
+ "rewards/quality_reward_func/std": 0.0,
8876
+ "step": 3410
8877
+ },
8878
+ {
8879
+ "completion_length": 23.0,
8880
+ "completions/clipped_ratio": 0.0,
8881
+ "completions/max_length": 23.0,
8882
+ "completions/max_terminated_length": 23.0,
8883
+ "completions/mean_length": 18.875,
8884
+ "completions/mean_terminated_length": 18.875,
8885
+ "completions/min_length": 15.7,
8886
+ "completions/min_terminated_length": 15.7,
8887
+ "epoch": 0.23514851485148514,
8888
+ "frac_reward_zero_std": 1.0,
8889
+ "grad_norm": 0.0,
8890
+ "kl": 1.0397824190557003,
8891
+ "learning_rate": 4.727339213955265e-06,
8892
+ "loss": 0.0,
8893
+ "num_tokens": 4889631.0,
8894
+ "reward": 4.099999904632568,
8895
+ "reward_std": 0.0,
8896
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
8897
+ "rewards/coherence_reward_func/std": 0.0,
8898
+ "rewards/formatting_reward_func/mean": 2.0,
8899
+ "rewards/formatting_reward_func/std": 0.0,
8900
+ "rewards/quality_reward_func/mean": 0.800000011920929,
8901
+ "rewards/quality_reward_func/std": 0.0,
8902
+ "step": 3420
8903
+ },
8904
+ {
8905
+ "completion_length": 21.8,
8906
+ "completions/clipped_ratio": 0.0,
8907
+ "completions/max_length": 21.8,
8908
+ "completions/max_terminated_length": 21.8,
8909
+ "completions/mean_length": 20.225,
8910
+ "completions/mean_terminated_length": 20.225,
8911
+ "completions/min_length": 18.4,
8912
+ "completions/min_terminated_length": 18.4,
8913
+ "epoch": 0.23583608360836084,
8914
+ "frac_reward_zero_std": 1.0,
8915
+ "grad_norm": 0.0,
8916
+ "kl": 1.215433156117797,
8917
+ "learning_rate": 4.724607822077554e-06,
8918
+ "loss": 0.0,
8919
+ "num_tokens": 4902888.0,
8920
+ "reward": 4.099999904632568,
8921
+ "reward_std": 0.0,
8922
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
8923
+ "rewards/coherence_reward_func/std": 0.0,
8924
+ "rewards/formatting_reward_func/mean": 2.0,
8925
+ "rewards/formatting_reward_func/std": 0.0,
8926
+ "rewards/quality_reward_func/mean": 0.800000011920929,
8927
+ "rewards/quality_reward_func/std": 0.0,
8928
+ "step": 3430
8929
+ },
8930
+ {
8931
+ "completion_length": 20.7,
8932
+ "completions/clipped_ratio": 0.0,
8933
+ "completions/max_length": 20.7,
8934
+ "completions/max_terminated_length": 20.7,
8935
+ "completions/mean_length": 19.05,
8936
+ "completions/mean_terminated_length": 19.05,
8937
+ "completions/min_length": 17.2,
8938
+ "completions/min_terminated_length": 17.2,
8939
+ "epoch": 0.23652365236523654,
8940
+ "frac_reward_zero_std": 1.0,
8941
+ "grad_norm": 0.0,
8942
+ "kl": 1.1691066682338715,
8943
+ "learning_rate": 4.7218636145668615e-06,
8944
+ "loss": 0.0,
8945
+ "num_tokens": 4916974.0,
8946
+ "reward": 4.099999904632568,
8947
+ "reward_std": 0.0,
8948
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
8949
+ "rewards/coherence_reward_func/std": 0.0,
8950
+ "rewards/formatting_reward_func/mean": 2.0,
8951
+ "rewards/formatting_reward_func/std": 0.0,
8952
+ "rewards/quality_reward_func/mean": 0.800000011920929,
8953
+ "rewards/quality_reward_func/std": 0.0,
8954
+ "step": 3440
8955
+ },
8956
+ {
8957
+ "completion_length": 19.1,
8958
+ "completions/clipped_ratio": 0.0,
8959
+ "completions/max_length": 19.1,
8960
+ "completions/max_terminated_length": 19.1,
8961
+ "completions/mean_length": 16.2,
8962
+ "completions/mean_terminated_length": 16.2,
8963
+ "completions/min_length": 14.4,
8964
+ "completions/min_terminated_length": 14.4,
8965
+ "epoch": 0.2372112211221122,
8966
+ "frac_reward_zero_std": 1.0,
8967
+ "grad_norm": 0.0,
8968
+ "kl": 1.1441729221493007,
8969
+ "learning_rate": 4.7191066072321575e-06,
8970
+ "loss": 0.0,
8971
+ "num_tokens": 4933274.0,
8972
+ "reward": 4.099999904632568,
8973
+ "reward_std": 0.0,
8974
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
8975
+ "rewards/coherence_reward_func/std": 0.0,
8976
+ "rewards/formatting_reward_func/mean": 2.0,
8977
+ "rewards/formatting_reward_func/std": 0.0,
8978
+ "rewards/quality_reward_func/mean": 0.800000011920929,
8979
+ "rewards/quality_reward_func/std": 0.0,
8980
+ "step": 3450
8981
+ },
8982
+ {
8983
+ "completion_length": 22.4,
8984
+ "completions/clipped_ratio": 0.0,
8985
+ "completions/max_length": 22.4,
8986
+ "completions/max_terminated_length": 22.4,
8987
+ "completions/mean_length": 19.525,
8988
+ "completions/mean_terminated_length": 19.525,
8989
+ "completions/min_length": 16.6,
8990
+ "completions/min_terminated_length": 16.6,
8991
+ "epoch": 0.2378987898789879,
8992
+ "frac_reward_zero_std": 1.0,
8993
+ "grad_norm": 0.0,
8994
+ "kl": 1.0465805977582932,
8995
+ "learning_rate": 4.716336815956148e-06,
8996
+ "loss": 0.0,
8997
+ "num_tokens": 4946543.0,
8998
+ "reward": 4.099999904632568,
8999
+ "reward_std": 0.0,
9000
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
9001
+ "rewards/coherence_reward_func/std": 0.0,
9002
+ "rewards/formatting_reward_func/mean": 2.0,
9003
+ "rewards/formatting_reward_func/std": 0.0,
9004
+ "rewards/quality_reward_func/mean": 0.800000011920929,
9005
+ "rewards/quality_reward_func/std": 0.0,
9006
+ "step": 3460
9007
+ },
9008
+ {
9009
+ "completion_length": 21.6,
9010
+ "completions/clipped_ratio": 0.0,
9011
+ "completions/max_length": 21.6,
9012
+ "completions/max_terminated_length": 21.6,
9013
+ "completions/mean_length": 19.35,
9014
+ "completions/mean_terminated_length": 19.35,
9015
+ "completions/min_length": 18.0,
9016
+ "completions/min_terminated_length": 18.0,
9017
+ "epoch": 0.23858635863586358,
9018
+ "frac_reward_zero_std": 1.0,
9019
+ "grad_norm": 0.0,
9020
+ "kl": 1.2777705937623978,
9021
+ "learning_rate": 4.713554256695188e-06,
9022
+ "loss": 0.0001,
9023
+ "num_tokens": 4959301.0,
9024
+ "reward": 4.099999904632568,
9025
+ "reward_std": 0.0,
9026
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
9027
+ "rewards/coherence_reward_func/std": 0.0,
9028
+ "rewards/formatting_reward_func/mean": 2.0,
9029
+ "rewards/formatting_reward_func/std": 0.0,
9030
+ "rewards/quality_reward_func/mean": 0.800000011920929,
9031
+ "rewards/quality_reward_func/std": 0.0,
9032
+ "step": 3470
9033
+ },
9034
+ {
9035
+ "completion_length": 20.9,
9036
+ "completions/clipped_ratio": 0.0,
9037
+ "completions/max_length": 20.9,
9038
+ "completions/max_terminated_length": 20.9,
9039
+ "completions/mean_length": 20.0,
9040
+ "completions/mean_terminated_length": 20.0,
9041
+ "completions/min_length": 19.0,
9042
+ "completions/min_terminated_length": 19.0,
9043
+ "epoch": 0.23927392739273928,
9044
+ "frac_reward_zero_std": 1.0,
9045
+ "grad_norm": 0.0,
9046
+ "kl": 1.1782601185142993,
9047
+ "learning_rate": 4.710758945479184e-06,
9048
+ "loss": 0.0,
9049
+ "num_tokens": 4973385.0,
9050
+ "reward": 4.099999904632568,
9051
+ "reward_std": 0.0,
9052
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
9053
+ "rewards/coherence_reward_func/std": 0.0,
9054
+ "rewards/formatting_reward_func/mean": 2.0,
9055
+ "rewards/formatting_reward_func/std": 0.0,
9056
+ "rewards/quality_reward_func/mean": 0.800000011920929,
9057
+ "rewards/quality_reward_func/std": 0.0,
9058
+ "step": 3480
9059
+ },
9060
+ {
9061
+ "completion_length": 21.4,
9062
+ "completions/clipped_ratio": 0.0,
9063
+ "completions/max_length": 21.4,
9064
+ "completions/max_terminated_length": 21.4,
9065
+ "completions/mean_length": 18.025,
9066
+ "completions/mean_terminated_length": 18.025,
9067
+ "completions/min_length": 16.3,
9068
+ "completions/min_terminated_length": 16.3,
9069
+ "epoch": 0.23996149614961496,
9070
+ "frac_reward_zero_std": 1.0,
9071
+ "grad_norm": 0.0,
9072
+ "kl": 1.0537080638110639,
9073
+ "learning_rate": 4.7079508984115064e-06,
9074
+ "loss": 0.0,
9075
+ "num_tokens": 4986858.0,
9076
+ "reward": 4.099999904632568,
9077
+ "reward_std": 0.0,
9078
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
9079
+ "rewards/coherence_reward_func/std": 0.0,
9080
+ "rewards/formatting_reward_func/mean": 2.0,
9081
+ "rewards/formatting_reward_func/std": 0.0,
9082
+ "rewards/quality_reward_func/mean": 0.800000011920929,
9083
+ "rewards/quality_reward_func/std": 0.0,
9084
+ "step": 3490
9085
+ },
9086
+ {
9087
+ "completion_length": 18.2,
9088
+ "completions/clipped_ratio": 0.0,
9089
+ "completions/max_length": 18.2,
9090
+ "completions/max_terminated_length": 18.2,
9091
+ "completions/mean_length": 16.15,
9092
+ "completions/mean_terminated_length": 16.15,
9093
+ "completions/min_length": 15.0,
9094
+ "completions/min_terminated_length": 15.0,
9095
+ "epoch": 0.24064906490649066,
9096
+ "frac_reward_zero_std": 1.0,
9097
+ "grad_norm": 0.0,
9098
+ "kl": 1.129155667871237,
9099
+ "learning_rate": 4.705130131668894e-06,
9100
+ "loss": 0.0,
9101
+ "num_tokens": 5003140.0,
9102
+ "reward": 4.099999904632568,
9103
+ "reward_std": 0.0,
9104
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
9105
+ "rewards/coherence_reward_func/std": 0.0,
9106
+ "rewards/formatting_reward_func/mean": 2.0,
9107
+ "rewards/formatting_reward_func/std": 0.0,
9108
+ "rewards/quality_reward_func/mean": 0.800000011920929,
9109
+ "rewards/quality_reward_func/std": 0.0,
9110
+ "step": 3500
9111
+ },
9112
+ {
9113
+ "completion_length": 22.9,
9114
+ "completions/clipped_ratio": 0.0,
9115
+ "completions/max_length": 22.9,
9116
+ "completions/max_terminated_length": 22.9,
9117
+ "completions/mean_length": 19.525,
9118
+ "completions/mean_terminated_length": 19.525,
9119
+ "completions/min_length": 16.7,
9120
+ "completions/min_terminated_length": 16.7,
9121
+ "epoch": 0.24133663366336633,
9122
+ "frac_reward_zero_std": 1.0,
9123
+ "grad_norm": 0.0,
9124
+ "kl": 1.2283895801752807,
9125
+ "learning_rate": 4.702296661501362e-06,
9126
+ "loss": 0.0001,
9127
+ "num_tokens": 5018057.0,
9128
+ "reward": 4.099999904632568,
9129
+ "reward_std": 0.0,
9130
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
9131
+ "rewards/coherence_reward_func/std": 0.0,
9132
+ "rewards/formatting_reward_func/mean": 2.0,
9133
+ "rewards/formatting_reward_func/std": 0.0,
9134
+ "rewards/quality_reward_func/mean": 0.800000011920929,
9135
+ "rewards/quality_reward_func/std": 0.0,
9136
+ "step": 3510
9137
+ },
9138
+ {
9139
+ "completion_length": 19.5,
9140
+ "completions/clipped_ratio": 0.0,
9141
+ "completions/max_length": 19.5,
9142
+ "completions/max_terminated_length": 19.5,
9143
+ "completions/mean_length": 18.275,
9144
+ "completions/mean_terminated_length": 18.275,
9145
+ "completions/min_length": 17.1,
9146
+ "completions/min_terminated_length": 17.1,
9147
+ "epoch": 0.24202420242024203,
9148
+ "frac_reward_zero_std": 1.0,
9149
+ "grad_norm": 0.0,
9150
+ "kl": 1.1926358938217163,
9151
+ "learning_rate": 4.6994505042321096e-06,
9152
+ "loss": 0.0,
9153
+ "num_tokens": 5031064.0,
9154
+ "reward": 4.099999904632568,
9155
+ "reward_std": 0.0,
9156
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
9157
+ "rewards/coherence_reward_func/std": 0.0,
9158
+ "rewards/formatting_reward_func/mean": 2.0,
9159
+ "rewards/formatting_reward_func/std": 0.0,
9160
+ "rewards/quality_reward_func/mean": 0.800000011920929,
9161
+ "rewards/quality_reward_func/std": 0.0,
9162
+ "step": 3520
9163
+ },
9164
+ {
9165
+ "completion_length": 17.8,
9166
+ "completions/clipped_ratio": 0.0,
9167
+ "completions/max_length": 17.8,
9168
+ "completions/max_terminated_length": 17.8,
9169
+ "completions/mean_length": 16.6,
9170
+ "completions/mean_terminated_length": 16.6,
9171
+ "completions/min_length": 15.9,
9172
+ "completions/min_terminated_length": 15.9,
9173
+ "epoch": 0.2427117711771177,
9174
+ "frac_reward_zero_std": 1.0,
9175
+ "grad_norm": 0.0,
9176
+ "kl": 1.5111532375216483,
9177
+ "learning_rate": 4.696591676257422e-06,
9178
+ "loss": 0.0,
9179
+ "num_tokens": 5044100.0,
9180
+ "reward": 4.099999904632568,
9181
+ "reward_std": 0.0,
9182
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
9183
+ "rewards/coherence_reward_func/std": 0.0,
9184
+ "rewards/formatting_reward_func/mean": 2.0,
9185
+ "rewards/formatting_reward_func/std": 0.0,
9186
+ "rewards/quality_reward_func/mean": 0.800000011920929,
9187
+ "rewards/quality_reward_func/std": 0.0,
9188
+ "step": 3530
9189
+ },
9190
+ {
9191
+ "completion_length": 21.3,
9192
+ "completions/clipped_ratio": 0.0,
9193
+ "completions/max_length": 21.3,
9194
+ "completions/max_terminated_length": 21.3,
9195
+ "completions/mean_length": 19.4,
9196
+ "completions/mean_terminated_length": 19.4,
9197
+ "completions/min_length": 17.8,
9198
+ "completions/min_terminated_length": 17.8,
9199
+ "epoch": 0.2433993399339934,
9200
+ "frac_reward_zero_std": 1.0,
9201
+ "grad_norm": 0.0,
9202
+ "kl": 1.169459306448698,
9203
+ "learning_rate": 4.693720194046579e-06,
9204
+ "loss": 0.0,
9205
+ "num_tokens": 5058988.0,
9206
+ "reward": 4.099999904632568,
9207
+ "reward_std": 0.0,
9208
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
9209
+ "rewards/coherence_reward_func/std": 0.0,
9210
+ "rewards/formatting_reward_func/mean": 2.0,
9211
+ "rewards/formatting_reward_func/std": 0.0,
9212
+ "rewards/quality_reward_func/mean": 0.800000011920929,
9213
+ "rewards/quality_reward_func/std": 0.0,
9214
+ "step": 3540
9215
+ },
9216
+ {
9217
+ "completion_length": 17.0,
9218
+ "completions/clipped_ratio": 0.0,
9219
+ "completions/max_length": 17.0,
9220
+ "completions/max_terminated_length": 17.0,
9221
+ "completions/mean_length": 16.35,
9222
+ "completions/mean_terminated_length": 16.35,
9223
+ "completions/min_length": 16.0,
9224
+ "completions/min_terminated_length": 16.0,
9225
+ "epoch": 0.24408690869086908,
9226
+ "frac_reward_zero_std": 1.0,
9227
+ "grad_norm": 0.0,
9228
+ "kl": 1.476221612840891,
9229
+ "learning_rate": 4.690836074141762e-06,
9230
+ "loss": 0.0,
9231
+ "num_tokens": 5075874.0,
9232
+ "reward": 4.099999904632568,
9233
+ "reward_std": 0.0,
9234
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
9235
+ "rewards/coherence_reward_func/std": 0.0,
9236
+ "rewards/formatting_reward_func/mean": 2.0,
9237
+ "rewards/formatting_reward_func/std": 0.0,
9238
+ "rewards/quality_reward_func/mean": 0.800000011920929,
9239
+ "rewards/quality_reward_func/std": 0.0,
9240
+ "step": 3550
9241
+ },
9242
+ {
9243
+ "completion_length": 19.3,
9244
+ "completions/clipped_ratio": 0.0,
9245
+ "completions/max_length": 19.3,
9246
+ "completions/max_terminated_length": 19.3,
9247
+ "completions/mean_length": 17.775,
9248
+ "completions/mean_terminated_length": 17.775,
9249
+ "completions/min_length": 16.2,
9250
+ "completions/min_terminated_length": 16.2,
9251
+ "epoch": 0.24477447744774478,
9252
+ "frac_reward_zero_std": 1.0,
9253
+ "grad_norm": 0.0,
9254
+ "kl": 1.4168094083666802,
9255
+ "learning_rate": 4.687939333157954e-06,
9256
+ "loss": 0.0001,
9257
+ "num_tokens": 5089925.0,
9258
+ "reward": 4.099999904632568,
9259
+ "reward_std": 0.0,
9260
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
9261
+ "rewards/coherence_reward_func/std": 0.0,
9262
+ "rewards/formatting_reward_func/mean": 2.0,
9263
+ "rewards/formatting_reward_func/std": 0.0,
9264
+ "rewards/quality_reward_func/mean": 0.800000011920929,
9265
+ "rewards/quality_reward_func/std": 0.0,
9266
+ "step": 3560
9267
+ },
9268
+ {
9269
+ "completion_length": 20.3,
9270
+ "completions/clipped_ratio": 0.0,
9271
+ "completions/max_length": 20.3,
9272
+ "completions/max_terminated_length": 20.3,
9273
+ "completions/mean_length": 18.75,
9274
+ "completions/mean_terminated_length": 18.75,
9275
+ "completions/min_length": 17.0,
9276
+ "completions/min_terminated_length": 17.0,
9277
+ "epoch": 0.24546204620462045,
9278
+ "frac_reward_zero_std": 1.0,
9279
+ "grad_norm": 0.0,
9280
+ "kl": 1.2915988519787789,
9281
+ "learning_rate": 4.685029987782845e-06,
9282
+ "loss": 0.0,
9283
+ "num_tokens": 5104875.0,
9284
+ "reward": 4.099999904632568,
9285
+ "reward_std": 0.0,
9286
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
9287
+ "rewards/coherence_reward_func/std": 0.0,
9288
+ "rewards/formatting_reward_func/mean": 2.0,
9289
+ "rewards/formatting_reward_func/std": 0.0,
9290
+ "rewards/quality_reward_func/mean": 0.800000011920929,
9291
+ "rewards/quality_reward_func/std": 0.0,
9292
+ "step": 3570
9293
+ },
9294
+ {
9295
+ "completion_length": 20.6,
9296
+ "completions/clipped_ratio": 0.0,
9297
+ "completions/max_length": 20.6,
9298
+ "completions/max_terminated_length": 20.6,
9299
+ "completions/mean_length": 17.4,
9300
+ "completions/mean_terminated_length": 17.4,
9301
+ "completions/min_length": 15.7,
9302
+ "completions/min_terminated_length": 15.7,
9303
+ "epoch": 0.24614961496149615,
9304
+ "frac_reward_zero_std": 1.0,
9305
+ "grad_norm": 0.0,
9306
+ "kl": 1.1931571021676064,
9307
+ "learning_rate": 4.682108054776741e-06,
9308
+ "loss": 0.0,
9309
+ "num_tokens": 5118863.0,
9310
+ "reward": 4.099999904632568,
9311
+ "reward_std": 0.0,
9312
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
9313
+ "rewards/coherence_reward_func/std": 0.0,
9314
+ "rewards/formatting_reward_func/mean": 2.0,
9315
+ "rewards/formatting_reward_func/std": 0.0,
9316
+ "rewards/quality_reward_func/mean": 0.800000011920929,
9317
+ "rewards/quality_reward_func/std": 0.0,
9318
+ "step": 3580
9319
+ },
9320
+ {
9321
+ "completion_length": 20.5,
9322
+ "completions/clipped_ratio": 0.0,
9323
+ "completions/max_length": 20.5,
9324
+ "completions/max_terminated_length": 20.5,
9325
+ "completions/mean_length": 18.25,
9326
+ "completions/mean_terminated_length": 18.25,
9327
+ "completions/min_length": 16.4,
9328
+ "completions/min_terminated_length": 16.4,
9329
+ "epoch": 0.24683718371837185,
9330
+ "frac_reward_zero_std": 1.0,
9331
+ "grad_norm": 0.0,
9332
+ "kl": 1.1943508870899677,
9333
+ "learning_rate": 4.67917355097246e-06,
9334
+ "loss": 0.0,
9335
+ "num_tokens": 5132977.0,
9336
+ "reward": 4.099999904632568,
9337
+ "reward_std": 0.0,
9338
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
9339
+ "rewards/coherence_reward_func/std": 0.0,
9340
+ "rewards/formatting_reward_func/mean": 2.0,
9341
+ "rewards/formatting_reward_func/std": 0.0,
9342
+ "rewards/quality_reward_func/mean": 0.800000011920929,
9343
+ "rewards/quality_reward_func/std": 0.0,
9344
+ "step": 3590
9345
+ },
9346
+ {
9347
+ "completion_length": 20.2,
9348
+ "completions/clipped_ratio": 0.0,
9349
+ "completions/max_length": 20.2,
9350
+ "completions/max_terminated_length": 20.2,
9351
+ "completions/mean_length": 17.9,
9352
+ "completions/mean_terminated_length": 17.9,
9353
+ "completions/min_length": 16.0,
9354
+ "completions/min_terminated_length": 16.0,
9355
+ "epoch": 0.24752475247524752,
9356
+ "frac_reward_zero_std": 1.0,
9357
+ "grad_norm": 0.0,
9358
+ "kl": 1.3834453955292703,
9359
+ "learning_rate": 4.676226493275239e-06,
9360
+ "loss": 0.0001,
9361
+ "num_tokens": 5146825.0,
9362
+ "reward": 4.099999904632568,
9363
+ "reward_std": 0.0,
9364
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
9365
+ "rewards/coherence_reward_func/std": 0.0,
9366
+ "rewards/formatting_reward_func/mean": 2.0,
9367
+ "rewards/formatting_reward_func/std": 0.0,
9368
+ "rewards/quality_reward_func/mean": 0.800000011920929,
9369
+ "rewards/quality_reward_func/std": 0.0,
9370
+ "step": 3600
9371
+ },
9372
+ {
9373
+ "completion_length": 21.6,
9374
+ "completions/clipped_ratio": 0.0,
9375
+ "completions/max_length": 21.6,
9376
+ "completions/max_terminated_length": 21.6,
9377
+ "completions/mean_length": 18.975,
9378
+ "completions/mean_terminated_length": 18.975,
9379
+ "completions/min_length": 16.3,
9380
+ "completions/min_terminated_length": 16.3,
9381
+ "epoch": 0.24821232123212322,
9382
+ "frac_reward_zero_std": 0.9,
9383
+ "grad_norm": 0.0,
9384
+ "kl": 1.411560659110546,
9385
+ "learning_rate": 4.673266898662637e-06,
9386
+ "loss": 0.0001,
9387
+ "num_tokens": 5161888.0,
9388
+ "reward": 4.092499876022339,
9389
+ "reward_std": 0.015000002086162567,
9390
+ "rewards/coherence_reward_func/mean": 1.2924999475479126,
9391
+ "rewards/coherence_reward_func/std": 0.01499999761581421,
9392
+ "rewards/formatting_reward_func/mean": 2.0,
9393
+ "rewards/formatting_reward_func/std": 0.0,
9394
+ "rewards/quality_reward_func/mean": 0.800000011920929,
9395
+ "rewards/quality_reward_func/std": 0.0,
9396
+ "step": 3610
9397
+ },
9398
+ {
9399
+ "completion_length": 18.6,
9400
+ "completions/clipped_ratio": 0.0,
9401
+ "completions/max_length": 18.6,
9402
+ "completions/max_terminated_length": 18.6,
9403
+ "completions/mean_length": 16.7,
9404
+ "completions/mean_terminated_length": 16.7,
9405
+ "completions/min_length": 15.6,
9406
+ "completions/min_terminated_length": 15.6,
9407
+ "epoch": 0.2488998899889989,
9408
+ "frac_reward_zero_std": 1.0,
9409
+ "grad_norm": 0.0,
9410
+ "kl": 1.4795636057853698,
9411
+ "learning_rate": 4.670294784184436e-06,
9412
+ "loss": 0.0,
9413
+ "num_tokens": 5176032.0,
9414
+ "reward": 4.099999904632568,
9415
+ "reward_std": 0.0,
9416
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
9417
+ "rewards/coherence_reward_func/std": 0.0,
9418
+ "rewards/formatting_reward_func/mean": 2.0,
9419
+ "rewards/formatting_reward_func/std": 0.0,
9420
+ "rewards/quality_reward_func/mean": 0.800000011920929,
9421
+ "rewards/quality_reward_func/std": 0.0,
9422
+ "step": 3620
9423
+ },
9424
+ {
9425
+ "completion_length": 18.7,
9426
+ "completions/clipped_ratio": 0.0,
9427
+ "completions/max_length": 18.7,
9428
+ "completions/max_terminated_length": 18.7,
9429
+ "completions/mean_length": 16.625,
9430
+ "completions/mean_terminated_length": 16.625,
9431
+ "completions/min_length": 15.3,
9432
+ "completions/min_terminated_length": 15.3,
9433
+ "epoch": 0.2495874587458746,
9434
+ "frac_reward_zero_std": 1.0,
9435
+ "grad_norm": 0.0,
9436
+ "kl": 1.2247574172914029,
9437
+ "learning_rate": 4.6673101669625445e-06,
9438
+ "loss": 0.0,
9439
+ "num_tokens": 5190661.0,
9440
+ "reward": 4.099999904632568,
9441
+ "reward_std": 0.0,
9442
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
9443
+ "rewards/coherence_reward_func/std": 0.0,
9444
+ "rewards/formatting_reward_func/mean": 2.0,
9445
+ "rewards/formatting_reward_func/std": 0.0,
9446
+ "rewards/quality_reward_func/mean": 0.800000011920929,
9447
+ "rewards/quality_reward_func/std": 0.0,
9448
+ "step": 3630
9449
+ },
9450
+ {
9451
+ "completion_length": 19.5,
9452
+ "completions/clipped_ratio": 0.0,
9453
+ "completions/max_length": 19.5,
9454
+ "completions/max_terminated_length": 19.5,
9455
+ "completions/mean_length": 17.95,
9456
+ "completions/mean_terminated_length": 17.95,
9457
+ "completions/min_length": 15.9,
9458
+ "completions/min_terminated_length": 15.9,
9459
+ "epoch": 0.25027502750275027,
9460
+ "frac_reward_zero_std": 1.0,
9461
+ "grad_norm": 0.0,
9462
+ "kl": 1.3278845094144345,
9463
+ "learning_rate": 4.664313064190893e-06,
9464
+ "loss": 0.0,
9465
+ "num_tokens": 5206219.0,
9466
+ "reward": 4.099999904632568,
9467
+ "reward_std": 0.0,
9468
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
9469
+ "rewards/coherence_reward_func/std": 0.0,
9470
+ "rewards/formatting_reward_func/mean": 2.0,
9471
+ "rewards/formatting_reward_func/std": 0.0,
9472
+ "rewards/quality_reward_func/mean": 0.800000011920929,
9473
+ "rewards/quality_reward_func/std": 0.0,
9474
+ "step": 3640
9475
+ },
9476
+ {
9477
+ "completion_length": 17.8,
9478
+ "completions/clipped_ratio": 0.0,
9479
+ "completions/max_length": 17.8,
9480
+ "completions/max_terminated_length": 17.8,
9481
+ "completions/mean_length": 16.575,
9482
+ "completions/mean_terminated_length": 16.575,
9483
+ "completions/min_length": 15.8,
9484
+ "completions/min_terminated_length": 15.8,
9485
+ "epoch": 0.25096259625962597,
9486
+ "frac_reward_zero_std": 1.0,
9487
+ "grad_norm": 0.0,
9488
+ "kl": 1.3263515307568015,
9489
+ "learning_rate": 4.6613034931353445e-06,
9490
+ "loss": 0.0,
9491
+ "num_tokens": 5217886.0,
9492
+ "reward": 4.099999904632568,
9493
+ "reward_std": 0.0,
9494
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
9495
+ "rewards/coherence_reward_func/std": 0.0,
9496
+ "rewards/formatting_reward_func/mean": 2.0,
9497
+ "rewards/formatting_reward_func/std": 0.0,
9498
+ "rewards/quality_reward_func/mean": 0.800000011920929,
9499
+ "rewards/quality_reward_func/std": 0.0,
9500
+ "step": 3650
9501
+ },
9502
+ {
9503
+ "completion_length": 17.7,
9504
+ "completions/clipped_ratio": 0.0,
9505
+ "completions/max_length": 17.7,
9506
+ "completions/max_terminated_length": 17.7,
9507
+ "completions/mean_length": 16.8,
9508
+ "completions/mean_terminated_length": 16.8,
9509
+ "completions/min_length": 16.1,
9510
+ "completions/min_terminated_length": 16.1,
9511
+ "epoch": 0.25165016501650167,
9512
+ "frac_reward_zero_std": 1.0,
9513
+ "grad_norm": 0.0,
9514
+ "kl": 1.2702551379799842,
9515
+ "learning_rate": 4.6582814711335874e-06,
9516
+ "loss": 0.0,
9517
+ "num_tokens": 5229738.0,
9518
+ "reward": 4.099999904632568,
9519
+ "reward_std": 0.0,
9520
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
9521
+ "rewards/coherence_reward_func/std": 0.0,
9522
+ "rewards/formatting_reward_func/mean": 2.0,
9523
+ "rewards/formatting_reward_func/std": 0.0,
9524
+ "rewards/quality_reward_func/mean": 0.800000011920929,
9525
+ "rewards/quality_reward_func/std": 0.0,
9526
+ "step": 3660
9527
+ },
9528
+ {
9529
+ "completion_length": 24.7,
9530
+ "completions/clipped_ratio": 0.0,
9531
+ "completions/max_length": 24.7,
9532
+ "completions/max_terminated_length": 24.7,
9533
+ "completions/mean_length": 21.6,
9534
+ "completions/mean_terminated_length": 21.6,
9535
+ "completions/min_length": 19.0,
9536
+ "completions/min_terminated_length": 19.0,
9537
+ "epoch": 0.2523377337733773,
9538
+ "frac_reward_zero_std": 1.0,
9539
+ "grad_norm": 0.0,
9540
+ "kl": 1.0552857838571073,
9541
+ "learning_rate": 4.655247015595039e-06,
9542
+ "loss": 0.0,
9543
+ "num_tokens": 5244126.0,
9544
+ "reward": 4.099999904632568,
9545
+ "reward_std": 0.0,
9546
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
9547
+ "rewards/coherence_reward_func/std": 0.0,
9548
+ "rewards/formatting_reward_func/mean": 2.0,
9549
+ "rewards/formatting_reward_func/std": 0.0,
9550
+ "rewards/quality_reward_func/mean": 0.800000011920929,
9551
+ "rewards/quality_reward_func/std": 0.0,
9552
+ "step": 3670
9553
+ },
9554
+ {
9555
+ "completion_length": 18.2,
9556
+ "completions/clipped_ratio": 0.0,
9557
+ "completions/max_length": 18.2,
9558
+ "completions/max_terminated_length": 18.2,
9559
+ "completions/mean_length": 15.95,
9560
+ "completions/mean_terminated_length": 15.95,
9561
+ "completions/min_length": 13.5,
9562
+ "completions/min_terminated_length": 13.5,
9563
+ "epoch": 0.253025302530253,
9564
+ "frac_reward_zero_std": 0.9,
9565
+ "grad_norm": 0.0,
9566
+ "kl": 41.09145687818527,
9567
+ "learning_rate": 4.652200144000743e-06,
9568
+ "loss": 0.0017,
9569
+ "num_tokens": 5258988.0,
9570
+ "reward": 3.8949999094009398,
9571
+ "reward_std": 0.23671360015869142,
9572
+ "rewards/coherence_reward_func/mean": 1.23499995470047,
9573
+ "rewards/coherence_reward_func/std": 0.07505553364753723,
9574
+ "rewards/formatting_reward_func/mean": 1.9,
9575
+ "rewards/formatting_reward_func/std": 0.1154700517654419,
9576
+ "rewards/quality_reward_func/mean": 0.7600000113248825,
9577
+ "rewards/quality_reward_func/std": 0.046188023686408994,
9578
+ "step": 3680
9579
+ },
9580
+ {
9581
+ "completion_length": 21.3,
9582
+ "completions/clipped_ratio": 0.0,
9583
+ "completions/max_length": 21.3,
9584
+ "completions/max_terminated_length": 21.3,
9585
+ "completions/mean_length": 19.175,
9586
+ "completions/mean_terminated_length": 19.175,
9587
+ "completions/min_length": 17.4,
9588
+ "completions/min_terminated_length": 17.4,
9589
+ "epoch": 0.2537128712871287,
9590
+ "frac_reward_zero_std": 1.0,
9591
+ "grad_norm": 0.0,
9592
+ "kl": 1.4594970896840096,
9593
+ "learning_rate": 4.6491408739032705e-06,
9594
+ "loss": 0.0001,
9595
+ "num_tokens": 5273603.0,
9596
+ "reward": 4.099999904632568,
9597
+ "reward_std": 0.0,
9598
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
9599
+ "rewards/coherence_reward_func/std": 0.0,
9600
+ "rewards/formatting_reward_func/mean": 2.0,
9601
+ "rewards/formatting_reward_func/std": 0.0,
9602
+ "rewards/quality_reward_func/mean": 0.800000011920929,
9603
+ "rewards/quality_reward_func/std": 0.0,
9604
+ "step": 3690
9605
+ },
9606
+ {
9607
+ "completion_length": 22.5,
9608
+ "completions/clipped_ratio": 0.0,
9609
+ "completions/max_length": 22.5,
9610
+ "completions/max_terminated_length": 22.5,
9611
+ "completions/mean_length": 19.25,
9612
+ "completions/mean_terminated_length": 19.25,
9613
+ "completions/min_length": 17.3,
9614
+ "completions/min_terminated_length": 17.3,
9615
+ "epoch": 0.2544004400440044,
9616
+ "frac_reward_zero_std": 1.0,
9617
+ "grad_norm": 0.0,
9618
+ "kl": 1.1064071744680404,
9619
+ "learning_rate": 4.64606922292662e-06,
9620
+ "loss": 0.0,
9621
+ "num_tokens": 5288777.0,
9622
+ "reward": 4.099999904632568,
9623
+ "reward_std": 0.0,
9624
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
9625
+ "rewards/coherence_reward_func/std": 0.0,
9626
+ "rewards/formatting_reward_func/mean": 2.0,
9627
+ "rewards/formatting_reward_func/std": 0.0,
9628
+ "rewards/quality_reward_func/mean": 0.800000011920929,
9629
+ "rewards/quality_reward_func/std": 0.0,
9630
+ "step": 3700
9631
+ },
9632
+ {
9633
+ "completion_length": 17.1,
9634
+ "completions/clipped_ratio": 0.0,
9635
+ "completions/max_length": 17.1,
9636
+ "completions/max_terminated_length": 17.1,
9637
+ "completions/mean_length": 16.55,
9638
+ "completions/mean_terminated_length": 16.55,
9639
+ "completions/min_length": 15.9,
9640
+ "completions/min_terminated_length": 15.9,
9641
+ "epoch": 0.25508800880088006,
9642
+ "frac_reward_zero_std": 1.0,
9643
+ "grad_norm": 0.0,
9644
+ "kl": 1.3959959626197815,
9645
+ "learning_rate": 4.642985208766113e-06,
9646
+ "loss": 0.0,
9647
+ "num_tokens": 5300959.0,
9648
+ "reward": 4.099999904632568,
9649
+ "reward_std": 0.0,
9650
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
9651
+ "rewards/coherence_reward_func/std": 0.0,
9652
+ "rewards/formatting_reward_func/mean": 2.0,
9653
+ "rewards/formatting_reward_func/std": 0.0,
9654
+ "rewards/quality_reward_func/mean": 0.800000011920929,
9655
+ "rewards/quality_reward_func/std": 0.0,
9656
+ "step": 3710
9657
+ },
9658
+ {
9659
+ "completion_length": 17.6,
9660
+ "completions/clipped_ratio": 0.0,
9661
+ "completions/max_length": 17.6,
9662
+ "completions/max_terminated_length": 17.6,
9663
+ "completions/mean_length": 16.525,
9664
+ "completions/mean_terminated_length": 16.525,
9665
+ "completions/min_length": 15.4,
9666
+ "completions/min_terminated_length": 15.4,
9667
+ "epoch": 0.25577557755775576,
9668
+ "frac_reward_zero_std": 1.0,
9669
+ "grad_norm": 0.0,
9670
+ "kl": 1.3369979746639729,
9671
+ "learning_rate": 4.639888849188295e-06,
9672
+ "loss": 0.0,
9673
+ "num_tokens": 5314908.0,
9674
+ "reward": 4.099999904632568,
9675
+ "reward_std": 0.0,
9676
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
9677
+ "rewards/coherence_reward_func/std": 0.0,
9678
+ "rewards/formatting_reward_func/mean": 2.0,
9679
+ "rewards/formatting_reward_func/std": 0.0,
9680
+ "rewards/quality_reward_func/mean": 0.800000011920929,
9681
+ "rewards/quality_reward_func/std": 0.0,
9682
+ "step": 3720
9683
+ },
9684
+ {
9685
+ "completion_length": 17.5,
9686
+ "completions/clipped_ratio": 0.0,
9687
+ "completions/max_length": 17.5,
9688
+ "completions/max_terminated_length": 17.5,
9689
+ "completions/mean_length": 16.725,
9690
+ "completions/mean_terminated_length": 16.725,
9691
+ "completions/min_length": 16.2,
9692
+ "completions/min_terminated_length": 16.2,
9693
+ "epoch": 0.25646314631463146,
9694
+ "frac_reward_zero_std": 1.0,
9695
+ "grad_norm": 0.0,
9696
+ "kl": 1.4185206890106201,
9697
+ "learning_rate": 4.6367801620308295e-06,
9698
+ "loss": 0.0,
9699
+ "num_tokens": 5327609.0,
9700
+ "reward": 4.099999904632568,
9701
+ "reward_std": 0.0,
9702
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
9703
+ "rewards/coherence_reward_func/std": 0.0,
9704
+ "rewards/formatting_reward_func/mean": 2.0,
9705
+ "rewards/formatting_reward_func/std": 0.0,
9706
+ "rewards/quality_reward_func/mean": 0.800000011920929,
9707
+ "rewards/quality_reward_func/std": 0.0,
9708
+ "step": 3730
9709
+ },
9710
+ {
9711
+ "completion_length": 19.5,
9712
+ "completions/clipped_ratio": 0.0,
9713
+ "completions/max_length": 19.5,
9714
+ "completions/max_terminated_length": 19.5,
9715
+ "completions/mean_length": 17.675,
9716
+ "completions/mean_terminated_length": 17.675,
9717
+ "completions/min_length": 15.8,
9718
+ "completions/min_terminated_length": 15.8,
9719
+ "epoch": 0.25715071507150716,
9720
+ "frac_reward_zero_std": 1.0,
9721
+ "grad_norm": 0.0,
9722
+ "kl": 1.2852225728332995,
9723
+ "learning_rate": 4.633659165202398e-06,
9724
+ "loss": 0.0,
9725
+ "num_tokens": 5341592.0,
9726
+ "reward": 4.099999904632568,
9727
+ "reward_std": 0.0,
9728
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
9729
+ "rewards/coherence_reward_func/std": 0.0,
9730
+ "rewards/formatting_reward_func/mean": 2.0,
9731
+ "rewards/formatting_reward_func/std": 0.0,
9732
+ "rewards/quality_reward_func/mean": 0.800000011920929,
9733
+ "rewards/quality_reward_func/std": 0.0,
9734
+ "step": 3740
9735
+ },
9736
+ {
9737
+ "completion_length": 19.8,
9738
+ "completions/clipped_ratio": 0.0,
9739
+ "completions/max_length": 19.8,
9740
+ "completions/max_terminated_length": 19.8,
9741
+ "completions/mean_length": 16.9,
9742
+ "completions/mean_terminated_length": 16.9,
9743
+ "completions/min_length": 14.8,
9744
+ "completions/min_terminated_length": 14.8,
9745
+ "epoch": 0.25783828382838286,
9746
+ "frac_reward_zero_std": 1.0,
9747
+ "grad_norm": 0.0,
9748
+ "kl": 1.2239439487457275,
9749
+ "learning_rate": 4.630525876682597e-06,
9750
+ "loss": 0.0,
9751
+ "num_tokens": 5353784.0,
9752
+ "reward": 4.099999904632568,
9753
+ "reward_std": 0.0,
9754
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
9755
+ "rewards/coherence_reward_func/std": 0.0,
9756
+ "rewards/formatting_reward_func/mean": 2.0,
9757
+ "rewards/formatting_reward_func/std": 0.0,
9758
+ "rewards/quality_reward_func/mean": 0.800000011920929,
9759
+ "rewards/quality_reward_func/std": 0.0,
9760
+ "step": 3750
9761
+ },
9762
+ {
9763
+ "completion_length": 18.2,
9764
+ "completions/clipped_ratio": 0.0,
9765
+ "completions/max_length": 18.2,
9766
+ "completions/max_terminated_length": 18.2,
9767
+ "completions/mean_length": 16.825,
9768
+ "completions/mean_terminated_length": 16.825,
9769
+ "completions/min_length": 15.7,
9770
+ "completions/min_terminated_length": 15.7,
9771
+ "epoch": 0.2585258525852585,
9772
+ "frac_reward_zero_std": 1.0,
9773
+ "grad_norm": 0.0,
9774
+ "kl": 1.3233575984835624,
9775
+ "learning_rate": 4.627380314521833e-06,
9776
+ "loss": 0.0,
9777
+ "num_tokens": 5366529.0,
9778
+ "reward": 4.099999904632568,
9779
+ "reward_std": 0.0,
9780
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
9781
+ "rewards/coherence_reward_func/std": 0.0,
9782
+ "rewards/formatting_reward_func/mean": 2.0,
9783
+ "rewards/formatting_reward_func/std": 0.0,
9784
+ "rewards/quality_reward_func/mean": 0.800000011920929,
9785
+ "rewards/quality_reward_func/std": 0.0,
9786
+ "step": 3760
9787
+ },
9788
+ {
9789
+ "completion_length": 19.3,
9790
+ "completions/clipped_ratio": 0.0,
9791
+ "completions/max_length": 19.3,
9792
+ "completions/max_terminated_length": 19.3,
9793
+ "completions/mean_length": 17.6,
9794
+ "completions/mean_terminated_length": 17.6,
9795
+ "completions/min_length": 16.1,
9796
+ "completions/min_terminated_length": 16.1,
9797
+ "epoch": 0.2592134213421342,
9798
+ "frac_reward_zero_std": 1.0,
9799
+ "grad_norm": 0.0,
9800
+ "kl": 1.4207659110426902,
9801
+ "learning_rate": 4.624222496841219e-06,
9802
+ "loss": 0.0001,
9803
+ "num_tokens": 5380945.0,
9804
+ "reward": 4.099999904632568,
9805
+ "reward_std": 0.0,
9806
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
9807
+ "rewards/coherence_reward_func/std": 0.0,
9808
+ "rewards/formatting_reward_func/mean": 2.0,
9809
+ "rewards/formatting_reward_func/std": 0.0,
9810
+ "rewards/quality_reward_func/mean": 0.800000011920929,
9811
+ "rewards/quality_reward_func/std": 0.0,
9812
+ "step": 3770
9813
+ },
9814
+ {
9815
+ "completion_length": 19.8,
9816
+ "completions/clipped_ratio": 0.0,
9817
+ "completions/max_length": 19.8,
9818
+ "completions/max_terminated_length": 19.8,
9819
+ "completions/mean_length": 17.35,
9820
+ "completions/mean_terminated_length": 17.35,
9821
+ "completions/min_length": 15.0,
9822
+ "completions/min_terminated_length": 15.0,
9823
+ "epoch": 0.2599009900990099,
9824
+ "frac_reward_zero_std": 1.0,
9825
+ "grad_norm": 0.0,
9826
+ "kl": 1.4342102020978929,
9827
+ "learning_rate": 4.621052441832471e-06,
9828
+ "loss": 0.0001,
9829
+ "num_tokens": 5395375.0,
9830
+ "reward": 4.099999904632568,
9831
+ "reward_std": 0.0,
9832
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
9833
+ "rewards/coherence_reward_func/std": 0.0,
9834
+ "rewards/formatting_reward_func/mean": 2.0,
9835
+ "rewards/formatting_reward_func/std": 0.0,
9836
+ "rewards/quality_reward_func/mean": 0.800000011920929,
9837
+ "rewards/quality_reward_func/std": 0.0,
9838
+ "step": 3780
9839
+ },
9840
+ {
9841
+ "completion_length": 17.8,
9842
+ "completions/clipped_ratio": 0.0,
9843
+ "completions/max_length": 17.8,
9844
+ "completions/max_terminated_length": 17.8,
9845
+ "completions/mean_length": 16.8,
9846
+ "completions/mean_terminated_length": 16.8,
9847
+ "completions/min_length": 15.2,
9848
+ "completions/min_terminated_length": 15.2,
9849
+ "epoch": 0.2605885588558856,
9850
+ "frac_reward_zero_std": 1.0,
9851
+ "grad_norm": 0.0,
9852
+ "kl": 1.3788485825061798,
9853
+ "learning_rate": 4.617870167757801e-06,
9854
+ "loss": 0.0,
9855
+ "num_tokens": 5410043.0,
9856
+ "reward": 4.099999904632568,
9857
+ "reward_std": 0.0,
9858
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
9859
+ "rewards/coherence_reward_func/std": 0.0,
9860
+ "rewards/formatting_reward_func/mean": 2.0,
9861
+ "rewards/formatting_reward_func/std": 0.0,
9862
+ "rewards/quality_reward_func/mean": 0.800000011920929,
9863
+ "rewards/quality_reward_func/std": 0.0,
9864
+ "step": 3790
9865
+ },
9866
+ {
9867
+ "completion_length": 20.0,
9868
+ "completions/clipped_ratio": 0.0,
9869
+ "completions/max_length": 20.0,
9870
+ "completions/max_terminated_length": 20.0,
9871
+ "completions/mean_length": 17.825,
9872
+ "completions/mean_terminated_length": 17.825,
9873
+ "completions/min_length": 16.4,
9874
+ "completions/min_terminated_length": 16.4,
9875
+ "epoch": 0.26127612761276126,
9876
+ "frac_reward_zero_std": 1.0,
9877
+ "grad_norm": 0.0,
9878
+ "kl": 1.3054631665349006,
9879
+ "learning_rate": 4.614675692949815e-06,
9880
+ "loss": 0.0001,
9881
+ "num_tokens": 5423164.0,
9882
+ "reward": 4.099999904632568,
9883
+ "reward_std": 0.0,
9884
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
9885
+ "rewards/coherence_reward_func/std": 0.0,
9886
+ "rewards/formatting_reward_func/mean": 2.0,
9887
+ "rewards/formatting_reward_func/std": 0.0,
9888
+ "rewards/quality_reward_func/mean": 0.800000011920929,
9889
+ "rewards/quality_reward_func/std": 0.0,
9890
+ "step": 3800
9891
+ },
9892
+ {
9893
+ "completion_length": 19.6,
9894
+ "completions/clipped_ratio": 0.0,
9895
+ "completions/max_length": 19.6,
9896
+ "completions/max_terminated_length": 19.6,
9897
+ "completions/mean_length": 16.875,
9898
+ "completions/mean_terminated_length": 16.875,
9899
+ "completions/min_length": 15.2,
9900
+ "completions/min_terminated_length": 15.2,
9901
+ "epoch": 0.26196369636963696,
9902
+ "frac_reward_zero_std": 0.9,
9903
+ "grad_norm": 0.0,
9904
+ "kl": 1.2541000019758939,
9905
+ "learning_rate": 4.611469035811404e-06,
9906
+ "loss": 0.0,
9907
+ "num_tokens": 5437159.0,
9908
+ "reward": 3.792499911785126,
9909
+ "reward_std": 0.20499999523162843,
9910
+ "rewards/coherence_reward_func/mean": 1.2024999558925629,
9911
+ "rewards/coherence_reward_func/std": 0.06499999761581421,
9912
+ "rewards/formatting_reward_func/mean": 1.85,
9913
+ "rewards/formatting_reward_func/std": 0.1,
9914
+ "rewards/quality_reward_func/mean": 0.7400000110268593,
9915
+ "rewards/quality_reward_func/std": 0.04000000059604645,
9916
+ "step": 3810
9917
+ },
9918
+ {
9919
+ "completion_length": 19.6,
9920
+ "completions/clipped_ratio": 0.0,
9921
+ "completions/max_length": 19.6,
9922
+ "completions/max_terminated_length": 19.6,
9923
+ "completions/mean_length": 17.125,
9924
+ "completions/mean_terminated_length": 17.125,
9925
+ "completions/min_length": 15.7,
9926
+ "completions/min_terminated_length": 15.7,
9927
+ "epoch": 0.26265126512651266,
9928
+ "frac_reward_zero_std": 1.0,
9929
+ "grad_norm": 0.0,
9930
+ "kl": 1.126739951223135,
9931
+ "learning_rate": 4.60825021481564e-06,
9932
+ "loss": 0.0,
9933
+ "num_tokens": 5451712.0,
9934
+ "reward": 4.099999904632568,
9935
+ "reward_std": 0.0,
9936
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
9937
+ "rewards/coherence_reward_func/std": 0.0,
9938
+ "rewards/formatting_reward_func/mean": 2.0,
9939
+ "rewards/formatting_reward_func/std": 0.0,
9940
+ "rewards/quality_reward_func/mean": 0.800000011920929,
9941
+ "rewards/quality_reward_func/std": 0.0,
9942
+ "step": 3820
9943
+ },
9944
+ {
9945
+ "completion_length": 19.6,
9946
+ "completions/clipped_ratio": 0.0,
9947
+ "completions/max_length": 19.6,
9948
+ "completions/max_terminated_length": 19.6,
9949
+ "completions/mean_length": 17.25,
9950
+ "completions/mean_terminated_length": 17.25,
9951
+ "completions/min_length": 16.2,
9952
+ "completions/min_terminated_length": 16.2,
9953
+ "epoch": 0.26333883388338836,
9954
+ "frac_reward_zero_std": 1.0,
9955
+ "grad_norm": 0.0,
9956
+ "kl": 1.021160862594843,
9957
+ "learning_rate": 4.60501924850567e-06,
9958
+ "loss": 0.0,
9959
+ "num_tokens": 5464550.0,
9960
+ "reward": 4.099999904632568,
9961
+ "reward_std": 0.0,
9962
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
9963
+ "rewards/coherence_reward_func/std": 0.0,
9964
+ "rewards/formatting_reward_func/mean": 2.0,
9965
+ "rewards/formatting_reward_func/std": 0.0,
9966
+ "rewards/quality_reward_func/mean": 0.800000011920929,
9967
+ "rewards/quality_reward_func/std": 0.0,
9968
+ "step": 3830
9969
+ },
9970
+ {
9971
+ "completion_length": 15.9,
9972
+ "completions/clipped_ratio": 0.0,
9973
+ "completions/max_length": 15.9,
9974
+ "completions/max_terminated_length": 15.9,
9975
+ "completions/mean_length": 15.15,
9976
+ "completions/mean_terminated_length": 15.15,
9977
+ "completions/min_length": 14.7,
9978
+ "completions/min_terminated_length": 14.7,
9979
+ "epoch": 0.264026402640264,
9980
+ "frac_reward_zero_std": 1.0,
9981
+ "grad_norm": 0.0,
9982
+ "kl": 1.45318810492754,
9983
+ "learning_rate": 4.601776155494607e-06,
9984
+ "loss": 0.0,
9985
+ "num_tokens": 5477840.0,
9986
+ "reward": 4.099999904632568,
9987
+ "reward_std": 0.0,
9988
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
9989
+ "rewards/coherence_reward_func/std": 0.0,
9990
+ "rewards/formatting_reward_func/mean": 2.0,
9991
+ "rewards/formatting_reward_func/std": 0.0,
9992
+ "rewards/quality_reward_func/mean": 0.800000011920929,
9993
+ "rewards/quality_reward_func/std": 0.0,
9994
+ "step": 3840
9995
+ },
9996
+ {
9997
+ "completion_length": 20.6,
9998
+ "completions/clipped_ratio": 0.0,
9999
+ "completions/max_length": 20.6,
10000
+ "completions/max_terminated_length": 20.6,
10001
+ "completions/mean_length": 17.3,
10002
+ "completions/mean_terminated_length": 17.3,
10003
+ "completions/min_length": 15.3,
10004
+ "completions/min_terminated_length": 15.3,
10005
+ "epoch": 0.2647139713971397,
10006
+ "frac_reward_zero_std": 1.0,
10007
+ "grad_norm": 0.0,
10008
+ "kl": 1.0578694000840188,
10009
+ "learning_rate": 4.5985209544654265e-06,
10010
+ "loss": 0.0,
10011
+ "num_tokens": 5491052.0,
10012
+ "reward": 4.099999904632568,
10013
+ "reward_std": 0.0,
10014
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
10015
+ "rewards/coherence_reward_func/std": 0.0,
10016
+ "rewards/formatting_reward_func/mean": 2.0,
10017
+ "rewards/formatting_reward_func/std": 0.0,
10018
+ "rewards/quality_reward_func/mean": 0.800000011920929,
10019
+ "rewards/quality_reward_func/std": 0.0,
10020
+ "step": 3850
10021
  }
10022
  ],
10023
  "logging_steps": 10,
10024
  "max_steps": 14544,
10025
+ "num_input_tokens_seen": 5491052,
10026
  "num_train_epochs": 1,
10027
  "save_steps": 50,
10028
  "stateful_callbacks": {
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:df21d5fa3ce640097f53fa1ff1e43994f82e13e3d3dd1a01db2aac8afbf571d9
3
  size 7057
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:42b42a64fa29ca47bc2e0aa39c0a6a5f4997b48e715b9026d691d0c0901ff35f
3
  size 7057