Arittro2 commited on
Commit
5a4a264
·
verified ·
1 Parent(s): acd0bbf

Upload folder using huggingface_hub

Browse files
Files changed (5) hide show
  1. adapter_model.safetensors +1 -1
  2. optimizer.pt +1 -1
  3. rng_state.pth +1 -1
  4. scheduler.pt +1 -1
  5. trainer_state.json +1173 -3
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:93fe08fda954a8d19235305b943a8e691ee131a7294b52f2b5fb23bd46716507
3
  size 262406656
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cd8422397958e38dfc54623833b9c42fbf84c2192234f78716993373edeb9c08
3
  size 262406656
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2eb3975da2fc0d9c7f4f4e9652c42e390d2d5a328fc9fe84b2a3069c7e905c7b
3
  size 122872331
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f6acdf097a44425d0cb4aa2435e670892fe147410ce2c6c5fefed2de4c9ef796
3
  size 122872331
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fcfe3a31388de42cdb8e2ba831ae4c3ef355515443e6afb9cf07cb38355f83c0
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b2e37a8994ae61da6b0a5cbf1dc8a1a1e4ca374128d672206c8b82cbdf6e4192
3
  size 14645
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:744586c5f7157d9fd0b219ae516b1d5cf715a6af929b7cd570b93b36b3eb4887
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6e0184609e0a634a7a19eed294044d17cbbacf15554dec1788c985d57897ec9e
3
  size 1465
trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.360973597359736,
6
  "eval_steps": 500,
7
- "global_step": 5250,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -13658,11 +13658,1181 @@
13658
  "rewards/quality_reward_func/mean": 0.800000011920929,
13659
  "rewards/quality_reward_func/std": 0.0,
13660
  "step": 5250
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13661
  }
13662
  ],
13663
  "logging_steps": 10,
13664
  "max_steps": 14544,
13665
- "num_input_tokens_seen": 7525094,
13666
  "num_train_epochs": 1,
13667
  "save_steps": 50,
13668
  "stateful_callbacks": {
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.3919141914191419,
6
  "eval_steps": 500,
7
+ "global_step": 5700,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
13658
  "rewards/quality_reward_func/mean": 0.800000011920929,
13659
  "rewards/quality_reward_func/std": 0.0,
13660
  "step": 5250
13661
+ },
13662
+ {
13663
+ "completion_length": 18.08955223880597,
13664
+ "completions/clipped_ratio": 0.0,
13665
+ "completions/max_length": 18.11764705882353,
13666
+ "completions/max_terminated_length": 18.11764705882353,
13667
+ "completions/mean_length": 16.794117647058822,
13668
+ "completions/mean_terminated_length": 16.794117647058822,
13669
+ "completions/min_length": 15.411764705882353,
13670
+ "completions/min_terminated_length": 15.411764705882353,
13671
+ "epoch": 0.36166116611661164,
13672
+ "frac_reward_zero_std": 1.0,
13673
+ "grad_norm": 0.0,
13674
+ "kl": 1.0812231904979963,
13675
+ "learning_rate": 4.028381566875773e-06,
13676
+ "loss": 0.0,
13677
+ "num_tokens": 7540044.0,
13678
+ "reward": 4.099999904632568,
13679
+ "reward_std": 0.0,
13680
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
13681
+ "rewards/coherence_reward_func/std": 0.0,
13682
+ "rewards/formatting_reward_func/mean": 2.0,
13683
+ "rewards/formatting_reward_func/std": 0.0,
13684
+ "rewards/quality_reward_func/mean": 0.800000011920929,
13685
+ "rewards/quality_reward_func/std": 0.0,
13686
+ "step": 5260
13687
+ },
13688
+ {
13689
+ "completion_length": 20.15,
13690
+ "completions/clipped_ratio": 0.0,
13691
+ "completions/max_length": 20.1,
13692
+ "completions/max_terminated_length": 20.1,
13693
+ "completions/mean_length": 18.375,
13694
+ "completions/mean_terminated_length": 18.375,
13695
+ "completions/min_length": 16.6,
13696
+ "completions/min_terminated_length": 16.6,
13697
+ "epoch": 0.36234873487348734,
13698
+ "frac_reward_zero_std": 1.0,
13699
+ "grad_norm": 0.0,
13700
+ "kl": 0.9661604385823012,
13701
+ "learning_rate": 4.02362866756573e-06,
13702
+ "loss": 0.0,
13703
+ "num_tokens": 7554587.0,
13704
+ "reward": 4.099999904632568,
13705
+ "reward_std": 0.0,
13706
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
13707
+ "rewards/coherence_reward_func/std": 0.0,
13708
+ "rewards/formatting_reward_func/mean": 2.0,
13709
+ "rewards/formatting_reward_func/std": 0.0,
13710
+ "rewards/quality_reward_func/mean": 0.800000011920929,
13711
+ "rewards/quality_reward_func/std": 0.0,
13712
+ "step": 5270
13713
+ },
13714
+ {
13715
+ "completion_length": 16.775,
13716
+ "completions/clipped_ratio": 0.0,
13717
+ "completions/max_length": 16.7,
13718
+ "completions/max_terminated_length": 16.7,
13719
+ "completions/mean_length": 16.075,
13720
+ "completions/mean_terminated_length": 16.075,
13721
+ "completions/min_length": 15.4,
13722
+ "completions/min_terminated_length": 15.4,
13723
+ "epoch": 0.36303630363036304,
13724
+ "frac_reward_zero_std": 1.0,
13725
+ "grad_norm": 0.0,
13726
+ "kl": 1.3903781726956368,
13727
+ "learning_rate": 4.018866990858785e-06,
13728
+ "loss": 0.0,
13729
+ "num_tokens": 7569714.0,
13730
+ "reward": 4.099999904632568,
13731
+ "reward_std": 0.0,
13732
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
13733
+ "rewards/coherence_reward_func/std": 0.0,
13734
+ "rewards/formatting_reward_func/mean": 2.0,
13735
+ "rewards/formatting_reward_func/std": 0.0,
13736
+ "rewards/quality_reward_func/mean": 0.800000011920929,
13737
+ "rewards/quality_reward_func/std": 0.0,
13738
+ "step": 5280
13739
+ },
13740
+ {
13741
+ "completion_length": 19.675,
13742
+ "completions/clipped_ratio": 0.0,
13743
+ "completions/max_length": 19.8,
13744
+ "completions/max_terminated_length": 19.8,
13745
+ "completions/mean_length": 18.15,
13746
+ "completions/mean_terminated_length": 18.15,
13747
+ "completions/min_length": 16.6,
13748
+ "completions/min_terminated_length": 16.6,
13749
+ "epoch": 0.36372387238723874,
13750
+ "frac_reward_zero_std": 1.0,
13751
+ "grad_norm": 0.0,
13752
+ "kl": 1.3349122866988181,
13753
+ "learning_rate": 4.014096564186248e-06,
13754
+ "loss": 0.0,
13755
+ "num_tokens": 7582848.0,
13756
+ "reward": 4.099999904632568,
13757
+ "reward_std": 0.0,
13758
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
13759
+ "rewards/coherence_reward_func/std": 0.0,
13760
+ "rewards/formatting_reward_func/mean": 2.0,
13761
+ "rewards/formatting_reward_func/std": 0.0,
13762
+ "rewards/quality_reward_func/mean": 0.800000011920929,
13763
+ "rewards/quality_reward_func/std": 0.0,
13764
+ "step": 5290
13765
+ },
13766
+ {
13767
+ "completion_length": 19.425,
13768
+ "completions/clipped_ratio": 0.0,
13769
+ "completions/max_length": 19.4,
13770
+ "completions/max_terminated_length": 19.4,
13771
+ "completions/mean_length": 17.575,
13772
+ "completions/mean_terminated_length": 17.575,
13773
+ "completions/min_length": 16.3,
13774
+ "completions/min_terminated_length": 16.3,
13775
+ "epoch": 0.3644114411441144,
13776
+ "frac_reward_zero_std": 1.0,
13777
+ "grad_norm": 0.0,
13778
+ "kl": 0.8336154259741306,
13779
+ "learning_rate": 4.009317415029832e-06,
13780
+ "loss": 0.0,
13781
+ "num_tokens": 7597619.0,
13782
+ "reward": 4.099999904632568,
13783
+ "reward_std": 0.0,
13784
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
13785
+ "rewards/coherence_reward_func/std": 0.0,
13786
+ "rewards/formatting_reward_func/mean": 2.0,
13787
+ "rewards/formatting_reward_func/std": 0.0,
13788
+ "rewards/quality_reward_func/mean": 0.800000011920929,
13789
+ "rewards/quality_reward_func/std": 0.0,
13790
+ "step": 5300
13791
+ },
13792
+ {
13793
+ "completion_length": 18.65,
13794
+ "completions/clipped_ratio": 0.0,
13795
+ "completions/max_length": 18.5,
13796
+ "completions/max_terminated_length": 18.5,
13797
+ "completions/mean_length": 17.85,
13798
+ "completions/mean_terminated_length": 17.85,
13799
+ "completions/min_length": 17.1,
13800
+ "completions/min_terminated_length": 17.1,
13801
+ "epoch": 0.3650990099009901,
13802
+ "frac_reward_zero_std": 1.0,
13803
+ "grad_norm": 0.0,
13804
+ "kl": 1.0214567624032498,
13805
+ "learning_rate": 4.004529570921501e-06,
13806
+ "loss": 0.0,
13807
+ "num_tokens": 7612549.0,
13808
+ "reward": 4.099999904632568,
13809
+ "reward_std": 0.0,
13810
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
13811
+ "rewards/coherence_reward_func/std": 0.0,
13812
+ "rewards/formatting_reward_func/mean": 2.0,
13813
+ "rewards/formatting_reward_func/std": 0.0,
13814
+ "rewards/quality_reward_func/mean": 0.800000011920929,
13815
+ "rewards/quality_reward_func/std": 0.0,
13816
+ "step": 5310
13817
+ },
13818
+ {
13819
+ "completion_length": 17.975,
13820
+ "completions/clipped_ratio": 0.0,
13821
+ "completions/max_length": 18.1,
13822
+ "completions/max_terminated_length": 18.1,
13823
+ "completions/mean_length": 17.15,
13824
+ "completions/mean_terminated_length": 17.15,
13825
+ "completions/min_length": 16.2,
13826
+ "completions/min_terminated_length": 16.2,
13827
+ "epoch": 0.3657865786578658,
13828
+ "frac_reward_zero_std": 1.0,
13829
+ "grad_norm": 0.0,
13830
+ "kl": 1.2231212853454054,
13831
+ "learning_rate": 3.99973305944331e-06,
13832
+ "loss": 0.0,
13833
+ "num_tokens": 7627539.0,
13834
+ "reward": 4.099999904632568,
13835
+ "reward_std": 0.0,
13836
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
13837
+ "rewards/coherence_reward_func/std": 0.0,
13838
+ "rewards/formatting_reward_func/mean": 2.0,
13839
+ "rewards/formatting_reward_func/std": 0.0,
13840
+ "rewards/quality_reward_func/mean": 0.800000011920929,
13841
+ "rewards/quality_reward_func/std": 0.0,
13842
+ "step": 5320
13843
+ },
13844
+ {
13845
+ "completion_length": 17.775,
13846
+ "completions/clipped_ratio": 0.0,
13847
+ "completions/max_length": 17.8,
13848
+ "completions/max_terminated_length": 17.8,
13849
+ "completions/mean_length": 17.175,
13850
+ "completions/mean_terminated_length": 17.175,
13851
+ "completions/min_length": 16.3,
13852
+ "completions/min_terminated_length": 16.3,
13853
+ "epoch": 0.3664741474147415,
13854
+ "frac_reward_zero_std": 1.0,
13855
+ "grad_norm": 0.0,
13856
+ "kl": 1.197108805179596,
13857
+ "learning_rate": 3.9949279082272425e-06,
13858
+ "loss": 0.0,
13859
+ "num_tokens": 7643738.0,
13860
+ "reward": 4.099999904632568,
13861
+ "reward_std": 0.0,
13862
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
13863
+ "rewards/coherence_reward_func/std": 0.0,
13864
+ "rewards/formatting_reward_func/mean": 2.0,
13865
+ "rewards/formatting_reward_func/std": 0.0,
13866
+ "rewards/quality_reward_func/mean": 0.800000011920929,
13867
+ "rewards/quality_reward_func/std": 0.0,
13868
+ "step": 5330
13869
+ },
13870
+ {
13871
+ "completion_length": 17.375,
13872
+ "completions/clipped_ratio": 0.0,
13873
+ "completions/max_length": 17.4,
13874
+ "completions/max_terminated_length": 17.4,
13875
+ "completions/mean_length": 15.925,
13876
+ "completions/mean_terminated_length": 15.925,
13877
+ "completions/min_length": 13.9,
13878
+ "completions/min_terminated_length": 13.9,
13879
+ "epoch": 0.36716171617161714,
13880
+ "frac_reward_zero_std": 1.0,
13881
+ "grad_norm": 0.0,
13882
+ "kl": 1.1159055039286614,
13883
+ "learning_rate": 3.9901141449550565e-06,
13884
+ "loss": 0.0,
13885
+ "num_tokens": 7658551.0,
13886
+ "reward": 4.099999904632568,
13887
+ "reward_std": 0.0,
13888
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
13889
+ "rewards/coherence_reward_func/std": 0.0,
13890
+ "rewards/formatting_reward_func/mean": 2.0,
13891
+ "rewards/formatting_reward_func/std": 0.0,
13892
+ "rewards/quality_reward_func/mean": 0.800000011920929,
13893
+ "rewards/quality_reward_func/std": 0.0,
13894
+ "step": 5340
13895
+ },
13896
+ {
13897
+ "completion_length": 17.075,
13898
+ "completions/clipped_ratio": 0.0,
13899
+ "completions/max_length": 16.9,
13900
+ "completions/max_terminated_length": 16.9,
13901
+ "completions/mean_length": 16.075,
13902
+ "completions/mean_terminated_length": 16.075,
13903
+ "completions/min_length": 15.6,
13904
+ "completions/min_terminated_length": 15.6,
13905
+ "epoch": 0.36784928492849284,
13906
+ "frac_reward_zero_std": 1.0,
13907
+ "grad_norm": 0.0,
13908
+ "kl": 1.0155922904610635,
13909
+ "learning_rate": 3.985291797358123e-06,
13910
+ "loss": 0.0,
13911
+ "num_tokens": 7671674.0,
13912
+ "reward": 4.099999904632568,
13913
+ "reward_std": 0.0,
13914
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
13915
+ "rewards/coherence_reward_func/std": 0.0,
13916
+ "rewards/formatting_reward_func/mean": 2.0,
13917
+ "rewards/formatting_reward_func/std": 0.0,
13918
+ "rewards/quality_reward_func/mean": 0.800000011920929,
13919
+ "rewards/quality_reward_func/std": 0.0,
13920
+ "step": 5350
13921
+ },
13922
+ {
13923
+ "completion_length": 16.825,
13924
+ "completions/clipped_ratio": 0.0,
13925
+ "completions/max_length": 17.0,
13926
+ "completions/max_terminated_length": 17.0,
13927
+ "completions/mean_length": 16.575,
13928
+ "completions/mean_terminated_length": 16.575,
13929
+ "completions/min_length": 16.1,
13930
+ "completions/min_terminated_length": 16.1,
13931
+ "epoch": 0.36853685368536854,
13932
+ "frac_reward_zero_std": 1.0,
13933
+ "grad_norm": 0.0,
13934
+ "kl": 1.2678054243326187,
13935
+ "learning_rate": 3.980460893217265e-06,
13936
+ "loss": 0.0,
13937
+ "num_tokens": 7684565.0,
13938
+ "reward": 4.099999904632568,
13939
+ "reward_std": 0.0,
13940
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
13941
+ "rewards/coherence_reward_func/std": 0.0,
13942
+ "rewards/formatting_reward_func/mean": 2.0,
13943
+ "rewards/formatting_reward_func/std": 0.0,
13944
+ "rewards/quality_reward_func/mean": 0.800000011920929,
13945
+ "rewards/quality_reward_func/std": 0.0,
13946
+ "step": 5360
13947
+ },
13948
+ {
13949
+ "completion_length": 17.975,
13950
+ "completions/clipped_ratio": 0.0,
13951
+ "completions/max_length": 17.8,
13952
+ "completions/max_terminated_length": 17.8,
13953
+ "completions/mean_length": 17.0,
13954
+ "completions/mean_terminated_length": 17.0,
13955
+ "completions/min_length": 16.1,
13956
+ "completions/min_terminated_length": 16.1,
13957
+ "epoch": 0.36922442244224424,
13958
+ "frac_reward_zero_std": 1.0,
13959
+ "grad_norm": 0.0,
13960
+ "kl": 1.2318198367953301,
13961
+ "learning_rate": 3.9756214603626e-06,
13962
+ "loss": 0.0,
13963
+ "num_tokens": 7698909.0,
13964
+ "reward": 4.099999904632568,
13965
+ "reward_std": 0.0,
13966
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
13967
+ "rewards/coherence_reward_func/std": 0.0,
13968
+ "rewards/formatting_reward_func/mean": 2.0,
13969
+ "rewards/formatting_reward_func/std": 0.0,
13970
+ "rewards/quality_reward_func/mean": 0.800000011920929,
13971
+ "rewards/quality_reward_func/std": 0.0,
13972
+ "step": 5370
13973
+ },
13974
+ {
13975
+ "completion_length": 18.95,
13976
+ "completions/clipped_ratio": 0.0,
13977
+ "completions/max_length": 19.0,
13978
+ "completions/max_terminated_length": 19.0,
13979
+ "completions/mean_length": 17.2,
13980
+ "completions/mean_terminated_length": 17.2,
13981
+ "completions/min_length": 15.4,
13982
+ "completions/min_terminated_length": 15.4,
13983
+ "epoch": 0.36991199119911994,
13984
+ "frac_reward_zero_std": 1.0,
13985
+ "grad_norm": 0.0,
13986
+ "kl": 1.3333981722593307,
13987
+ "learning_rate": 3.9707735266733735e-06,
13988
+ "loss": 0.0,
13989
+ "num_tokens": 7715133.0,
13990
+ "reward": 4.099999904632568,
13991
+ "reward_std": 0.0,
13992
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
13993
+ "rewards/coherence_reward_func/std": 0.0,
13994
+ "rewards/formatting_reward_func/mean": 2.0,
13995
+ "rewards/formatting_reward_func/std": 0.0,
13996
+ "rewards/quality_reward_func/mean": 0.800000011920929,
13997
+ "rewards/quality_reward_func/std": 0.0,
13998
+ "step": 5380
13999
+ },
14000
+ {
14001
+ "completion_length": 17.425,
14002
+ "completions/clipped_ratio": 0.0,
14003
+ "completions/max_length": 17.7,
14004
+ "completions/max_terminated_length": 17.7,
14005
+ "completions/mean_length": 16.8,
14006
+ "completions/mean_terminated_length": 16.8,
14007
+ "completions/min_length": 16.0,
14008
+ "completions/min_terminated_length": 16.0,
14009
+ "epoch": 0.3705995599559956,
14010
+ "frac_reward_zero_std": 1.0,
14011
+ "grad_norm": 0.0,
14012
+ "kl": 1.3391637369990348,
14013
+ "learning_rate": 3.965917120077811e-06,
14014
+ "loss": 0.0,
14015
+ "num_tokens": 7727317.0,
14016
+ "reward": 4.099999904632568,
14017
+ "reward_std": 0.0,
14018
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
14019
+ "rewards/coherence_reward_func/std": 0.0,
14020
+ "rewards/formatting_reward_func/mean": 2.0,
14021
+ "rewards/formatting_reward_func/std": 0.0,
14022
+ "rewards/quality_reward_func/mean": 0.800000011920929,
14023
+ "rewards/quality_reward_func/std": 0.0,
14024
+ "step": 5390
14025
+ },
14026
+ {
14027
+ "completion_length": 19.15,
14028
+ "completions/clipped_ratio": 0.0,
14029
+ "completions/max_length": 19.0,
14030
+ "completions/max_terminated_length": 19.0,
14031
+ "completions/mean_length": 16.925,
14032
+ "completions/mean_terminated_length": 16.925,
14033
+ "completions/min_length": 15.6,
14034
+ "completions/min_terminated_length": 15.6,
14035
+ "epoch": 0.3712871287128713,
14036
+ "frac_reward_zero_std": 1.0,
14037
+ "grad_norm": 0.0,
14038
+ "kl": 1.1109920389950276,
14039
+ "learning_rate": 3.961052268552941e-06,
14040
+ "loss": 0.0,
14041
+ "num_tokens": 7743642.0,
14042
+ "reward": 4.099999904632568,
14043
+ "reward_std": 0.0,
14044
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
14045
+ "rewards/coherence_reward_func/std": 0.0,
14046
+ "rewards/formatting_reward_func/mean": 2.0,
14047
+ "rewards/formatting_reward_func/std": 0.0,
14048
+ "rewards/quality_reward_func/mean": 0.800000011920929,
14049
+ "rewards/quality_reward_func/std": 0.0,
14050
+ "step": 5400
14051
+ },
14052
+ {
14053
+ "completion_length": 16.95,
14054
+ "completions/clipped_ratio": 0.0,
14055
+ "completions/max_length": 16.9,
14056
+ "completions/max_terminated_length": 16.9,
14057
+ "completions/mean_length": 15.875,
14058
+ "completions/mean_terminated_length": 15.875,
14059
+ "completions/min_length": 14.9,
14060
+ "completions/min_terminated_length": 14.9,
14061
+ "epoch": 0.371974697469747,
14062
+ "frac_reward_zero_std": 1.0,
14063
+ "grad_norm": 5.6203894928330556e-05,
14064
+ "kl": 1.363871442526579,
14065
+ "learning_rate": 3.956179000124447e-06,
14066
+ "loss": 0.0,
14067
+ "num_tokens": 7758365.0,
14068
+ "reward": 4.099999904632568,
14069
+ "reward_std": 0.0,
14070
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
14071
+ "rewards/coherence_reward_func/std": 0.0,
14072
+ "rewards/formatting_reward_func/mean": 2.0,
14073
+ "rewards/formatting_reward_func/std": 0.0,
14074
+ "rewards/quality_reward_func/mean": 0.800000011920929,
14075
+ "rewards/quality_reward_func/std": 0.0,
14076
+ "step": 5410
14077
+ },
14078
+ {
14079
+ "completion_length": 17.825,
14080
+ "completions/clipped_ratio": 0.0,
14081
+ "completions/max_length": 17.7,
14082
+ "completions/max_terminated_length": 17.7,
14083
+ "completions/mean_length": 16.15,
14084
+ "completions/mean_terminated_length": 16.15,
14085
+ "completions/min_length": 15.2,
14086
+ "completions/min_terminated_length": 15.2,
14087
+ "epoch": 0.3726622662266227,
14088
+ "frac_reward_zero_std": 1.0,
14089
+ "grad_norm": 0.0,
14090
+ "kl": 0.9931762866675854,
14091
+ "learning_rate": 3.9512973428665e-06,
14092
+ "loss": 0.0,
14093
+ "num_tokens": 7772323.0,
14094
+ "reward": 4.099999904632568,
14095
+ "reward_std": 0.0,
14096
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
14097
+ "rewards/coherence_reward_func/std": 0.0,
14098
+ "rewards/formatting_reward_func/mean": 2.0,
14099
+ "rewards/formatting_reward_func/std": 0.0,
14100
+ "rewards/quality_reward_func/mean": 0.800000011920929,
14101
+ "rewards/quality_reward_func/std": 0.0,
14102
+ "step": 5420
14103
+ },
14104
+ {
14105
+ "completion_length": 21.425,
14106
+ "completions/clipped_ratio": 0.0,
14107
+ "completions/max_length": 21.6,
14108
+ "completions/max_terminated_length": 21.6,
14109
+ "completions/mean_length": 18.825,
14110
+ "completions/mean_terminated_length": 18.825,
14111
+ "completions/min_length": 16.1,
14112
+ "completions/min_terminated_length": 16.1,
14113
+ "epoch": 0.37334983498349833,
14114
+ "frac_reward_zero_std": 1.0,
14115
+ "grad_norm": 0.0,
14116
+ "kl": 1.0012955855578185,
14117
+ "learning_rate": 3.946407324901598e-06,
14118
+ "loss": 0.0,
14119
+ "num_tokens": 7785692.0,
14120
+ "reward": 4.099999904632568,
14121
+ "reward_std": 0.0,
14122
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
14123
+ "rewards/coherence_reward_func/std": 0.0,
14124
+ "rewards/formatting_reward_func/mean": 2.0,
14125
+ "rewards/formatting_reward_func/std": 0.0,
14126
+ "rewards/quality_reward_func/mean": 0.800000011920929,
14127
+ "rewards/quality_reward_func/std": 0.0,
14128
+ "step": 5430
14129
+ },
14130
+ {
14131
+ "completion_length": 21.0,
14132
+ "completions/clipped_ratio": 0.0,
14133
+ "completions/max_length": 21.2,
14134
+ "completions/max_terminated_length": 21.2,
14135
+ "completions/mean_length": 17.95,
14136
+ "completions/mean_terminated_length": 17.95,
14137
+ "completions/min_length": 16.1,
14138
+ "completions/min_terminated_length": 16.1,
14139
+ "epoch": 0.37403740374037403,
14140
+ "frac_reward_zero_std": 1.0,
14141
+ "grad_norm": 0.0,
14142
+ "kl": 1.0526311319321393,
14143
+ "learning_rate": 3.941508974400401e-06,
14144
+ "loss": 0.0,
14145
+ "num_tokens": 7802662.0,
14146
+ "reward": 4.099999904632568,
14147
+ "reward_std": 0.0,
14148
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
14149
+ "rewards/coherence_reward_func/std": 0.0,
14150
+ "rewards/formatting_reward_func/mean": 2.0,
14151
+ "rewards/formatting_reward_func/std": 0.0,
14152
+ "rewards/quality_reward_func/mean": 0.800000011920929,
14153
+ "rewards/quality_reward_func/std": 0.0,
14154
+ "step": 5440
14155
+ },
14156
+ {
14157
+ "completion_length": 17.1,
14158
+ "completions/clipped_ratio": 0.0,
14159
+ "completions/max_length": 16.9,
14160
+ "completions/max_terminated_length": 16.9,
14161
+ "completions/mean_length": 16.575,
14162
+ "completions/mean_terminated_length": 16.575,
14163
+ "completions/min_length": 16.3,
14164
+ "completions/min_terminated_length": 16.3,
14165
+ "epoch": 0.37472497249724973,
14166
+ "frac_reward_zero_std": 1.0,
14167
+ "grad_norm": 0.0,
14168
+ "kl": 1.1798742283135653,
14169
+ "learning_rate": 3.9366023195815755e-06,
14170
+ "loss": 0.0,
14171
+ "num_tokens": 7817133.0,
14172
+ "reward": 4.099999904632568,
14173
+ "reward_std": 0.0,
14174
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
14175
+ "rewards/coherence_reward_func/std": 0.0,
14176
+ "rewards/formatting_reward_func/mean": 2.0,
14177
+ "rewards/formatting_reward_func/std": 0.0,
14178
+ "rewards/quality_reward_func/mean": 0.800000011920929,
14179
+ "rewards/quality_reward_func/std": 0.0,
14180
+ "step": 5450
14181
+ },
14182
+ {
14183
+ "completion_length": 19.2,
14184
+ "completions/clipped_ratio": 0.0,
14185
+ "completions/max_length": 19.3,
14186
+ "completions/max_terminated_length": 19.3,
14187
+ "completions/mean_length": 18.225,
14188
+ "completions/mean_terminated_length": 18.225,
14189
+ "completions/min_length": 16.6,
14190
+ "completions/min_terminated_length": 16.6,
14191
+ "epoch": 0.37541254125412543,
14192
+ "frac_reward_zero_std": 1.0,
14193
+ "grad_norm": 0.0,
14194
+ "kl": 1.3727002948522569,
14195
+ "learning_rate": 3.931687388711626e-06,
14196
+ "loss": 0.0001,
14197
+ "num_tokens": 7833654.0,
14198
+ "reward": 4.099999904632568,
14199
+ "reward_std": 0.0,
14200
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
14201
+ "rewards/coherence_reward_func/std": 0.0,
14202
+ "rewards/formatting_reward_func/mean": 2.0,
14203
+ "rewards/formatting_reward_func/std": 0.0,
14204
+ "rewards/quality_reward_func/mean": 0.800000011920929,
14205
+ "rewards/quality_reward_func/std": 0.0,
14206
+ "step": 5460
14207
+ },
14208
+ {
14209
+ "completion_length": 20.15,
14210
+ "completions/clipped_ratio": 0.0,
14211
+ "completions/max_length": 20.0,
14212
+ "completions/max_terminated_length": 20.0,
14213
+ "completions/mean_length": 17.3,
14214
+ "completions/mean_terminated_length": 17.3,
14215
+ "completions/min_length": 15.6,
14216
+ "completions/min_terminated_length": 15.6,
14217
+ "epoch": 0.3761001100110011,
14218
+ "frac_reward_zero_std": 1.0,
14219
+ "grad_norm": 0.0,
14220
+ "kl": 1.0240365587174893,
14221
+ "learning_rate": 3.926764210104733e-06,
14222
+ "loss": 0.0,
14223
+ "num_tokens": 7851086.0,
14224
+ "reward": 4.099999904632568,
14225
+ "reward_std": 0.0,
14226
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
14227
+ "rewards/coherence_reward_func/std": 0.0,
14228
+ "rewards/formatting_reward_func/mean": 2.0,
14229
+ "rewards/formatting_reward_func/std": 0.0,
14230
+ "rewards/quality_reward_func/mean": 0.800000011920929,
14231
+ "rewards/quality_reward_func/std": 0.0,
14232
+ "step": 5470
14233
+ },
14234
+ {
14235
+ "completion_length": 19.075,
14236
+ "completions/clipped_ratio": 0.0,
14237
+ "completions/max_length": 19.2,
14238
+ "completions/max_terminated_length": 19.2,
14239
+ "completions/mean_length": 17.2,
14240
+ "completions/mean_terminated_length": 17.2,
14241
+ "completions/min_length": 15.3,
14242
+ "completions/min_terminated_length": 15.3,
14243
+ "epoch": 0.3767876787678768,
14244
+ "frac_reward_zero_std": 1.0,
14245
+ "grad_norm": 0.0,
14246
+ "kl": 1.1240653157234193,
14247
+ "learning_rate": 3.921832812122593e-06,
14248
+ "loss": 0.0,
14249
+ "num_tokens": 7867270.0,
14250
+ "reward": 4.099999904632568,
14251
+ "reward_std": 0.0,
14252
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
14253
+ "rewards/coherence_reward_func/std": 0.0,
14254
+ "rewards/formatting_reward_func/mean": 2.0,
14255
+ "rewards/formatting_reward_func/std": 0.0,
14256
+ "rewards/quality_reward_func/mean": 0.800000011920929,
14257
+ "rewards/quality_reward_func/std": 0.0,
14258
+ "step": 5480
14259
+ },
14260
+ {
14261
+ "completion_length": 17.975,
14262
+ "completions/clipped_ratio": 0.0,
14263
+ "completions/max_length": 17.8,
14264
+ "completions/max_terminated_length": 17.8,
14265
+ "completions/mean_length": 16.55,
14266
+ "completions/mean_terminated_length": 16.55,
14267
+ "completions/min_length": 15.8,
14268
+ "completions/min_terminated_length": 15.8,
14269
+ "epoch": 0.3774752475247525,
14270
+ "frac_reward_zero_std": 1.0,
14271
+ "grad_norm": 0.0,
14272
+ "kl": 1.4767700091004372,
14273
+ "learning_rate": 3.916893223174254e-06,
14274
+ "loss": 0.0001,
14275
+ "num_tokens": 7882340.0,
14276
+ "reward": 4.099999904632568,
14277
+ "reward_std": 0.0,
14278
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
14279
+ "rewards/coherence_reward_func/std": 0.0,
14280
+ "rewards/formatting_reward_func/mean": 2.0,
14281
+ "rewards/formatting_reward_func/std": 0.0,
14282
+ "rewards/quality_reward_func/mean": 0.800000011920929,
14283
+ "rewards/quality_reward_func/std": 0.0,
14284
+ "step": 5490
14285
+ },
14286
+ {
14287
+ "completion_length": 18.725,
14288
+ "completions/clipped_ratio": 0.0,
14289
+ "completions/max_length": 18.8,
14290
+ "completions/max_terminated_length": 18.8,
14291
+ "completions/mean_length": 16.35,
14292
+ "completions/mean_terminated_length": 16.35,
14293
+ "completions/min_length": 14.8,
14294
+ "completions/min_terminated_length": 14.8,
14295
+ "epoch": 0.3781628162816282,
14296
+ "frac_reward_zero_std": 1.0,
14297
+ "grad_norm": 0.0,
14298
+ "kl": 1.476631324738264,
14299
+ "learning_rate": 3.911945471715947e-06,
14300
+ "loss": 0.0001,
14301
+ "num_tokens": 7897518.0,
14302
+ "reward": 4.099999904632568,
14303
+ "reward_std": 0.0,
14304
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
14305
+ "rewards/coherence_reward_func/std": 0.0,
14306
+ "rewards/formatting_reward_func/mean": 2.0,
14307
+ "rewards/formatting_reward_func/std": 0.0,
14308
+ "rewards/quality_reward_func/mean": 0.800000011920929,
14309
+ "rewards/quality_reward_func/std": 0.0,
14310
+ "step": 5500
14311
+ },
14312
+ {
14313
+ "completion_length": 19.825,
14314
+ "completions/clipped_ratio": 0.0,
14315
+ "completions/max_length": 19.8,
14316
+ "completions/max_terminated_length": 19.8,
14317
+ "completions/mean_length": 18.3,
14318
+ "completions/mean_terminated_length": 18.3,
14319
+ "completions/min_length": 16.8,
14320
+ "completions/min_terminated_length": 16.8,
14321
+ "epoch": 0.3788503850385038,
14322
+ "frac_reward_zero_std": 1.0,
14323
+ "grad_norm": 0.0,
14324
+ "kl": 1.1571273379027844,
14325
+ "learning_rate": 3.906989586250928e-06,
14326
+ "loss": 0.0,
14327
+ "num_tokens": 7911386.0,
14328
+ "reward": 4.099999904632568,
14329
+ "reward_std": 0.0,
14330
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
14331
+ "rewards/coherence_reward_func/std": 0.0,
14332
+ "rewards/formatting_reward_func/mean": 2.0,
14333
+ "rewards/formatting_reward_func/std": 0.0,
14334
+ "rewards/quality_reward_func/mean": 0.800000011920929,
14335
+ "rewards/quality_reward_func/std": 0.0,
14336
+ "step": 5510
14337
+ },
14338
+ {
14339
+ "completion_length": 17.3,
14340
+ "completions/clipped_ratio": 0.0,
14341
+ "completions/max_length": 17.3,
14342
+ "completions/max_terminated_length": 17.3,
14343
+ "completions/mean_length": 15.575,
14344
+ "completions/mean_terminated_length": 15.575,
14345
+ "completions/min_length": 14.7,
14346
+ "completions/min_terminated_length": 14.7,
14347
+ "epoch": 0.3795379537953795,
14348
+ "frac_reward_zero_std": 1.0,
14349
+ "grad_norm": 0.0,
14350
+ "kl": 1.3854421511292458,
14351
+ "learning_rate": 3.902025595329314e-06,
14352
+ "loss": 0.0,
14353
+ "num_tokens": 7923165.0,
14354
+ "reward": 4.099999904632568,
14355
+ "reward_std": 0.0,
14356
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
14357
+ "rewards/coherence_reward_func/std": 0.0,
14358
+ "rewards/formatting_reward_func/mean": 2.0,
14359
+ "rewards/formatting_reward_func/std": 0.0,
14360
+ "rewards/quality_reward_func/mean": 0.800000011920929,
14361
+ "rewards/quality_reward_func/std": 0.0,
14362
+ "step": 5520
14363
+ },
14364
+ {
14365
+ "completion_length": 18.2,
14366
+ "completions/clipped_ratio": 0.0,
14367
+ "completions/max_length": 18.2,
14368
+ "completions/max_terminated_length": 18.2,
14369
+ "completions/mean_length": 16.75,
14370
+ "completions/mean_terminated_length": 16.75,
14371
+ "completions/min_length": 15.5,
14372
+ "completions/min_terminated_length": 15.5,
14373
+ "epoch": 0.3802255225522552,
14374
+ "frac_reward_zero_std": 1.0,
14375
+ "grad_norm": 0.0,
14376
+ "kl": 1.2270353332161903,
14377
+ "learning_rate": 3.897053527547912e-06,
14378
+ "loss": 0.0,
14379
+ "num_tokens": 7937471.0,
14380
+ "reward": 4.099999904632568,
14381
+ "reward_std": 0.0,
14382
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
14383
+ "rewards/coherence_reward_func/std": 0.0,
14384
+ "rewards/formatting_reward_func/mean": 2.0,
14385
+ "rewards/formatting_reward_func/std": 0.0,
14386
+ "rewards/quality_reward_func/mean": 0.800000011920929,
14387
+ "rewards/quality_reward_func/std": 0.0,
14388
+ "step": 5530
14389
+ },
14390
+ {
14391
+ "completion_length": 17.6,
14392
+ "completions/clipped_ratio": 0.0,
14393
+ "completions/max_length": 17.7,
14394
+ "completions/max_terminated_length": 17.7,
14395
+ "completions/mean_length": 16.75,
14396
+ "completions/mean_terminated_length": 16.75,
14397
+ "completions/min_length": 16.1,
14398
+ "completions/min_terminated_length": 16.1,
14399
+ "epoch": 0.3809130913091309,
14400
+ "frac_reward_zero_std": 1.0,
14401
+ "grad_norm": 0.0,
14402
+ "kl": 1.0666535507887602,
14403
+ "learning_rate": 3.892073411550062e-06,
14404
+ "loss": 0.0,
14405
+ "num_tokens": 7951813.0,
14406
+ "reward": 4.099999904632568,
14407
+ "reward_std": 0.0,
14408
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
14409
+ "rewards/coherence_reward_func/std": 0.0,
14410
+ "rewards/formatting_reward_func/mean": 2.0,
14411
+ "rewards/formatting_reward_func/std": 0.0,
14412
+ "rewards/quality_reward_func/mean": 0.800000011920929,
14413
+ "rewards/quality_reward_func/std": 0.0,
14414
+ "step": 5540
14415
+ },
14416
+ {
14417
+ "completion_length": 17.7,
14418
+ "completions/clipped_ratio": 0.0,
14419
+ "completions/max_length": 17.9,
14420
+ "completions/max_terminated_length": 17.9,
14421
+ "completions/mean_length": 16.9,
14422
+ "completions/mean_terminated_length": 16.9,
14423
+ "completions/min_length": 15.5,
14424
+ "completions/min_terminated_length": 15.5,
14425
+ "epoch": 0.3816006600660066,
14426
+ "frac_reward_zero_std": 1.0,
14427
+ "grad_norm": 0.0,
14428
+ "kl": 1.2041775345802308,
14429
+ "learning_rate": 3.887085276025469e-06,
14430
+ "loss": 0.0,
14431
+ "num_tokens": 7968181.0,
14432
+ "reward": 4.099999904632568,
14433
+ "reward_std": 0.0,
14434
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
14435
+ "rewards/coherence_reward_func/std": 0.0,
14436
+ "rewards/formatting_reward_func/mean": 2.0,
14437
+ "rewards/formatting_reward_func/std": 0.0,
14438
+ "rewards/quality_reward_func/mean": 0.800000011920929,
14439
+ "rewards/quality_reward_func/std": 0.0,
14440
+ "step": 5550
14441
+ },
14442
+ {
14443
+ "completion_length": 16.7,
14444
+ "completions/clipped_ratio": 0.0,
14445
+ "completions/max_length": 16.4,
14446
+ "completions/max_terminated_length": 16.4,
14447
+ "completions/mean_length": 15.75,
14448
+ "completions/mean_terminated_length": 15.75,
14449
+ "completions/min_length": 14.8,
14450
+ "completions/min_terminated_length": 14.8,
14451
+ "epoch": 0.38228822882288227,
14452
+ "frac_reward_zero_std": 1.0,
14453
+ "grad_norm": 0.0,
14454
+ "kl": 1.1235090486705304,
14455
+ "learning_rate": 3.882089149710035e-06,
14456
+ "loss": 0.0,
14457
+ "num_tokens": 7984055.0,
14458
+ "reward": 4.099999904632568,
14459
+ "reward_std": 0.0,
14460
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
14461
+ "rewards/coherence_reward_func/std": 0.0,
14462
+ "rewards/formatting_reward_func/mean": 2.0,
14463
+ "rewards/formatting_reward_func/std": 0.0,
14464
+ "rewards/quality_reward_func/mean": 0.800000011920929,
14465
+ "rewards/quality_reward_func/std": 0.0,
14466
+ "step": 5560
14467
+ },
14468
+ {
14469
+ "completion_length": 18.725,
14470
+ "completions/clipped_ratio": 0.0,
14471
+ "completions/max_length": 18.7,
14472
+ "completions/max_terminated_length": 18.7,
14473
+ "completions/mean_length": 17.3,
14474
+ "completions/mean_terminated_length": 17.3,
14475
+ "completions/min_length": 16.3,
14476
+ "completions/min_terminated_length": 16.3,
14477
+ "epoch": 0.38297579757975797,
14478
+ "frac_reward_zero_std": 1.0,
14479
+ "grad_norm": 2.725888043642044e-05,
14480
+ "kl": 1.1920234143733979,
14481
+ "learning_rate": 3.877085061385694e-06,
14482
+ "loss": 0.0,
14483
+ "num_tokens": 7997675.0,
14484
+ "reward": 4.099999904632568,
14485
+ "reward_std": 0.0,
14486
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
14487
+ "rewards/coherence_reward_func/std": 0.0,
14488
+ "rewards/formatting_reward_func/mean": 2.0,
14489
+ "rewards/formatting_reward_func/std": 0.0,
14490
+ "rewards/quality_reward_func/mean": 0.800000011920929,
14491
+ "rewards/quality_reward_func/std": 0.0,
14492
+ "step": 5570
14493
+ },
14494
+ {
14495
+ "completion_length": 18.275,
14496
+ "completions/clipped_ratio": 0.0,
14497
+ "completions/max_length": 18.4,
14498
+ "completions/max_terminated_length": 18.4,
14499
+ "completions/mean_length": 16.6,
14500
+ "completions/mean_terminated_length": 16.6,
14501
+ "completions/min_length": 14.9,
14502
+ "completions/min_terminated_length": 14.9,
14503
+ "epoch": 0.38366336633663367,
14504
+ "frac_reward_zero_std": 1.0,
14505
+ "grad_norm": 0.0,
14506
+ "kl": 0.9494880434125662,
14507
+ "learning_rate": 3.872073039880254e-06,
14508
+ "loss": 0.0,
14509
+ "num_tokens": 8011851.0,
14510
+ "reward": 4.099999904632568,
14511
+ "reward_std": 0.0,
14512
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
14513
+ "rewards/coherence_reward_func/std": 0.0,
14514
+ "rewards/formatting_reward_func/mean": 2.0,
14515
+ "rewards/formatting_reward_func/std": 0.0,
14516
+ "rewards/quality_reward_func/mean": 0.800000011920929,
14517
+ "rewards/quality_reward_func/std": 0.0,
14518
+ "step": 5580
14519
+ },
14520
+ {
14521
+ "completion_length": 23.25,
14522
+ "completions/clipped_ratio": 0.0,
14523
+ "completions/max_length": 23.2,
14524
+ "completions/max_terminated_length": 23.2,
14525
+ "completions/mean_length": 19.575,
14526
+ "completions/mean_terminated_length": 19.575,
14527
+ "completions/min_length": 16.7,
14528
+ "completions/min_terminated_length": 16.7,
14529
+ "epoch": 0.38435093509350937,
14530
+ "frac_reward_zero_std": 1.0,
14531
+ "grad_norm": 0.0,
14532
+ "kl": 1.0359878040850163,
14533
+ "learning_rate": 3.8670531140672194e-06,
14534
+ "loss": 0.0,
14535
+ "num_tokens": 8024570.0,
14536
+ "reward": 4.099999904632568,
14537
+ "reward_std": 0.0,
14538
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
14539
+ "rewards/coherence_reward_func/std": 0.0,
14540
+ "rewards/formatting_reward_func/mean": 2.0,
14541
+ "rewards/formatting_reward_func/std": 0.0,
14542
+ "rewards/quality_reward_func/mean": 0.800000011920929,
14543
+ "rewards/quality_reward_func/std": 0.0,
14544
+ "step": 5590
14545
+ },
14546
+ {
14547
+ "completion_length": 18.3,
14548
+ "completions/clipped_ratio": 0.0,
14549
+ "completions/max_length": 18.3,
14550
+ "completions/max_terminated_length": 18.3,
14551
+ "completions/mean_length": 17.25,
14552
+ "completions/mean_terminated_length": 17.25,
14553
+ "completions/min_length": 16.2,
14554
+ "completions/min_terminated_length": 16.2,
14555
+ "epoch": 0.385038503850385,
14556
+ "frac_reward_zero_std": 1.0,
14557
+ "grad_norm": 0.0,
14558
+ "kl": 1.4615533858537675,
14559
+ "learning_rate": 3.862025312865633e-06,
14560
+ "loss": 0.0001,
14561
+ "num_tokens": 8039680.0,
14562
+ "reward": 4.099999904632568,
14563
+ "reward_std": 0.0,
14564
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
14565
+ "rewards/coherence_reward_func/std": 0.0,
14566
+ "rewards/formatting_reward_func/mean": 2.0,
14567
+ "rewards/formatting_reward_func/std": 0.0,
14568
+ "rewards/quality_reward_func/mean": 0.800000011920929,
14569
+ "rewards/quality_reward_func/std": 0.0,
14570
+ "step": 5600
14571
+ },
14572
+ {
14573
+ "completion_length": 17.725,
14574
+ "completions/clipped_ratio": 0.0,
14575
+ "completions/max_length": 17.6,
14576
+ "completions/max_terminated_length": 17.6,
14577
+ "completions/mean_length": 16.6,
14578
+ "completions/mean_terminated_length": 16.6,
14579
+ "completions/min_length": 16.0,
14580
+ "completions/min_terminated_length": 16.0,
14581
+ "epoch": 0.3857260726072607,
14582
+ "frac_reward_zero_std": 1.0,
14583
+ "grad_norm": 0.0,
14584
+ "kl": 1.1007069438695907,
14585
+ "learning_rate": 3.856989665239904e-06,
14586
+ "loss": 0.0,
14587
+ "num_tokens": 8054900.0,
14588
+ "reward": 4.099999904632568,
14589
+ "reward_std": 0.0,
14590
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
14591
+ "rewards/coherence_reward_func/std": 0.0,
14592
+ "rewards/formatting_reward_func/mean": 2.0,
14593
+ "rewards/formatting_reward_func/std": 0.0,
14594
+ "rewards/quality_reward_func/mean": 0.800000011920929,
14595
+ "rewards/quality_reward_func/std": 0.0,
14596
+ "step": 5610
14597
+ },
14598
+ {
14599
+ "completion_length": 19.275,
14600
+ "completions/clipped_ratio": 0.0,
14601
+ "completions/max_length": 19.4,
14602
+ "completions/max_terminated_length": 19.4,
14603
+ "completions/mean_length": 18.075,
14604
+ "completions/mean_terminated_length": 18.075,
14605
+ "completions/min_length": 16.2,
14606
+ "completions/min_terminated_length": 16.2,
14607
+ "epoch": 0.3864136413641364,
14608
+ "frac_reward_zero_std": 1.0,
14609
+ "grad_norm": 0.0,
14610
+ "kl": 1.1791205305606127,
14611
+ "learning_rate": 3.851946200199648e-06,
14612
+ "loss": 0.0,
14613
+ "num_tokens": 8070555.0,
14614
+ "reward": 4.099999904632568,
14615
+ "reward_std": 0.0,
14616
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
14617
+ "rewards/coherence_reward_func/std": 0.0,
14618
+ "rewards/formatting_reward_func/mean": 2.0,
14619
+ "rewards/formatting_reward_func/std": 0.0,
14620
+ "rewards/quality_reward_func/mean": 0.800000011920929,
14621
+ "rewards/quality_reward_func/std": 0.0,
14622
+ "step": 5620
14623
+ },
14624
+ {
14625
+ "completion_length": 19.1,
14626
+ "completions/clipped_ratio": 0.0,
14627
+ "completions/max_length": 19.3,
14628
+ "completions/max_terminated_length": 19.3,
14629
+ "completions/mean_length": 16.925,
14630
+ "completions/mean_terminated_length": 16.925,
14631
+ "completions/min_length": 15.3,
14632
+ "completions/min_terminated_length": 15.3,
14633
+ "epoch": 0.3871012101210121,
14634
+ "frac_reward_zero_std": 1.0,
14635
+ "grad_norm": 0.0,
14636
+ "kl": 1.1043142512440682,
14637
+ "learning_rate": 3.846894946799511e-06,
14638
+ "loss": 0.0,
14639
+ "num_tokens": 8083116.0,
14640
+ "reward": 4.099999904632568,
14641
+ "reward_std": 0.0,
14642
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
14643
+ "rewards/coherence_reward_func/std": 0.0,
14644
+ "rewards/formatting_reward_func/mean": 2.0,
14645
+ "rewards/formatting_reward_func/std": 0.0,
14646
+ "rewards/quality_reward_func/mean": 0.800000011920929,
14647
+ "rewards/quality_reward_func/std": 0.0,
14648
+ "step": 5630
14649
+ },
14650
+ {
14651
+ "completion_length": 17.65,
14652
+ "completions/clipped_ratio": 0.0,
14653
+ "completions/max_length": 17.4,
14654
+ "completions/max_terminated_length": 17.4,
14655
+ "completions/mean_length": 16.025,
14656
+ "completions/mean_terminated_length": 16.025,
14657
+ "completions/min_length": 15.3,
14658
+ "completions/min_terminated_length": 15.3,
14659
+ "epoch": 0.38778877887788776,
14660
+ "frac_reward_zero_std": 1.0,
14661
+ "grad_norm": 0.0,
14662
+ "kl": 1.4466410249471664,
14663
+ "learning_rate": 3.841835934139008e-06,
14664
+ "loss": 0.0,
14665
+ "num_tokens": 8097373.0,
14666
+ "reward": 4.099999904632568,
14667
+ "reward_std": 0.0,
14668
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
14669
+ "rewards/coherence_reward_func/std": 0.0,
14670
+ "rewards/formatting_reward_func/mean": 2.0,
14671
+ "rewards/formatting_reward_func/std": 0.0,
14672
+ "rewards/quality_reward_func/mean": 0.800000011920929,
14673
+ "rewards/quality_reward_func/std": 0.0,
14674
+ "step": 5640
14675
+ },
14676
+ {
14677
+ "completion_length": 17.175,
14678
+ "completions/clipped_ratio": 0.0,
14679
+ "completions/max_length": 17.1,
14680
+ "completions/max_terminated_length": 17.1,
14681
+ "completions/mean_length": 15.975,
14682
+ "completions/mean_terminated_length": 15.975,
14683
+ "completions/min_length": 15.2,
14684
+ "completions/min_terminated_length": 15.2,
14685
+ "epoch": 0.38847634763476346,
14686
+ "frac_reward_zero_std": 1.0,
14687
+ "grad_norm": 0.0,
14688
+ "kl": 1.3932079687714576,
14689
+ "learning_rate": 3.8367691913623565e-06,
14690
+ "loss": 0.0,
14691
+ "num_tokens": 8108796.0,
14692
+ "reward": 4.099999904632568,
14693
+ "reward_std": 0.0,
14694
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
14695
+ "rewards/coherence_reward_func/std": 0.0,
14696
+ "rewards/formatting_reward_func/mean": 2.0,
14697
+ "rewards/formatting_reward_func/std": 0.0,
14698
+ "rewards/quality_reward_func/mean": 0.800000011920929,
14699
+ "rewards/quality_reward_func/std": 0.0,
14700
+ "step": 5650
14701
+ },
14702
+ {
14703
+ "completion_length": 20.275,
14704
+ "completions/clipped_ratio": 0.0,
14705
+ "completions/max_length": 20.4,
14706
+ "completions/max_terminated_length": 20.4,
14707
+ "completions/mean_length": 17.525,
14708
+ "completions/mean_terminated_length": 17.525,
14709
+ "completions/min_length": 15.8,
14710
+ "completions/min_terminated_length": 15.8,
14711
+ "epoch": 0.38916391639163916,
14712
+ "frac_reward_zero_std": 1.0,
14713
+ "grad_norm": 0.0,
14714
+ "kl": 1.1165172673761845,
14715
+ "learning_rate": 3.831694747658301e-06,
14716
+ "loss": 0.0,
14717
+ "num_tokens": 8123245.0,
14718
+ "reward": 4.099999904632568,
14719
+ "reward_std": 0.0,
14720
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
14721
+ "rewards/coherence_reward_func/std": 0.0,
14722
+ "rewards/formatting_reward_func/mean": 2.0,
14723
+ "rewards/formatting_reward_func/std": 0.0,
14724
+ "rewards/quality_reward_func/mean": 0.800000011920929,
14725
+ "rewards/quality_reward_func/std": 0.0,
14726
+ "step": 5660
14727
+ },
14728
+ {
14729
+ "completion_length": 17.175,
14730
+ "completions/clipped_ratio": 0.0,
14731
+ "completions/max_length": 17.0,
14732
+ "completions/max_terminated_length": 17.0,
14733
+ "completions/mean_length": 16.2,
14734
+ "completions/mean_terminated_length": 16.2,
14735
+ "completions/min_length": 15.4,
14736
+ "completions/min_terminated_length": 15.4,
14737
+ "epoch": 0.38985148514851486,
14738
+ "frac_reward_zero_std": 1.0,
14739
+ "grad_norm": 0.0,
14740
+ "kl": 1.037246273458004,
14741
+ "learning_rate": 3.826612632259955e-06,
14742
+ "loss": 0.0,
14743
+ "num_tokens": 8137105.0,
14744
+ "reward": 4.099999904632568,
14745
+ "reward_std": 0.0,
14746
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
14747
+ "rewards/coherence_reward_func/std": 0.0,
14748
+ "rewards/formatting_reward_func/mean": 2.0,
14749
+ "rewards/formatting_reward_func/std": 0.0,
14750
+ "rewards/quality_reward_func/mean": 0.800000011920929,
14751
+ "rewards/quality_reward_func/std": 0.0,
14752
+ "step": 5670
14753
+ },
14754
+ {
14755
+ "completion_length": 19.9,
14756
+ "completions/clipped_ratio": 0.0,
14757
+ "completions/max_length": 19.9,
14758
+ "completions/max_terminated_length": 19.9,
14759
+ "completions/mean_length": 17.65,
14760
+ "completions/mean_terminated_length": 17.65,
14761
+ "completions/min_length": 15.8,
14762
+ "completions/min_terminated_length": 15.8,
14763
+ "epoch": 0.39053905390539057,
14764
+ "frac_reward_zero_std": 1.0,
14765
+ "grad_norm": 0.0,
14766
+ "kl": 1.185601119697094,
14767
+ "learning_rate": 3.821522874444626e-06,
14768
+ "loss": 0.0,
14769
+ "num_tokens": 8151835.0,
14770
+ "reward": 4.099999904632568,
14771
+ "reward_std": 0.0,
14772
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
14773
+ "rewards/coherence_reward_func/std": 0.0,
14774
+ "rewards/formatting_reward_func/mean": 2.0,
14775
+ "rewards/formatting_reward_func/std": 0.0,
14776
+ "rewards/quality_reward_func/mean": 0.800000011920929,
14777
+ "rewards/quality_reward_func/std": 0.0,
14778
+ "step": 5680
14779
+ },
14780
+ {
14781
+ "completion_length": 20.15,
14782
+ "completions/clipped_ratio": 0.0,
14783
+ "completions/max_length": 20.4,
14784
+ "completions/max_terminated_length": 20.4,
14785
+ "completions/mean_length": 18.2,
14786
+ "completions/mean_terminated_length": 18.2,
14787
+ "completions/min_length": 16.7,
14788
+ "completions/min_terminated_length": 16.7,
14789
+ "epoch": 0.3912266226622662,
14790
+ "frac_reward_zero_std": 1.0,
14791
+ "grad_norm": 0.0,
14792
+ "kl": 0.9930311039090156,
14793
+ "learning_rate": 3.8164255035336454e-06,
14794
+ "loss": 0.0,
14795
+ "num_tokens": 8165839.0,
14796
+ "reward": 4.099999904632568,
14797
+ "reward_std": 0.0,
14798
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
14799
+ "rewards/coherence_reward_func/std": 0.0,
14800
+ "rewards/formatting_reward_func/mean": 2.0,
14801
+ "rewards/formatting_reward_func/std": 0.0,
14802
+ "rewards/quality_reward_func/mean": 0.800000011920929,
14803
+ "rewards/quality_reward_func/std": 0.0,
14804
+ "step": 5690
14805
+ },
14806
+ {
14807
+ "completion_length": 18.85,
14808
+ "completions/clipped_ratio": 0.0,
14809
+ "completions/max_length": 18.6,
14810
+ "completions/max_terminated_length": 18.6,
14811
+ "completions/mean_length": 16.775,
14812
+ "completions/mean_terminated_length": 16.775,
14813
+ "completions/min_length": 15.5,
14814
+ "completions/min_terminated_length": 15.5,
14815
+ "epoch": 0.3919141914191419,
14816
+ "frac_reward_zero_std": 1.0,
14817
+ "grad_norm": 0.0,
14818
+ "kl": 1.4449263490736484,
14819
+ "learning_rate": 3.811320548892205e-06,
14820
+ "loss": 0.0001,
14821
+ "num_tokens": 8177630.0,
14822
+ "reward": 4.099999904632568,
14823
+ "reward_std": 0.0,
14824
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
14825
+ "rewards/coherence_reward_func/std": 0.0,
14826
+ "rewards/formatting_reward_func/mean": 2.0,
14827
+ "rewards/formatting_reward_func/std": 0.0,
14828
+ "rewards/quality_reward_func/mean": 0.800000011920929,
14829
+ "rewards/quality_reward_func/std": 0.0,
14830
+ "step": 5700
14831
  }
14832
  ],
14833
  "logging_steps": 10,
14834
  "max_steps": 14544,
14835
+ "num_input_tokens_seen": 8177630,
14836
  "num_train_epochs": 1,
14837
  "save_steps": 50,
14838
  "stateful_callbacks": {