Arittro2 commited on
Commit
e54f5b3
·
verified ·
1 Parent(s): 5a4a264

Upload folder using huggingface_hub

Browse files
adapter_config.json CHANGED
@@ -29,12 +29,12 @@
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
- "v_proj",
33
- "q_proj",
34
- "k_proj",
35
- "down_proj",
36
  "gate_proj",
 
37
  "up_proj",
 
 
 
38
  "o_proj"
39
  ],
40
  "task_type": "CAUSAL_LM",
 
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
 
 
 
 
32
  "gate_proj",
33
+ "down_proj",
34
  "up_proj",
35
+ "q_proj",
36
+ "k_proj",
37
+ "v_proj",
38
  "o_proj"
39
  ],
40
  "task_type": "CAUSAL_LM",
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cd8422397958e38dfc54623833b9c42fbf84c2192234f78716993373edeb9c08
3
  size 262406656
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f89e75eace1f37a98140d93962ea46e73cba4f4b8e34e368480bf3f2b1e4cdd
3
  size 262406656
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f6acdf097a44425d0cb4aa2435e670892fe147410ce2c6c5fefed2de4c9ef796
3
  size 122872331
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a3eb3d4cf9477e021678068cf544673ad23c71724f09a6af6a000805761f348f
3
  size 122872331
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b2e37a8994ae61da6b0a5cbf1dc8a1a1e4ca374128d672206c8b82cbdf6e4192
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:792b3fee8a1554be314683100df2b980f0bfc2f891874430d77a51ba9880a32f
3
  size 14645
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6e0184609e0a634a7a19eed294044d17cbbacf15554dec1788c985d57897ec9e
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:74de7329a01fdf8f6ecea853bf84d421d0cc36daa4e1fdfaf82ec5c4e05cf81c
3
  size 1465
trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.3919141914191419,
6
  "eval_steps": 500,
7
- "global_step": 5700,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -14828,11 +14828,791 @@
14828
  "rewards/quality_reward_func/mean": 0.800000011920929,
14829
  "rewards/quality_reward_func/std": 0.0,
14830
  "step": 5700
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14831
  }
14832
  ],
14833
  "logging_steps": 10,
14834
  "max_steps": 14544,
14835
- "num_input_tokens_seen": 8177630,
14836
  "num_train_epochs": 1,
14837
  "save_steps": 50,
14838
  "stateful_callbacks": {
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.41254125412541254,
6
  "eval_steps": 500,
7
+ "global_step": 6000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
14828
  "rewards/quality_reward_func/mean": 0.800000011920929,
14829
  "rewards/quality_reward_func/std": 0.0,
14830
  "step": 5700
14831
+ },
14832
+ {
14833
+ "completion_length": 17.5,
14834
+ "completions/clipped_ratio": 0.0,
14835
+ "completions/max_length": 17.5,
14836
+ "completions/max_terminated_length": 17.5,
14837
+ "completions/mean_length": 16.775,
14838
+ "completions/mean_terminated_length": 16.775,
14839
+ "completions/min_length": 16.0,
14840
+ "completions/min_terminated_length": 16.0,
14841
+ "epoch": 0.3926017601760176,
14842
+ "frac_reward_zero_std": 1.0,
14843
+ "grad_norm": 0.0,
14844
+ "kl": 1.1857761025428772,
14845
+ "learning_rate": 3.8062080399291872e-06,
14846
+ "loss": 0.0,
14847
+ "num_tokens": 8192361.0,
14848
+ "reward": 4.099999904632568,
14849
+ "reward_std": 0.0,
14850
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
14851
+ "rewards/coherence_reward_func/std": 0.0,
14852
+ "rewards/formatting_reward_func/mean": 2.0,
14853
+ "rewards/formatting_reward_func/std": 0.0,
14854
+ "rewards/quality_reward_func/mean": 0.800000011920929,
14855
+ "rewards/quality_reward_func/std": 0.0,
14856
+ "step": 5710
14857
+ },
14858
+ {
14859
+ "completion_length": 20.1,
14860
+ "completions/clipped_ratio": 0.0,
14861
+ "completions/max_length": 20.1,
14862
+ "completions/max_terminated_length": 20.1,
14863
+ "completions/mean_length": 18.975,
14864
+ "completions/mean_terminated_length": 18.975,
14865
+ "completions/min_length": 17.4,
14866
+ "completions/min_terminated_length": 17.4,
14867
+ "epoch": 0.3932893289328933,
14868
+ "frac_reward_zero_std": 1.0,
14869
+ "grad_norm": 0.0,
14870
+ "kl": 1.346421904861927,
14871
+ "learning_rate": 3.801088006096989e-06,
14872
+ "loss": 0.0001,
14873
+ "num_tokens": 8204804.0,
14874
+ "reward": 4.099999904632568,
14875
+ "reward_std": 0.0,
14876
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
14877
+ "rewards/coherence_reward_func/std": 0.0,
14878
+ "rewards/formatting_reward_func/mean": 2.0,
14879
+ "rewards/formatting_reward_func/std": 0.0,
14880
+ "rewards/quality_reward_func/mean": 0.800000011920929,
14881
+ "rewards/quality_reward_func/std": 0.0,
14882
+ "step": 5720
14883
+ },
14884
+ {
14885
+ "completion_length": 16.7,
14886
+ "completions/clipped_ratio": 0.0,
14887
+ "completions/max_length": 16.7,
14888
+ "completions/max_terminated_length": 16.7,
14889
+ "completions/mean_length": 15.875,
14890
+ "completions/mean_terminated_length": 15.875,
14891
+ "completions/min_length": 15.4,
14892
+ "completions/min_terminated_length": 15.4,
14893
+ "epoch": 0.39397689768976896,
14894
+ "frac_reward_zero_std": 1.0,
14895
+ "grad_norm": 0.0,
14896
+ "kl": 1.1227647330611945,
14897
+ "learning_rate": 3.7959604768913615e-06,
14898
+ "loss": 0.0,
14899
+ "num_tokens": 8220067.0,
14900
+ "reward": 4.099999904632568,
14901
+ "reward_std": 0.0,
14902
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
14903
+ "rewards/coherence_reward_func/std": 0.0,
14904
+ "rewards/formatting_reward_func/mean": 2.0,
14905
+ "rewards/formatting_reward_func/std": 0.0,
14906
+ "rewards/quality_reward_func/mean": 0.800000011920929,
14907
+ "rewards/quality_reward_func/std": 0.0,
14908
+ "step": 5730
14909
+ },
14910
+ {
14911
+ "completion_length": 18.1,
14912
+ "completions/clipped_ratio": 0.0,
14913
+ "completions/max_length": 18.1,
14914
+ "completions/max_terminated_length": 18.1,
14915
+ "completions/mean_length": 17.1,
14916
+ "completions/mean_terminated_length": 17.1,
14917
+ "completions/min_length": 16.2,
14918
+ "completions/min_terminated_length": 16.2,
14919
+ "epoch": 0.39466446644664466,
14920
+ "frac_reward_zero_std": 1.0,
14921
+ "grad_norm": 0.0,
14922
+ "kl": 1.3379293769598006,
14923
+ "learning_rate": 3.7908254818512323e-06,
14924
+ "loss": 0.0,
14925
+ "num_tokens": 8235871.0,
14926
+ "reward": 4.099999904632568,
14927
+ "reward_std": 0.0,
14928
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
14929
+ "rewards/coherence_reward_func/std": 0.0,
14930
+ "rewards/formatting_reward_func/mean": 2.0,
14931
+ "rewards/formatting_reward_func/std": 0.0,
14932
+ "rewards/quality_reward_func/mean": 0.800000011920929,
14933
+ "rewards/quality_reward_func/std": 0.0,
14934
+ "step": 5740
14935
+ },
14936
+ {
14937
+ "completion_length": 20.6,
14938
+ "completions/clipped_ratio": 0.0,
14939
+ "completions/max_length": 20.6,
14940
+ "completions/max_terminated_length": 20.6,
14941
+ "completions/mean_length": 18.75,
14942
+ "completions/mean_terminated_length": 18.75,
14943
+ "completions/min_length": 16.7,
14944
+ "completions/min_terminated_length": 16.7,
14945
+ "epoch": 0.39535203520352036,
14946
+ "frac_reward_zero_std": 1.0,
14947
+ "grad_norm": 0.0,
14948
+ "kl": 1.1466250203549861,
14949
+ "learning_rate": 3.785683050558541e-06,
14950
+ "loss": 0.0,
14951
+ "num_tokens": 8249645.0,
14952
+ "reward": 4.099999904632568,
14953
+ "reward_std": 0.0,
14954
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
14955
+ "rewards/coherence_reward_func/std": 0.0,
14956
+ "rewards/formatting_reward_func/mean": 2.0,
14957
+ "rewards/formatting_reward_func/std": 0.0,
14958
+ "rewards/quality_reward_func/mean": 0.800000011920929,
14959
+ "rewards/quality_reward_func/std": 0.0,
14960
+ "step": 5750
14961
+ },
14962
+ {
14963
+ "completion_length": 15.5,
14964
+ "completions/clipped_ratio": 0.0,
14965
+ "completions/max_length": 15.5,
14966
+ "completions/max_terminated_length": 15.5,
14967
+ "completions/mean_length": 15.05,
14968
+ "completions/mean_terminated_length": 15.05,
14969
+ "completions/min_length": 14.7,
14970
+ "completions/min_terminated_length": 14.7,
14971
+ "epoch": 0.39603960396039606,
14972
+ "frac_reward_zero_std": 1.0,
14973
+ "grad_norm": 0.0,
14974
+ "kl": 1.3106171108782292,
14975
+ "learning_rate": 3.7805332126380647e-06,
14976
+ "loss": 0.0,
14977
+ "num_tokens": 8262587.0,
14978
+ "reward": 4.099999904632568,
14979
+ "reward_std": 0.0,
14980
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
14981
+ "rewards/coherence_reward_func/std": 0.0,
14982
+ "rewards/formatting_reward_func/mean": 2.0,
14983
+ "rewards/formatting_reward_func/std": 0.0,
14984
+ "rewards/quality_reward_func/mean": 0.800000011920929,
14985
+ "rewards/quality_reward_func/std": 0.0,
14986
+ "step": 5760
14987
+ },
14988
+ {
14989
+ "completion_length": 20.3,
14990
+ "completions/clipped_ratio": 0.0,
14991
+ "completions/max_length": 20.3,
14992
+ "completions/max_terminated_length": 20.3,
14993
+ "completions/mean_length": 17.925,
14994
+ "completions/mean_terminated_length": 17.925,
14995
+ "completions/min_length": 15.4,
14996
+ "completions/min_terminated_length": 15.4,
14997
+ "epoch": 0.3967271727172717,
14998
+ "frac_reward_zero_std": 1.0,
14999
+ "grad_norm": 0.0,
15000
+ "kl": 1.1861035495996475,
15001
+ "learning_rate": 3.775375997757249e-06,
15002
+ "loss": 0.0,
15003
+ "num_tokens": 8276160.0,
15004
+ "reward": 4.099999904632568,
15005
+ "reward_std": 0.0,
15006
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
15007
+ "rewards/coherence_reward_func/std": 0.0,
15008
+ "rewards/formatting_reward_func/mean": 2.0,
15009
+ "rewards/formatting_reward_func/std": 0.0,
15010
+ "rewards/quality_reward_func/mean": 0.800000011920929,
15011
+ "rewards/quality_reward_func/std": 0.0,
15012
+ "step": 5770
15013
+ },
15014
+ {
15015
+ "completion_length": 19.8,
15016
+ "completions/clipped_ratio": 0.0,
15017
+ "completions/max_length": 19.8,
15018
+ "completions/max_terminated_length": 19.8,
15019
+ "completions/mean_length": 17.475,
15020
+ "completions/mean_terminated_length": 17.475,
15021
+ "completions/min_length": 15.8,
15022
+ "completions/min_terminated_length": 15.8,
15023
+ "epoch": 0.3974147414741474,
15024
+ "frac_reward_zero_std": 1.0,
15025
+ "grad_norm": 0.0,
15026
+ "kl": 1.0805307626724243,
15027
+ "learning_rate": 3.7702114356260387e-06,
15028
+ "loss": 0.0,
15029
+ "num_tokens": 8290663.0,
15030
+ "reward": 4.099999904632568,
15031
+ "reward_std": 0.0,
15032
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
15033
+ "rewards/coherence_reward_func/std": 0.0,
15034
+ "rewards/formatting_reward_func/mean": 2.0,
15035
+ "rewards/formatting_reward_func/std": 0.0,
15036
+ "rewards/quality_reward_func/mean": 0.800000011920929,
15037
+ "rewards/quality_reward_func/std": 0.0,
15038
+ "step": 5780
15039
+ },
15040
+ {
15041
+ "completion_length": 17.0,
15042
+ "completions/clipped_ratio": 0.0,
15043
+ "completions/max_length": 17.0,
15044
+ "completions/max_terminated_length": 17.0,
15045
+ "completions/mean_length": 16.075,
15046
+ "completions/mean_terminated_length": 16.075,
15047
+ "completions/min_length": 15.2,
15048
+ "completions/min_terminated_length": 15.2,
15049
+ "epoch": 0.3981023102310231,
15050
+ "frac_reward_zero_std": 1.0,
15051
+ "grad_norm": 0.0,
15052
+ "kl": 1.2187039345502853,
15053
+ "learning_rate": 3.7650395559967036e-06,
15054
+ "loss": 0.0,
15055
+ "num_tokens": 8301238.0,
15056
+ "reward": 4.099999904632568,
15057
+ "reward_std": 0.0,
15058
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
15059
+ "rewards/coherence_reward_func/std": 0.0,
15060
+ "rewards/formatting_reward_func/mean": 2.0,
15061
+ "rewards/formatting_reward_func/std": 0.0,
15062
+ "rewards/quality_reward_func/mean": 0.800000011920929,
15063
+ "rewards/quality_reward_func/std": 0.0,
15064
+ "step": 5790
15065
+ },
15066
+ {
15067
+ "completion_length": 18.1,
15068
+ "completions/clipped_ratio": 0.0,
15069
+ "completions/max_length": 18.1,
15070
+ "completions/max_terminated_length": 18.1,
15071
+ "completions/mean_length": 17.05,
15072
+ "completions/mean_terminated_length": 17.05,
15073
+ "completions/min_length": 16.0,
15074
+ "completions/min_terminated_length": 16.0,
15075
+ "epoch": 0.3987898789878988,
15076
+ "frac_reward_zero_std": 1.0,
15077
+ "grad_norm": 0.0,
15078
+ "kl": 1.1940217852592467,
15079
+ "learning_rate": 3.759860388663668e-06,
15080
+ "loss": 0.0,
15081
+ "num_tokens": 8313336.0,
15082
+ "reward": 4.099999904632568,
15083
+ "reward_std": 0.0,
15084
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
15085
+ "rewards/coherence_reward_func/std": 0.0,
15086
+ "rewards/formatting_reward_func/mean": 2.0,
15087
+ "rewards/formatting_reward_func/std": 0.0,
15088
+ "rewards/quality_reward_func/mean": 0.800000011920929,
15089
+ "rewards/quality_reward_func/std": 0.0,
15090
+ "step": 5800
15091
+ },
15092
+ {
15093
+ "completion_length": 17.5,
15094
+ "completions/clipped_ratio": 0.0,
15095
+ "completions/max_length": 17.5,
15096
+ "completions/max_terminated_length": 17.5,
15097
+ "completions/mean_length": 17.025,
15098
+ "completions/mean_terminated_length": 17.025,
15099
+ "completions/min_length": 16.6,
15100
+ "completions/min_terminated_length": 16.6,
15101
+ "epoch": 0.39947744774477445,
15102
+ "frac_reward_zero_std": 1.0,
15103
+ "grad_norm": 0.0,
15104
+ "kl": 1.0611320044845343,
15105
+ "learning_rate": 3.754673963463341e-06,
15106
+ "loss": 0.0,
15107
+ "num_tokens": 8327733.0,
15108
+ "reward": 4.099999904632568,
15109
+ "reward_std": 0.0,
15110
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
15111
+ "rewards/coherence_reward_func/std": 0.0,
15112
+ "rewards/formatting_reward_func/mean": 2.0,
15113
+ "rewards/formatting_reward_func/std": 0.0,
15114
+ "rewards/quality_reward_func/mean": 0.800000011920929,
15115
+ "rewards/quality_reward_func/std": 0.0,
15116
+ "step": 5810
15117
+ },
15118
+ {
15119
+ "completion_length": 18.9,
15120
+ "completions/clipped_ratio": 0.0,
15121
+ "completions/max_length": 18.9,
15122
+ "completions/max_terminated_length": 18.9,
15123
+ "completions/mean_length": 17.125,
15124
+ "completions/mean_terminated_length": 17.125,
15125
+ "completions/min_length": 16.1,
15126
+ "completions/min_terminated_length": 16.1,
15127
+ "epoch": 0.40016501650165015,
15128
+ "frac_reward_zero_std": 1.0,
15129
+ "grad_norm": 0.0,
15130
+ "kl": 1.1921575225889682,
15131
+ "learning_rate": 3.749480310273943e-06,
15132
+ "loss": 0.0,
15133
+ "num_tokens": 8341750.0,
15134
+ "reward": 4.099999904632568,
15135
+ "reward_std": 0.0,
15136
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
15137
+ "rewards/coherence_reward_func/std": 0.0,
15138
+ "rewards/formatting_reward_func/mean": 2.0,
15139
+ "rewards/formatting_reward_func/std": 0.0,
15140
+ "rewards/quality_reward_func/mean": 0.800000011920929,
15141
+ "rewards/quality_reward_func/std": 0.0,
15142
+ "step": 5820
15143
+ },
15144
+ {
15145
+ "completion_length": 17.3,
15146
+ "completions/clipped_ratio": 0.0,
15147
+ "completions/max_length": 17.3,
15148
+ "completions/max_terminated_length": 17.3,
15149
+ "completions/mean_length": 16.35,
15150
+ "completions/mean_terminated_length": 16.35,
15151
+ "completions/min_length": 15.6,
15152
+ "completions/min_terminated_length": 15.6,
15153
+ "epoch": 0.40085258525852585,
15154
+ "frac_reward_zero_std": 1.0,
15155
+ "grad_norm": 0.0,
15156
+ "kl": 0.9138251326978206,
15157
+ "learning_rate": 3.7442794590153326e-06,
15158
+ "loss": 0.0,
15159
+ "num_tokens": 8356848.0,
15160
+ "reward": 4.099999904632568,
15161
+ "reward_std": 0.0,
15162
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
15163
+ "rewards/coherence_reward_func/std": 0.0,
15164
+ "rewards/formatting_reward_func/mean": 2.0,
15165
+ "rewards/formatting_reward_func/std": 0.0,
15166
+ "rewards/quality_reward_func/mean": 0.800000011920929,
15167
+ "rewards/quality_reward_func/std": 0.0,
15168
+ "step": 5830
15169
+ },
15170
+ {
15171
+ "completion_length": 20.0,
15172
+ "completions/clipped_ratio": 0.0,
15173
+ "completions/max_length": 20.0,
15174
+ "completions/max_terminated_length": 20.0,
15175
+ "completions/mean_length": 18.2,
15176
+ "completions/mean_terminated_length": 18.2,
15177
+ "completions/min_length": 16.3,
15178
+ "completions/min_terminated_length": 16.3,
15179
+ "epoch": 0.40154015401540155,
15180
+ "frac_reward_zero_std": 1.0,
15181
+ "grad_norm": 0.0,
15182
+ "kl": 1.1536221474409103,
15183
+ "learning_rate": 3.739071439648836e-06,
15184
+ "loss": 0.0,
15185
+ "num_tokens": 8372328.0,
15186
+ "reward": 4.099999904632568,
15187
+ "reward_std": 0.0,
15188
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
15189
+ "rewards/coherence_reward_func/std": 0.0,
15190
+ "rewards/formatting_reward_func/mean": 2.0,
15191
+ "rewards/formatting_reward_func/std": 0.0,
15192
+ "rewards/quality_reward_func/mean": 0.800000011920929,
15193
+ "rewards/quality_reward_func/std": 0.0,
15194
+ "step": 5840
15195
+ },
15196
+ {
15197
+ "completion_length": 19.8,
15198
+ "completions/clipped_ratio": 0.0,
15199
+ "completions/max_length": 19.8,
15200
+ "completions/max_terminated_length": 19.8,
15201
+ "completions/mean_length": 16.325,
15202
+ "completions/mean_terminated_length": 16.325,
15203
+ "completions/min_length": 14.6,
15204
+ "completions/min_terminated_length": 14.6,
15205
+ "epoch": 0.40222772277227725,
15206
+ "frac_reward_zero_std": 1.0,
15207
+ "grad_norm": 0.0,
15208
+ "kl": 1.0150370292365551,
15209
+ "learning_rate": 3.733856282177074e-06,
15210
+ "loss": 0.0,
15211
+ "num_tokens": 8387829.0,
15212
+ "reward": 4.099999904632568,
15213
+ "reward_std": 0.0,
15214
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
15215
+ "rewards/coherence_reward_func/std": 0.0,
15216
+ "rewards/formatting_reward_func/mean": 2.0,
15217
+ "rewards/formatting_reward_func/std": 0.0,
15218
+ "rewards/quality_reward_func/mean": 0.800000011920929,
15219
+ "rewards/quality_reward_func/std": 0.0,
15220
+ "step": 5850
15221
+ },
15222
+ {
15223
+ "completion_length": 19.6,
15224
+ "completions/clipped_ratio": 0.0,
15225
+ "completions/max_length": 19.6,
15226
+ "completions/max_terminated_length": 19.6,
15227
+ "completions/mean_length": 18.0,
15228
+ "completions/mean_terminated_length": 18.0,
15229
+ "completions/min_length": 16.1,
15230
+ "completions/min_terminated_length": 16.1,
15231
+ "epoch": 0.4029152915291529,
15232
+ "frac_reward_zero_std": 1.0,
15233
+ "grad_norm": 0.0,
15234
+ "kl": 1.3025204107165336,
15235
+ "learning_rate": 3.7286340166437907e-06,
15236
+ "loss": 0.0,
15237
+ "num_tokens": 8402069.0,
15238
+ "reward": 4.099999904632568,
15239
+ "reward_std": 0.0,
15240
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
15241
+ "rewards/coherence_reward_func/std": 0.0,
15242
+ "rewards/formatting_reward_func/mean": 2.0,
15243
+ "rewards/formatting_reward_func/std": 0.0,
15244
+ "rewards/quality_reward_func/mean": 0.800000011920929,
15245
+ "rewards/quality_reward_func/std": 0.0,
15246
+ "step": 5860
15247
+ },
15248
+ {
15249
+ "completion_length": 20.2,
15250
+ "completions/clipped_ratio": 0.0,
15251
+ "completions/max_length": 20.2,
15252
+ "completions/max_terminated_length": 20.2,
15253
+ "completions/mean_length": 17.9,
15254
+ "completions/mean_terminated_length": 17.9,
15255
+ "completions/min_length": 16.3,
15256
+ "completions/min_terminated_length": 16.3,
15257
+ "epoch": 0.4036028602860286,
15258
+ "frac_reward_zero_std": 1.0,
15259
+ "grad_norm": 0.0,
15260
+ "kl": 1.1302866250276566,
15261
+ "learning_rate": 3.723404673133674e-06,
15262
+ "loss": 0.0,
15263
+ "num_tokens": 8416929.0,
15264
+ "reward": 4.099999904632568,
15265
+ "reward_std": 0.0,
15266
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
15267
+ "rewards/coherence_reward_func/std": 0.0,
15268
+ "rewards/formatting_reward_func/mean": 2.0,
15269
+ "rewards/formatting_reward_func/std": 0.0,
15270
+ "rewards/quality_reward_func/mean": 0.800000011920929,
15271
+ "rewards/quality_reward_func/std": 0.0,
15272
+ "step": 5870
15273
+ },
15274
+ {
15275
+ "completion_length": 19.3,
15276
+ "completions/clipped_ratio": 0.0,
15277
+ "completions/max_length": 19.3,
15278
+ "completions/max_terminated_length": 19.3,
15279
+ "completions/mean_length": 17.15,
15280
+ "completions/mean_terminated_length": 17.15,
15281
+ "completions/min_length": 16.0,
15282
+ "completions/min_terminated_length": 16.0,
15283
+ "epoch": 0.4042904290429043,
15284
+ "frac_reward_zero_std": 1.0,
15285
+ "grad_norm": 0.0,
15286
+ "kl": 1.0556719139218331,
15287
+ "learning_rate": 3.7181682817721915e-06,
15288
+ "loss": 0.0,
15289
+ "num_tokens": 8433219.0,
15290
+ "reward": 4.099999904632568,
15291
+ "reward_std": 0.0,
15292
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
15293
+ "rewards/coherence_reward_func/std": 0.0,
15294
+ "rewards/formatting_reward_func/mean": 2.0,
15295
+ "rewards/formatting_reward_func/std": 0.0,
15296
+ "rewards/quality_reward_func/mean": 0.800000011920929,
15297
+ "rewards/quality_reward_func/std": 0.0,
15298
+ "step": 5880
15299
+ },
15300
+ {
15301
+ "completion_length": 20.4,
15302
+ "completions/clipped_ratio": 0.0,
15303
+ "completions/max_length": 20.4,
15304
+ "completions/max_terminated_length": 20.4,
15305
+ "completions/mean_length": 17.95,
15306
+ "completions/mean_terminated_length": 17.95,
15307
+ "completions/min_length": 15.5,
15308
+ "completions/min_terminated_length": 15.5,
15309
+ "epoch": 0.40497799779978,
15310
+ "frac_reward_zero_std": 1.0,
15311
+ "grad_norm": 0.0,
15312
+ "kl": 1.1209779269993305,
15313
+ "learning_rate": 3.712924872725411e-06,
15314
+ "loss": 0.0,
15315
+ "num_tokens": 8448301.0,
15316
+ "reward": 4.099999904632568,
15317
+ "reward_std": 0.0,
15318
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
15319
+ "rewards/coherence_reward_func/std": 0.0,
15320
+ "rewards/formatting_reward_func/mean": 2.0,
15321
+ "rewards/formatting_reward_func/std": 0.0,
15322
+ "rewards/quality_reward_func/mean": 0.800000011920929,
15323
+ "rewards/quality_reward_func/std": 0.0,
15324
+ "step": 5890
15325
+ },
15326
+ {
15327
+ "completion_length": 19.9,
15328
+ "completions/clipped_ratio": 0.0,
15329
+ "completions/max_length": 19.9,
15330
+ "completions/max_terminated_length": 19.9,
15331
+ "completions/mean_length": 17.45,
15332
+ "completions/mean_terminated_length": 17.45,
15333
+ "completions/min_length": 15.2,
15334
+ "completions/min_terminated_length": 15.2,
15335
+ "epoch": 0.40566556655665564,
15336
+ "frac_reward_zero_std": 1.0,
15337
+ "grad_norm": 0.0,
15338
+ "kl": 1.1055759094655513,
15339
+ "learning_rate": 3.7076744761998268e-06,
15340
+ "loss": 0.0,
15341
+ "num_tokens": 8461651.0,
15342
+ "reward": 4.099999904632568,
15343
+ "reward_std": 0.0,
15344
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
15345
+ "rewards/coherence_reward_func/std": 0.0,
15346
+ "rewards/formatting_reward_func/mean": 2.0,
15347
+ "rewards/formatting_reward_func/std": 0.0,
15348
+ "rewards/quality_reward_func/mean": 0.800000011920929,
15349
+ "rewards/quality_reward_func/std": 0.0,
15350
+ "step": 5900
15351
+ },
15352
+ {
15353
+ "completion_length": 18.3,
15354
+ "completions/clipped_ratio": 0.0,
15355
+ "completions/max_length": 18.3,
15356
+ "completions/max_terminated_length": 18.3,
15357
+ "completions/mean_length": 16.625,
15358
+ "completions/mean_terminated_length": 16.625,
15359
+ "completions/min_length": 15.4,
15360
+ "completions/min_terminated_length": 15.4,
15361
+ "epoch": 0.40635313531353134,
15362
+ "frac_reward_zero_std": 1.0,
15363
+ "grad_norm": 0.0,
15364
+ "kl": 1.0736303746700286,
15365
+ "learning_rate": 3.7024171224421884e-06,
15366
+ "loss": 0.0,
15367
+ "num_tokens": 8475424.0,
15368
+ "reward": 4.099999904632568,
15369
+ "reward_std": 0.0,
15370
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
15371
+ "rewards/coherence_reward_func/std": 0.0,
15372
+ "rewards/formatting_reward_func/mean": 2.0,
15373
+ "rewards/formatting_reward_func/std": 0.0,
15374
+ "rewards/quality_reward_func/mean": 0.800000011920929,
15375
+ "rewards/quality_reward_func/std": 0.0,
15376
+ "step": 5910
15377
+ },
15378
+ {
15379
+ "completion_length": 17.2,
15380
+ "completions/clipped_ratio": 0.0,
15381
+ "completions/max_length": 17.2,
15382
+ "completions/max_terminated_length": 17.2,
15383
+ "completions/mean_length": 16.425,
15384
+ "completions/mean_terminated_length": 16.425,
15385
+ "completions/min_length": 15.7,
15386
+ "completions/min_terminated_length": 15.7,
15387
+ "epoch": 0.40704070407040704,
15388
+ "frac_reward_zero_std": 1.0,
15389
+ "grad_norm": 0.0,
15390
+ "kl": 1.2711664289236069,
15391
+ "learning_rate": 3.6971528417393254e-06,
15392
+ "loss": 0.0,
15393
+ "num_tokens": 8490933.0,
15394
+ "reward": 4.099999904632568,
15395
+ "reward_std": 0.0,
15396
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
15397
+ "rewards/coherence_reward_func/std": 0.0,
15398
+ "rewards/formatting_reward_func/mean": 2.0,
15399
+ "rewards/formatting_reward_func/std": 0.0,
15400
+ "rewards/quality_reward_func/mean": 0.800000011920929,
15401
+ "rewards/quality_reward_func/std": 0.0,
15402
+ "step": 5920
15403
+ },
15404
+ {
15405
+ "completion_length": 21.2,
15406
+ "completions/clipped_ratio": 0.0,
15407
+ "completions/max_length": 21.2,
15408
+ "completions/max_terminated_length": 21.2,
15409
+ "completions/mean_length": 18.575,
15410
+ "completions/mean_terminated_length": 18.575,
15411
+ "completions/min_length": 15.8,
15412
+ "completions/min_terminated_length": 15.8,
15413
+ "epoch": 0.40772827282728275,
15414
+ "frac_reward_zero_std": 1.0,
15415
+ "grad_norm": 0.0,
15416
+ "kl": 0.9545292537659407,
15417
+ "learning_rate": 3.6918816644179707e-06,
15418
+ "loss": 0.0,
15419
+ "num_tokens": 8504496.0,
15420
+ "reward": 4.099999904632568,
15421
+ "reward_std": 0.0,
15422
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
15423
+ "rewards/coherence_reward_func/std": 0.0,
15424
+ "rewards/formatting_reward_func/mean": 2.0,
15425
+ "rewards/formatting_reward_func/std": 0.0,
15426
+ "rewards/quality_reward_func/mean": 0.800000011920929,
15427
+ "rewards/quality_reward_func/std": 0.0,
15428
+ "step": 5930
15429
+ },
15430
+ {
15431
+ "completion_length": 17.7,
15432
+ "completions/clipped_ratio": 0.0,
15433
+ "completions/max_length": 17.7,
15434
+ "completions/max_terminated_length": 17.7,
15435
+ "completions/mean_length": 16.425,
15436
+ "completions/mean_terminated_length": 16.425,
15437
+ "completions/min_length": 14.7,
15438
+ "completions/min_terminated_length": 14.7,
15439
+ "epoch": 0.4084158415841584,
15440
+ "frac_reward_zero_std": 1.0,
15441
+ "grad_norm": 0.0,
15442
+ "kl": 1.1604718565940857,
15443
+ "learning_rate": 3.686603620844589e-06,
15444
+ "loss": 0.0,
15445
+ "num_tokens": 8517765.0,
15446
+ "reward": 4.099999904632568,
15447
+ "reward_std": 0.0,
15448
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
15449
+ "rewards/coherence_reward_func/std": 0.0,
15450
+ "rewards/formatting_reward_func/mean": 2.0,
15451
+ "rewards/formatting_reward_func/std": 0.0,
15452
+ "rewards/quality_reward_func/mean": 0.800000011920929,
15453
+ "rewards/quality_reward_func/std": 0.0,
15454
+ "step": 5940
15455
+ },
15456
+ {
15457
+ "completion_length": 17.5,
15458
+ "completions/clipped_ratio": 0.0,
15459
+ "completions/max_length": 17.5,
15460
+ "completions/max_terminated_length": 17.5,
15461
+ "completions/mean_length": 16.55,
15462
+ "completions/mean_terminated_length": 16.55,
15463
+ "completions/min_length": 15.5,
15464
+ "completions/min_terminated_length": 15.5,
15465
+ "epoch": 0.4091034103410341,
15466
+ "frac_reward_zero_std": 1.0,
15467
+ "grad_norm": 0.0,
15468
+ "kl": 1.387231619283557,
15469
+ "learning_rate": 3.6813187414252e-06,
15470
+ "loss": 0.0,
15471
+ "num_tokens": 8530935.0,
15472
+ "reward": 4.099999904632568,
15473
+ "reward_std": 0.0,
15474
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
15475
+ "rewards/coherence_reward_func/std": 0.0,
15476
+ "rewards/formatting_reward_func/mean": 2.0,
15477
+ "rewards/formatting_reward_func/std": 0.0,
15478
+ "rewards/quality_reward_func/mean": 0.800000011920929,
15479
+ "rewards/quality_reward_func/std": 0.0,
15480
+ "step": 5950
15481
+ },
15482
+ {
15483
+ "completion_length": 18.0,
15484
+ "completions/clipped_ratio": 0.0,
15485
+ "completions/max_length": 18.0,
15486
+ "completions/max_terminated_length": 18.0,
15487
+ "completions/mean_length": 16.7,
15488
+ "completions/mean_terminated_length": 16.7,
15489
+ "completions/min_length": 15.3,
15490
+ "completions/min_terminated_length": 15.3,
15491
+ "epoch": 0.4097909790979098,
15492
+ "frac_reward_zero_std": 1.0,
15493
+ "grad_norm": 0.0,
15494
+ "kl": 1.3967902317643166,
15495
+ "learning_rate": 3.6760270566052037e-06,
15496
+ "loss": 0.0,
15497
+ "num_tokens": 8544803.0,
15498
+ "reward": 4.099999904632568,
15499
+ "reward_std": 0.0,
15500
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
15501
+ "rewards/coherence_reward_func/std": 0.0,
15502
+ "rewards/formatting_reward_func/mean": 2.0,
15503
+ "rewards/formatting_reward_func/std": 0.0,
15504
+ "rewards/quality_reward_func/mean": 0.800000011920929,
15505
+ "rewards/quality_reward_func/std": 0.0,
15506
+ "step": 5960
15507
+ },
15508
+ {
15509
+ "completion_length": 17.8,
15510
+ "completions/clipped_ratio": 0.0,
15511
+ "completions/max_length": 17.8,
15512
+ "completions/max_terminated_length": 17.8,
15513
+ "completions/mean_length": 16.275,
15514
+ "completions/mean_terminated_length": 16.275,
15515
+ "completions/min_length": 14.7,
15516
+ "completions/min_terminated_length": 14.7,
15517
+ "epoch": 0.4104785478547855,
15518
+ "frac_reward_zero_std": 1.0,
15519
+ "grad_norm": 0.0,
15520
+ "kl": 1.1969308275729418,
15521
+ "learning_rate": 3.670728596869205e-06,
15522
+ "loss": 0.0,
15523
+ "num_tokens": 8558642.0,
15524
+ "reward": 4.099999904632568,
15525
+ "reward_std": 0.0,
15526
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
15527
+ "rewards/coherence_reward_func/std": 0.0,
15528
+ "rewards/formatting_reward_func/mean": 2.0,
15529
+ "rewards/formatting_reward_func/std": 0.0,
15530
+ "rewards/quality_reward_func/mean": 0.800000011920929,
15531
+ "rewards/quality_reward_func/std": 0.0,
15532
+ "step": 5970
15533
+ },
15534
+ {
15535
+ "completion_length": 17.8,
15536
+ "completions/clipped_ratio": 0.0,
15537
+ "completions/max_length": 17.8,
15538
+ "completions/max_terminated_length": 17.8,
15539
+ "completions/mean_length": 15.625,
15540
+ "completions/mean_terminated_length": 15.625,
15541
+ "completions/min_length": 14.0,
15542
+ "completions/min_terminated_length": 14.0,
15543
+ "epoch": 0.4111661166116612,
15544
+ "frac_reward_zero_std": 1.0,
15545
+ "grad_norm": 0.0,
15546
+ "kl": 1.0570856800302864,
15547
+ "learning_rate": 3.6654233927408377e-06,
15548
+ "loss": 0.0,
15549
+ "num_tokens": 8572351.0,
15550
+ "reward": 4.099999904632568,
15551
+ "reward_std": 0.0,
15552
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
15553
+ "rewards/coherence_reward_func/std": 0.0,
15554
+ "rewards/formatting_reward_func/mean": 2.0,
15555
+ "rewards/formatting_reward_func/std": 0.0,
15556
+ "rewards/quality_reward_func/mean": 0.800000011920929,
15557
+ "rewards/quality_reward_func/std": 0.0,
15558
+ "step": 5980
15559
+ },
15560
+ {
15561
+ "completion_length": 19.0,
15562
+ "completions/clipped_ratio": 0.0,
15563
+ "completions/max_length": 19.0,
15564
+ "completions/max_terminated_length": 19.0,
15565
+ "completions/mean_length": 17.35,
15566
+ "completions/mean_terminated_length": 17.35,
15567
+ "completions/min_length": 16.4,
15568
+ "completions/min_terminated_length": 16.4,
15569
+ "epoch": 0.41185368536853684,
15570
+ "frac_reward_zero_std": 1.0,
15571
+ "grad_norm": 0.0,
15572
+ "kl": 1.3443511426448822,
15573
+ "learning_rate": 3.66011147478259e-06,
15574
+ "loss": 0.0,
15575
+ "num_tokens": 8588401.0,
15576
+ "reward": 4.099999904632568,
15577
+ "reward_std": 0.0,
15578
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
15579
+ "rewards/coherence_reward_func/std": 0.0,
15580
+ "rewards/formatting_reward_func/mean": 2.0,
15581
+ "rewards/formatting_reward_func/std": 0.0,
15582
+ "rewards/quality_reward_func/mean": 0.800000011920929,
15583
+ "rewards/quality_reward_func/std": 0.0,
15584
+ "step": 5990
15585
+ },
15586
+ {
15587
+ "completion_length": 17.9,
15588
+ "completions/clipped_ratio": 0.0,
15589
+ "completions/max_length": 17.9,
15590
+ "completions/max_terminated_length": 17.9,
15591
+ "completions/mean_length": 16.775,
15592
+ "completions/mean_terminated_length": 16.775,
15593
+ "completions/min_length": 16.4,
15594
+ "completions/min_terminated_length": 16.4,
15595
+ "epoch": 0.41254125412541254,
15596
+ "frac_reward_zero_std": 1.0,
15597
+ "grad_norm": 0.0,
15598
+ "kl": 1.286434081196785,
15599
+ "learning_rate": 3.654792873595627e-06,
15600
+ "loss": 0.0,
15601
+ "num_tokens": 8604144.0,
15602
+ "reward": 4.099999904632568,
15603
+ "reward_std": 0.0,
15604
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
15605
+ "rewards/coherence_reward_func/std": 0.0,
15606
+ "rewards/formatting_reward_func/mean": 2.0,
15607
+ "rewards/formatting_reward_func/std": 0.0,
15608
+ "rewards/quality_reward_func/mean": 0.800000011920929,
15609
+ "rewards/quality_reward_func/std": 0.0,
15610
+ "step": 6000
15611
  }
15612
  ],
15613
  "logging_steps": 10,
15614
  "max_steps": 14544,
15615
+ "num_input_tokens_seen": 8604144,
15616
  "num_train_epochs": 1,
15617
  "save_steps": 50,
15618
  "stateful_callbacks": {
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:42b42a64fa29ca47bc2e0aa39c0a6a5f4997b48e715b9026d691d0c0901ff35f
3
  size 7057
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b4f3df0c21647ebac4dcd78266f6f25b764a8202748a0b7c0402d7405dc13124
3
  size 7057