Arittro2 commited on
Commit
6899abe
·
verified ·
1 Parent(s): 2c5844d

Upload folder using huggingface_hub

Browse files
adapter_config.json CHANGED
@@ -29,13 +29,13 @@
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
- "q_proj",
33
- "o_proj",
34
  "up_proj",
35
- "v_proj",
36
  "k_proj",
 
37
  "gate_proj",
38
- "down_proj"
 
 
39
  ],
40
  "task_type": "CAUSAL_LM",
41
  "trainable_token_indices": null,
 
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
 
 
32
  "up_proj",
 
33
  "k_proj",
34
+ "down_proj",
35
  "gate_proj",
36
+ "o_proj",
37
+ "v_proj",
38
+ "q_proj"
39
  ],
40
  "task_type": "CAUSAL_LM",
41
  "trainable_token_indices": null,
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f1a10b47ddcdb322971a51ee26080931f9d895c9369d96e2eec9575465dd8d4c
3
  size 262406656
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:34ed7c8122b399a7429458c9bec480cf7e21349f800561dd00ce127f72784813
3
  size 262406656
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dc0655a2bba93259edec89533b0ee6ac4767d92178573b7bdb376f8748e2b6be
3
  size 122872331
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:43e14fe59460f9351b2c2c7e068f3daa559d773b943562f9712360bb2d6fcd69
3
  size 122872331
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7c740099c2ad4b86e09f4a66a39e34520b6c5b4af09b4af62b1704c5c352ab67
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:715fa110eff443e5f62845343f8d9c47ba7633ab807289e3e2a72139c5a3dcb5
3
  size 14645
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6479516a40faee53dfd5a572b83d53f7ef43eb8c5ddb1d3c9c766e863711d9b9
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bc8093c5d8958faf7374afb5677c510aef66de9b3b9dfb56763d6e9c1ab23447
3
  size 1465
trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.9488448844884488,
6
  "eval_steps": 500,
7
- "global_step": 13800,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -35888,11 +35888,531 @@
35888
  "rewards/quality_reward_func/mean": 0.800000011920929,
35889
  "rewards/quality_reward_func/std": 0.0,
35890
  "step": 13800
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35891
  }
35892
  ],
35893
  "logging_steps": 10,
35894
  "max_steps": 14544,
35895
- "num_input_tokens_seen": 19839192,
35896
  "num_train_epochs": 1,
35897
  "save_steps": 50,
35898
  "stateful_callbacks": {
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.9625962596259626,
6
  "eval_steps": 500,
7
+ "global_step": 14000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
35888
  "rewards/quality_reward_func/mean": 0.800000011920929,
35889
  "rewards/quality_reward_func/std": 0.0,
35890
  "step": 13800
35891
+ },
35892
+ {
35893
+ "completion_length": 18.7,
35894
+ "completions/clipped_ratio": 0.0,
35895
+ "completions/max_length": 18.7,
35896
+ "completions/max_terminated_length": 18.7,
35897
+ "completions/mean_length": 16.725,
35898
+ "completions/mean_terminated_length": 16.725,
35899
+ "completions/min_length": 15.3,
35900
+ "completions/min_terminated_length": 15.3,
35901
+ "epoch": 0.9495324532453245,
35902
+ "frac_reward_zero_std": 1.0,
35903
+ "grad_norm": 0.0,
35904
+ "kl": 1.0350565232336522,
35905
+ "learning_rate": 3.8801175392468584e-08,
35906
+ "loss": 0.0,
35907
+ "num_tokens": 19854041.0,
35908
+ "reward": 4.099999904632568,
35909
+ "reward_std": 0.0,
35910
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
35911
+ "rewards/coherence_reward_func/std": 0.0,
35912
+ "rewards/formatting_reward_func/mean": 2.0,
35913
+ "rewards/formatting_reward_func/std": 0.0,
35914
+ "rewards/quality_reward_func/mean": 0.800000011920929,
35915
+ "rewards/quality_reward_func/std": 0.0,
35916
+ "step": 13810
35917
+ },
35918
+ {
35919
+ "completion_length": 18.1,
35920
+ "completions/clipped_ratio": 0.0,
35921
+ "completions/max_length": 18.1,
35922
+ "completions/max_terminated_length": 18.1,
35923
+ "completions/mean_length": 16.9,
35924
+ "completions/mean_terminated_length": 16.9,
35925
+ "completions/min_length": 15.7,
35926
+ "completions/min_terminated_length": 15.7,
35927
+ "epoch": 0.9502200220022002,
35928
+ "frac_reward_zero_std": 1.0,
35929
+ "grad_norm": 0.0,
35930
+ "kl": 1.2483361944556237,
35931
+ "learning_rate": 3.775519104418812e-08,
35932
+ "loss": 0.0,
35933
+ "num_tokens": 19869149.0,
35934
+ "reward": 4.099999904632568,
35935
+ "reward_std": 0.0,
35936
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
35937
+ "rewards/coherence_reward_func/std": 0.0,
35938
+ "rewards/formatting_reward_func/mean": 2.0,
35939
+ "rewards/formatting_reward_func/std": 0.0,
35940
+ "rewards/quality_reward_func/mean": 0.800000011920929,
35941
+ "rewards/quality_reward_func/std": 0.0,
35942
+ "step": 13820
35943
+ },
35944
+ {
35945
+ "completion_length": 18.0,
35946
+ "completions/clipped_ratio": 0.0,
35947
+ "completions/max_length": 18.0,
35948
+ "completions/max_terminated_length": 18.0,
35949
+ "completions/mean_length": 16.775,
35950
+ "completions/mean_terminated_length": 16.775,
35951
+ "completions/min_length": 15.7,
35952
+ "completions/min_terminated_length": 15.7,
35953
+ "epoch": 0.9509075907590759,
35954
+ "frac_reward_zero_std": 1.0,
35955
+ "grad_norm": 0.0,
35956
+ "kl": 1.2970769941806792,
35957
+ "learning_rate": 3.672339132003211e-08,
35958
+ "loss": 0.0,
35959
+ "num_tokens": 19886136.0,
35960
+ "reward": 4.099999904632568,
35961
+ "reward_std": 0.0,
35962
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
35963
+ "rewards/coherence_reward_func/std": 0.0,
35964
+ "rewards/formatting_reward_func/mean": 2.0,
35965
+ "rewards/formatting_reward_func/std": 0.0,
35966
+ "rewards/quality_reward_func/mean": 0.800000011920929,
35967
+ "rewards/quality_reward_func/std": 0.0,
35968
+ "step": 13830
35969
+ },
35970
+ {
35971
+ "completion_length": 20.5,
35972
+ "completions/clipped_ratio": 0.0,
35973
+ "completions/max_length": 20.5,
35974
+ "completions/max_terminated_length": 20.5,
35975
+ "completions/mean_length": 18.35,
35976
+ "completions/mean_terminated_length": 18.35,
35977
+ "completions/min_length": 16.3,
35978
+ "completions/min_terminated_length": 16.3,
35979
+ "epoch": 0.9515951595159516,
35980
+ "frac_reward_zero_std": 1.0,
35981
+ "grad_norm": 0.0,
35982
+ "kl": 1.3883480228483678,
35983
+ "learning_rate": 3.5705782164044135e-08,
35984
+ "loss": 0.0001,
35985
+ "num_tokens": 19897482.0,
35986
+ "reward": 4.099999904632568,
35987
+ "reward_std": 0.0,
35988
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
35989
+ "rewards/coherence_reward_func/std": 0.0,
35990
+ "rewards/formatting_reward_func/mean": 2.0,
35991
+ "rewards/formatting_reward_func/std": 0.0,
35992
+ "rewards/quality_reward_func/mean": 0.800000011920929,
35993
+ "rewards/quality_reward_func/std": 0.0,
35994
+ "step": 13840
35995
+ },
35996
+ {
35997
+ "completion_length": 15.6,
35998
+ "completions/clipped_ratio": 0.0,
35999
+ "completions/max_length": 15.6,
36000
+ "completions/max_terminated_length": 15.6,
36001
+ "completions/mean_length": 15.25,
36002
+ "completions/mean_terminated_length": 15.25,
36003
+ "completions/min_length": 14.9,
36004
+ "completions/min_terminated_length": 14.9,
36005
+ "epoch": 0.9522827282728272,
36006
+ "frac_reward_zero_std": 1.0,
36007
+ "grad_norm": 0.0,
36008
+ "kl": 0.9344463728368282,
36009
+ "learning_rate": 3.470236943851929e-08,
36010
+ "loss": 0.0,
36011
+ "num_tokens": 19910592.0,
36012
+ "reward": 4.099999904632568,
36013
+ "reward_std": 0.0,
36014
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
36015
+ "rewards/coherence_reward_func/std": 0.0,
36016
+ "rewards/formatting_reward_func/mean": 2.0,
36017
+ "rewards/formatting_reward_func/std": 0.0,
36018
+ "rewards/quality_reward_func/mean": 0.800000011920929,
36019
+ "rewards/quality_reward_func/std": 0.0,
36020
+ "step": 13850
36021
+ },
36022
+ {
36023
+ "completion_length": 18.2,
36024
+ "completions/clipped_ratio": 0.0,
36025
+ "completions/max_length": 18.2,
36026
+ "completions/max_terminated_length": 18.2,
36027
+ "completions/mean_length": 16.75,
36028
+ "completions/mean_terminated_length": 16.75,
36029
+ "completions/min_length": 15.6,
36030
+ "completions/min_terminated_length": 15.6,
36031
+ "epoch": 0.9529702970297029,
36032
+ "frac_reward_zero_std": 1.0,
36033
+ "grad_norm": 0.0,
36034
+ "kl": 1.0786833353340626,
36035
+ "learning_rate": 3.371315892396698e-08,
36036
+ "loss": 0.0,
36037
+ "num_tokens": 19924222.0,
36038
+ "reward": 4.099999904632568,
36039
+ "reward_std": 0.0,
36040
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
36041
+ "rewards/coherence_reward_func/std": 0.0,
36042
+ "rewards/formatting_reward_func/mean": 2.0,
36043
+ "rewards/formatting_reward_func/std": 0.0,
36044
+ "rewards/quality_reward_func/mean": 0.800000011920929,
36045
+ "rewards/quality_reward_func/std": 0.0,
36046
+ "step": 13860
36047
+ },
36048
+ {
36049
+ "completion_length": 20.6,
36050
+ "completions/clipped_ratio": 0.0,
36051
+ "completions/max_length": 20.6,
36052
+ "completions/max_terminated_length": 20.6,
36053
+ "completions/mean_length": 18.65,
36054
+ "completions/mean_terminated_length": 18.65,
36055
+ "completions/min_length": 16.7,
36056
+ "completions/min_terminated_length": 16.7,
36057
+ "epoch": 0.9536578657865786,
36058
+ "frac_reward_zero_std": 1.0,
36059
+ "grad_norm": 0.0,
36060
+ "kl": 1.420877918601036,
36061
+ "learning_rate": 3.2738156319082336e-08,
36062
+ "loss": 0.0001,
36063
+ "num_tokens": 19939312.0,
36064
+ "reward": 4.099999904632568,
36065
+ "reward_std": 0.0,
36066
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
36067
+ "rewards/coherence_reward_func/std": 0.0,
36068
+ "rewards/formatting_reward_func/mean": 2.0,
36069
+ "rewards/formatting_reward_func/std": 0.0,
36070
+ "rewards/quality_reward_func/mean": 0.800000011920929,
36071
+ "rewards/quality_reward_func/std": 0.0,
36072
+ "step": 13870
36073
+ },
36074
+ {
36075
+ "completion_length": 18.8,
36076
+ "completions/clipped_ratio": 0.0,
36077
+ "completions/max_length": 18.8,
36078
+ "completions/max_terminated_length": 18.8,
36079
+ "completions/mean_length": 17.35,
36080
+ "completions/mean_terminated_length": 17.35,
36081
+ "completions/min_length": 16.0,
36082
+ "completions/min_terminated_length": 16.0,
36083
+ "epoch": 0.9543454345434543,
36084
+ "frac_reward_zero_std": 1.0,
36085
+ "grad_norm": 0.0,
36086
+ "kl": 1.182000921666622,
36087
+ "learning_rate": 3.1777367240708455e-08,
36088
+ "loss": 0.0,
36089
+ "num_tokens": 19953590.0,
36090
+ "reward": 4.099999904632568,
36091
+ "reward_std": 0.0,
36092
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
36093
+ "rewards/coherence_reward_func/std": 0.0,
36094
+ "rewards/formatting_reward_func/mean": 2.0,
36095
+ "rewards/formatting_reward_func/std": 0.0,
36096
+ "rewards/quality_reward_func/mean": 0.800000011920929,
36097
+ "rewards/quality_reward_func/std": 0.0,
36098
+ "step": 13880
36099
+ },
36100
+ {
36101
+ "completion_length": 18.6,
36102
+ "completions/clipped_ratio": 0.0,
36103
+ "completions/max_length": 18.6,
36104
+ "completions/max_terminated_length": 18.6,
36105
+ "completions/mean_length": 17.4,
36106
+ "completions/mean_terminated_length": 17.4,
36107
+ "completions/min_length": 16.1,
36108
+ "completions/min_terminated_length": 16.1,
36109
+ "epoch": 0.95503300330033,
36110
+ "frac_reward_zero_std": 1.0,
36111
+ "grad_norm": 0.0,
36112
+ "kl": 1.1275596469640732,
36113
+ "learning_rate": 3.0830797223808106e-08,
36114
+ "loss": 0.0,
36115
+ "num_tokens": 19969106.0,
36116
+ "reward": 4.099999904632568,
36117
+ "reward_std": 0.0,
36118
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
36119
+ "rewards/coherence_reward_func/std": 0.0,
36120
+ "rewards/formatting_reward_func/mean": 2.0,
36121
+ "rewards/formatting_reward_func/std": 0.0,
36122
+ "rewards/quality_reward_func/mean": 0.800000011920929,
36123
+ "rewards/quality_reward_func/std": 0.0,
36124
+ "step": 13890
36125
+ },
36126
+ {
36127
+ "completion_length": 20.7,
36128
+ "completions/clipped_ratio": 0.0,
36129
+ "completions/max_length": 20.7,
36130
+ "completions/max_terminated_length": 20.7,
36131
+ "completions/mean_length": 18.1,
36132
+ "completions/mean_terminated_length": 18.1,
36133
+ "completions/min_length": 16.5,
36134
+ "completions/min_terminated_length": 16.5,
36135
+ "epoch": 0.9557205720572057,
36136
+ "frac_reward_zero_std": 1.0,
36137
+ "grad_norm": 0.0,
36138
+ "kl": 1.4078487813472749,
36139
+ "learning_rate": 2.989845172142958e-08,
36140
+ "loss": 0.0001,
36141
+ "num_tokens": 19985234.0,
36142
+ "reward": 4.099999904632568,
36143
+ "reward_std": 0.0,
36144
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
36145
+ "rewards/coherence_reward_func/std": 0.0,
36146
+ "rewards/formatting_reward_func/mean": 2.0,
36147
+ "rewards/formatting_reward_func/std": 0.0,
36148
+ "rewards/quality_reward_func/mean": 0.800000011920929,
36149
+ "rewards/quality_reward_func/std": 0.0,
36150
+ "step": 13900
36151
+ },
36152
+ {
36153
+ "completion_length": 17.5,
36154
+ "completions/clipped_ratio": 0.0,
36155
+ "completions/max_length": 17.5,
36156
+ "completions/max_terminated_length": 17.5,
36157
+ "completions/mean_length": 16.625,
36158
+ "completions/mean_terminated_length": 16.625,
36159
+ "completions/min_length": 16.1,
36160
+ "completions/min_terminated_length": 16.1,
36161
+ "epoch": 0.9564081408140814,
36162
+ "frac_reward_zero_std": 1.0,
36163
+ "grad_norm": 0.0,
36164
+ "kl": 0.7960809737443924,
36165
+ "learning_rate": 2.89803361046756e-08,
36166
+ "loss": 0.0,
36167
+ "num_tokens": 20002139.0,
36168
+ "reward": 4.099999904632568,
36169
+ "reward_std": 0.0,
36170
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
36171
+ "rewards/coherence_reward_func/std": 0.0,
36172
+ "rewards/formatting_reward_func/mean": 2.0,
36173
+ "rewards/formatting_reward_func/std": 0.0,
36174
+ "rewards/quality_reward_func/mean": 0.800000011920929,
36175
+ "rewards/quality_reward_func/std": 0.0,
36176
+ "step": 13910
36177
+ },
36178
+ {
36179
+ "completion_length": 18.8,
36180
+ "completions/clipped_ratio": 0.0,
36181
+ "completions/max_length": 18.8,
36182
+ "completions/max_terminated_length": 18.8,
36183
+ "completions/mean_length": 16.975,
36184
+ "completions/mean_terminated_length": 16.975,
36185
+ "completions/min_length": 15.8,
36186
+ "completions/min_terminated_length": 15.8,
36187
+ "epoch": 0.9570957095709571,
36188
+ "frac_reward_zero_std": 1.0,
36189
+ "grad_norm": 0.0,
36190
+ "kl": 0.9877739049494266,
36191
+ "learning_rate": 2.8076455662673363e-08,
36192
+ "loss": 0.0,
36193
+ "num_tokens": 20019710.0,
36194
+ "reward": 4.099999904632568,
36195
+ "reward_std": 0.0,
36196
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
36197
+ "rewards/coherence_reward_func/std": 0.0,
36198
+ "rewards/formatting_reward_func/mean": 2.0,
36199
+ "rewards/formatting_reward_func/std": 0.0,
36200
+ "rewards/quality_reward_func/mean": 0.800000011920929,
36201
+ "rewards/quality_reward_func/std": 0.0,
36202
+ "step": 13920
36203
+ },
36204
+ {
36205
+ "completion_length": 20.1,
36206
+ "completions/clipped_ratio": 0.0,
36207
+ "completions/max_length": 20.1,
36208
+ "completions/max_terminated_length": 20.1,
36209
+ "completions/mean_length": 18.55,
36210
+ "completions/mean_terminated_length": 18.55,
36211
+ "completions/min_length": 16.9,
36212
+ "completions/min_terminated_length": 16.9,
36213
+ "epoch": 0.9577832783278328,
36214
+ "frac_reward_zero_std": 1.0,
36215
+ "grad_norm": 0.0,
36216
+ "kl": 1.1007904268801212,
36217
+ "learning_rate": 2.7186815602542606e-08,
36218
+ "loss": 0.0,
36219
+ "num_tokens": 20035352.0,
36220
+ "reward": 4.099999904632568,
36221
+ "reward_std": 0.0,
36222
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
36223
+ "rewards/coherence_reward_func/std": 0.0,
36224
+ "rewards/formatting_reward_func/mean": 2.0,
36225
+ "rewards/formatting_reward_func/std": 0.0,
36226
+ "rewards/quality_reward_func/mean": 0.800000011920929,
36227
+ "rewards/quality_reward_func/std": 0.0,
36228
+ "step": 13930
36229
+ },
36230
+ {
36231
+ "completion_length": 18.4,
36232
+ "completions/clipped_ratio": 0.0,
36233
+ "completions/max_length": 18.4,
36234
+ "completions/max_terminated_length": 18.4,
36235
+ "completions/mean_length": 15.9,
36236
+ "completions/mean_terminated_length": 15.9,
36237
+ "completions/min_length": 14.8,
36238
+ "completions/min_terminated_length": 14.8,
36239
+ "epoch": 0.9584708470847084,
36240
+ "frac_reward_zero_std": 1.0,
36241
+ "grad_norm": 0.0,
36242
+ "kl": 1.0643165530636907,
36243
+ "learning_rate": 2.6311421049366736e-08,
36244
+ "loss": 0.0,
36245
+ "num_tokens": 20046580.0,
36246
+ "reward": 4.099999904632568,
36247
+ "reward_std": 0.0,
36248
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
36249
+ "rewards/coherence_reward_func/std": 0.0,
36250
+ "rewards/formatting_reward_func/mean": 2.0,
36251
+ "rewards/formatting_reward_func/std": 0.0,
36252
+ "rewards/quality_reward_func/mean": 0.800000011920929,
36253
+ "rewards/quality_reward_func/std": 0.0,
36254
+ "step": 13940
36255
+ },
36256
+ {
36257
+ "completion_length": 19.0,
36258
+ "completions/clipped_ratio": 0.0,
36259
+ "completions/max_length": 19.0,
36260
+ "completions/max_terminated_length": 19.0,
36261
+ "completions/mean_length": 16.9,
36262
+ "completions/mean_terminated_length": 16.9,
36263
+ "completions/min_length": 15.5,
36264
+ "completions/min_terminated_length": 15.5,
36265
+ "epoch": 0.9591584158415841,
36266
+ "frac_reward_zero_std": 1.0,
36267
+ "grad_norm": 0.0,
36268
+ "kl": 1.126106108725071,
36269
+ "learning_rate": 2.5450277046162874e-08,
36270
+ "loss": 0.0,
36271
+ "num_tokens": 20061356.0,
36272
+ "reward": 4.099999904632568,
36273
+ "reward_std": 0.0,
36274
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
36275
+ "rewards/coherence_reward_func/std": 0.0,
36276
+ "rewards/formatting_reward_func/mean": 2.0,
36277
+ "rewards/formatting_reward_func/std": 0.0,
36278
+ "rewards/quality_reward_func/mean": 0.800000011920929,
36279
+ "rewards/quality_reward_func/std": 0.0,
36280
+ "step": 13950
36281
+ },
36282
+ {
36283
+ "completion_length": 19.7,
36284
+ "completions/clipped_ratio": 0.0,
36285
+ "completions/max_length": 19.7,
36286
+ "completions/max_terminated_length": 19.7,
36287
+ "completions/mean_length": 18.05,
36288
+ "completions/mean_terminated_length": 18.05,
36289
+ "completions/min_length": 17.1,
36290
+ "completions/min_terminated_length": 17.1,
36291
+ "epoch": 0.9598459845984598,
36292
+ "frac_reward_zero_std": 1.0,
36293
+ "grad_norm": 0.0,
36294
+ "kl": 0.9622666202485561,
36295
+ "learning_rate": 2.460338855385297e-08,
36296
+ "loss": 0.0,
36297
+ "num_tokens": 20075018.0,
36298
+ "reward": 4.099999904632568,
36299
+ "reward_std": 0.0,
36300
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
36301
+ "rewards/coherence_reward_func/std": 0.0,
36302
+ "rewards/formatting_reward_func/mean": 2.0,
36303
+ "rewards/formatting_reward_func/std": 0.0,
36304
+ "rewards/quality_reward_func/mean": 0.800000011920929,
36305
+ "rewards/quality_reward_func/std": 0.0,
36306
+ "step": 13960
36307
+ },
36308
+ {
36309
+ "completion_length": 17.2,
36310
+ "completions/clipped_ratio": 0.0,
36311
+ "completions/max_length": 17.2,
36312
+ "completions/max_terminated_length": 17.2,
36313
+ "completions/mean_length": 15.55,
36314
+ "completions/mean_terminated_length": 15.55,
36315
+ "completions/min_length": 14.6,
36316
+ "completions/min_terminated_length": 14.6,
36317
+ "epoch": 0.9605335533553355,
36318
+ "frac_reward_zero_std": 1.0,
36319
+ "grad_norm": 0.0,
36320
+ "kl": 1.032901889272034,
36321
+ "learning_rate": 2.3770760451234665e-08,
36322
+ "loss": 0.0,
36323
+ "num_tokens": 20089012.0,
36324
+ "reward": 4.099999904632568,
36325
+ "reward_std": 0.0,
36326
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
36327
+ "rewards/coherence_reward_func/std": 0.0,
36328
+ "rewards/formatting_reward_func/mean": 2.0,
36329
+ "rewards/formatting_reward_func/std": 0.0,
36330
+ "rewards/quality_reward_func/mean": 0.800000011920929,
36331
+ "rewards/quality_reward_func/std": 0.0,
36332
+ "step": 13970
36333
+ },
36334
+ {
36335
+ "completion_length": 20.5,
36336
+ "completions/clipped_ratio": 0.0,
36337
+ "completions/max_length": 20.5,
36338
+ "completions/max_terminated_length": 20.5,
36339
+ "completions/mean_length": 18.0,
36340
+ "completions/mean_terminated_length": 18.0,
36341
+ "completions/min_length": 16.0,
36342
+ "completions/min_terminated_length": 16.0,
36343
+ "epoch": 0.9612211221122112,
36344
+ "frac_reward_zero_std": 1.0,
36345
+ "grad_norm": 0.0,
36346
+ "kl": 1.3917377760633827,
36347
+ "learning_rate": 2.2952397534954097e-08,
36348
+ "loss": 0.0001,
36349
+ "num_tokens": 20102060.0,
36350
+ "reward": 4.099999904632568,
36351
+ "reward_std": 0.0,
36352
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
36353
+ "rewards/coherence_reward_func/std": 0.0,
36354
+ "rewards/formatting_reward_func/mean": 2.0,
36355
+ "rewards/formatting_reward_func/std": 0.0,
36356
+ "rewards/quality_reward_func/mean": 0.800000011920929,
36357
+ "rewards/quality_reward_func/std": 0.0,
36358
+ "step": 13980
36359
+ },
36360
+ {
36361
+ "completion_length": 17.1,
36362
+ "completions/clipped_ratio": 0.0,
36363
+ "completions/max_length": 17.1,
36364
+ "completions/max_terminated_length": 17.1,
36365
+ "completions/mean_length": 15.55,
36366
+ "completions/mean_terminated_length": 15.55,
36367
+ "completions/min_length": 14.3,
36368
+ "completions/min_terminated_length": 14.3,
36369
+ "epoch": 0.9619086908690869,
36370
+ "frac_reward_zero_std": 1.0,
36371
+ "grad_norm": 0.0,
36372
+ "kl": 1.2739990446716547,
36373
+ "learning_rate": 2.214830451947786e-08,
36374
+ "loss": 0.0,
36375
+ "num_tokens": 20116294.0,
36376
+ "reward": 4.099999904632568,
36377
+ "reward_std": 0.0,
36378
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
36379
+ "rewards/coherence_reward_func/std": 0.0,
36380
+ "rewards/formatting_reward_func/mean": 2.0,
36381
+ "rewards/formatting_reward_func/std": 0.0,
36382
+ "rewards/quality_reward_func/mean": 0.800000011920929,
36383
+ "rewards/quality_reward_func/std": 0.0,
36384
+ "step": 13990
36385
+ },
36386
+ {
36387
+ "completion_length": 17.7,
36388
+ "completions/clipped_ratio": 0.0,
36389
+ "completions/max_length": 17.7,
36390
+ "completions/max_terminated_length": 17.7,
36391
+ "completions/mean_length": 16.875,
36392
+ "completions/mean_terminated_length": 16.875,
36393
+ "completions/min_length": 16.1,
36394
+ "completions/min_terminated_length": 16.1,
36395
+ "epoch": 0.9625962596259626,
36396
+ "frac_reward_zero_std": 1.0,
36397
+ "grad_norm": 0.0,
36398
+ "kl": 1.1548074826598167,
36399
+ "learning_rate": 2.1358486037065253e-08,
36400
+ "loss": 0.0,
36401
+ "num_tokens": 20130169.0,
36402
+ "reward": 4.099999904632568,
36403
+ "reward_std": 0.0,
36404
+ "rewards/coherence_reward_func/mean": 1.2999999523162842,
36405
+ "rewards/coherence_reward_func/std": 0.0,
36406
+ "rewards/formatting_reward_func/mean": 2.0,
36407
+ "rewards/formatting_reward_func/std": 0.0,
36408
+ "rewards/quality_reward_func/mean": 0.800000011920929,
36409
+ "rewards/quality_reward_func/std": 0.0,
36410
+ "step": 14000
36411
  }
36412
  ],
36413
  "logging_steps": 10,
36414
  "max_steps": 14544,
36415
+ "num_input_tokens_seen": 20130169,
36416
  "num_train_epochs": 1,
36417
  "save_steps": 50,
36418
  "stateful_callbacks": {
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d555ad97383c64657789e880346a88f8c6b6a63980961fc6011db199d7a6a11d
3
  size 7057
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:834eb2c805e6cbc223d894fc25ac7ea495fdb7ae28416408c469f17593a544fc
3
  size 7057