BRlkl commited on
Commit
c019c2f
·
verified ·
1 Parent(s): 8c36649

full-state checkpoint 10-percent (step 75)

Browse files
ckpt-10-percent/adapter_config.json CHANGED
@@ -33,13 +33,13 @@
33
  "rank_pattern": {},
34
  "revision": null,
35
  "target_modules": [
36
- "v_proj",
37
  "up_proj",
38
- "q_proj",
39
- "k_proj",
40
  "o_proj",
41
  "gate_proj",
42
- "down_proj"
 
 
 
43
  ],
44
  "target_parameters": null,
45
  "task_type": "CAUSAL_LM",
 
33
  "rank_pattern": {},
34
  "revision": null,
35
  "target_modules": [
 
36
  "up_proj",
 
 
37
  "o_proj",
38
  "gate_proj",
39
+ "down_proj",
40
+ "v_proj",
41
+ "k_proj",
42
+ "q_proj"
43
  ],
44
  "target_parameters": null,
45
  "task_type": "CAUSAL_LM",
ckpt-10-percent/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:284a02ea2fa500520f982be5d640b88cf69df89613a9d08b35f3c03477166ae8
3
  size 528550256
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0f59456b130c136a1e74d092ac27b7e24d7bd6a28968f8e4d30ecf1dd2f2a6af
3
  size 528550256
ckpt-10-percent/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:70f986b0e013e0c8a3a70c9c0465167f804fd40e54dc5bcfca3169f0237c29a6
3
+ size 268963141
ckpt-10-percent/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3384fda721d87cf13c70812a8bea527841773ed994afaaf23bda164b902877b
3
+ size 14709
ckpt-10-percent/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:620e365c24703d8eb8dcba3ff0a9ceabbb8471df3a45aaf4beb33a3f78143275
3
+ size 1465
ckpt-10-percent/trainer_state.json ADDED
@@ -0,0 +1,1662 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.2,
6
+ "eval_steps": 500,
7
+ "global_step": 75,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "completion_length": 390.8525085449219,
14
+ "completions/clipped_ratio": 0.022499999031424522,
15
+ "completions/max_length": 2048.0,
16
+ "completions/max_terminated_length": 1494.0,
17
+ "completions/mean_length": 391.8299865722656,
18
+ "completions/mean_terminated_length": 353.70843505859375,
19
+ "completions/min_length": 36.0,
20
+ "completions/min_terminated_length": 36.0,
21
+ "epoch": 0.0026666666666666666,
22
+ "frac_reward_zero_std": 0.0,
23
+ "grad_norm": 0.23882409930229187,
24
+ "kl": 0.0,
25
+ "learning_rate": 1e-05,
26
+ "loss": -0.0,
27
+ "num_tokens": 274622.0,
28
+ "reward": -0.3036116659641266,
29
+ "reward_std": 1.1926226615905762,
30
+ "rewards/multidomain_reward_func/mean": -0.3036116659641266,
31
+ "rewards/multidomain_reward_func/std": 1.5280684232711792,
32
+ "step": 1
33
+ },
34
+ {
35
+ "completion_length": 429.7850082397461,
36
+ "completions/clipped_ratio": 0.042500000447034836,
37
+ "completions/max_length": 2048.0,
38
+ "completions/max_terminated_length": 1361.0,
39
+ "completions/mean_length": 430.74249267578125,
40
+ "completions/mean_terminated_length": 358.9582214355469,
41
+ "completions/min_length": 18.0,
42
+ "completions/min_terminated_length": 18.0,
43
+ "epoch": 0.005333333333333333,
44
+ "frac_reward_zero_std": 0.0,
45
+ "grad_norm": 0.2470335215330124,
46
+ "kl": 0.0,
47
+ "learning_rate": 9.986666666666667e-06,
48
+ "loss": 0.0,
49
+ "num_tokens": 571689.0,
50
+ "reward": -0.1849958449602127,
51
+ "reward_std": 1.1945927143096924,
52
+ "rewards/multidomain_reward_func/mean": -0.1849958449602127,
53
+ "rewards/multidomain_reward_func/std": 1.7641037702560425,
54
+ "step": 2
55
+ },
56
+ {
57
+ "completion_length": 469.2000045776367,
58
+ "completions/clipped_ratio": 0.0625,
59
+ "completions/max_length": 2048.0,
60
+ "completions/max_terminated_length": 1267.0,
61
+ "completions/mean_length": 470.1374816894531,
62
+ "completions/mean_terminated_length": 364.9466552734375,
63
+ "completions/min_length": 79.0,
64
+ "completions/min_terminated_length": 79.0,
65
+ "epoch": 0.008,
66
+ "frac_reward_zero_std": 0.09999999403953552,
67
+ "grad_norm": 0.2603771984577179,
68
+ "kl": 0.0,
69
+ "learning_rate": 9.973333333333333e-06,
70
+ "loss": -0.0,
71
+ "num_tokens": 898834.0,
72
+ "reward": -0.8332051634788513,
73
+ "reward_std": 0.9539631605148315,
74
+ "rewards/multidomain_reward_func/mean": -0.8332051038742065,
75
+ "rewards/multidomain_reward_func/std": 2.1196253299713135,
76
+ "step": 3
77
+ },
78
+ {
79
+ "completion_length": 391.4550064086914,
80
+ "completions/clipped_ratio": 0.004999999888241291,
81
+ "completions/max_length": 2048.0,
82
+ "completions/max_terminated_length": 1078.0,
83
+ "completions/mean_length": 392.4499816894531,
84
+ "completions/mean_terminated_length": 384.1306457519531,
85
+ "completions/min_length": 63.0,
86
+ "completions/min_terminated_length": 63.0,
87
+ "epoch": 0.010666666666666666,
88
+ "frac_reward_zero_std": 0.0,
89
+ "grad_norm": 0.30947721004486084,
90
+ "kl": 0.0,
91
+ "learning_rate": 9.960000000000001e-06,
92
+ "loss": -0.0,
93
+ "num_tokens": 1170514.0,
94
+ "reward": 0.25277450680732727,
95
+ "reward_std": 0.9387752413749695,
96
+ "rewards/multidomain_reward_func/mean": 0.25277450680732727,
97
+ "rewards/multidomain_reward_func/std": 1.2881438732147217,
98
+ "step": 4
99
+ },
100
+ {
101
+ "completion_length": 414.002507019043,
102
+ "completions/clipped_ratio": 0.01249999925494194,
103
+ "completions/max_length": 2048.0,
104
+ "completions/max_terminated_length": 1733.0,
105
+ "completions/mean_length": 414.989990234375,
106
+ "completions/mean_terminated_length": 394.3190002441406,
107
+ "completions/min_length": 11.0,
108
+ "completions/min_terminated_length": 11.0,
109
+ "epoch": 0.013333333333333334,
110
+ "frac_reward_zero_std": 0.0,
111
+ "grad_norm": 0.23343618214130402,
112
+ "kl": 0.0,
113
+ "learning_rate": 9.946666666666667e-06,
114
+ "loss": -0.0,
115
+ "num_tokens": 1454280.0,
116
+ "reward": 0.15532104671001434,
117
+ "reward_std": 0.8334585428237915,
118
+ "rewards/multidomain_reward_func/mean": 0.15532104671001434,
119
+ "rewards/multidomain_reward_func/std": 1.405861258506775,
120
+ "step": 5
121
+ },
122
+ {
123
+ "completion_length": 417.9025054931641,
124
+ "completions/clipped_ratio": 0.014999999664723873,
125
+ "completions/max_length": 2048.0,
126
+ "completions/max_terminated_length": 1255.0,
127
+ "completions/mean_length": 418.8874816894531,
128
+ "completions/mean_terminated_length": 394.07867431640625,
129
+ "completions/min_length": 36.0,
130
+ "completions/min_terminated_length": 36.0,
131
+ "epoch": 0.016,
132
+ "frac_reward_zero_std": 0.0,
133
+ "grad_norm": 0.15190428495407104,
134
+ "kl": 0.0,
135
+ "learning_rate": 9.933333333333334e-06,
136
+ "loss": -0.0,
137
+ "num_tokens": 1732945.0,
138
+ "reward": 0.1933300942182541,
139
+ "reward_std": 1.0449475049972534,
140
+ "rewards/multidomain_reward_func/mean": 0.19333010911941528,
141
+ "rewards/multidomain_reward_func/std": 1.4342669248580933,
142
+ "step": 6
143
+ },
144
+ {
145
+ "completion_length": 421.90250396728516,
146
+ "completions/clipped_ratio": 0.007499999832361937,
147
+ "completions/max_length": 2048.0,
148
+ "completions/max_terminated_length": 1837.0,
149
+ "completions/mean_length": 422.8949890136719,
150
+ "completions/mean_terminated_length": 410.6145935058594,
151
+ "completions/min_length": 18.0,
152
+ "completions/min_terminated_length": 18.0,
153
+ "epoch": 0.018666666666666668,
154
+ "frac_reward_zero_std": 0.0,
155
+ "grad_norm": 0.1601637899875641,
156
+ "kl": 0.0,
157
+ "learning_rate": 9.920000000000002e-06,
158
+ "loss": 0.0,
159
+ "num_tokens": 2018863.0,
160
+ "reward": 0.14378485083580017,
161
+ "reward_std": 0.9768227934837341,
162
+ "rewards/multidomain_reward_func/mean": 0.14378486573696136,
163
+ "rewards/multidomain_reward_func/std": 1.4048492908477783,
164
+ "step": 7
165
+ },
166
+ {
167
+ "completion_length": 391.8425033569336,
168
+ "completions/clipped_ratio": 0.004999999888241291,
169
+ "completions/max_length": 2048.0,
170
+ "completions/max_terminated_length": 1118.0,
171
+ "completions/mean_length": 392.8374938964844,
172
+ "completions/mean_terminated_length": 384.5201110839844,
173
+ "completions/min_length": 33.0,
174
+ "completions/min_terminated_length": 33.0,
175
+ "epoch": 0.021333333333333333,
176
+ "frac_reward_zero_std": 0.0,
177
+ "grad_norm": 0.31883543729782104,
178
+ "kl": 0.0,
179
+ "learning_rate": 9.906666666666668e-06,
180
+ "loss": 0.0,
181
+ "num_tokens": 2290948.0,
182
+ "reward": 0.26540714502334595,
183
+ "reward_std": 0.9163522720336914,
184
+ "rewards/multidomain_reward_func/mean": 0.26540714502334595,
185
+ "rewards/multidomain_reward_func/std": 1.2766904830932617,
186
+ "step": 8
187
+ },
188
+ {
189
+ "completion_length": 426.43250579833983,
190
+ "completions/clipped_ratio": 0.007499999832361937,
191
+ "completions/max_length": 2048.0,
192
+ "completions/max_terminated_length": 1471.0,
193
+ "completions/mean_length": 427.42498779296875,
194
+ "completions/mean_terminated_length": 415.1788330078125,
195
+ "completions/min_length": 36.0,
196
+ "completions/min_terminated_length": 36.0,
197
+ "epoch": 0.024,
198
+ "frac_reward_zero_std": 0.0,
199
+ "grad_norm": 0.1413382887840271,
200
+ "kl": 0.0,
201
+ "learning_rate": 9.893333333333334e-06,
202
+ "loss": 0.0,
203
+ "num_tokens": 2583858.0,
204
+ "reward": 0.23617644608020782,
205
+ "reward_std": 0.9365644454956055,
206
+ "rewards/multidomain_reward_func/mean": 0.23617644608020782,
207
+ "rewards/multidomain_reward_func/std": 1.3539695739746094,
208
+ "step": 9
209
+ },
210
+ {
211
+ "completion_length": 467.14500427246094,
212
+ "completions/clipped_ratio": 0.009999999776482582,
213
+ "completions/max_length": 2048.0,
214
+ "completions/max_terminated_length": 1490.0,
215
+ "completions/mean_length": 468.1349792480469,
216
+ "completions/mean_terminated_length": 452.1767578125,
217
+ "completions/min_length": 97.0,
218
+ "completions/min_terminated_length": 97.0,
219
+ "epoch": 0.02666666666666667,
220
+ "frac_reward_zero_std": 0.0,
221
+ "grad_norm": 0.17736880481243134,
222
+ "kl": 0.0,
223
+ "learning_rate": 9.88e-06,
224
+ "loss": -0.0,
225
+ "num_tokens": 2908252.0,
226
+ "reward": 0.10222619771957397,
227
+ "reward_std": 0.9887304306030273,
228
+ "rewards/multidomain_reward_func/mean": 0.10222619771957397,
229
+ "rewards/multidomain_reward_func/std": 1.4667627811431885,
230
+ "step": 10
231
+ },
232
+ {
233
+ "completion_length": 421.3575073242188,
234
+ "completions/clipped_ratio": 0.007499999832361937,
235
+ "completions/max_length": 2048.0,
236
+ "completions/max_terminated_length": 1658.0,
237
+ "completions/mean_length": 422.3499755859375,
238
+ "completions/mean_terminated_length": 410.0654602050781,
239
+ "completions/min_length": 97.0,
240
+ "completions/min_terminated_length": 97.0,
241
+ "epoch": 0.029333333333333333,
242
+ "frac_reward_zero_std": 0.0,
243
+ "grad_norm": 0.1340264528989792,
244
+ "kl": 0.0,
245
+ "learning_rate": 9.866666666666668e-06,
246
+ "loss": 0.0,
247
+ "num_tokens": 3198092.0,
248
+ "reward": 0.30930107831954956,
249
+ "reward_std": 0.7165582180023193,
250
+ "rewards/multidomain_reward_func/mean": 0.3093010187149048,
251
+ "rewards/multidomain_reward_func/std": 1.2975008487701416,
252
+ "step": 11
253
+ },
254
+ {
255
+ "completion_length": 418.21250610351564,
256
+ "completions/clipped_ratio": 0.0,
257
+ "completions/max_length": 1392.0,
258
+ "completions/max_terminated_length": 1392.0,
259
+ "completions/mean_length": 419.2124938964844,
260
+ "completions/mean_terminated_length": 419.2124938964844,
261
+ "completions/min_length": 115.0,
262
+ "completions/min_terminated_length": 115.0,
263
+ "epoch": 0.032,
264
+ "frac_reward_zero_std": 0.0,
265
+ "grad_norm": 0.16614454984664917,
266
+ "kl": 0.0,
267
+ "learning_rate": 9.853333333333334e-06,
268
+ "loss": 0.0,
269
+ "num_tokens": 3487357.0,
270
+ "reward": 0.550111711025238,
271
+ "reward_std": 0.7317887544631958,
272
+ "rewards/multidomain_reward_func/mean": 0.550111711025238,
273
+ "rewards/multidomain_reward_func/std": 1.0946627855300903,
274
+ "step": 12
275
+ },
276
+ {
277
+ "completion_length": 419.3975082397461,
278
+ "completions/clipped_ratio": 0.0,
279
+ "completions/max_length": 1651.0,
280
+ "completions/max_terminated_length": 1651.0,
281
+ "completions/mean_length": 420.3974914550781,
282
+ "completions/mean_terminated_length": 420.3974914550781,
283
+ "completions/min_length": 73.0,
284
+ "completions/min_terminated_length": 73.0,
285
+ "epoch": 0.034666666666666665,
286
+ "frac_reward_zero_std": 0.0,
287
+ "grad_norm": 0.1116686686873436,
288
+ "kl": 0.0,
289
+ "learning_rate": 9.84e-06,
290
+ "loss": 0.0,
291
+ "num_tokens": 3783606.0,
292
+ "reward": 0.5550642609596252,
293
+ "reward_std": 0.6016712188720703,
294
+ "rewards/multidomain_reward_func/mean": 0.5550642609596252,
295
+ "rewards/multidomain_reward_func/std": 1.0916095972061157,
296
+ "step": 13
297
+ },
298
+ {
299
+ "completion_length": 426.627507019043,
300
+ "completions/clipped_ratio": 0.007499999832361937,
301
+ "completions/max_length": 2048.0,
302
+ "completions/max_terminated_length": 1428.0,
303
+ "completions/mean_length": 427.6199951171875,
304
+ "completions/mean_terminated_length": 415.37530517578125,
305
+ "completions/min_length": 104.0,
306
+ "completions/min_terminated_length": 104.0,
307
+ "epoch": 0.037333333333333336,
308
+ "frac_reward_zero_std": 0.0,
309
+ "grad_norm": 0.13436946272850037,
310
+ "kl": 0.0,
311
+ "learning_rate": 9.826666666666667e-06,
312
+ "loss": 0.0,
313
+ "num_tokens": 4083764.0,
314
+ "reward": 0.466512531042099,
315
+ "reward_std": 0.7918379306793213,
316
+ "rewards/multidomain_reward_func/mean": 0.466512531042099,
317
+ "rewards/multidomain_reward_func/std": 1.2242188453674316,
318
+ "step": 14
319
+ },
320
+ {
321
+ "completion_length": 378.4425079345703,
322
+ "completions/clipped_ratio": 0.0024999999441206455,
323
+ "completions/max_length": 2048.0,
324
+ "completions/max_terminated_length": 938.0,
325
+ "completions/mean_length": 379.44000244140625,
326
+ "completions/mean_terminated_length": 375.2581481933594,
327
+ "completions/min_length": 73.0,
328
+ "completions/min_terminated_length": 73.0,
329
+ "epoch": 0.04,
330
+ "frac_reward_zero_std": 0.0,
331
+ "grad_norm": 0.12867581844329834,
332
+ "kl": 0.0,
333
+ "learning_rate": 9.813333333333333e-06,
334
+ "loss": 0.0,
335
+ "num_tokens": 4345610.0,
336
+ "reward": 0.5736742615699768,
337
+ "reward_std": 0.6882736086845398,
338
+ "rewards/multidomain_reward_func/mean": 0.5736742615699768,
339
+ "rewards/multidomain_reward_func/std": 1.0166397094726562,
340
+ "step": 15
341
+ },
342
+ {
343
+ "completion_length": 409.86500854492186,
344
+ "completions/clipped_ratio": 0.0,
345
+ "completions/max_length": 1269.0,
346
+ "completions/max_terminated_length": 1269.0,
347
+ "completions/mean_length": 410.864990234375,
348
+ "completions/mean_terminated_length": 410.864990234375,
349
+ "completions/min_length": 98.0,
350
+ "completions/min_terminated_length": 98.0,
351
+ "epoch": 0.042666666666666665,
352
+ "frac_reward_zero_std": 0.0,
353
+ "grad_norm": 0.1019008606672287,
354
+ "kl": 0.0,
355
+ "learning_rate": 9.800000000000001e-06,
356
+ "loss": -0.0,
357
+ "num_tokens": 4628316.0,
358
+ "reward": 0.7713325619697571,
359
+ "reward_std": 0.6617187857627869,
360
+ "rewards/multidomain_reward_func/mean": 0.7713325023651123,
361
+ "rewards/multidomain_reward_func/std": 1.1467927694320679,
362
+ "step": 16
363
+ },
364
+ {
365
+ "completion_length": 425.4700073242187,
366
+ "completions/clipped_ratio": 0.0024999999441206455,
367
+ "completions/max_length": 2048.0,
368
+ "completions/max_terminated_length": 1551.0,
369
+ "completions/mean_length": 426.4674987792969,
370
+ "completions/mean_terminated_length": 422.40350341796875,
371
+ "completions/min_length": 93.0,
372
+ "completions/min_terminated_length": 93.0,
373
+ "epoch": 0.04533333333333334,
374
+ "frac_reward_zero_std": 0.0,
375
+ "grad_norm": 0.106789730489254,
376
+ "kl": 0.0,
377
+ "learning_rate": 9.786666666666667e-06,
378
+ "loss": 0.0,
379
+ "num_tokens": 4928883.0,
380
+ "reward": 0.5637399554252625,
381
+ "reward_std": 0.7021534442901611,
382
+ "rewards/multidomain_reward_func/mean": 0.5637399554252625,
383
+ "rewards/multidomain_reward_func/std": 1.0422096252441406,
384
+ "step": 17
385
+ },
386
+ {
387
+ "completion_length": 431.8600051879883,
388
+ "completions/clipped_ratio": 0.0024999999441206455,
389
+ "completions/max_length": 2048.0,
390
+ "completions/max_terminated_length": 1521.0,
391
+ "completions/mean_length": 432.85748291015625,
392
+ "completions/mean_terminated_length": 428.80950927734375,
393
+ "completions/min_length": 105.0,
394
+ "completions/min_terminated_length": 105.0,
395
+ "epoch": 0.048,
396
+ "frac_reward_zero_std": 0.0,
397
+ "grad_norm": 0.12454589456319809,
398
+ "kl": 0.0,
399
+ "learning_rate": 9.773333333333335e-06,
400
+ "loss": 0.0,
401
+ "num_tokens": 5226216.0,
402
+ "reward": 0.5953689217567444,
403
+ "reward_std": 0.5794697403907776,
404
+ "rewards/multidomain_reward_func/mean": 0.5953689217567444,
405
+ "rewards/multidomain_reward_func/std": 1.0605465173721313,
406
+ "step": 18
407
+ },
408
+ {
409
+ "completion_length": 412.7725082397461,
410
+ "completions/clipped_ratio": 0.004999999888241291,
411
+ "completions/max_length": 2048.0,
412
+ "completions/max_terminated_length": 1357.0,
413
+ "completions/mean_length": 413.7674865722656,
414
+ "completions/mean_terminated_length": 405.5552673339844,
415
+ "completions/min_length": 92.0,
416
+ "completions/min_terminated_length": 92.0,
417
+ "epoch": 0.050666666666666665,
418
+ "frac_reward_zero_std": 0.0,
419
+ "grad_norm": 0.13599112629890442,
420
+ "kl": 0.0,
421
+ "learning_rate": 9.760000000000001e-06,
422
+ "loss": -0.0,
423
+ "num_tokens": 5512113.0,
424
+ "reward": 0.39229491353034973,
425
+ "reward_std": 0.8084317445755005,
426
+ "rewards/multidomain_reward_func/mean": 0.39229488372802734,
427
+ "rewards/multidomain_reward_func/std": 1.3224574327468872,
428
+ "step": 19
429
+ },
430
+ {
431
+ "completion_length": 387.01250610351565,
432
+ "completions/clipped_ratio": 0.0,
433
+ "completions/max_length": 1417.0,
434
+ "completions/max_terminated_length": 1417.0,
435
+ "completions/mean_length": 388.0124816894531,
436
+ "completions/mean_terminated_length": 388.0124816894531,
437
+ "completions/min_length": 109.0,
438
+ "completions/min_terminated_length": 109.0,
439
+ "epoch": 0.05333333333333334,
440
+ "frac_reward_zero_std": 0.0,
441
+ "grad_norm": 0.12738779187202454,
442
+ "kl": 0.0,
443
+ "learning_rate": 9.746666666666668e-06,
444
+ "loss": 0.0,
445
+ "num_tokens": 5778268.0,
446
+ "reward": 0.5316053628921509,
447
+ "reward_std": 0.7337101697921753,
448
+ "rewards/multidomain_reward_func/mean": 0.5316053628921509,
449
+ "rewards/multidomain_reward_func/std": 1.270464539527893,
450
+ "step": 20
451
+ },
452
+ {
453
+ "completion_length": 413.6300018310547,
454
+ "completions/clipped_ratio": 0.0,
455
+ "completions/max_length": 1348.0,
456
+ "completions/max_terminated_length": 1348.0,
457
+ "completions/mean_length": 414.6300048828125,
458
+ "completions/mean_terminated_length": 414.6300048828125,
459
+ "completions/min_length": 108.0,
460
+ "completions/min_terminated_length": 108.0,
461
+ "epoch": 0.056,
462
+ "frac_reward_zero_std": 0.0,
463
+ "grad_norm": 0.11424646526575089,
464
+ "kl": 0.0,
465
+ "learning_rate": 9.733333333333334e-06,
466
+ "loss": 0.0,
467
+ "num_tokens": 6069330.0,
468
+ "reward": 0.5331506133079529,
469
+ "reward_std": 0.7416218519210815,
470
+ "rewards/multidomain_reward_func/mean": 0.5331505537033081,
471
+ "rewards/multidomain_reward_func/std": 1.2239094972610474,
472
+ "step": 21
473
+ },
474
+ {
475
+ "completion_length": 369.3925064086914,
476
+ "completions/clipped_ratio": 0.0,
477
+ "completions/max_length": 1257.0,
478
+ "completions/max_terminated_length": 1257.0,
479
+ "completions/mean_length": 370.3924865722656,
480
+ "completions/mean_terminated_length": 370.3924865722656,
481
+ "completions/min_length": 93.0,
482
+ "completions/min_terminated_length": 93.0,
483
+ "epoch": 0.058666666666666666,
484
+ "frac_reward_zero_std": 0.0,
485
+ "grad_norm": 0.12145662307739258,
486
+ "kl": 0.0,
487
+ "learning_rate": 9.72e-06,
488
+ "loss": 0.0,
489
+ "num_tokens": 6327207.0,
490
+ "reward": 0.6458982229232788,
491
+ "reward_std": 0.6354836821556091,
492
+ "rewards/multidomain_reward_func/mean": 0.6458981037139893,
493
+ "rewards/multidomain_reward_func/std": 0.9722529053688049,
494
+ "step": 22
495
+ },
496
+ {
497
+ "completion_length": 427.4825103759766,
498
+ "completions/clipped_ratio": 0.0024999999441206455,
499
+ "completions/max_length": 2048.0,
500
+ "completions/max_terminated_length": 1758.0,
501
+ "completions/mean_length": 428.47998046875,
502
+ "completions/mean_terminated_length": 424.4210510253906,
503
+ "completions/min_length": 114.0,
504
+ "completions/min_terminated_length": 114.0,
505
+ "epoch": 0.06133333333333333,
506
+ "frac_reward_zero_std": 0.0,
507
+ "grad_norm": 0.11638902127742767,
508
+ "kl": 0.0,
509
+ "learning_rate": 9.706666666666668e-06,
510
+ "loss": 0.0,
511
+ "num_tokens": 6623079.0,
512
+ "reward": 0.678668737411499,
513
+ "reward_std": 0.6608802676200867,
514
+ "rewards/multidomain_reward_func/mean": 0.678668737411499,
515
+ "rewards/multidomain_reward_func/std": 1.0120640993118286,
516
+ "step": 23
517
+ },
518
+ {
519
+ "completion_length": 450.40000915527344,
520
+ "completions/clipped_ratio": 0.017500000074505806,
521
+ "completions/max_length": 2048.0,
522
+ "completions/max_terminated_length": 1979.0,
523
+ "completions/mean_length": 451.3824768066406,
524
+ "completions/mean_terminated_length": 422.94403076171875,
525
+ "completions/min_length": 53.0,
526
+ "completions/min_terminated_length": 53.0,
527
+ "epoch": 0.064,
528
+ "frac_reward_zero_std": 0.09999999403953552,
529
+ "grad_norm": 0.11742237955331802,
530
+ "kl": 0.0,
531
+ "learning_rate": 9.693333333333334e-06,
532
+ "loss": 0.0,
533
+ "num_tokens": 6931172.0,
534
+ "reward": 0.08583255857229233,
535
+ "reward_std": 0.5651226043701172,
536
+ "rewards/multidomain_reward_func/mean": 0.08583252131938934,
537
+ "rewards/multidomain_reward_func/std": 1.92720365524292,
538
+ "step": 24
539
+ },
540
+ {
541
+ "completion_length": 414.2875061035156,
542
+ "completions/clipped_ratio": 0.0,
543
+ "completions/max_length": 1183.0,
544
+ "completions/max_terminated_length": 1183.0,
545
+ "completions/mean_length": 415.2874755859375,
546
+ "completions/mean_terminated_length": 415.2874755859375,
547
+ "completions/min_length": 96.0,
548
+ "completions/min_terminated_length": 96.0,
549
+ "epoch": 0.06666666666666667,
550
+ "frac_reward_zero_std": 0.0,
551
+ "grad_norm": 0.11120733618736267,
552
+ "kl": 0.0,
553
+ "learning_rate": 9.68e-06,
554
+ "loss": -0.0,
555
+ "num_tokens": 7209577.0,
556
+ "reward": 0.6717811226844788,
557
+ "reward_std": 0.5638630390167236,
558
+ "rewards/multidomain_reward_func/mean": 0.6717811822891235,
559
+ "rewards/multidomain_reward_func/std": 1.0027689933776855,
560
+ "step": 25
561
+ },
562
+ {
563
+ "completion_length": 445.37500762939453,
564
+ "completions/clipped_ratio": 0.0,
565
+ "completions/max_length": 1532.0,
566
+ "completions/max_terminated_length": 1532.0,
567
+ "completions/mean_length": 446.375,
568
+ "completions/mean_terminated_length": 446.375,
569
+ "completions/min_length": 147.0,
570
+ "completions/min_terminated_length": 147.0,
571
+ "epoch": 0.06933333333333333,
572
+ "frac_reward_zero_std": 0.0,
573
+ "grad_norm": 0.10715946555137634,
574
+ "kl": 0.0,
575
+ "learning_rate": 9.666666666666667e-06,
576
+ "loss": 0.0,
577
+ "num_tokens": 7514747.0,
578
+ "reward": 0.6113654971122742,
579
+ "reward_std": 0.6842783093452454,
580
+ "rewards/multidomain_reward_func/mean": 0.6113654971122742,
581
+ "rewards/multidomain_reward_func/std": 1.0898476839065552,
582
+ "step": 26
583
+ },
584
+ {
585
+ "completion_length": 404.92500762939454,
586
+ "completions/clipped_ratio": 0.0,
587
+ "completions/max_length": 1058.0,
588
+ "completions/max_terminated_length": 1058.0,
589
+ "completions/mean_length": 405.92498779296875,
590
+ "completions/mean_terminated_length": 405.92498779296875,
591
+ "completions/min_length": 115.0,
592
+ "completions/min_terminated_length": 115.0,
593
+ "epoch": 0.072,
594
+ "frac_reward_zero_std": 0.0,
595
+ "grad_norm": 0.10682351142168045,
596
+ "kl": 0.0,
597
+ "learning_rate": 9.653333333333335e-06,
598
+ "loss": 0.0,
599
+ "num_tokens": 7779577.0,
600
+ "reward": 0.521935760974884,
601
+ "reward_std": 0.5503350496292114,
602
+ "rewards/multidomain_reward_func/mean": 0.5219358205795288,
603
+ "rewards/multidomain_reward_func/std": 1.1004360914230347,
604
+ "step": 27
605
+ },
606
+ {
607
+ "completion_length": 462.3850036621094,
608
+ "completions/clipped_ratio": 0.0,
609
+ "completions/max_length": 1845.0,
610
+ "completions/max_terminated_length": 1845.0,
611
+ "completions/mean_length": 463.3849792480469,
612
+ "completions/mean_terminated_length": 463.3849792480469,
613
+ "completions/min_length": 120.0,
614
+ "completions/min_terminated_length": 120.0,
615
+ "epoch": 0.07466666666666667,
616
+ "frac_reward_zero_std": 0.0,
617
+ "grad_norm": 0.1338387280702591,
618
+ "kl": 0.0,
619
+ "learning_rate": 9.640000000000001e-06,
620
+ "loss": 0.0,
621
+ "num_tokens": 8087041.0,
622
+ "reward": 0.6461222767829895,
623
+ "reward_std": 0.7280974984169006,
624
+ "rewards/multidomain_reward_func/mean": 0.6461222767829895,
625
+ "rewards/multidomain_reward_func/std": 1.1095894575119019,
626
+ "step": 28
627
+ },
628
+ {
629
+ "completion_length": 466.19500732421875,
630
+ "completions/clipped_ratio": 0.0024999999441206455,
631
+ "completions/max_length": 2048.0,
632
+ "completions/max_terminated_length": 1823.0,
633
+ "completions/mean_length": 467.1924743652344,
634
+ "completions/mean_terminated_length": 463.2305603027344,
635
+ "completions/min_length": 125.0,
636
+ "completions/min_terminated_length": 125.0,
637
+ "epoch": 0.07733333333333334,
638
+ "frac_reward_zero_std": 0.0,
639
+ "grad_norm": 0.11648301780223846,
640
+ "kl": 0.0,
641
+ "learning_rate": 9.626666666666667e-06,
642
+ "loss": 0.0,
643
+ "num_tokens": 8389638.0,
644
+ "reward": 0.763523519039154,
645
+ "reward_std": 0.6302433013916016,
646
+ "rewards/multidomain_reward_func/mean": 0.7635235786437988,
647
+ "rewards/multidomain_reward_func/std": 0.9619850516319275,
648
+ "step": 29
649
+ },
650
+ {
651
+ "completion_length": 463.572509765625,
652
+ "completions/clipped_ratio": 0.0,
653
+ "completions/max_length": 1148.0,
654
+ "completions/max_terminated_length": 1148.0,
655
+ "completions/mean_length": 464.5724792480469,
656
+ "completions/mean_terminated_length": 464.5724792480469,
657
+ "completions/min_length": 118.0,
658
+ "completions/min_terminated_length": 118.0,
659
+ "epoch": 0.08,
660
+ "frac_reward_zero_std": 0.0,
661
+ "grad_norm": 0.10752178728580475,
662
+ "kl": 0.0,
663
+ "learning_rate": 9.613333333333335e-06,
664
+ "loss": -0.0,
665
+ "num_tokens": 8699697.0,
666
+ "reward": 0.6276810169219971,
667
+ "reward_std": 0.6993708610534668,
668
+ "rewards/multidomain_reward_func/mean": 0.6276810169219971,
669
+ "rewards/multidomain_reward_func/std": 1.024697184562683,
670
+ "step": 30
671
+ },
672
+ {
673
+ "completion_length": 433.10250701904295,
674
+ "completions/clipped_ratio": 0.0,
675
+ "completions/max_length": 1404.0,
676
+ "completions/max_terminated_length": 1404.0,
677
+ "completions/mean_length": 434.10247802734375,
678
+ "completions/mean_terminated_length": 434.10247802734375,
679
+ "completions/min_length": 152.0,
680
+ "completions/min_terminated_length": 152.0,
681
+ "epoch": 0.08266666666666667,
682
+ "frac_reward_zero_std": 0.0,
683
+ "grad_norm": 0.1253821700811386,
684
+ "kl": 0.0,
685
+ "learning_rate": 9.600000000000001e-06,
686
+ "loss": 0.0,
687
+ "num_tokens": 8978008.0,
688
+ "reward": 0.7020823359489441,
689
+ "reward_std": 0.6984261274337769,
690
+ "rewards/multidomain_reward_func/mean": 0.7020823955535889,
691
+ "rewards/multidomain_reward_func/std": 1.1664484739303589,
692
+ "step": 31
693
+ },
694
+ {
695
+ "completion_length": 493.20751037597654,
696
+ "completions/clipped_ratio": 0.0,
697
+ "completions/max_length": 1631.0,
698
+ "completions/max_terminated_length": 1631.0,
699
+ "completions/mean_length": 494.2074890136719,
700
+ "completions/mean_terminated_length": 494.2074890136719,
701
+ "completions/min_length": 132.0,
702
+ "completions/min_terminated_length": 132.0,
703
+ "epoch": 0.08533333333333333,
704
+ "frac_reward_zero_std": 0.0,
705
+ "grad_norm": 0.28045377135276794,
706
+ "kl": 0.0,
707
+ "learning_rate": 9.586666666666667e-06,
708
+ "loss": 0.0,
709
+ "num_tokens": 9303251.0,
710
+ "reward": 0.36521032452583313,
711
+ "reward_std": 0.7486643195152283,
712
+ "rewards/multidomain_reward_func/mean": 0.3652103543281555,
713
+ "rewards/multidomain_reward_func/std": 1.215452790260315,
714
+ "step": 32
715
+ },
716
+ {
717
+ "completion_length": 459.21251068115237,
718
+ "completions/clipped_ratio": 0.0,
719
+ "completions/max_length": 1346.0,
720
+ "completions/max_terminated_length": 1346.0,
721
+ "completions/mean_length": 460.2124938964844,
722
+ "completions/mean_terminated_length": 460.2124938964844,
723
+ "completions/min_length": 114.0,
724
+ "completions/min_terminated_length": 114.0,
725
+ "epoch": 0.088,
726
+ "frac_reward_zero_std": 0.0,
727
+ "grad_norm": 0.09916597604751587,
728
+ "kl": 0.0,
729
+ "learning_rate": 9.573333333333334e-06,
730
+ "loss": 0.0,
731
+ "num_tokens": 9600216.0,
732
+ "reward": 0.6524021625518799,
733
+ "reward_std": 0.6229043006896973,
734
+ "rewards/multidomain_reward_func/mean": 0.6524021625518799,
735
+ "rewards/multidomain_reward_func/std": 1.1347594261169434,
736
+ "step": 33
737
+ },
738
+ {
739
+ "completion_length": 478.38000640869143,
740
+ "completions/clipped_ratio": 0.0024999999441206455,
741
+ "completions/max_length": 2048.0,
742
+ "completions/max_terminated_length": 1426.0,
743
+ "completions/mean_length": 479.37750244140625,
744
+ "completions/mean_terminated_length": 475.44610595703125,
745
+ "completions/min_length": 116.0,
746
+ "completions/min_terminated_length": 116.0,
747
+ "epoch": 0.09066666666666667,
748
+ "frac_reward_zero_std": 0.0,
749
+ "grad_norm": 0.09763700515031815,
750
+ "kl": 0.0,
751
+ "learning_rate": 9.56e-06,
752
+ "loss": 0.0,
753
+ "num_tokens": 9909247.0,
754
+ "reward": 0.5732457041740417,
755
+ "reward_std": 0.6212210655212402,
756
+ "rewards/multidomain_reward_func/mean": 0.5732457041740417,
757
+ "rewards/multidomain_reward_func/std": 1.1442022323608398,
758
+ "step": 34
759
+ },
760
+ {
761
+ "completion_length": 530.3800079345704,
762
+ "completions/clipped_ratio": 0.004999999888241291,
763
+ "completions/max_length": 2048.0,
764
+ "completions/max_terminated_length": 2014.0,
765
+ "completions/mean_length": 531.375,
766
+ "completions/mean_terminated_length": 523.7537841796875,
767
+ "completions/min_length": 115.0,
768
+ "completions/min_terminated_length": 115.0,
769
+ "epoch": 0.09333333333333334,
770
+ "frac_reward_zero_std": 0.0,
771
+ "grad_norm": 0.12349400669336319,
772
+ "kl": 0.0,
773
+ "learning_rate": 9.546666666666668e-06,
774
+ "loss": 0.0,
775
+ "num_tokens": 10265587.0,
776
+ "reward": 0.45227575302124023,
777
+ "reward_std": 0.7804867625236511,
778
+ "rewards/multidomain_reward_func/mean": 0.4522757828235626,
779
+ "rewards/multidomain_reward_func/std": 1.2044216394424438,
780
+ "step": 35
781
+ },
782
+ {
783
+ "completion_length": 488.8800079345703,
784
+ "completions/clipped_ratio": 0.0,
785
+ "completions/max_length": 2034.0,
786
+ "completions/max_terminated_length": 2034.0,
787
+ "completions/mean_length": 489.8799743652344,
788
+ "completions/mean_terminated_length": 489.8799743652344,
789
+ "completions/min_length": 132.0,
790
+ "completions/min_terminated_length": 132.0,
791
+ "epoch": 0.096,
792
+ "frac_reward_zero_std": 0.0,
793
+ "grad_norm": 0.13603246212005615,
794
+ "kl": 0.0,
795
+ "learning_rate": 9.533333333333334e-06,
796
+ "loss": 0.0,
797
+ "num_tokens": 10596189.0,
798
+ "reward": 0.6199676990509033,
799
+ "reward_std": 0.7394188046455383,
800
+ "rewards/multidomain_reward_func/mean": 0.6199676990509033,
801
+ "rewards/multidomain_reward_func/std": 1.151370644569397,
802
+ "step": 36
803
+ },
804
+ {
805
+ "completion_length": 469.7675079345703,
806
+ "completions/clipped_ratio": 0.0,
807
+ "completions/max_length": 1208.0,
808
+ "completions/max_terminated_length": 1208.0,
809
+ "completions/mean_length": 470.7674865722656,
810
+ "completions/mean_terminated_length": 470.7674865722656,
811
+ "completions/min_length": 130.0,
812
+ "completions/min_terminated_length": 130.0,
813
+ "epoch": 0.09866666666666667,
814
+ "frac_reward_zero_std": 0.0,
815
+ "grad_norm": 0.09747374057769775,
816
+ "kl": 0.0,
817
+ "learning_rate": 9.52e-06,
818
+ "loss": 0.0,
819
+ "num_tokens": 10896396.0,
820
+ "reward": 0.935648500919342,
821
+ "reward_std": 0.5971238017082214,
822
+ "rewards/multidomain_reward_func/mean": 0.935648500919342,
823
+ "rewards/multidomain_reward_func/std": 0.9697657823562622,
824
+ "step": 37
825
+ },
826
+ {
827
+ "completion_length": 493.3000015258789,
828
+ "completions/clipped_ratio": 0.0024999999441206455,
829
+ "completions/max_length": 2048.0,
830
+ "completions/max_terminated_length": 1513.0,
831
+ "completions/mean_length": 494.2974853515625,
832
+ "completions/mean_terminated_length": 490.40350341796875,
833
+ "completions/min_length": 125.0,
834
+ "completions/min_terminated_length": 125.0,
835
+ "epoch": 0.10133333333333333,
836
+ "frac_reward_zero_std": 0.0,
837
+ "grad_norm": 0.0883842334151268,
838
+ "kl": 0.0,
839
+ "learning_rate": 9.506666666666667e-06,
840
+ "loss": 0.0,
841
+ "num_tokens": 11215205.0,
842
+ "reward": 0.7769816517829895,
843
+ "reward_std": 0.5231021642684937,
844
+ "rewards/multidomain_reward_func/mean": 0.7769816517829895,
845
+ "rewards/multidomain_reward_func/std": 0.9222862720489502,
846
+ "step": 38
847
+ },
848
+ {
849
+ "completion_length": 486.83250885009767,
850
+ "completions/clipped_ratio": 0.0,
851
+ "completions/max_length": 1406.0,
852
+ "completions/max_terminated_length": 1406.0,
853
+ "completions/mean_length": 487.8324890136719,
854
+ "completions/mean_terminated_length": 487.8324890136719,
855
+ "completions/min_length": 133.0,
856
+ "completions/min_terminated_length": 133.0,
857
+ "epoch": 0.104,
858
+ "frac_reward_zero_std": 0.0,
859
+ "grad_norm": 0.10908982902765274,
860
+ "kl": 0.0,
861
+ "learning_rate": 9.493333333333334e-06,
862
+ "loss": 0.0,
863
+ "num_tokens": 11540558.0,
864
+ "reward": 0.6732150912284851,
865
+ "reward_std": 0.6396495699882507,
866
+ "rewards/multidomain_reward_func/mean": 0.6732151508331299,
867
+ "rewards/multidomain_reward_func/std": 1.1686171293258667,
868
+ "step": 39
869
+ },
870
+ {
871
+ "completion_length": 487.9100112915039,
872
+ "completions/clipped_ratio": 0.0,
873
+ "completions/max_length": 1473.0,
874
+ "completions/max_terminated_length": 1473.0,
875
+ "completions/mean_length": 488.9100036621094,
876
+ "completions/mean_terminated_length": 488.9100036621094,
877
+ "completions/min_length": 129.0,
878
+ "completions/min_terminated_length": 129.0,
879
+ "epoch": 0.10666666666666667,
880
+ "frac_reward_zero_std": 0.0,
881
+ "grad_norm": 0.10951139777898788,
882
+ "kl": 0.0,
883
+ "learning_rate": 9.48e-06,
884
+ "loss": 0.0,
885
+ "num_tokens": 11863322.0,
886
+ "reward": 0.6731404662132263,
887
+ "reward_std": 0.6585783362388611,
888
+ "rewards/multidomain_reward_func/mean": 0.6731404662132263,
889
+ "rewards/multidomain_reward_func/std": 1.0647237300872803,
890
+ "step": 40
891
+ },
892
+ {
893
+ "completion_length": 502.2775161743164,
894
+ "completions/clipped_ratio": 0.0,
895
+ "completions/max_length": 1538.0,
896
+ "completions/max_terminated_length": 1538.0,
897
+ "completions/mean_length": 503.2774963378906,
898
+ "completions/mean_terminated_length": 503.2774963378906,
899
+ "completions/min_length": 145.0,
900
+ "completions/min_terminated_length": 145.0,
901
+ "epoch": 0.10933333333333334,
902
+ "frac_reward_zero_std": 0.0,
903
+ "grad_norm": 1.849650263786316,
904
+ "kl": 0.0,
905
+ "learning_rate": 9.466666666666667e-06,
906
+ "loss": 0.0,
907
+ "num_tokens": 12192863.0,
908
+ "reward": 0.7937699556350708,
909
+ "reward_std": 0.6187800168991089,
910
+ "rewards/multidomain_reward_func/mean": 0.7937700152397156,
911
+ "rewards/multidomain_reward_func/std": 1.0126259326934814,
912
+ "step": 41
913
+ },
914
+ {
915
+ "completion_length": 478.65250396728516,
916
+ "completions/clipped_ratio": 0.0,
917
+ "completions/max_length": 1362.0,
918
+ "completions/max_terminated_length": 1362.0,
919
+ "completions/mean_length": 479.6524963378906,
920
+ "completions/mean_terminated_length": 479.6524963378906,
921
+ "completions/min_length": 130.0,
922
+ "completions/min_terminated_length": 130.0,
923
+ "epoch": 0.112,
924
+ "frac_reward_zero_std": 0.0,
925
+ "grad_norm": 0.12482614070177078,
926
+ "kl": 0.0,
927
+ "learning_rate": 9.453333333333335e-06,
928
+ "loss": 0.0,
929
+ "num_tokens": 12500484.0,
930
+ "reward": 0.6001678109169006,
931
+ "reward_std": 0.6798524260520935,
932
+ "rewards/multidomain_reward_func/mean": 0.6001678705215454,
933
+ "rewards/multidomain_reward_func/std": 1.120275616645813,
934
+ "step": 42
935
+ },
936
+ {
937
+ "completion_length": 517.6400100708008,
938
+ "completions/clipped_ratio": 0.0024999999441206455,
939
+ "completions/max_length": 2048.0,
940
+ "completions/max_terminated_length": 1579.0,
941
+ "completions/mean_length": 518.6375122070312,
942
+ "completions/mean_terminated_length": 514.8045043945312,
943
+ "completions/min_length": 144.0,
944
+ "completions/min_terminated_length": 144.0,
945
+ "epoch": 0.11466666666666667,
946
+ "frac_reward_zero_std": 0.0,
947
+ "grad_norm": 0.09683530032634735,
948
+ "kl": 0.0,
949
+ "learning_rate": 9.440000000000001e-06,
950
+ "loss": 0.0,
951
+ "num_tokens": 12841819.0,
952
+ "reward": 0.753699541091919,
953
+ "reward_std": 0.6156747341156006,
954
+ "rewards/multidomain_reward_func/mean": 0.753699541091919,
955
+ "rewards/multidomain_reward_func/std": 0.9609552025794983,
956
+ "step": 43
957
+ },
958
+ {
959
+ "completion_length": 505.51000823974607,
960
+ "completions/clipped_ratio": 0.0,
961
+ "completions/max_length": 1757.0,
962
+ "completions/max_terminated_length": 1757.0,
963
+ "completions/mean_length": 506.5099792480469,
964
+ "completions/mean_terminated_length": 506.5099792480469,
965
+ "completions/min_length": 129.0,
966
+ "completions/min_terminated_length": 129.0,
967
+ "epoch": 0.11733333333333333,
968
+ "frac_reward_zero_std": 0.0,
969
+ "grad_norm": 0.07461382448673248,
970
+ "kl": 0.0,
971
+ "learning_rate": 9.426666666666667e-06,
972
+ "loss": 0.0,
973
+ "num_tokens": 13168013.0,
974
+ "reward": 0.8316676616668701,
975
+ "reward_std": 0.47557753324508667,
976
+ "rewards/multidomain_reward_func/mean": 0.8316677212715149,
977
+ "rewards/multidomain_reward_func/std": 0.9142977595329285,
978
+ "step": 44
979
+ },
980
+ {
981
+ "completion_length": 1016.6600128173828,
982
+ "completions/clipped_ratio": 0.3125,
983
+ "completions/max_length": 2048.0,
984
+ "completions/max_terminated_length": 1516.0,
985
+ "completions/mean_length": 1017.3474731445312,
986
+ "completions/mean_terminated_length": 548.8690795898438,
987
+ "completions/min_length": 6.0,
988
+ "completions/min_terminated_length": 6.0,
989
+ "epoch": 0.12,
990
+ "frac_reward_zero_std": 0.3499999940395355,
991
+ "grad_norm": 0.1530371755361557,
992
+ "kl": 0.0,
993
+ "learning_rate": 9.413333333333334e-06,
994
+ "loss": 0.0,
995
+ "num_tokens": 13716622.0,
996
+ "reward": -1.7727051973342896,
997
+ "reward_std": 0.515305757522583,
998
+ "rewards/multidomain_reward_func/mean": -1.772705078125,
999
+ "rewards/multidomain_reward_func/std": 2.787855386734009,
1000
+ "step": 45
1001
+ },
1002
+ {
1003
+ "completion_length": 450.5750076293945,
1004
+ "completions/clipped_ratio": 0.0,
1005
+ "completions/max_length": 1174.0,
1006
+ "completions/max_terminated_length": 1174.0,
1007
+ "completions/mean_length": 451.5749816894531,
1008
+ "completions/mean_terminated_length": 451.5749816894531,
1009
+ "completions/min_length": 138.0,
1010
+ "completions/min_terminated_length": 138.0,
1011
+ "epoch": 0.12266666666666666,
1012
+ "frac_reward_zero_std": 0.0,
1013
+ "grad_norm": 0.11144713312387466,
1014
+ "kl": 0.0,
1015
+ "learning_rate": 9.4e-06,
1016
+ "loss": 0.0,
1017
+ "num_tokens": 14013782.0,
1018
+ "reward": 0.7144942283630371,
1019
+ "reward_std": 0.6038557887077332,
1020
+ "rewards/multidomain_reward_func/mean": 0.7144941687583923,
1021
+ "rewards/multidomain_reward_func/std": 1.0141149759292603,
1022
+ "step": 46
1023
+ },
1024
+ {
1025
+ "completion_length": 462.4975051879883,
1026
+ "completions/clipped_ratio": 0.0024999999441206455,
1027
+ "completions/max_length": 2048.0,
1028
+ "completions/max_terminated_length": 1033.0,
1029
+ "completions/mean_length": 463.4949951171875,
1030
+ "completions/mean_terminated_length": 459.5238037109375,
1031
+ "completions/min_length": 124.0,
1032
+ "completions/min_terminated_length": 124.0,
1033
+ "epoch": 0.12533333333333332,
1034
+ "frac_reward_zero_std": 0.0,
1035
+ "grad_norm": 0.10090559720993042,
1036
+ "kl": 0.0,
1037
+ "learning_rate": 9.386666666666668e-06,
1038
+ "loss": 0.0,
1039
+ "num_tokens": 14317510.0,
1040
+ "reward": 0.6212114095687866,
1041
+ "reward_std": 0.6358470916748047,
1042
+ "rewards/multidomain_reward_func/mean": 0.6212114095687866,
1043
+ "rewards/multidomain_reward_func/std": 1.0588219165802002,
1044
+ "step": 47
1045
+ },
1046
+ {
1047
+ "completion_length": 467.2750045776367,
1048
+ "completions/clipped_ratio": 0.0,
1049
+ "completions/max_length": 1720.0,
1050
+ "completions/max_terminated_length": 1720.0,
1051
+ "completions/mean_length": 468.2749938964844,
1052
+ "completions/mean_terminated_length": 468.2749938964844,
1053
+ "completions/min_length": 119.0,
1054
+ "completions/min_terminated_length": 119.0,
1055
+ "epoch": 0.128,
1056
+ "frac_reward_zero_std": 0.0,
1057
+ "grad_norm": 0.09495209157466888,
1058
+ "kl": 0.0,
1059
+ "learning_rate": 9.373333333333334e-06,
1060
+ "loss": 0.0,
1061
+ "num_tokens": 14621120.0,
1062
+ "reward": 0.9720970988273621,
1063
+ "reward_std": 0.6470828652381897,
1064
+ "rewards/multidomain_reward_func/mean": 0.9720970392227173,
1065
+ "rewards/multidomain_reward_func/std": 1.0593924522399902,
1066
+ "step": 48
1067
+ },
1068
+ {
1069
+ "completion_length": 490.20000610351565,
1070
+ "completions/clipped_ratio": 0.0,
1071
+ "completions/max_length": 1525.0,
1072
+ "completions/max_terminated_length": 1525.0,
1073
+ "completions/mean_length": 491.1999816894531,
1074
+ "completions/mean_terminated_length": 491.1999816894531,
1075
+ "completions/min_length": 140.0,
1076
+ "completions/min_terminated_length": 140.0,
1077
+ "epoch": 0.13066666666666665,
1078
+ "frac_reward_zero_std": 0.0,
1079
+ "grad_norm": 0.09339933842420578,
1080
+ "kl": 0.0,
1081
+ "learning_rate": 9.360000000000002e-06,
1082
+ "loss": 0.0,
1083
+ "num_tokens": 14945280.0,
1084
+ "reward": 0.7716130018234253,
1085
+ "reward_std": 0.5648047924041748,
1086
+ "rewards/multidomain_reward_func/mean": 0.7716130018234253,
1087
+ "rewards/multidomain_reward_func/std": 1.0510609149932861,
1088
+ "step": 49
1089
+ },
1090
+ {
1091
+ "completion_length": 470.83250885009767,
1092
+ "completions/clipped_ratio": 0.0,
1093
+ "completions/max_length": 1374.0,
1094
+ "completions/max_terminated_length": 1374.0,
1095
+ "completions/mean_length": 471.8324890136719,
1096
+ "completions/mean_terminated_length": 471.8324890136719,
1097
+ "completions/min_length": 123.0,
1098
+ "completions/min_terminated_length": 123.0,
1099
+ "epoch": 0.13333333333333333,
1100
+ "frac_reward_zero_std": 0.0,
1101
+ "grad_norm": 0.10589201003313065,
1102
+ "kl": 0.0,
1103
+ "learning_rate": 9.346666666666666e-06,
1104
+ "loss": 0.0,
1105
+ "num_tokens": 15252173.0,
1106
+ "reward": 0.8854246139526367,
1107
+ "reward_std": 0.5977504849433899,
1108
+ "rewards/multidomain_reward_func/mean": 0.8854245543479919,
1109
+ "rewards/multidomain_reward_func/std": 1.0996488332748413,
1110
+ "step": 50
1111
+ },
1112
+ {
1113
+ "completion_length": 487.5975112915039,
1114
+ "completions/clipped_ratio": 0.004999999888241291,
1115
+ "completions/max_length": 2048.0,
1116
+ "completions/max_terminated_length": 1913.0,
1117
+ "completions/mean_length": 488.5924987792969,
1118
+ "completions/mean_terminated_length": 480.75628662109375,
1119
+ "completions/min_length": 177.0,
1120
+ "completions/min_terminated_length": 177.0,
1121
+ "epoch": 0.136,
1122
+ "frac_reward_zero_std": 0.0,
1123
+ "grad_norm": 0.12346775829792023,
1124
+ "kl": 0.0,
1125
+ "learning_rate": 9.333333333333334e-06,
1126
+ "loss": 0.0,
1127
+ "num_tokens": 15564510.0,
1128
+ "reward": 0.48230254650115967,
1129
+ "reward_std": 0.6803807020187378,
1130
+ "rewards/multidomain_reward_func/mean": 0.4823025166988373,
1131
+ "rewards/multidomain_reward_func/std": 1.2003233432769775,
1132
+ "step": 51
1133
+ },
1134
+ {
1135
+ "completion_length": 459.94250640869143,
1136
+ "completions/clipped_ratio": 0.0,
1137
+ "completions/max_length": 1186.0,
1138
+ "completions/max_terminated_length": 1186.0,
1139
+ "completions/mean_length": 460.9425048828125,
1140
+ "completions/mean_terminated_length": 460.9425048828125,
1141
+ "completions/min_length": 124.0,
1142
+ "completions/min_terminated_length": 124.0,
1143
+ "epoch": 0.13866666666666666,
1144
+ "frac_reward_zero_std": 0.0,
1145
+ "grad_norm": 0.13084262609481812,
1146
+ "kl": 0.0,
1147
+ "learning_rate": 9.32e-06,
1148
+ "loss": 0.0,
1149
+ "num_tokens": 15868027.0,
1150
+ "reward": 0.796763002872467,
1151
+ "reward_std": 0.7062838077545166,
1152
+ "rewards/multidomain_reward_func/mean": 0.7967629432678223,
1153
+ "rewards/multidomain_reward_func/std": 1.2271337509155273,
1154
+ "step": 52
1155
+ },
1156
+ {
1157
+ "completion_length": 453.93250427246096,
1158
+ "completions/clipped_ratio": 0.0024999999441206455,
1159
+ "completions/max_length": 2048.0,
1160
+ "completions/max_terminated_length": 1102.0,
1161
+ "completions/mean_length": 454.92999267578125,
1162
+ "completions/mean_terminated_length": 450.9373474121094,
1163
+ "completions/min_length": 169.0,
1164
+ "completions/min_terminated_length": 169.0,
1165
+ "epoch": 0.14133333333333334,
1166
+ "frac_reward_zero_std": 0.0,
1167
+ "grad_norm": 0.14952056109905243,
1168
+ "kl": 0.0,
1169
+ "learning_rate": 9.306666666666667e-06,
1170
+ "loss": 0.0,
1171
+ "num_tokens": 16159159.0,
1172
+ "reward": 0.6241033673286438,
1173
+ "reward_std": 0.6670031547546387,
1174
+ "rewards/multidomain_reward_func/mean": 0.6241033673286438,
1175
+ "rewards/multidomain_reward_func/std": 1.1514482498168945,
1176
+ "step": 53
1177
+ },
1178
+ {
1179
+ "completion_length": 452.2375061035156,
1180
+ "completions/clipped_ratio": 0.0,
1181
+ "completions/max_length": 1412.0,
1182
+ "completions/max_terminated_length": 1412.0,
1183
+ "completions/mean_length": 453.23748779296875,
1184
+ "completions/mean_terminated_length": 453.23748779296875,
1185
+ "completions/min_length": 144.0,
1186
+ "completions/min_terminated_length": 144.0,
1187
+ "epoch": 0.144,
1188
+ "frac_reward_zero_std": 0.0,
1189
+ "grad_norm": 0.09604249149560928,
1190
+ "kl": 0.0,
1191
+ "learning_rate": 9.293333333333335e-06,
1192
+ "loss": -0.0,
1193
+ "num_tokens": 16452664.0,
1194
+ "reward": 0.4965752363204956,
1195
+ "reward_std": 0.5294793844223022,
1196
+ "rewards/multidomain_reward_func/mean": 0.4965752363204956,
1197
+ "rewards/multidomain_reward_func/std": 1.1207164525985718,
1198
+ "step": 54
1199
+ },
1200
+ {
1201
+ "completion_length": 465.0425033569336,
1202
+ "completions/clipped_ratio": 0.0,
1203
+ "completions/max_length": 1350.0,
1204
+ "completions/max_terminated_length": 1350.0,
1205
+ "completions/mean_length": 466.04248046875,
1206
+ "completions/mean_terminated_length": 466.04248046875,
1207
+ "completions/min_length": 132.0,
1208
+ "completions/min_terminated_length": 132.0,
1209
+ "epoch": 0.14666666666666667,
1210
+ "frac_reward_zero_std": 0.0,
1211
+ "grad_norm": 0.11836592108011246,
1212
+ "kl": 0.0,
1213
+ "learning_rate": 9.280000000000001e-06,
1214
+ "loss": 0.0,
1215
+ "num_tokens": 16757261.0,
1216
+ "reward": 0.6661089658737183,
1217
+ "reward_std": 0.5464302897453308,
1218
+ "rewards/multidomain_reward_func/mean": 0.666109025478363,
1219
+ "rewards/multidomain_reward_func/std": 0.9100214242935181,
1220
+ "step": 55
1221
+ },
1222
+ {
1223
+ "completion_length": 491.40500793457034,
1224
+ "completions/clipped_ratio": 0.0,
1225
+ "completions/max_length": 1505.0,
1226
+ "completions/max_terminated_length": 1505.0,
1227
+ "completions/mean_length": 492.4049987792969,
1228
+ "completions/mean_terminated_length": 492.4049987792969,
1229
+ "completions/min_length": 145.0,
1230
+ "completions/min_terminated_length": 145.0,
1231
+ "epoch": 0.14933333333333335,
1232
+ "frac_reward_zero_std": 0.0,
1233
+ "grad_norm": 0.0901034027338028,
1234
+ "kl": 0.0,
1235
+ "learning_rate": 9.266666666666667e-06,
1236
+ "loss": 0.0,
1237
+ "num_tokens": 17086573.0,
1238
+ "reward": 0.8763098120689392,
1239
+ "reward_std": 0.5632340908050537,
1240
+ "rewards/multidomain_reward_func/mean": 0.876309871673584,
1241
+ "rewards/multidomain_reward_func/std": 0.8210961818695068,
1242
+ "step": 56
1243
+ },
1244
+ {
1245
+ "completion_length": 495.40500793457034,
1246
+ "completions/clipped_ratio": 0.0,
1247
+ "completions/max_length": 1396.0,
1248
+ "completions/max_terminated_length": 1396.0,
1249
+ "completions/mean_length": 496.4049987792969,
1250
+ "completions/mean_terminated_length": 496.4049987792969,
1251
+ "completions/min_length": 168.0,
1252
+ "completions/min_terminated_length": 168.0,
1253
+ "epoch": 0.152,
1254
+ "frac_reward_zero_std": 0.0,
1255
+ "grad_norm": 0.09418133646249771,
1256
+ "kl": 0.0,
1257
+ "learning_rate": 9.253333333333333e-06,
1258
+ "loss": 0.0,
1259
+ "num_tokens": 17411285.0,
1260
+ "reward": 0.7451098561286926,
1261
+ "reward_std": 0.5770621299743652,
1262
+ "rewards/multidomain_reward_func/mean": 0.7451097965240479,
1263
+ "rewards/multidomain_reward_func/std": 0.9386699795722961,
1264
+ "step": 57
1265
+ },
1266
+ {
1267
+ "completion_length": 1361.317529296875,
1268
+ "completions/clipped_ratio": 0.5299999713897705,
1269
+ "completions/max_length": 2048.0,
1270
+ "completions/max_terminated_length": 1974.0,
1271
+ "completions/mean_length": 1361.7874755859375,
1272
+ "completions/mean_terminated_length": 587.973388671875,
1273
+ "completions/min_length": 5.0,
1274
+ "completions/min_terminated_length": 5.0,
1275
+ "epoch": 0.15466666666666667,
1276
+ "frac_reward_zero_std": 0.6499999761581421,
1277
+ "grad_norm": 0.1710672676563263,
1278
+ "kl": 0.0,
1279
+ "learning_rate": 9.240000000000001e-06,
1280
+ "loss": 0.0,
1281
+ "num_tokens": 18114280.0,
1282
+ "reward": -3.2304539680480957,
1283
+ "reward_std": 0.3466276526451111,
1284
+ "rewards/multidomain_reward_func/mean": -3.230454206466675,
1285
+ "rewards/multidomain_reward_func/std": 2.6598012447357178,
1286
+ "step": 58
1287
+ },
1288
+ {
1289
+ "completion_length": 459.1775100708008,
1290
+ "completions/clipped_ratio": 0.0,
1291
+ "completions/max_length": 1182.0,
1292
+ "completions/max_terminated_length": 1182.0,
1293
+ "completions/mean_length": 460.177490234375,
1294
+ "completions/mean_terminated_length": 460.177490234375,
1295
+ "completions/min_length": 172.0,
1296
+ "completions/min_terminated_length": 172.0,
1297
+ "epoch": 0.15733333333333333,
1298
+ "frac_reward_zero_std": 0.0,
1299
+ "grad_norm": 0.18426430225372314,
1300
+ "kl": 0.0,
1301
+ "learning_rate": 9.226666666666668e-06,
1302
+ "loss": 0.0,
1303
+ "num_tokens": 18410481.0,
1304
+ "reward": 0.7067541480064392,
1305
+ "reward_std": 0.5836679935455322,
1306
+ "rewards/multidomain_reward_func/mean": 0.706754207611084,
1307
+ "rewards/multidomain_reward_func/std": 0.9405755996704102,
1308
+ "step": 59
1309
+ },
1310
+ {
1311
+ "completion_length": 518.2050018310547,
1312
+ "completions/clipped_ratio": 0.0,
1313
+ "completions/max_length": 1761.0,
1314
+ "completions/max_terminated_length": 1761.0,
1315
+ "completions/mean_length": 519.2050170898438,
1316
+ "completions/mean_terminated_length": 519.2050170898438,
1317
+ "completions/min_length": 150.0,
1318
+ "completions/min_terminated_length": 150.0,
1319
+ "epoch": 0.16,
1320
+ "frac_reward_zero_std": 0.0,
1321
+ "grad_norm": 0.10580391436815262,
1322
+ "kl": 0.0,
1323
+ "learning_rate": 9.213333333333334e-06,
1324
+ "loss": 0.0,
1325
+ "num_tokens": 18777053.0,
1326
+ "reward": 0.6891916990280151,
1327
+ "reward_std": 0.6400912404060364,
1328
+ "rewards/multidomain_reward_func/mean": 0.6891917586326599,
1329
+ "rewards/multidomain_reward_func/std": 1.0435802936553955,
1330
+ "step": 60
1331
+ },
1332
+ {
1333
+ "completion_length": 488.422509765625,
1334
+ "completions/clipped_ratio": 0.0,
1335
+ "completions/max_length": 1770.0,
1336
+ "completions/max_terminated_length": 1770.0,
1337
+ "completions/mean_length": 489.4224853515625,
1338
+ "completions/mean_terminated_length": 489.4224853515625,
1339
+ "completions/min_length": 149.0,
1340
+ "completions/min_terminated_length": 149.0,
1341
+ "epoch": 0.16266666666666665,
1342
+ "frac_reward_zero_std": 0.0,
1343
+ "grad_norm": 0.11052470654249191,
1344
+ "kl": 0.0,
1345
+ "learning_rate": 9.200000000000002e-06,
1346
+ "loss": 0.0,
1347
+ "num_tokens": 19105212.0,
1348
+ "reward": 0.7272980809211731,
1349
+ "reward_std": 0.6313333511352539,
1350
+ "rewards/multidomain_reward_func/mean": 0.7272981405258179,
1351
+ "rewards/multidomain_reward_func/std": 0.9895555973052979,
1352
+ "step": 61
1353
+ },
1354
+ {
1355
+ "completion_length": 499.8575073242188,
1356
+ "completions/clipped_ratio": 0.0024999999441206455,
1357
+ "completions/max_length": 2048.0,
1358
+ "completions/max_terminated_length": 1807.0,
1359
+ "completions/mean_length": 500.85498046875,
1360
+ "completions/mean_terminated_length": 496.9774475097656,
1361
+ "completions/min_length": 142.0,
1362
+ "completions/min_terminated_length": 142.0,
1363
+ "epoch": 0.16533333333333333,
1364
+ "frac_reward_zero_std": 0.0,
1365
+ "grad_norm": 0.09797242283821106,
1366
+ "kl": 0.0,
1367
+ "learning_rate": 9.186666666666666e-06,
1368
+ "loss": -0.0,
1369
+ "num_tokens": 19439224.0,
1370
+ "reward": 0.7694770693778992,
1371
+ "reward_std": 0.6147860288619995,
1372
+ "rewards/multidomain_reward_func/mean": 0.7694770693778992,
1373
+ "rewards/multidomain_reward_func/std": 0.9149509072303772,
1374
+ "step": 62
1375
+ },
1376
+ {
1377
+ "completion_length": 470.8975067138672,
1378
+ "completions/clipped_ratio": 0.0,
1379
+ "completions/max_length": 982.0,
1380
+ "completions/max_terminated_length": 982.0,
1381
+ "completions/mean_length": 471.8974914550781,
1382
+ "completions/mean_terminated_length": 471.8974914550781,
1383
+ "completions/min_length": 117.0,
1384
+ "completions/min_terminated_length": 117.0,
1385
+ "epoch": 0.168,
1386
+ "frac_reward_zero_std": 0.0,
1387
+ "grad_norm": 0.12676118314266205,
1388
+ "kl": 0.0,
1389
+ "learning_rate": 9.173333333333334e-06,
1390
+ "loss": 0.0,
1391
+ "num_tokens": 19736443.0,
1392
+ "reward": 0.8108384609222412,
1393
+ "reward_std": 0.6640507578849792,
1394
+ "rewards/multidomain_reward_func/mean": 0.8108384609222412,
1395
+ "rewards/multidomain_reward_func/std": 1.193241834640503,
1396
+ "step": 63
1397
+ },
1398
+ {
1399
+ "completion_length": 498.67500762939454,
1400
+ "completions/clipped_ratio": 0.007499999832361937,
1401
+ "completions/max_length": 2048.0,
1402
+ "completions/max_terminated_length": 1361.0,
1403
+ "completions/mean_length": 499.66748046875,
1404
+ "completions/mean_terminated_length": 487.96722412109375,
1405
+ "completions/min_length": 166.0,
1406
+ "completions/min_terminated_length": 166.0,
1407
+ "epoch": 0.17066666666666666,
1408
+ "frac_reward_zero_std": 0.0,
1409
+ "grad_norm": 0.13014905154705048,
1410
+ "kl": 0.0,
1411
+ "learning_rate": 9.16e-06,
1412
+ "loss": -0.0,
1413
+ "num_tokens": 20057610.0,
1414
+ "reward": 0.6247091293334961,
1415
+ "reward_std": 0.7679011225700378,
1416
+ "rewards/multidomain_reward_func/mean": 0.6247091889381409,
1417
+ "rewards/multidomain_reward_func/std": 1.1993564367294312,
1418
+ "step": 64
1419
+ },
1420
+ {
1421
+ "completion_length": 458.45250549316404,
1422
+ "completions/clipped_ratio": 0.0,
1423
+ "completions/max_length": 1030.0,
1424
+ "completions/max_terminated_length": 1030.0,
1425
+ "completions/mean_length": 459.4524841308594,
1426
+ "completions/mean_terminated_length": 459.4524841308594,
1427
+ "completions/min_length": 139.0,
1428
+ "completions/min_terminated_length": 139.0,
1429
+ "epoch": 0.17333333333333334,
1430
+ "frac_reward_zero_std": 0.0,
1431
+ "grad_norm": 0.13847854733467102,
1432
+ "kl": 0.0,
1433
+ "learning_rate": 9.146666666666667e-06,
1434
+ "loss": 0.0,
1435
+ "num_tokens": 20372321.0,
1436
+ "reward": 0.7129409909248352,
1437
+ "reward_std": 0.617203950881958,
1438
+ "rewards/multidomain_reward_func/mean": 0.7129409909248352,
1439
+ "rewards/multidomain_reward_func/std": 1.0829230546951294,
1440
+ "step": 65
1441
+ },
1442
+ {
1443
+ "completion_length": 496.3400085449219,
1444
+ "completions/clipped_ratio": 0.0,
1445
+ "completions/max_length": 1668.0,
1446
+ "completions/max_terminated_length": 1668.0,
1447
+ "completions/mean_length": 497.3399963378906,
1448
+ "completions/mean_terminated_length": 497.3399963378906,
1449
+ "completions/min_length": 153.0,
1450
+ "completions/min_terminated_length": 153.0,
1451
+ "epoch": 0.176,
1452
+ "frac_reward_zero_std": 0.0,
1453
+ "grad_norm": 0.10193447768688202,
1454
+ "kl": 0.0,
1455
+ "learning_rate": 9.133333333333335e-06,
1456
+ "loss": 0.0,
1457
+ "num_tokens": 20709467.0,
1458
+ "reward": 0.6699061393737793,
1459
+ "reward_std": 0.6168271899223328,
1460
+ "rewards/multidomain_reward_func/mean": 0.6699060797691345,
1461
+ "rewards/multidomain_reward_func/std": 1.0752849578857422,
1462
+ "step": 66
1463
+ },
1464
+ {
1465
+ "completion_length": 481.9150100708008,
1466
+ "completions/clipped_ratio": 0.0,
1467
+ "completions/max_length": 1616.0,
1468
+ "completions/max_terminated_length": 1616.0,
1469
+ "completions/mean_length": 482.91497802734375,
1470
+ "completions/mean_terminated_length": 482.91497802734375,
1471
+ "completions/min_length": 139.0,
1472
+ "completions/min_terminated_length": 139.0,
1473
+ "epoch": 0.17866666666666667,
1474
+ "frac_reward_zero_std": 0.0,
1475
+ "grad_norm": 0.11991851031780243,
1476
+ "kl": 0.0,
1477
+ "learning_rate": 9.12e-06,
1478
+ "loss": 0.0,
1479
+ "num_tokens": 21044563.0,
1480
+ "reward": 0.5581293106079102,
1481
+ "reward_std": 0.6784390807151794,
1482
+ "rewards/multidomain_reward_func/mean": 0.5581292510032654,
1483
+ "rewards/multidomain_reward_func/std": 1.1184920072555542,
1484
+ "step": 67
1485
+ },
1486
+ {
1487
+ "completion_length": 483.61750946044924,
1488
+ "completions/clipped_ratio": 0.004999999888241291,
1489
+ "completions/max_length": 2048.0,
1490
+ "completions/max_terminated_length": 1506.0,
1491
+ "completions/mean_length": 484.61248779296875,
1492
+ "completions/mean_terminated_length": 476.75628662109375,
1493
+ "completions/min_length": 143.0,
1494
+ "completions/min_terminated_length": 143.0,
1495
+ "epoch": 0.18133333333333335,
1496
+ "frac_reward_zero_std": 0.0,
1497
+ "grad_norm": 0.10080371052026749,
1498
+ "kl": 0.0,
1499
+ "learning_rate": 9.106666666666667e-06,
1500
+ "loss": 0.0,
1501
+ "num_tokens": 21359318.0,
1502
+ "reward": 0.5767732262611389,
1503
+ "reward_std": 0.6775339841842651,
1504
+ "rewards/multidomain_reward_func/mean": 0.5767732262611389,
1505
+ "rewards/multidomain_reward_func/std": 1.1861456632614136,
1506
+ "step": 68
1507
+ },
1508
+ {
1509
+ "completion_length": 451.9850082397461,
1510
+ "completions/clipped_ratio": 0.0,
1511
+ "completions/max_length": 1193.0,
1512
+ "completions/max_terminated_length": 1193.0,
1513
+ "completions/mean_length": 452.9849853515625,
1514
+ "completions/mean_terminated_length": 452.9849853515625,
1515
+ "completions/min_length": 157.0,
1516
+ "completions/min_terminated_length": 157.0,
1517
+ "epoch": 0.184,
1518
+ "frac_reward_zero_std": 0.0,
1519
+ "grad_norm": 0.1563381850719452,
1520
+ "kl": 0.0,
1521
+ "learning_rate": 9.093333333333333e-06,
1522
+ "loss": 0.0,
1523
+ "num_tokens": 21654302.0,
1524
+ "reward": 0.8275067210197449,
1525
+ "reward_std": 0.6079853177070618,
1526
+ "rewards/multidomain_reward_func/mean": 0.8275066614151001,
1527
+ "rewards/multidomain_reward_func/std": 0.9333595633506775,
1528
+ "step": 69
1529
+ },
1530
+ {
1531
+ "completion_length": 499.7975128173828,
1532
+ "completions/clipped_ratio": 0.0,
1533
+ "completions/max_length": 1561.0,
1534
+ "completions/max_terminated_length": 1561.0,
1535
+ "completions/mean_length": 500.7974853515625,
1536
+ "completions/mean_terminated_length": 500.7974853515625,
1537
+ "completions/min_length": 158.0,
1538
+ "completions/min_terminated_length": 158.0,
1539
+ "epoch": 0.18666666666666668,
1540
+ "frac_reward_zero_std": 0.0,
1541
+ "grad_norm": 0.11193910241127014,
1542
+ "kl": 0.0,
1543
+ "learning_rate": 9.080000000000001e-06,
1544
+ "loss": 0.0,
1545
+ "num_tokens": 21985271.0,
1546
+ "reward": 0.6707971096038818,
1547
+ "reward_std": 0.6554468870162964,
1548
+ "rewards/multidomain_reward_func/mean": 0.6707971096038818,
1549
+ "rewards/multidomain_reward_func/std": 0.9665917158126831,
1550
+ "step": 70
1551
+ },
1552
+ {
1553
+ "completion_length": 463.9925048828125,
1554
+ "completions/clipped_ratio": 0.0,
1555
+ "completions/max_length": 1094.0,
1556
+ "completions/max_terminated_length": 1094.0,
1557
+ "completions/mean_length": 464.99249267578125,
1558
+ "completions/mean_terminated_length": 464.99249267578125,
1559
+ "completions/min_length": 135.0,
1560
+ "completions/min_terminated_length": 135.0,
1561
+ "epoch": 0.18933333333333333,
1562
+ "frac_reward_zero_std": 0.0,
1563
+ "grad_norm": 0.12109918892383575,
1564
+ "kl": 0.0,
1565
+ "learning_rate": 9.066666666666667e-06,
1566
+ "loss": 0.0,
1567
+ "num_tokens": 22281108.0,
1568
+ "reward": 0.7637037634849548,
1569
+ "reward_std": 0.7281582951545715,
1570
+ "rewards/multidomain_reward_func/mean": 0.7637037634849548,
1571
+ "rewards/multidomain_reward_func/std": 1.2205768823623657,
1572
+ "step": 71
1573
+ },
1574
+ {
1575
+ "completion_length": 464.7775054931641,
1576
+ "completions/clipped_ratio": 0.0,
1577
+ "completions/max_length": 1411.0,
1578
+ "completions/max_terminated_length": 1411.0,
1579
+ "completions/mean_length": 465.7774963378906,
1580
+ "completions/mean_terminated_length": 465.7774963378906,
1581
+ "completions/min_length": 119.0,
1582
+ "completions/min_terminated_length": 119.0,
1583
+ "epoch": 0.192,
1584
+ "frac_reward_zero_std": 0.0,
1585
+ "grad_norm": 0.11413338035345078,
1586
+ "kl": 0.0,
1587
+ "learning_rate": 9.053333333333334e-06,
1588
+ "loss": -0.0,
1589
+ "num_tokens": 22588769.0,
1590
+ "reward": 0.7388538122177124,
1591
+ "reward_std": 0.647964596748352,
1592
+ "rewards/multidomain_reward_func/mean": 0.7388537526130676,
1593
+ "rewards/multidomain_reward_func/std": 1.1589411497116089,
1594
+ "step": 72
1595
+ },
1596
+ {
1597
+ "completion_length": 451.3100051879883,
1598
+ "completions/clipped_ratio": 0.0,
1599
+ "completions/max_length": 1049.0,
1600
+ "completions/max_terminated_length": 1049.0,
1601
+ "completions/mean_length": 452.30999755859375,
1602
+ "completions/mean_terminated_length": 452.30999755859375,
1603
+ "completions/min_length": 136.0,
1604
+ "completions/min_terminated_length": 136.0,
1605
+ "epoch": 0.19466666666666665,
1606
+ "frac_reward_zero_std": 0.0,
1607
+ "grad_norm": 0.10486124455928802,
1608
+ "kl": 0.0,
1609
+ "learning_rate": 9.040000000000002e-06,
1610
+ "loss": 0.0,
1611
+ "num_tokens": 22881323.0,
1612
+ "reward": 0.825734555721283,
1613
+ "reward_std": 0.5979833602905273,
1614
+ "rewards/multidomain_reward_func/mean": 0.8257344961166382,
1615
+ "rewards/multidomain_reward_func/std": 1.0813428163528442,
1616
+ "step": 73
1617
+ },
1618
+ {
1619
+ "completion_length": 460.61500854492186,
1620
+ "completions/clipped_ratio": 0.0,
1621
+ "completions/max_length": 889.0,
1622
+ "completions/max_terminated_length": 889.0,
1623
+ "completions/mean_length": 461.614990234375,
1624
+ "completions/mean_terminated_length": 461.614990234375,
1625
+ "completions/min_length": 153.0,
1626
+ "completions/min_terminated_length": 153.0,
1627
+ "epoch": 0.19733333333333333,
1628
+ "frac_reward_zero_std": 0.0,
1629
+ "grad_norm": 0.1169808879494667,
1630
+ "kl": 0.0,
1631
+ "learning_rate": 9.026666666666666e-06,
1632
+ "loss": 0.0,
1633
+ "num_tokens": 23181119.0,
1634
+ "reward": 0.5477174520492554,
1635
+ "reward_std": 0.670683741569519,
1636
+ "rewards/multidomain_reward_func/mean": 0.5477175116539001,
1637
+ "rewards/multidomain_reward_func/std": 1.1359970569610596,
1638
+ "step": 74
1639
+ }
1640
+ ],
1641
+ "logging_steps": 1,
1642
+ "max_steps": 750,
1643
+ "num_input_tokens_seen": 23521351,
1644
+ "num_train_epochs": 2,
1645
+ "save_steps": 250,
1646
+ "stateful_callbacks": {
1647
+ "TrainerControl": {
1648
+ "args": {
1649
+ "should_epoch_stop": false,
1650
+ "should_evaluate": false,
1651
+ "should_log": false,
1652
+ "should_save": false,
1653
+ "should_training_stop": false
1654
+ },
1655
+ "attributes": {}
1656
+ }
1657
+ },
1658
+ "total_flos": 0.0,
1659
+ "train_batch_size": 20,
1660
+ "trial_name": null,
1661
+ "trial_params": null
1662
+ }
ckpt-10-percent/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:61eace08d8243327b42925ff3b1e3c66cebcd3733c789b42a8bf93692185dfd6
3
  size 7505
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:08a7756db7d337356b03f79cfc247284ee5c8ed3985ced618c069718c8faeac6
3
  size 7505