BRlkl commited on
Commit
e2b2b56
·
verified ·
1 Parent(s): 9dc0e0a

full-state checkpoint 40-percent (step 37)

Browse files
ckpt-40-percent/adapter_config.json CHANGED
@@ -22,7 +22,7 @@
22
  "loftq_config": {},
23
  "lora_alpha": 128,
24
  "lora_bias": false,
25
- "lora_dropout": 0.0,
26
  "megatron_config": null,
27
  "megatron_core": "megatron.core",
28
  "modules_to_save": null,
@@ -33,13 +33,13 @@
33
  "rank_pattern": {},
34
  "revision": null,
35
  "target_modules": [
36
- "q_proj",
 
37
  "k_proj",
 
38
  "up_proj",
39
- "o_proj",
40
  "gate_proj",
41
- "v_proj",
42
- "down_proj"
43
  ],
44
  "target_parameters": null,
45
  "task_type": "CAUSAL_LM",
 
22
  "loftq_config": {},
23
  "lora_alpha": 128,
24
  "lora_bias": false,
25
+ "lora_dropout": 0,
26
  "megatron_config": null,
27
  "megatron_core": "megatron.core",
28
  "modules_to_save": null,
 
33
  "rank_pattern": {},
34
  "revision": null,
35
  "target_modules": [
36
+ "o_proj",
37
+ "v_proj",
38
  "k_proj",
39
+ "down_proj",
40
  "up_proj",
 
41
  "gate_proj",
42
+ "q_proj"
 
43
  ],
44
  "target_parameters": null,
45
  "task_type": "CAUSAL_LM",
ckpt-40-percent/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4193948521a74caee87f9af27bb89976d5850e03f214624ecea3d1591794fe3c
3
  size 528550256
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:547c520e9f6018dcc04e0887a68f66d4b548b65f8ba5326a187b3ce97736ffd8
3
  size 528550256
ckpt-40-percent/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d7474bafbe60682e7d3d2158a792c18fcb041721f237de832859e8b832bf9f2b
3
  size 268963141
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7cead732039a3038ba8417f7b3ad7e37980df520b14fccc9f4888df78be2519a
3
  size 268963141
ckpt-40-percent/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:17d5a3e321b13968af04496b05cee2e429ee248e578f5a3d0ae41b3721da2857
3
- size 14709
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7dbc8c9973e8ee68a48d381267c6f9df9f7ae9da4f171318dc9c7a0f1150af01
3
+ size 14645
ckpt-40-percent/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:903719f9c6dd8b4ae758b702075c2357a375dcc44c265e022a8de9fb3959abdb
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5fbb20913bae5335d75c036bc67049a0f424c18b248b7ab0fe9f7a3664d090e3
3
  size 1465
ckpt-40-percent/trainer_state.json CHANGED
@@ -2,831 +2,809 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.1450381679389313,
6
  "eval_steps": 500,
7
- "global_step": 38,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "completion_length": 567.1225036621094,
14
- "completions/clipped_ratio": 0.07750000059604645,
15
- "completions/max_length": 2048.0,
16
- "completions/max_terminated_length": 1791.0,
17
- "completions/mean_length": 568.0449829101562,
18
- "completions/mean_terminated_length": 443.7127380371094,
19
- "completions/min_length": 31.0,
20
- "completions/min_terminated_length": 31.0,
21
- "epoch": 0.003816793893129771,
22
- "frac_reward_zero_std": 0.5,
23
- "grad_norm": 0.1118144765496254,
24
  "kl": 0.0,
25
  "learning_rate": 0.0,
26
  "loss": 0.0,
27
- "num_tokens": 359368.0,
28
- "reward": 0.3113526701927185,
29
- "reward_std": 0.43317943811416626,
30
- "rewards/multidomain_reward_func/mean": 0.3113526999950409,
31
- "rewards/multidomain_reward_func/std": 1.7456704378128052,
32
  "step": 1
33
  },
34
  {
35
- "completion_length": 426.60000762939455,
36
  "completions/clipped_ratio": 0.0,
37
- "completions/max_length": 1055.0,
38
- "completions/max_terminated_length": 1055.0,
39
- "completions/mean_length": 427.5999755859375,
40
- "completions/mean_terminated_length": 427.5999755859375,
41
- "completions/min_length": 125.0,
42
- "completions/min_terminated_length": 125.0,
43
- "epoch": 0.007633587786259542,
44
- "frac_reward_zero_std": 0.22499999403953552,
45
- "grad_norm": 0.18109840154647827,
46
- "kl": 0.0,
47
- "learning_rate": 1.25e-06,
48
  "loss": 0.0,
49
- "num_tokens": 666388.0,
50
- "reward": 0.9589062333106995,
51
- "reward_std": 0.7145926356315613,
52
- "rewards/multidomain_reward_func/mean": 0.9589062333106995,
53
- "rewards/multidomain_reward_func/std": 1.3675330877304077,
54
  "step": 2
55
  },
56
  {
57
- "completion_length": 398.95250396728517,
58
  "completions/clipped_ratio": 0.0,
59
- "completions/max_length": 1163.0,
60
- "completions/max_terminated_length": 1163.0,
61
- "completions/mean_length": 399.9524841308594,
62
- "completions/mean_terminated_length": 399.9524841308594,
63
- "completions/min_length": 129.0,
64
- "completions/min_terminated_length": 129.0,
65
- "epoch": 0.011450381679389313,
66
- "frac_reward_zero_std": 0.17499999701976776,
67
- "grad_norm": 0.1430462747812271,
68
- "kl": 0.0,
69
- "learning_rate": 2.5e-06,
70
  "loss": 0.0,
71
- "num_tokens": 944729.0,
72
- "reward": 0.980177104473114,
73
- "reward_std": 0.7184366583824158,
74
- "rewards/multidomain_reward_func/mean": 0.980177104473114,
75
- "rewards/multidomain_reward_func/std": 1.2778397798538208,
76
  "step": 3
77
  },
78
  {
79
- "completion_length": 413.59500885009766,
80
  "completions/clipped_ratio": 0.0,
81
- "completions/max_length": 1070.0,
82
- "completions/max_terminated_length": 1070.0,
83
- "completions/mean_length": 414.5950012207031,
84
- "completions/mean_terminated_length": 414.5950012207031,
85
- "completions/min_length": 20.0,
86
- "completions/min_terminated_length": 20.0,
87
- "epoch": 0.015267175572519083,
88
- "frac_reward_zero_std": 0.14999999105930328,
89
- "grad_norm": 0.18651697039604187,
90
- "kl": 0.0,
91
- "learning_rate": 3.7500000000000005e-06,
92
  "loss": -0.0,
93
- "num_tokens": 1240667.0,
94
- "reward": 0.8180863261222839,
95
- "reward_std": 0.7909437417984009,
96
- "rewards/multidomain_reward_func/mean": 0.8180863261222839,
97
- "rewards/multidomain_reward_func/std": 1.4800753593444824,
98
  "step": 4
99
  },
100
  {
101
- "completion_length": 403.8550094604492,
102
  "completions/clipped_ratio": 0.0,
103
- "completions/max_length": 1220.0,
104
- "completions/max_terminated_length": 1220.0,
105
- "completions/mean_length": 404.85498046875,
106
- "completions/mean_terminated_length": 404.85498046875,
107
- "completions/min_length": 114.0,
108
- "completions/min_terminated_length": 114.0,
109
- "epoch": 0.019083969465648856,
110
- "frac_reward_zero_std": 0.375,
111
- "grad_norm": 0.1282689869403839,
112
- "kl": 0.0,
113
- "learning_rate": 5e-06,
114
- "loss": -0.0,
115
- "num_tokens": 1528019.0,
116
- "reward": 0.9822691082954407,
117
- "reward_std": 0.5212968587875366,
118
- "rewards/multidomain_reward_func/mean": 0.9822691082954407,
119
- "rewards/multidomain_reward_func/std": 1.1484410762786865,
120
  "step": 5
121
  },
122
  {
123
- "completion_length": 419.6075042724609,
124
  "completions/clipped_ratio": 0.0,
125
- "completions/max_length": 1306.0,
126
- "completions/max_terminated_length": 1306.0,
127
- "completions/mean_length": 420.60748291015625,
128
- "completions/mean_terminated_length": 420.60748291015625,
129
- "completions/min_length": 123.0,
130
- "completions/min_terminated_length": 123.0,
131
- "epoch": 0.022900763358778626,
132
- "frac_reward_zero_std": 0.32499998807907104,
133
- "grad_norm": 0.1297599971294403,
134
  "kl": 0.0,
135
- "learning_rate": 6.25e-06,
136
  "loss": 0.0,
137
- "num_tokens": 1838792.0,
138
- "reward": 1.1234791278839111,
139
- "reward_std": 0.5952121019363403,
140
- "rewards/multidomain_reward_func/mean": 1.1234791278839111,
141
- "rewards/multidomain_reward_func/std": 1.1342366933822632,
142
  "step": 6
143
  },
144
  {
145
- "completion_length": 394.5875045776367,
146
  "completions/clipped_ratio": 0.0,
147
- "completions/max_length": 1297.0,
148
- "completions/max_terminated_length": 1297.0,
149
- "completions/mean_length": 395.5874938964844,
150
- "completions/mean_terminated_length": 395.5874938964844,
151
- "completions/min_length": 20.0,
152
- "completions/min_terminated_length": 20.0,
153
- "epoch": 0.026717557251908396,
154
- "frac_reward_zero_std": 0.29999998211860657,
155
- "grad_norm": 0.1435064822435379,
156
- "kl": 0.0,
157
- "learning_rate": 7.500000000000001e-06,
158
- "loss": 0.0,
159
- "num_tokens": 2127307.0,
160
- "reward": 0.9833280444145203,
161
- "reward_std": 0.6006773114204407,
162
- "rewards/multidomain_reward_func/mean": 0.983328104019165,
163
- "rewards/multidomain_reward_func/std": 1.1262794733047485,
164
  "step": 7
165
  },
166
  {
167
- "completion_length": 386.64000701904297,
168
  "completions/clipped_ratio": 0.0,
169
- "completions/max_length": 1240.0,
170
- "completions/max_terminated_length": 1240.0,
171
- "completions/mean_length": 387.6399841308594,
172
- "completions/mean_terminated_length": 387.6399841308594,
173
- "completions/min_length": 99.0,
174
- "completions/min_terminated_length": 99.0,
175
- "epoch": 0.030534351145038167,
176
- "frac_reward_zero_std": 0.22499999403953552,
177
- "grad_norm": 0.1412028968334198,
178
- "kl": 0.0,
179
- "learning_rate": 8.750000000000001e-06,
180
- "loss": -0.0,
181
- "num_tokens": 2408423.0,
182
- "reward": 0.9568594098091125,
183
- "reward_std": 0.6804539561271667,
184
- "rewards/multidomain_reward_func/mean": 0.9568594098091125,
185
- "rewards/multidomain_reward_func/std": 1.3227062225341797,
186
  "step": 8
187
  },
188
  {
189
- "completion_length": 392.76500701904297,
190
  "completions/clipped_ratio": 0.0,
191
- "completions/max_length": 1124.0,
192
- "completions/max_terminated_length": 1124.0,
193
- "completions/mean_length": 393.7649841308594,
194
- "completions/mean_terminated_length": 393.7649841308594,
195
- "completions/min_length": 129.0,
196
- "completions/min_terminated_length": 129.0,
197
- "epoch": 0.03435114503816794,
198
- "frac_reward_zero_std": 0.375,
199
- "grad_norm": 0.1385762244462967,
200
- "kl": 0.0,
201
- "learning_rate": 1e-05,
202
  "loss": 0.0,
203
- "num_tokens": 2693519.0,
204
- "reward": 1.08942711353302,
205
- "reward_std": 0.5218599438667297,
206
- "rewards/multidomain_reward_func/mean": 1.0894269943237305,
207
- "rewards/multidomain_reward_func/std": 1.1226119995117188,
208
  "step": 9
209
  },
210
  {
211
- "completion_length": 402.0925048828125,
212
  "completions/clipped_ratio": 0.0,
213
- "completions/max_length": 1010.0,
214
- "completions/max_terminated_length": 1010.0,
215
- "completions/mean_length": 403.0924987792969,
216
- "completions/mean_terminated_length": 403.0924987792969,
217
- "completions/min_length": 113.0,
218
- "completions/min_terminated_length": 113.0,
219
- "epoch": 0.03816793893129771,
220
- "frac_reward_zero_std": 0.42499998211860657,
221
- "grad_norm": 0.1233278438448906,
222
- "kl": 0.0,
223
- "learning_rate": 1e-05,
224
- "loss": -0.0,
225
- "num_tokens": 2988546.0,
226
- "reward": 1.0668281316757202,
227
- "reward_std": 0.45709723234176636,
228
- "rewards/multidomain_reward_func/mean": 1.0668281316757202,
229
- "rewards/multidomain_reward_func/std": 1.084045648574829,
230
  "step": 10
231
  },
232
  {
233
- "completion_length": 403.14000701904297,
234
  "completions/clipped_ratio": 0.0,
235
- "completions/max_length": 1282.0,
236
- "completions/max_terminated_length": 1282.0,
237
- "completions/mean_length": 404.1399841308594,
238
- "completions/mean_terminated_length": 404.1399841308594,
239
- "completions/min_length": 85.0,
240
- "completions/min_terminated_length": 85.0,
241
- "epoch": 0.04198473282442748,
242
- "frac_reward_zero_std": 0.19999998807907104,
243
- "grad_norm": 0.1325792521238327,
244
- "kl": 0.0,
245
- "learning_rate": 1e-05,
246
- "loss": 0.0,
247
- "num_tokens": 3284862.0,
248
- "reward": 0.9093489646911621,
249
- "reward_std": 0.7637284398078918,
250
- "rewards/multidomain_reward_func/mean": 0.9093489646911621,
251
- "rewards/multidomain_reward_func/std": 1.334052324295044,
252
  "step": 11
253
  },
254
  {
255
- "completion_length": 417.5600051879883,
256
- "completions/clipped_ratio": 0.0,
257
- "completions/max_length": 1357.0,
258
- "completions/max_terminated_length": 1357.0,
259
- "completions/mean_length": 418.55999755859375,
260
- "completions/mean_terminated_length": 418.55999755859375,
261
- "completions/min_length": 74.0,
262
- "completions/min_terminated_length": 74.0,
263
- "epoch": 0.04580152671755725,
264
  "frac_reward_zero_std": 0.29999998211860657,
265
- "grad_norm": 0.15695820748806,
266
  "kl": 0.0,
267
- "learning_rate": 1e-05,
268
  "loss": 0.0,
269
- "num_tokens": 3594236.0,
270
- "reward": 1.1599501371383667,
271
- "reward_std": 0.5550302267074585,
272
- "rewards/multidomain_reward_func/mean": 1.1599501371383667,
273
- "rewards/multidomain_reward_func/std": 1.0896780490875244,
274
  "step": 12
275
  },
276
  {
277
- "completion_length": 787.7125091552734,
278
- "completions/clipped_ratio": 0.20999999344348907,
279
- "completions/max_length": 2048.0,
280
- "completions/max_terminated_length": 1967.0,
281
- "completions/mean_length": 788.5025024414062,
282
- "completions/mean_terminated_length": 453.6993713378906,
283
- "completions/min_length": 4.0,
284
- "completions/min_terminated_length": 4.0,
285
- "epoch": 0.04961832061068702,
286
- "frac_reward_zero_std": 0.625,
287
- "grad_norm": 0.11839566379785538,
288
- "kl": 0.0,
289
- "learning_rate": 1e-05,
290
- "loss": -0.0,
291
- "num_tokens": 4067927.0,
292
- "reward": -1.5976874828338623,
293
- "reward_std": 0.30139976739883423,
294
- "rewards/multidomain_reward_func/mean": -1.5976874828338623,
295
- "rewards/multidomain_reward_func/std": 1.9086108207702637,
296
  "step": 13
297
  },
298
  {
299
- "completion_length": 390.23500671386716,
300
  "completions/clipped_ratio": 0.0,
301
- "completions/max_length": 1288.0,
302
- "completions/max_terminated_length": 1288.0,
303
- "completions/mean_length": 391.2349853515625,
304
- "completions/mean_terminated_length": 391.2349853515625,
305
- "completions/min_length": 104.0,
306
- "completions/min_terminated_length": 104.0,
307
- "epoch": 0.05343511450381679,
308
- "frac_reward_zero_std": 0.29999998211860657,
309
- "grad_norm": 0.14162577688694,
310
  "kl": 0.0,
311
- "learning_rate": 1e-05,
312
- "loss": 0.0,
313
- "num_tokens": 4354421.0,
314
- "reward": 0.8984739780426025,
315
- "reward_std": 0.7180371880531311,
316
- "rewards/multidomain_reward_func/mean": 0.8984739780426025,
317
- "rewards/multidomain_reward_func/std": 1.231815218925476,
318
  "step": 14
319
  },
320
  {
321
- "completion_length": 383.99000549316406,
322
- "completions/clipped_ratio": 0.0024999999441206455,
323
- "completions/max_length": 2048.0,
324
- "completions/max_terminated_length": 1316.0,
325
- "completions/mean_length": 384.98748779296875,
326
- "completions/mean_terminated_length": 380.8195495605469,
327
- "completions/min_length": 58.0,
328
- "completions/min_terminated_length": 58.0,
329
- "epoch": 0.05725190839694656,
330
- "frac_reward_zero_std": 0.29999998211860657,
331
- "grad_norm": 0.16749094426631927,
332
  "kl": 0.0,
333
- "learning_rate": 1e-05,
334
  "loss": -0.0,
335
- "num_tokens": 4621896.0,
336
- "reward": 0.781964123249054,
337
- "reward_std": 0.7420740127563477,
338
- "rewards/multidomain_reward_func/mean": 0.781964123249054,
339
- "rewards/multidomain_reward_func/std": 1.3384652137756348,
340
  "step": 15
341
  },
342
  {
343
- "completion_length": 406.53000793457034,
344
  "completions/clipped_ratio": 0.0,
345
- "completions/max_length": 1692.0,
346
- "completions/max_terminated_length": 1692.0,
347
- "completions/mean_length": 407.5299987792969,
348
- "completions/mean_terminated_length": 407.5299987792969,
349
- "completions/min_length": 105.0,
350
- "completions/min_terminated_length": 105.0,
351
- "epoch": 0.061068702290076333,
352
- "frac_reward_zero_std": 0.3499999940395355,
353
- "grad_norm": 0.14106978476047516,
354
- "kl": 0.0,
355
- "learning_rate": 1e-05,
356
- "loss": -0.0,
357
- "num_tokens": 4914168.0,
358
- "reward": 0.8764218688011169,
359
- "reward_std": 0.5444123148918152,
360
- "rewards/multidomain_reward_func/mean": 0.8764218688011169,
361
- "rewards/multidomain_reward_func/std": 1.2579195499420166,
362
  "step": 16
363
  },
364
  {
365
- "completion_length": 424.4025085449219,
366
  "completions/clipped_ratio": 0.0,
367
- "completions/max_length": 1329.0,
368
- "completions/max_terminated_length": 1329.0,
369
- "completions/mean_length": 425.4024963378906,
370
- "completions/mean_terminated_length": 425.4024963378906,
371
- "completions/min_length": 107.0,
372
- "completions/min_terminated_length": 107.0,
373
- "epoch": 0.0648854961832061,
374
  "frac_reward_zero_std": 0.22499999403953552,
375
- "grad_norm": 0.15478958189487457,
376
  "kl": 0.0,
377
- "learning_rate": 1e-05,
378
- "loss": -0.0,
379
- "num_tokens": 5222629.0,
380
- "reward": 0.9992671012878418,
381
- "reward_std": 0.7444822788238525,
382
- "rewards/multidomain_reward_func/mean": 0.9992671012878418,
383
- "rewards/multidomain_reward_func/std": 1.1914256811141968,
384
  "step": 17
385
  },
386
  {
387
- "completion_length": 395.4175079345703,
388
- "completions/clipped_ratio": 0.0,
389
- "completions/max_length": 1124.0,
390
- "completions/max_terminated_length": 1124.0,
391
- "completions/mean_length": 396.41748046875,
392
- "completions/mean_terminated_length": 396.41748046875,
393
- "completions/min_length": 127.0,
394
- "completions/min_terminated_length": 127.0,
395
- "epoch": 0.06870229007633588,
396
- "frac_reward_zero_std": 0.25,
397
- "grad_norm": 0.14686347544193268,
398
  "kl": 0.0,
399
- "learning_rate": 1e-05,
400
- "loss": 0.0,
401
- "num_tokens": 5501186.0,
402
- "reward": 1.1373611688613892,
403
- "reward_std": 0.5914338827133179,
404
- "rewards/multidomain_reward_func/mean": 1.1373611688613892,
405
- "rewards/multidomain_reward_func/std": 1.1266436576843262,
406
  "step": 18
407
  },
408
  {
409
- "completion_length": 423.48500671386716,
410
- "completions/clipped_ratio": 0.0,
411
- "completions/max_length": 1373.0,
412
- "completions/max_terminated_length": 1373.0,
413
- "completions/mean_length": 424.4849853515625,
414
- "completions/mean_terminated_length": 424.4849853515625,
415
- "completions/min_length": 123.0,
416
- "completions/min_terminated_length": 123.0,
417
- "epoch": 0.07251908396946564,
418
  "frac_reward_zero_std": 0.29999998211860657,
419
- "grad_norm": 0.13952139019966125,
420
- "kl": 0.0,
421
- "learning_rate": 1e-05,
422
- "loss": -0.0001,
423
- "num_tokens": 5795160.0,
424
- "reward": 0.9805884957313538,
425
- "reward_std": 0.6606646776199341,
426
- "rewards/multidomain_reward_func/mean": 0.9805885553359985,
427
- "rewards/multidomain_reward_func/std": 1.3177053928375244,
428
  "step": 19
429
  },
430
  {
431
- "completion_length": 399.9825042724609,
432
- "completions/clipped_ratio": 0.0,
433
- "completions/max_length": 1349.0,
434
- "completions/max_terminated_length": 1349.0,
435
- "completions/mean_length": 400.98248291015625,
436
- "completions/mean_terminated_length": 400.98248291015625,
437
- "completions/min_length": 106.0,
438
- "completions/min_terminated_length": 106.0,
439
- "epoch": 0.07633587786259542,
440
- "frac_reward_zero_std": 0.29999998211860657,
441
- "grad_norm": 0.1434020698070526,
442
- "kl": 0.0,
443
- "learning_rate": 1e-05,
444
  "loss": -0.0,
445
- "num_tokens": 6067803.0,
446
- "reward": 1.0183696746826172,
447
- "reward_std": 0.6196312308311462,
448
- "rewards/multidomain_reward_func/mean": 1.0183697938919067,
449
- "rewards/multidomain_reward_func/std": 1.0895596742630005,
450
  "step": 20
451
  },
452
  {
453
- "completion_length": 389.4550079345703,
454
  "completions/clipped_ratio": 0.0,
455
- "completions/max_length": 1196.0,
456
- "completions/max_terminated_length": 1196.0,
457
- "completions/mean_length": 390.4549865722656,
458
- "completions/mean_terminated_length": 390.4549865722656,
459
- "completions/min_length": 112.0,
460
- "completions/min_terminated_length": 112.0,
461
- "epoch": 0.08015267175572519,
462
- "frac_reward_zero_std": 0.2750000059604645,
463
- "grad_norm": 0.14288610219955444,
464
  "kl": 0.0,
465
- "learning_rate": 1e-05,
466
  "loss": 0.0,
467
- "num_tokens": 6343805.0,
468
- "reward": 1.1138594150543213,
469
- "reward_std": 0.696417510509491,
470
- "rewards/multidomain_reward_func/mean": 1.1138594150543213,
471
- "rewards/multidomain_reward_func/std": 1.0503205060958862,
472
  "step": 21
473
  },
474
  {
475
- "completion_length": 417.29000701904295,
476
- "completions/clipped_ratio": 0.004999999888241291,
477
- "completions/max_length": 2048.0,
478
- "completions/max_terminated_length": 1233.0,
479
- "completions/mean_length": 418.2850036621094,
480
- "completions/mean_terminated_length": 410.095458984375,
481
- "completions/min_length": 112.0,
482
- "completions/min_terminated_length": 112.0,
483
- "epoch": 0.08396946564885496,
484
- "frac_reward_zero_std": 0.2750000059604645,
485
- "grad_norm": 0.13009311258792877,
486
- "kl": 0.0,
487
- "learning_rate": 1e-05,
488
- "loss": -0.0,
489
- "num_tokens": 6627459.0,
490
- "reward": 1.0032275915145874,
491
- "reward_std": 0.6355298757553101,
492
- "rewards/multidomain_reward_func/mean": 1.003227710723877,
493
- "rewards/multidomain_reward_func/std": 1.3018079996109009,
494
  "step": 22
495
  },
496
  {
497
- "completion_length": 435.9750045776367,
498
- "completions/clipped_ratio": 0.0024999999441206455,
499
- "completions/max_length": 2048.0,
500
- "completions/max_terminated_length": 1614.0,
501
- "completions/mean_length": 436.9725036621094,
502
- "completions/mean_terminated_length": 432.9348449707031,
503
- "completions/min_length": 60.0,
504
- "completions/min_terminated_length": 60.0,
505
- "epoch": 0.08778625954198473,
506
- "frac_reward_zero_std": 0.32499998807907104,
507
- "grad_norm": 0.16062505543231964,
508
  "kl": 0.0,
509
- "learning_rate": 1e-05,
510
- "loss": -0.0,
511
- "num_tokens": 6931618.0,
512
- "reward": 0.8509132266044617,
513
- "reward_std": 0.7453229427337646,
514
- "rewards/multidomain_reward_func/mean": 0.8509131669998169,
515
- "rewards/multidomain_reward_func/std": 1.4180208444595337,
516
  "step": 23
517
  },
518
  {
519
- "completion_length": 434.4600067138672,
520
- "completions/clipped_ratio": 0.0,
521
- "completions/max_length": 1097.0,
522
- "completions/max_terminated_length": 1097.0,
523
- "completions/mean_length": 435.4599914550781,
524
- "completions/mean_terminated_length": 435.4599914550781,
525
- "completions/min_length": 131.0,
526
- "completions/min_terminated_length": 131.0,
527
- "epoch": 0.0916030534351145,
528
- "frac_reward_zero_std": 0.3499999940395355,
529
- "grad_norm": 0.11939506977796555,
530
- "kl": 0.0,
531
- "learning_rate": 1e-05,
532
  "loss": 0.0,
533
- "num_tokens": 7230562.0,
534
- "reward": 1.0885521173477173,
535
- "reward_std": 0.5634962320327759,
536
- "rewards/multidomain_reward_func/mean": 1.0885521173477173,
537
- "rewards/multidomain_reward_func/std": 1.1546189785003662,
538
  "step": 24
539
  },
540
  {
541
- "completion_length": 403.5975036621094,
542
  "completions/clipped_ratio": 0.0,
543
- "completions/max_length": 1309.0,
544
- "completions/max_terminated_length": 1309.0,
545
- "completions/mean_length": 404.5975036621094,
546
- "completions/mean_terminated_length": 404.5975036621094,
547
- "completions/min_length": 132.0,
548
- "completions/min_terminated_length": 132.0,
549
- "epoch": 0.09541984732824428,
550
- "frac_reward_zero_std": 0.19999998807907104,
551
- "grad_norm": 0.13339835405349731,
552
  "kl": 0.0,
553
- "learning_rate": 1e-05,
554
  "loss": 0.0,
555
- "num_tokens": 7509601.0,
556
- "reward": 1.0083370208740234,
557
- "reward_std": 0.7597755193710327,
558
- "rewards/multidomain_reward_func/mean": 1.008337140083313,
559
- "rewards/multidomain_reward_func/std": 1.158740520477295,
560
  "step": 25
561
  },
562
  {
563
- "completion_length": 432.5300094604492,
564
- "completions/clipped_ratio": 0.01249999925494194,
565
- "completions/max_length": 2048.0,
566
- "completions/max_terminated_length": 1090.0,
567
- "completions/mean_length": 433.5174865722656,
568
- "completions/mean_terminated_length": 413.0810241699219,
569
- "completions/min_length": 76.0,
570
- "completions/min_terminated_length": 76.0,
571
- "epoch": 0.09923664122137404,
572
- "frac_reward_zero_std": 0.2750000059604645,
573
- "grad_norm": 0.21664012968540192,
574
- "kl": 0.0,
575
- "learning_rate": 1e-05,
576
  "loss": 0.0,
577
- "num_tokens": 7808048.0,
578
- "reward": 0.5465364456176758,
579
- "reward_std": 0.7330557703971863,
580
- "rewards/multidomain_reward_func/mean": 0.5465364456176758,
581
- "rewards/multidomain_reward_func/std": 1.6296391487121582,
582
  "step": 26
583
  },
584
  {
585
- "completion_length": 470.5725036621094,
586
- "completions/clipped_ratio": 0.0,
587
- "completions/max_length": 1447.0,
588
- "completions/max_terminated_length": 1447.0,
589
- "completions/mean_length": 471.5724792480469,
590
- "completions/mean_terminated_length": 471.5724792480469,
591
- "completions/min_length": 67.0,
592
- "completions/min_terminated_length": 67.0,
593
- "epoch": 0.10305343511450382,
594
- "frac_reward_zero_std": 0.32499998807907104,
595
- "grad_norm": 0.14879898726940155,
596
  "kl": 0.0,
597
- "learning_rate": 1e-05,
598
  "loss": 0.0,
599
- "num_tokens": 8137947.0,
600
- "reward": 1.0213390588760376,
601
- "reward_std": 0.6532722115516663,
602
- "rewards/multidomain_reward_func/mean": 1.0213390588760376,
603
- "rewards/multidomain_reward_func/std": 1.0712261199951172,
604
  "step": 27
605
  },
606
  {
607
- "completion_length": 497.59501190185546,
608
- "completions/clipped_ratio": 0.01249999925494194,
609
  "completions/max_length": 2048.0,
610
- "completions/max_terminated_length": 1324.0,
611
- "completions/mean_length": 498.5824890136719,
612
- "completions/mean_terminated_length": 478.9696350097656,
613
- "completions/min_length": 101.0,
614
- "completions/min_terminated_length": 101.0,
615
- "epoch": 0.10687022900763359,
616
- "frac_reward_zero_std": 0.2750000059604645,
617
- "grad_norm": 0.13745573163032532,
618
  "kl": 0.0,
619
- "learning_rate": 1e-05,
620
  "loss": 0.0,
621
- "num_tokens": 8486510.0,
622
- "reward": 0.6380816102027893,
623
- "reward_std": 0.6292651891708374,
624
- "rewards/multidomain_reward_func/mean": 0.6380816102027893,
625
- "rewards/multidomain_reward_func/std": 1.474389910697937,
626
  "step": 28
627
  },
628
  {
629
- "completion_length": 475.4775100708008,
630
- "completions/clipped_ratio": 0.009999999776482582,
631
- "completions/max_length": 2048.0,
632
- "completions/max_terminated_length": 1549.0,
633
- "completions/mean_length": 476.4674987792969,
634
- "completions/mean_terminated_length": 460.59344482421875,
635
- "completions/min_length": 67.0,
636
- "completions/min_terminated_length": 67.0,
637
- "epoch": 0.11068702290076336,
638
- "frac_reward_zero_std": 0.2750000059604645,
639
- "grad_norm": 0.18127009272575378,
640
- "kl": 0.0,
641
- "learning_rate": 1e-05,
642
  "loss": 0.0,
643
- "num_tokens": 8811727.0,
644
- "reward": 0.3718194365501404,
645
- "reward_std": 0.8439445495605469,
646
- "rewards/multidomain_reward_func/mean": 0.37181946635246277,
647
- "rewards/multidomain_reward_func/std": 1.7001434564590454,
648
  "step": 29
649
  },
650
  {
651
- "completion_length": 482.22750701904295,
652
  "completions/clipped_ratio": 0.0,
653
- "completions/max_length": 1534.0,
654
- "completions/max_terminated_length": 1534.0,
655
- "completions/mean_length": 483.22747802734375,
656
- "completions/mean_terminated_length": 483.22747802734375,
657
- "completions/min_length": 126.0,
658
- "completions/min_terminated_length": 126.0,
659
- "epoch": 0.11450381679389313,
660
- "frac_reward_zero_std": 0.32499998807907104,
661
- "grad_norm": 0.11341980844736099,
662
  "kl": 0.0,
663
- "learning_rate": 1e-05,
664
  "loss": 0.0,
665
- "num_tokens": 9151548.0,
666
- "reward": 0.9484896063804626,
667
- "reward_std": 0.5695937871932983,
668
- "rewards/multidomain_reward_func/mean": 0.9484896063804626,
669
- "rewards/multidomain_reward_func/std": 1.1334481239318848,
670
  "step": 30
671
  },
672
  {
673
- "completion_length": 420.1550048828125,
674
  "completions/clipped_ratio": 0.0,
675
- "completions/max_length": 1361.0,
676
- "completions/max_terminated_length": 1361.0,
677
- "completions/mean_length": 421.1549987792969,
678
- "completions/mean_terminated_length": 421.1549987792969,
679
- "completions/min_length": 142.0,
680
- "completions/min_terminated_length": 142.0,
681
- "epoch": 0.1183206106870229,
682
- "frac_reward_zero_std": 0.22499999403953552,
683
- "grad_norm": 0.13916772603988647,
684
  "kl": 0.0,
685
- "learning_rate": 1e-05,
686
  "loss": -0.0,
687
- "num_tokens": 9432380.0,
688
- "reward": 0.8056875467300415,
689
- "reward_std": 0.7789397835731506,
690
- "rewards/multidomain_reward_func/mean": 0.8056875467300415,
691
- "rewards/multidomain_reward_func/std": 1.314136028289795,
692
  "step": 31
693
  },
694
  {
695
- "completion_length": 471.6550033569336,
696
- "completions/clipped_ratio": 0.0024999999441206455,
697
  "completions/max_length": 2048.0,
698
- "completions/max_terminated_length": 1544.0,
699
- "completions/mean_length": 472.6524963378906,
700
- "completions/mean_terminated_length": 468.7042541503906,
701
- "completions/min_length": 30.0,
702
- "completions/min_terminated_length": 30.0,
703
- "epoch": 0.12213740458015267,
704
- "frac_reward_zero_std": 0.25,
705
- "grad_norm": 0.1645308881998062,
706
  "kl": 0.0,
707
- "learning_rate": 1e-05,
708
  "loss": -0.0,
709
- "num_tokens": 9755281.0,
710
- "reward": 0.6055446267127991,
711
- "reward_std": 0.7402985692024231,
712
- "rewards/multidomain_reward_func/mean": 0.6055446863174438,
713
- "rewards/multidomain_reward_func/std": 1.5184121131896973,
714
  "step": 32
715
  },
716
  {
717
- "completion_length": 459.3675048828125,
718
- "completions/clipped_ratio": 0.0,
719
- "completions/max_length": 1138.0,
720
- "completions/max_terminated_length": 1138.0,
721
- "completions/mean_length": 460.36749267578125,
722
- "completions/mean_terminated_length": 460.36749267578125,
723
- "completions/min_length": 125.0,
724
- "completions/min_terminated_length": 125.0,
725
- "epoch": 0.12595419847328243,
726
- "frac_reward_zero_std": 0.14999999105930328,
727
- "grad_norm": 0.1528780460357666,
728
  "kl": 0.0,
729
- "learning_rate": 1e-05,
730
  "loss": 0.0,
731
- "num_tokens": 10076308.0,
732
- "reward": 0.9635868072509766,
733
- "reward_std": 0.8727088570594788,
734
- "rewards/multidomain_reward_func/mean": 0.9635868072509766,
735
- "rewards/multidomain_reward_func/std": 1.286659598350525,
736
  "step": 33
737
  },
738
  {
739
- "completion_length": 413.8675033569336,
740
  "completions/clipped_ratio": 0.0,
741
- "completions/max_length": 1067.0,
742
- "completions/max_terminated_length": 1067.0,
743
- "completions/mean_length": 414.86749267578125,
744
- "completions/mean_terminated_length": 414.86749267578125,
745
- "completions/min_length": 134.0,
746
- "completions/min_terminated_length": 134.0,
747
- "epoch": 0.1297709923664122,
748
- "frac_reward_zero_std": 0.29999998211860657,
749
- "grad_norm": 0.14397776126861572,
750
- "kl": 0.0,
751
- "learning_rate": 1e-05,
752
- "loss": -0.0,
753
- "num_tokens": 10360115.0,
754
- "reward": 1.0364062786102295,
755
- "reward_std": 0.690801203250885,
756
- "rewards/multidomain_reward_func/mean": 1.0364062786102295,
757
- "rewards/multidomain_reward_func/std": 1.1974419355392456,
758
  "step": 34
759
  },
760
  {
761
- "completion_length": 641.300015258789,
762
- "completions/clipped_ratio": 0.08749999850988388,
763
  "completions/max_length": 2048.0,
764
- "completions/max_terminated_length": 2008.0,
765
- "completions/mean_length": 642.2124633789062,
766
- "completions/mean_terminated_length": 507.4109802246094,
767
- "completions/min_length": 89.0,
768
- "completions/min_terminated_length": 89.0,
769
- "epoch": 0.13358778625954199,
770
- "frac_reward_zero_std": 0.3999999761581421,
771
- "grad_norm": 0.11470023542642593,
772
- "kl": 0.0,
773
- "learning_rate": 1e-05,
774
  "loss": 0.0,
775
- "num_tokens": 10754530.0,
776
- "reward": -0.2579292953014374,
777
- "reward_std": 0.8382142186164856,
778
- "rewards/multidomain_reward_func/mean": -0.2579292953014374,
779
- "rewards/multidomain_reward_func/std": 2.043348789215088,
780
  "step": 35
781
  },
782
  {
783
- "completion_length": 434.5075088500977,
784
- "completions/clipped_ratio": 0.0,
785
- "completions/max_length": 1103.0,
786
- "completions/max_terminated_length": 1103.0,
787
- "completions/mean_length": 435.5074768066406,
788
- "completions/mean_terminated_length": 435.5074768066406,
789
- "completions/min_length": 124.0,
790
- "completions/min_terminated_length": 124.0,
791
- "epoch": 0.13740458015267176,
792
- "frac_reward_zero_std": 0.3999999761581421,
793
- "grad_norm": 0.1501513123512268,
794
- "kl": 0.0,
795
- "learning_rate": 1e-05,
796
  "loss": 0.0,
797
- "num_tokens": 11054083.0,
798
- "reward": 1.0775103569030762,
799
- "reward_std": 0.6445066928863525,
800
- "rewards/multidomain_reward_func/mean": 1.0775104761123657,
801
- "rewards/multidomain_reward_func/std": 1.1158509254455566,
802
  "step": 36
803
- },
804
- {
805
- "completion_length": 427.5175033569336,
806
- "completions/clipped_ratio": 0.0,
807
- "completions/max_length": 1242.0,
808
- "completions/max_terminated_length": 1242.0,
809
- "completions/mean_length": 428.5174865722656,
810
- "completions/mean_terminated_length": 428.5174865722656,
811
- "completions/min_length": 116.0,
812
- "completions/min_terminated_length": 116.0,
813
- "epoch": 0.14122137404580154,
814
- "frac_reward_zero_std": 0.2750000059604645,
815
- "grad_norm": 0.1382102072238922,
816
- "kl": 0.0,
817
- "learning_rate": 1e-05,
818
- "loss": -0.0,
819
- "num_tokens": 11357730.0,
820
- "reward": 0.934586763381958,
821
- "reward_std": 0.6311339139938354,
822
- "rewards/multidomain_reward_func/mean": 0.934586763381958,
823
- "rewards/multidomain_reward_func/std": 1.2222440242767334,
824
- "step": 37
825
  }
826
  ],
827
  "logging_steps": 1,
828
- "max_steps": 262,
829
- "num_input_tokens_seen": 11657878,
830
  "num_train_epochs": 1,
831
  "save_steps": 250,
832
  "stateful_callbacks": {
@@ -842,7 +820,7 @@
842
  }
843
  },
844
  "total_flos": 0.0,
845
- "train_batch_size": 20,
846
  "trial_name": null,
847
  "trial_params": null
848
  }
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.3978494623655914,
6
  "eval_steps": 500,
7
+ "global_step": 37,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "completion_length": 417.31876068115236,
14
+ "completions/clipped_ratio": 0.0,
15
+ "completions/max_length": 1219.0,
16
+ "completions/max_terminated_length": 1219.0,
17
+ "completions/mean_length": 418.3187255859375,
18
+ "completions/mean_terminated_length": 418.3187255859375,
19
+ "completions/min_length": 155.0,
20
+ "completions/min_terminated_length": 155.0,
21
+ "epoch": 0.010752688172043012,
22
+ "frac_reward_zero_std": 0.2750000059604645,
23
+ "grad_norm": 0.1586368829011917,
24
  "kl": 0.0,
25
  "learning_rate": 0.0,
26
  "loss": 0.0,
27
+ "num_tokens": 589935.0,
28
+ "reward": 0.9395886063575745,
29
+ "reward_std": 0.6425187587738037,
30
+ "rewards/multidomain_reward_func/mean": 0.9395885467529297,
31
+ "rewards/multidomain_reward_func/std": 1.1834609508514404,
32
  "step": 1
33
  },
34
  {
35
+ "completion_length": 447.3875045776367,
36
  "completions/clipped_ratio": 0.0,
37
+ "completions/max_length": 1311.0,
38
+ "completions/max_terminated_length": 1311.0,
39
+ "completions/mean_length": 448.3874816894531,
40
+ "completions/mean_terminated_length": 448.3874816894531,
41
+ "completions/min_length": 144.0,
42
+ "completions/min_terminated_length": 144.0,
43
+ "epoch": 0.021505376344086023,
44
+ "frac_reward_zero_std": 0.2874999940395355,
45
+ "grad_norm": 0.09217020124197006,
46
+ "kl": 0.0,
47
+ "learning_rate": 9.999999999999999e-06,
48
  "loss": 0.0,
49
+ "num_tokens": 1202245.0,
50
+ "reward": 1.0898984670639038,
51
+ "reward_std": 0.6939569115638733,
52
+ "rewards/multidomain_reward_func/mean": 1.0898985862731934,
53
+ "rewards/multidomain_reward_func/std": 1.1529914140701294,
54
  "step": 2
55
  },
56
  {
57
+ "completion_length": 444.58375854492186,
58
  "completions/clipped_ratio": 0.0,
59
+ "completions/max_length": 1050.0,
60
+ "completions/max_terminated_length": 1050.0,
61
+ "completions/mean_length": 445.583740234375,
62
+ "completions/mean_terminated_length": 445.583740234375,
63
+ "completions/min_length": 144.0,
64
+ "completions/min_terminated_length": 144.0,
65
+ "epoch": 0.03225806451612903,
66
+ "frac_reward_zero_std": 0.2874999940395355,
67
+ "grad_norm": 0.09536850452423096,
68
+ "kl": 0.0,
69
+ "learning_rate": 1.9999999999999998e-05,
70
  "loss": 0.0,
71
+ "num_tokens": 1822762.0,
72
+ "reward": 1.2017430067062378,
73
+ "reward_std": 0.6153996586799622,
74
+ "rewards/multidomain_reward_func/mean": 1.2017431259155273,
75
+ "rewards/multidomain_reward_func/std": 1.1850411891937256,
76
  "step": 3
77
  },
78
  {
79
+ "completion_length": 440.8112548828125,
80
  "completions/clipped_ratio": 0.0,
81
+ "completions/max_length": 1121.0,
82
+ "completions/max_terminated_length": 1121.0,
83
+ "completions/mean_length": 441.8112487792969,
84
+ "completions/mean_terminated_length": 441.8112487792969,
85
+ "completions/min_length": 113.0,
86
+ "completions/min_terminated_length": 113.0,
87
+ "epoch": 0.043010752688172046,
88
+ "frac_reward_zero_std": 0.36249998211860657,
89
+ "grad_norm": 0.14304442703723907,
90
+ "kl": 0.0,
91
+ "learning_rate": 3e-05,
92
  "loss": -0.0,
93
+ "num_tokens": 2404141.0,
94
+ "reward": 0.9961501955986023,
95
+ "reward_std": 0.5547618865966797,
96
+ "rewards/multidomain_reward_func/mean": 0.9961501955986023,
97
+ "rewards/multidomain_reward_func/std": 1.2231403589248657,
98
  "step": 4
99
  },
100
  {
101
+ "completion_length": 513.616259765625,
102
  "completions/clipped_ratio": 0.0,
103
+ "completions/max_length": 1674.0,
104
+ "completions/max_terminated_length": 1674.0,
105
+ "completions/mean_length": 514.6162109375,
106
+ "completions/mean_terminated_length": 514.6162109375,
107
+ "completions/min_length": 153.0,
108
+ "completions/min_terminated_length": 153.0,
109
+ "epoch": 0.053763440860215055,
110
+ "frac_reward_zero_std": 0.21249999105930328,
111
+ "grad_norm": 0.0913516953587532,
112
+ "kl": 0.0,
113
+ "learning_rate": 3e-05,
114
+ "loss": 0.0,
115
+ "num_tokens": 3070424.0,
116
+ "reward": 0.9858412146568298,
117
+ "reward_std": 0.6862795948982239,
118
+ "rewards/multidomain_reward_func/mean": 0.9858411550521851,
119
+ "rewards/multidomain_reward_func/std": 1.2488752603530884,
120
  "step": 5
121
  },
122
  {
123
+ "completion_length": 501.64500885009767,
124
  "completions/clipped_ratio": 0.0,
125
+ "completions/max_length": 1360.0,
126
+ "completions/max_terminated_length": 1360.0,
127
+ "completions/mean_length": 502.6449890136719,
128
+ "completions/mean_terminated_length": 502.6449890136719,
129
+ "completions/min_length": 128.0,
130
+ "completions/min_terminated_length": 128.0,
131
+ "epoch": 0.06451612903225806,
132
+ "frac_reward_zero_std": 0.29999998211860657,
133
+ "grad_norm": 0.09110506623983383,
134
  "kl": 0.0,
135
+ "learning_rate": 3e-05,
136
  "loss": 0.0,
137
+ "num_tokens": 3715110.0,
138
+ "reward": 1.0306060314178467,
139
+ "reward_std": 0.6257905960083008,
140
+ "rewards/multidomain_reward_func/mean": 1.0306060314178467,
141
+ "rewards/multidomain_reward_func/std": 1.1789323091506958,
142
  "step": 6
143
  },
144
  {
145
+ "completion_length": 520.2025085449219,
146
  "completions/clipped_ratio": 0.0,
147
+ "completions/max_length": 1468.0,
148
+ "completions/max_terminated_length": 1468.0,
149
+ "completions/mean_length": 521.2025146484375,
150
+ "completions/mean_terminated_length": 521.2025146484375,
151
+ "completions/min_length": 191.0,
152
+ "completions/min_terminated_length": 191.0,
153
+ "epoch": 0.07526881720430108,
154
+ "frac_reward_zero_std": 0.26249998807907104,
155
+ "grad_norm": 0.08829142153263092,
156
+ "kl": 0.0,
157
+ "learning_rate": 3e-05,
158
+ "loss": -0.0,
159
+ "num_tokens": 4384052.0,
160
+ "reward": 1.1119601726531982,
161
+ "reward_std": 0.6622768044471741,
162
+ "rewards/multidomain_reward_func/mean": 1.1119602918624878,
163
+ "rewards/multidomain_reward_func/std": 1.1772183179855347,
164
  "step": 7
165
  },
166
  {
167
+ "completion_length": 528.4200134277344,
168
  "completions/clipped_ratio": 0.0,
169
+ "completions/max_length": 1742.0,
170
+ "completions/max_terminated_length": 1742.0,
171
+ "completions/mean_length": 529.4199829101562,
172
+ "completions/mean_terminated_length": 529.4199829101562,
173
+ "completions/min_length": 164.0,
174
+ "completions/min_terminated_length": 164.0,
175
+ "epoch": 0.08602150537634409,
176
+ "frac_reward_zero_std": 0.21249999105930328,
177
+ "grad_norm": 0.09405792504549026,
178
+ "kl": 0.0,
179
+ "learning_rate": 3e-05,
180
+ "loss": 0.0,
181
+ "num_tokens": 5063198.0,
182
+ "reward": 0.8146641254425049,
183
+ "reward_std": 1.0251129865646362,
184
+ "rewards/multidomain_reward_func/mean": 0.8146640658378601,
185
+ "rewards/multidomain_reward_func/std": 1.5042277574539185,
186
  "step": 8
187
  },
188
  {
189
+ "completion_length": 530.1662612915039,
190
  "completions/clipped_ratio": 0.0,
191
+ "completions/max_length": 1322.0,
192
+ "completions/max_terminated_length": 1322.0,
193
+ "completions/mean_length": 531.166259765625,
194
+ "completions/mean_terminated_length": 531.166259765625,
195
+ "completions/min_length": 182.0,
196
+ "completions/min_terminated_length": 182.0,
197
+ "epoch": 0.0967741935483871,
198
+ "frac_reward_zero_std": 0.3125,
199
+ "grad_norm": 0.08664832264184952,
200
+ "kl": 0.0,
201
+ "learning_rate": 3e-05,
202
  "loss": 0.0,
203
+ "num_tokens": 5745791.0,
204
+ "reward": 1.105246663093567,
205
+ "reward_std": 0.5355943441390991,
206
+ "rewards/multidomain_reward_func/mean": 1.1052465438842773,
207
+ "rewards/multidomain_reward_func/std": 1.1541340351104736,
208
  "step": 9
209
  },
210
  {
211
+ "completion_length": 529.9687576293945,
212
  "completions/clipped_ratio": 0.0,
213
+ "completions/max_length": 1527.0,
214
+ "completions/max_terminated_length": 1527.0,
215
+ "completions/mean_length": 530.96875,
216
+ "completions/mean_terminated_length": 530.96875,
217
+ "completions/min_length": 146.0,
218
+ "completions/min_terminated_length": 146.0,
219
+ "epoch": 0.10752688172043011,
220
+ "frac_reward_zero_std": 0.23749999701976776,
221
+ "grad_norm": 0.09411562234163284,
222
+ "kl": 0.0,
223
+ "learning_rate": 3e-05,
224
+ "loss": 0.0,
225
+ "num_tokens": 6448686.0,
226
+ "reward": 0.5127537846565247,
227
+ "reward_std": 1.1222161054611206,
228
+ "rewards/multidomain_reward_func/mean": 0.5127537846565247,
229
+ "rewards/multidomain_reward_func/std": 1.6059224605560303,
230
  "step": 10
231
  },
232
  {
233
+ "completion_length": 506.1475082397461,
234
  "completions/clipped_ratio": 0.0,
235
+ "completions/max_length": 1700.0,
236
+ "completions/max_terminated_length": 1700.0,
237
+ "completions/mean_length": 507.1474914550781,
238
+ "completions/mean_terminated_length": 507.1474914550781,
239
+ "completions/min_length": 101.0,
240
+ "completions/min_terminated_length": 101.0,
241
+ "epoch": 0.11827956989247312,
242
+ "frac_reward_zero_std": 0.3375000059604645,
243
+ "grad_norm": 0.09998169541358948,
244
+ "kl": 0.0,
245
+ "learning_rate": 3e-05,
246
+ "loss": -0.0,
247
+ "num_tokens": 7132404.0,
248
+ "reward": 0.799436628818512,
249
+ "reward_std": 0.71962970495224,
250
+ "rewards/multidomain_reward_func/mean": 0.799436628818512,
251
+ "rewards/multidomain_reward_func/std": 1.4251511096954346,
252
  "step": 11
253
  },
254
  {
255
+ "completion_length": 486.396257019043,
256
+ "completions/clipped_ratio": 0.0012499999720603228,
257
+ "completions/max_length": 2048.0,
258
+ "completions/max_terminated_length": 1110.0,
259
+ "completions/mean_length": 487.3949890136719,
260
+ "completions/mean_terminated_length": 485.4418029785156,
261
+ "completions/min_length": 165.0,
262
+ "completions/min_terminated_length": 165.0,
263
+ "epoch": 0.12903225806451613,
264
  "frac_reward_zero_std": 0.29999998211860657,
265
+ "grad_norm": 0.09192386269569397,
266
  "kl": 0.0,
267
+ "learning_rate": 3e-05,
268
  "loss": 0.0,
269
+ "num_tokens": 7759800.0,
270
+ "reward": 1.1488876342773438,
271
+ "reward_std": 0.5708559155464172,
272
+ "rewards/multidomain_reward_func/mean": 1.1488877534866333,
273
+ "rewards/multidomain_reward_func/std": 1.160998821258545,
274
  "step": 12
275
  },
276
  {
277
+ "completion_length": 476.3787567138672,
278
+ "completions/clipped_ratio": 0.0,
279
+ "completions/max_length": 1302.0,
280
+ "completions/max_terminated_length": 1302.0,
281
+ "completions/mean_length": 477.3787536621094,
282
+ "completions/mean_terminated_length": 477.3787536621094,
283
+ "completions/min_length": 121.0,
284
+ "completions/min_terminated_length": 121.0,
285
+ "epoch": 0.13978494623655913,
286
+ "frac_reward_zero_std": 0.4124999940395355,
287
+ "grad_norm": 0.09517515450716019,
288
+ "kl": 0.0,
289
+ "learning_rate": 3e-05,
290
+ "loss": 0.0,
291
+ "num_tokens": 8412593.0,
292
+ "reward": 1.0822017192840576,
293
+ "reward_std": 0.47241905331611633,
294
+ "rewards/multidomain_reward_func/mean": 1.082201600074768,
295
+ "rewards/multidomain_reward_func/std": 1.0703767538070679,
296
  "step": 13
297
  },
298
  {
299
+ "completion_length": 461.2287567138672,
300
  "completions/clipped_ratio": 0.0,
301
+ "completions/max_length": 1070.0,
302
+ "completions/max_terminated_length": 1070.0,
303
+ "completions/mean_length": 462.2287292480469,
304
+ "completions/mean_terminated_length": 462.2287292480469,
305
+ "completions/min_length": 132.0,
306
+ "completions/min_terminated_length": 132.0,
307
+ "epoch": 0.15053763440860216,
308
+ "frac_reward_zero_std": 0.4124999940395355,
309
+ "grad_norm": 0.08922425657510757,
310
  "kl": 0.0,
311
+ "learning_rate": 3e-05,
312
+ "loss": -0.0,
313
+ "num_tokens": 9027756.0,
314
+ "reward": 1.1727960109710693,
315
+ "reward_std": 0.5148429274559021,
316
+ "rewards/multidomain_reward_func/mean": 1.1727960109710693,
317
+ "rewards/multidomain_reward_func/std": 1.1102207899093628,
318
  "step": 14
319
  },
320
  {
321
+ "completion_length": 447.0050079345703,
322
+ "completions/clipped_ratio": 0.0,
323
+ "completions/max_length": 1175.0,
324
+ "completions/max_terminated_length": 1175.0,
325
+ "completions/mean_length": 448.0050048828125,
326
+ "completions/mean_terminated_length": 448.0050048828125,
327
+ "completions/min_length": 125.0,
328
+ "completions/min_terminated_length": 125.0,
329
+ "epoch": 0.16129032258064516,
330
+ "frac_reward_zero_std": 0.25,
331
+ "grad_norm": 0.09966976195573807,
332
  "kl": 0.0,
333
+ "learning_rate": 3e-05,
334
  "loss": -0.0,
335
+ "num_tokens": 9647350.0,
336
+ "reward": 1.1358871459960938,
337
+ "reward_std": 0.6162644028663635,
338
+ "rewards/multidomain_reward_func/mean": 1.1358871459960938,
339
+ "rewards/multidomain_reward_func/std": 1.0770015716552734,
340
  "step": 15
341
  },
342
  {
343
+ "completion_length": 453.9125045776367,
344
  "completions/clipped_ratio": 0.0,
345
+ "completions/max_length": 1108.0,
346
+ "completions/max_terminated_length": 1108.0,
347
+ "completions/mean_length": 454.9124755859375,
348
+ "completions/mean_terminated_length": 454.9124755859375,
349
+ "completions/min_length": 118.0,
350
+ "completions/min_terminated_length": 118.0,
351
+ "epoch": 0.17204301075268819,
352
+ "frac_reward_zero_std": 0.26249998807907104,
353
+ "grad_norm": 0.09356285631656647,
354
+ "kl": 0.0,
355
+ "learning_rate": 3e-05,
356
+ "loss": 0.0,
357
+ "num_tokens": 10275890.0,
358
+ "reward": 1.1041463613510132,
359
+ "reward_std": 0.595342755317688,
360
+ "rewards/multidomain_reward_func/mean": 1.1041463613510132,
361
+ "rewards/multidomain_reward_func/std": 1.1405205726623535,
362
  "step": 16
363
  },
364
  {
365
+ "completion_length": 448.9925048828125,
366
  "completions/clipped_ratio": 0.0,
367
+ "completions/max_length": 1191.0,
368
+ "completions/max_terminated_length": 1191.0,
369
+ "completions/mean_length": 449.99249267578125,
370
+ "completions/mean_terminated_length": 449.99249267578125,
371
+ "completions/min_length": 142.0,
372
+ "completions/min_terminated_length": 142.0,
373
+ "epoch": 0.1827956989247312,
374
  "frac_reward_zero_std": 0.22499999403953552,
375
+ "grad_norm": 0.10382834821939468,
376
  "kl": 0.0,
377
+ "learning_rate": 3e-05,
378
+ "loss": 0.0,
379
+ "num_tokens": 10890984.0,
380
+ "reward": 1.1085461378097534,
381
+ "reward_std": 0.6585391163825989,
382
+ "rewards/multidomain_reward_func/mean": 1.1085461378097534,
383
+ "rewards/multidomain_reward_func/std": 1.2252981662750244,
384
  "step": 17
385
  },
386
  {
387
+ "completion_length": 451.19625701904295,
388
+ "completions/clipped_ratio": 0.0037499999161809683,
389
+ "completions/max_length": 2048.0,
390
+ "completions/max_terminated_length": 1073.0,
391
+ "completions/mean_length": 452.1925048828125,
392
+ "completions/mean_terminated_length": 446.1856994628906,
393
+ "completions/min_length": 106.0,
394
+ "completions/min_terminated_length": 106.0,
395
+ "epoch": 0.1935483870967742,
396
+ "frac_reward_zero_std": 0.32499998807907104,
397
+ "grad_norm": 0.10299306362867355,
398
  "kl": 0.0,
399
+ "learning_rate": 3e-05,
400
+ "loss": -0.0,
401
+ "num_tokens": 11513098.0,
402
+ "reward": 0.9830173850059509,
403
+ "reward_std": 0.5063682198524475,
404
+ "rewards/multidomain_reward_func/mean": 0.9830173254013062,
405
+ "rewards/multidomain_reward_func/std": 1.2036722898483276,
406
  "step": 18
407
  },
408
  {
409
+ "completion_length": 474.92625732421874,
410
+ "completions/clipped_ratio": 0.0012499999720603228,
411
+ "completions/max_length": 2048.0,
412
+ "completions/max_terminated_length": 1312.0,
413
+ "completions/mean_length": 475.92498779296875,
414
+ "completions/mean_terminated_length": 473.95745849609375,
415
+ "completions/min_length": 129.0,
416
+ "completions/min_terminated_length": 129.0,
417
+ "epoch": 0.20430107526881722,
418
  "frac_reward_zero_std": 0.29999998211860657,
419
+ "grad_norm": 0.09433634579181671,
420
+ "kl": 0.0,
421
+ "learning_rate": 3e-05,
422
+ "loss": -0.0,
423
+ "num_tokens": 12184568.0,
424
+ "reward": 1.0752404928207397,
425
+ "reward_std": 0.5778380632400513,
426
+ "rewards/multidomain_reward_func/mean": 1.0752404928207397,
427
+ "rewards/multidomain_reward_func/std": 1.1373811960220337,
428
  "step": 19
429
  },
430
  {
431
+ "completion_length": 452.972509765625,
432
+ "completions/clipped_ratio": 0.0012499999720603228,
433
+ "completions/max_length": 2048.0,
434
+ "completions/max_terminated_length": 1264.0,
435
+ "completions/mean_length": 453.97125244140625,
436
+ "completions/mean_terminated_length": 451.9762268066406,
437
+ "completions/min_length": 102.0,
438
+ "completions/min_terminated_length": 102.0,
439
+ "epoch": 0.21505376344086022,
440
+ "frac_reward_zero_std": 0.23749999701976776,
441
+ "grad_norm": 0.09762763977050781,
442
+ "kl": 0.0,
443
+ "learning_rate": 3e-05,
444
  "loss": -0.0,
445
+ "num_tokens": 12820175.0,
446
+ "reward": 0.7925729155540466,
447
+ "reward_std": 0.8282187581062317,
448
+ "rewards/multidomain_reward_func/mean": 0.7925729155540466,
449
+ "rewards/multidomain_reward_func/std": 1.3876315355300903,
450
  "step": 20
451
  },
452
  {
453
+ "completion_length": 430.3100082397461,
454
  "completions/clipped_ratio": 0.0,
455
+ "completions/max_length": 974.0,
456
+ "completions/max_terminated_length": 974.0,
457
+ "completions/mean_length": 431.30999755859375,
458
+ "completions/mean_terminated_length": 431.30999755859375,
459
+ "completions/min_length": 136.0,
460
+ "completions/min_terminated_length": 136.0,
461
+ "epoch": 0.22580645161290322,
462
+ "frac_reward_zero_std": 0.32499998807907104,
463
+ "grad_norm": 0.12044712156057358,
464
  "kl": 0.0,
465
+ "learning_rate": 3e-05,
466
  "loss": 0.0,
467
+ "num_tokens": 13426793.0,
468
+ "reward": 1.1226346492767334,
469
+ "reward_std": 0.514378011226654,
470
+ "rewards/multidomain_reward_func/mean": 1.122634768486023,
471
+ "rewards/multidomain_reward_func/std": 1.1829208135604858,
472
  "step": 21
473
  },
474
  {
475
+ "completion_length": 412.8612548828125,
476
+ "completions/clipped_ratio": 0.0,
477
+ "completions/max_length": 1069.0,
478
+ "completions/max_terminated_length": 1069.0,
479
+ "completions/mean_length": 413.8612365722656,
480
+ "completions/mean_terminated_length": 413.8612365722656,
481
+ "completions/min_length": 117.0,
482
+ "completions/min_terminated_length": 117.0,
483
+ "epoch": 0.23655913978494625,
484
+ "frac_reward_zero_std": 0.21249999105930328,
485
+ "grad_norm": 0.10490359365940094,
486
+ "kl": 0.0,
487
+ "learning_rate": 3e-05,
488
+ "loss": 0.0,
489
+ "num_tokens": 14010292.0,
490
+ "reward": 1.0972095727920532,
491
+ "reward_std": 0.6835877895355225,
492
+ "rewards/multidomain_reward_func/mean": 1.0972095727920532,
493
+ "rewards/multidomain_reward_func/std": 1.1799525022506714,
494
  "step": 22
495
  },
496
  {
497
+ "completion_length": 398.9125061035156,
498
+ "completions/clipped_ratio": 0.0,
499
+ "completions/max_length": 1085.0,
500
+ "completions/max_terminated_length": 1085.0,
501
+ "completions/mean_length": 399.9125061035156,
502
+ "completions/mean_terminated_length": 399.9125061035156,
503
+ "completions/min_length": 118.0,
504
+ "completions/min_terminated_length": 118.0,
505
+ "epoch": 0.24731182795698925,
506
+ "frac_reward_zero_std": 0.29999998211860657,
507
+ "grad_norm": 0.10093961656093597,
508
  "kl": 0.0,
509
+ "learning_rate": 3e-05,
510
+ "loss": 0.0,
511
+ "num_tokens": 14575372.0,
512
+ "reward": 1.0390172004699707,
513
+ "reward_std": 0.6097813248634338,
514
+ "rewards/multidomain_reward_func/mean": 1.0390172004699707,
515
+ "rewards/multidomain_reward_func/std": 1.273277759552002,
516
  "step": 23
517
  },
518
  {
519
+ "completion_length": 436.89375457763674,
520
+ "completions/clipped_ratio": 0.0012499999720603228,
521
+ "completions/max_length": 2048.0,
522
+ "completions/max_terminated_length": 1535.0,
523
+ "completions/mean_length": 437.8924865722656,
524
+ "completions/mean_terminated_length": 435.8773498535156,
525
+ "completions/min_length": 55.0,
526
+ "completions/min_terminated_length": 55.0,
527
+ "epoch": 0.25806451612903225,
528
+ "frac_reward_zero_std": 0.3125,
529
+ "grad_norm": 0.10864879190921783,
530
+ "kl": 0.0,
531
+ "learning_rate": 3e-05,
532
  "loss": 0.0,
533
+ "num_tokens": 15207466.0,
534
+ "reward": 0.8946269154548645,
535
+ "reward_std": 0.6158351898193359,
536
+ "rewards/multidomain_reward_func/mean": 0.8946268558502197,
537
+ "rewards/multidomain_reward_func/std": 1.3325064182281494,
538
  "step": 24
539
  },
540
  {
541
+ "completion_length": 429.6425079345703,
542
  "completions/clipped_ratio": 0.0,
543
+ "completions/max_length": 1189.0,
544
+ "completions/max_terminated_length": 1189.0,
545
+ "completions/mean_length": 430.6424865722656,
546
+ "completions/mean_terminated_length": 430.6424865722656,
547
+ "completions/min_length": 123.0,
548
+ "completions/min_terminated_length": 123.0,
549
+ "epoch": 0.26881720430107525,
550
+ "frac_reward_zero_std": 0.22499999403953552,
551
+ "grad_norm": 0.09544331580400467,
552
  "kl": 0.0,
553
+ "learning_rate": 3e-05,
554
  "loss": 0.0,
555
+ "num_tokens": 15806970.0,
556
+ "reward": 1.0628045797348022,
557
+ "reward_std": 0.6665589213371277,
558
+ "rewards/multidomain_reward_func/mean": 1.0628045797348022,
559
+ "rewards/multidomain_reward_func/std": 1.2315187454223633,
560
  "step": 25
561
  },
562
  {
563
+ "completion_length": 431.210009765625,
564
+ "completions/clipped_ratio": 0.0,
565
+ "completions/max_length": 1029.0,
566
+ "completions/max_terminated_length": 1029.0,
567
+ "completions/mean_length": 432.2099914550781,
568
+ "completions/mean_terminated_length": 432.2099914550781,
569
+ "completions/min_length": 115.0,
570
+ "completions/min_terminated_length": 115.0,
571
+ "epoch": 0.27956989247311825,
572
+ "frac_reward_zero_std": 0.2874999940395355,
573
+ "grad_norm": 0.17953357100486755,
574
+ "kl": 0.0,
575
+ "learning_rate": 3e-05,
576
  "loss": 0.0,
577
+ "num_tokens": 16390478.0,
578
+ "reward": 1.0917786359786987,
579
+ "reward_std": 0.5078949332237244,
580
+ "rewards/multidomain_reward_func/mean": 1.0917787551879883,
581
+ "rewards/multidomain_reward_func/std": 1.2241995334625244,
582
  "step": 26
583
  },
584
  {
585
+ "completion_length": 469.45375061035156,
586
+ "completions/clipped_ratio": 0.0037499999161809683,
587
+ "completions/max_length": 2048.0,
588
+ "completions/max_terminated_length": 1498.0,
589
+ "completions/mean_length": 470.4499816894531,
590
+ "completions/mean_terminated_length": 464.51190185546875,
591
+ "completions/min_length": 105.0,
592
+ "completions/min_terminated_length": 105.0,
593
+ "epoch": 0.2903225806451613,
594
+ "frac_reward_zero_std": 0.29999998211860657,
595
+ "grad_norm": 0.15617702901363373,
596
  "kl": 0.0,
597
+ "learning_rate": 3e-05,
598
  "loss": 0.0,
599
+ "num_tokens": 17049938.0,
600
+ "reward": 0.8478901386260986,
601
+ "reward_std": 0.5746238827705383,
602
+ "rewards/multidomain_reward_func/mean": 0.8478901386260986,
603
+ "rewards/multidomain_reward_func/std": 1.3466750383377075,
604
  "step": 27
605
  },
606
  {
607
+ "completion_length": 469.49500427246096,
608
+ "completions/clipped_ratio": 0.0012499999720603228,
609
  "completions/max_length": 2048.0,
610
+ "completions/max_terminated_length": 1457.0,
611
+ "completions/mean_length": 470.4937438964844,
612
+ "completions/mean_terminated_length": 468.5194091796875,
613
+ "completions/min_length": 120.0,
614
+ "completions/min_terminated_length": 120.0,
615
+ "epoch": 0.3010752688172043,
616
+ "frac_reward_zero_std": 0.22499999403953552,
617
+ "grad_norm": 0.11228151619434357,
618
  "kl": 0.0,
619
+ "learning_rate": 3e-05,
620
  "loss": 0.0,
621
+ "num_tokens": 17697093.0,
622
+ "reward": 1.025890588760376,
623
+ "reward_std": 0.6576307415962219,
624
+ "rewards/multidomain_reward_func/mean": 1.0258907079696655,
625
+ "rewards/multidomain_reward_func/std": 1.2718981504440308,
626
  "step": 28
627
  },
628
  {
629
+ "completion_length": 454.85375823974607,
630
+ "completions/clipped_ratio": 0.0,
631
+ "completions/max_length": 1260.0,
632
+ "completions/max_terminated_length": 1260.0,
633
+ "completions/mean_length": 455.8537292480469,
634
+ "completions/mean_terminated_length": 455.8537292480469,
635
+ "completions/min_length": 118.0,
636
+ "completions/min_terminated_length": 118.0,
637
+ "epoch": 0.3118279569892473,
638
+ "frac_reward_zero_std": 0.23749999701976776,
639
+ "grad_norm": 0.08834511786699295,
640
+ "kl": 0.0,
641
+ "learning_rate": 3e-05,
642
  "loss": 0.0,
643
+ "num_tokens": 18325316.0,
644
+ "reward": 1.1648790836334229,
645
+ "reward_std": 0.6541204452514648,
646
+ "rewards/multidomain_reward_func/mean": 1.1648792028427124,
647
+ "rewards/multidomain_reward_func/std": 1.1322048902511597,
648
  "step": 29
649
  },
650
  {
651
+ "completion_length": 437.74625701904296,
652
  "completions/clipped_ratio": 0.0,
653
+ "completions/max_length": 1771.0,
654
+ "completions/max_terminated_length": 1771.0,
655
+ "completions/mean_length": 438.7462463378906,
656
+ "completions/mean_terminated_length": 438.7462463378906,
657
+ "completions/min_length": 113.0,
658
+ "completions/min_terminated_length": 113.0,
659
+ "epoch": 0.3225806451612903,
660
+ "frac_reward_zero_std": 0.23749999701976776,
661
+ "grad_norm": 0.10401499271392822,
662
  "kl": 0.0,
663
+ "learning_rate": 3e-05,
664
  "loss": 0.0,
665
+ "num_tokens": 18941413.0,
666
+ "reward": 1.0463263988494873,
667
+ "reward_std": 0.6855629682540894,
668
+ "rewards/multidomain_reward_func/mean": 1.0463263988494873,
669
+ "rewards/multidomain_reward_func/std": 1.205276608467102,
670
  "step": 30
671
  },
672
  {
673
+ "completion_length": 447.941259765625,
674
  "completions/clipped_ratio": 0.0,
675
+ "completions/max_length": 2021.0,
676
+ "completions/max_terminated_length": 2021.0,
677
+ "completions/mean_length": 448.9412536621094,
678
+ "completions/mean_terminated_length": 448.9412536621094,
679
+ "completions/min_length": 120.0,
680
+ "completions/min_terminated_length": 120.0,
681
+ "epoch": 0.3333333333333333,
682
+ "frac_reward_zero_std": 0.25,
683
+ "grad_norm": 0.09366130828857422,
684
  "kl": 0.0,
685
+ "learning_rate": 3e-05,
686
  "loss": -0.0,
687
+ "num_tokens": 19542566.0,
688
+ "reward": 1.0071501731872559,
689
+ "reward_std": 0.676040768623352,
690
+ "rewards/multidomain_reward_func/mean": 1.0071501731872559,
691
+ "rewards/multidomain_reward_func/std": 1.1646482944488525,
692
  "step": 31
693
  },
694
  {
695
+ "completion_length": 470.27625427246096,
696
+ "completions/clipped_ratio": 0.0037499999161809683,
697
  "completions/max_length": 2048.0,
698
+ "completions/max_terminated_length": 1493.0,
699
+ "completions/mean_length": 471.2724914550781,
700
+ "completions/mean_terminated_length": 465.3375244140625,
701
+ "completions/min_length": 123.0,
702
+ "completions/min_terminated_length": 123.0,
703
+ "epoch": 0.34408602150537637,
704
+ "frac_reward_zero_std": 0.3125,
705
+ "grad_norm": 0.09432131797075272,
706
  "kl": 0.0,
707
+ "learning_rate": 3e-05,
708
  "loss": -0.0,
709
+ "num_tokens": 20172294.0,
710
+ "reward": 0.02558271773159504,
711
+ "reward_std": 1.2057312726974487,
712
+ "rewards/multidomain_reward_func/mean": 0.02558271400630474,
713
+ "rewards/multidomain_reward_func/std": 1.7726387977600098,
714
  "step": 32
715
  },
716
  {
717
+ "completion_length": 419.5925033569336,
718
+ "completions/clipped_ratio": 0.0012499999720603228,
719
+ "completions/max_length": 2048.0,
720
+ "completions/max_terminated_length": 1442.0,
721
+ "completions/mean_length": 420.59124755859375,
722
+ "completions/mean_terminated_length": 418.554443359375,
723
+ "completions/min_length": 111.0,
724
+ "completions/min_terminated_length": 111.0,
725
+ "epoch": 0.3548387096774194,
726
+ "frac_reward_zero_std": 0.3499999940395355,
727
+ "grad_norm": 0.09137614816427231,
728
  "kl": 0.0,
729
+ "learning_rate": 3e-05,
730
  "loss": 0.0,
731
+ "num_tokens": 20744097.0,
732
+ "reward": 1.1542513370513916,
733
+ "reward_std": 0.5509113073348999,
734
+ "rewards/multidomain_reward_func/mean": 1.1542514562606812,
735
+ "rewards/multidomain_reward_func/std": 0.9433695077896118,
736
  "step": 33
737
  },
738
  {
739
+ "completion_length": 434.2000076293945,
740
  "completions/clipped_ratio": 0.0,
741
+ "completions/max_length": 1262.0,
742
+ "completions/max_terminated_length": 1262.0,
743
+ "completions/mean_length": 435.1999816894531,
744
+ "completions/mean_terminated_length": 435.1999816894531,
745
+ "completions/min_length": 120.0,
746
+ "completions/min_terminated_length": 120.0,
747
+ "epoch": 0.3655913978494624,
748
+ "frac_reward_zero_std": 0.2874999940395355,
749
+ "grad_norm": 0.08975847065448761,
750
+ "kl": 0.0,
751
+ "learning_rate": 3e-05,
752
+ "loss": 0.0,
753
+ "num_tokens": 21349127.0,
754
+ "reward": 1.0992751121520996,
755
+ "reward_std": 0.532092809677124,
756
+ "rewards/multidomain_reward_func/mean": 1.0992752313613892,
757
+ "rewards/multidomain_reward_func/std": 1.0367480516433716,
758
  "step": 34
759
  },
760
  {
761
+ "completion_length": 482.0337600708008,
762
+ "completions/clipped_ratio": 0.0012499999720603228,
763
  "completions/max_length": 2048.0,
764
+ "completions/max_terminated_length": 2013.0,
765
+ "completions/mean_length": 483.0325012207031,
766
+ "completions/mean_terminated_length": 481.0738525390625,
767
+ "completions/min_length": 118.0,
768
+ "completions/min_terminated_length": 118.0,
769
+ "epoch": 0.3763440860215054,
770
+ "frac_reward_zero_std": 0.25,
771
+ "grad_norm": 0.09360182285308838,
772
+ "kl": 0.0,
773
+ "learning_rate": 3e-05,
774
  "loss": 0.0,
775
+ "num_tokens": 22033873.0,
776
+ "reward": 1.0292495489120483,
777
+ "reward_std": 0.6627224087715149,
778
+ "rewards/multidomain_reward_func/mean": 1.0292495489120483,
779
+ "rewards/multidomain_reward_func/std": 1.2340757846832275,
780
  "step": 35
781
  },
782
  {
783
+ "completion_length": 1182.0312561035157,
784
+ "completions/clipped_ratio": 0.4074999988079071,
785
+ "completions/max_length": 2048.0,
786
+ "completions/max_terminated_length": 1497.0,
787
+ "completions/mean_length": 1182.623779296875,
788
+ "completions/mean_terminated_length": 587.4493408203125,
789
+ "completions/min_length": 6.0,
790
+ "completions/min_terminated_length": 6.0,
791
+ "epoch": 0.3870967741935484,
792
+ "frac_reward_zero_std": 0.675000011920929,
793
+ "grad_norm": 0.05318624526262283,
794
+ "kl": 0.0,
795
+ "learning_rate": 3e-05,
796
  "loss": 0.0,
797
+ "num_tokens": 23279772.0,
798
+ "reward": -0.7171798348426819,
799
+ "reward_std": 0.26930734515190125,
800
+ "rewards/multidomain_reward_func/mean": -0.7171798944473267,
801
+ "rewards/multidomain_reward_func/std": 2.1405746936798096,
802
  "step": 36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
803
  }
804
  ],
805
  "logging_steps": 1,
806
+ "max_steps": 93,
807
+ "num_input_tokens_seen": 23871076,
808
  "num_train_epochs": 1,
809
  "save_steps": 250,
810
  "stateful_callbacks": {
 
820
  }
821
  },
822
  "total_flos": 0.0,
823
+ "train_batch_size": 40,
824
  "trial_name": null,
825
  "trial_params": null
826
  }
ckpt-40-percent/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c8b716515b4f5eb2059f89ec3fda03b45958ba6ea937a4d3de40351b00d5d1f3
3
  size 7505
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b48077003af5f11ced05cb103bb98c595f8752173ade63e67e1251ccaecb2486
3
  size 7505