bimabk commited on
Commit
b827c8e
·
verified ·
1 Parent(s): 0a2c360

Upload task output 1

Browse files
adapter_config.json CHANGED
@@ -29,13 +29,13 @@
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
- "q_proj",
33
- "gate_proj",
34
  "o_proj",
 
 
35
  "down_proj",
36
  "k_proj",
37
- "up_proj",
38
- "v_proj"
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
 
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
+ "up_proj",
 
33
  "o_proj",
34
+ "v_proj",
35
+ "gate_proj",
36
  "down_proj",
37
  "k_proj",
38
+ "q_proj"
 
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c9bc32fac7f25bcede80af9be64db4efa53407d9dbd6b511856089e76298e8f7
3
  size 957942768
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51dcc60062ae1b1cf80ce9f9cf0b07ba4a42abdffd827bf4c19f5b4c6152a313
3
  size 957942768
loss.txt CHANGED
@@ -1 +1 @@
1
- 152,-0.06000000238418579
 
1
+ 156,-1.1483332872390748
trainer_state.json CHANGED
@@ -2,507 +2,507 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.02432,
6
  "eval_steps": 500,
7
- "global_step": 152,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "clip_ratio/high_max": 0.0,
14
- "clip_ratio/high_mean": 0.0,
15
- "clip_ratio/low_mean": 0.0,
16
- "clip_ratio/low_min": 0.0,
17
- "clip_ratio/region_mean": 0.0,
18
  "completions/clipped_ratio": 0.0,
19
- "completions/max_length": 374.4,
20
- "completions/max_terminated_length": 374.4,
21
- "completions/mean_length": 293.55,
22
- "completions/mean_terminated_length": 293.55,
23
- "completions/min_length": 181.8,
24
- "completions/min_terminated_length": 181.8,
25
- "entropy": 0.7695603430271148,
26
  "epoch": 0.0008,
27
- "frac_reward_zero_std": 0.675,
28
- "grad_norm": 0.1396484375,
29
- "kl": 0.007634526048786938,
30
  "learning_rate": 1.137216e-06,
31
- "loss": 9.18293430004269e-05,
32
- "num_tokens": 135798.0,
33
- "reward": 0.051062504202127455,
34
- "reward_std": 0.07574881687760353,
35
- "rewards/env_goofspiel_reward/mean": 0.051062504202127455,
36
- "rewards/env_goofspiel_reward/std": 0.195309117436409,
37
- "sampling/importance_sampling_ratio/max": 1.719690752029419,
38
- "sampling/importance_sampling_ratio/mean": 1.0014273881912232,
39
- "sampling/importance_sampling_ratio/min": 0.48566103279590606,
40
- "sampling/sampling_logp_difference/max": 0.7150738000869751,
41
- "sampling/sampling_logp_difference/mean": 0.0632629081606865,
42
  "step": 5,
43
- "step_time": 4.057331793400226
44
  },
45
  {
46
- "clip_ratio/high_max": 0.0,
47
- "clip_ratio/high_mean": 0.0,
48
- "clip_ratio/low_mean": 0.0,
49
  "clip_ratio/low_min": 0.0,
50
- "clip_ratio/region_mean": 0.0,
51
  "completions/clipped_ratio": 0.0,
52
  "completions/max_length": 374.0,
53
  "completions/max_terminated_length": 374.0,
54
- "completions/mean_length": 290.5375,
55
- "completions/mean_terminated_length": 290.5375,
56
- "completions/min_length": 193.0,
57
- "completions/min_terminated_length": 193.0,
58
- "entropy": 0.6640829563140869,
59
  "epoch": 0.0016,
60
- "frac_reward_zero_std": 0.7375,
61
- "grad_norm": 0.296875,
62
- "kl": 0.009508041350636631,
63
  "learning_rate": 2.5587359999999995e-06,
64
- "loss": -5.3186528384685515e-05,
65
- "num_tokens": 271318.0,
66
- "reward": 0.05906250327825546,
67
- "reward_std": 0.06461188569664955,
68
- "rewards/env_goofspiel_reward/mean": 0.05906250327825546,
69
- "rewards/env_goofspiel_reward/std": 0.16751175224781037,
70
- "sampling/importance_sampling_ratio/max": 2.0673105001449583,
71
- "sampling/importance_sampling_ratio/mean": 1.0338432788848877,
72
- "sampling/importance_sampling_ratio/min": 0.6319645524024964,
73
- "sampling/sampling_logp_difference/max": 0.6566181659698487,
74
- "sampling/sampling_logp_difference/mean": 0.05573496893048287,
75
  "step": 10,
76
- "step_time": 3.559387542600052
77
  },
78
  {
79
- "clip_ratio/high_max": 0.0,
80
- "clip_ratio/high_mean": 0.0,
81
- "clip_ratio/low_mean": 0.0,
82
- "clip_ratio/low_min": 0.0,
83
- "clip_ratio/region_mean": 0.0,
84
  "completions/clipped_ratio": 0.0,
85
- "completions/max_length": 378.8,
86
- "completions/max_terminated_length": 378.8,
87
- "completions/mean_length": 285.15,
88
- "completions/mean_terminated_length": 285.15,
89
- "completions/min_length": 212.0,
90
- "completions/min_terminated_length": 212.0,
91
- "entropy": 0.5863359421491623,
92
  "epoch": 0.0024,
93
- "frac_reward_zero_std": 0.8,
94
- "grad_norm": 0.1650390625,
95
- "kl": 0.05467459289357066,
96
  "learning_rate": 3.9802559999999995e-06,
97
- "loss": 0.00011974496301263571,
98
- "num_tokens": 403708.0,
99
- "reward": 0.037000001792330296,
100
- "reward_std": 0.0535633388790302,
101
- "rewards/env_goofspiel_reward/mean": 0.037000001792330296,
102
- "rewards/env_goofspiel_reward/std": 0.11216981350444258,
103
- "sampling/importance_sampling_ratio/max": 1.5559733867645265,
104
- "sampling/importance_sampling_ratio/mean": 0.9780893206596375,
105
- "sampling/importance_sampling_ratio/min": 0.604676628112793,
106
- "sampling/sampling_logp_difference/max": 0.4918365955352783,
107
- "sampling/sampling_logp_difference/mean": 0.04973898231983185,
108
  "step": 15,
109
- "step_time": 3.5593893944003865
110
  },
111
  {
112
- "clip_ratio/high_max": 0.0,
113
- "clip_ratio/high_mean": 0.0,
114
- "clip_ratio/low_mean": 0.0,
115
- "clip_ratio/low_min": 0.0,
116
- "clip_ratio/region_mean": 0.0,
117
  "completions/clipped_ratio": 0.0,
118
- "completions/max_length": 374.0,
119
- "completions/max_terminated_length": 374.0,
120
- "completions/mean_length": 281.6375,
121
- "completions/mean_terminated_length": 281.6375,
122
  "completions/min_length": 212.0,
123
  "completions/min_terminated_length": 212.0,
124
- "entropy": 0.4126308411359787,
125
  "epoch": 0.0032,
126
- "frac_reward_zero_std": 0.7375,
127
- "grad_norm": 0.31640625,
128
- "kl": 0.139003873616457,
129
  "learning_rate": 5.401775999999999e-06,
130
- "loss": -0.00015898187411949037,
131
- "num_tokens": 536243.0,
132
- "reward": 0.06343750283122063,
133
- "reward_std": 0.0905980572104454,
134
- "rewards/env_goofspiel_reward/mean": 0.06343750283122063,
135
- "rewards/env_goofspiel_reward/std": 0.19052477180957794,
136
- "sampling/importance_sampling_ratio/max": 1.6631618976593017,
137
- "sampling/importance_sampling_ratio/mean": 1.0170260667800903,
138
- "sampling/importance_sampling_ratio/min": 0.643394160270691,
139
- "sampling/sampling_logp_difference/max": 0.5536829710006714,
140
- "sampling/sampling_logp_difference/mean": 0.03567908257246018,
141
  "step": 20,
142
- "step_time": 3.513668759400389
143
  },
144
  {
145
- "clip_ratio/high_max": 0.0,
146
- "clip_ratio/high_mean": 0.0,
147
- "clip_ratio/low_mean": 0.0,
148
- "clip_ratio/low_min": 0.0,
149
- "clip_ratio/region_mean": 0.0,
150
  "completions/clipped_ratio": 0.0,
151
- "completions/max_length": 374.0,
152
- "completions/max_terminated_length": 374.0,
153
- "completions/mean_length": 300.975,
154
- "completions/mean_terminated_length": 300.975,
155
- "completions/min_length": 219.0,
156
- "completions/min_terminated_length": 219.0,
157
- "entropy": 0.5057466819882392,
158
  "epoch": 0.004,
159
- "frac_reward_zero_std": 0.8625,
160
- "grad_norm": 0.201171875,
161
- "kl": 0.14662861209362746,
162
  "learning_rate": 6.8232959999999994e-06,
163
- "loss": 0.00011511286720633507,
164
- "num_tokens": 674802.0,
165
- "reward": 0.056125002296175806,
166
- "reward_std": 0.05851308616111055,
167
- "rewards/env_goofspiel_reward/mean": 0.056125002296175806,
168
- "rewards/env_goofspiel_reward/std": 0.1724803472403437,
169
- "sampling/importance_sampling_ratio/max": 1.8632315158843995,
170
- "sampling/importance_sampling_ratio/mean": 0.9876452207565307,
171
- "sampling/importance_sampling_ratio/min": 0.5998193681240082,
172
- "sampling/sampling_logp_difference/max": 0.5798031091690063,
173
- "sampling/sampling_logp_difference/mean": 0.049172034859657286,
174
  "step": 25,
175
- "step_time": 3.5666124442001093
176
  },
177
  {
178
- "clip_ratio/high_max": 0.0,
179
- "clip_ratio/high_mean": 0.0,
180
- "clip_ratio/low_mean": 0.0,
181
- "clip_ratio/low_min": 0.0,
182
- "clip_ratio/region_mean": 0.0,
183
  "completions/clipped_ratio": 0.0,
184
- "completions/max_length": 373.8,
185
- "completions/max_terminated_length": 373.8,
186
- "completions/mean_length": 283.69375,
187
- "completions/mean_terminated_length": 283.69375,
188
  "completions/min_length": 212.0,
189
  "completions/min_terminated_length": 212.0,
190
- "entropy": 0.5599821880459785,
191
  "epoch": 0.0048,
192
- "frac_reward_zero_std": 0.775,
193
- "grad_norm": 0.30859375,
194
- "kl": 0.14428229751065375,
195
  "learning_rate": 8.244816e-06,
196
- "loss": -3.07892682030797e-05,
197
- "num_tokens": 807598.0,
198
- "reward": 0.08625000044703483,
199
- "reward_std": 0.10076271444559097,
200
- "rewards/env_goofspiel_reward/mean": 0.08625000044703483,
201
- "rewards/env_goofspiel_reward/std": 0.21401541233062743,
202
- "sampling/importance_sampling_ratio/max": 1.4588377714157104,
203
- "sampling/importance_sampling_ratio/mean": 0.9682739973068237,
204
- "sampling/importance_sampling_ratio/min": 0.5809138238430023,
205
- "sampling/sampling_logp_difference/max": 0.5705435633659363,
206
- "sampling/sampling_logp_difference/mean": 0.04744169861078262,
207
  "step": 30,
208
- "step_time": 3.545922210400022
209
  },
210
  {
211
- "clip_ratio/high_max": 0.0,
212
- "clip_ratio/high_mean": 0.0,
213
- "clip_ratio/low_mean": 0.0,
214
- "clip_ratio/low_min": 0.0,
215
- "clip_ratio/region_mean": 0.0,
216
  "completions/clipped_ratio": 0.0,
217
- "completions/max_length": 365.6,
218
- "completions/max_terminated_length": 365.6,
219
- "completions/mean_length": 290.56875,
220
- "completions/mean_terminated_length": 290.56875,
221
  "completions/min_length": 212.0,
222
  "completions/min_terminated_length": 212.0,
223
- "entropy": 0.5238839864730835,
224
  "epoch": 0.0056,
225
- "frac_reward_zero_std": 0.9,
226
- "grad_norm": 0.0791015625,
227
- "kl": 0.22365115247666836,
228
  "learning_rate": 9.666336e-06,
229
- "loss": -1.7423409735783936e-05,
230
- "num_tokens": 941209.0,
231
- "reward": 0.04125000163912773,
232
- "reward_std": 0.04772970825433731,
233
- "rewards/env_goofspiel_reward/mean": 0.04125000163912773,
234
- "rewards/env_goofspiel_reward/std": 0.16298522651195527,
235
- "sampling/importance_sampling_ratio/max": 1.4255825757980347,
236
- "sampling/importance_sampling_ratio/mean": 0.973192298412323,
237
- "sampling/importance_sampling_ratio/min": 0.5944858670234681,
238
- "sampling/sampling_logp_difference/max": 0.4666349172592163,
239
- "sampling/sampling_logp_difference/mean": 0.04362327083945274,
240
  "step": 35,
241
- "step_time": 3.448241228200095
242
  },
243
  {
244
- "clip_ratio/high_max": 0.0,
245
- "clip_ratio/high_mean": 0.0,
246
- "clip_ratio/low_mean": 0.0,
247
- "clip_ratio/low_min": 0.0,
248
- "clip_ratio/region_mean": 0.0,
249
  "completions/clipped_ratio": 0.0,
250
- "completions/max_length": 373.8,
251
- "completions/max_terminated_length": 373.8,
252
- "completions/mean_length": 289.4875,
253
- "completions/mean_terminated_length": 289.4875,
254
- "completions/min_length": 206.4,
255
- "completions/min_terminated_length": 206.4,
256
- "entropy": 0.5457443177700043,
257
  "epoch": 0.0064,
258
- "frac_reward_zero_std": 0.8125,
259
- "grad_norm": 0.05517578125,
260
- "kl": 0.21872444674372674,
261
  "learning_rate": 9.95063915881342e-06,
262
- "loss": 0.00012433364754542707,
263
- "num_tokens": 1076167.0,
264
- "reward": 0.05981250181794166,
265
- "reward_std": 0.07451137900352478,
266
- "rewards/env_goofspiel_reward/mean": 0.05981250181794166,
267
- "rewards/env_goofspiel_reward/std": 0.1769598752260208,
268
- "sampling/importance_sampling_ratio/max": 1.5738928318023682,
269
- "sampling/importance_sampling_ratio/mean": 1.0145540714263916,
270
- "sampling/importance_sampling_ratio/min": 0.6491193056106568,
271
- "sampling/sampling_logp_difference/max": 0.46645350456237794,
272
- "sampling/sampling_logp_difference/mean": 0.04598992168903351,
273
  "step": 40,
274
- "step_time": 3.4551121715998305
275
  },
276
  {
277
- "clip_ratio/high_max": 0.0,
278
- "clip_ratio/high_mean": 0.0,
279
- "clip_ratio/low_mean": 0.0,
280
  "clip_ratio/low_min": 0.0,
281
- "clip_ratio/region_mean": 0.0,
282
- "completions/clipped_ratio": 0.00625,
283
- "completions/max_length": 462.0,
284
- "completions/max_terminated_length": 374.0,
285
- "completions/mean_length": 300.95625,
286
- "completions/mean_terminated_length": 297.8118957519531,
287
  "completions/min_length": 212.0,
288
  "completions/min_terminated_length": 212.0,
289
- "entropy": 0.5893479824066162,
290
  "epoch": 0.0072,
291
- "frac_reward_zero_std": 0.7875,
292
- "grad_norm": 0.224609375,
293
- "kl": 0.2417328185401857,
294
  "learning_rate": 9.950635741493589e-06,
295
- "loss": 0.02391626685857773,
296
- "num_tokens": 1212925.0,
297
- "reward": 0.05993750244379044,
298
- "reward_std": 0.08494120314717293,
299
- "rewards/env_goofspiel_reward/mean": 0.05993750244379044,
300
- "rewards/env_goofspiel_reward/std": 0.17770446538925172,
301
- "sampling/importance_sampling_ratio/max": 2.169111466407776,
302
- "sampling/importance_sampling_ratio/mean": 1.020458698272705,
303
- "sampling/importance_sampling_ratio/min": 0.6463833510875702,
304
- "sampling/sampling_logp_difference/max": 0.5775727272033692,
305
- "sampling/sampling_logp_difference/mean": 0.04204721674323082,
306
  "step": 45,
307
- "step_time": 4.194754163199832
308
  },
309
  {
310
- "clip_ratio/high_max": 0.0,
311
- "clip_ratio/high_mean": 0.0,
312
- "clip_ratio/low_mean": 0.0,
313
- "clip_ratio/low_min": 0.0,
314
- "clip_ratio/region_mean": 0.0,
315
  "completions/clipped_ratio": 0.0,
316
- "completions/max_length": 373.4,
317
- "completions/max_terminated_length": 373.4,
318
- "completions/mean_length": 299.31875,
319
- "completions/mean_terminated_length": 299.31875,
320
  "completions/min_length": 212.0,
321
  "completions/min_terminated_length": 212.0,
322
- "entropy": 0.4704709783196449,
323
  "epoch": 0.008,
324
- "frac_reward_zero_std": 0.85,
325
- "grad_norm": 0.103515625,
326
- "kl": 0.4402174398303032,
327
  "learning_rate": 9.950629695468755e-06,
328
- "loss": 0.00011011158348992467,
329
- "num_tokens": 1350825.0,
330
- "reward": 0.04487500190734863,
331
- "reward_std": 0.063816387206316,
332
- "rewards/env_goofspiel_reward/mean": 0.04487500190734863,
333
- "rewards/env_goofspiel_reward/std": 0.17575940787792205,
334
- "sampling/importance_sampling_ratio/max": 1.630899429321289,
335
- "sampling/importance_sampling_ratio/mean": 0.9812780380249023,
336
- "sampling/importance_sampling_ratio/min": 0.3998015284538269,
337
- "sampling/sampling_logp_difference/max": 0.8225874900817871,
338
- "sampling/sampling_logp_difference/mean": 0.057265565544366834,
339
  "step": 50,
340
- "step_time": 3.4821775794000134
341
  },
342
  {
343
- "clip_ratio/high_max": 0.0,
344
- "clip_ratio/high_mean": 0.0,
345
- "clip_ratio/low_mean": 0.0,
346
- "clip_ratio/low_min": 0.0,
347
- "clip_ratio/region_mean": 0.0,
348
  "completions/clipped_ratio": 0.0,
349
- "completions/max_length": 374.0,
350
- "completions/max_terminated_length": 374.0,
351
- "completions/mean_length": 284.0375,
352
- "completions/mean_terminated_length": 284.0375,
353
- "completions/min_length": 206.8,
354
- "completions/min_terminated_length": 206.8,
355
- "entropy": 0.23580820970237254,
356
  "epoch": 0.0088,
357
- "frac_reward_zero_std": 0.7875,
358
- "grad_norm": 0.4765625,
359
- "kl": 0.6652570061385632,
360
  "learning_rate": 9.950621020743173e-06,
361
- "loss": 8.13461490906775e-05,
362
- "num_tokens": 1483464.0,
363
- "reward": 0.05993750244379044,
364
- "reward_std": 0.08494120314717293,
365
- "rewards/env_goofspiel_reward/mean": 0.05993750244379044,
366
- "rewards/env_goofspiel_reward/std": 0.18124795854091644,
367
- "sampling/importance_sampling_ratio/max": 1.8012218236923219,
368
- "sampling/importance_sampling_ratio/mean": 0.9974164485931396,
369
- "sampling/importance_sampling_ratio/min": 0.6288759648799896,
370
- "sampling/sampling_logp_difference/max": 0.6000619411468506,
371
- "sampling/sampling_logp_difference/mean": 0.03408294580876827,
372
  "step": 55,
373
- "step_time": 3.518262126200534
374
  },
375
  {
376
- "clip_ratio/high_max": 0.0,
377
- "clip_ratio/high_mean": 0.0,
378
- "clip_ratio/low_mean": 0.0,
379
  "clip_ratio/low_min": 0.0,
380
- "clip_ratio/region_mean": 0.0,
381
  "completions/clipped_ratio": 0.0,
382
- "completions/max_length": 373.8,
383
- "completions/max_terminated_length": 373.8,
384
- "completions/mean_length": 294.53125,
385
- "completions/mean_terminated_length": 294.53125,
386
- "completions/min_length": 219.0,
387
- "completions/min_terminated_length": 219.0,
388
- "entropy": 0.06949742138385773,
389
  "epoch": 0.0096,
390
- "frac_reward_zero_std": 0.8125,
391
- "grad_norm": 0.033447265625,
392
- "kl": 0.8152253001928329,
393
  "learning_rate": 9.950609717322956e-06,
394
- "loss": 0.00011886359425261617,
395
- "num_tokens": 1618570.0,
396
- "reward": 0.07500000223517418,
397
- "reward_std": 0.08485281318426133,
398
- "rewards/env_goofspiel_reward/mean": 0.07500000223517418,
399
- "rewards/env_goofspiel_reward/std": 0.1962749719619751,
400
- "sampling/importance_sampling_ratio/max": 1.434102201461792,
401
- "sampling/importance_sampling_ratio/mean": 1.0074862837791443,
402
- "sampling/importance_sampling_ratio/min": 0.8809547781944275,
403
- "sampling/sampling_logp_difference/max": 0.35270476043224336,
404
- "sampling/sampling_logp_difference/mean": 0.007789037330076099,
405
  "step": 60,
406
- "step_time": 3.4527530410003235
407
  },
408
  {
409
- "clip_ratio/high_max": 0.0,
410
- "clip_ratio/high_mean": 0.0,
411
- "clip_ratio/low_mean": 0.0,
412
  "clip_ratio/low_min": 0.0,
413
- "clip_ratio/region_mean": 0.0,
414
  "completions/clipped_ratio": 0.0,
415
- "completions/max_length": 445.2,
416
- "completions/max_terminated_length": 445.2,
417
- "completions/mean_length": 325.83125,
418
- "completions/mean_terminated_length": 325.83125,
419
- "completions/min_length": 251.2,
420
- "completions/min_terminated_length": 251.2,
421
- "entropy": 0.08617601059377193,
422
  "epoch": 0.0104,
423
- "frac_reward_zero_std": 0.875,
424
- "grad_norm": 0.018798828125,
425
- "kl": 0.7552784413099289,
426
  "learning_rate": 9.950595785216067e-06,
427
- "loss": 0.00014277141308411957,
428
- "num_tokens": 1755265.0,
429
- "reward": 0.03750000149011612,
430
- "reward_std": 0.0530330091714859,
431
- "rewards/env_goofspiel_reward/mean": 0.03750000149011612,
432
- "rewards/env_goofspiel_reward/std": 0.13009902238845825,
433
- "sampling/importance_sampling_ratio/max": 1.3271136283874512,
434
- "sampling/importance_sampling_ratio/mean": 1.004535722732544,
435
- "sampling/importance_sampling_ratio/min": 0.7700567066669464,
436
- "sampling/sampling_logp_difference/max": 0.3696352869272232,
437
- "sampling/sampling_logp_difference/mean": 0.008786045084707438,
438
  "step": 65,
439
- "step_time": 3.861200922199896
440
  },
441
  {
442
- "clip_ratio/high_max": 0.0,
443
- "clip_ratio/high_mean": 0.0,
444
- "clip_ratio/low_mean": 0.0,
445
  "clip_ratio/low_min": 0.0,
446
- "clip_ratio/region_mean": 0.0,
447
  "completions/clipped_ratio": 0.0,
448
- "completions/max_length": 732.0,
449
- "completions/max_terminated_length": 732.0,
450
- "completions/mean_length": 544.625,
451
- "completions/mean_terminated_length": 544.625,
452
- "completions/min_length": 383.6,
453
- "completions/min_terminated_length": 383.6,
454
- "entropy": 0.33896628841757775,
455
  "epoch": 0.0112,
456
- "frac_reward_zero_std": 0.75,
457
- "grad_norm": 0.2109375,
458
- "kl": 1.354549203068018,
459
  "learning_rate": 9.950579224432321e-06,
460
- "loss": 0.00035492791794240476,
461
- "num_tokens": 1929034.0,
462
- "reward": 0.04277083389461041,
463
- "reward_std": 0.061901307106018065,
464
- "rewards/env_goofspiel_reward/mean": 0.04277083389461041,
465
- "rewards/env_goofspiel_reward/std": 0.14327263236045837,
466
- "sampling/importance_sampling_ratio/max": 1.457657814025879,
467
- "sampling/importance_sampling_ratio/mean": 0.9943998098373413,
468
- "sampling/importance_sampling_ratio/min": 0.6945780634880065,
469
- "sampling/sampling_logp_difference/max": 0.450562047958374,
470
- "sampling/sampling_logp_difference/mean": 0.02044728323817253,
471
  "step": 70,
472
- "step_time": 5.568116331400051
473
  },
474
  {
475
  "clip_ratio/high_max": 0.0,
476
  "clip_ratio/high_mean": 0.0,
477
- "clip_ratio/low_mean": 0.0,
478
  "clip_ratio/low_min": 0.0,
479
- "clip_ratio/region_mean": 0.0,
480
  "completions/clipped_ratio": 0.0,
481
- "completions/max_length": 731.6,
482
- "completions/max_terminated_length": 731.6,
483
- "completions/mean_length": 564.09375,
484
- "completions/mean_terminated_length": 564.09375,
485
- "completions/min_length": 408.0,
486
- "completions/min_terminated_length": 408.0,
487
- "entropy": 0.490242238342762,
488
  "epoch": 0.012,
489
- "frac_reward_zero_std": 0.625,
490
- "grad_norm": 0.17578125,
491
- "kl": 0.48342139422893526,
492
  "learning_rate": 9.950560034983382e-06,
493
- "loss": 0.00017674852861091495,
494
- "num_tokens": 2107362.0,
495
- "reward": 0.055687499977648255,
496
- "reward_std": 0.07409889809787273,
497
- "rewards/env_goofspiel_reward/mean": 0.055687499977648255,
498
- "rewards/env_goofspiel_reward/std": 0.16855775713920593,
499
- "sampling/importance_sampling_ratio/max": 1.2386622190475465,
500
- "sampling/importance_sampling_ratio/mean": 0.9834899187088013,
501
- "sampling/importance_sampling_ratio/min": 0.776045274734497,
502
- "sampling/sampling_logp_difference/max": 0.22788455486297607,
503
- "sampling/sampling_logp_difference/mean": 0.01976088173687458,
504
  "step": 75,
505
- "step_time": 5.500796792399524
506
  },
507
  {
508
  "epoch": 0.012,
@@ -512,525 +512,525 @@
512
  "eval_clip_ratio/low_min": 0.0,
513
  "eval_clip_ratio/region_mean": 0.0,
514
  "eval_completions/clipped_ratio": 0.0,
515
- "eval_completions/max_length": 614.4,
516
- "eval_completions/max_terminated_length": 614.4,
517
- "eval_completions/mean_length": 572.65,
518
- "eval_completions/mean_terminated_length": 572.65,
519
- "eval_completions/min_length": 531.8,
520
- "eval_completions/min_terminated_length": 531.8,
521
- "eval_entropy": 0.5134166538715362,
522
- "eval_frac_reward_zero_std": 0.8,
523
- "eval_kl": 0.5596067428588867,
524
- "eval_loss": -1.7431222659070045e-05,
525
- "eval_num_tokens": 2107362.0,
526
- "eval_reward": 0.021166666876524687,
527
- "eval_reward_std": 0.03134840028360486,
528
- "eval_rewards/env_goofspiel_reward/mean": 0.021166666876524687,
529
- "eval_rewards/env_goofspiel_reward/std": 0.04433333072811365,
530
- "eval_runtime": 2.6749,
531
- "eval_samples_per_second": 3.738,
532
- "eval_sampling/importance_sampling_ratio/max": 1.0850860834121705,
533
- "eval_sampling/importance_sampling_ratio/mean": 1.0015697360038758,
534
- "eval_sampling/importance_sampling_ratio/min": 0.9188181281089782,
535
- "eval_sampling/sampling_logp_difference/max": 0.10906529426574707,
536
- "eval_sampling/sampling_logp_difference/mean": 0.017179742455482483,
537
- "eval_steps_per_second": 1.122,
538
  "step": 75
539
  },
540
  {
541
- "clip_ratio/high_max": 0.0,
542
- "clip_ratio/high_mean": 0.0,
543
- "clip_ratio/low_mean": 0.0,
544
- "clip_ratio/low_min": 0.0,
545
- "clip_ratio/region_mean": 0.0,
546
  "completions/clipped_ratio": 0.0,
547
- "completions/max_length": 733.2,
548
- "completions/max_terminated_length": 733.2,
549
- "completions/mean_length": 549.075,
550
- "completions/mean_terminated_length": 549.075,
551
- "completions/min_length": 395.8,
552
- "completions/min_terminated_length": 395.8,
553
- "entropy": 0.527910877764225,
554
  "epoch": 0.0128,
555
- "frac_reward_zero_std": 0.7,
556
- "grad_norm": 0.255859375,
557
- "kl": 0.47208923250436785,
558
  "learning_rate": 9.95053821688277e-06,
559
- "loss": 0.0007011178880929947,
560
- "num_tokens": 2282715.0,
561
- "reward": 0.05902083367109299,
562
- "reward_std": 0.07722195237874985,
563
- "rewards/env_goofspiel_reward/mean": 0.05902083367109299,
564
- "rewards/env_goofspiel_reward/std": 0.15880293548107147,
565
- "sampling/importance_sampling_ratio/max": 1.3604092836380004,
566
- "sampling/importance_sampling_ratio/mean": 1.0046632289886475,
567
- "sampling/importance_sampling_ratio/min": 0.7511004090309144,
568
- "sampling/sampling_logp_difference/max": 0.3062611103057861,
569
- "sampling/sampling_logp_difference/mean": 0.022707394510507583,
570
  "step": 80,
571
- "step_time": 5.533078667399787
572
  },
573
  {
574
- "clip_ratio/high_max": 0.0,
575
- "clip_ratio/high_mean": 0.0,
576
- "clip_ratio/low_mean": 0.0,
577
- "clip_ratio/low_min": 0.0,
578
- "clip_ratio/region_mean": 0.0,
579
  "completions/clipped_ratio": 0.0,
580
- "completions/max_length": 732.2,
581
- "completions/max_terminated_length": 732.2,
582
- "completions/mean_length": 561.8875,
583
- "completions/mean_terminated_length": 561.8875,
584
- "completions/min_length": 408.0,
585
- "completions/min_terminated_length": 408.0,
586
- "entropy": 0.46153847873210907,
587
  "epoch": 0.0136,
588
- "frac_reward_zero_std": 0.825,
589
- "grad_norm": 0.095703125,
590
- "kl": 0.5197702750563622,
591
  "learning_rate": 9.950513770145857e-06,
592
- "loss": 0.00010828666854649782,
593
- "num_tokens": 2459616.0,
594
- "reward": 0.04585416615009308,
595
- "reward_std": 0.057717590034008025,
596
- "rewards/env_goofspiel_reward/mean": 0.04585416615009308,
597
- "rewards/env_goofspiel_reward/std": 0.1589788019657135,
598
- "sampling/importance_sampling_ratio/max": 1.5443368434906006,
599
- "sampling/importance_sampling_ratio/mean": 1.0077369570732118,
600
- "sampling/importance_sampling_ratio/min": 0.7487894654273987,
601
- "sampling/sampling_logp_difference/max": 0.36205780506134033,
602
- "sampling/sampling_logp_difference/mean": 0.021098615229129793,
603
  "step": 85,
604
- "step_time": 5.460692007999751
605
  },
606
  {
607
- "clip_ratio/high_max": 0.0,
608
- "clip_ratio/high_mean": 0.0,
609
- "clip_ratio/low_mean": 0.0,
610
- "clip_ratio/low_min": 0.0,
611
- "clip_ratio/region_mean": 0.0,
612
  "completions/clipped_ratio": 0.0,
613
- "completions/max_length": 732.0,
614
- "completions/max_terminated_length": 732.0,
615
- "completions/mean_length": 581.36875,
616
- "completions/mean_terminated_length": 581.36875,
617
- "completions/min_length": 408.0,
618
- "completions/min_terminated_length": 408.0,
619
- "entropy": 0.48351355642080307,
620
  "epoch": 0.0144,
621
- "frac_reward_zero_std": 0.8375,
622
- "grad_norm": 0.1630859375,
623
- "kl": 0.6207657802850008,
624
  "learning_rate": 9.950486694789862e-06,
625
- "loss": 0.00018907544435933232,
626
- "num_tokens": 2641146.0,
627
- "reward": 0.026770833879709244,
628
- "reward_std": 0.03856678232550621,
629
- "rewards/env_goofspiel_reward/mean": 0.026770833879709244,
630
- "rewards/env_goofspiel_reward/std": 0.10960558950901031,
631
- "sampling/importance_sampling_ratio/max": 1.2647137641906738,
632
- "sampling/importance_sampling_ratio/mean": 1.004696297645569,
633
- "sampling/importance_sampling_ratio/min": 0.7099822998046875,
634
- "sampling/sampling_logp_difference/max": 0.2817555904388428,
635
- "sampling/sampling_logp_difference/mean": 0.022446612268686293,
636
  "step": 90,
637
- "step_time": 5.498646953999923
638
  },
639
  {
640
- "clip_ratio/high_max": 0.0,
641
- "clip_ratio/high_mean": 0.0,
642
- "clip_ratio/low_mean": 0.0,
643
- "clip_ratio/low_min": 0.0,
644
- "clip_ratio/region_mean": 0.0,
645
  "completions/clipped_ratio": 0.0,
646
- "completions/max_length": 732.0,
647
- "completions/max_terminated_length": 732.0,
648
- "completions/mean_length": 549.825,
649
- "completions/mean_terminated_length": 549.825,
650
- "completions/min_length": 408.0,
651
- "completions/min_terminated_length": 408.0,
652
- "entropy": 0.5179941833019257,
653
  "epoch": 0.0152,
654
- "frac_reward_zero_std": 0.8125,
655
- "grad_norm": 0.0908203125,
656
- "kl": 0.46866228580474856,
657
  "learning_rate": 9.95045699083386e-06,
658
- "loss": 0.00028545891400426625,
659
- "num_tokens": 2815323.0,
660
- "reward": 0.03772916682064533,
661
- "reward_std": 0.05388742834329605,
662
- "rewards/env_goofspiel_reward/mean": 0.03772916682064533,
663
- "rewards/env_goofspiel_reward/std": 0.13617317676544188,
664
- "sampling/importance_sampling_ratio/max": 1.2603100776672362,
665
- "sampling/importance_sampling_ratio/mean": 1.0017661929130555,
666
- "sampling/importance_sampling_ratio/min": 0.7478124976158143,
667
- "sampling/sampling_logp_difference/max": 0.25655815601348875,
668
- "sampling/sampling_logp_difference/mean": 0.023887046799063682,
669
  "step": 95,
670
- "step_time": 5.659158172999559
671
  },
672
  {
673
  "clip_ratio/high_max": 0.0,
674
  "clip_ratio/high_mean": 0.0,
675
- "clip_ratio/low_mean": 0.0,
676
- "clip_ratio/low_min": 0.0,
677
- "clip_ratio/region_mean": 0.0,
678
  "completions/clipped_ratio": 0.0,
679
- "completions/max_length": 1003.6,
680
- "completions/max_terminated_length": 1003.6,
681
- "completions/mean_length": 754.59375,
682
- "completions/mean_terminated_length": 754.59375,
683
- "completions/min_length": 550.8,
684
- "completions/min_terminated_length": 550.8,
685
- "entropy": 0.5820856660604476,
686
  "epoch": 0.016,
687
- "frac_reward_zero_std": 0.8125,
688
- "grad_norm": 0.171875,
689
- "kl": 0.4048814922571182,
690
  "learning_rate": 9.950424658298776e-06,
691
- "loss": 0.0002957838121801615,
692
- "num_tokens": 3022802.0,
693
- "reward": 0.03360416684299707,
694
- "reward_std": 0.0420433908700943,
695
- "rewards/env_goofspiel_reward/mean": 0.03360416684299707,
696
- "rewards/env_goofspiel_reward/std": 0.12295188903808593,
697
- "sampling/importance_sampling_ratio/max": 1.3984624147415161,
698
- "sampling/importance_sampling_ratio/mean": 0.980130672454834,
699
- "sampling/importance_sampling_ratio/min": 0.6590921759605408,
700
- "sampling/sampling_logp_difference/max": 0.3471600294113159,
701
- "sampling/sampling_logp_difference/mean": 0.02802230753004551,
702
  "step": 100,
703
- "step_time": 7.31073631659965
704
  },
705
  {
706
  "clip_ratio/high_max": 0.0,
707
  "clip_ratio/high_mean": 0.0,
708
- "clip_ratio/low_mean": 0.0,
709
  "clip_ratio/low_min": 0.0,
710
- "clip_ratio/region_mean": 0.0,
711
  "completions/clipped_ratio": 0.0,
712
- "completions/max_length": 1075.0,
713
- "completions/max_terminated_length": 1075.0,
714
- "completions/mean_length": 814.73125,
715
- "completions/mean_terminated_length": 814.73125,
716
- "completions/min_length": 590.0,
717
- "completions/min_terminated_length": 590.0,
718
- "entropy": 0.6229563593864441,
719
  "epoch": 0.0168,
720
- "frac_reward_zero_std": 0.775,
721
- "grad_norm": 0.09326171875,
722
- "kl": 0.4244683228433132,
723
  "learning_rate": 9.950389697207388e-06,
724
- "loss": 0.00020875066984444858,
725
- "num_tokens": 3239290.0,
726
- "reward": 0.038875000365078446,
727
- "reward_std": 0.049674246832728385,
728
- "rewards/env_goofspiel_reward/mean": 0.038875000365078446,
729
- "rewards/env_goofspiel_reward/std": 0.11504672318696976,
730
- "sampling/importance_sampling_ratio/max": 1.3492220401763917,
731
- "sampling/importance_sampling_ratio/mean": 0.9891064882278442,
732
- "sampling/importance_sampling_ratio/min": 0.5561659098671476,
733
- "sampling/sampling_logp_difference/max": 4.213754487037659,
734
- "sampling/sampling_logp_difference/mean": 0.044166411831974985,
735
  "step": 105,
736
- "step_time": 7.745885893800368
737
  },
738
  {
739
- "clip_ratio/high_max": 0.0,
740
- "clip_ratio/high_mean": 0.0,
741
- "clip_ratio/low_mean": 0.0,
742
- "clip_ratio/low_min": 0.0,
743
- "clip_ratio/region_mean": 0.0,
744
  "completions/clipped_ratio": 0.0,
745
- "completions/max_length": 1075.6,
746
- "completions/max_terminated_length": 1075.6,
747
- "completions/mean_length": 814.6375,
748
- "completions/mean_terminated_length": 814.6375,
749
- "completions/min_length": 590.0,
750
- "completions/min_terminated_length": 590.0,
751
- "entropy": 0.6465200364589692,
752
  "epoch": 0.0176,
753
- "frac_reward_zero_std": 0.7875,
754
- "grad_norm": 0.007080078125,
755
- "kl": 0.3995703622698784,
756
  "learning_rate": 9.950352107584324e-06,
757
- "loss": 0.0007265920285135508,
758
- "num_tokens": 3456558.0,
759
- "reward": 0.023437499813735486,
760
- "reward_std": 0.03473661988973618,
761
- "rewards/env_goofspiel_reward/mean": 0.023437499813735486,
762
- "rewards/env_goofspiel_reward/std": 0.0773700624704361,
763
- "sampling/importance_sampling_ratio/max": 1.4152228832244873,
764
- "sampling/importance_sampling_ratio/mean": 0.9911394953727722,
765
- "sampling/importance_sampling_ratio/min": 0.4911388456960605,
766
- "sampling/sampling_logp_difference/max": 4.677203369140625,
767
- "sampling/sampling_logp_difference/mean": 0.04649800434708595,
768
  "step": 110,
769
- "step_time": 7.710595274399202
770
  },
771
  {
772
- "clip_ratio/high_max": 0.0,
773
- "clip_ratio/high_mean": 0.0,
774
- "clip_ratio/low_mean": 0.0,
775
- "clip_ratio/low_min": 0.0,
776
- "clip_ratio/region_mean": 0.0,
777
  "completions/clipped_ratio": 0.0,
778
- "completions/max_length": 1075.0,
779
- "completions/max_terminated_length": 1075.0,
780
- "completions/mean_length": 831.425,
781
- "completions/mean_terminated_length": 831.425,
782
- "completions/min_length": 590.0,
783
- "completions/min_terminated_length": 590.0,
784
- "entropy": 0.6319158881902694,
785
  "epoch": 0.0184,
786
- "frac_reward_zero_std": 0.7625,
787
- "grad_norm": 0.30078125,
788
- "kl": 0.4417851775884628,
789
  "learning_rate": 9.950311889456064e-06,
790
- "loss": -0.00014657191932201385,
791
- "num_tokens": 3677316.0,
792
- "reward": 0.03468749914318323,
793
- "reward_std": 0.0499394167214632,
794
- "rewards/env_goofspiel_reward/mean": 0.03468749914318323,
795
- "rewards/env_goofspiel_reward/std": 0.10438641458749771,
796
- "sampling/importance_sampling_ratio/max": 1.4034383773803711,
797
- "sampling/importance_sampling_ratio/mean": 1.003822433948517,
798
- "sampling/importance_sampling_ratio/min": 0.7400717735290527,
799
- "sampling/sampling_logp_difference/max": 0.2535316705703735,
800
- "sampling/sampling_logp_difference/mean": 0.02623956575989723,
801
  "step": 115,
802
- "step_time": 7.7139906279999195
803
  },
804
  {
805
  "clip_ratio/high_max": 0.0,
806
  "clip_ratio/high_mean": 0.0,
807
- "clip_ratio/low_mean": 0.0,
808
- "clip_ratio/low_min": 0.0,
809
- "clip_ratio/region_mean": 0.0,
810
  "completions/clipped_ratio": 0.0,
811
- "completions/max_length": 1075.2,
812
- "completions/max_terminated_length": 1075.2,
813
- "completions/mean_length": 809.6125,
814
- "completions/mean_terminated_length": 809.6125,
815
- "completions/min_length": 578.0,
816
- "completions/min_terminated_length": 578.0,
817
- "entropy": 0.6280775606632233,
818
  "epoch": 0.0192,
819
- "frac_reward_zero_std": 0.825,
820
- "grad_norm": 0.1328125,
821
- "kl": 0.39189435467123984,
822
  "learning_rate": 9.950269042850943e-06,
823
- "loss": 0.00020067014265805482,
824
- "num_tokens": 3893277.0,
825
- "reward": 0.028249999321997166,
826
- "reward_std": 0.0342946782708168,
827
- "rewards/env_goofspiel_reward/mean": 0.028249999321997166,
828
- "rewards/env_goofspiel_reward/std": 0.10060491263866425,
829
- "sampling/importance_sampling_ratio/max": 1.449436855316162,
830
- "sampling/importance_sampling_ratio/mean": 0.9966808676719665,
831
- "sampling/importance_sampling_ratio/min": 0.6293053507804871,
832
- "sampling/sampling_logp_difference/max": 0.3306601524353027,
833
- "sampling/sampling_logp_difference/mean": 0.027687131240963937,
834
  "step": 120,
835
- "step_time": 7.60252943919968
836
  },
837
  {
838
  "clip_ratio/high_max": 0.0,
839
  "clip_ratio/high_mean": 0.0,
840
- "clip_ratio/low_mean": 0.0,
841
  "clip_ratio/low_min": 0.0,
842
- "clip_ratio/region_mean": 0.0,
843
  "completions/clipped_ratio": 0.0,
844
- "completions/max_length": 1075.0,
845
- "completions/max_terminated_length": 1075.0,
846
- "completions/mean_length": 831.9125,
847
- "completions/mean_terminated_length": 831.9125,
848
- "completions/min_length": 590.0,
849
- "completions/min_terminated_length": 590.0,
850
- "entropy": 0.6203011125326157,
851
  "epoch": 0.02,
852
- "frac_reward_zero_std": 0.8,
853
- "grad_norm": 0.0888671875,
854
- "kl": 0.48651044964790346,
855
  "learning_rate": 9.95022356779914e-06,
856
- "loss": -4.498030175454914e-05,
857
- "num_tokens": 4114505.0,
858
- "reward": 0.0279999990016222,
859
- "reward_std": 0.04065863937139511,
860
- "rewards/env_goofspiel_reward/mean": 0.0279999990016222,
861
- "rewards/env_goofspiel_reward/std": 0.09859532788395882,
862
- "sampling/importance_sampling_ratio/max": 1.3767565965652466,
863
- "sampling/importance_sampling_ratio/mean": 1.0073092341423036,
864
- "sampling/importance_sampling_ratio/min": 0.6863486647605896,
865
- "sampling/sampling_logp_difference/max": 0.2979677677154541,
866
- "sampling/sampling_logp_difference/mean": 0.026996534690260886,
867
  "step": 125,
868
- "step_time": 7.651589208400037
869
  },
870
  {
871
- "clip_ratio/high_max": 0.0,
872
- "clip_ratio/high_mean": 0.0,
873
- "clip_ratio/low_mean": 0.0,
874
- "clip_ratio/low_min": 0.0,
875
- "clip_ratio/region_mean": 0.0,
876
  "completions/clipped_ratio": 0.0,
877
- "completions/max_length": 1206.4,
878
- "completions/max_terminated_length": 1206.4,
879
- "completions/mean_length": 933.89375,
880
- "completions/mean_terminated_length": 933.89375,
881
- "completions/min_length": 657.2,
882
- "completions/min_terminated_length": 657.2,
883
- "entropy": 0.6028641879558563,
884
  "epoch": 0.0208,
885
- "frac_reward_zero_std": 0.85,
886
- "grad_norm": 0.12890625,
887
- "kl": 0.3837214097380638,
888
  "learning_rate": 9.950175464332696e-06,
889
- "loss": 3.275293856859207e-05,
890
- "num_tokens": 4351587.0,
891
- "reward": 0.02824999988079071,
892
- "reward_std": 0.040128308534622195,
893
- "rewards/env_goofspiel_reward/mean": 0.02824999988079071,
894
- "rewards/env_goofspiel_reward/std": 0.107171730697155,
895
- "sampling/importance_sampling_ratio/max": 1.4337808609008789,
896
- "sampling/importance_sampling_ratio/mean": 1.0062252044677735,
897
- "sampling/importance_sampling_ratio/min": 0.7049606084823609,
898
- "sampling/sampling_logp_difference/max": 0.29388277530670165,
899
- "sampling/sampling_logp_difference/mean": 0.027071699127554895,
900
  "step": 130,
901
- "step_time": 8.72059853139981
902
  },
903
  {
904
  "clip_ratio/high_max": 0.0,
905
  "clip_ratio/high_mean": 0.0,
906
- "clip_ratio/low_mean": 0.0,
907
- "clip_ratio/low_min": 0.0,
908
- "clip_ratio/region_mean": 0.0,
909
  "completions/clipped_ratio": 0.0,
910
- "completions/max_length": 1403.2,
911
- "completions/max_terminated_length": 1403.2,
912
- "completions/mean_length": 1043.53125,
913
- "completions/mean_terminated_length": 1043.53125,
914
- "completions/min_length": 758.0,
915
- "completions/min_terminated_length": 758.0,
916
- "entropy": 0.5716616868972778,
917
  "epoch": 0.0216,
918
- "frac_reward_zero_std": 0.825,
919
- "grad_norm": 0.11865234375,
920
- "kl": 0.3439621731638908,
921
  "learning_rate": 9.950124732485496e-06,
922
- "loss": -0.00015461102593690158,
923
- "num_tokens": 4604604.0,
924
- "reward": 0.02993750013411045,
925
- "reward_std": 0.04251479506492615,
926
- "rewards/env_goofspiel_reward/mean": 0.02993750013411045,
927
- "rewards/env_goofspiel_reward/std": 0.10386608839035034,
928
- "sampling/importance_sampling_ratio/max": 1.560789942741394,
929
- "sampling/importance_sampling_ratio/mean": 1.0229114413261413,
930
- "sampling/importance_sampling_ratio/min": 0.656338346004486,
931
- "sampling/sampling_logp_difference/max": 0.33734931945800783,
932
- "sampling/sampling_logp_difference/mean": 0.026270415633916855,
933
  "step": 135,
934
- "step_time": 10.42263705259993
935
  },
936
  {
937
- "clip_ratio/high_max": 0.0,
938
- "clip_ratio/high_mean": 0.0,
939
- "clip_ratio/low_mean": 0.0,
940
  "clip_ratio/low_min": 0.0,
941
- "clip_ratio/region_mean": 0.0,
942
  "completions/clipped_ratio": 0.0,
943
- "completions/max_length": 1404.2,
944
- "completions/max_terminated_length": 1404.2,
945
- "completions/mean_length": 1106.9125,
946
- "completions/mean_terminated_length": 1106.9125,
947
- "completions/min_length": 746.4,
948
- "completions/min_terminated_length": 746.4,
949
- "entropy": 0.6015262037515641,
950
  "epoch": 0.0224,
951
- "frac_reward_zero_std": 0.7875,
952
- "grad_norm": 0.1142578125,
953
- "kl": 0.4554178059101105,
954
  "learning_rate": 9.95007137229328e-06,
955
- "loss": 4.9450411461293696e-05,
956
- "num_tokens": 4870794.0,
957
- "reward": 0.02393750089686364,
958
- "reward_std": 0.03491339806932956,
959
- "rewards/env_goofspiel_reward/mean": 0.02393750089686364,
960
- "rewards/env_goofspiel_reward/std": 0.07218290558084846,
961
- "sampling/importance_sampling_ratio/max": 1.4548166275024415,
962
- "sampling/importance_sampling_ratio/mean": 0.9814130544662476,
963
- "sampling/importance_sampling_ratio/min": 0.6002501428127289,
964
- "sampling/sampling_logp_difference/max": 0.33309857845306395,
965
- "sampling/sampling_logp_difference/mean": 0.028818363696336745,
966
  "step": 140,
967
- "step_time": 10.392815717999474
968
  },
969
  {
970
- "clip_ratio/high_max": 0.0,
971
- "clip_ratio/high_mean": 0.0,
972
- "clip_ratio/low_mean": 0.0,
973
- "clip_ratio/low_min": 0.0,
974
- "clip_ratio/region_mean": 0.0,
975
  "completions/clipped_ratio": 0.0,
976
- "completions/max_length": 1403.4,
977
- "completions/max_terminated_length": 1403.4,
978
- "completions/mean_length": 1064.275,
979
- "completions/mean_terminated_length": 1064.275,
980
- "completions/min_length": 746.6,
981
- "completions/min_terminated_length": 746.6,
982
- "entropy": 0.5568872570991517,
983
  "epoch": 0.0232,
984
- "frac_reward_zero_std": 0.775,
985
- "grad_norm": 0.2041015625,
986
- "kl": 0.3555284239351749,
987
  "learning_rate": 9.950015383793636e-06,
988
- "loss": -0.00010267798788845539,
989
- "num_tokens": 5127626.0,
990
- "reward": 0.03343750163912773,
991
- "reward_std": 0.04799487330019474,
992
- "rewards/env_goofspiel_reward/mean": 0.03343750163912773,
993
- "rewards/env_goofspiel_reward/std": 0.10812449753284455,
994
- "sampling/importance_sampling_ratio/max": 1.587186908721924,
995
- "sampling/importance_sampling_ratio/mean": 1.0198248624801636,
996
- "sampling/importance_sampling_ratio/min": 0.7297749638557434,
997
- "sampling/sampling_logp_difference/max": 0.2695728540420532,
998
- "sampling/sampling_logp_difference/mean": 0.0254237774759531,
999
  "step": 145,
1000
- "step_time": 10.32607021879976
1001
  },
1002
  {
1003
- "clip_ratio/high_max": 0.0,
1004
- "clip_ratio/high_mean": 0.0,
1005
- "clip_ratio/low_mean": 0.0,
1006
  "clip_ratio/low_min": 0.0,
1007
- "clip_ratio/region_mean": 0.0,
1008
  "completions/clipped_ratio": 0.0,
1009
- "completions/max_length": 1400.4,
1010
- "completions/max_terminated_length": 1400.4,
1011
- "completions/mean_length": 1040.89375,
1012
- "completions/mean_terminated_length": 1040.89375,
1013
- "completions/min_length": 758.0,
1014
- "completions/min_terminated_length": 758.0,
1015
- "entropy": 0.5438663050532341,
1016
  "epoch": 0.024,
1017
- "frac_reward_zero_std": 0.8,
1018
- "grad_norm": 0.11181640625,
1019
- "kl": 0.313924690335989,
1020
  "learning_rate": 9.949956767026006e-06,
1021
- "loss": 0.0002234043786302209,
1022
- "num_tokens": 5379923.0,
1023
- "reward": 0.03181250132620335,
1024
- "reward_std": 0.0451664462685585,
1025
- "rewards/env_goofspiel_reward/mean": 0.03181250132620335,
1026
- "rewards/env_goofspiel_reward/std": 0.10608797073364258,
1027
- "sampling/importance_sampling_ratio/max": 1.3631083726882935,
1028
- "sampling/importance_sampling_ratio/mean": 1.0006531119346618,
1029
- "sampling/importance_sampling_ratio/min": 0.691138219833374,
1030
- "sampling/sampling_logp_difference/max": 0.26457092761993406,
1031
- "sampling/sampling_logp_difference/mean": 0.024369171261787413,
1032
  "step": 150,
1033
- "step_time": 10.337828036600513
1034
  },
1035
  {
1036
  "epoch": 0.024,
@@ -1040,68 +1040,101 @@
1040
  "eval_clip_ratio/low_min": 0.0,
1041
  "eval_clip_ratio/region_mean": 0.0,
1042
  "eval_completions/clipped_ratio": 0.0,
1043
- "eval_completions/max_length": 1167.6,
1044
- "eval_completions/max_terminated_length": 1167.6,
1045
- "eval_completions/mean_length": 1085.75,
1046
- "eval_completions/mean_terminated_length": 1085.75,
1047
- "eval_completions/min_length": 1003.6,
1048
- "eval_completions/min_terminated_length": 1003.6,
1049
- "eval_entropy": 0.5619663238525391,
1050
- "eval_frac_reward_zero_std": 0.9,
1051
- "eval_kl": 0.4370845973491669,
1052
- "eval_loss": -0.00014827963605057448,
1053
- "eval_num_tokens": 5379923.0,
1054
- "eval_reward": -0.0004999999888241291,
1055
- "eval_reward_std": 0.0007071067579090595,
1056
- "eval_rewards/env_goofspiel_reward/mean": -0.0004999999888241291,
1057
- "eval_rewards/env_goofspiel_reward/std": 0.0009999999776482583,
1058
- "eval_runtime": 4.3907,
1059
- "eval_samples_per_second": 2.278,
1060
- "eval_sampling/importance_sampling_ratio/max": 1.2002264738082886,
1061
- "eval_sampling/importance_sampling_ratio/mean": 1.0190080761909486,
1062
- "eval_sampling/importance_sampling_ratio/min": 0.8775287628173828,
1063
- "eval_sampling/sampling_logp_difference/max": 0.19612762928009034,
1064
- "eval_sampling/sampling_logp_difference/mean": 0.025080177932977676,
1065
- "eval_steps_per_second": 0.683,
1066
  "step": 150
1067
  },
1068
  {
1069
- "epoch": 0.02432,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1070
  "eval_clip_ratio/high_max": 0.0,
1071
  "eval_clip_ratio/high_mean": 0.0,
1072
  "eval_clip_ratio/low_mean": 0.0,
1073
  "eval_clip_ratio/low_min": 0.0,
1074
  "eval_clip_ratio/region_mean": 0.0,
1075
  "eval_completions/clipped_ratio": 0.0,
1076
- "eval_completions/max_length": 1166.0,
1077
- "eval_completions/max_terminated_length": 1166.0,
1078
- "eval_completions/mean_length": 1082.75,
1079
- "eval_completions/mean_terminated_length": 1082.75,
1080
- "eval_completions/min_length": 999.8,
1081
- "eval_completions/min_terminated_length": 999.8,
1082
- "eval_entropy": 0.5707040071487427,
1083
- "eval_frac_reward_zero_std": 1.0,
1084
- "eval_kl": 0.40999372601509093,
1085
- "eval_loss": 3.846790423267521e-05,
1086
- "eval_num_tokens": 5476501.0,
1087
- "eval_reward": 0.06000000238418579,
1088
- "eval_reward_std": 0.0,
1089
- "eval_rewards/env_goofspiel_reward/mean": 0.06000000238418579,
1090
- "eval_rewards/env_goofspiel_reward/std": 0.06928203105926514,
1091
- "eval_runtime": 4.0973,
1092
- "eval_samples_per_second": 2.441,
1093
- "eval_sampling/importance_sampling_ratio/max": 1.1712106943130494,
1094
- "eval_sampling/importance_sampling_ratio/mean": 1.011259377002716,
1095
- "eval_sampling/importance_sampling_ratio/min": 0.8288854002952576,
1096
- "eval_sampling/sampling_logp_difference/max": 0.17412886619567872,
1097
- "eval_sampling/sampling_logp_difference/mean": 0.024767952039837837,
1098
- "eval_steps_per_second": 0.732,
1099
- "step": 152
1100
  }
1101
  ],
1102
  "logging_steps": 5,
1103
  "max_steps": 18750,
1104
- "num_input_tokens_seen": 5476501,
1105
  "num_train_epochs": 3,
1106
  "save_steps": 500,
1107
  "stateful_callbacks": {
@@ -1117,7 +1150,7 @@
1117
  }
1118
  },
1119
  "total_flos": 0.0,
1120
- "train_batch_size": 8,
1121
  "trial_name": null,
1122
  "trial_params": null
1123
  }
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.02496,
6
  "eval_steps": 500,
7
+ "global_step": 156,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "clip_ratio/high_max": 0.01656746044754982,
14
+ "clip_ratio/high_mean": 0.00828373022377491,
15
+ "clip_ratio/low_mean": 0.009885329194366932,
16
+ "clip_ratio/low_min": 0.002380952425301075,
17
+ "clip_ratio/region_mean": 0.018169059325009583,
18
  "completions/clipped_ratio": 0.0,
19
+ "completions/max_length": 373.8,
20
+ "completions/max_terminated_length": 373.8,
21
+ "completions/mean_length": 296.71875,
22
+ "completions/mean_terminated_length": 296.71875,
23
+ "completions/min_length": 206.8,
24
+ "completions/min_terminated_length": 206.8,
25
+ "entropy": 0.35859955847263336,
26
  "epoch": 0.0008,
27
+ "frac_reward_zero_std": 0.475,
28
+ "grad_norm": 0.11806713044643402,
29
+ "kl": 0.010710475360974669,
30
  "learning_rate": 1.137216e-06,
31
+ "loss": 0.00025553270243108275,
32
+ "num_tokens": 136400.0,
33
+ "reward": 0.2433750033378601,
34
+ "reward_std": 0.22857226729393004,
35
+ "rewards/env_goofspiel_reward/mean": 0.2433750033378601,
36
+ "rewards/env_goofspiel_reward/std": 0.3573280215263367,
37
+ "sampling/importance_sampling_ratio/max": 1.642977237701416,
38
+ "sampling/importance_sampling_ratio/mean": 0.9299223780632019,
39
+ "sampling/importance_sampling_ratio/min": 0.2294576272368431,
40
+ "sampling/sampling_logp_difference/max": 1.239228868484497,
41
+ "sampling/sampling_logp_difference/mean": 0.08511904180049897,
42
  "step": 5,
43
+ "step_time": 5.990162861399949
44
  },
45
  {
46
+ "clip_ratio/high_max": 0.03551879096776247,
47
+ "clip_ratio/high_mean": 0.019321895483881236,
48
+ "clip_ratio/low_mean": 0.010304330103099345,
49
  "clip_ratio/low_min": 0.0,
50
+ "clip_ratio/region_mean": 0.02962622568011284,
51
  "completions/clipped_ratio": 0.0,
52
  "completions/max_length": 374.0,
53
  "completions/max_terminated_length": 374.0,
54
+ "completions/mean_length": 291.0,
55
+ "completions/mean_terminated_length": 291.0,
56
+ "completions/min_length": 199.6,
57
+ "completions/min_terminated_length": 199.6,
58
+ "entropy": 0.3628120839595795,
59
  "epoch": 0.0016,
60
+ "frac_reward_zero_std": 0.425,
61
+ "grad_norm": 0.11497774720191956,
62
+ "kl": 0.014784016623161733,
63
  "learning_rate": 2.5587359999999995e-06,
64
+ "loss": 0.0005161482840776443,
65
+ "num_tokens": 271519.0,
66
+ "reward": 0.3373125076293945,
67
+ "reward_std": 0.28646664023399354,
68
+ "rewards/env_goofspiel_reward/mean": 0.3373125076293945,
69
+ "rewards/env_goofspiel_reward/std": 0.39762375950813295,
70
+ "sampling/importance_sampling_ratio/max": 2.0244646072387695,
71
+ "sampling/importance_sampling_ratio/mean": 0.95240398645401,
72
+ "sampling/importance_sampling_ratio/min": 0.3043131291866302,
73
+ "sampling/sampling_logp_difference/max": 1.4355120182037353,
74
+ "sampling/sampling_logp_difference/mean": 0.08179195821285248,
75
  "step": 10,
76
+ "step_time": 5.4302939728002455
77
  },
78
  {
79
+ "clip_ratio/high_max": 0.020830108411610128,
80
+ "clip_ratio/high_mean": 0.01188564244657755,
81
+ "clip_ratio/low_mean": 0.015411754604429007,
82
+ "clip_ratio/low_min": 0.004805492050945759,
83
+ "clip_ratio/region_mean": 0.0272973969578743,
84
  "completions/clipped_ratio": 0.0,
85
+ "completions/max_length": 374.0,
86
+ "completions/max_terminated_length": 374.0,
87
+ "completions/mean_length": 282.2875,
88
+ "completions/mean_terminated_length": 282.2875,
89
+ "completions/min_length": 189.6,
90
+ "completions/min_terminated_length": 189.6,
91
+ "entropy": 0.35702232643961906,
92
  "epoch": 0.0024,
93
+ "frac_reward_zero_std": 0.4875,
94
+ "grad_norm": 0.15851116180419922,
95
+ "kl": 0.03802298142109066,
96
  "learning_rate": 3.9802559999999995e-06,
97
+ "loss": -8.745418745093048e-05,
98
+ "num_tokens": 404021.0,
99
+ "reward": 0.30337501466274264,
100
+ "reward_std": 0.2497854709625244,
101
+ "rewards/env_goofspiel_reward/mean": 0.30337501466274264,
102
+ "rewards/env_goofspiel_reward/std": 0.37030403017997743,
103
+ "sampling/importance_sampling_ratio/max": 1.8557178020477294,
104
+ "sampling/importance_sampling_ratio/mean": 0.9347150444984436,
105
+ "sampling/importance_sampling_ratio/min": 0.2671460062265396,
106
+ "sampling/sampling_logp_difference/max": 1.3799697399139403,
107
+ "sampling/sampling_logp_difference/mean": 0.08675528764724731,
108
  "step": 15,
109
+ "step_time": 5.612129885600188
110
  },
111
  {
112
+ "clip_ratio/high_max": 0.017504085041582584,
113
+ "clip_ratio/high_mean": 0.010222630854696036,
114
+ "clip_ratio/low_mean": 0.016076818853616715,
115
+ "clip_ratio/low_min": 0.005441176518797874,
116
+ "clip_ratio/region_mean": 0.02629944970831275,
117
  "completions/clipped_ratio": 0.0,
118
+ "completions/max_length": 373.8,
119
+ "completions/max_terminated_length": 373.8,
120
+ "completions/mean_length": 281.4375,
121
+ "completions/mean_terminated_length": 281.4375,
122
  "completions/min_length": 212.0,
123
  "completions/min_terminated_length": 212.0,
124
+ "entropy": 0.33125333562493325,
125
  "epoch": 0.0032,
126
+ "frac_reward_zero_std": 0.4875,
127
+ "grad_norm": 0.3056468069553375,
128
+ "kl": 0.2386752042453736,
129
  "learning_rate": 5.401775999999999e-06,
130
+ "loss": -0.00010562442475929857,
131
+ "num_tokens": 536239.0,
132
+ "reward": 0.33362500965595243,
133
+ "reward_std": 0.22821871638298036,
134
+ "rewards/env_goofspiel_reward/mean": 0.33362500965595243,
135
+ "rewards/env_goofspiel_reward/std": 0.3822357296943665,
136
+ "sampling/importance_sampling_ratio/max": 2.044223356246948,
137
+ "sampling/importance_sampling_ratio/mean": 0.9417648911476135,
138
+ "sampling/importance_sampling_ratio/min": 0.1664988785982132,
139
+ "sampling/sampling_logp_difference/max": 1.9717662334442139,
140
+ "sampling/sampling_logp_difference/mean": 0.09318876564502716,
141
  "step": 20,
142
+ "step_time": 5.200572347599882
143
  },
144
  {
145
+ "clip_ratio/high_max": 0.014910130761563778,
146
+ "clip_ratio/high_mean": 0.007455065380781889,
147
+ "clip_ratio/low_mean": 0.014673632569611073,
148
+ "clip_ratio/low_min": 0.005902777798473835,
149
+ "clip_ratio/region_mean": 0.02212869795039296,
150
  "completions/clipped_ratio": 0.0,
151
+ "completions/max_length": 374.4,
152
+ "completions/max_terminated_length": 374.4,
153
+ "completions/mean_length": 300.20625,
154
+ "completions/mean_terminated_length": 300.20625,
155
+ "completions/min_length": 214.2,
156
+ "completions/min_terminated_length": 214.2,
157
+ "entropy": 0.3311316817998886,
158
  "epoch": 0.004,
159
+ "frac_reward_zero_std": 0.5,
160
+ "grad_norm": 0.24050310254096985,
161
+ "kl": 0.2951220686081797,
162
  "learning_rate": 6.8232959999999994e-06,
163
+ "loss": 0.00011087653692811727,
164
+ "num_tokens": 674105.0,
165
+ "reward": 0.3523750066757202,
166
+ "reward_std": 0.22291541397571563,
167
+ "rewards/env_goofspiel_reward/mean": 0.3523750066757202,
168
+ "rewards/env_goofspiel_reward/std": 0.39020044207572935,
169
+ "sampling/importance_sampling_ratio/max": 1.9415390253067017,
170
+ "sampling/importance_sampling_ratio/mean": 0.9240148186683654,
171
+ "sampling/importance_sampling_ratio/min": 0.07138497838750482,
172
+ "sampling/sampling_logp_difference/max": 2.9348490715026854,
173
+ "sampling/sampling_logp_difference/mean": 0.10956742316484451,
174
  "step": 25,
175
+ "step_time": 5.527079869400405
176
  },
177
  {
178
+ "clip_ratio/high_max": 0.012152777798473836,
179
+ "clip_ratio/high_mean": 0.006076388899236918,
180
+ "clip_ratio/low_mean": 0.01880923202261329,
181
+ "clip_ratio/low_min": 0.005718954280018807,
182
+ "clip_ratio/region_mean": 0.024885620921850204,
183
  "completions/clipped_ratio": 0.0,
184
+ "completions/max_length": 373.4,
185
+ "completions/max_terminated_length": 373.4,
186
+ "completions/mean_length": 283.375,
187
+ "completions/mean_terminated_length": 283.375,
188
  "completions/min_length": 212.0,
189
  "completions/min_terminated_length": 212.0,
190
+ "entropy": 0.32351960614323616,
191
  "epoch": 0.0048,
192
+ "frac_reward_zero_std": 0.4125,
193
+ "grad_norm": 0.12574604153633118,
194
+ "kl": 0.5364798322319985,
195
  "learning_rate": 8.244816e-06,
196
+ "loss": -0.0002830292796716094,
197
+ "num_tokens": 805805.0,
198
+ "reward": 0.4162500202655792,
199
+ "reward_std": 0.30228814482688904,
200
+ "rewards/env_goofspiel_reward/mean": 0.4162500202655792,
201
+ "rewards/env_goofspiel_reward/std": 0.4285269021987915,
202
+ "sampling/importance_sampling_ratio/max": 2.1214988470077514,
203
+ "sampling/importance_sampling_ratio/mean": 0.9363813877105713,
204
+ "sampling/importance_sampling_ratio/min": 0.08070152997970581,
205
+ "sampling/sampling_logp_difference/max": 1.7284237623214722,
206
+ "sampling/sampling_logp_difference/mean": 0.1223350703716278,
207
  "step": 30,
208
+ "step_time": 5.215961435999452
209
  },
210
  {
211
+ "clip_ratio/high_max": 0.011638931930065155,
212
+ "clip_ratio/high_mean": 0.0058194659650325775,
213
+ "clip_ratio/low_mean": 0.01746001038700342,
214
+ "clip_ratio/low_min": 0.003125,
215
+ "clip_ratio/region_mean": 0.023279476352035998,
216
  "completions/clipped_ratio": 0.0,
217
+ "completions/max_length": 366.0,
218
+ "completions/max_terminated_length": 366.0,
219
+ "completions/mean_length": 290.51875,
220
+ "completions/mean_terminated_length": 290.51875,
221
  "completions/min_length": 212.0,
222
  "completions/min_terminated_length": 212.0,
223
+ "entropy": 0.33921139910817144,
224
  "epoch": 0.0056,
225
+ "frac_reward_zero_std": 0.3875,
226
+ "grad_norm": 0.14579056203365326,
227
+ "kl": 0.4404270384460688,
228
  "learning_rate": 9.666336e-06,
229
+ "loss": -0.0004368364345282316,
230
+ "num_tokens": 940358.0,
231
+ "reward": 0.42375001311302185,
232
+ "reward_std": 0.3022881507873535,
233
+ "rewards/env_goofspiel_reward/mean": 0.42375001311302185,
234
+ "rewards/env_goofspiel_reward/std": 0.4120412886142731,
235
+ "sampling/importance_sampling_ratio/max": 2.12521378993988,
236
+ "sampling/importance_sampling_ratio/mean": 0.896282148361206,
237
+ "sampling/importance_sampling_ratio/min": 0.10518757700920105,
238
+ "sampling/sampling_logp_difference/max": 1.934821891784668,
239
+ "sampling/sampling_logp_difference/mean": 0.14520370960235596,
240
  "step": 35,
241
+ "step_time": 5.421745918600391
242
  },
243
  {
244
+ "clip_ratio/high_max": 0.010828081332147122,
245
+ "clip_ratio/high_mean": 0.005414040666073561,
246
+ "clip_ratio/low_mean": 0.02190126050263643,
247
+ "clip_ratio/low_min": 0.008496732078492641,
248
+ "clip_ratio/region_mean": 0.02731530126184225,
249
  "completions/clipped_ratio": 0.0,
250
+ "completions/max_length": 374.4,
251
+ "completions/max_terminated_length": 374.4,
252
+ "completions/mean_length": 288.0625,
253
+ "completions/mean_terminated_length": 288.0625,
254
+ "completions/min_length": 196.8,
255
+ "completions/min_terminated_length": 196.8,
256
+ "entropy": 0.23345143683254718,
257
  "epoch": 0.0064,
258
+ "frac_reward_zero_std": 0.4,
259
+ "grad_norm": 0.07758636772632599,
260
+ "kl": 2.4490169264376163,
261
  "learning_rate": 9.95063915881342e-06,
262
+ "loss": -0.00021622611675411462,
263
+ "num_tokens": 1075088.0,
264
+ "reward": 0.49831252098083495,
265
+ "reward_std": 0.24987386167049408,
266
+ "rewards/env_goofspiel_reward/mean": 0.49831252098083495,
267
+ "rewards/env_goofspiel_reward/std": 0.39479002356529236,
268
+ "sampling/importance_sampling_ratio/max": 2.2193925380706787,
269
+ "sampling/importance_sampling_ratio/mean": 0.8783411741256714,
270
+ "sampling/importance_sampling_ratio/min": 0.005415213014930487,
271
+ "sampling/sampling_logp_difference/max": 3.553822565078735,
272
+ "sampling/sampling_logp_difference/mean": 0.16359376013278962,
273
  "step": 40,
274
+ "step_time": 5.113493552800537
275
  },
276
  {
277
+ "clip_ratio/high_max": 0.011822755448520184,
278
+ "clip_ratio/high_mean": 0.005911377724260092,
279
+ "clip_ratio/low_mean": 0.015286377724260091,
280
  "clip_ratio/low_min": 0.0,
281
+ "clip_ratio/region_mean": 0.021197755355387926,
282
+ "completions/clipped_ratio": 0.0,
283
+ "completions/max_length": 374.2,
284
+ "completions/max_terminated_length": 374.2,
285
+ "completions/mean_length": 297.9,
286
+ "completions/mean_terminated_length": 297.9,
287
  "completions/min_length": 212.0,
288
  "completions/min_terminated_length": 212.0,
289
+ "entropy": 0.2887581422924995,
290
  "epoch": 0.0072,
291
+ "frac_reward_zero_std": 0.4375,
292
+ "grad_norm": 0.07674326002597809,
293
+ "kl": 1.0048415780067443,
294
  "learning_rate": 9.950635741493589e-06,
295
+ "loss": -0.0004248165525496006,
296
+ "num_tokens": 1212402.0,
297
+ "reward": 0.5099375128746033,
298
+ "reward_std": 0.24404022693634034,
299
+ "rewards/env_goofspiel_reward/mean": 0.5099375128746033,
300
+ "rewards/env_goofspiel_reward/std": 0.3780271291732788,
301
+ "sampling/importance_sampling_ratio/max": 2.1818215370178224,
302
+ "sampling/importance_sampling_ratio/mean": 0.8968270421028137,
303
+ "sampling/importance_sampling_ratio/min": 0.040483005344867706,
304
+ "sampling/sampling_logp_difference/max": 2.9256459712982177,
305
+ "sampling/sampling_logp_difference/mean": 0.14963440895080565,
306
  "step": 45,
307
+ "step_time": 5.549473219600077
308
  },
309
  {
310
+ "clip_ratio/high_max": 0.014889705926179886,
311
+ "clip_ratio/high_mean": 0.007444852963089943,
312
+ "clip_ratio/low_mean": 0.009685134887695313,
313
+ "clip_ratio/low_min": 0.005409356765449047,
314
+ "clip_ratio/region_mean": 0.017129987850785254,
315
  "completions/clipped_ratio": 0.0,
316
+ "completions/max_length": 374.2,
317
+ "completions/max_terminated_length": 374.2,
318
+ "completions/mean_length": 299.5625,
319
+ "completions/mean_terminated_length": 299.5625,
320
  "completions/min_length": 212.0,
321
  "completions/min_terminated_length": 212.0,
322
+ "entropy": 0.35890122279524805,
323
  "epoch": 0.008,
324
+ "frac_reward_zero_std": 0.3375,
325
+ "grad_norm": 0.0733598917722702,
326
+ "kl": 0.8497510462999344,
327
  "learning_rate": 9.950629695468755e-06,
328
+ "loss": -0.0006576518062502146,
329
+ "num_tokens": 1349581.0,
330
+ "reward": 0.5286250114440918,
331
+ "reward_std": 0.33428473472595216,
332
+ "rewards/env_goofspiel_reward/mean": 0.5286250114440918,
333
+ "rewards/env_goofspiel_reward/std": 0.4310214161872864,
334
+ "sampling/importance_sampling_ratio/max": 2.438156485557556,
335
+ "sampling/importance_sampling_ratio/mean": 0.9875588178634643,
336
+ "sampling/importance_sampling_ratio/min": 0.08052640929818153,
337
+ "sampling/sampling_logp_difference/max": 1.630429244041443,
338
+ "sampling/sampling_logp_difference/mean": 0.12703385949134827,
339
  "step": 50,
340
+ "step_time": 5.318245979599669
341
  },
342
  {
343
+ "clip_ratio/high_max": 0.008750000037252903,
344
+ "clip_ratio/high_mean": 0.005937500018626451,
345
+ "clip_ratio/low_mean": 0.008613153640180827,
346
+ "clip_ratio/low_min": 0.002777777798473835,
347
+ "clip_ratio/region_mean": 0.014550653658807278,
348
  "completions/clipped_ratio": 0.0,
349
+ "completions/max_length": 373.6,
350
+ "completions/max_terminated_length": 373.6,
351
+ "completions/mean_length": 282.6625,
352
+ "completions/mean_terminated_length": 282.6625,
353
+ "completions/min_length": 201.6,
354
+ "completions/min_terminated_length": 201.6,
355
+ "entropy": 0.324828689545393,
356
  "epoch": 0.0088,
357
+ "frac_reward_zero_std": 0.375,
358
+ "grad_norm": 0.07007352262735367,
359
+ "kl": 0.7210939504206181,
360
  "learning_rate": 9.950621020743173e-06,
361
+ "loss": -0.0003687262535095215,
362
+ "num_tokens": 1481810.0,
363
+ "reward": 0.4908750057220459,
364
+ "reward_std": 0.29203510880470274,
365
+ "rewards/env_goofspiel_reward/mean": 0.4908750057220459,
366
+ "rewards/env_goofspiel_reward/std": 0.38116695880889895,
367
+ "sampling/importance_sampling_ratio/max": 2.387629532814026,
368
+ "sampling/importance_sampling_ratio/mean": 0.9571901321411133,
369
+ "sampling/importance_sampling_ratio/min": 0.1296325594186783,
370
+ "sampling/sampling_logp_difference/max": 1.4196331977844239,
371
+ "sampling/sampling_logp_difference/mean": 0.11004247218370437,
372
  "step": 55,
373
+ "step_time": 5.2706628203999575
374
  },
375
  {
376
+ "clip_ratio/high_max": 0.008534356765449048,
377
+ "clip_ratio/high_mean": 0.005656067188829184,
378
+ "clip_ratio/low_mean": 0.009407982788980007,
379
  "clip_ratio/low_min": 0.0,
380
+ "clip_ratio/region_mean": 0.01506404997780919,
381
  "completions/clipped_ratio": 0.0,
382
+ "completions/max_length": 374.2,
383
+ "completions/max_terminated_length": 374.2,
384
+ "completions/mean_length": 293.0875,
385
+ "completions/mean_terminated_length": 293.0875,
386
+ "completions/min_length": 213.8,
387
+ "completions/min_terminated_length": 213.8,
388
+ "entropy": 0.33867746219038963,
389
  "epoch": 0.0096,
390
+ "frac_reward_zero_std": 0.4,
391
+ "grad_norm": 0.07561016827821732,
392
+ "kl": 1.2898097231984138,
393
  "learning_rate": 9.950609717322956e-06,
394
+ "loss": -0.0007428391836583614,
395
+ "num_tokens": 1617445.0,
396
+ "reward": 0.5697500050067902,
397
+ "reward_std": 0.26551859080791473,
398
+ "rewards/env_goofspiel_reward/mean": 0.5697500050067902,
399
+ "rewards/env_goofspiel_reward/std": 0.3711147367954254,
400
+ "sampling/importance_sampling_ratio/max": 2.2963364124298096,
401
+ "sampling/importance_sampling_ratio/mean": 0.976230013370514,
402
+ "sampling/importance_sampling_ratio/min": 0.07447343468666076,
403
+ "sampling/sampling_logp_difference/max": 1.4920601844787598,
404
+ "sampling/sampling_logp_difference/mean": 0.1161011055111885,
405
  "step": 60,
406
+ "step_time": 5.45707285199951
407
  },
408
  {
409
+ "clip_ratio/high_max": 0.006066176481544972,
410
+ "clip_ratio/high_mean": 0.003033088240772486,
411
+ "clip_ratio/low_mean": 0.00883374186232686,
412
  "clip_ratio/low_min": 0.0,
413
+ "clip_ratio/region_mean": 0.011866830103099347,
414
  "completions/clipped_ratio": 0.0,
415
+ "completions/max_length": 373.8,
416
+ "completions/max_terminated_length": 373.8,
417
+ "completions/mean_length": 273.0875,
418
+ "completions/mean_terminated_length": 273.0875,
419
+ "completions/min_length": 199.6,
420
+ "completions/min_terminated_length": 199.6,
421
+ "entropy": 0.2858745433390141,
422
  "epoch": 0.0104,
423
+ "frac_reward_zero_std": 0.4625,
424
+ "grad_norm": 0.05500200390815735,
425
+ "kl": 0.8528277602046728,
426
  "learning_rate": 9.950595785216067e-06,
427
+ "loss": -0.0008720592595636845,
428
+ "num_tokens": 1746271.0,
429
+ "reward": 0.6673750162124634,
430
+ "reward_std": 0.2547352105379105,
431
+ "rewards/env_goofspiel_reward/mean": 0.6673750162124634,
432
+ "rewards/env_goofspiel_reward/std": 0.35124587416648867,
433
+ "sampling/importance_sampling_ratio/max": 2.0680487632751463,
434
+ "sampling/importance_sampling_ratio/mean": 1.0014554977416992,
435
+ "sampling/importance_sampling_ratio/min": 0.10332663245499134,
436
+ "sampling/sampling_logp_difference/max": 2.259426403045654,
437
+ "sampling/sampling_logp_difference/mean": 0.10407083481550217,
438
  "step": 65,
439
+ "step_time": 5.311621308399845
440
  },
441
  {
442
+ "clip_ratio/high_max": 0.01180555559694767,
443
+ "clip_ratio/high_mean": 0.005902777798473835,
444
+ "clip_ratio/low_mean": 0.005365742836147547,
445
  "clip_ratio/low_min": 0.0,
446
+ "clip_ratio/region_mean": 0.011268520634621382,
447
  "completions/clipped_ratio": 0.0,
448
+ "completions/max_length": 374.2,
449
+ "completions/max_terminated_length": 374.2,
450
+ "completions/mean_length": 279.90625,
451
+ "completions/mean_terminated_length": 279.90625,
452
+ "completions/min_length": 206.8,
453
+ "completions/min_terminated_length": 206.8,
454
+ "entropy": 0.24370604529976844,
455
  "epoch": 0.0112,
456
+ "frac_reward_zero_std": 0.475,
457
+ "grad_norm": 0.055401790887117386,
458
+ "kl": 1.1588016405701638,
459
  "learning_rate": 9.950579224432321e-06,
460
+ "loss": -0.0012944268994033337,
461
+ "num_tokens": 1878445.0,
462
+ "reward": 0.7311875104904175,
463
+ "reward_std": 0.24934353530406952,
464
+ "rewards/env_goofspiel_reward/mean": 0.7311875104904175,
465
+ "rewards/env_goofspiel_reward/std": 0.3471538543701172,
466
+ "sampling/importance_sampling_ratio/max": 2.0970391750335695,
467
+ "sampling/importance_sampling_ratio/mean": 0.9700004577636718,
468
+ "sampling/importance_sampling_ratio/min": 0.16687118411064147,
469
+ "sampling/sampling_logp_difference/max": 1.9643285274505615,
470
+ "sampling/sampling_logp_difference/mean": 0.10032870918512345,
471
  "step": 70,
472
+ "step_time": 5.481714394999836
473
  },
474
  {
475
  "clip_ratio/high_max": 0.0,
476
  "clip_ratio/high_mean": 0.0,
477
+ "clip_ratio/low_mean": 0.013843201845884324,
478
  "clip_ratio/low_min": 0.0,
479
+ "clip_ratio/region_mean": 0.013843201845884324,
480
  "completions/clipped_ratio": 0.0,
481
+ "completions/max_length": 374.4,
482
+ "completions/max_terminated_length": 374.4,
483
+ "completions/mean_length": 291.925,
484
+ "completions/mean_terminated_length": 291.925,
485
+ "completions/min_length": 199.6,
486
+ "completions/min_terminated_length": 199.6,
487
+ "entropy": 0.2260144092142582,
488
  "epoch": 0.012,
489
+ "frac_reward_zero_std": 0.475,
490
+ "grad_norm": 0.023790325969457626,
491
+ "kl": 4.869625084102154,
492
  "learning_rate": 9.950560034983382e-06,
493
+ "loss": -0.000660108495503664,
494
+ "num_tokens": 2013416.0,
495
+ "reward": 0.7499375104904175,
496
+ "reward_std": 0.26525343060493467,
497
+ "rewards/env_goofspiel_reward/mean": 0.7499375104904175,
498
+ "rewards/env_goofspiel_reward/std": 0.3587073266506195,
499
+ "sampling/importance_sampling_ratio/max": 2.49392306804657,
500
+ "sampling/importance_sampling_ratio/mean": 0.9810920596122742,
501
+ "sampling/importance_sampling_ratio/min": 0.07269524885341525,
502
+ "sampling/sampling_logp_difference/max": 2.6866398811340333,
503
+ "sampling/sampling_logp_difference/mean": 0.09891045838594437,
504
  "step": 75,
505
+ "step_time": 5.191597324399481
506
  },
507
  {
508
  "epoch": 0.012,
 
512
  "eval_clip_ratio/low_min": 0.0,
513
  "eval_clip_ratio/region_mean": 0.0,
514
  "eval_completions/clipped_ratio": 0.0,
515
+ "eval_completions/max_length": 290.0,
516
+ "eval_completions/max_terminated_length": 290.0,
517
+ "eval_completions/mean_length": 269.7,
518
+ "eval_completions/mean_terminated_length": 269.7,
519
+ "eval_completions/min_length": 249.4,
520
+ "eval_completions/min_terminated_length": 249.4,
521
+ "eval_entropy": 0.2245341122150421,
522
+ "eval_frac_reward_zero_std": 0.4,
523
+ "eval_kl": 0.6459955453872681,
524
+ "eval_loss": -0.00020105746807530522,
525
+ "eval_num_tokens": 2013416.0,
526
+ "eval_reward": 0.9000000357627869,
527
+ "eval_reward_std": 0.2545584440231323,
528
+ "eval_rewards/env_goofspiel_reward/mean": 0.9000000357627869,
529
+ "eval_rewards/env_goofspiel_reward/std": 0.32784610986709595,
530
+ "eval_runtime": 2.4336,
531
+ "eval_samples_per_second": 4.109,
532
+ "eval_sampling/importance_sampling_ratio/max": 1.3313236951828002,
533
+ "eval_sampling/importance_sampling_ratio/mean": 1.0307578563690185,
534
+ "eval_sampling/importance_sampling_ratio/min": 0.6750483691692353,
535
+ "eval_sampling/sampling_logp_difference/max": 0.6310659945011139,
536
+ "eval_sampling/sampling_logp_difference/mean": 0.06487421616911888,
537
+ "eval_steps_per_second": 1.233,
538
  "step": 75
539
  },
540
  {
541
+ "clip_ratio/high_max": 0.005441176518797874,
542
+ "clip_ratio/high_mean": 0.002720588259398937,
543
+ "clip_ratio/low_mean": 0.008915441203862428,
544
+ "clip_ratio/low_min": 0.0029411764815449716,
545
+ "clip_ratio/region_mean": 0.011636029463261366,
546
  "completions/clipped_ratio": 0.0,
547
+ "completions/max_length": 374.4,
548
+ "completions/max_terminated_length": 374.4,
549
+ "completions/mean_length": 283.0125,
550
+ "completions/mean_terminated_length": 283.0125,
551
+ "completions/min_length": 206.8,
552
+ "completions/min_terminated_length": 206.8,
553
+ "entropy": 0.2153526909649372,
554
  "epoch": 0.0128,
555
+ "frac_reward_zero_std": 0.4625,
556
+ "grad_norm": 0.07206813246011734,
557
+ "kl": 1.6877675473690033,
558
  "learning_rate": 9.95053821688277e-06,
559
+ "loss": -0.0009287594817578793,
560
+ "num_tokens": 2145819.0,
561
+ "reward": 0.7724375128746033,
562
+ "reward_std": 0.24404022693634034,
563
+ "rewards/env_goofspiel_reward/mean": 0.7724375128746033,
564
+ "rewards/env_goofspiel_reward/std": 0.34512641429901125,
565
+ "sampling/importance_sampling_ratio/max": 2.1818724155426024,
566
+ "sampling/importance_sampling_ratio/mean": 0.9245784163475037,
567
+ "sampling/importance_sampling_ratio/min": 0.13156846463680266,
568
+ "sampling/sampling_logp_difference/max": 1.658913779258728,
569
+ "sampling/sampling_logp_difference/mean": 0.10279093384742737,
570
  "step": 80,
571
+ "step_time": 5.441123167599471
572
  },
573
  {
574
+ "clip_ratio/high_max": 0.0029411764815449716,
575
+ "clip_ratio/high_mean": 0.0014705882407724858,
576
+ "clip_ratio/low_mean": 0.011678629741072654,
577
+ "clip_ratio/low_min": 0.0029411764815449716,
578
+ "clip_ratio/region_mean": 0.013149217981845141,
579
  "completions/clipped_ratio": 0.0,
580
+ "completions/max_length": 373.6,
581
+ "completions/max_terminated_length": 373.6,
582
+ "completions/mean_length": 287.66875,
583
+ "completions/mean_terminated_length": 287.66875,
584
+ "completions/min_length": 201.6,
585
+ "completions/min_terminated_length": 201.6,
586
+ "entropy": 0.2192211326211691,
587
  "epoch": 0.0136,
588
+ "frac_reward_zero_std": 0.3625,
589
+ "grad_norm": 0.049152813851833344,
590
+ "kl": 1.2483771443367004,
591
  "learning_rate": 9.950513770145857e-06,
592
+ "loss": -0.0017359975725412368,
593
+ "num_tokens": 2279605.0,
594
+ "reward": 0.8173750400543213,
595
+ "reward_std": 0.3077682375907898,
596
+ "rewards/env_goofspiel_reward/mean": 0.8173750400543213,
597
+ "rewards/env_goofspiel_reward/std": 0.4139078199863434,
598
+ "sampling/importance_sampling_ratio/max": 2.272643804550171,
599
+ "sampling/importance_sampling_ratio/mean": 0.9856367349624634,
600
+ "sampling/importance_sampling_ratio/min": 0.07546308934688568,
601
+ "sampling/sampling_logp_difference/max": 1.7427294969558715,
602
+ "sampling/sampling_logp_difference/mean": 0.118245729804039,
603
  "step": 85,
604
+ "step_time": 5.293755201000204
605
  },
606
  {
607
+ "clip_ratio/high_max": 0.005263157933950424,
608
+ "clip_ratio/high_mean": 0.002631578966975212,
609
+ "clip_ratio/low_mean": 0.010241443105041981,
610
+ "clip_ratio/low_min": 0.002631578966975212,
611
+ "clip_ratio/region_mean": 0.012873022072017192,
612
  "completions/clipped_ratio": 0.0,
613
+ "completions/max_length": 374.4,
614
+ "completions/max_terminated_length": 374.4,
615
+ "completions/mean_length": 297.4875,
616
+ "completions/mean_terminated_length": 297.4875,
617
+ "completions/min_length": 207.0,
618
+ "completions/min_terminated_length": 207.0,
619
+ "entropy": 0.19745248332619666,
620
  "epoch": 0.0144,
621
+ "frac_reward_zero_std": 0.375,
622
+ "grad_norm": 0.10243905335664749,
623
+ "kl": 1.8384233698248864,
624
  "learning_rate": 9.950486694789862e-06,
625
+ "loss": -0.0016141118481755256,
626
+ "num_tokens": 2415904.0,
627
+ "reward": 0.8171875357627869,
628
+ "reward_std": 0.28682020902633665,
629
+ "rewards/env_goofspiel_reward/mean": 0.8171875357627869,
630
+ "rewards/env_goofspiel_reward/std": 0.4000221848487854,
631
+ "sampling/importance_sampling_ratio/max": 1.893965482711792,
632
+ "sampling/importance_sampling_ratio/mean": 0.9034451127052308,
633
+ "sampling/importance_sampling_ratio/min": 0.10746023580431938,
634
+ "sampling/sampling_logp_difference/max": 2.5204232692718507,
635
+ "sampling/sampling_logp_difference/mean": 0.11648025661706925,
636
  "step": 90,
637
+ "step_time": 5.317576567400283
638
  },
639
  {
640
+ "clip_ratio/high_max": 0.001666666753590107,
641
+ "clip_ratio/high_mean": 0.0008333333767950535,
642
+ "clip_ratio/low_mean": 0.011362766660749913,
643
+ "clip_ratio/low_min": 0.002631578966975212,
644
+ "clip_ratio/region_mean": 0.012196100037544965,
645
  "completions/clipped_ratio": 0.0,
646
+ "completions/max_length": 374.6,
647
+ "completions/max_terminated_length": 374.6,
648
+ "completions/mean_length": 282.925,
649
+ "completions/mean_terminated_length": 282.925,
650
+ "completions/min_length": 199.6,
651
+ "completions/min_terminated_length": 199.6,
652
+ "entropy": 0.14891982078552246,
653
  "epoch": 0.0152,
654
+ "frac_reward_zero_std": 0.4625,
655
+ "grad_norm": 0.022762339562177658,
656
+ "kl": 1.1775232434272767,
657
  "learning_rate": 9.95045699083386e-06,
658
+ "loss": -0.00033199070021510123,
659
+ "num_tokens": 2548327.0,
660
+ "reward": 0.8961250424385071,
661
+ "reward_std": 0.28125171959400175,
662
+ "rewards/env_goofspiel_reward/mean": 0.8961250424385071,
663
+ "rewards/env_goofspiel_reward/std": 0.39407379031181333,
664
+ "sampling/importance_sampling_ratio/max": 2.3227984428405763,
665
+ "sampling/importance_sampling_ratio/mean": 0.9894420146942139,
666
+ "sampling/importance_sampling_ratio/min": 0.1872235879302025,
667
+ "sampling/sampling_logp_difference/max": 1.8188178777694701,
668
+ "sampling/sampling_logp_difference/mean": 0.0839442029595375,
669
  "step": 95,
670
+ "step_time": 5.388430412399975
671
  },
672
  {
673
  "clip_ratio/high_max": 0.0,
674
  "clip_ratio/high_mean": 0.0,
675
+ "clip_ratio/low_mean": 0.013613153528422118,
676
+ "clip_ratio/low_min": 0.0029411764815449716,
677
+ "clip_ratio/region_mean": 0.013613153528422118,
678
  "completions/clipped_ratio": 0.0,
679
+ "completions/max_length": 374.2,
680
+ "completions/max_terminated_length": 374.2,
681
+ "completions/mean_length": 285.875,
682
+ "completions/mean_terminated_length": 285.875,
683
+ "completions/min_length": 218.8,
684
+ "completions/min_terminated_length": 218.8,
685
+ "entropy": 0.16100836284458636,
686
  "epoch": 0.016,
687
+ "frac_reward_zero_std": 0.5,
688
+ "grad_norm": 0.03195883333683014,
689
+ "kl": 1.7148397326469422,
690
  "learning_rate": 9.950424658298776e-06,
691
+ "loss": -0.00081931222230196,
692
+ "num_tokens": 2682046.0,
693
+ "reward": 0.9037500023841858,
694
+ "reward_std": 0.23864853382110596,
695
+ "rewards/env_goofspiel_reward/mean": 0.9037500023841858,
696
+ "rewards/env_goofspiel_reward/std": 0.37243106961250305,
697
+ "sampling/importance_sampling_ratio/max": 1.8023416996002197,
698
+ "sampling/importance_sampling_ratio/mean": 0.8842753887176513,
699
+ "sampling/importance_sampling_ratio/min": 0.10478578060865402,
700
+ "sampling/sampling_logp_difference/max": 1.7476533651351929,
701
+ "sampling/sampling_logp_difference/mean": 0.09434186965227127,
702
  "step": 100,
703
+ "step_time": 5.138518711399956
704
  },
705
  {
706
  "clip_ratio/high_max": 0.0,
707
  "clip_ratio/high_mean": 0.0,
708
+ "clip_ratio/low_mean": 0.0045955882407724856,
709
  "clip_ratio/low_min": 0.0,
710
+ "clip_ratio/region_mean": 0.0045955882407724856,
711
  "completions/clipped_ratio": 0.0,
712
+ "completions/max_length": 374.4,
713
+ "completions/max_terminated_length": 374.4,
714
+ "completions/mean_length": 288.55,
715
+ "completions/mean_terminated_length": 288.55,
716
+ "completions/min_length": 212.0,
717
+ "completions/min_terminated_length": 212.0,
718
+ "entropy": 0.15357921570539473,
719
  "epoch": 0.0168,
720
+ "frac_reward_zero_std": 0.475,
721
+ "grad_norm": 0.02917584963142872,
722
+ "kl": 1.2142755538225174,
723
  "learning_rate": 9.950389697207388e-06,
724
+ "loss": -0.0009133240208029747,
725
+ "num_tokens": 2815770.0,
726
+ "reward": 0.9600000262260437,
727
+ "reward_std": 0.2545584440231323,
728
+ "rewards/env_goofspiel_reward/mean": 0.9600000262260437,
729
+ "rewards/env_goofspiel_reward/std": 0.34941941499710083,
730
+ "sampling/importance_sampling_ratio/max": 1.8894808292388916,
731
+ "sampling/importance_sampling_ratio/mean": 0.9526291608810424,
732
+ "sampling/importance_sampling_ratio/min": 0.12061886340379716,
733
+ "sampling/sampling_logp_difference/max": 1.9378384590148925,
734
+ "sampling/sampling_logp_difference/mean": 0.09047931358218193,
735
  "step": 105,
736
+ "step_time": 5.383052910200059
737
  },
738
  {
739
+ "clip_ratio/high_max": 0.002631578966975212,
740
+ "clip_ratio/high_mean": 0.001315789483487606,
741
+ "clip_ratio/low_mean": 0.014272123482078314,
742
+ "clip_ratio/low_min": 0.002631578966975212,
743
+ "clip_ratio/region_mean": 0.01558791296556592,
744
  "completions/clipped_ratio": 0.0,
745
+ "completions/max_length": 374.6,
746
+ "completions/max_terminated_length": 374.6,
747
+ "completions/mean_length": 287.1375,
748
+ "completions/mean_terminated_length": 287.1375,
749
+ "completions/min_length": 207.0,
750
+ "completions/min_terminated_length": 207.0,
751
+ "entropy": 0.11877087596803904,
752
  "epoch": 0.0176,
753
+ "frac_reward_zero_std": 0.575,
754
+ "grad_norm": 0.03493434935808182,
755
+ "kl": 1.660761559009552,
756
  "learning_rate": 9.950352107584324e-06,
757
+ "loss": -0.0012725419364869595,
758
+ "num_tokens": 2950253.0,
759
+ "reward": 0.9823125243186951,
760
+ "reward_std": 0.22300379872322082,
761
+ "rewards/env_goofspiel_reward/mean": 0.9823125243186951,
762
+ "rewards/env_goofspiel_reward/std": 0.34692143797874453,
763
+ "sampling/importance_sampling_ratio/max": 2.0992087841033937,
764
+ "sampling/importance_sampling_ratio/mean": 0.9242552995681763,
765
+ "sampling/importance_sampling_ratio/min": 0.0625721976161003,
766
+ "sampling/sampling_logp_difference/max": 2.7260493278503417,
767
+ "sampling/sampling_logp_difference/mean": 0.11037325486540794,
768
  "step": 110,
769
+ "step_time": 5.269659819400658
770
  },
771
  {
772
+ "clip_ratio/high_max": 0.0029411764815449716,
773
+ "clip_ratio/high_mean": 0.0014705882407724858,
774
+ "clip_ratio/low_mean": 0.010151143837720155,
775
+ "clip_ratio/low_min": 0.003125,
776
+ "clip_ratio/region_mean": 0.011621732078492642,
777
  "completions/clipped_ratio": 0.0,
778
+ "completions/max_length": 374.2,
779
+ "completions/max_terminated_length": 374.2,
780
+ "completions/mean_length": 293.24375,
781
+ "completions/mean_terminated_length": 293.24375,
782
+ "completions/min_length": 212.0,
783
+ "completions/min_terminated_length": 212.0,
784
+ "entropy": 0.11200540419667959,
785
  "epoch": 0.0184,
786
+ "frac_reward_zero_std": 0.7125,
787
+ "grad_norm": 0.020553065463900566,
788
+ "kl": 1.3526975795626641,
789
  "learning_rate": 9.950311889456064e-06,
790
+ "loss": -0.0005503765307366848,
791
+ "num_tokens": 3085187.0,
792
+ "reward": 1.0274375319480895,
793
+ "reward_std": 0.14858081489801406,
794
+ "rewards/env_goofspiel_reward/mean": 1.0274375319480895,
795
+ "rewards/env_goofspiel_reward/std": 0.324732506275177,
796
+ "sampling/importance_sampling_ratio/max": 1.7125876188278197,
797
+ "sampling/importance_sampling_ratio/mean": 0.9561464786529541,
798
+ "sampling/importance_sampling_ratio/min": 0.10175237003713847,
799
+ "sampling/sampling_logp_difference/max": 2.038061809539795,
800
+ "sampling/sampling_logp_difference/mean": 0.07954240888357163,
801
  "step": 115,
802
+ "step_time": 5.627786207199643
803
  },
804
  {
805
  "clip_ratio/high_max": 0.0,
806
  "clip_ratio/high_mean": 0.0,
807
+ "clip_ratio/low_mean": 0.009591933339834213,
808
+ "clip_ratio/low_min": 0.002500000037252903,
809
+ "clip_ratio/region_mean": 0.009591933339834213,
810
  "completions/clipped_ratio": 0.0,
811
+ "completions/max_length": 374.8,
812
+ "completions/max_terminated_length": 374.8,
813
+ "completions/mean_length": 285.71875,
814
+ "completions/mean_terminated_length": 285.71875,
815
+ "completions/min_length": 212.0,
816
+ "completions/min_terminated_length": 212.0,
817
+ "entropy": 0.10979099776595831,
818
  "epoch": 0.0192,
819
+ "frac_reward_zero_std": 0.55,
820
+ "grad_norm": 0.0316302515566349,
821
+ "kl": 4.151332414150238,
822
  "learning_rate": 9.950269042850943e-06,
823
+ "loss": -0.0010401386767625808,
824
+ "num_tokens": 3218750.0,
825
+ "reward": 1.0012500524520873,
826
+ "reward_std": 0.20682872831821442,
827
+ "rewards/env_goofspiel_reward/mean": 1.0012500524520873,
828
+ "rewards/env_goofspiel_reward/std": 0.3268236696720123,
829
+ "sampling/importance_sampling_ratio/max": 1.743626070022583,
830
+ "sampling/importance_sampling_ratio/mean": 0.9425964832305909,
831
+ "sampling/importance_sampling_ratio/min": 0.10384658798575401,
832
+ "sampling/sampling_logp_difference/max": 2.185199999809265,
833
+ "sampling/sampling_logp_difference/mean": 0.09657576829195022,
834
  "step": 120,
835
+ "step_time": 5.24430227939938
836
  },
837
  {
838
  "clip_ratio/high_max": 0.0,
839
  "clip_ratio/high_mean": 0.0,
840
+ "clip_ratio/low_mean": 0.005753676500171423,
841
  "clip_ratio/low_min": 0.0,
842
+ "clip_ratio/region_mean": 0.005753676500171423,
843
  "completions/clipped_ratio": 0.0,
844
+ "completions/max_length": 374.6,
845
+ "completions/max_terminated_length": 374.6,
846
+ "completions/mean_length": 292.33125,
847
+ "completions/mean_terminated_length": 292.33125,
848
+ "completions/min_length": 207.0,
849
+ "completions/min_terminated_length": 207.0,
850
+ "entropy": 0.22348260767757894,
851
  "epoch": 0.02,
852
+ "frac_reward_zero_std": 0.4625,
853
+ "grad_norm": 0.046369779855012894,
854
+ "kl": 1.2617603421211243,
855
  "learning_rate": 9.95022356779914e-06,
856
+ "loss": -0.0007782015483826399,
857
+ "num_tokens": 3353835.0,
858
+ "reward": 0.8960625171661377,
859
+ "reward_std": 0.2599501311779022,
860
+ "rewards/env_goofspiel_reward/mean": 0.8960625171661377,
861
+ "rewards/env_goofspiel_reward/std": 0.4054258644580841,
862
+ "sampling/importance_sampling_ratio/max": 1.864054226875305,
863
+ "sampling/importance_sampling_ratio/mean": 0.9797484517097473,
864
+ "sampling/importance_sampling_ratio/min": 0.16405612826347352,
865
+ "sampling/sampling_logp_difference/max": 1.281555461883545,
866
+ "sampling/sampling_logp_difference/mean": 0.09155683666467666,
867
  "step": 125,
868
+ "step_time": 5.507626709400029
869
  },
870
  {
871
+ "clip_ratio/high_max": 0.004777777753770351,
872
+ "clip_ratio/high_mean": 0.0032222223468124866,
873
+ "clip_ratio/low_mean": 0.008599499333649873,
874
+ "clip_ratio/low_min": 0.0016129031777381898,
875
+ "clip_ratio/region_mean": 0.01182172168046236,
876
  "completions/clipped_ratio": 0.0,
877
+ "completions/max_length": 516.8,
878
+ "completions/max_terminated_length": 516.8,
879
+ "completions/mean_length": 405.09375,
880
+ "completions/mean_terminated_length": 405.09375,
881
+ "completions/min_length": 285.2,
882
+ "completions/min_terminated_length": 285.2,
883
+ "entropy": 0.2538205787539482,
884
  "epoch": 0.0208,
885
+ "frac_reward_zero_std": 0.375,
886
+ "grad_norm": 0.03732698783278465,
887
+ "kl": 1.2245262771844865,
888
  "learning_rate": 9.950175464332696e-06,
889
+ "loss": -0.001604497991502285,
890
+ "num_tokens": 3506594.0,
891
+ "reward": 0.8659999966621399,
892
+ "reward_std": 0.31071450710296633,
893
+ "rewards/env_goofspiel_reward/mean": 0.8659999966621399,
894
+ "rewards/env_goofspiel_reward/std": 0.4219347298145294,
895
+ "sampling/importance_sampling_ratio/max": 2.1955170392990113,
896
+ "sampling/importance_sampling_ratio/mean": 0.9442144870758057,
897
+ "sampling/importance_sampling_ratio/min": 0.11454989165067672,
898
+ "sampling/sampling_logp_difference/max": 1.5350213170051574,
899
+ "sampling/sampling_logp_difference/mean": 0.10076762735843658,
900
  "step": 130,
901
+ "step_time": 6.450475099400319
902
  },
903
  {
904
  "clip_ratio/high_max": 0.0,
905
  "clip_ratio/high_mean": 0.0,
906
+ "clip_ratio/low_mean": 0.009226978290826082,
907
+ "clip_ratio/low_min": 0.0018518518656492234,
908
+ "clip_ratio/region_mean": 0.009226978290826082,
909
  "completions/clipped_ratio": 0.0,
910
+ "completions/max_length": 732.6,
911
+ "completions/max_terminated_length": 732.6,
912
+ "completions/mean_length": 546.55,
913
+ "completions/mean_terminated_length": 546.55,
914
+ "completions/min_length": 408.0,
915
+ "completions/min_terminated_length": 408.0,
916
+ "entropy": 0.23710975274443627,
917
  "epoch": 0.0216,
918
+ "frac_reward_zero_std": 0.3375,
919
+ "grad_norm": 0.06111254543066025,
920
+ "kl": 1.2316449135541916,
921
  "learning_rate": 9.950124732485496e-06,
922
+ "loss": -0.0011480608955025673,
923
+ "num_tokens": 3681614.0,
924
+ "reward": 0.8281875014305115,
925
+ "reward_std": 0.2840506821870804,
926
+ "rewards/env_goofspiel_reward/mean": 0.8281875014305115,
927
+ "rewards/env_goofspiel_reward/std": 0.42113742232322693,
928
+ "sampling/importance_sampling_ratio/max": 2.122027373313904,
929
+ "sampling/importance_sampling_ratio/mean": 0.9814165592193603,
930
+ "sampling/importance_sampling_ratio/min": 0.08027622438967227,
931
+ "sampling/sampling_logp_difference/max": 1.9004024505615233,
932
+ "sampling/sampling_logp_difference/mean": 0.08553826659917832,
933
  "step": 135,
934
+ "step_time": 7.9832679632005235
935
  },
936
  {
937
+ "clip_ratio/high_max": 0.003724137879908085,
938
+ "clip_ratio/high_mean": 0.0018620689399540425,
939
+ "clip_ratio/low_mean": 0.004548311699181795,
940
  "clip_ratio/low_min": 0.0,
941
+ "clip_ratio/region_mean": 0.006410380732268095,
942
  "completions/clipped_ratio": 0.0,
943
+ "completions/max_length": 731.8,
944
+ "completions/max_terminated_length": 731.8,
945
+ "completions/mean_length": 582.2,
946
+ "completions/mean_terminated_length": 582.2,
947
+ "completions/min_length": 408.0,
948
+ "completions/min_terminated_length": 408.0,
949
+ "entropy": 0.1793461933732033,
950
  "epoch": 0.0224,
951
+ "frac_reward_zero_std": 0.375,
952
+ "grad_norm": 0.03889832645654678,
953
+ "kl": 2.291236698627472,
954
  "learning_rate": 9.95007137229328e-06,
955
+ "loss": -0.0022148976102471353,
956
+ "num_tokens": 3864515.0,
957
+ "reward": 0.9315416216850281,
958
+ "reward_std": 0.3372309923171997,
959
+ "rewards/env_goofspiel_reward/mean": 0.9315416216850281,
960
+ "rewards/env_goofspiel_reward/std": 0.450560998916626,
961
+ "sampling/importance_sampling_ratio/max": 2.064964485168457,
962
+ "sampling/importance_sampling_ratio/mean": 0.9481699466705322,
963
+ "sampling/importance_sampling_ratio/min": 0.06474384777247906,
964
+ "sampling/sampling_logp_difference/max": 2.467247152328491,
965
+ "sampling/sampling_logp_difference/mean": 0.09816154837608337,
966
  "step": 140,
967
+ "step_time": 8.106385086600312
968
  },
969
  {
970
+ "clip_ratio/high_max": 0.005647214874625206,
971
+ "clip_ratio/high_mean": 0.002823607437312603,
972
+ "clip_ratio/low_mean": 0.008411859162151814,
973
+ "clip_ratio/low_min": 0.001923076994717121,
974
+ "clip_ratio/region_mean": 0.011235466599464417,
975
  "completions/clipped_ratio": 0.0,
976
+ "completions/max_length": 732.2,
977
+ "completions/max_terminated_length": 732.2,
978
+ "completions/mean_length": 559.74375,
979
+ "completions/mean_terminated_length": 559.74375,
980
+ "completions/min_length": 395.8,
981
+ "completions/min_terminated_length": 395.8,
982
+ "entropy": 0.1993610180914402,
983
  "epoch": 0.0232,
984
+ "frac_reward_zero_std": 0.375,
985
+ "grad_norm": 0.031588390469551086,
986
+ "kl": 1.1563616126775742,
987
  "learning_rate": 9.950015383793636e-06,
988
+ "loss": -0.002377602644264698,
989
+ "num_tokens": 4041287.0,
990
+ "reward": 0.9530208110809326,
991
+ "reward_std": 0.27621357440948485,
992
+ "rewards/env_goofspiel_reward/mean": 0.9530208110809326,
993
+ "rewards/env_goofspiel_reward/std": 0.40836617946624754,
994
+ "sampling/importance_sampling_ratio/max": 2.1861939907073973,
995
+ "sampling/importance_sampling_ratio/mean": 0.9681573390960694,
996
+ "sampling/importance_sampling_ratio/min": 0.05116618424654007,
997
+ "sampling/sampling_logp_difference/max": 2.082305932044983,
998
+ "sampling/sampling_logp_difference/mean": 0.08064938932657242,
999
  "step": 145,
1000
+ "step_time": 7.88830038939941
1001
  },
1002
  {
1003
+ "clip_ratio/high_max": 0.003750000149011612,
1004
+ "clip_ratio/high_mean": 0.001875000074505806,
1005
+ "clip_ratio/low_mean": 0.00870660375803709,
1006
  "clip_ratio/low_min": 0.0,
1007
+ "clip_ratio/region_mean": 0.010581603739410639,
1008
  "completions/clipped_ratio": 0.0,
1009
+ "completions/max_length": 730.4,
1010
+ "completions/max_terminated_length": 730.4,
1011
+ "completions/mean_length": 545.94375,
1012
+ "completions/mean_terminated_length": 545.94375,
1013
+ "completions/min_length": 388.4,
1014
+ "completions/min_terminated_length": 388.4,
1015
+ "entropy": 0.17713614292442798,
1016
  "epoch": 0.024,
1017
+ "frac_reward_zero_std": 0.3375,
1018
+ "grad_norm": 0.026577942073345184,
1019
+ "kl": 1.0718628287315368,
1020
  "learning_rate": 9.949956767026006e-06,
1021
+ "loss": -0.0016981028020381927,
1022
+ "num_tokens": 4215342.0,
1023
+ "reward": 0.9501249670982361,
1024
+ "reward_std": 0.31861052215099334,
1025
+ "rewards/env_goofspiel_reward/mean": 0.9501249670982361,
1026
+ "rewards/env_goofspiel_reward/std": 0.41967723369598386,
1027
+ "sampling/importance_sampling_ratio/max": 2.1912062644958494,
1028
+ "sampling/importance_sampling_ratio/mean": 1.0014893412590027,
1029
+ "sampling/importance_sampling_ratio/min": 0.08548425957560539,
1030
+ "sampling/sampling_logp_difference/max": 1.9367387771606446,
1031
+ "sampling/sampling_logp_difference/mean": 0.07785529419779777,
1032
  "step": 150,
1033
+ "step_time": 7.899470061399915
1034
  },
1035
  {
1036
  "epoch": 0.024,
 
1040
  "eval_clip_ratio/low_min": 0.0,
1041
  "eval_clip_ratio/region_mean": 0.0,
1042
  "eval_completions/clipped_ratio": 0.0,
1043
+ "eval_completions/max_length": 563.4,
1044
+ "eval_completions/max_terminated_length": 563.4,
1045
+ "eval_completions/mean_length": 522.9,
1046
+ "eval_completions/mean_terminated_length": 522.9,
1047
+ "eval_completions/min_length": 483.2,
1048
+ "eval_completions/min_terminated_length": 483.2,
1049
+ "eval_entropy": 0.12081696838140488,
1050
+ "eval_frac_reward_zero_std": 0.7,
1051
+ "eval_kl": 1.9419716477394104,
1052
+ "eval_loss": -0.0004246043972671032,
1053
+ "eval_num_tokens": 4215342.0,
1054
+ "eval_reward": 1.1049999594688416,
1055
+ "eval_reward_std": 0.15320646464824678,
1056
+ "eval_rewards/env_goofspiel_reward/mean": 1.1049999594688416,
1057
+ "eval_rewards/env_goofspiel_reward/std": 0.2007630228996277,
1058
+ "eval_runtime": 3.0894,
1059
+ "eval_samples_per_second": 3.237,
1060
+ "eval_sampling/importance_sampling_ratio/max": 1.3710826635360718,
1061
+ "eval_sampling/importance_sampling_ratio/mean": 0.9819563269615174,
1062
+ "eval_sampling/importance_sampling_ratio/min": 0.5958439499139786,
1063
+ "eval_sampling/sampling_logp_difference/max": 0.8319960534572601,
1064
+ "eval_sampling/sampling_logp_difference/mean": 0.0594001155346632,
1065
+ "eval_steps_per_second": 0.971,
1066
  "step": 150
1067
  },
1068
  {
1069
+ "clip_ratio/high_max": 0.0,
1070
+ "clip_ratio/high_mean": 0.0,
1071
+ "clip_ratio/low_mean": 0.005752738565206528,
1072
+ "clip_ratio/low_min": 0.0,
1073
+ "clip_ratio/region_mean": 0.005752738565206528,
1074
+ "completions/clipped_ratio": 0.0,
1075
+ "completions/max_length": 715.0,
1076
+ "completions/max_terminated_length": 715.0,
1077
+ "completions/mean_length": 553.19375,
1078
+ "completions/mean_terminated_length": 553.19375,
1079
+ "completions/min_length": 422.4,
1080
+ "completions/min_terminated_length": 422.4,
1081
+ "entropy": 0.14132860042154788,
1082
+ "epoch": 0.0248,
1083
+ "frac_reward_zero_std": 0.3375,
1084
+ "grad_norm": 0.09626387804746628,
1085
+ "kl": 2.062944608926773,
1086
+ "learning_rate": 9.949895522031688e-06,
1087
+ "loss": -0.0018732130527496337,
1088
+ "num_tokens": 4391001.0,
1089
+ "reward": 0.9909374356269837,
1090
+ "reward_std": 0.29919453859329226,
1091
+ "rewards/env_goofspiel_reward/mean": 0.9909374356269837,
1092
+ "rewards/env_goofspiel_reward/std": 0.4077799320220947,
1093
+ "sampling/importance_sampling_ratio/max": 2.0831751585006715,
1094
+ "sampling/importance_sampling_ratio/mean": 0.9425314784049987,
1095
+ "sampling/importance_sampling_ratio/min": 0.059898375906050204,
1096
+ "sampling/sampling_logp_difference/max": 2.2892024993896483,
1097
+ "sampling/sampling_logp_difference/mean": 0.08296994566917419,
1098
+ "step": 155,
1099
+ "step_time": 7.558112652399904
1100
+ },
1101
+ {
1102
+ "epoch": 0.02496,
1103
  "eval_clip_ratio/high_max": 0.0,
1104
  "eval_clip_ratio/high_mean": 0.0,
1105
  "eval_clip_ratio/low_mean": 0.0,
1106
  "eval_clip_ratio/low_min": 0.0,
1107
  "eval_clip_ratio/region_mean": 0.0,
1108
  "eval_completions/clipped_ratio": 0.0,
1109
+ "eval_completions/max_length": 563.0,
1110
+ "eval_completions/max_terminated_length": 563.0,
1111
+ "eval_completions/mean_length": 523.1,
1112
+ "eval_completions/mean_terminated_length": 523.1,
1113
+ "eval_completions/min_length": 483.6,
1114
+ "eval_completions/min_terminated_length": 483.6,
1115
+ "eval_entropy": 0.09278524518013001,
1116
+ "eval_frac_reward_zero_std": 0.6,
1117
+ "eval_kl": 1.07927747964859,
1118
+ "eval_loss": 6.382538413163275e-05,
1119
+ "eval_num_tokens": 4427509.0,
1120
+ "eval_reward": 1.1483332872390748,
1121
+ "eval_reward_std": 0.15320646464824678,
1122
+ "eval_rewards/env_goofspiel_reward/mean": 1.1483332872390748,
1123
+ "eval_rewards/env_goofspiel_reward/std": 0.20928734838962554,
1124
+ "eval_runtime": 3.1042,
1125
+ "eval_samples_per_second": 3.221,
1126
+ "eval_sampling/importance_sampling_ratio/max": 1.3094272136688232,
1127
+ "eval_sampling/importance_sampling_ratio/mean": 1.0892619371414185,
1128
+ "eval_sampling/importance_sampling_ratio/min": 0.8267600655555725,
1129
+ "eval_sampling/sampling_logp_difference/max": 0.34806744158267977,
1130
+ "eval_sampling/sampling_logp_difference/mean": 0.03115060944110155,
1131
+ "eval_steps_per_second": 0.966,
1132
+ "step": 156
1133
  }
1134
  ],
1135
  "logging_steps": 5,
1136
  "max_steps": 18750,
1137
+ "num_input_tokens_seen": 4427509,
1138
  "num_train_epochs": 3,
1139
  "save_steps": 500,
1140
  "stateful_callbacks": {
 
1150
  }
1151
  },
1152
  "total_flos": 0.0,
1153
+ "train_batch_size": 4,
1154
  "trial_name": null,
1155
  "trial_params": null
1156
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1b300496f72b512d9eb82d58bc70e9cfecf1e6725146e612c791121039cde76d
3
  size 7185
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:906bc07f18d85f3fdbe47d01e60bbe6f967852d19caecc88d502ce07c5e4aa78
3
  size 7185