Gege24 commited on
Commit
1802ce3
·
verified ·
1 Parent(s): a807cb0

Upload task output 1

Browse files
Files changed (4) hide show
  1. loss.txt +1 -1
  2. model.safetensors +1 -1
  3. trainer_state.json +370 -370
  4. training_args.bin +1 -1
loss.txt CHANGED
@@ -1 +1 @@
1
- 75,-0.06899999883025884
 
1
+ 75,-0.12000000476837158
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d37254765789231f8a3a2350286dac9a3200c0359f893e954337a3ad0438489f
3
  size 988097824
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d495dcb537ad13a85d3c74235ea2ad8f46917cd31620e9644884473efb2fcc9d
3
  size 988097824
trainer_state.json CHANGED
@@ -2,7 +2,7 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.01875,
6
  "eval_steps": 500,
7
  "global_step": 75,
8
  "is_hyper_param_search": false,
@@ -16,31 +16,31 @@
16
  "clip_ratio/low_min": 0.0,
17
  "clip_ratio/region_mean": 0.0,
18
  "completions/clipped_ratio": 0.0,
19
- "completions/max_length": 374.4,
20
- "completions/max_terminated_length": 374.4,
21
- "completions/mean_length": 290.6960021972656,
22
- "completions/mean_terminated_length": 290.6960021972656,
23
- "completions/min_length": 167.6,
24
- "completions/min_terminated_length": 167.6,
25
- "entropy": 0.7587062597274781,
26
- "epoch": 0.00125,
27
- "frac_reward_zero_std": 0.6159999966621399,
28
- "grad_norm": 0.32421875,
29
- "kl": 0.007210364751517773,
30
  "learning_rate": 1.137216e-06,
31
- "loss": -0.000457207765430212,
32
- "num_tokens": 211069.0,
33
- "reward": 0.06548000425100327,
34
- "reward_std": 0.0959968164563179,
35
- "rewards/env_goofspiel_reward/mean": 0.06548000574111938,
36
- "rewards/env_goofspiel_reward/std": 0.20242175161838533,
37
- "sampling/importance_sampling_ratio/max": 1.9353477716445924,
38
- "sampling/importance_sampling_ratio/mean": 1.0085526347160338,
39
- "sampling/importance_sampling_ratio/min": 0.5233797252178192,
40
- "sampling/sampling_logp_difference/max": 0.6646340131759644,
41
- "sampling/sampling_logp_difference/mean": 0.06287899985909462,
42
  "step": 5,
43
- "step_time": 5.862163845000214
44
  },
45
  {
46
  "clip_ratio/high_max": 0.0,
@@ -51,29 +51,29 @@
51
  "completions/clipped_ratio": 0.0,
52
  "completions/max_length": 373.8,
53
  "completions/max_terminated_length": 373.8,
54
- "completions/mean_length": 285.77599182128904,
55
- "completions/mean_terminated_length": 285.77599182128904,
56
- "completions/min_length": 187.0,
57
- "completions/min_terminated_length": 187.0,
58
- "entropy": 0.7223453521728516,
59
- "epoch": 0.0025,
60
- "frac_reward_zero_std": 0.6879999876022339,
61
- "grad_norm": 0.373046875,
62
- "kl": 0.01190796154551208,
63
  "learning_rate": 2.5587359999999995e-06,
64
- "loss": 0.0004954038187861443,
65
- "num_tokens": 419291.0,
66
- "reward": 0.06636000201106071,
67
- "reward_std": 0.08932172805070877,
68
- "rewards/env_goofspiel_reward/mean": 0.06636000350117684,
69
- "rewards/env_goofspiel_reward/std": 0.20822030901908875,
70
- "sampling/importance_sampling_ratio/max": 1.9151880025863648,
71
- "sampling/importance_sampling_ratio/mean": 1.00549418926239,
72
- "sampling/importance_sampling_ratio/min": 0.5084045708179474,
73
- "sampling/sampling_logp_difference/max": 0.8484642028808593,
74
- "sampling/sampling_logp_difference/mean": 0.05378681421279907,
75
  "step": 10,
76
- "step_time": 5.392790380800034
77
  },
78
  {
79
  "clip_ratio/high_max": 0.0,
@@ -82,31 +82,31 @@
82
  "clip_ratio/low_min": 0.0,
83
  "clip_ratio/region_mean": 0.0,
84
  "completions/clipped_ratio": 0.0,
85
- "completions/max_length": 374.0,
86
- "completions/max_terminated_length": 374.0,
87
- "completions/mean_length": 286.59998779296876,
88
- "completions/mean_terminated_length": 286.59998779296876,
89
- "completions/min_length": 186.4,
90
- "completions/min_terminated_length": 186.4,
91
- "entropy": 0.7013472616672516,
92
- "epoch": 0.00375,
93
- "frac_reward_zero_std": 0.743999981880188,
94
- "grad_norm": 0.25,
95
- "kl": 0.03683878714218736,
96
  "learning_rate": 3.9802559999999995e-06,
97
- "loss": 0.00018892403459176422,
98
- "num_tokens": 629381.0,
99
- "reward": 0.05708000063896179,
100
- "reward_std": 0.0819678172469139,
101
- "rewards/env_goofspiel_reward/mean": 0.05708000287413597,
102
- "rewards/env_goofspiel_reward/std": 0.19145022630691527,
103
- "sampling/importance_sampling_ratio/max": 1.9665297031402589,
104
- "sampling/importance_sampling_ratio/mean": 1.0253857135772706,
105
- "sampling/importance_sampling_ratio/min": 0.538357138633728,
106
- "sampling/sampling_logp_difference/max": 0.7756662368774414,
107
- "sampling/sampling_logp_difference/mean": 0.06448477879166603,
108
  "step": 15,
109
- "step_time": 5.349944851799956
110
  },
111
  {
112
  "clip_ratio/high_max": 0.0,
@@ -115,31 +115,31 @@
115
  "clip_ratio/low_min": 0.0,
116
  "clip_ratio/region_mean": 0.0,
117
  "completions/clipped_ratio": 0.0,
118
- "completions/max_length": 374.8,
119
- "completions/max_terminated_length": 374.8,
120
- "completions/mean_length": 291.7519897460937,
121
- "completions/mean_terminated_length": 291.7519897460937,
122
  "completions/min_length": 199.6,
123
  "completions/min_terminated_length": 199.6,
124
- "entropy": 0.7047547340393067,
125
- "epoch": 0.005,
126
- "frac_reward_zero_std": 0.7519999742507935,
127
- "grad_norm": 0.294921875,
128
- "kl": 0.08839112929999829,
129
  "learning_rate": 5.401775999999999e-06,
130
- "loss": -2.657829609233886e-05,
131
- "num_tokens": 840556.0,
132
- "reward": 0.06196000054478645,
133
- "reward_std": 0.07529272958636284,
134
- "rewards/env_goofspiel_reward/mean": 0.061960003525018695,
135
- "rewards/env_goofspiel_reward/std": 0.19518478214740753,
136
- "sampling/importance_sampling_ratio/max": 1.6097611427307128,
137
- "sampling/importance_sampling_ratio/mean": 0.9887367606163024,
138
- "sampling/importance_sampling_ratio/min": 0.46161189675331116,
139
- "sampling/sampling_logp_difference/max": 0.6100110828876495,
140
- "sampling/sampling_logp_difference/mean": 0.058672596514225,
141
  "step": 20,
142
- "step_time": 5.242961023999942
143
  },
144
  {
145
  "clip_ratio/high_max": 0.0,
@@ -148,31 +148,31 @@
148
  "clip_ratio/low_min": 0.0,
149
  "clip_ratio/region_mean": 0.0,
150
  "completions/clipped_ratio": 0.0,
151
- "completions/max_length": 374.2,
152
- "completions/max_terminated_length": 374.2,
153
- "completions/mean_length": 287.9799865722656,
154
- "completions/mean_terminated_length": 287.9799865722656,
155
- "completions/min_length": 194.4,
156
- "completions/min_terminated_length": 194.4,
157
- "entropy": 0.6883363842964172,
158
- "epoch": 0.00625,
159
- "frac_reward_zero_std": 0.7599999785423279,
160
- "grad_norm": 0.1826171875,
161
- "kl": 0.13179944828152657,
162
  "learning_rate": 6.8232959999999994e-06,
163
- "loss": 6.652346346527337e-05,
164
- "num_tokens": 1049402.0,
165
- "reward": 0.07416000291705131,
166
- "reward_std": 0.10555689632892609,
167
- "rewards/env_goofspiel_reward/mean": 0.07416000291705131,
168
- "rewards/env_goofspiel_reward/std": 0.22879809141159058,
169
- "sampling/importance_sampling_ratio/max": 1.9118820905685425,
170
- "sampling/importance_sampling_ratio/mean": 0.9868301272392273,
171
- "sampling/importance_sampling_ratio/min": 0.6072056829929352,
172
- "sampling/sampling_logp_difference/max": 0.5817384719848633,
173
- "sampling/sampling_logp_difference/mean": 0.05620769709348679,
174
  "step": 25,
175
- "step_time": 5.260266781999962
176
  },
177
  {
178
  "clip_ratio/high_max": 0.0,
@@ -181,31 +181,31 @@
181
  "clip_ratio/low_min": 0.0,
182
  "clip_ratio/region_mean": 0.0,
183
  "completions/clipped_ratio": 0.0,
184
- "completions/max_length": 416.8,
185
- "completions/max_terminated_length": 416.8,
186
- "completions/mean_length": 299.07598876953125,
187
- "completions/mean_terminated_length": 299.07598876953125,
188
  "completions/min_length": 212.0,
189
  "completions/min_terminated_length": 212.0,
190
- "entropy": 0.4900937914848328,
191
- "epoch": 0.0075,
192
- "frac_reward_zero_std": 0.7599999785423279,
193
- "grad_norm": 0.1318359375,
194
- "kl": 0.13062527999281884,
195
  "learning_rate": 8.244816e-06,
196
- "loss": 0.00041568251326680184,
197
- "num_tokens": 1263196.0,
198
- "reward": 0.07428000047802925,
199
- "reward_std": 0.10527405217289924,
200
- "rewards/env_goofspiel_reward/mean": 0.07428000047802925,
201
- "rewards/env_goofspiel_reward/std": 0.21391310691833496,
202
- "sampling/importance_sampling_ratio/max": 1.9750086545944214,
203
- "sampling/importance_sampling_ratio/mean": 1.0017020225524902,
204
- "sampling/importance_sampling_ratio/min": 0.4652911275625229,
205
- "sampling/sampling_logp_difference/max": 0.7616503953933715,
206
- "sampling/sampling_logp_difference/mean": 0.048386485874652864,
207
  "step": 30,
208
- "step_time": 5.606105185600154
209
  },
210
  {
211
  "clip_ratio/high_max": 0.0,
@@ -216,29 +216,29 @@
216
  "completions/clipped_ratio": 0.0,
217
  "completions/max_length": 374.0,
218
  "completions/max_terminated_length": 374.0,
219
- "completions/mean_length": 290.9440002441406,
220
- "completions/mean_terminated_length": 290.9440002441406,
221
- "completions/min_length": 212.0,
222
- "completions/min_terminated_length": 212.0,
223
- "entropy": 0.2889633044600487,
224
- "epoch": 0.00875,
225
- "frac_reward_zero_std": 0.8479999780654908,
226
- "grad_norm": 0.0859375,
227
- "kl": 0.22455788254737855,
228
  "learning_rate": 9.666336e-06,
229
- "loss": 6.903421599417925e-05,
230
- "num_tokens": 1474071.0,
231
- "reward": 0.052800000086426734,
232
- "reward_std": 0.06788224689662456,
233
- "rewards/env_goofspiel_reward/mean": 0.052800000086426734,
234
- "rewards/env_goofspiel_reward/std": 0.16736633479595184,
235
- "sampling/importance_sampling_ratio/max": 1.6834668874740601,
236
- "sampling/importance_sampling_ratio/mean": 0.9958889007568359,
237
- "sampling/importance_sampling_ratio/min": 0.43778104782104493,
238
- "sampling/sampling_logp_difference/max": 0.8263626098632812,
239
- "sampling/sampling_logp_difference/mean": 0.0358995582908392,
240
  "step": 35,
241
- "step_time": 5.237710429800063
242
  },
243
  {
244
  "clip_ratio/high_max": 0.0,
@@ -247,31 +247,31 @@
247
  "clip_ratio/low_min": 0.0,
248
  "clip_ratio/region_mean": 0.0,
249
  "completions/clipped_ratio": 0.0,
250
- "completions/max_length": 374.0,
251
- "completions/max_terminated_length": 374.0,
252
- "completions/mean_length": 286.0199951171875,
253
- "completions/mean_terminated_length": 286.0199951171875,
254
  "completions/min_length": 212.0,
255
  "completions/min_terminated_length": 212.0,
256
- "entropy": 0.24612685889005662,
257
- "epoch": 0.01,
258
- "frac_reward_zero_std": 0.8159999847412109,
259
- "grad_norm": 0.32421875,
260
- "kl": 0.1633337877690792,
261
- "learning_rate": 9.950637941994321e-06,
262
- "loss": -2.846699208021164e-05,
263
- "num_tokens": 1681137.0,
264
- "reward": 0.060000001639127734,
265
- "reward_std": 0.08485281020402909,
266
- "rewards/env_goofspiel_reward/mean": 0.060000001639127734,
267
- "rewards/env_goofspiel_reward/std": 0.19129492342472076,
268
- "sampling/importance_sampling_ratio/max": 1.5720022439956665,
269
- "sampling/importance_sampling_ratio/mean": 0.9784366488456726,
270
- "sampling/importance_sampling_ratio/min": 0.47974646687507627,
271
- "sampling/sampling_logp_difference/max": 0.702510929107666,
272
- "sampling/sampling_logp_difference/mean": 0.03313286602497101,
273
  "step": 40,
274
- "step_time": 5.238652371199805
275
  },
276
  {
277
  "clip_ratio/high_max": 0.0,
@@ -280,31 +280,31 @@
280
  "clip_ratio/low_min": 0.0,
281
  "clip_ratio/region_mean": 0.0,
282
  "completions/clipped_ratio": 0.0,
283
- "completions/max_length": 660.4,
284
- "completions/max_terminated_length": 660.4,
285
- "completions/mean_length": 489.1719970703125,
286
- "completions/mean_terminated_length": 489.1719970703125,
287
- "completions/min_length": 322.4,
288
- "completions/min_terminated_length": 322.4,
289
- "entropy": 0.3432160258293152,
290
- "epoch": 0.01125,
291
- "frac_reward_zero_std": 0.7599999785423279,
292
- "grad_norm": 0.2255859375,
293
- "kl": 0.3002031177282333,
294
- "learning_rate": 9.950629581350144e-06,
295
- "loss": 0.0002033085562288761,
296
- "num_tokens": 1938183.0,
297
- "reward": 0.03768000081181526,
298
- "reward_std": 0.05008201580494642,
299
- "rewards/env_goofspiel_reward/mean": 0.03768000081181526,
300
- "rewards/env_goofspiel_reward/std": 0.12525621727108954,
301
- "sampling/importance_sampling_ratio/max": 1.7067772150039673,
302
- "sampling/importance_sampling_ratio/mean": 1.0024658203125,
303
- "sampling/importance_sampling_ratio/min": 0.5572002470493317,
304
- "sampling/sampling_logp_difference/max": 0.6115247011184692,
305
- "sampling/sampling_logp_difference/mean": 0.03118431307375431,
306
  "step": 45,
307
- "step_time": 7.8333522611999795
308
  },
309
  {
310
  "clip_ratio/high_max": 0.0,
@@ -313,31 +313,31 @@
313
  "clip_ratio/low_min": 0.0,
314
  "clip_ratio/region_mean": 0.0,
315
  "completions/clipped_ratio": 0.0,
316
- "completions/max_length": 731.6,
317
- "completions/max_terminated_length": 731.6,
318
- "completions/mean_length": 558.283984375,
319
- "completions/mean_terminated_length": 558.283984375,
320
- "completions/min_length": 408.0,
321
- "completions/min_terminated_length": 408.0,
322
- "entropy": 0.29817005395889284,
323
- "epoch": 0.0125,
324
- "frac_reward_zero_std": 0.8399999737739563,
325
- "grad_norm": 0.14453125,
326
- "kl": 0.14909197837114335,
327
- "learning_rate": 9.950614789456512e-06,
328
- "loss": -3.2902939710766074e-05,
329
- "num_tokens": 2214781.0,
330
- "reward": 0.03273333106189966,
331
- "reward_std": 0.04195500072091818,
332
- "rewards/env_goofspiel_reward/mean": 0.032733332738280295,
333
- "rewards/env_goofspiel_reward/std": 0.12340526506304741,
334
- "sampling/importance_sampling_ratio/max": 1.7580672979354859,
335
- "sampling/importance_sampling_ratio/mean": 1.0121817111968994,
336
- "sampling/importance_sampling_ratio/min": 0.5544978082180023,
337
- "sampling/sampling_logp_difference/max": 0.6947145342826844,
338
- "sampling/sampling_logp_difference/mean": 0.02995888851583004,
339
  "step": 50,
340
- "step_time": 8.399313552200056
341
  },
342
  {
343
  "clip_ratio/high_max": 0.0,
@@ -346,31 +346,31 @@
346
  "clip_ratio/low_min": 0.0,
347
  "clip_ratio/region_mean": 0.0,
348
  "completions/clipped_ratio": 0.0,
349
- "completions/max_length": 737.0,
350
- "completions/max_terminated_length": 737.0,
351
- "completions/mean_length": 563.09599609375,
352
- "completions/mean_terminated_length": 563.09599609375,
353
- "completions/min_length": 408.0,
354
- "completions/min_terminated_length": 408.0,
355
- "entropy": 0.3130415454506874,
356
- "epoch": 0.01375,
357
- "frac_reward_zero_std": 0.8319999814033509,
358
- "grad_norm": 0.12451171875,
359
- "kl": 0.17116529867053032,
360
- "learning_rate": 9.95059356633892e-06,
361
- "loss": 0.00011836685007438064,
362
- "num_tokens": 2492933.0,
363
- "reward": 0.0414799977093935,
364
- "reward_std": 0.04919577315449715,
365
- "rewards/env_goofspiel_reward/mean": 0.0414799977093935,
366
- "rewards/env_goofspiel_reward/std": 0.13337331116199494,
367
- "sampling/importance_sampling_ratio/max": 1.614890694618225,
368
- "sampling/importance_sampling_ratio/mean": 0.9976115822792053,
369
- "sampling/importance_sampling_ratio/min": 0.5176251292228699,
370
- "sampling/sampling_logp_difference/max": 0.5567813754081726,
371
- "sampling/sampling_logp_difference/mean": 0.030348184704780578,
372
  "step": 55,
373
- "step_time": 8.462650469800156
374
  },
375
  {
376
  "clip_ratio/high_max": 0.0,
@@ -379,31 +379,31 @@
379
  "clip_ratio/low_min": 0.0,
380
  "clip_ratio/region_mean": 0.0,
381
  "completions/clipped_ratio": 0.0,
382
- "completions/max_length": 731.8,
383
- "completions/max_terminated_length": 731.8,
384
- "completions/mean_length": 561.4599853515625,
385
- "completions/mean_terminated_length": 561.4599853515625,
386
- "completions/min_length": 408.0,
387
- "completions/min_terminated_length": 408.0,
388
- "entropy": 0.3217264384031296,
389
- "epoch": 0.015,
390
- "frac_reward_zero_std": 0.8239999771118164,
391
- "grad_norm": 0.103515625,
392
- "kl": 0.16943022534251212,
393
- "learning_rate": 9.950565912033946e-06,
394
- "loss": 0.00016170007875189186,
395
- "num_tokens": 2768999.0,
396
- "reward": 0.046799997240304946,
397
- "reward_std": 0.06128258481621742,
398
- "rewards/env_goofspiel_reward/mean": 0.046799997240304946,
399
- "rewards/env_goofspiel_reward/std": 0.1459730952978134,
400
- "sampling/importance_sampling_ratio/max": 1.6248928308486938,
401
- "sampling/importance_sampling_ratio/mean": 0.994913375377655,
402
- "sampling/importance_sampling_ratio/min": 0.5438100695610046,
403
- "sampling/sampling_logp_difference/max": 0.5243477821350098,
404
- "sampling/sampling_logp_difference/mean": 0.02915378250181675,
405
  "step": 60,
406
- "step_time": 8.311450370799957
407
  },
408
  {
409
  "clip_ratio/high_max": 0.0,
@@ -412,31 +412,31 @@
412
  "clip_ratio/low_min": 0.0,
413
  "clip_ratio/region_mean": 0.0,
414
  "completions/clipped_ratio": 0.0,
415
- "completions/max_length": 936.2,
416
- "completions/max_terminated_length": 936.2,
417
- "completions/mean_length": 718.1879760742188,
418
- "completions/mean_terminated_length": 718.1879760742188,
419
- "completions/min_length": 517.2,
420
- "completions/min_terminated_length": 517.2,
421
- "entropy": 0.3693429589271545,
422
- "epoch": 0.01625,
423
- "frac_reward_zero_std": 0.8319999694824218,
424
- "grad_norm": 0.10400390625,
425
- "kl": 0.16856326386332512,
426
- "learning_rate": 9.950531826589252e-06,
427
- "loss": 9.133372223004698e-05,
428
- "num_tokens": 3085009.0,
429
- "reward": 0.03046666570007801,
430
- "reward_std": 0.04365205764770508,
431
- "rewards/env_goofspiel_reward/mean": 0.030466666072607042,
432
- "rewards/env_goofspiel_reward/std": 0.1246792048215866,
433
- "sampling/importance_sampling_ratio/max": 2.0000662088394163,
434
- "sampling/importance_sampling_ratio/mean": 0.9988593339920044,
435
- "sampling/importance_sampling_ratio/min": 0.5936131238937378,
436
- "sampling/sampling_logp_difference/max": 0.661067008972168,
437
- "sampling/sampling_logp_difference/mean": 0.029154302552342415,
438
  "step": 65,
439
- "step_time": 10.795809616799943
440
  },
441
  {
442
  "clip_ratio/high_max": 0.0,
@@ -445,31 +445,31 @@
445
  "clip_ratio/low_min": 0.0,
446
  "clip_ratio/region_mean": 0.0,
447
  "completions/clipped_ratio": 0.0,
448
- "completions/max_length": 1074.6,
449
- "completions/max_terminated_length": 1074.6,
450
- "completions/mean_length": 809.1799682617187,
451
- "completions/mean_terminated_length": 809.1799682617187,
452
- "completions/min_length": 590.0,
453
- "completions/min_terminated_length": 590.0,
454
- "entropy": 0.3258677929639816,
455
- "epoch": 0.0175,
456
- "frac_reward_zero_std": 0.8239999771118164,
457
- "grad_norm": 0.232421875,
458
- "kl": 0.1824295423924923,
459
- "learning_rate": 9.950491310063582e-06,
460
- "loss": 0.00022613720502704382,
461
- "num_tokens": 3421737.0,
462
- "reward": 0.039453331381082535,
463
- "reward_std": 0.048102113604545596,
464
- "rewards/env_goofspiel_reward/mean": 0.03945333361625671,
465
- "rewards/env_goofspiel_reward/std": 0.12717084884643554,
466
- "sampling/importance_sampling_ratio/max": 1.7261427879333495,
467
- "sampling/importance_sampling_ratio/mean": 1.0054854035377503,
468
- "sampling/importance_sampling_ratio/min": 0.6340943217277527,
469
- "sampling/sampling_logp_difference/max": 0.4532318115234375,
470
- "sampling/sampling_logp_difference/mean": 0.02659378871321678,
471
  "step": 70,
472
- "step_time": 12.004160757599857
473
  },
474
  {
475
  "clip_ratio/high_max": 0.0,
@@ -478,69 +478,69 @@
478
  "clip_ratio/low_min": 0.0,
479
  "clip_ratio/region_mean": 0.0,
480
  "completions/clipped_ratio": 0.0,
481
- "completions/max_length": 1074.0,
482
- "completions/max_terminated_length": 1074.0,
483
- "completions/mean_length": 813.3439697265625,
484
- "completions/mean_terminated_length": 813.3439697265625,
485
- "completions/min_length": 590.0,
486
- "completions/min_terminated_length": 590.0,
487
- "entropy": 0.29610189199447634,
488
- "epoch": 0.01875,
489
- "frac_reward_zero_std": 0.8639999747276306,
490
- "grad_norm": 0.1201171875,
491
- "kl": 0.17389494478702544,
492
- "learning_rate": 9.950444362526773e-06,
493
- "loss": -2.63441979768686e-06,
494
- "num_tokens": 3760662.0,
495
- "reward": 0.03491999926045537,
496
- "reward_std": 0.041691013239324094,
497
- "rewards/env_goofspiel_reward/mean": 0.03492000075057149,
498
- "rewards/env_goofspiel_reward/std": 0.12171182483434677,
499
- "sampling/importance_sampling_ratio/max": 1.5990596771240235,
500
- "sampling/importance_sampling_ratio/mean": 1.0042136073112489,
501
- "sampling/importance_sampling_ratio/min": 0.5866354703903198,
502
- "sampling/sampling_logp_difference/max": 0.49382131099700927,
503
- "sampling/sampling_logp_difference/mean": 0.024137656763195993,
504
  "step": 75,
505
- "step_time": 11.794954787200004
506
  },
507
  {
508
- "epoch": 0.01875,
509
  "eval_clip_ratio/high_max": 0.0,
510
  "eval_clip_ratio/high_mean": 0.0,
511
  "eval_clip_ratio/low_mean": 0.0,
512
  "eval_clip_ratio/low_min": 0.0,
513
  "eval_clip_ratio/region_mean": 0.0,
514
  "eval_completions/clipped_ratio": 0.0,
515
- "eval_completions/max_length": 895.0,
516
- "eval_completions/max_terminated_length": 895.0,
517
- "eval_completions/mean_length": 806.55,
518
- "eval_completions/mean_terminated_length": 806.55,
519
- "eval_completions/min_length": 725.8,
520
- "eval_completions/min_terminated_length": 725.8,
521
- "eval_entropy": 0.3078053116798401,
522
  "eval_frac_reward_zero_std": 0.6,
523
- "eval_kl": 0.5635438948869705,
524
- "eval_loss": 0.0008250019163824618,
525
- "eval_num_tokens": 3760662.0,
526
- "eval_reward": 0.06899999883025884,
527
- "eval_reward_std": 0.10040915999561548,
528
- "eval_rewards/env_goofspiel_reward/mean": 0.06899999883025884,
529
- "eval_rewards/env_goofspiel_reward/std": 0.11241451688110829,
530
- "eval_runtime": 3.5242,
531
- "eval_samples_per_second": 2.838,
532
- "eval_sampling/importance_sampling_ratio/max": 1.2238447427749635,
533
- "eval_sampling/importance_sampling_ratio/mean": 0.9734418511390686,
534
- "eval_sampling/importance_sampling_ratio/min": 0.7944899320602417,
535
- "eval_sampling/sampling_logp_difference/max": 0.2762513041496277,
536
- "eval_sampling/sampling_logp_difference/mean": 0.028397564217448233,
537
- "eval_steps_per_second": 0.851,
538
  "step": 75
539
  }
540
  ],
541
  "logging_steps": 5,
542
- "max_steps": 12000,
543
- "num_input_tokens_seen": 3760662,
544
  "num_train_epochs": 3,
545
  "save_steps": 500,
546
  "stateful_callbacks": {
@@ -556,7 +556,7 @@
556
  }
557
  },
558
  "total_flos": 0.0,
559
- "train_batch_size": 25,
560
  "trial_name": null,
561
  "trial_params": null
562
  }
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.009000360014400577,
6
  "eval_steps": 500,
7
  "global_step": 75,
8
  "is_hyper_param_search": false,
 
16
  "clip_ratio/low_min": 0.0,
17
  "clip_ratio/region_mean": 0.0,
18
  "completions/clipped_ratio": 0.0,
19
+ "completions/max_length": 365.6,
20
+ "completions/max_terminated_length": 365.6,
21
+ "completions/mean_length": 292.2166809082031,
22
+ "completions/mean_terminated_length": 292.2166809082031,
23
+ "completions/min_length": 174.8,
24
+ "completions/min_terminated_length": 174.8,
25
+ "entropy": 0.7345801413059234,
26
+ "epoch": 0.0006000240009600384,
27
+ "frac_reward_zero_std": 0.650000023841858,
28
+ "grad_norm": 0.578125,
29
+ "kl": 0.014322867337614297,
30
  "learning_rate": 1.137216e-06,
31
+ "loss": 0.00023176579270511866,
32
+ "num_tokens": 101638.0,
33
+ "reward": 0.023166669206693767,
34
+ "reward_std": 0.03771236310712993,
35
+ "rewards/env_goofspiel_reward/mean": 0.023166668484918773,
36
+ "rewards/env_goofspiel_reward/std": 0.10488454704172909,
37
+ "sampling/importance_sampling_ratio/max": 2.230660581588745,
38
+ "sampling/importance_sampling_ratio/mean": 1.0512551069259644,
39
+ "sampling/importance_sampling_ratio/min": 0.45719883143901824,
40
+ "sampling/sampling_logp_difference/max": 0.9388755321502685,
41
+ "sampling/sampling_logp_difference/mean": 0.08014384806156158,
42
  "step": 5,
43
+ "step_time": 2.8670244561999425
44
  },
45
  {
46
  "clip_ratio/high_max": 0.0,
 
51
  "completions/clipped_ratio": 0.0,
52
  "completions/max_length": 373.8,
53
  "completions/max_terminated_length": 373.8,
54
+ "completions/mean_length": 284.3750061035156,
55
+ "completions/mean_terminated_length": 284.3750061035156,
56
+ "completions/min_length": 197.4,
57
+ "completions/min_terminated_length": 197.4,
58
+ "entropy": 0.7396668076515198,
59
+ "epoch": 0.0012000480019200767,
60
+ "frac_reward_zero_std": 0.5333333492279053,
61
+ "grad_norm": 0.6484375,
62
+ "kl": 0.008018274139612914,
63
  "learning_rate": 2.5587359999999995e-06,
64
+ "loss": 0.0010880917310714723,
65
+ "num_tokens": 201985.0,
66
+ "reward": 0.1135000076610595,
67
+ "reward_std": 0.1355288046877831,
68
+ "rewards/env_goofspiel_reward/mean": 0.11350000470411033,
69
+ "rewards/env_goofspiel_reward/std": 0.24339603506959975,
70
+ "sampling/importance_sampling_ratio/max": 1.9386430501937866,
71
+ "sampling/importance_sampling_ratio/mean": 1.0104641795158387,
72
+ "sampling/importance_sampling_ratio/min": 0.48966296911239626,
73
+ "sampling/sampling_logp_difference/max": 0.7527793884277344,
74
+ "sampling/sampling_logp_difference/mean": 0.06743223667144775,
75
  "step": 10,
76
+ "step_time": 2.562607514999763
77
  },
78
  {
79
  "clip_ratio/high_max": 0.0,
 
82
  "clip_ratio/low_min": 0.0,
83
  "clip_ratio/region_mean": 0.0,
84
  "completions/clipped_ratio": 0.0,
85
+ "completions/max_length": 373.8,
86
+ "completions/max_terminated_length": 373.8,
87
+ "completions/mean_length": 292.29168090820315,
88
+ "completions/mean_terminated_length": 292.29168090820315,
89
+ "completions/min_length": 205.4,
90
+ "completions/min_terminated_length": 205.4,
91
+ "entropy": 0.6868447959423065,
92
+ "epoch": 0.0018000720028801152,
93
+ "frac_reward_zero_std": 0.7666666865348816,
94
+ "grad_norm": 0.248046875,
95
+ "kl": 0.015538515662774444,
96
  "learning_rate": 3.9802559999999995e-06,
97
+ "loss": -0.0002336445264518261,
98
+ "num_tokens": 303718.0,
99
+ "reward": 0.02416666953358799,
100
+ "reward_std": 0.03653385282959789,
101
+ "rewards/env_goofspiel_reward/mean": 0.024166667682584374,
102
+ "rewards/env_goofspiel_reward/std": 0.10811053770594299,
103
+ "sampling/importance_sampling_ratio/max": 1.5562421321868896,
104
+ "sampling/importance_sampling_ratio/mean": 0.9962499618530274,
105
+ "sampling/importance_sampling_ratio/min": 0.4864930033683777,
106
+ "sampling/sampling_logp_difference/max": 0.7846987843513489,
107
+ "sampling/sampling_logp_difference/mean": 0.05959557741880417,
108
  "step": 15,
109
+ "step_time": 2.4648701715999777
110
  },
111
  {
112
  "clip_ratio/high_max": 0.0,
 
115
  "clip_ratio/low_min": 0.0,
116
  "clip_ratio/region_mean": 0.0,
117
  "completions/clipped_ratio": 0.0,
118
+ "completions/max_length": 365.4,
119
+ "completions/max_terminated_length": 365.4,
120
+ "completions/mean_length": 286.4166748046875,
121
+ "completions/mean_terminated_length": 286.4166748046875,
122
  "completions/min_length": 199.6,
123
  "completions/min_terminated_length": 199.6,
124
+ "entropy": 0.6791316926479339,
125
+ "epoch": 0.0024000960038401535,
126
+ "frac_reward_zero_std": 0.7833333492279053,
127
+ "grad_norm": 0.419921875,
128
+ "kl": 0.03971561994403601,
129
  "learning_rate": 5.401775999999999e-06,
130
+ "loss": 0.00017259303713217377,
131
+ "num_tokens": 403266.0,
132
+ "reward": 0.05466667115688324,
133
+ "reward_std": 0.07825315818190574,
134
+ "rewards/env_goofspiel_reward/mean": 0.054666668176651,
135
+ "rewards/env_goofspiel_reward/std": 0.1692157879471779,
136
+ "sampling/importance_sampling_ratio/max": 1.7059913635253907,
137
+ "sampling/importance_sampling_ratio/mean": 1.0254743337631225,
138
+ "sampling/importance_sampling_ratio/min": 0.5575755715370179,
139
+ "sampling/sampling_logp_difference/max": 0.5765037894248962,
140
+ "sampling/sampling_logp_difference/mean": 0.06591257303953171,
141
  "step": 20,
142
+ "step_time": 2.4083536974001616
143
  },
144
  {
145
  "clip_ratio/high_max": 0.0,
 
148
  "clip_ratio/low_min": 0.0,
149
  "clip_ratio/region_mean": 0.0,
150
  "completions/clipped_ratio": 0.0,
151
+ "completions/max_length": 373.6,
152
+ "completions/max_terminated_length": 373.6,
153
+ "completions/mean_length": 279.07501831054685,
154
+ "completions/mean_terminated_length": 279.07501831054685,
155
+ "completions/min_length": 206.8,
156
+ "completions/min_terminated_length": 206.8,
157
+ "entropy": 0.5845985025167465,
158
+ "epoch": 0.003000120004800192,
159
+ "frac_reward_zero_std": 0.8666666865348815,
160
+ "grad_norm": 0.2080078125,
161
+ "kl": 0.07888290733098983,
162
  "learning_rate": 6.8232959999999994e-06,
163
+ "loss": 0.00014582456788048148,
164
+ "num_tokens": 501949.0,
165
+ "reward": 0.029750002920627593,
166
+ "reward_std": 0.042544259876012805,
167
+ "rewards/env_goofspiel_reward/mean": 0.029750000685453415,
168
+ "rewards/env_goofspiel_reward/std": 0.11651719957590104,
169
+ "sampling/importance_sampling_ratio/max": 1.5395583629608154,
170
+ "sampling/importance_sampling_ratio/mean": 0.9871565222740173,
171
+ "sampling/importance_sampling_ratio/min": 0.6314660668373108,
172
+ "sampling/sampling_logp_difference/max": 0.43876824378967283,
173
+ "sampling/sampling_logp_difference/mean": 0.04884573593735695,
174
  "step": 25,
175
+ "step_time": 2.4319384599999467
176
  },
177
  {
178
  "clip_ratio/high_max": 0.0,
 
181
  "clip_ratio/low_min": 0.0,
182
  "clip_ratio/region_mean": 0.0,
183
  "completions/clipped_ratio": 0.0,
184
+ "completions/max_length": 373.6,
185
+ "completions/max_terminated_length": 373.6,
186
+ "completions/mean_length": 294.0166748046875,
187
+ "completions/mean_terminated_length": 294.0166748046875,
188
  "completions/min_length": 212.0,
189
  "completions/min_terminated_length": 212.0,
190
+ "entropy": 0.5196643978357315,
191
+ "epoch": 0.0036001440057602304,
192
+ "frac_reward_zero_std": 0.850000011920929,
193
+ "grad_norm": 0.62109375,
194
+ "kl": 0.1175543449819088,
195
  "learning_rate": 8.244816e-06,
196
+ "loss": 0.0001264215330593288,
197
+ "num_tokens": 604591.0,
198
+ "reward": 0.044750004261732104,
199
+ "reward_std": 0.04985102787613869,
200
+ "rewards/env_goofspiel_reward/mean": 0.044750002399086955,
201
+ "rewards/env_goofspiel_reward/std": 0.125959412753582,
202
+ "sampling/importance_sampling_ratio/max": 1.7827884435653687,
203
+ "sampling/importance_sampling_ratio/mean": 0.9959828734397889,
204
+ "sampling/importance_sampling_ratio/min": 0.6090725898742676,
205
+ "sampling/sampling_logp_difference/max": 0.5527279376983643,
206
+ "sampling/sampling_logp_difference/mean": 0.05205402001738548,
207
  "step": 30,
208
+ "step_time": 2.3975842315998306
209
  },
210
  {
211
  "clip_ratio/high_max": 0.0,
 
216
  "completions/clipped_ratio": 0.0,
217
  "completions/max_length": 374.0,
218
  "completions/max_terminated_length": 374.0,
219
+ "completions/mean_length": 291.40834350585936,
220
+ "completions/mean_terminated_length": 291.40834350585936,
221
+ "completions/min_length": 219.2,
222
+ "completions/min_terminated_length": 219.2,
223
+ "entropy": 0.4317367374897003,
224
+ "epoch": 0.004200168006720269,
225
+ "frac_reward_zero_std": 0.7500000119209289,
226
+ "grad_norm": 0.00531005859375,
227
+ "kl": 0.12076274678111076,
228
  "learning_rate": 9.666336e-06,
229
+ "loss": 0.0003318458097055554,
230
+ "num_tokens": 706062.0,
231
+ "reward": 0.09500000476837159,
232
+ "reward_std": 0.12020815908908844,
233
+ "rewards/env_goofspiel_reward/mean": 0.09500000327825546,
234
+ "rewards/env_goofspiel_reward/std": 0.21089642345905305,
235
+ "sampling/importance_sampling_ratio/max": 1.4750993490219115,
236
+ "sampling/importance_sampling_ratio/mean": 0.9911892533302307,
237
+ "sampling/importance_sampling_ratio/min": 0.5305798888206482,
238
+ "sampling/sampling_logp_difference/max": 0.7321011543273925,
239
+ "sampling/sampling_logp_difference/mean": 0.043305123969912526,
240
  "step": 35,
241
+ "step_time": 2.43143495660006
242
  },
243
  {
244
  "clip_ratio/high_max": 0.0,
 
247
  "clip_ratio/low_min": 0.0,
248
  "clip_ratio/region_mean": 0.0,
249
  "completions/clipped_ratio": 0.0,
250
+ "completions/max_length": 373.4,
251
+ "completions/max_terminated_length": 373.4,
252
+ "completions/mean_length": 288.1250061035156,
253
+ "completions/mean_terminated_length": 288.1250061035156,
254
  "completions/min_length": 212.0,
255
  "completions/min_terminated_length": 212.0,
256
+ "entropy": 0.351773801445961,
257
+ "epoch": 0.004800192007680307,
258
+ "frac_reward_zero_std": 0.8666666865348815,
259
+ "grad_norm": 0.33203125,
260
+ "kl": 0.05839128475636244,
261
+ "learning_rate": 9.950639527236806e-06,
262
+ "loss": 4.926343681290746e-05,
263
+ "num_tokens": 806862.0,
264
+ "reward": 0.040000003576278684,
265
+ "reward_std": 0.05656854510307312,
266
+ "rewards/env_goofspiel_reward/mean": 0.04000000059604645,
267
+ "rewards/env_goofspiel_reward/std": 0.11344237923622132,
268
+ "sampling/importance_sampling_ratio/max": 1.5833612918853759,
269
+ "sampling/importance_sampling_ratio/mean": 1.0104422450065613,
270
+ "sampling/importance_sampling_ratio/min": 0.6002241253852845,
271
+ "sampling/sampling_logp_difference/max": 0.5244450092315673,
272
+ "sampling/sampling_logp_difference/mean": 0.03759892582893372,
273
  "step": 40,
274
+ "step_time": 2.415808950400242
275
  },
276
  {
277
  "clip_ratio/high_max": 0.0,
 
280
  "clip_ratio/low_min": 0.0,
281
  "clip_ratio/region_mean": 0.0,
282
  "completions/clipped_ratio": 0.0,
283
+ "completions/max_length": 373.8,
284
+ "completions/max_terminated_length": 373.8,
285
+ "completions/mean_length": 291.74168090820314,
286
+ "completions/mean_terminated_length": 291.74168090820314,
287
+ "completions/min_length": 212.0,
288
+ "completions/min_terminated_length": 212.0,
289
+ "entropy": 0.3240418329834938,
290
+ "epoch": 0.005400216008640346,
291
+ "frac_reward_zero_std": 0.9166666865348816,
292
+ "grad_norm": 0.0439453125,
293
+ "kl": 0.13300706073641777,
294
+ "learning_rate": 9.950637606636539e-06,
295
+ "loss": 0.0001355916727334261,
296
+ "num_tokens": 907008.0,
297
+ "reward": 0.034833335876464845,
298
+ "reward_std": 0.03535534143447876,
299
+ "rewards/env_goofspiel_reward/mean": 0.03483333364129067,
300
+ "rewards/env_goofspiel_reward/std": 0.14086373001337052,
301
+ "sampling/importance_sampling_ratio/max": 1.4610427141189575,
302
+ "sampling/importance_sampling_ratio/mean": 0.9817873358726501,
303
+ "sampling/importance_sampling_ratio/min": 0.6326651930809021,
304
+ "sampling/sampling_logp_difference/max": 0.46174774169921873,
305
+ "sampling/sampling_logp_difference/mean": 0.040712539479136466,
306
  "step": 45,
307
+ "step_time": 2.417739940000138
308
  },
309
  {
310
  "clip_ratio/high_max": 0.0,
 
313
  "clip_ratio/low_min": 0.0,
314
  "clip_ratio/region_mean": 0.0,
315
  "completions/clipped_ratio": 0.0,
316
+ "completions/max_length": 365.0,
317
+ "completions/max_terminated_length": 365.0,
318
+ "completions/mean_length": 279.3333435058594,
319
+ "completions/mean_terminated_length": 279.3333435058594,
320
+ "completions/min_length": 212.0,
321
+ "completions/min_terminated_length": 212.0,
322
+ "entropy": 0.3186733976006508,
323
+ "epoch": 0.006000240009600384,
324
+ "frac_reward_zero_std": 0.8833333611488342,
325
+ "grad_norm": 0.353515625,
326
+ "kl": 0.06966875828802585,
327
+ "learning_rate": 9.950634208652256e-06,
328
+ "loss": 0.00012671776348724962,
329
+ "num_tokens": 1005578.0,
330
+ "reward": 0.034916669048834593,
331
+ "reward_std": 0.049615327350329606,
332
+ "rewards/env_goofspiel_reward/mean": 0.03491666756453924,
333
+ "rewards/env_goofspiel_reward/std": 0.13708889302797617,
334
+ "sampling/importance_sampling_ratio/max": 1.5069894075393677,
335
+ "sampling/importance_sampling_ratio/mean": 1.0077889800071715,
336
+ "sampling/importance_sampling_ratio/min": 0.7747546076774597,
337
+ "sampling/sampling_logp_difference/max": 0.3697906732559204,
338
+ "sampling/sampling_logp_difference/mean": 0.03059108220040798,
339
  "step": 50,
340
+ "step_time": 2.3773288868003872
341
  },
342
  {
343
  "clip_ratio/high_max": 0.0,
 
346
  "clip_ratio/low_min": 0.0,
347
  "clip_ratio/region_mean": 0.0,
348
  "completions/clipped_ratio": 0.0,
349
+ "completions/max_length": 374.0,
350
+ "completions/max_terminated_length": 374.0,
351
+ "completions/mean_length": 305.49168090820314,
352
+ "completions/mean_terminated_length": 305.49168090820314,
353
+ "completions/min_length": 212.0,
354
+ "completions/min_terminated_length": 212.0,
355
+ "entropy": 0.29847966730594633,
356
+ "epoch": 0.006600264010560422,
357
+ "frac_reward_zero_std": 0.900000023841858,
358
+ "grad_norm": 0.30078125,
359
+ "kl": 0.08750866688787937,
360
+ "learning_rate": 9.950629333285305e-06,
361
+ "loss": -2.145505277439952e-06,
362
+ "num_tokens": 1110455.0,
363
+ "reward": 0.035000003129243853,
364
+ "reward_std": 0.04949747696518898,
365
+ "rewards/env_goofspiel_reward/mean": 0.03500000052154064,
366
+ "rewards/env_goofspiel_reward/std": 0.1412438616156578,
367
+ "sampling/importance_sampling_ratio/max": 1.3705053567886352,
368
+ "sampling/importance_sampling_ratio/mean": 1.0030481100082398,
369
+ "sampling/importance_sampling_ratio/min": 0.6937733888626099,
370
+ "sampling/sampling_logp_difference/max": 0.4317422866821289,
371
+ "sampling/sampling_logp_difference/mean": 0.03450411073863506,
372
  "step": 55,
373
+ "step_time": 2.4346962132000045
374
  },
375
  {
376
  "clip_ratio/high_max": 0.0,
 
379
  "clip_ratio/low_min": 0.0,
380
  "clip_ratio/region_mean": 0.0,
381
  "completions/clipped_ratio": 0.0,
382
+ "completions/max_length": 374.0,
383
+ "completions/max_terminated_length": 374.0,
384
+ "completions/mean_length": 294.62501220703126,
385
+ "completions/mean_terminated_length": 294.62501220703126,
386
+ "completions/min_length": 212.0,
387
+ "completions/min_terminated_length": 212.0,
388
+ "entropy": 0.2199392184615135,
389
+ "epoch": 0.007200288011520461,
390
+ "frac_reward_zero_std": 0.7666666746139527,
391
+ "grad_norm": 0.1328125,
392
+ "kl": 0.20852462351322174,
393
+ "learning_rate": 9.950622980537618e-06,
394
+ "loss": -1.4243402983993291e-05,
395
+ "num_tokens": 1211770.0,
396
+ "reward": 0.08483333513140678,
397
+ "reward_std": 0.09215958416461945,
398
+ "rewards/env_goofspiel_reward/mean": 0.08483333475887775,
399
+ "rewards/env_goofspiel_reward/std": 0.2297523573040962,
400
+ "sampling/importance_sampling_ratio/max": 1.3913918256759643,
401
+ "sampling/importance_sampling_ratio/mean": 0.9923399925231934,
402
+ "sampling/importance_sampling_ratio/min": 0.7218815922737122,
403
+ "sampling/sampling_logp_difference/max": 0.3741787910461426,
404
+ "sampling/sampling_logp_difference/mean": 0.023421294614672662,
405
  "step": 60,
406
+ "step_time": 2.4339242404003016
407
  },
408
  {
409
  "clip_ratio/high_max": 0.0,
 
412
  "clip_ratio/low_min": 0.0,
413
  "clip_ratio/region_mean": 0.0,
414
  "completions/clipped_ratio": 0.0,
415
+ "completions/max_length": 365.0,
416
+ "completions/max_terminated_length": 365.0,
417
+ "completions/mean_length": 299.108349609375,
418
+ "completions/mean_terminated_length": 299.108349609375,
419
+ "completions/min_length": 212.0,
420
+ "completions/min_terminated_length": 212.0,
421
+ "entropy": 0.14547686353325845,
422
+ "epoch": 0.0078003120124804995,
423
+ "frac_reward_zero_std": 0.8666666865348815,
424
+ "grad_norm": 0.01318359375,
425
+ "kl": 1.5776881486177445,
426
+ "learning_rate": 9.950615150411705e-06,
427
+ "loss": 0.00020953675266355276,
428
+ "num_tokens": 1315125.0,
429
+ "reward": 0.03991667032241821,
430
+ "reward_std": 0.05668639615178108,
431
+ "rewards/env_goofspiel_reward/mean": 0.03991666734218598,
432
+ "rewards/env_goofspiel_reward/std": 0.15578595399856568,
433
+ "sampling/importance_sampling_ratio/max": 1.5614466190338134,
434
+ "sampling/importance_sampling_ratio/mean": 1.0267313480377198,
435
+ "sampling/importance_sampling_ratio/min": 0.8062139034271241,
436
+ "sampling/sampling_logp_difference/max": 0.38132710456848146,
437
+ "sampling/sampling_logp_difference/mean": 0.017732756957411767,
438
  "step": 65,
439
+ "step_time": 2.3855804428001646
440
  },
441
  {
442
  "clip_ratio/high_max": 0.0,
 
445
  "clip_ratio/low_min": 0.0,
446
  "clip_ratio/region_mean": 0.0,
447
  "completions/clipped_ratio": 0.0,
448
+ "completions/max_length": 373.4,
449
+ "completions/max_terminated_length": 373.4,
450
+ "completions/mean_length": 288.8666748046875,
451
+ "completions/mean_terminated_length": 288.8666748046875,
452
+ "completions/min_length": 212.0,
453
+ "completions/min_terminated_length": 212.0,
454
+ "entropy": 0.36806915551424024,
455
+ "epoch": 0.008400336013440538,
456
+ "frac_reward_zero_std": 0.8333333730697632,
457
+ "grad_norm": 0.103515625,
458
+ "kl": 0.154165069013834,
459
+ "learning_rate": 9.950605842910668e-06,
460
+ "loss": 3.9057480171322824e-05,
461
+ "num_tokens": 1415706.0,
462
+ "reward": 0.0650000050663948,
463
+ "reward_std": 0.07778174877166748,
464
+ "rewards/env_goofspiel_reward/mean": 0.06500000134110451,
465
+ "rewards/env_goofspiel_reward/std": 0.20113323032855987,
466
+ "sampling/importance_sampling_ratio/max": 1.3685919523239136,
467
+ "sampling/importance_sampling_ratio/mean": 0.9923707365989685,
468
+ "sampling/importance_sampling_ratio/min": 0.6977368593215942,
469
+ "sampling/sampling_logp_difference/max": 0.37837958335876465,
470
+ "sampling/sampling_logp_difference/mean": 0.023688069358468056,
471
  "step": 70,
472
+ "step_time": 2.419219413600149
473
  },
474
  {
475
  "clip_ratio/high_max": 0.0,
 
478
  "clip_ratio/low_min": 0.0,
479
  "clip_ratio/region_mean": 0.0,
480
  "completions/clipped_ratio": 0.0,
481
+ "completions/max_length": 365.0,
482
+ "completions/max_terminated_length": 365.0,
483
+ "completions/mean_length": 296.4750122070312,
484
+ "completions/mean_terminated_length": 296.4750122070312,
485
+ "completions/min_length": 218.8,
486
+ "completions/min_terminated_length": 218.8,
487
+ "entropy": 0.44495113492012023,
488
+ "epoch": 0.009000360014400577,
489
+ "frac_reward_zero_std": 0.7666666984558106,
490
+ "grad_norm": 0.13671875,
491
+ "kl": 0.308458948135376,
492
+ "learning_rate": 9.950595058038197e-06,
493
+ "loss": 0.00013219380052760245,
494
+ "num_tokens": 1517756.0,
495
+ "reward": 0.054750004410743715,
496
+ "reward_std": 0.07813530415296555,
497
+ "rewards/env_goofspiel_reward/mean": 0.05475000143051147,
498
+ "rewards/env_goofspiel_reward/std": 0.17614837288856505,
499
+ "sampling/importance_sampling_ratio/max": 1.4038194417953491,
500
+ "sampling/importance_sampling_ratio/mean": 0.9863661766052246,
501
+ "sampling/importance_sampling_ratio/min": 0.6411556363105774,
502
+ "sampling/sampling_logp_difference/max": 0.5436622142791748,
503
+ "sampling/sampling_logp_difference/mean": 0.029225154593586922,
504
  "step": 75,
505
+ "step_time": 2.370857671000158
506
  },
507
  {
508
+ "epoch": 0.009000360014400577,
509
  "eval_clip_ratio/high_max": 0.0,
510
  "eval_clip_ratio/high_mean": 0.0,
511
  "eval_clip_ratio/low_mean": 0.0,
512
  "eval_clip_ratio/low_min": 0.0,
513
  "eval_clip_ratio/region_mean": 0.0,
514
  "eval_completions/clipped_ratio": 0.0,
515
+ "eval_completions/max_length": 274.2,
516
+ "eval_completions/max_terminated_length": 274.2,
517
+ "eval_completions/mean_length": 254.7,
518
+ "eval_completions/mean_terminated_length": 254.7,
519
+ "eval_completions/min_length": 235.4,
520
+ "eval_completions/min_terminated_length": 235.4,
521
+ "eval_entropy": 0.4275285005569458,
522
  "eval_frac_reward_zero_std": 0.6,
523
+ "eval_kl": 0.317794269323349,
524
+ "eval_loss": 9.086851787287742e-05,
525
+ "eval_num_tokens": 1517756.0,
526
+ "eval_reward": 0.12000000476837158,
527
+ "eval_reward_std": 0.16970562934875488,
528
+ "eval_rewards/env_goofspiel_reward/mean": 0.12000000476837158,
529
+ "eval_rewards/env_goofspiel_reward/std": 0.24000003337860107,
530
+ "eval_runtime": 1.6574,
531
+ "eval_samples_per_second": 6.034,
532
+ "eval_sampling/importance_sampling_ratio/max": 1.1251073837280274,
533
+ "eval_sampling/importance_sampling_ratio/mean": 0.9802677392959595,
534
+ "eval_sampling/importance_sampling_ratio/min": 0.8105595707893372,
535
+ "eval_sampling/sampling_logp_difference/max": 0.2066459536552429,
536
+ "eval_sampling/sampling_logp_difference/mean": 0.03130748393014073,
537
+ "eval_steps_per_second": 1.81,
538
  "step": 75
539
  }
540
  ],
541
  "logging_steps": 5,
542
+ "max_steps": 24999,
543
+ "num_input_tokens_seen": 1517756,
544
  "num_train_epochs": 3,
545
  "save_steps": 500,
546
  "stateful_callbacks": {
 
556
  }
557
  },
558
  "total_flos": 0.0,
559
+ "train_batch_size": 12,
560
  "trial_name": null,
561
  "trial_params": null
562
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8c53ec641d35398511966f6bbf25ac001ca90c58e99dcc4975f403287921d176
3
  size 7185
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d6a4c97c4543543d2fa415f06995465636d6f177fd18bba16e72de2f4e49a937
3
  size 7185