bimabk committed on
Commit
0a2c360
·
verified ·
1 Parent(s): 4940b17

Upload task output 1

Browse files
Files changed (6) hide show
  1. config.json +11 -15
  2. generation_config.json +10 -0
  3. loss.txt +1 -1
  4. model.safetensors +2 -2
  5. trainer_state.json +981 -420
  6. training_args.bin +1 -1
config.json CHANGED
@@ -3,13 +3,13 @@
3
  "Qwen2ForCausalLM"
4
  ],
5
  "attention_dropout": 0.0,
6
- "bos_token_id": 151643,
7
  "dtype": "bfloat16",
8
  "eos_token_id": 151645,
9
  "hidden_act": "silu",
10
- "hidden_size": 3584,
11
  "initializer_range": 0.02,
12
- "intermediate_size": 18944,
13
  "layer_types": [
14
  "full_attention",
15
  "full_attention",
@@ -34,28 +34,24 @@
34
  "full_attention",
35
  "full_attention",
36
  "full_attention",
37
- "full_attention",
38
- "full_attention",
39
- "full_attention",
40
- "full_attention",
41
  "full_attention"
42
  ],
43
  "max_position_embeddings": 32768,
44
- "max_window_layers": 28,
45
  "model_type": "qwen2",
46
- "num_attention_heads": 28,
47
- "num_hidden_layers": 28,
48
- "num_key_value_heads": 4,
49
- "pad_token_id": null,
50
  "rms_norm_eps": 1e-06,
51
  "rope_parameters": {
52
  "rope_theta": 1000000.0,
53
  "rope_type": "default"
54
  },
55
  "sliding_window": null,
56
- "tie_word_embeddings": false,
57
  "transformers_version": "5.1.0",
58
- "use_cache": true,
59
  "use_sliding_window": false,
60
- "vocab_size": 152064
61
  }
 
3
  "Qwen2ForCausalLM"
4
  ],
5
  "attention_dropout": 0.0,
6
+ "bos_token_id": null,
7
  "dtype": "bfloat16",
8
  "eos_token_id": 151645,
9
  "hidden_act": "silu",
10
+ "hidden_size": 896,
11
  "initializer_range": 0.02,
12
+ "intermediate_size": 4864,
13
  "layer_types": [
14
  "full_attention",
15
  "full_attention",
 
34
  "full_attention",
35
  "full_attention",
36
  "full_attention",
 
 
 
 
37
  "full_attention"
38
  ],
39
  "max_position_embeddings": 32768,
40
+ "max_window_layers": 21,
41
  "model_type": "qwen2",
42
+ "num_attention_heads": 14,
43
+ "num_hidden_layers": 24,
44
+ "num_key_value_heads": 2,
45
+ "pad_token_id": 151643,
46
  "rms_norm_eps": 1e-06,
47
  "rope_parameters": {
48
  "rope_theta": 1000000.0,
49
  "rope_type": "default"
50
  },
51
  "sliding_window": null,
52
+ "tie_word_embeddings": true,
53
  "transformers_version": "5.1.0",
54
+ "use_cache": false,
55
  "use_sliding_window": false,
56
+ "vocab_size": 151936
57
  }
generation_config.json CHANGED
@@ -1,3 +1,13 @@
1
  {
 
 
 
 
 
 
 
 
 
 
2
  "transformers_version": "5.1.0"
3
  }
 
1
  {
2
+ "do_sample": true,
3
+ "eos_token_id": [
4
+ 151645,
5
+ 151643
6
+ ],
7
+ "pad_token_id": 151643,
8
+ "repetition_penalty": 1.1,
9
+ "temperature": 0.7,
10
+ "top_k": 20,
11
+ "top_p": 0.8,
12
  "transformers_version": "5.1.0"
13
  }
loss.txt CHANGED
@@ -1 +1 @@
1
- 75,-0.5700000166893006
 
1
+ 152,-0.06000000238418579
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ccb66fc7d5882f759fe60a9cf8d5730ffdf76e7738298e0453e82b4ffb4c1a53
3
- size 15231272152
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df164ec05eae3899c60a65c9089d6d4eedccdc33078b9a90ec77d9672e240f89
3
+ size 988097824
trainer_state.json CHANGED
@@ -2,507 +2,507 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.012,
6
  "eval_steps": 500,
7
- "global_step": 75,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "clip_ratio/high_max": 0.014437134563922881,
14
- "clip_ratio/high_mean": 0.007218567281961441,
15
- "clip_ratio/low_mean": 0.005763888917863369,
16
  "clip_ratio/low_min": 0.0,
17
- "clip_ratio/region_mean": 0.01298245619982481,
18
  "completions/clipped_ratio": 0.0,
19
- "completions/max_length": 374.0,
20
- "completions/max_terminated_length": 374.0,
21
- "completions/mean_length": 294.78125,
22
- "completions/mean_terminated_length": 294.78125,
23
- "completions/min_length": 189.6,
24
- "completions/min_terminated_length": 189.6,
25
- "entropy": 0.35714133381843566,
26
  "epoch": 0.0008,
27
- "frac_reward_zero_std": 0.475,
28
- "grad_norm": 0.14499153196811676,
29
- "kl": 0.006699140788987279,
30
  "learning_rate": 1.137216e-06,
31
- "loss": 0.0004814713727682829,
32
- "num_tokens": 136090.0,
33
- "reward": 0.292125004529953,
34
- "reward_std": 0.2656953722238541,
35
- "rewards/env_goofspiel_reward/mean": 0.292125004529953,
36
- "rewards/env_goofspiel_reward/std": 0.41643730401992796,
37
- "sampling/importance_sampling_ratio/max": 1.8895051956176758,
38
- "sampling/importance_sampling_ratio/mean": 0.9195514798164368,
39
- "sampling/importance_sampling_ratio/min": 0.2350650832056999,
40
- "sampling/sampling_logp_difference/max": 1.6994480609893798,
41
- "sampling/sampling_logp_difference/mean": 0.09322866201400756,
42
  "step": 5,
43
- "step_time": 5.709857220399681
44
  },
45
  {
46
- "clip_ratio/high_max": 0.02671568635851145,
47
- "clip_ratio/high_mean": 0.013357843179255724,
48
- "clip_ratio/low_mean": 0.016053921636193992,
49
  "clip_ratio/low_min": 0.0,
50
- "clip_ratio/region_mean": 0.029411764815449715,
51
  "completions/clipped_ratio": 0.0,
52
- "completions/max_length": 374.2,
53
- "completions/max_terminated_length": 374.2,
54
- "completions/mean_length": 290.54375,
55
- "completions/mean_terminated_length": 290.54375,
56
- "completions/min_length": 194.4,
57
- "completions/min_terminated_length": 194.4,
58
- "entropy": 0.36814531981945037,
59
  "epoch": 0.0016,
60
- "frac_reward_zero_std": 0.55,
61
- "grad_norm": 0.15409794449806213,
62
- "kl": 0.024915735074318945,
63
  "learning_rate": 2.5587359999999995e-06,
64
- "loss": 0.00025723695289343597,
65
- "num_tokens": 271136.0,
66
- "reward": 0.30362500846385954,
67
- "reward_std": 0.21761211454868318,
68
- "rewards/env_goofspiel_reward/mean": 0.30362500846385954,
69
- "rewards/env_goofspiel_reward/std": 0.40538435578346255,
70
- "sampling/importance_sampling_ratio/max": 2.3090060234069822,
71
- "sampling/importance_sampling_ratio/mean": 0.9982686519622803,
72
- "sampling/importance_sampling_ratio/min": 0.11310269832611083,
73
- "sampling/sampling_logp_difference/max": 1.6586394786834717,
74
- "sampling/sampling_logp_difference/mean": 0.09253094047307968,
75
  "step": 10,
76
- "step_time": 5.373984831200687
77
  },
78
  {
79
- "clip_ratio/high_max": 0.030514705926179886,
80
- "clip_ratio/high_mean": 0.01672794120386243,
81
- "clip_ratio/low_mean": 0.01482843142002821,
82
  "clip_ratio/low_min": 0.0,
83
- "clip_ratio/region_mean": 0.03155637262389064,
84
  "completions/clipped_ratio": 0.0,
85
- "completions/max_length": 375.6,
86
- "completions/max_terminated_length": 375.6,
87
- "completions/mean_length": 283.45625,
88
- "completions/mean_terminated_length": 283.45625,
89
- "completions/min_length": 194.6,
90
- "completions/min_terminated_length": 194.6,
91
- "entropy": 0.38065551668405534,
92
  "epoch": 0.0024,
93
- "frac_reward_zero_std": 0.4375,
94
- "grad_norm": 0.0815606489777565,
95
- "kl": 0.02344995441380888,
96
  "learning_rate": 3.9802559999999995e-06,
97
- "loss": 0.000516003929078579,
98
- "num_tokens": 403825.0,
99
- "reward": 0.35987500548362733,
100
- "reward_std": 0.2653418242931366,
101
- "rewards/env_goofspiel_reward/mean": 0.35987500548362733,
102
- "rewards/env_goofspiel_reward/std": 0.4154684245586395,
103
- "sampling/importance_sampling_ratio/max": 1.841845488548279,
104
- "sampling/importance_sampling_ratio/mean": 0.9473352670669556,
105
- "sampling/importance_sampling_ratio/min": 0.2071388103067875,
106
- "sampling/sampling_logp_difference/max": 1.491676390171051,
107
- "sampling/sampling_logp_difference/mean": 0.09024534374475479,
108
  "step": 15,
109
- "step_time": 5.279587671000627
110
  },
111
  {
112
- "clip_ratio/high_max": 0.023333333432674408,
113
- "clip_ratio/high_mean": 0.011666666716337204,
114
- "clip_ratio/low_mean": 0.012774122878909111,
115
  "clip_ratio/low_min": 0.0,
116
- "clip_ratio/region_mean": 0.024440789688378574,
117
  "completions/clipped_ratio": 0.0,
118
- "completions/max_length": 373.8,
119
- "completions/max_terminated_length": 373.8,
120
- "completions/mean_length": 279.5875,
121
- "completions/mean_terminated_length": 279.5875,
122
- "completions/min_length": 206.8,
123
- "completions/min_terminated_length": 206.8,
124
- "entropy": 0.3506437622010708,
125
  "epoch": 0.0032,
126
- "frac_reward_zero_std": 0.4625,
127
- "grad_norm": 0.12220246344804764,
128
- "kl": 0.23682632837444545,
129
  "learning_rate": 5.401775999999999e-06,
130
- "loss": -0.0002838193904608488,
131
- "num_tokens": 535747.0,
132
- "reward": 0.374812513589859,
133
- "reward_std": 0.24421700537204744,
134
- "rewards/env_goofspiel_reward/mean": 0.374812513589859,
135
- "rewards/env_goofspiel_reward/std": 0.39649735689163207,
136
- "sampling/importance_sampling_ratio/max": 2.3718762159347535,
137
- "sampling/importance_sampling_ratio/mean": 0.9897154331207275,
138
- "sampling/importance_sampling_ratio/min": 0.2213693767786026,
139
- "sampling/sampling_logp_difference/max": 2.0126638174057008,
140
- "sampling/sampling_logp_difference/mean": 0.10554229319095612,
141
  "step": 20,
142
- "step_time": 5.332370807199913
143
  },
144
  {
145
- "clip_ratio/high_max": 0.029443860985338688,
146
- "clip_ratio/high_mean": 0.016110819298774004,
147
- "clip_ratio/low_mean": 0.027831450570374727,
148
- "clip_ratio/low_min": 0.011572128906846047,
149
- "clip_ratio/region_mean": 0.04394227024167776,
150
  "completions/clipped_ratio": 0.0,
151
- "completions/max_length": 374.4,
152
- "completions/max_terminated_length": 374.4,
153
- "completions/mean_length": 301.0375,
154
- "completions/mean_terminated_length": 301.0375,
155
- "completions/min_length": 218.8,
156
- "completions/min_terminated_length": 218.8,
157
- "entropy": 0.3545067012310028,
158
  "epoch": 0.004,
159
- "frac_reward_zero_std": 0.35,
160
- "grad_norm": 0.09991537779569626,
161
- "kl": 0.6746378809213638,
162
  "learning_rate": 6.8232959999999994e-06,
163
- "loss": -0.0003991848789155483,
164
- "num_tokens": 673746.0,
165
- "reward": 0.34875001609325407,
166
- "reward_std": 0.3128947615623474,
167
- "rewards/env_goofspiel_reward/mean": 0.34875001609325407,
168
- "rewards/env_goofspiel_reward/std": 0.3975376784801483,
169
- "sampling/importance_sampling_ratio/max": 2.3523300170898436,
170
- "sampling/importance_sampling_ratio/mean": 0.9350853443145752,
171
- "sampling/importance_sampling_ratio/min": 0.03211224116384983,
172
- "sampling/sampling_logp_difference/max": 2.597071409225464,
173
- "sampling/sampling_logp_difference/mean": 0.14846422374248505,
174
  "step": 25,
175
- "step_time": 5.470841645199835
176
  },
177
  {
178
- "clip_ratio/high_max": 0.015093954280018806,
179
- "clip_ratio/high_mean": 0.007546977140009403,
180
- "clip_ratio/low_mean": 0.01916505442932248,
181
- "clip_ratio/low_min": 0.005625000037252903,
182
- "clip_ratio/region_mean": 0.026712031569331884,
183
  "completions/clipped_ratio": 0.0,
184
- "completions/max_length": 374.0,
185
- "completions/max_terminated_length": 374.0,
186
- "completions/mean_length": 283.44375,
187
- "completions/mean_terminated_length": 283.44375,
188
  "completions/min_length": 212.0,
189
  "completions/min_terminated_length": 212.0,
190
- "entropy": 0.3386394247412682,
191
  "epoch": 0.0048,
192
- "frac_reward_zero_std": 0.5625,
193
- "grad_norm": 0.05208470672369003,
194
- "kl": 3.1543088920414446,
195
  "learning_rate": 8.244816e-06,
196
- "loss": 3.84216895326972e-05,
197
- "num_tokens": 805457.0,
198
- "reward": 0.41250001192092894,
199
- "reward_std": 0.2121320277452469,
200
- "rewards/env_goofspiel_reward/mean": 0.41250001192092894,
201
- "rewards/env_goofspiel_reward/std": 0.39523468613624574,
202
- "sampling/importance_sampling_ratio/max": 2.1884907484054565,
203
- "sampling/importance_sampling_ratio/mean": 0.9641352295875549,
204
- "sampling/importance_sampling_ratio/min": 0.17558300793170928,
205
- "sampling/sampling_logp_difference/max": 1.910474991798401,
206
- "sampling/sampling_logp_difference/mean": 0.11390596330165863,
207
  "step": 30,
208
- "step_time": 5.248301958399679
209
  },
210
  {
211
- "clip_ratio/high_max": 0.021911457646638155,
212
- "clip_ratio/high_mean": 0.010955728823319077,
213
- "clip_ratio/low_mean": 0.016263545025140047,
214
- "clip_ratio/low_min": 0.002631578966975212,
215
- "clip_ratio/region_mean": 0.02721927403472364,
216
  "completions/clipped_ratio": 0.0,
217
- "completions/max_length": 366.0,
218
- "completions/max_terminated_length": 366.0,
219
- "completions/mean_length": 290.6625,
220
- "completions/mean_terminated_length": 290.6625,
221
  "completions/min_length": 212.0,
222
  "completions/min_terminated_length": 212.0,
223
- "entropy": 0.4240268304944038,
224
  "epoch": 0.0056,
225
- "frac_reward_zero_std": 0.4,
226
- "grad_norm": 0.08057750761508942,
227
- "kl": 1.8736468333750964,
228
  "learning_rate": 9.666336e-06,
229
- "loss": -2.150831278413534e-05,
230
- "num_tokens": 940033.0,
231
- "reward": 0.4274375081062317,
232
- "reward_std": 0.2758600294589996,
233
- "rewards/env_goofspiel_reward/mean": 0.4274375081062317,
234
- "rewards/env_goofspiel_reward/std": 0.4116846978664398,
235
- "sampling/importance_sampling_ratio/max": 2.5006643772125243,
236
- "sampling/importance_sampling_ratio/mean": 0.966198992729187,
237
- "sampling/importance_sampling_ratio/min": 0.07496144040487707,
238
- "sampling/sampling_logp_difference/max": 2.63707594871521,
239
- "sampling/sampling_logp_difference/mean": 0.1406691253185272,
240
  "step": 35,
241
- "step_time": 5.299385342999813
242
  },
243
  {
244
- "clip_ratio/high_max": 0.020319487527012826,
245
- "clip_ratio/high_mean": 0.010159743763506413,
246
- "clip_ratio/low_mean": 0.007234477158635855,
247
  "clip_ratio/low_min": 0.0,
248
- "clip_ratio/region_mean": 0.017394221015274526,
249
  "completions/clipped_ratio": 0.0,
250
- "completions/max_length": 374.2,
251
- "completions/max_terminated_length": 374.2,
252
- "completions/mean_length": 289.475,
253
- "completions/mean_terminated_length": 289.475,
254
- "completions/min_length": 207.0,
255
- "completions/min_terminated_length": 207.0,
256
- "entropy": 0.6166644155979156,
257
  "epoch": 0.0064,
258
- "frac_reward_zero_std": 0.5,
259
- "grad_norm": 0.051475733518600464,
260
- "kl": 0.6312531501054763,
261
  "learning_rate": 9.95063915881342e-06,
262
- "loss": 0.0008587016724050045,
263
- "num_tokens": 1074989.0,
264
- "reward": 0.2586875051259995,
265
- "reward_std": 0.2599501311779022,
266
- "rewards/env_goofspiel_reward/mean": 0.2586875051259995,
267
- "rewards/env_goofspiel_reward/std": 0.38087824583053587,
268
- "sampling/importance_sampling_ratio/max": 2.293054127693176,
269
- "sampling/importance_sampling_ratio/mean": 0.9133782982826233,
270
- "sampling/importance_sampling_ratio/min": 0.09586721286177635,
271
- "sampling/sampling_logp_difference/max": 1.7645570278167724,
272
- "sampling/sampling_logp_difference/mean": 0.1391677066683769,
273
  "step": 40,
274
- "step_time": 5.214609204999943
275
  },
276
  {
277
- "clip_ratio/high_max": 0.01441670972853899,
278
- "clip_ratio/high_mean": 0.007208354864269495,
279
- "clip_ratio/low_mean": 0.005737766716629266,
280
- "clip_ratio/low_min": 0.002631578966975212,
281
- "clip_ratio/region_mean": 0.012946121580898761,
282
- "completions/clipped_ratio": 0.0,
283
- "completions/max_length": 374.2,
284
- "completions/max_terminated_length": 374.2,
285
- "completions/mean_length": 294.13125,
286
- "completions/mean_terminated_length": 294.13125,
287
- "completions/min_length": 184.6,
288
- "completions/min_terminated_length": 184.6,
289
- "entropy": 0.7186032980680466,
290
  "epoch": 0.0072,
291
- "frac_reward_zero_std": 0.5125,
292
- "grad_norm": 0.10908176004886627,
293
- "kl": 0.9199723824858665,
294
  "learning_rate": 9.950635741493589e-06,
295
- "loss": 0.0010974571108818055,
296
- "num_tokens": 1211700.0,
297
- "reward": 0.20568750202655792,
298
- "reward_std": 0.21823083460330964,
299
- "rewards/env_goofspiel_reward/mean": 0.20568750202655792,
300
- "rewards/env_goofspiel_reward/std": 0.3572053849697113,
301
- "sampling/importance_sampling_ratio/max": 2.1740296363830565,
302
- "sampling/importance_sampling_ratio/mean": 0.8587659239768982,
303
- "sampling/importance_sampling_ratio/min": 0.19039739817380905,
304
- "sampling/sampling_logp_difference/max": 1.302869963645935,
305
- "sampling/sampling_logp_difference/mean": 0.159588959813118,
306
  "step": 45,
307
- "step_time": 5.3400724014001755
308
  },
309
  {
310
- "clip_ratio/high_max": 0.014580108411610126,
311
- "clip_ratio/high_mean": 0.008760642446577548,
312
- "clip_ratio/low_mean": 0.005718954280018807,
313
- "clip_ratio/low_min": 0.002777777798473835,
314
- "clip_ratio/region_mean": 0.014479596912860871,
315
  "completions/clipped_ratio": 0.0,
316
- "completions/max_length": 374.0,
317
- "completions/max_terminated_length": 374.0,
318
- "completions/mean_length": 298.4875,
319
- "completions/mean_terminated_length": 298.4875,
320
- "completions/min_length": 194.6,
321
- "completions/min_terminated_length": 194.6,
322
- "entropy": 0.7520321547985077,
323
  "epoch": 0.008,
324
- "frac_reward_zero_std": 0.55,
325
- "grad_norm": 0.07008689641952515,
326
- "kl": 1.4948164954781533,
327
  "learning_rate": 9.950629695468755e-06,
328
- "loss": 0.000722643407061696,
329
- "num_tokens": 1348707.0,
330
- "reward": 0.19093750715255736,
331
- "reward_std": 0.18605746924877167,
332
- "rewards/env_goofspiel_reward/mean": 0.19093750715255736,
333
- "rewards/env_goofspiel_reward/std": 0.32617470622062683,
334
- "sampling/importance_sampling_ratio/max": 2.441471576690674,
335
- "sampling/importance_sampling_ratio/mean": 0.8815865159034729,
336
- "sampling/importance_sampling_ratio/min": 0.047414033114910124,
337
- "sampling/sampling_logp_difference/max": 1.4149149417877198,
338
- "sampling/sampling_logp_difference/mean": 0.17731134295463563,
339
  "step": 50,
340
- "step_time": 5.3226749666000615
341
  },
342
  {
343
- "clip_ratio/high_max": 0.017320261523127555,
344
- "clip_ratio/high_mean": 0.008660130761563778,
345
- "clip_ratio/low_mean": 0.0015625,
346
  "clip_ratio/low_min": 0.0,
347
- "clip_ratio/region_mean": 0.010222630761563777,
348
  "completions/clipped_ratio": 0.0,
349
- "completions/max_length": 373.6,
350
- "completions/max_terminated_length": 373.6,
351
- "completions/mean_length": 284.1125,
352
- "completions/mean_terminated_length": 284.1125,
353
- "completions/min_length": 200.0,
354
- "completions/min_terminated_length": 200.0,
355
- "entropy": 0.7574372291564941,
356
  "epoch": 0.0088,
357
- "frac_reward_zero_std": 0.6125,
358
- "grad_norm": 0.05677078291773796,
359
- "kl": 0.9213189110159874,
360
  "learning_rate": 9.950621020743173e-06,
361
- "loss": 0.00019418969750404358,
362
- "num_tokens": 1481168.0,
363
- "reward": 0.16468750387430192,
364
- "reward_std": 0.15954096913337706,
365
- "rewards/env_goofspiel_reward/mean": 0.16468750387430192,
366
- "rewards/env_goofspiel_reward/std": 0.3025706380605698,
367
- "sampling/importance_sampling_ratio/max": 2.087395262718201,
368
- "sampling/importance_sampling_ratio/mean": 0.9323906660079956,
369
- "sampling/importance_sampling_ratio/min": 0.12477300018072128,
370
- "sampling/sampling_logp_difference/max": 1.5357290506362915,
371
- "sampling/sampling_logp_difference/mean": 0.16227305233478545,
372
  "step": 55,
373
- "step_time": 5.239984228400317
374
  },
375
  {
376
- "clip_ratio/high_max": 0.011312134563922882,
377
- "clip_ratio/high_mean": 0.005656067281961441,
378
- "clip_ratio/low_mean": 0.008540054224431515,
379
  "clip_ratio/low_min": 0.0,
380
- "clip_ratio/region_mean": 0.014196121599525213,
381
  "completions/clipped_ratio": 0.0,
382
- "completions/max_length": 374.8,
383
- "completions/max_terminated_length": 374.8,
384
- "completions/mean_length": 294.125,
385
- "completions/mean_terminated_length": 294.125,
386
- "completions/min_length": 218.4,
387
- "completions/min_terminated_length": 218.4,
388
- "entropy": 0.7390435010194778,
389
  "epoch": 0.0096,
390
- "frac_reward_zero_std": 0.5,
391
- "grad_norm": 0.059227459132671356,
392
- "kl": 0.8731714501976967,
393
  "learning_rate": 9.950609717322956e-06,
394
- "loss": 0.00016935726162046195,
395
- "num_tokens": 1616969.0,
396
- "reward": 0.22868750393390655,
397
- "reward_std": 0.23873692452907563,
398
- "rewards/env_goofspiel_reward/mean": 0.22868750393390655,
399
- "rewards/env_goofspiel_reward/std": 0.3541231632232666,
400
- "sampling/importance_sampling_ratio/max": 2.4580262422561647,
401
- "sampling/importance_sampling_ratio/mean": 1.027009415626526,
402
- "sampling/importance_sampling_ratio/min": 0.08994593024253845,
403
- "sampling/sampling_logp_difference/max": 1.2262043237686158,
404
- "sampling/sampling_logp_difference/mean": 0.15936054587364196,
405
  "step": 60,
406
- "step_time": 5.197383608799828
407
  },
408
  {
409
- "clip_ratio/high_max": 0.003125,
410
- "clip_ratio/high_mean": 0.0015625,
411
- "clip_ratio/low_mean": 0.004093567281961441,
412
  "clip_ratio/low_min": 0.0,
413
- "clip_ratio/region_mean": 0.005656067281961441,
414
  "completions/clipped_ratio": 0.0,
415
- "completions/max_length": 374.0,
416
- "completions/max_terminated_length": 374.0,
417
- "completions/mean_length": 272.7125,
418
- "completions/mean_terminated_length": 272.7125,
419
- "completions/min_length": 207.0,
420
- "completions/min_terminated_length": 207.0,
421
- "entropy": 0.641479243338108,
422
  "epoch": 0.0104,
423
- "frac_reward_zero_std": 0.5125,
424
- "grad_norm": 0.052345700562000275,
425
- "kl": 0.8409833669662475,
426
  "learning_rate": 9.950595785216067e-06,
427
- "loss": -0.0004354896955192089,
428
- "num_tokens": 1745735.0,
429
- "reward": 0.30350000858306886,
430
- "reward_std": 0.24943190813064575,
431
- "rewards/env_goofspiel_reward/mean": 0.30350000858306886,
432
- "rewards/env_goofspiel_reward/std": 0.43262303471565244,
433
- "sampling/importance_sampling_ratio/max": 2.308809924125671,
434
- "sampling/importance_sampling_ratio/mean": 0.9729154109954834,
435
- "sampling/importance_sampling_ratio/min": 0.2105877071619034,
436
- "sampling/sampling_logp_difference/max": 1.216427493095398,
437
- "sampling/sampling_logp_difference/mean": 0.14146182239055632,
438
  "step": 65,
439
- "step_time": 5.165435432399863
440
  },
441
  {
442
- "clip_ratio/high_max": 0.008823529444634914,
443
- "clip_ratio/high_mean": 0.004411764722317457,
444
- "clip_ratio/low_mean": 0.004421977140009403,
445
  "clip_ratio/low_min": 0.0,
446
- "clip_ratio/region_mean": 0.00883374186232686,
447
  "completions/clipped_ratio": 0.0,
448
- "completions/max_length": 374.0,
449
- "completions/max_terminated_length": 374.0,
450
- "completions/mean_length": 279.70625,
451
- "completions/mean_terminated_length": 279.70625,
452
- "completions/min_length": 194.6,
453
- "completions/min_terminated_length": 194.6,
454
- "entropy": 0.5012152880430222,
455
  "epoch": 0.0112,
456
- "frac_reward_zero_std": 0.325,
457
- "grad_norm": 0.08534521609544754,
458
- "kl": 0.8132658362388611,
459
  "learning_rate": 9.950579224432321e-06,
460
- "loss": -0.0004163400735706091,
461
- "num_tokens": 1877877.0,
462
- "reward": 0.4497500121593475,
463
- "reward_std": 0.32915821075439455,
464
- "rewards/env_goofspiel_reward/mean": 0.4497500121593475,
465
- "rewards/env_goofspiel_reward/std": 0.4289704501628876,
466
- "sampling/importance_sampling_ratio/max": 2.469445323944092,
467
- "sampling/importance_sampling_ratio/mean": 0.9713802933692932,
468
- "sampling/importance_sampling_ratio/min": 0.1933848649263382,
469
- "sampling/sampling_logp_difference/max": 1.2900412559509278,
470
- "sampling/sampling_logp_difference/mean": 0.12238389104604722,
471
  "step": 70,
472
- "step_time": 5.296977608999805
473
  },
474
  {
475
- "clip_ratio/high_max": 0.005409356765449047,
476
- "clip_ratio/high_mean": 0.0027046783827245234,
477
- "clip_ratio/low_mean": 0.011513618659228087,
478
  "clip_ratio/low_min": 0.0,
479
- "clip_ratio/region_mean": 0.01421829704195261,
480
  "completions/clipped_ratio": 0.0,
481
- "completions/max_length": 374.4,
482
- "completions/max_terminated_length": 374.4,
483
- "completions/mean_length": 292.43125,
484
- "completions/mean_terminated_length": 292.43125,
485
- "completions/min_length": 212.0,
486
- "completions/min_terminated_length": 212.0,
487
- "entropy": 0.39477833807468415,
488
  "epoch": 0.012,
489
- "frac_reward_zero_std": 0.45,
490
- "grad_norm": 0.08385973423719406,
491
- "kl": 0.9824723288416862,
492
  "learning_rate": 9.950560034983382e-06,
493
- "loss": -0.0006953636649996043,
494
- "num_tokens": 2012929.0,
495
- "reward": 0.5398125350475311,
496
- "reward_std": 0.2548236042261124,
497
- "rewards/env_goofspiel_reward/mean": 0.5398125350475311,
498
- "rewards/env_goofspiel_reward/std": 0.4067754566669464,
499
- "sampling/importance_sampling_ratio/max": 2.476490020751953,
500
- "sampling/importance_sampling_ratio/mean": 0.9818659067153931,
501
- "sampling/importance_sampling_ratio/min": 0.04677087515592575,
502
- "sampling/sampling_logp_difference/max": 1.8360216617584229,
503
- "sampling/sampling_logp_difference/mean": 0.1339985266327858,
504
  "step": 75,
505
- "step_time": 5.25195668339984
506
  },
507
  {
508
  "epoch": 0.012,
@@ -512,35 +512,596 @@
512
  "eval_clip_ratio/low_min": 0.0,
513
  "eval_clip_ratio/region_mean": 0.0,
514
  "eval_completions/clipped_ratio": 0.0,
515
- "eval_completions/max_length": 310.0,
516
- "eval_completions/max_terminated_length": 310.0,
517
- "eval_completions/mean_length": 274.9,
518
- "eval_completions/mean_terminated_length": 274.9,
519
- "eval_completions/min_length": 241.2,
520
- "eval_completions/min_terminated_length": 241.2,
521
- "eval_entropy": 0.2987076103687286,
522
- "eval_frac_reward_zero_std": 0.6,
523
- "eval_kl": 0.8131496548652649,
524
- "eval_loss": -0.0002675331197679043,
525
- "eval_num_tokens": 2012929.0,
526
- "eval_reward": 0.5700000166893006,
527
- "eval_reward_std": 0.2121320366859436,
528
- "eval_rewards/env_goofspiel_reward/mean": 0.5700000166893006,
529
- "eval_rewards/env_goofspiel_reward/std": 0.3252412259578705,
530
- "eval_runtime": 2.5856,
531
- "eval_samples_per_second": 3.868,
532
- "eval_sampling/importance_sampling_ratio/max": 1.6977968692779541,
533
- "eval_sampling/importance_sampling_ratio/mean": 0.947873055934906,
534
- "eval_sampling/importance_sampling_ratio/min": 0.2541299909353256,
535
- "eval_sampling/sampling_logp_difference/max": 1.3066397070884705,
536
- "eval_sampling/sampling_logp_difference/mean": 0.14956869557499886,
537
- "eval_steps_per_second": 1.16,
538
  "step": 75
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
539
  }
540
  ],
541
  "logging_steps": 5,
542
  "max_steps": 18750,
543
- "num_input_tokens_seen": 2012929,
544
  "num_train_epochs": 3,
545
  "save_steps": 500,
546
  "stateful_callbacks": {
@@ -550,13 +1111,13 @@
550
  "should_evaluate": false,
551
  "should_log": false,
552
  "should_save": true,
553
- "should_training_stop": false
554
  },
555
  "attributes": {}
556
  }
557
  },
558
  "total_flos": 0.0,
559
- "train_batch_size": 4,
560
  "trial_name": null,
561
  "trial_params": null
562
  }
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.02432,
6
  "eval_steps": 500,
7
+ "global_step": 152,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "clip_ratio/high_max": 0.0,
14
+ "clip_ratio/high_mean": 0.0,
15
+ "clip_ratio/low_mean": 0.0,
16
  "clip_ratio/low_min": 0.0,
17
+ "clip_ratio/region_mean": 0.0,
18
  "completions/clipped_ratio": 0.0,
19
+ "completions/max_length": 374.4,
20
+ "completions/max_terminated_length": 374.4,
21
+ "completions/mean_length": 293.55,
22
+ "completions/mean_terminated_length": 293.55,
23
+ "completions/min_length": 181.8,
24
+ "completions/min_terminated_length": 181.8,
25
+ "entropy": 0.7695603430271148,
26
  "epoch": 0.0008,
27
+ "frac_reward_zero_std": 0.675,
28
+ "grad_norm": 0.1396484375,
29
+ "kl": 0.007634526048786938,
30
  "learning_rate": 1.137216e-06,
31
+ "loss": 9.18293430004269e-05,
32
+ "num_tokens": 135798.0,
33
+ "reward": 0.051062504202127455,
34
+ "reward_std": 0.07574881687760353,
35
+ "rewards/env_goofspiel_reward/mean": 0.051062504202127455,
36
+ "rewards/env_goofspiel_reward/std": 0.195309117436409,
37
+ "sampling/importance_sampling_ratio/max": 1.719690752029419,
38
+ "sampling/importance_sampling_ratio/mean": 1.0014273881912232,
39
+ "sampling/importance_sampling_ratio/min": 0.48566103279590606,
40
+ "sampling/sampling_logp_difference/max": 0.7150738000869751,
41
+ "sampling/sampling_logp_difference/mean": 0.0632629081606865,
42
  "step": 5,
43
+ "step_time": 4.057331793400226
44
  },
45
  {
46
+ "clip_ratio/high_max": 0.0,
47
+ "clip_ratio/high_mean": 0.0,
48
+ "clip_ratio/low_mean": 0.0,
49
  "clip_ratio/low_min": 0.0,
50
+ "clip_ratio/region_mean": 0.0,
51
  "completions/clipped_ratio": 0.0,
52
+ "completions/max_length": 374.0,
53
+ "completions/max_terminated_length": 374.0,
54
+ "completions/mean_length": 290.5375,
55
+ "completions/mean_terminated_length": 290.5375,
56
+ "completions/min_length": 193.0,
57
+ "completions/min_terminated_length": 193.0,
58
+ "entropy": 0.6640829563140869,
59
  "epoch": 0.0016,
60
+ "frac_reward_zero_std": 0.7375,
61
+ "grad_norm": 0.296875,
62
+ "kl": 0.009508041350636631,
63
  "learning_rate": 2.5587359999999995e-06,
64
+ "loss": -5.3186528384685515e-05,
65
+ "num_tokens": 271318.0,
66
+ "reward": 0.05906250327825546,
67
+ "reward_std": 0.06461188569664955,
68
+ "rewards/env_goofspiel_reward/mean": 0.05906250327825546,
69
+ "rewards/env_goofspiel_reward/std": 0.16751175224781037,
70
+ "sampling/importance_sampling_ratio/max": 2.0673105001449583,
71
+ "sampling/importance_sampling_ratio/mean": 1.0338432788848877,
72
+ "sampling/importance_sampling_ratio/min": 0.6319645524024964,
73
+ "sampling/sampling_logp_difference/max": 0.6566181659698487,
74
+ "sampling/sampling_logp_difference/mean": 0.05573496893048287,
75
  "step": 10,
76
+ "step_time": 3.559387542600052
77
  },
78
  {
79
+ "clip_ratio/high_max": 0.0,
80
+ "clip_ratio/high_mean": 0.0,
81
+ "clip_ratio/low_mean": 0.0,
82
  "clip_ratio/low_min": 0.0,
83
+ "clip_ratio/region_mean": 0.0,
84
  "completions/clipped_ratio": 0.0,
85
+ "completions/max_length": 378.8,
86
+ "completions/max_terminated_length": 378.8,
87
+ "completions/mean_length": 285.15,
88
+ "completions/mean_terminated_length": 285.15,
89
+ "completions/min_length": 212.0,
90
+ "completions/min_terminated_length": 212.0,
91
+ "entropy": 0.5863359421491623,
92
  "epoch": 0.0024,
93
+ "frac_reward_zero_std": 0.8,
94
+ "grad_norm": 0.1650390625,
95
+ "kl": 0.05467459289357066,
96
  "learning_rate": 3.9802559999999995e-06,
97
+ "loss": 0.00011974496301263571,
98
+ "num_tokens": 403708.0,
99
+ "reward": 0.037000001792330296,
100
+ "reward_std": 0.0535633388790302,
101
+ "rewards/env_goofspiel_reward/mean": 0.037000001792330296,
102
+ "rewards/env_goofspiel_reward/std": 0.11216981350444258,
103
+ "sampling/importance_sampling_ratio/max": 1.5559733867645265,
104
+ "sampling/importance_sampling_ratio/mean": 0.9780893206596375,
105
+ "sampling/importance_sampling_ratio/min": 0.604676628112793,
106
+ "sampling/sampling_logp_difference/max": 0.4918365955352783,
107
+ "sampling/sampling_logp_difference/mean": 0.04973898231983185,
108
  "step": 15,
109
+ "step_time": 3.5593893944003865
110
  },
111
  {
112
+ "clip_ratio/high_max": 0.0,
113
+ "clip_ratio/high_mean": 0.0,
114
+ "clip_ratio/low_mean": 0.0,
115
  "clip_ratio/low_min": 0.0,
116
+ "clip_ratio/region_mean": 0.0,
117
  "completions/clipped_ratio": 0.0,
118
+ "completions/max_length": 374.0,
119
+ "completions/max_terminated_length": 374.0,
120
+ "completions/mean_length": 281.6375,
121
+ "completions/mean_terminated_length": 281.6375,
122
+ "completions/min_length": 212.0,
123
+ "completions/min_terminated_length": 212.0,
124
+ "entropy": 0.4126308411359787,
125
  "epoch": 0.0032,
126
+ "frac_reward_zero_std": 0.7375,
127
+ "grad_norm": 0.31640625,
128
+ "kl": 0.139003873616457,
129
  "learning_rate": 5.401775999999999e-06,
130
+ "loss": -0.00015898187411949037,
131
+ "num_tokens": 536243.0,
132
+ "reward": 0.06343750283122063,
133
+ "reward_std": 0.0905980572104454,
134
+ "rewards/env_goofspiel_reward/mean": 0.06343750283122063,
135
+ "rewards/env_goofspiel_reward/std": 0.19052477180957794,
136
+ "sampling/importance_sampling_ratio/max": 1.6631618976593017,
137
+ "sampling/importance_sampling_ratio/mean": 1.0170260667800903,
138
+ "sampling/importance_sampling_ratio/min": 0.643394160270691,
139
+ "sampling/sampling_logp_difference/max": 0.5536829710006714,
140
+ "sampling/sampling_logp_difference/mean": 0.03567908257246018,
141
  "step": 20,
142
+ "step_time": 3.513668759400389
143
  },
144
  {
145
+ "clip_ratio/high_max": 0.0,
146
+ "clip_ratio/high_mean": 0.0,
147
+ "clip_ratio/low_mean": 0.0,
148
+ "clip_ratio/low_min": 0.0,
149
+ "clip_ratio/region_mean": 0.0,
150
  "completions/clipped_ratio": 0.0,
151
+ "completions/max_length": 374.0,
152
+ "completions/max_terminated_length": 374.0,
153
+ "completions/mean_length": 300.975,
154
+ "completions/mean_terminated_length": 300.975,
155
+ "completions/min_length": 219.0,
156
+ "completions/min_terminated_length": 219.0,
157
+ "entropy": 0.5057466819882392,
158
  "epoch": 0.004,
159
+ "frac_reward_zero_std": 0.8625,
160
+ "grad_norm": 0.201171875,
161
+ "kl": 0.14662861209362746,
162
  "learning_rate": 6.8232959999999994e-06,
163
+ "loss": 0.00011511286720633507,
164
+ "num_tokens": 674802.0,
165
+ "reward": 0.056125002296175806,
166
+ "reward_std": 0.05851308616111055,
167
+ "rewards/env_goofspiel_reward/mean": 0.056125002296175806,
168
+ "rewards/env_goofspiel_reward/std": 0.1724803472403437,
169
+ "sampling/importance_sampling_ratio/max": 1.8632315158843995,
170
+ "sampling/importance_sampling_ratio/mean": 0.9876452207565307,
171
+ "sampling/importance_sampling_ratio/min": 0.5998193681240082,
172
+ "sampling/sampling_logp_difference/max": 0.5798031091690063,
173
+ "sampling/sampling_logp_difference/mean": 0.049172034859657286,
174
  "step": 25,
175
+ "step_time": 3.5666124442001093
176
  },
177
  {
178
+ "clip_ratio/high_max": 0.0,
179
+ "clip_ratio/high_mean": 0.0,
180
+ "clip_ratio/low_mean": 0.0,
181
+ "clip_ratio/low_min": 0.0,
182
+ "clip_ratio/region_mean": 0.0,
183
  "completions/clipped_ratio": 0.0,
184
+ "completions/max_length": 373.8,
185
+ "completions/max_terminated_length": 373.8,
186
+ "completions/mean_length": 283.69375,
187
+ "completions/mean_terminated_length": 283.69375,
188
  "completions/min_length": 212.0,
189
  "completions/min_terminated_length": 212.0,
190
+ "entropy": 0.5599821880459785,
191
  "epoch": 0.0048,
192
+ "frac_reward_zero_std": 0.775,
193
+ "grad_norm": 0.30859375,
194
+ "kl": 0.14428229751065375,
195
  "learning_rate": 8.244816e-06,
196
+ "loss": -3.07892682030797e-05,
197
+ "num_tokens": 807598.0,
198
+ "reward": 0.08625000044703483,
199
+ "reward_std": 0.10076271444559097,
200
+ "rewards/env_goofspiel_reward/mean": 0.08625000044703483,
201
+ "rewards/env_goofspiel_reward/std": 0.21401541233062743,
202
+ "sampling/importance_sampling_ratio/max": 1.4588377714157104,
203
+ "sampling/importance_sampling_ratio/mean": 0.9682739973068237,
204
+ "sampling/importance_sampling_ratio/min": 0.5809138238430023,
205
+ "sampling/sampling_logp_difference/max": 0.5705435633659363,
206
+ "sampling/sampling_logp_difference/mean": 0.04744169861078262,
207
  "step": 30,
208
+ "step_time": 3.545922210400022
209
  },
210
  {
211
+ "clip_ratio/high_max": 0.0,
212
+ "clip_ratio/high_mean": 0.0,
213
+ "clip_ratio/low_mean": 0.0,
214
+ "clip_ratio/low_min": 0.0,
215
+ "clip_ratio/region_mean": 0.0,
216
  "completions/clipped_ratio": 0.0,
217
+ "completions/max_length": 365.6,
218
+ "completions/max_terminated_length": 365.6,
219
+ "completions/mean_length": 290.56875,
220
+ "completions/mean_terminated_length": 290.56875,
221
  "completions/min_length": 212.0,
222
  "completions/min_terminated_length": 212.0,
223
+ "entropy": 0.5238839864730835,
224
  "epoch": 0.0056,
225
+ "frac_reward_zero_std": 0.9,
226
+ "grad_norm": 0.0791015625,
227
+ "kl": 0.22365115247666836,
228
  "learning_rate": 9.666336e-06,
229
+ "loss": -1.7423409735783936e-05,
230
+ "num_tokens": 941209.0,
231
+ "reward": 0.04125000163912773,
232
+ "reward_std": 0.04772970825433731,
233
+ "rewards/env_goofspiel_reward/mean": 0.04125000163912773,
234
+ "rewards/env_goofspiel_reward/std": 0.16298522651195527,
235
+ "sampling/importance_sampling_ratio/max": 1.4255825757980347,
236
+ "sampling/importance_sampling_ratio/mean": 0.973192298412323,
237
+ "sampling/importance_sampling_ratio/min": 0.5944858670234681,
238
+ "sampling/sampling_logp_difference/max": 0.4666349172592163,
239
+ "sampling/sampling_logp_difference/mean": 0.04362327083945274,
240
  "step": 35,
241
+ "step_time": 3.448241228200095
242
  },
243
  {
244
+ "clip_ratio/high_max": 0.0,
245
+ "clip_ratio/high_mean": 0.0,
246
+ "clip_ratio/low_mean": 0.0,
247
  "clip_ratio/low_min": 0.0,
248
+ "clip_ratio/region_mean": 0.0,
249
  "completions/clipped_ratio": 0.0,
250
+ "completions/max_length": 373.8,
251
+ "completions/max_terminated_length": 373.8,
252
+ "completions/mean_length": 289.4875,
253
+ "completions/mean_terminated_length": 289.4875,
254
+ "completions/min_length": 206.4,
255
+ "completions/min_terminated_length": 206.4,
256
+ "entropy": 0.5457443177700043,
257
  "epoch": 0.0064,
258
+ "frac_reward_zero_std": 0.8125,
259
+ "grad_norm": 0.05517578125,
260
+ "kl": 0.21872444674372674,
261
  "learning_rate": 9.95063915881342e-06,
262
+ "loss": 0.00012433364754542707,
263
+ "num_tokens": 1076167.0,
264
+ "reward": 0.05981250181794166,
265
+ "reward_std": 0.07451137900352478,
266
+ "rewards/env_goofspiel_reward/mean": 0.05981250181794166,
267
+ "rewards/env_goofspiel_reward/std": 0.1769598752260208,
268
+ "sampling/importance_sampling_ratio/max": 1.5738928318023682,
269
+ "sampling/importance_sampling_ratio/mean": 1.0145540714263916,
270
+ "sampling/importance_sampling_ratio/min": 0.6491193056106568,
271
+ "sampling/sampling_logp_difference/max": 0.46645350456237794,
272
+ "sampling/sampling_logp_difference/mean": 0.04598992168903351,
273
  "step": 40,
274
+ "step_time": 3.4551121715998305
275
  },
276
  {
277
+ "clip_ratio/high_max": 0.0,
278
+ "clip_ratio/high_mean": 0.0,
279
+ "clip_ratio/low_mean": 0.0,
280
+ "clip_ratio/low_min": 0.0,
281
+ "clip_ratio/region_mean": 0.0,
282
+ "completions/clipped_ratio": 0.00625,
283
+ "completions/max_length": 462.0,
284
+ "completions/max_terminated_length": 374.0,
285
+ "completions/mean_length": 300.95625,
286
+ "completions/mean_terminated_length": 297.8118957519531,
287
+ "completions/min_length": 212.0,
288
+ "completions/min_terminated_length": 212.0,
289
+ "entropy": 0.5893479824066162,
290
  "epoch": 0.0072,
291
+ "frac_reward_zero_std": 0.7875,
292
+ "grad_norm": 0.224609375,
293
+ "kl": 0.2417328185401857,
294
  "learning_rate": 9.950635741493589e-06,
295
+ "loss": 0.02391626685857773,
296
+ "num_tokens": 1212925.0,
297
+ "reward": 0.05993750244379044,
298
+ "reward_std": 0.08494120314717293,
299
+ "rewards/env_goofspiel_reward/mean": 0.05993750244379044,
300
+ "rewards/env_goofspiel_reward/std": 0.17770446538925172,
301
+ "sampling/importance_sampling_ratio/max": 2.169111466407776,
302
+ "sampling/importance_sampling_ratio/mean": 1.020458698272705,
303
+ "sampling/importance_sampling_ratio/min": 0.6463833510875702,
304
+ "sampling/sampling_logp_difference/max": 0.5775727272033692,
305
+ "sampling/sampling_logp_difference/mean": 0.04204721674323082,
306
  "step": 45,
307
+ "step_time": 4.194754163199832
308
  },
309
  {
310
+ "clip_ratio/high_max": 0.0,
311
+ "clip_ratio/high_mean": 0.0,
312
+ "clip_ratio/low_mean": 0.0,
313
+ "clip_ratio/low_min": 0.0,
314
+ "clip_ratio/region_mean": 0.0,
315
  "completions/clipped_ratio": 0.0,
316
+ "completions/max_length": 373.4,
317
+ "completions/max_terminated_length": 373.4,
318
+ "completions/mean_length": 299.31875,
319
+ "completions/mean_terminated_length": 299.31875,
320
+ "completions/min_length": 212.0,
321
+ "completions/min_terminated_length": 212.0,
322
+ "entropy": 0.4704709783196449,
323
  "epoch": 0.008,
324
+ "frac_reward_zero_std": 0.85,
325
+ "grad_norm": 0.103515625,
326
+ "kl": 0.4402174398303032,
327
  "learning_rate": 9.950629695468755e-06,
328
+ "loss": 0.00011011158348992467,
329
+ "num_tokens": 1350825.0,
330
+ "reward": 0.04487500190734863,
331
+ "reward_std": 0.063816387206316,
332
+ "rewards/env_goofspiel_reward/mean": 0.04487500190734863,
333
+ "rewards/env_goofspiel_reward/std": 0.17575940787792205,
334
+ "sampling/importance_sampling_ratio/max": 1.630899429321289,
335
+ "sampling/importance_sampling_ratio/mean": 0.9812780380249023,
336
+ "sampling/importance_sampling_ratio/min": 0.3998015284538269,
337
+ "sampling/sampling_logp_difference/max": 0.8225874900817871,
338
+ "sampling/sampling_logp_difference/mean": 0.057265565544366834,
339
  "step": 50,
340
+ "step_time": 3.4821775794000134
341
  },
342
  {
343
+ "clip_ratio/high_max": 0.0,
344
+ "clip_ratio/high_mean": 0.0,
345
+ "clip_ratio/low_mean": 0.0,
346
  "clip_ratio/low_min": 0.0,
347
+ "clip_ratio/region_mean": 0.0,
348
  "completions/clipped_ratio": 0.0,
349
+ "completions/max_length": 374.0,
350
+ "completions/max_terminated_length": 374.0,
351
+ "completions/mean_length": 284.0375,
352
+ "completions/mean_terminated_length": 284.0375,
353
+ "completions/min_length": 206.8,
354
+ "completions/min_terminated_length": 206.8,
355
+ "entropy": 0.23580820970237254,
356
  "epoch": 0.0088,
357
+ "frac_reward_zero_std": 0.7875,
358
+ "grad_norm": 0.4765625,
359
+ "kl": 0.6652570061385632,
360
  "learning_rate": 9.950621020743173e-06,
361
+ "loss": 8.13461490906775e-05,
362
+ "num_tokens": 1483464.0,
363
+ "reward": 0.05993750244379044,
364
+ "reward_std": 0.08494120314717293,
365
+ "rewards/env_goofspiel_reward/mean": 0.05993750244379044,
366
+ "rewards/env_goofspiel_reward/std": 0.18124795854091644,
367
+ "sampling/importance_sampling_ratio/max": 1.8012218236923219,
368
+ "sampling/importance_sampling_ratio/mean": 0.9974164485931396,
369
+ "sampling/importance_sampling_ratio/min": 0.6288759648799896,
370
+ "sampling/sampling_logp_difference/max": 0.6000619411468506,
371
+ "sampling/sampling_logp_difference/mean": 0.03408294580876827,
372
  "step": 55,
373
+ "step_time": 3.518262126200534
374
  },
375
  {
376
+ "clip_ratio/high_max": 0.0,
377
+ "clip_ratio/high_mean": 0.0,
378
+ "clip_ratio/low_mean": 0.0,
379
  "clip_ratio/low_min": 0.0,
380
+ "clip_ratio/region_mean": 0.0,
381
  "completions/clipped_ratio": 0.0,
382
+ "completions/max_length": 373.8,
383
+ "completions/max_terminated_length": 373.8,
384
+ "completions/mean_length": 294.53125,
385
+ "completions/mean_terminated_length": 294.53125,
386
+ "completions/min_length": 219.0,
387
+ "completions/min_terminated_length": 219.0,
388
+ "entropy": 0.06949742138385773,
389
  "epoch": 0.0096,
390
+ "frac_reward_zero_std": 0.8125,
391
+ "grad_norm": 0.033447265625,
392
+ "kl": 0.8152253001928329,
393
  "learning_rate": 9.950609717322956e-06,
394
+ "loss": 0.00011886359425261617,
395
+ "num_tokens": 1618570.0,
396
+ "reward": 0.07500000223517418,
397
+ "reward_std": 0.08485281318426133,
398
+ "rewards/env_goofspiel_reward/mean": 0.07500000223517418,
399
+ "rewards/env_goofspiel_reward/std": 0.1962749719619751,
400
+ "sampling/importance_sampling_ratio/max": 1.434102201461792,
401
+ "sampling/importance_sampling_ratio/mean": 1.0074862837791443,
402
+ "sampling/importance_sampling_ratio/min": 0.8809547781944275,
403
+ "sampling/sampling_logp_difference/max": 0.35270476043224336,
404
+ "sampling/sampling_logp_difference/mean": 0.007789037330076099,
405
  "step": 60,
406
+ "step_time": 3.4527530410003235
407
  },
408
  {
409
+ "clip_ratio/high_max": 0.0,
410
+ "clip_ratio/high_mean": 0.0,
411
+ "clip_ratio/low_mean": 0.0,
412
  "clip_ratio/low_min": 0.0,
413
+ "clip_ratio/region_mean": 0.0,
414
  "completions/clipped_ratio": 0.0,
415
+ "completions/max_length": 445.2,
416
+ "completions/max_terminated_length": 445.2,
417
+ "completions/mean_length": 325.83125,
418
+ "completions/mean_terminated_length": 325.83125,
419
+ "completions/min_length": 251.2,
420
+ "completions/min_terminated_length": 251.2,
421
+ "entropy": 0.08617601059377193,
422
  "epoch": 0.0104,
423
+ "frac_reward_zero_std": 0.875,
424
+ "grad_norm": 0.018798828125,
425
+ "kl": 0.7552784413099289,
426
  "learning_rate": 9.950595785216067e-06,
427
+ "loss": 0.00014277141308411957,
428
+ "num_tokens": 1755265.0,
429
+ "reward": 0.03750000149011612,
430
+ "reward_std": 0.0530330091714859,
431
+ "rewards/env_goofspiel_reward/mean": 0.03750000149011612,
432
+ "rewards/env_goofspiel_reward/std": 0.13009902238845825,
433
+ "sampling/importance_sampling_ratio/max": 1.3271136283874512,
434
+ "sampling/importance_sampling_ratio/mean": 1.004535722732544,
435
+ "sampling/importance_sampling_ratio/min": 0.7700567066669464,
436
+ "sampling/sampling_logp_difference/max": 0.3696352869272232,
437
+ "sampling/sampling_logp_difference/mean": 0.008786045084707438,
438
  "step": 65,
439
+ "step_time": 3.861200922199896
440
  },
441
  {
442
+ "clip_ratio/high_max": 0.0,
443
+ "clip_ratio/high_mean": 0.0,
444
+ "clip_ratio/low_mean": 0.0,
445
  "clip_ratio/low_min": 0.0,
446
+ "clip_ratio/region_mean": 0.0,
447
  "completions/clipped_ratio": 0.0,
448
+ "completions/max_length": 732.0,
449
+ "completions/max_terminated_length": 732.0,
450
+ "completions/mean_length": 544.625,
451
+ "completions/mean_terminated_length": 544.625,
452
+ "completions/min_length": 383.6,
453
+ "completions/min_terminated_length": 383.6,
454
+ "entropy": 0.33896628841757775,
455
  "epoch": 0.0112,
456
+ "frac_reward_zero_std": 0.75,
457
+ "grad_norm": 0.2109375,
458
+ "kl": 1.354549203068018,
459
  "learning_rate": 9.950579224432321e-06,
460
+ "loss": 0.00035492791794240476,
461
+ "num_tokens": 1929034.0,
462
+ "reward": 0.04277083389461041,
463
+ "reward_std": 0.061901307106018065,
464
+ "rewards/env_goofspiel_reward/mean": 0.04277083389461041,
465
+ "rewards/env_goofspiel_reward/std": 0.14327263236045837,
466
+ "sampling/importance_sampling_ratio/max": 1.457657814025879,
467
+ "sampling/importance_sampling_ratio/mean": 0.9943998098373413,
468
+ "sampling/importance_sampling_ratio/min": 0.6945780634880065,
469
+ "sampling/sampling_logp_difference/max": 0.450562047958374,
470
+ "sampling/sampling_logp_difference/mean": 0.02044728323817253,
471
  "step": 70,
472
+ "step_time": 5.568116331400051
473
  },
474
  {
475
+ "clip_ratio/high_max": 0.0,
476
+ "clip_ratio/high_mean": 0.0,
477
+ "clip_ratio/low_mean": 0.0,
478
  "clip_ratio/low_min": 0.0,
479
+ "clip_ratio/region_mean": 0.0,
480
  "completions/clipped_ratio": 0.0,
481
+ "completions/max_length": 731.6,
482
+ "completions/max_terminated_length": 731.6,
483
+ "completions/mean_length": 564.09375,
484
+ "completions/mean_terminated_length": 564.09375,
485
+ "completions/min_length": 408.0,
486
+ "completions/min_terminated_length": 408.0,
487
+ "entropy": 0.490242238342762,
488
  "epoch": 0.012,
489
+ "frac_reward_zero_std": 0.625,
490
+ "grad_norm": 0.17578125,
491
+ "kl": 0.48342139422893526,
492
  "learning_rate": 9.950560034983382e-06,
493
+ "loss": 0.00017674852861091495,
494
+ "num_tokens": 2107362.0,
495
+ "reward": 0.055687499977648255,
496
+ "reward_std": 0.07409889809787273,
497
+ "rewards/env_goofspiel_reward/mean": 0.055687499977648255,
498
+ "rewards/env_goofspiel_reward/std": 0.16855775713920593,
499
+ "sampling/importance_sampling_ratio/max": 1.2386622190475465,
500
+ "sampling/importance_sampling_ratio/mean": 0.9834899187088013,
501
+ "sampling/importance_sampling_ratio/min": 0.776045274734497,
502
+ "sampling/sampling_logp_difference/max": 0.22788455486297607,
503
+ "sampling/sampling_logp_difference/mean": 0.01976088173687458,
504
  "step": 75,
505
+ "step_time": 5.500796792399524
506
  },
507
  {
508
  "epoch": 0.012,
 
512
  "eval_clip_ratio/low_min": 0.0,
513
  "eval_clip_ratio/region_mean": 0.0,
514
  "eval_completions/clipped_ratio": 0.0,
515
+ "eval_completions/max_length": 614.4,
516
+ "eval_completions/max_terminated_length": 614.4,
517
+ "eval_completions/mean_length": 572.65,
518
+ "eval_completions/mean_terminated_length": 572.65,
519
+ "eval_completions/min_length": 531.8,
520
+ "eval_completions/min_terminated_length": 531.8,
521
+ "eval_entropy": 0.5134166538715362,
522
+ "eval_frac_reward_zero_std": 0.8,
523
+ "eval_kl": 0.5596067428588867,
524
+ "eval_loss": -1.7431222659070045e-05,
525
+ "eval_num_tokens": 2107362.0,
526
+ "eval_reward": 0.021166666876524687,
527
+ "eval_reward_std": 0.03134840028360486,
528
+ "eval_rewards/env_goofspiel_reward/mean": 0.021166666876524687,
529
+ "eval_rewards/env_goofspiel_reward/std": 0.04433333072811365,
530
+ "eval_runtime": 2.6749,
531
+ "eval_samples_per_second": 3.738,
532
+ "eval_sampling/importance_sampling_ratio/max": 1.0850860834121705,
533
+ "eval_sampling/importance_sampling_ratio/mean": 1.0015697360038758,
534
+ "eval_sampling/importance_sampling_ratio/min": 0.9188181281089782,
535
+ "eval_sampling/sampling_logp_difference/max": 0.10906529426574707,
536
+ "eval_sampling/sampling_logp_difference/mean": 0.017179742455482483,
537
+ "eval_steps_per_second": 1.122,
538
  "step": 75
539
+ },
540
+ {
541
+ "clip_ratio/high_max": 0.0,
542
+ "clip_ratio/high_mean": 0.0,
543
+ "clip_ratio/low_mean": 0.0,
544
+ "clip_ratio/low_min": 0.0,
545
+ "clip_ratio/region_mean": 0.0,
546
+ "completions/clipped_ratio": 0.0,
547
+ "completions/max_length": 733.2,
548
+ "completions/max_terminated_length": 733.2,
549
+ "completions/mean_length": 549.075,
550
+ "completions/mean_terminated_length": 549.075,
551
+ "completions/min_length": 395.8,
552
+ "completions/min_terminated_length": 395.8,
553
+ "entropy": 0.527910877764225,
554
+ "epoch": 0.0128,
555
+ "frac_reward_zero_std": 0.7,
556
+ "grad_norm": 0.255859375,
557
+ "kl": 0.47208923250436785,
558
+ "learning_rate": 9.95053821688277e-06,
559
+ "loss": 0.0007011178880929947,
560
+ "num_tokens": 2282715.0,
561
+ "reward": 0.05902083367109299,
562
+ "reward_std": 0.07722195237874985,
563
+ "rewards/env_goofspiel_reward/mean": 0.05902083367109299,
564
+ "rewards/env_goofspiel_reward/std": 0.15880293548107147,
565
+ "sampling/importance_sampling_ratio/max": 1.3604092836380004,
566
+ "sampling/importance_sampling_ratio/mean": 1.0046632289886475,
567
+ "sampling/importance_sampling_ratio/min": 0.7511004090309144,
568
+ "sampling/sampling_logp_difference/max": 0.3062611103057861,
569
+ "sampling/sampling_logp_difference/mean": 0.022707394510507583,
570
+ "step": 80,
571
+ "step_time": 5.533078667399787
572
+ },
573
+ {
574
+ "clip_ratio/high_max": 0.0,
575
+ "clip_ratio/high_mean": 0.0,
576
+ "clip_ratio/low_mean": 0.0,
577
+ "clip_ratio/low_min": 0.0,
578
+ "clip_ratio/region_mean": 0.0,
579
+ "completions/clipped_ratio": 0.0,
580
+ "completions/max_length": 732.2,
581
+ "completions/max_terminated_length": 732.2,
582
+ "completions/mean_length": 561.8875,
583
+ "completions/mean_terminated_length": 561.8875,
584
+ "completions/min_length": 408.0,
585
+ "completions/min_terminated_length": 408.0,
586
+ "entropy": 0.46153847873210907,
587
+ "epoch": 0.0136,
588
+ "frac_reward_zero_std": 0.825,
589
+ "grad_norm": 0.095703125,
590
+ "kl": 0.5197702750563622,
591
+ "learning_rate": 9.950513770145857e-06,
592
+ "loss": 0.00010828666854649782,
593
+ "num_tokens": 2459616.0,
594
+ "reward": 0.04585416615009308,
595
+ "reward_std": 0.057717590034008025,
596
+ "rewards/env_goofspiel_reward/mean": 0.04585416615009308,
597
+ "rewards/env_goofspiel_reward/std": 0.1589788019657135,
598
+ "sampling/importance_sampling_ratio/max": 1.5443368434906006,
599
+ "sampling/importance_sampling_ratio/mean": 1.0077369570732118,
600
+ "sampling/importance_sampling_ratio/min": 0.7487894654273987,
601
+ "sampling/sampling_logp_difference/max": 0.36205780506134033,
602
+ "sampling/sampling_logp_difference/mean": 0.021098615229129793,
603
+ "step": 85,
604
+ "step_time": 5.460692007999751
605
+ },
606
+ {
607
+ "clip_ratio/high_max": 0.0,
608
+ "clip_ratio/high_mean": 0.0,
609
+ "clip_ratio/low_mean": 0.0,
610
+ "clip_ratio/low_min": 0.0,
611
+ "clip_ratio/region_mean": 0.0,
612
+ "completions/clipped_ratio": 0.0,
613
+ "completions/max_length": 732.0,
614
+ "completions/max_terminated_length": 732.0,
615
+ "completions/mean_length": 581.36875,
616
+ "completions/mean_terminated_length": 581.36875,
617
+ "completions/min_length": 408.0,
618
+ "completions/min_terminated_length": 408.0,
619
+ "entropy": 0.48351355642080307,
620
+ "epoch": 0.0144,
621
+ "frac_reward_zero_std": 0.8375,
622
+ "grad_norm": 0.1630859375,
623
+ "kl": 0.6207657802850008,
624
+ "learning_rate": 9.950486694789862e-06,
625
+ "loss": 0.00018907544435933232,
626
+ "num_tokens": 2641146.0,
627
+ "reward": 0.026770833879709244,
628
+ "reward_std": 0.03856678232550621,
629
+ "rewards/env_goofspiel_reward/mean": 0.026770833879709244,
630
+ "rewards/env_goofspiel_reward/std": 0.10960558950901031,
631
+ "sampling/importance_sampling_ratio/max": 1.2647137641906738,
632
+ "sampling/importance_sampling_ratio/mean": 1.004696297645569,
633
+ "sampling/importance_sampling_ratio/min": 0.7099822998046875,
634
+ "sampling/sampling_logp_difference/max": 0.2817555904388428,
635
+ "sampling/sampling_logp_difference/mean": 0.022446612268686293,
636
+ "step": 90,
637
+ "step_time": 5.498646953999923
638
+ },
639
+ {
640
+ "clip_ratio/high_max": 0.0,
641
+ "clip_ratio/high_mean": 0.0,
642
+ "clip_ratio/low_mean": 0.0,
643
+ "clip_ratio/low_min": 0.0,
644
+ "clip_ratio/region_mean": 0.0,
645
+ "completions/clipped_ratio": 0.0,
646
+ "completions/max_length": 732.0,
647
+ "completions/max_terminated_length": 732.0,
648
+ "completions/mean_length": 549.825,
649
+ "completions/mean_terminated_length": 549.825,
650
+ "completions/min_length": 408.0,
651
+ "completions/min_terminated_length": 408.0,
652
+ "entropy": 0.5179941833019257,
653
+ "epoch": 0.0152,
654
+ "frac_reward_zero_std": 0.8125,
655
+ "grad_norm": 0.0908203125,
656
+ "kl": 0.46866228580474856,
657
+ "learning_rate": 9.95045699083386e-06,
658
+ "loss": 0.00028545891400426625,
659
+ "num_tokens": 2815323.0,
660
+ "reward": 0.03772916682064533,
661
+ "reward_std": 0.05388742834329605,
662
+ "rewards/env_goofspiel_reward/mean": 0.03772916682064533,
663
+ "rewards/env_goofspiel_reward/std": 0.13617317676544188,
664
+ "sampling/importance_sampling_ratio/max": 1.2603100776672362,
665
+ "sampling/importance_sampling_ratio/mean": 1.0017661929130555,
666
+ "sampling/importance_sampling_ratio/min": 0.7478124976158143,
667
+ "sampling/sampling_logp_difference/max": 0.25655815601348875,
668
+ "sampling/sampling_logp_difference/mean": 0.023887046799063682,
669
+ "step": 95,
670
+ "step_time": 5.659158172999559
671
+ },
672
+ {
673
+ "clip_ratio/high_max": 0.0,
674
+ "clip_ratio/high_mean": 0.0,
675
+ "clip_ratio/low_mean": 0.0,
676
+ "clip_ratio/low_min": 0.0,
677
+ "clip_ratio/region_mean": 0.0,
678
+ "completions/clipped_ratio": 0.0,
679
+ "completions/max_length": 1003.6,
680
+ "completions/max_terminated_length": 1003.6,
681
+ "completions/mean_length": 754.59375,
682
+ "completions/mean_terminated_length": 754.59375,
683
+ "completions/min_length": 550.8,
684
+ "completions/min_terminated_length": 550.8,
685
+ "entropy": 0.5820856660604476,
686
+ "epoch": 0.016,
687
+ "frac_reward_zero_std": 0.8125,
688
+ "grad_norm": 0.171875,
689
+ "kl": 0.4048814922571182,
690
+ "learning_rate": 9.950424658298776e-06,
691
+ "loss": 0.0002957838121801615,
692
+ "num_tokens": 3022802.0,
693
+ "reward": 0.03360416684299707,
694
+ "reward_std": 0.0420433908700943,
695
+ "rewards/env_goofspiel_reward/mean": 0.03360416684299707,
696
+ "rewards/env_goofspiel_reward/std": 0.12295188903808593,
697
+ "sampling/importance_sampling_ratio/max": 1.3984624147415161,
698
+ "sampling/importance_sampling_ratio/mean": 0.980130672454834,
699
+ "sampling/importance_sampling_ratio/min": 0.6590921759605408,
700
+ "sampling/sampling_logp_difference/max": 0.3471600294113159,
701
+ "sampling/sampling_logp_difference/mean": 0.02802230753004551,
702
+ "step": 100,
703
+ "step_time": 7.31073631659965
704
+ },
705
+ {
706
+ "clip_ratio/high_max": 0.0,
707
+ "clip_ratio/high_mean": 0.0,
708
+ "clip_ratio/low_mean": 0.0,
709
+ "clip_ratio/low_min": 0.0,
710
+ "clip_ratio/region_mean": 0.0,
711
+ "completions/clipped_ratio": 0.0,
712
+ "completions/max_length": 1075.0,
713
+ "completions/max_terminated_length": 1075.0,
714
+ "completions/mean_length": 814.73125,
715
+ "completions/mean_terminated_length": 814.73125,
716
+ "completions/min_length": 590.0,
717
+ "completions/min_terminated_length": 590.0,
718
+ "entropy": 0.6229563593864441,
719
+ "epoch": 0.0168,
720
+ "frac_reward_zero_std": 0.775,
721
+ "grad_norm": 0.09326171875,
722
+ "kl": 0.4244683228433132,
723
+ "learning_rate": 9.950389697207388e-06,
724
+ "loss": 0.00020875066984444858,
725
+ "num_tokens": 3239290.0,
726
+ "reward": 0.038875000365078446,
727
+ "reward_std": 0.049674246832728385,
728
+ "rewards/env_goofspiel_reward/mean": 0.038875000365078446,
729
+ "rewards/env_goofspiel_reward/std": 0.11504672318696976,
730
+ "sampling/importance_sampling_ratio/max": 1.3492220401763917,
731
+ "sampling/importance_sampling_ratio/mean": 0.9891064882278442,
732
+ "sampling/importance_sampling_ratio/min": 0.5561659098671476,
733
+ "sampling/sampling_logp_difference/max": 4.213754487037659,
734
+ "sampling/sampling_logp_difference/mean": 0.044166411831974985,
735
+ "step": 105,
736
+ "step_time": 7.745885893800368
737
+ },
738
+ {
739
+ "clip_ratio/high_max": 0.0,
740
+ "clip_ratio/high_mean": 0.0,
741
+ "clip_ratio/low_mean": 0.0,
742
+ "clip_ratio/low_min": 0.0,
743
+ "clip_ratio/region_mean": 0.0,
744
+ "completions/clipped_ratio": 0.0,
745
+ "completions/max_length": 1075.6,
746
+ "completions/max_terminated_length": 1075.6,
747
+ "completions/mean_length": 814.6375,
748
+ "completions/mean_terminated_length": 814.6375,
749
+ "completions/min_length": 590.0,
750
+ "completions/min_terminated_length": 590.0,
751
+ "entropy": 0.6465200364589692,
752
+ "epoch": 0.0176,
753
+ "frac_reward_zero_std": 0.7875,
754
+ "grad_norm": 0.007080078125,
755
+ "kl": 0.3995703622698784,
756
+ "learning_rate": 9.950352107584324e-06,
757
+ "loss": 0.0007265920285135508,
758
+ "num_tokens": 3456558.0,
759
+ "reward": 0.023437499813735486,
760
+ "reward_std": 0.03473661988973618,
761
+ "rewards/env_goofspiel_reward/mean": 0.023437499813735486,
762
+ "rewards/env_goofspiel_reward/std": 0.0773700624704361,
763
+ "sampling/importance_sampling_ratio/max": 1.4152228832244873,
764
+ "sampling/importance_sampling_ratio/mean": 0.9911394953727722,
765
+ "sampling/importance_sampling_ratio/min": 0.4911388456960605,
766
+ "sampling/sampling_logp_difference/max": 4.677203369140625,
767
+ "sampling/sampling_logp_difference/mean": 0.04649800434708595,
768
+ "step": 110,
769
+ "step_time": 7.710595274399202
770
+ },
771
+ {
772
+ "clip_ratio/high_max": 0.0,
773
+ "clip_ratio/high_mean": 0.0,
774
+ "clip_ratio/low_mean": 0.0,
775
+ "clip_ratio/low_min": 0.0,
776
+ "clip_ratio/region_mean": 0.0,
777
+ "completions/clipped_ratio": 0.0,
778
+ "completions/max_length": 1075.0,
779
+ "completions/max_terminated_length": 1075.0,
780
+ "completions/mean_length": 831.425,
781
+ "completions/mean_terminated_length": 831.425,
782
+ "completions/min_length": 590.0,
783
+ "completions/min_terminated_length": 590.0,
784
+ "entropy": 0.6319158881902694,
785
+ "epoch": 0.0184,
786
+ "frac_reward_zero_std": 0.7625,
787
+ "grad_norm": 0.30078125,
788
+ "kl": 0.4417851775884628,
789
+ "learning_rate": 9.950311889456064e-06,
790
+ "loss": -0.00014657191932201385,
791
+ "num_tokens": 3677316.0,
792
+ "reward": 0.03468749914318323,
793
+ "reward_std": 0.0499394167214632,
794
+ "rewards/env_goofspiel_reward/mean": 0.03468749914318323,
795
+ "rewards/env_goofspiel_reward/std": 0.10438641458749771,
796
+ "sampling/importance_sampling_ratio/max": 1.4034383773803711,
797
+ "sampling/importance_sampling_ratio/mean": 1.003822433948517,
798
+ "sampling/importance_sampling_ratio/min": 0.7400717735290527,
799
+ "sampling/sampling_logp_difference/max": 0.2535316705703735,
800
+ "sampling/sampling_logp_difference/mean": 0.02623956575989723,
801
+ "step": 115,
802
+ "step_time": 7.7139906279999195
803
+ },
804
+ {
805
+ "clip_ratio/high_max": 0.0,
806
+ "clip_ratio/high_mean": 0.0,
807
+ "clip_ratio/low_mean": 0.0,
808
+ "clip_ratio/low_min": 0.0,
809
+ "clip_ratio/region_mean": 0.0,
810
+ "completions/clipped_ratio": 0.0,
811
+ "completions/max_length": 1075.2,
812
+ "completions/max_terminated_length": 1075.2,
813
+ "completions/mean_length": 809.6125,
814
+ "completions/mean_terminated_length": 809.6125,
815
+ "completions/min_length": 578.0,
816
+ "completions/min_terminated_length": 578.0,
817
+ "entropy": 0.6280775606632233,
818
+ "epoch": 0.0192,
819
+ "frac_reward_zero_std": 0.825,
820
+ "grad_norm": 0.1328125,
821
+ "kl": 0.39189435467123984,
822
+ "learning_rate": 9.950269042850943e-06,
823
+ "loss": 0.00020067014265805482,
824
+ "num_tokens": 3893277.0,
825
+ "reward": 0.028249999321997166,
826
+ "reward_std": 0.0342946782708168,
827
+ "rewards/env_goofspiel_reward/mean": 0.028249999321997166,
828
+ "rewards/env_goofspiel_reward/std": 0.10060491263866425,
829
+ "sampling/importance_sampling_ratio/max": 1.449436855316162,
830
+ "sampling/importance_sampling_ratio/mean": 0.9966808676719665,
831
+ "sampling/importance_sampling_ratio/min": 0.6293053507804871,
832
+ "sampling/sampling_logp_difference/max": 0.3306601524353027,
833
+ "sampling/sampling_logp_difference/mean": 0.027687131240963937,
834
+ "step": 120,
835
+ "step_time": 7.60252943919968
836
+ },
837
+ {
838
+ "clip_ratio/high_max": 0.0,
839
+ "clip_ratio/high_mean": 0.0,
840
+ "clip_ratio/low_mean": 0.0,
841
+ "clip_ratio/low_min": 0.0,
842
+ "clip_ratio/region_mean": 0.0,
843
+ "completions/clipped_ratio": 0.0,
844
+ "completions/max_length": 1075.0,
845
+ "completions/max_terminated_length": 1075.0,
846
+ "completions/mean_length": 831.9125,
847
+ "completions/mean_terminated_length": 831.9125,
848
+ "completions/min_length": 590.0,
849
+ "completions/min_terminated_length": 590.0,
850
+ "entropy": 0.6203011125326157,
851
+ "epoch": 0.02,
852
+ "frac_reward_zero_std": 0.8,
853
+ "grad_norm": 0.0888671875,
854
+ "kl": 0.48651044964790346,
855
+ "learning_rate": 9.95022356779914e-06,
856
+ "loss": -4.498030175454914e-05,
857
+ "num_tokens": 4114505.0,
858
+ "reward": 0.0279999990016222,
859
+ "reward_std": 0.04065863937139511,
860
+ "rewards/env_goofspiel_reward/mean": 0.0279999990016222,
861
+ "rewards/env_goofspiel_reward/std": 0.09859532788395882,
862
+ "sampling/importance_sampling_ratio/max": 1.3767565965652466,
863
+ "sampling/importance_sampling_ratio/mean": 1.0073092341423036,
864
+ "sampling/importance_sampling_ratio/min": 0.6863486647605896,
865
+ "sampling/sampling_logp_difference/max": 0.2979677677154541,
866
+ "sampling/sampling_logp_difference/mean": 0.026996534690260886,
867
+ "step": 125,
868
+ "step_time": 7.651589208400037
869
+ },
870
+ {
871
+ "clip_ratio/high_max": 0.0,
872
+ "clip_ratio/high_mean": 0.0,
873
+ "clip_ratio/low_mean": 0.0,
874
+ "clip_ratio/low_min": 0.0,
875
+ "clip_ratio/region_mean": 0.0,
876
+ "completions/clipped_ratio": 0.0,
877
+ "completions/max_length": 1206.4,
878
+ "completions/max_terminated_length": 1206.4,
879
+ "completions/mean_length": 933.89375,
880
+ "completions/mean_terminated_length": 933.89375,
881
+ "completions/min_length": 657.2,
882
+ "completions/min_terminated_length": 657.2,
883
+ "entropy": 0.6028641879558563,
884
+ "epoch": 0.0208,
885
+ "frac_reward_zero_std": 0.85,
886
+ "grad_norm": 0.12890625,
887
+ "kl": 0.3837214097380638,
888
+ "learning_rate": 9.950175464332696e-06,
889
+ "loss": 3.275293856859207e-05,
890
+ "num_tokens": 4351587.0,
891
+ "reward": 0.02824999988079071,
892
+ "reward_std": 0.040128308534622195,
893
+ "rewards/env_goofspiel_reward/mean": 0.02824999988079071,
894
+ "rewards/env_goofspiel_reward/std": 0.107171730697155,
895
+ "sampling/importance_sampling_ratio/max": 1.4337808609008789,
896
+ "sampling/importance_sampling_ratio/mean": 1.0062252044677735,
897
+ "sampling/importance_sampling_ratio/min": 0.7049606084823609,
898
+ "sampling/sampling_logp_difference/max": 0.29388277530670165,
899
+ "sampling/sampling_logp_difference/mean": 0.027071699127554895,
900
+ "step": 130,
901
+ "step_time": 8.72059853139981
902
+ },
903
+ {
904
+ "clip_ratio/high_max": 0.0,
905
+ "clip_ratio/high_mean": 0.0,
906
+ "clip_ratio/low_mean": 0.0,
907
+ "clip_ratio/low_min": 0.0,
908
+ "clip_ratio/region_mean": 0.0,
909
+ "completions/clipped_ratio": 0.0,
910
+ "completions/max_length": 1403.2,
911
+ "completions/max_terminated_length": 1403.2,
912
+ "completions/mean_length": 1043.53125,
913
+ "completions/mean_terminated_length": 1043.53125,
914
+ "completions/min_length": 758.0,
915
+ "completions/min_terminated_length": 758.0,
916
+ "entropy": 0.5716616868972778,
917
+ "epoch": 0.0216,
918
+ "frac_reward_zero_std": 0.825,
919
+ "grad_norm": 0.11865234375,
920
+ "kl": 0.3439621731638908,
921
+ "learning_rate": 9.950124732485496e-06,
922
+ "loss": -0.00015461102593690158,
923
+ "num_tokens": 4604604.0,
924
+ "reward": 0.02993750013411045,
925
+ "reward_std": 0.04251479506492615,
926
+ "rewards/env_goofspiel_reward/mean": 0.02993750013411045,
927
+ "rewards/env_goofspiel_reward/std": 0.10386608839035034,
928
+ "sampling/importance_sampling_ratio/max": 1.560789942741394,
929
+ "sampling/importance_sampling_ratio/mean": 1.0229114413261413,
930
+ "sampling/importance_sampling_ratio/min": 0.656338346004486,
931
+ "sampling/sampling_logp_difference/max": 0.33734931945800783,
932
+ "sampling/sampling_logp_difference/mean": 0.026270415633916855,
933
+ "step": 135,
934
+ "step_time": 10.42263705259993
935
+ },
936
+ {
937
+ "clip_ratio/high_max": 0.0,
938
+ "clip_ratio/high_mean": 0.0,
939
+ "clip_ratio/low_mean": 0.0,
940
+ "clip_ratio/low_min": 0.0,
941
+ "clip_ratio/region_mean": 0.0,
942
+ "completions/clipped_ratio": 0.0,
943
+ "completions/max_length": 1404.2,
944
+ "completions/max_terminated_length": 1404.2,
945
+ "completions/mean_length": 1106.9125,
946
+ "completions/mean_terminated_length": 1106.9125,
947
+ "completions/min_length": 746.4,
948
+ "completions/min_terminated_length": 746.4,
949
+ "entropy": 0.6015262037515641,
950
+ "epoch": 0.0224,
951
+ "frac_reward_zero_std": 0.7875,
952
+ "grad_norm": 0.1142578125,
953
+ "kl": 0.4554178059101105,
954
+ "learning_rate": 9.95007137229328e-06,
955
+ "loss": 4.9450411461293696e-05,
956
+ "num_tokens": 4870794.0,
957
+ "reward": 0.02393750089686364,
958
+ "reward_std": 0.03491339806932956,
959
+ "rewards/env_goofspiel_reward/mean": 0.02393750089686364,
960
+ "rewards/env_goofspiel_reward/std": 0.07218290558084846,
961
+ "sampling/importance_sampling_ratio/max": 1.4548166275024415,
962
+ "sampling/importance_sampling_ratio/mean": 0.9814130544662476,
963
+ "sampling/importance_sampling_ratio/min": 0.6002501428127289,
964
+ "sampling/sampling_logp_difference/max": 0.33309857845306395,
965
+ "sampling/sampling_logp_difference/mean": 0.028818363696336745,
966
+ "step": 140,
967
+ "step_time": 10.392815717999474
968
+ },
969
+ {
970
+ "clip_ratio/high_max": 0.0,
971
+ "clip_ratio/high_mean": 0.0,
972
+ "clip_ratio/low_mean": 0.0,
973
+ "clip_ratio/low_min": 0.0,
974
+ "clip_ratio/region_mean": 0.0,
975
+ "completions/clipped_ratio": 0.0,
976
+ "completions/max_length": 1403.4,
977
+ "completions/max_terminated_length": 1403.4,
978
+ "completions/mean_length": 1064.275,
979
+ "completions/mean_terminated_length": 1064.275,
980
+ "completions/min_length": 746.6,
981
+ "completions/min_terminated_length": 746.6,
982
+ "entropy": 0.5568872570991517,
983
+ "epoch": 0.0232,
984
+ "frac_reward_zero_std": 0.775,
985
+ "grad_norm": 0.2041015625,
986
+ "kl": 0.3555284239351749,
987
+ "learning_rate": 9.950015383793636e-06,
988
+ "loss": -0.00010267798788845539,
989
+ "num_tokens": 5127626.0,
990
+ "reward": 0.03343750163912773,
991
+ "reward_std": 0.04799487330019474,
992
+ "rewards/env_goofspiel_reward/mean": 0.03343750163912773,
993
+ "rewards/env_goofspiel_reward/std": 0.10812449753284455,
994
+ "sampling/importance_sampling_ratio/max": 1.587186908721924,
995
+ "sampling/importance_sampling_ratio/mean": 1.0198248624801636,
996
+ "sampling/importance_sampling_ratio/min": 0.7297749638557434,
997
+ "sampling/sampling_logp_difference/max": 0.2695728540420532,
998
+ "sampling/sampling_logp_difference/mean": 0.0254237774759531,
999
+ "step": 145,
1000
+ "step_time": 10.32607021879976
1001
+ },
1002
+ {
1003
+ "clip_ratio/high_max": 0.0,
1004
+ "clip_ratio/high_mean": 0.0,
1005
+ "clip_ratio/low_mean": 0.0,
1006
+ "clip_ratio/low_min": 0.0,
1007
+ "clip_ratio/region_mean": 0.0,
1008
+ "completions/clipped_ratio": 0.0,
1009
+ "completions/max_length": 1400.4,
1010
+ "completions/max_terminated_length": 1400.4,
1011
+ "completions/mean_length": 1040.89375,
1012
+ "completions/mean_terminated_length": 1040.89375,
1013
+ "completions/min_length": 758.0,
1014
+ "completions/min_terminated_length": 758.0,
1015
+ "entropy": 0.5438663050532341,
1016
+ "epoch": 0.024,
1017
+ "frac_reward_zero_std": 0.8,
1018
+ "grad_norm": 0.11181640625,
1019
+ "kl": 0.313924690335989,
1020
+ "learning_rate": 9.949956767026006e-06,
1021
+ "loss": 0.0002234043786302209,
1022
+ "num_tokens": 5379923.0,
1023
+ "reward": 0.03181250132620335,
1024
+ "reward_std": 0.0451664462685585,
1025
+ "rewards/env_goofspiel_reward/mean": 0.03181250132620335,
1026
+ "rewards/env_goofspiel_reward/std": 0.10608797073364258,
1027
+ "sampling/importance_sampling_ratio/max": 1.3631083726882935,
1028
+ "sampling/importance_sampling_ratio/mean": 1.0006531119346618,
1029
+ "sampling/importance_sampling_ratio/min": 0.691138219833374,
1030
+ "sampling/sampling_logp_difference/max": 0.26457092761993406,
1031
+ "sampling/sampling_logp_difference/mean": 0.024369171261787413,
1032
+ "step": 150,
1033
+ "step_time": 10.337828036600513
1034
+ },
1035
+ {
1036
+ "epoch": 0.024,
1037
+ "eval_clip_ratio/high_max": 0.0,
1038
+ "eval_clip_ratio/high_mean": 0.0,
1039
+ "eval_clip_ratio/low_mean": 0.0,
1040
+ "eval_clip_ratio/low_min": 0.0,
1041
+ "eval_clip_ratio/region_mean": 0.0,
1042
+ "eval_completions/clipped_ratio": 0.0,
1043
+ "eval_completions/max_length": 1167.6,
1044
+ "eval_completions/max_terminated_length": 1167.6,
1045
+ "eval_completions/mean_length": 1085.75,
1046
+ "eval_completions/mean_terminated_length": 1085.75,
1047
+ "eval_completions/min_length": 1003.6,
1048
+ "eval_completions/min_terminated_length": 1003.6,
1049
+ "eval_entropy": 0.5619663238525391,
1050
+ "eval_frac_reward_zero_std": 0.9,
1051
+ "eval_kl": 0.4370845973491669,
1052
+ "eval_loss": -0.00014827963605057448,
1053
+ "eval_num_tokens": 5379923.0,
1054
+ "eval_reward": -0.0004999999888241291,
1055
+ "eval_reward_std": 0.0007071067579090595,
1056
+ "eval_rewards/env_goofspiel_reward/mean": -0.0004999999888241291,
1057
+ "eval_rewards/env_goofspiel_reward/std": 0.0009999999776482583,
1058
+ "eval_runtime": 4.3907,
1059
+ "eval_samples_per_second": 2.278,
1060
+ "eval_sampling/importance_sampling_ratio/max": 1.2002264738082886,
1061
+ "eval_sampling/importance_sampling_ratio/mean": 1.0190080761909486,
1062
+ "eval_sampling/importance_sampling_ratio/min": 0.8775287628173828,
1063
+ "eval_sampling/sampling_logp_difference/max": 0.19612762928009034,
1064
+ "eval_sampling/sampling_logp_difference/mean": 0.025080177932977676,
1065
+ "eval_steps_per_second": 0.683,
1066
+ "step": 150
1067
+ },
1068
+ {
1069
+ "epoch": 0.02432,
1070
+ "eval_clip_ratio/high_max": 0.0,
1071
+ "eval_clip_ratio/high_mean": 0.0,
1072
+ "eval_clip_ratio/low_mean": 0.0,
1073
+ "eval_clip_ratio/low_min": 0.0,
1074
+ "eval_clip_ratio/region_mean": 0.0,
1075
+ "eval_completions/clipped_ratio": 0.0,
1076
+ "eval_completions/max_length": 1166.0,
1077
+ "eval_completions/max_terminated_length": 1166.0,
1078
+ "eval_completions/mean_length": 1082.75,
1079
+ "eval_completions/mean_terminated_length": 1082.75,
1080
+ "eval_completions/min_length": 999.8,
1081
+ "eval_completions/min_terminated_length": 999.8,
1082
+ "eval_entropy": 0.5707040071487427,
1083
+ "eval_frac_reward_zero_std": 1.0,
1084
+ "eval_kl": 0.40999372601509093,
1085
+ "eval_loss": 3.846790423267521e-05,
1086
+ "eval_num_tokens": 5476501.0,
1087
+ "eval_reward": 0.06000000238418579,
1088
+ "eval_reward_std": 0.0,
1089
+ "eval_rewards/env_goofspiel_reward/mean": 0.06000000238418579,
1090
+ "eval_rewards/env_goofspiel_reward/std": 0.06928203105926514,
1091
+ "eval_runtime": 4.0973,
1092
+ "eval_samples_per_second": 2.441,
1093
+ "eval_sampling/importance_sampling_ratio/max": 1.1712106943130494,
1094
+ "eval_sampling/importance_sampling_ratio/mean": 1.011259377002716,
1095
+ "eval_sampling/importance_sampling_ratio/min": 0.8288854002952576,
1096
+ "eval_sampling/sampling_logp_difference/max": 0.17412886619567872,
1097
+ "eval_sampling/sampling_logp_difference/mean": 0.024767952039837837,
1098
+ "eval_steps_per_second": 0.732,
1099
+ "step": 152
1100
  }
1101
  ],
1102
  "logging_steps": 5,
1103
  "max_steps": 18750,
1104
+ "num_input_tokens_seen": 5476501,
1105
  "num_train_epochs": 3,
1106
  "save_steps": 500,
1107
  "stateful_callbacks": {
 
1111
  "should_evaluate": false,
1112
  "should_log": false,
1113
  "should_save": true,
1114
+ "should_training_stop": true
1115
  },
1116
  "attributes": {}
1117
  }
1118
  },
1119
  "total_flos": 0.0,
1120
+ "train_batch_size": 8,
1121
  "trial_name": null,
1122
  "trial_params": null
1123
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:906bc07f18d85f3fdbe47d01e60bbe6f967852d19caecc88d502ce07c5e4aa78
3
  size 7185
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1b300496f72b512d9eb82d58bc70e9cfecf1e6725146e612c791121039cde76d
3
  size 7185