bimabk commited on
Commit
5ed91fc
·
verified ·
1 Parent(s): 4814b81

Upload task output 1

Browse files
adapter_config.json CHANGED
@@ -29,13 +29,13 @@
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
- "v_proj",
33
- "down_proj",
34
- "up_proj",
35
  "o_proj",
 
 
36
  "k_proj",
37
- "gate_proj",
38
- "q_proj"
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
 
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
+ "gate_proj",
 
 
33
  "o_proj",
34
+ "v_proj",
35
+ "q_proj",
36
  "k_proj",
37
+ "up_proj",
38
+ "down_proj"
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8d4e9eebb2b73438828d23b8d54ab10ce0c2534a61b6aa0633e1866c67f9e1d8
3
  size 319876032
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ec359459fe517709f85e49817da65dc6bc2ef784dff0e2f3f9dd5a384abdc9d
3
  size 319876032
loss.txt CHANGED
@@ -1 +1 @@
1
- 75,no_eval
 
1
+ 10,18.063774490356444
tokenizer.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2206073a6598893988ad5f1d96aa193d274fd6ad2a8d8c7ab8f56b29b6d4d0aa
3
- size 3620731
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef304ea6a0a92c4275c959dd6e1a896bca7e36af0adb2ce25563e7c11bf3927f
3
+ size 3620829
trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.0015,
6
  "eval_steps": 500,
7
- "global_step": 75,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -15,32 +15,32 @@
15
  "clip_ratio/low_mean": 0.0,
16
  "clip_ratio/low_min": 0.0,
17
  "clip_ratio/region_mean": 0.0,
18
- "completions/clipped_ratio": 0.75,
19
  "completions/max_length": 16.0,
20
- "completions/max_terminated_length": 12.0,
21
- "completions/mean_length": 14.0625,
22
- "completions/mean_terminated_length": 8.25,
23
- "completions/min_length": 4.0,
24
- "completions/min_terminated_length": 4.0,
25
- "entropy": 0.5195984467864037,
26
  "epoch": 2e-05,
27
  "frac_reward_zero_std": 0.0,
28
- "grad_norm": 0.5884848833084106,
29
  "kl": 0.0,
30
  "learning_rate": 0.0,
31
- "loss": 0.0687,
32
- "num_tokens": 53025.0,
33
- "reward": -18.636951446533203,
34
- "reward_std": 16.357603073120117,
35
- "rewards/rollout_reward_func/mean": -18.636951446533203,
36
- "rewards/rollout_reward_func/std": 16.357603073120117,
37
- "sampling/importance_sampling_ratio/max": 1.261159062385559,
38
- "sampling/importance_sampling_ratio/mean": 1.0131760835647583,
39
- "sampling/importance_sampling_ratio/min": 0.8812755942344666,
40
- "sampling/sampling_logp_difference/max": 0.1213843822479248,
41
- "sampling/sampling_logp_difference/mean": 0.012358092702925205,
42
  "step": 1,
43
- "step_time": 13.533009208999829
44
  },
45
  {
46
  "clip_ratio/high_max": 0.0,
@@ -48,302 +48,143 @@
48
  "clip_ratio/low_mean": 0.0,
49
  "clip_ratio/low_min": 0.0,
50
  "clip_ratio/region_mean": 0.0,
51
- "entropy": 0.5195984467864037,
52
  "epoch": 4e-05,
53
- "grad_norm": 0.5867869853973389,
54
  "kl": 0.0,
55
- "learning_rate": 8.571428571428572e-08,
56
- "loss": 0.0687,
57
  "step": 2,
58
- "step_time": 3.5312206580001657
59
  },
60
  {
61
  "clip_ratio/high_max": 0.0,
62
  "clip_ratio/high_mean": 0.0,
63
- "clip_ratio/low_mean": 0.0,
64
  "clip_ratio/low_min": 0.0,
65
- "clip_ratio/region_mean": 0.0,
66
  "completions/clipped_ratio": 0.75,
67
  "completions/max_length": 16.0,
68
- "completions/max_terminated_length": 15.0,
69
- "completions/mean_length": 14.9375,
70
- "completions/mean_terminated_length": 11.75,
71
- "completions/min_length": 5.0,
72
- "completions/min_terminated_length": 5.0,
73
- "entropy": 0.42067771404981613,
74
  "epoch": 6e-05,
75
  "frac_reward_zero_std": 0.0,
76
- "grad_norm": 0.45643824338912964,
77
- "kl": 0.0004211474661133252,
78
- "learning_rate": 1.7142857142857143e-07,
79
- "loss": 0.0769,
80
- "num_tokens": 106238.0,
81
- "reward": -19.64282989501953,
82
- "reward_std": 15.413658142089844,
83
- "rewards/rollout_reward_func/mean": -19.64282989501953,
84
- "rewards/rollout_reward_func/std": 15.413658142089844,
85
- "sampling/importance_sampling_ratio/max": 1.2733169794082642,
86
- "sampling/importance_sampling_ratio/mean": 0.978705644607544,
87
- "sampling/importance_sampling_ratio/min": 0.7849969863891602,
88
- "sampling/sampling_logp_difference/max": 0.22296667098999023,
89
- "sampling/sampling_logp_difference/mean": 0.011069882661104202,
90
  "step": 3,
91
- "step_time": 13.39454092699998
92
  },
93
  {
94
- "clip_ratio/high_max": 0.0,
95
- "clip_ratio/high_mean": 0.0,
96
  "clip_ratio/low_mean": 0.0,
97
  "clip_ratio/low_min": 0.0,
98
- "clip_ratio/region_mean": 0.0,
99
- "entropy": 0.4206809252500534,
100
  "epoch": 8e-05,
101
- "grad_norm": 0.4594491124153137,
102
- "kl": 0.00033940414868993685,
103
- "learning_rate": 2.5714285714285716e-07,
104
- "loss": 0.0761,
105
  "step": 4,
106
- "step_time": 3.593265089999477
107
  },
108
  {
109
- "clip_ratio/high_max": 0.009259259328246117,
110
- "clip_ratio/high_mean": 0.002314814832061529,
111
  "clip_ratio/low_mean": 0.0,
112
  "clip_ratio/low_min": 0.0,
113
- "clip_ratio/region_mean": 0.002314814832061529,
114
- "completions/clipped_ratio": 0.65625,
115
  "completions/max_length": 16.0,
116
- "completions/max_terminated_length": 15.0,
117
- "completions/mean_length": 13.5625,
118
- "completions/mean_terminated_length": 8.909090995788574,
119
  "completions/min_length": 4.0,
120
  "completions/min_terminated_length": 4.0,
121
- "entropy": 0.42936040461063385,
122
  "epoch": 0.0001,
123
  "frac_reward_zero_std": 0.0,
124
- "grad_norm": 0.4430800974369049,
125
- "kl": 0.0005995778919896111,
126
- "learning_rate": 3.4285714285714286e-07,
127
- "loss": -0.0094,
128
- "num_tokens": 158161.0,
129
- "reward": -25.30706787109375,
130
- "reward_std": 12.832451820373535,
131
- "rewards/rollout_reward_func/mean": -25.30706787109375,
132
- "rewards/rollout_reward_func/std": 12.832451820373535,
133
- "sampling/importance_sampling_ratio/max": 1.1259655952453613,
134
- "sampling/importance_sampling_ratio/mean": 0.955443263053894,
135
- "sampling/importance_sampling_ratio/min": 0.759828507900238,
136
- "sampling/sampling_logp_difference/max": 0.24739313125610352,
137
- "sampling/sampling_logp_difference/mean": 0.012292729690670967,
138
  "step": 5,
139
- "step_time": 13.135760864000076
140
  },
141
  {
142
- "clip_ratio/high_max": 0.0,
143
- "clip_ratio/high_mean": 0.0,
144
- "clip_ratio/low_mean": 0.0,
145
- "clip_ratio/low_min": 0.0,
146
- "clip_ratio/region_mean": 0.0,
147
- "entropy": 0.42947498708963394,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
  "epoch": 0.00012,
149
- "grad_norm": 0.4316604435443878,
150
- "kl": 0.00045547448826255277,
151
- "learning_rate": 4.2857142857142857e-07,
152
- "loss": -0.0102,
153
  "step": 6,
154
- "step_time": 3.579201618999832
155
- },
156
- {
157
- "clip_ratio/high_max": 0.008620689623057842,
158
- "clip_ratio/high_mean": 0.0021551724057644606,
159
- "clip_ratio/low_mean": 0.0,
160
- "clip_ratio/low_min": 0.0,
161
- "clip_ratio/region_mean": 0.0021551724057644606,
162
- "completions/clipped_ratio": 0.6875,
163
- "completions/max_length": 16.0,
164
- "completions/max_terminated_length": 14.0,
165
- "completions/mean_length": 13.6875,
166
- "completions/mean_terminated_length": 8.600000381469727,
167
- "completions/min_length": 4.0,
168
- "completions/min_terminated_length": 4.0,
169
- "entropy": 0.42659611254930496,
170
- "epoch": 0.00014,
171
- "frac_reward_zero_std": 0.0,
172
- "grad_norm": 0.47957348823547363,
173
- "kl": 0.00030913886985217687,
174
- "learning_rate": 5.142857142857143e-07,
175
- "loss": 0.1087,
176
- "num_tokens": 209683.0,
177
- "reward": -21.248210906982422,
178
- "reward_std": 16.31087303161621,
179
- "rewards/rollout_reward_func/mean": -21.248210906982422,
180
- "rewards/rollout_reward_func/std": 16.31087303161621,
181
- "sampling/importance_sampling_ratio/max": 1.2203210592269897,
182
- "sampling/importance_sampling_ratio/mean": 0.9698781967163086,
183
- "sampling/importance_sampling_ratio/min": 0.6302762627601624,
184
- "sampling/sampling_logp_difference/max": 0.24562978744506836,
185
- "sampling/sampling_logp_difference/mean": 0.011444764211773872,
186
- "step": 7,
187
- "step_time": 13.088727697000195
188
- },
189
- {
190
- "clip_ratio/high_max": 0.008620689623057842,
191
- "clip_ratio/high_mean": 0.004108297638595104,
192
- "clip_ratio/low_mean": 0.0,
193
- "clip_ratio/low_min": 0.0,
194
- "clip_ratio/region_mean": 0.004108297638595104,
195
- "entropy": 0.4278160482645035,
196
- "epoch": 0.00016,
197
- "grad_norm": 0.4772587716579437,
198
- "kl": 0.000408434629207477,
199
- "learning_rate": 6.000000000000001e-07,
200
- "loss": 0.108,
201
- "step": 8,
202
- "step_time": 4.637348582999948
203
- },
204
- {
205
- "clip_ratio/high_max": 0.008064515888690948,
206
- "clip_ratio/high_mean": 0.002016128972172737,
207
- "clip_ratio/low_mean": 0.0,
208
- "clip_ratio/low_min": 0.0,
209
- "clip_ratio/region_mean": 0.002016128972172737,
210
- "completions/clipped_ratio": 0.84375,
211
- "completions/max_length": 16.0,
212
- "completions/max_terminated_length": 15.0,
213
- "completions/mean_length": 15.34375,
214
- "completions/mean_terminated_length": 11.800000190734863,
215
- "completions/min_length": 6.0,
216
- "completions/min_terminated_length": 6.0,
217
- "entropy": 0.39533767104148865,
218
- "epoch": 0.00018,
219
- "frac_reward_zero_std": 0.0,
220
- "grad_norm": 0.4479968845844269,
221
- "kl": 0.0003632318548625335,
222
- "learning_rate": 6.857142857142857e-07,
223
- "loss": 0.0331,
224
- "num_tokens": 262786.0,
225
- "reward": -20.36313247680664,
226
- "reward_std": 12.939953804016113,
227
- "rewards/rollout_reward_func/mean": -20.36313247680664,
228
- "rewards/rollout_reward_func/std": 12.939953804016113,
229
- "sampling/importance_sampling_ratio/max": 1.270872712135315,
230
- "sampling/importance_sampling_ratio/mean": 1.0103130340576172,
231
- "sampling/importance_sampling_ratio/min": 0.8130013346672058,
232
- "sampling/sampling_logp_difference/max": 0.18152618408203125,
233
- "sampling/sampling_logp_difference/mean": 0.010795784182846546,
234
- "step": 9,
235
- "step_time": 12.636375802000202
236
- },
237
- {
238
- "clip_ratio/high_max": 0.0,
239
- "clip_ratio/high_mean": 0.0,
240
- "clip_ratio/low_mean": 0.0,
241
- "clip_ratio/low_min": 0.0,
242
- "clip_ratio/region_mean": 0.0,
243
- "entropy": 0.3951461538672447,
244
- "epoch": 0.0002,
245
- "grad_norm": 0.4540300667285919,
246
- "kl": 0.00028537910839077085,
247
- "learning_rate": 7.714285714285714e-07,
248
- "loss": 0.0341,
249
- "step": 10,
250
- "step_time": 3.5246556089996375
251
- },
252
- {
253
- "clip_ratio/high_max": 0.0078125,
254
- "clip_ratio/high_mean": 0.001953125,
255
- "clip_ratio/low_mean": 0.0,
256
- "clip_ratio/low_min": 0.0,
257
- "clip_ratio/region_mean": 0.001953125,
258
- "completions/clipped_ratio": 0.84375,
259
- "completions/max_length": 16.0,
260
- "completions/max_terminated_length": 16.0,
261
- "completions/mean_length": 15.09375,
262
- "completions/mean_terminated_length": 10.199999809265137,
263
- "completions/min_length": 4.0,
264
- "completions/min_terminated_length": 4.0,
265
- "entropy": 0.44926057010889053,
266
- "epoch": 0.00022,
267
- "frac_reward_zero_std": 0.0,
268
- "grad_norm": 0.6062055826187134,
269
- "kl": 0.0004280444991309196,
270
- "learning_rate": 8.571428571428571e-07,
271
- "loss": 0.1022,
272
- "num_tokens": 316291.0,
273
- "reward": -22.426130294799805,
274
- "reward_std": 11.176600456237793,
275
- "rewards/rollout_reward_func/mean": -22.426130294799805,
276
- "rewards/rollout_reward_func/std": 11.17660140991211,
277
- "sampling/importance_sampling_ratio/max": 1.3410085439682007,
278
- "sampling/importance_sampling_ratio/mean": 1.018998384475708,
279
- "sampling/importance_sampling_ratio/min": 0.8014979362487793,
280
- "sampling/sampling_logp_difference/max": 0.26187610626220703,
281
- "sampling/sampling_logp_difference/mean": 0.011133530177175999,
282
- "step": 11,
283
- "step_time": 13.926207335999152
284
- },
285
- {
286
- "clip_ratio/high_max": 0.0,
287
- "clip_ratio/high_mean": 0.0,
288
- "clip_ratio/low_mean": 0.0,
289
- "clip_ratio/low_min": 0.0,
290
- "clip_ratio/region_mean": 0.0,
291
- "entropy": 0.4503633603453636,
292
- "epoch": 0.00024,
293
- "grad_norm": 0.6362693905830383,
294
- "kl": 0.0003500747407088056,
295
- "learning_rate": 9.428571428571429e-07,
296
- "loss": 0.1016,
297
- "step": 12,
298
- "step_time": 3.686019106999538
299
- },
300
- {
301
- "clip_ratio/high_max": 0.0,
302
- "clip_ratio/high_mean": 0.0,
303
- "clip_ratio/low_mean": 0.0,
304
- "clip_ratio/low_min": 0.0,
305
- "clip_ratio/region_mean": 0.0,
306
- "completions/clipped_ratio": 0.84375,
307
- "completions/max_length": 16.0,
308
- "completions/max_terminated_length": 15.0,
309
- "completions/mean_length": 15.03125,
310
- "completions/mean_terminated_length": 9.800000190734863,
311
- "completions/min_length": 3.0,
312
- "completions/min_terminated_length": 3.0,
313
- "entropy": 0.47256265580654144,
314
- "epoch": 0.00026,
315
- "frac_reward_zero_std": 0.0,
316
- "grad_norm": 0.3631357252597809,
317
- "kl": 0.0003765556830330752,
318
- "learning_rate": 1.0285714285714286e-06,
319
- "loss": 0.0954,
320
- "num_tokens": 367798.0,
321
- "reward": -21.7075138092041,
322
- "reward_std": 14.198619842529297,
323
- "rewards/rollout_reward_func/mean": -21.7075138092041,
324
- "rewards/rollout_reward_func/std": 14.198619842529297,
325
- "sampling/importance_sampling_ratio/max": 1.2773635387420654,
326
- "sampling/importance_sampling_ratio/mean": 0.995408833026886,
327
- "sampling/importance_sampling_ratio/min": 0.778171181678772,
328
- "sampling/sampling_logp_difference/max": 0.18239641189575195,
329
- "sampling/sampling_logp_difference/mean": 0.010933874174952507,
330
- "step": 13,
331
- "step_time": 12.429970339999727
332
- },
333
- {
334
- "clip_ratio/high_max": 0.0,
335
- "clip_ratio/high_mean": 0.0,
336
- "clip_ratio/low_mean": 0.0,
337
- "clip_ratio/low_min": 0.0,
338
- "clip_ratio/region_mean": 0.0,
339
- "entropy": 0.4725050553679466,
340
- "epoch": 0.00028,
341
- "grad_norm": 0.3502637445926666,
342
- "kl": 0.00048724883527029306,
343
- "learning_rate": 1.1142857142857143e-06,
344
- "loss": 0.0956,
345
- "step": 14,
346
- "step_time": 3.513378806000219
347
  },
348
  {
349
  "clip_ratio/high_max": 0.0,
@@ -353,333 +194,30 @@
353
  "clip_ratio/region_mean": 0.0,
354
  "completions/clipped_ratio": 0.78125,
355
  "completions/max_length": 16.0,
356
- "completions/max_terminated_length": 15.0,
357
  "completions/mean_length": 14.6875,
358
  "completions/mean_terminated_length": 10.0,
359
- "completions/min_length": 4.0,
360
- "completions/min_terminated_length": 4.0,
361
- "entropy": 0.48245932161808014,
362
- "epoch": 0.0003,
363
- "frac_reward_zero_std": 0.0,
364
- "grad_norm": 0.5791538953781128,
365
- "kl": 0.00043258603545837104,
366
- "learning_rate": 1.2000000000000002e-06,
367
- "loss": 0.0699,
368
- "num_tokens": 422015.0,
369
- "reward": -18.020889282226562,
370
- "reward_std": 14.099430084228516,
371
- "rewards/rollout_reward_func/mean": -18.020889282226562,
372
- "rewards/rollout_reward_func/std": 14.0994291305542,
373
- "sampling/importance_sampling_ratio/max": 1.131913423538208,
374
- "sampling/importance_sampling_ratio/mean": 0.9780954122543335,
375
- "sampling/importance_sampling_ratio/min": 0.7532981038093567,
376
- "sampling/sampling_logp_difference/max": 0.2378932237625122,
377
- "sampling/sampling_logp_difference/mean": 0.01171853393316269,
378
- "step": 15,
379
- "step_time": 12.757522889000484
380
- },
381
- {
382
- "clip_ratio/high_max": 0.012500000186264515,
383
- "clip_ratio/high_mean": 0.0031250000465661287,
384
- "clip_ratio/low_mean": 0.0,
385
- "clip_ratio/low_min": 0.0,
386
- "clip_ratio/region_mean": 0.0031250000465661287,
387
- "entropy": 0.48212701082229614,
388
- "epoch": 0.00032,
389
- "grad_norm": 0.5567578077316284,
390
- "kl": 0.0003868155945383478,
391
- "learning_rate": 1.2857142857142856e-06,
392
- "loss": 0.0683,
393
- "step": 16,
394
- "step_time": 3.6391623570002594
395
- },
396
- {
397
- "clip_ratio/high_max": 0.0,
398
- "clip_ratio/high_mean": 0.0,
399
- "clip_ratio/low_mean": 0.0,
400
- "clip_ratio/low_min": 0.0,
401
- "clip_ratio/region_mean": 0.0,
402
- "completions/clipped_ratio": 0.6875,
403
- "completions/max_length": 16.0,
404
- "completions/max_terminated_length": 15.0,
405
- "completions/mean_length": 14.125,
406
- "completions/mean_terminated_length": 10.0,
407
  "completions/min_length": 3.0,
408
  "completions/min_terminated_length": 3.0,
409
- "entropy": 0.3981349244713783,
410
- "epoch": 0.00034,
411
- "frac_reward_zero_std": 0.0,
412
- "grad_norm": 0.46128952503204346,
413
- "kl": 0.00022623784388997592,
414
- "learning_rate": 1.3714285714285715e-06,
415
- "loss": 0.093,
416
- "num_tokens": 476133.0,
417
- "reward": -17.307281494140625,
418
- "reward_std": 17.012971878051758,
419
- "rewards/rollout_reward_func/mean": -17.307281494140625,
420
- "rewards/rollout_reward_func/std": 17.012971878051758,
421
- "sampling/importance_sampling_ratio/max": 1.3407179117202759,
422
- "sampling/importance_sampling_ratio/mean": 1.0220906734466553,
423
- "sampling/importance_sampling_ratio/min": 0.7680003643035889,
424
- "sampling/sampling_logp_difference/max": 0.22670793533325195,
425
- "sampling/sampling_logp_difference/mean": 0.01049777027219534,
426
- "step": 17,
427
- "step_time": 14.12015571899974
428
- },
429
- {
430
- "clip_ratio/high_max": 0.0,
431
- "clip_ratio/high_mean": 0.0,
432
- "clip_ratio/low_mean": 0.0,
433
- "clip_ratio/low_min": 0.0,
434
- "clip_ratio/region_mean": 0.0,
435
- "entropy": 0.3973981589078903,
436
- "epoch": 0.00036,
437
- "grad_norm": 0.4621075987815857,
438
- "kl": 0.0002368360073887743,
439
- "learning_rate": 1.4571428571428571e-06,
440
- "loss": 0.0935,
441
- "step": 18,
442
- "step_time": 4.419824714000242
443
- },
444
- {
445
- "clip_ratio/high_max": 0.0,
446
- "clip_ratio/high_mean": 0.0,
447
- "clip_ratio/low_mean": 0.0,
448
- "clip_ratio/low_min": 0.0,
449
- "clip_ratio/region_mean": 0.0,
450
- "completions/clipped_ratio": 0.75,
451
- "completions/max_length": 16.0,
452
- "completions/max_terminated_length": 16.0,
453
- "completions/mean_length": 14.53125,
454
- "completions/mean_terminated_length": 10.125,
455
- "completions/min_length": 4.0,
456
- "completions/min_terminated_length": 4.0,
457
- "entropy": 0.4408823549747467,
458
- "epoch": 0.00038,
459
- "frac_reward_zero_std": 0.0,
460
- "grad_norm": 0.49735671281814575,
461
- "kl": 0.00043844833271577954,
462
- "learning_rate": 1.5428571428571428e-06,
463
- "loss": 0.0479,
464
- "num_tokens": 528759.0,
465
- "reward": -20.804988861083984,
466
- "reward_std": 14.42788314819336,
467
- "rewards/rollout_reward_func/mean": -20.804988861083984,
468
- "rewards/rollout_reward_func/std": 14.42788314819336,
469
- "sampling/importance_sampling_ratio/max": 1.2070324420928955,
470
- "sampling/importance_sampling_ratio/mean": 0.9982270002365112,
471
- "sampling/importance_sampling_ratio/min": 0.8168321251869202,
472
- "sampling/sampling_logp_difference/max": 0.16861987113952637,
473
- "sampling/sampling_logp_difference/mean": 0.010207044892013073,
474
- "step": 19,
475
- "step_time": 12.619404869999471
476
- },
477
- {
478
- "clip_ratio/high_max": 0.0,
479
- "clip_ratio/high_mean": 0.0,
480
- "clip_ratio/low_mean": 0.0,
481
- "clip_ratio/low_min": 0.0,
482
- "clip_ratio/region_mean": 0.0,
483
- "entropy": 0.44093089550733566,
484
- "epoch": 0.0004,
485
- "grad_norm": 0.4937126040458679,
486
- "kl": 0.00047725086915306747,
487
- "learning_rate": 1.6285714285714284e-06,
488
- "loss": 0.0483,
489
- "step": 20,
490
- "step_time": 3.610557893000532
491
- },
492
- {
493
- "clip_ratio/high_max": 0.0,
494
- "clip_ratio/high_mean": 0.0,
495
- "clip_ratio/low_mean": 0.0,
496
- "clip_ratio/low_min": 0.0,
497
- "clip_ratio/region_mean": 0.0,
498
- "completions/clipped_ratio": 0.75,
499
- "completions/max_length": 16.0,
500
- "completions/max_terminated_length": 16.0,
501
- "completions/mean_length": 14.78125,
502
- "completions/mean_terminated_length": 11.125,
503
- "completions/min_length": 4.0,
504
- "completions/min_terminated_length": 4.0,
505
- "entropy": 0.4765045493841171,
506
- "epoch": 0.00042,
507
- "frac_reward_zero_std": 0.0,
508
- "grad_norm": 0.4461357593536377,
509
- "kl": 0.00039704455411992967,
510
- "learning_rate": 1.7142857142857143e-06,
511
- "loss": 0.0319,
512
- "num_tokens": 582547.0,
513
- "reward": -21.366497039794922,
514
- "reward_std": 13.540549278259277,
515
- "rewards/rollout_reward_func/mean": -21.366497039794922,
516
- "rewards/rollout_reward_func/std": 13.540549278259277,
517
- "sampling/importance_sampling_ratio/max": 1.3568636178970337,
518
- "sampling/importance_sampling_ratio/mean": 0.9794997572898865,
519
- "sampling/importance_sampling_ratio/min": 0.7170642018318176,
520
- "sampling/sampling_logp_difference/max": 0.17396187782287598,
521
- "sampling/sampling_logp_difference/mean": 0.011949114501476288,
522
- "step": 21,
523
- "step_time": 13.120537562000209
524
- },
525
- {
526
- "clip_ratio/high_max": 0.0,
527
- "clip_ratio/high_mean": 0.0,
528
- "clip_ratio/low_mean": 0.0,
529
- "clip_ratio/low_min": 0.0,
530
- "clip_ratio/region_mean": 0.0,
531
- "entropy": 0.4768811762332916,
532
- "epoch": 0.00044,
533
- "grad_norm": 0.42503097653388977,
534
- "kl": 0.0004109822984901257,
535
- "learning_rate": 1.8e-06,
536
- "loss": 0.0334,
537
- "step": 22,
538
- "step_time": 3.706944928000212
539
- },
540
- {
541
- "clip_ratio/high_max": 0.0,
542
- "clip_ratio/high_mean": 0.0,
543
- "clip_ratio/low_mean": 0.0,
544
- "clip_ratio/low_min": 0.0,
545
- "clip_ratio/region_mean": 0.0,
546
- "completions/clipped_ratio": 0.8125,
547
- "completions/max_length": 16.0,
548
- "completions/max_terminated_length": 15.0,
549
- "completions/mean_length": 14.84375,
550
- "completions/mean_terminated_length": 9.833333969116211,
551
- "completions/min_length": 4.0,
552
- "completions/min_terminated_length": 4.0,
553
- "entropy": 0.5320451036095619,
554
- "epoch": 0.00046,
555
- "frac_reward_zero_std": 0.0,
556
- "grad_norm": 0.4428236186504364,
557
- "kl": 0.0005832012830069289,
558
- "learning_rate": 1.8857142857142858e-06,
559
- "loss": 0.092,
560
- "num_tokens": 634760.0,
561
- "reward": -19.927682876586914,
562
- "reward_std": 16.771766662597656,
563
- "rewards/rollout_reward_func/mean": -19.927682876586914,
564
- "rewards/rollout_reward_func/std": 16.771766662597656,
565
- "sampling/importance_sampling_ratio/max": 1.4887969493865967,
566
- "sampling/importance_sampling_ratio/mean": 1.030691146850586,
567
- "sampling/importance_sampling_ratio/min": 0.8164731860160828,
568
- "sampling/sampling_logp_difference/max": 0.2149658203125,
569
- "sampling/sampling_logp_difference/mean": 0.013042854145169258,
570
- "step": 23,
571
- "step_time": 12.996803623999767
572
- },
573
- {
574
- "clip_ratio/high_max": 0.0,
575
- "clip_ratio/high_mean": 0.0,
576
- "clip_ratio/low_mean": 0.0,
577
- "clip_ratio/low_min": 0.0,
578
- "clip_ratio/region_mean": 0.0,
579
- "entropy": 0.5324564427137375,
580
- "epoch": 0.00048,
581
- "grad_norm": 0.438973069190979,
582
- "kl": 0.0006122617051005363,
583
- "learning_rate": 1.9714285714285714e-06,
584
- "loss": 0.0906,
585
- "step": 24,
586
- "step_time": 3.5907621789997393
587
- },
588
- {
589
- "clip_ratio/high_max": 0.0,
590
- "clip_ratio/high_mean": 0.0,
591
- "clip_ratio/low_mean": 0.0,
592
- "clip_ratio/low_min": 0.0,
593
- "clip_ratio/region_mean": 0.0,
594
- "completions/clipped_ratio": 0.78125,
595
- "completions/max_length": 16.0,
596
- "completions/max_terminated_length": 16.0,
597
- "completions/mean_length": 14.5625,
598
- "completions/mean_terminated_length": 9.428571701049805,
599
- "completions/min_length": 4.0,
600
- "completions/min_terminated_length": 4.0,
601
- "entropy": 0.4107111692428589,
602
- "epoch": 0.0005,
603
- "frac_reward_zero_std": 0.0,
604
- "grad_norm": 0.48443400859832764,
605
- "kl": 0.0005294791480991989,
606
- "learning_rate": 2.0571428571428573e-06,
607
- "loss": 0.0265,
608
- "num_tokens": 686641.0,
609
- "reward": -24.298006057739258,
610
- "reward_std": 11.884275436401367,
611
- "rewards/rollout_reward_func/mean": -24.298006057739258,
612
- "rewards/rollout_reward_func/std": 11.88427448272705,
613
- "sampling/importance_sampling_ratio/max": 1.3545169830322266,
614
- "sampling/importance_sampling_ratio/mean": 1.0202080011367798,
615
- "sampling/importance_sampling_ratio/min": 0.8227697610855103,
616
- "sampling/sampling_logp_difference/max": 0.19383776187896729,
617
- "sampling/sampling_logp_difference/mean": 0.010375281795859337,
618
- "step": 25,
619
- "step_time": 11.739334979000432
620
- },
621
- {
622
- "clip_ratio/high_max": 0.0,
623
- "clip_ratio/high_mean": 0.0,
624
- "clip_ratio/low_mean": 0.0,
625
- "clip_ratio/low_min": 0.0,
626
- "clip_ratio/region_mean": 0.0,
627
- "entropy": 0.41023270785808563,
628
- "epoch": 0.00052,
629
- "grad_norm": 0.4764305651187897,
630
- "kl": 0.0004724617610918358,
631
- "learning_rate": 2.142857142857143e-06,
632
- "loss": 0.0261,
633
- "step": 26,
634
- "step_time": 3.5787725539998974
635
- },
636
- {
637
- "clip_ratio/high_max": 0.0,
638
- "clip_ratio/high_mean": 0.0,
639
- "clip_ratio/low_mean": 0.0,
640
- "clip_ratio/low_min": 0.0,
641
- "clip_ratio/region_mean": 0.0,
642
- "completions/clipped_ratio": 0.71875,
643
- "completions/max_length": 16.0,
644
- "completions/max_terminated_length": 16.0,
645
- "completions/mean_length": 14.09375,
646
- "completions/mean_terminated_length": 9.222222328186035,
647
- "completions/min_length": 4.0,
648
- "completions/min_terminated_length": 4.0,
649
- "entropy": 0.42711327224969864,
650
- "epoch": 0.00054,
651
  "frac_reward_zero_std": 0.0,
652
- "grad_norm": 0.5134465098381042,
653
- "kl": 0.00047725687181809917,
654
- "learning_rate": 2.2285714285714286e-06,
655
- "loss": 0.1005,
656
- "num_tokens": 739663.0,
657
- "reward": -21.107803344726562,
658
- "reward_std": 12.081624984741211,
659
- "rewards/rollout_reward_func/mean": -21.107803344726562,
660
- "rewards/rollout_reward_func/std": 12.081623077392578,
661
- "sampling/importance_sampling_ratio/max": 1.1832900047302246,
662
- "sampling/importance_sampling_ratio/mean": 0.9839767813682556,
663
- "sampling/importance_sampling_ratio/min": 0.7083062529563904,
664
- "sampling/sampling_logp_difference/max": 0.24417352676391602,
665
- "sampling/sampling_logp_difference/mean": 0.015798263251781464,
666
- "step": 27,
667
- "step_time": 13.923148167000363
668
- },
669
- {
670
- "clip_ratio/high_max": 0.0078125,
671
- "clip_ratio/high_mean": 0.001953125,
672
- "clip_ratio/low_mean": 0.0,
673
- "clip_ratio/low_min": 0.0,
674
- "clip_ratio/region_mean": 0.001953125,
675
- "entropy": 0.4274953231215477,
676
- "epoch": 0.00056,
677
- "grad_norm": 0.5122133493423462,
678
- "kl": 0.0005963902549410705,
679
- "learning_rate": 2.3142857142857145e-06,
680
- "loss": 0.1015,
681
- "step": 28,
682
- "step_time": 3.6348850080003103
683
  },
684
  {
685
  "clip_ratio/high_max": 0.0,
@@ -687,47 +225,14 @@
687
  "clip_ratio/low_mean": 0.001953125,
688
  "clip_ratio/low_min": 0.0,
689
  "clip_ratio/region_mean": 0.001953125,
690
- "completions/clipped_ratio": 0.84375,
691
- "completions/max_length": 16.0,
692
- "completions/max_terminated_length": 15.0,
693
- "completions/mean_length": 15.15625,
694
- "completions/mean_terminated_length": 10.600000381469727,
695
- "completions/min_length": 4.0,
696
- "completions/min_terminated_length": 4.0,
697
- "entropy": 0.44716596603393555,
698
- "epoch": 0.00058,
699
- "frac_reward_zero_std": 0.0,
700
- "grad_norm": 0.47282618284225464,
701
- "kl": 0.000547109026229009,
702
- "learning_rate": 2.4000000000000003e-06,
703
- "loss": 0.0551,
704
- "num_tokens": 791760.0,
705
- "reward": -22.874284744262695,
706
- "reward_std": 16.014789581298828,
707
- "rewards/rollout_reward_func/mean": -22.874284744262695,
708
- "rewards/rollout_reward_func/std": 16.014787673950195,
709
- "sampling/importance_sampling_ratio/max": 1.445327639579773,
710
- "sampling/importance_sampling_ratio/mean": 0.9864210486412048,
711
- "sampling/importance_sampling_ratio/min": 0.6746537685394287,
712
- "sampling/sampling_logp_difference/max": 0.24254560470581055,
713
- "sampling/sampling_logp_difference/mean": 0.01340758427977562,
714
- "step": 29,
715
- "step_time": 12.450915582999642
716
- },
717
- {
718
- "clip_ratio/high_max": 0.0,
719
- "clip_ratio/high_mean": 0.0,
720
- "clip_ratio/low_mean": 0.0024038462433964014,
721
- "clip_ratio/low_min": 0.0,
722
- "clip_ratio/region_mean": 0.0024038462433964014,
723
- "entropy": 0.44682736694812775,
724
- "epoch": 0.0006,
725
- "grad_norm": 0.47560998797416687,
726
- "kl": 0.0005742600187659264,
727
- "learning_rate": 2.4857142857142858e-06,
728
- "loss": 0.0575,
729
- "step": 30,
730
- "step_time": 3.652044981000472
731
  },
732
  {
733
  "clip_ratio/high_max": 0.0,
@@ -735,32 +240,32 @@
735
  "clip_ratio/low_mean": 0.0,
736
  "clip_ratio/low_min": 0.0,
737
  "clip_ratio/region_mean": 0.0,
738
- "completions/clipped_ratio": 0.78125,
739
  "completions/max_length": 16.0,
740
- "completions/max_terminated_length": 14.0,
741
- "completions/mean_length": 14.4375,
742
- "completions/mean_terminated_length": 8.85714340209961,
743
- "completions/min_length": 4.0,
744
- "completions/min_terminated_length": 4.0,
745
- "entropy": 0.4201854541897774,
746
- "epoch": 0.00062,
747
  "frac_reward_zero_std": 0.0,
748
- "grad_norm": 0.523423969745636,
749
- "kl": 0.0003851138026220724,
750
- "learning_rate": 2.571428571428571e-06,
751
- "loss": 0.0874,
752
- "num_tokens": 844586.0,
753
- "reward": -21.070207595825195,
754
- "reward_std": 17.62769317626953,
755
- "rewards/rollout_reward_func/mean": -21.070207595825195,
756
- "rewards/rollout_reward_func/std": 17.62769317626953,
757
- "sampling/importance_sampling_ratio/max": 1.375394344329834,
758
- "sampling/importance_sampling_ratio/mean": 0.9897651672363281,
759
- "sampling/importance_sampling_ratio/min": 0.694791316986084,
760
- "sampling/sampling_logp_difference/max": 0.20741581916809082,
761
- "sampling/sampling_logp_difference/mean": 0.010995335876941681,
762
- "step": 31,
763
- "step_time": 12.358129288999862
764
  },
765
  {
766
  "clip_ratio/high_max": 0.0,
@@ -768,1060 +273,52 @@
768
  "clip_ratio/low_mean": 0.0,
769
  "clip_ratio/low_min": 0.0,
770
  "clip_ratio/region_mean": 0.0,
771
- "entropy": 0.4208702892065048,
772
- "epoch": 0.00064,
773
- "grad_norm": 0.490420937538147,
774
- "kl": 0.00032598181496723555,
775
- "learning_rate": 2.657142857142857e-06,
776
- "loss": 0.0874,
777
- "step": 32,
778
- "step_time": 3.600919356000759
779
  },
780
  {
781
- "clip_ratio/high_max": 0.0,
782
- "clip_ratio/high_mean": 0.0,
783
- "clip_ratio/low_mean": 0.0,
784
- "clip_ratio/low_min": 0.0,
785
- "clip_ratio/region_mean": 0.0,
786
- "completions/clipped_ratio": 0.84375,
787
- "completions/max_length": 16.0,
788
- "completions/max_terminated_length": 13.0,
789
- "completions/mean_length": 15.0,
790
- "completions/mean_terminated_length": 9.600000381469727,
791
- "completions/min_length": 4.0,
792
- "completions/min_terminated_length": 4.0,
793
- "entropy": 0.45302998274564743,
794
- "epoch": 0.00066,
795
- "frac_reward_zero_std": 0.0,
796
- "grad_norm": 0.45933911204338074,
797
- "kl": 0.00046427604684140533,
798
- "learning_rate": 2.742857142857143e-06,
799
- "loss": 0.0425,
800
- "num_tokens": 896932.0,
801
- "reward": -21.56837272644043,
802
- "reward_std": 17.189640045166016,
803
- "rewards/rollout_reward_func/mean": -21.56837272644043,
804
- "rewards/rollout_reward_func/std": 17.189638137817383,
805
- "sampling/importance_sampling_ratio/max": 1.3717750310897827,
806
- "sampling/importance_sampling_ratio/mean": 1.0236396789550781,
807
- "sampling/importance_sampling_ratio/min": 0.7908633947372437,
808
- "sampling/sampling_logp_difference/max": 0.16204869747161865,
809
- "sampling/sampling_logp_difference/mean": 0.011112736538052559,
810
- "step": 33,
811
- "step_time": 12.270265751999887
812
- },
813
- {
814
- "clip_ratio/high_max": 0.0,
815
- "clip_ratio/high_mean": 0.0,
816
- "clip_ratio/low_mean": 0.001953125,
817
- "clip_ratio/low_min": 0.0,
818
- "clip_ratio/region_mean": 0.001953125,
819
- "entropy": 0.45316264033317566,
820
- "epoch": 0.00068,
821
- "grad_norm": 0.47820040583610535,
822
- "kl": 0.0006169998086988926,
823
- "learning_rate": 2.8285714285714288e-06,
824
- "loss": 0.0429,
825
- "step": 34,
826
- "step_time": 3.612241633999929
827
- },
828
- {
829
- "clip_ratio/high_max": 0.0,
830
- "clip_ratio/high_mean": 0.0,
831
- "clip_ratio/low_mean": 0.0,
832
- "clip_ratio/low_min": 0.0,
833
- "clip_ratio/region_mean": 0.0,
834
- "completions/clipped_ratio": 0.84375,
835
- "completions/max_length": 16.0,
836
- "completions/max_terminated_length": 16.0,
837
- "completions/mean_length": 15.09375,
838
- "completions/mean_terminated_length": 10.199999809265137,
839
- "completions/min_length": 4.0,
840
- "completions/min_terminated_length": 4.0,
841
- "entropy": 0.42138345539569855,
842
- "epoch": 0.0007,
843
- "frac_reward_zero_std": 0.0,
844
- "grad_norm": 0.46025797724723816,
845
- "kl": 0.0004996234347345307,
846
- "learning_rate": 2.9142857142857142e-06,
847
- "loss": 0.053,
848
- "num_tokens": 948802.0,
849
- "reward": -24.316551208496094,
850
- "reward_std": 12.835836410522461,
851
- "rewards/rollout_reward_func/mean": -24.316551208496094,
852
- "rewards/rollout_reward_func/std": 12.835835456848145,
853
- "sampling/importance_sampling_ratio/max": 1.328806757926941,
854
- "sampling/importance_sampling_ratio/mean": 1.0274875164031982,
855
- "sampling/importance_sampling_ratio/min": 0.894749641418457,
856
- "sampling/sampling_logp_difference/max": 0.18797016143798828,
857
- "sampling/sampling_logp_difference/mean": 0.011560991406440735,
858
- "step": 35,
859
- "step_time": 13.08812723400024
860
- },
861
- {
862
- "clip_ratio/high_max": 0.0,
863
- "clip_ratio/high_mean": 0.0,
864
- "clip_ratio/low_mean": 0.001953125,
865
- "clip_ratio/low_min": 0.0,
866
- "clip_ratio/region_mean": 0.001953125,
867
- "entropy": 0.42126451432704926,
868
- "epoch": 0.00072,
869
- "grad_norm": 0.43526744842529297,
870
- "kl": 0.000492801918881014,
871
- "learning_rate": 3e-06,
872
- "loss": 0.054,
873
- "step": 36,
874
- "step_time": 4.318984840999747
875
- },
876
- {
877
- "clip_ratio/high_max": 0.0,
878
- "clip_ratio/high_mean": 0.0,
879
- "clip_ratio/low_mean": 0.0,
880
- "clip_ratio/low_min": 0.0,
881
- "clip_ratio/region_mean": 0.0,
882
- "completions/clipped_ratio": 0.75,
883
- "completions/max_length": 16.0,
884
- "completions/max_terminated_length": 15.0,
885
- "completions/mean_length": 14.53125,
886
- "completions/mean_terminated_length": 10.125,
887
- "completions/min_length": 4.0,
888
- "completions/min_terminated_length": 4.0,
889
- "entropy": 0.4301793724298477,
890
- "epoch": 0.00074,
891
- "frac_reward_zero_std": 0.0,
892
- "grad_norm": 0.3941028416156769,
893
- "kl": 0.0006222588490345515,
894
- "learning_rate": 2.999999999444446e-06,
895
- "loss": 0.058,
896
- "num_tokens": 1001889.0,
897
- "reward": -21.4174861907959,
898
- "reward_std": 12.194807052612305,
899
- "rewards/rollout_reward_func/mean": -21.4174861907959,
900
- "rewards/rollout_reward_func/std": 12.194808006286621,
901
- "sampling/importance_sampling_ratio/max": 1.2621595859527588,
902
- "sampling/importance_sampling_ratio/mean": 1.006618618965149,
903
- "sampling/importance_sampling_ratio/min": 0.8662732243537903,
904
- "sampling/sampling_logp_difference/max": 0.19760990142822266,
905
- "sampling/sampling_logp_difference/mean": 0.00977108720690012,
906
- "step": 37,
907
- "step_time": 12.948201442999562
908
- },
909
- {
910
- "clip_ratio/high_max": 0.0,
911
- "clip_ratio/high_mean": 0.0,
912
- "clip_ratio/low_mean": 0.0,
913
- "clip_ratio/low_min": 0.0,
914
- "clip_ratio/region_mean": 0.0,
915
- "entropy": 0.43141717463731766,
916
- "epoch": 0.00076,
917
- "grad_norm": 0.39947450160980225,
918
- "kl": 0.000883200435055187,
919
- "learning_rate": 2.9999999977777837e-06,
920
- "loss": 0.0575,
921
- "step": 38,
922
- "step_time": 3.602961716000209
923
- },
924
- {
925
- "clip_ratio/high_max": 0.0,
926
- "clip_ratio/high_mean": 0.0,
927
- "clip_ratio/low_mean": 0.0,
928
- "clip_ratio/low_min": 0.0,
929
- "clip_ratio/region_mean": 0.0,
930
- "completions/clipped_ratio": 0.71875,
931
- "completions/max_length": 16.0,
932
- "completions/max_terminated_length": 15.0,
933
- "completions/mean_length": 13.625,
934
- "completions/mean_terminated_length": 7.555555820465088,
935
- "completions/min_length": 3.0,
936
- "completions/min_terminated_length": 3.0,
937
- "entropy": 0.48835278302431107,
938
- "epoch": 0.00078,
939
- "frac_reward_zero_std": 0.125,
940
- "grad_norm": 0.39885786175727844,
941
- "kl": 0.0009218690829584375,
942
- "learning_rate": 2.9999999950000137e-06,
943
- "loss": 0.0949,
944
- "num_tokens": 1056411.0,
945
- "reward": -22.038280487060547,
946
- "reward_std": 14.705154418945312,
947
- "rewards/rollout_reward_func/mean": -22.038280487060547,
948
- "rewards/rollout_reward_func/std": 14.705153465270996,
949
- "sampling/importance_sampling_ratio/max": 1.3119615316390991,
950
- "sampling/importance_sampling_ratio/mean": 1.0176267623901367,
951
- "sampling/importance_sampling_ratio/min": 0.7629126310348511,
952
- "sampling/sampling_logp_difference/max": 0.12403631210327148,
953
- "sampling/sampling_logp_difference/mean": 0.01314721629023552,
954
- "step": 39,
955
- "step_time": 12.314849860000322
956
- },
957
- {
958
- "clip_ratio/high_max": 0.0,
959
- "clip_ratio/high_mean": 0.0,
960
- "clip_ratio/low_mean": 0.0,
961
- "clip_ratio/low_min": 0.0,
962
- "clip_ratio/region_mean": 0.0,
963
- "entropy": 0.4897266551852226,
964
- "epoch": 0.0008,
965
- "grad_norm": 0.3934723734855652,
966
- "kl": 0.0010027830867329612,
967
- "learning_rate": 2.999999991111135e-06,
968
- "loss": 0.095,
969
- "step": 40,
970
- "step_time": 3.6701166389993887
971
- },
972
- {
973
- "clip_ratio/high_max": 0.0,
974
- "clip_ratio/high_mean": 0.0,
975
- "clip_ratio/low_mean": 0.0,
976
- "clip_ratio/low_min": 0.0,
977
- "clip_ratio/region_mean": 0.0,
978
- "completions/clipped_ratio": 0.6875,
979
- "completions/max_length": 16.0,
980
- "completions/max_terminated_length": 16.0,
981
- "completions/mean_length": 14.28125,
982
- "completions/mean_terminated_length": 10.5,
983
- "completions/min_length": 3.0,
984
- "completions/min_terminated_length": 3.0,
985
- "entropy": 0.4647715613245964,
986
- "epoch": 0.00082,
987
- "frac_reward_zero_std": 0.0,
988
- "grad_norm": 0.4407760500907898,
989
- "kl": 0.000995866059383843,
990
- "learning_rate": 2.9999999861111487e-06,
991
- "loss": 0.0416,
992
- "num_tokens": 1109059.0,
993
- "reward": -20.27096939086914,
994
- "reward_std": 16.170513153076172,
995
- "rewards/rollout_reward_func/mean": -20.27096939086914,
996
- "rewards/rollout_reward_func/std": 16.170513153076172,
997
- "sampling/importance_sampling_ratio/max": 1.238898515701294,
998
- "sampling/importance_sampling_ratio/mean": 0.9764783382415771,
999
- "sampling/importance_sampling_ratio/min": 0.7559868693351746,
1000
- "sampling/sampling_logp_difference/max": 0.24738311767578125,
1001
- "sampling/sampling_logp_difference/mean": 0.011778589338064194,
1002
- "step": 41,
1003
- "step_time": 12.945746578000126
1004
- },
1005
- {
1006
- "clip_ratio/high_max": 0.0,
1007
- "clip_ratio/high_mean": 0.0,
1008
- "clip_ratio/low_mean": 0.0,
1009
- "clip_ratio/low_min": 0.0,
1010
- "clip_ratio/region_mean": 0.0,
1011
- "entropy": 0.46506941318511963,
1012
- "epoch": 0.00084,
1013
- "grad_norm": 0.41802552342414856,
1014
- "kl": 0.0008689216920174658,
1015
- "learning_rate": 2.999999980000054e-06,
1016
- "loss": 0.0416,
1017
- "step": 42,
1018
- "step_time": 3.5847704960001465
1019
- },
1020
- {
1021
- "clip_ratio/high_max": 0.0,
1022
- "clip_ratio/high_mean": 0.0,
1023
- "clip_ratio/low_mean": 0.0,
1024
- "clip_ratio/low_min": 0.0,
1025
- "clip_ratio/region_mean": 0.0,
1026
- "completions/clipped_ratio": 0.8125,
1027
- "completions/max_length": 16.0,
1028
- "completions/max_terminated_length": 16.0,
1029
- "completions/mean_length": 14.90625,
1030
- "completions/mean_terminated_length": 10.166666984558105,
1031
- "completions/min_length": 3.0,
1032
- "completions/min_terminated_length": 3.0,
1033
- "entropy": 0.40763208270072937,
1034
- "epoch": 0.00086,
1035
- "frac_reward_zero_std": 0.0,
1036
- "grad_norm": 0.38299527764320374,
1037
- "kl": 0.0007383096963167191,
1038
- "learning_rate": 2.999999972777851e-06,
1039
- "loss": 0.0721,
1040
- "num_tokens": 1163200.0,
1041
- "reward": -18.545978546142578,
1042
- "reward_std": 14.86248779296875,
1043
- "rewards/rollout_reward_func/mean": -18.545978546142578,
1044
- "rewards/rollout_reward_func/std": 14.862486839294434,
1045
- "sampling/importance_sampling_ratio/max": 1.201629400253296,
1046
- "sampling/importance_sampling_ratio/mean": 1.003993272781372,
1047
- "sampling/importance_sampling_ratio/min": 0.8553942441940308,
1048
- "sampling/sampling_logp_difference/max": 0.14433181285858154,
1049
- "sampling/sampling_logp_difference/mean": 0.010116634890437126,
1050
- "step": 43,
1051
- "step_time": 13.120121834000201
1052
- },
1053
- {
1054
- "clip_ratio/high_max": 0.0,
1055
- "clip_ratio/high_mean": 0.0,
1056
- "clip_ratio/low_mean": 0.003969253972172737,
1057
- "clip_ratio/low_min": 0.0,
1058
- "clip_ratio/region_mean": 0.003969253972172737,
1059
- "entropy": 0.4081807807087898,
1060
- "epoch": 0.00088,
1061
- "grad_norm": 0.38337594270706177,
1062
- "kl": 0.0009075161142391153,
1063
- "learning_rate": 2.99999996444454e-06,
1064
- "loss": 0.071,
1065
- "step": 44,
1066
- "step_time": 3.613092282999787
1067
- },
1068
- {
1069
- "clip_ratio/high_max": 0.0,
1070
- "clip_ratio/high_mean": 0.0,
1071
- "clip_ratio/low_mean": 0.0,
1072
- "clip_ratio/low_min": 0.0,
1073
- "clip_ratio/region_mean": 0.0,
1074
- "completions/clipped_ratio": 0.78125,
1075
- "completions/max_length": 16.0,
1076
- "completions/max_terminated_length": 16.0,
1077
- "completions/mean_length": 14.75,
1078
- "completions/mean_terminated_length": 10.285715103149414,
1079
- "completions/min_length": 4.0,
1080
- "completions/min_terminated_length": 4.0,
1081
- "entropy": 0.439114011824131,
1082
- "epoch": 0.0009,
1083
- "frac_reward_zero_std": 0.0,
1084
- "grad_norm": 0.45874863862991333,
1085
- "kl": 0.001827418542234227,
1086
- "learning_rate": 2.9999999550001207e-06,
1087
- "loss": 0.0394,
1088
- "num_tokens": 1218086.0,
1089
- "reward": -17.6557559967041,
1090
- "reward_std": 14.141748428344727,
1091
- "rewards/rollout_reward_func/mean": -17.6557559967041,
1092
- "rewards/rollout_reward_func/std": 14.14174747467041,
1093
- "sampling/importance_sampling_ratio/max": 1.4917187690734863,
1094
- "sampling/importance_sampling_ratio/mean": 0.9951067566871643,
1095
- "sampling/importance_sampling_ratio/min": 0.7373844385147095,
1096
- "sampling/sampling_logp_difference/max": 0.22326016426086426,
1097
- "sampling/sampling_logp_difference/mean": 0.013849505223333836,
1098
- "step": 45,
1099
- "step_time": 12.33521299799986
1100
- },
1101
- {
1102
- "clip_ratio/high_max": 0.0,
1103
- "clip_ratio/high_mean": 0.0,
1104
- "clip_ratio/low_mean": 0.0,
1105
- "clip_ratio/low_min": 0.0,
1106
- "clip_ratio/region_mean": 0.0,
1107
- "entropy": 0.4387468546628952,
1108
- "epoch": 0.00092,
1109
- "grad_norm": 0.4409801959991455,
1110
- "kl": 0.0015551562537439167,
1111
- "learning_rate": 2.999999944444594e-06,
1112
- "loss": 0.04,
1113
- "step": 46,
1114
- "step_time": 4.31953496000051
1115
- },
1116
- {
1117
- "clip_ratio/high_max": 0.0,
1118
- "clip_ratio/high_mean": 0.0,
1119
- "clip_ratio/low_mean": 0.0,
1120
- "clip_ratio/low_min": 0.0,
1121
- "clip_ratio/region_mean": 0.0,
1122
- "completions/clipped_ratio": 0.71875,
1123
- "completions/max_length": 16.0,
1124
- "completions/max_terminated_length": 14.0,
1125
- "completions/mean_length": 14.0,
1126
- "completions/mean_terminated_length": 8.88888931274414,
1127
- "completions/min_length": 4.0,
1128
- "completions/min_terminated_length": 4.0,
1129
- "entropy": 0.4944975897669792,
1130
- "epoch": 0.00094,
1131
- "frac_reward_zero_std": 0.0,
1132
- "grad_norm": 0.6308720707893372,
1133
- "kl": 0.0014178274941514246,
1134
- "learning_rate": 2.999999932777959e-06,
1135
- "loss": 0.037,
1136
- "num_tokens": 1271722.0,
1137
- "reward": -16.693885803222656,
1138
- "reward_std": 18.024343490600586,
1139
- "rewards/rollout_reward_func/mean": -16.693885803222656,
1140
- "rewards/rollout_reward_func/std": 18.02434539794922,
1141
- "sampling/importance_sampling_ratio/max": 1.36251699924469,
1142
- "sampling/importance_sampling_ratio/mean": 0.9912035465240479,
1143
- "sampling/importance_sampling_ratio/min": 0.8056352734565735,
1144
- "sampling/sampling_logp_difference/max": 0.24249553680419922,
1145
- "sampling/sampling_logp_difference/mean": 0.011305240914225578,
1146
- "step": 47,
1147
- "step_time": 12.623038148999512
1148
- },
1149
- {
1150
- "clip_ratio/high_max": 0.0,
1151
- "clip_ratio/high_mean": 0.0,
1152
- "clip_ratio/low_mean": 0.0,
1153
- "clip_ratio/low_min": 0.0,
1154
- "clip_ratio/region_mean": 0.0,
1155
- "entropy": 0.4939929470419884,
1156
- "epoch": 0.00096,
1157
- "grad_norm": 0.6490526795387268,
1158
- "kl": 0.0013543795867008157,
1159
- "learning_rate": 2.9999999200002154e-06,
1160
- "loss": 0.0363,
1161
- "step": 48,
1162
- "step_time": 3.6226852679997137
1163
- },
1164
- {
1165
- "clip_ratio/high_max": 0.0,
1166
- "clip_ratio/high_mean": 0.0,
1167
- "clip_ratio/low_mean": 0.001953125,
1168
- "clip_ratio/low_min": 0.0,
1169
- "clip_ratio/region_mean": 0.001953125,
1170
- "completions/clipped_ratio": 0.8125,
1171
- "completions/max_length": 16.0,
1172
- "completions/max_terminated_length": 16.0,
1173
- "completions/mean_length": 15.0,
1174
- "completions/mean_terminated_length": 10.666666984558105,
1175
- "completions/min_length": 4.0,
1176
- "completions/min_terminated_length": 4.0,
1177
- "entropy": 0.419900543987751,
1178
- "epoch": 0.00098,
1179
- "frac_reward_zero_std": 0.0,
1180
- "grad_norm": 0.4623699188232422,
1181
- "kl": 0.0007145817362470552,
1182
- "learning_rate": 2.999999906111364e-06,
1183
- "loss": 0.1004,
1184
- "num_tokens": 1325693.0,
1185
- "reward": -20.183937072753906,
1186
- "reward_std": 13.987285614013672,
1187
- "rewards/rollout_reward_func/mean": -20.183937072753906,
1188
- "rewards/rollout_reward_func/std": 13.987285614013672,
1189
- "sampling/importance_sampling_ratio/max": 1.3713921308517456,
1190
- "sampling/importance_sampling_ratio/mean": 1.0176022052764893,
1191
- "sampling/importance_sampling_ratio/min": 0.8512320518493652,
1192
- "sampling/sampling_logp_difference/max": 0.18564534187316895,
1193
- "sampling/sampling_logp_difference/mean": 0.012047262862324715,
1194
- "step": 49,
1195
- "step_time": 12.366732422000041
1196
- },
1197
- {
1198
- "clip_ratio/high_max": 0.012500000186264515,
1199
- "clip_ratio/high_mean": 0.0031250000465661287,
1200
- "clip_ratio/low_mean": 0.0,
1201
- "clip_ratio/low_min": 0.0,
1202
- "clip_ratio/region_mean": 0.0031250000465661287,
1203
- "entropy": 0.42126236110925674,
1204
- "epoch": 0.001,
1205
- "grad_norm": 0.42615005373954773,
1206
- "kl": 0.0008786674443399534,
1207
- "learning_rate": 2.9999998911114045e-06,
1208
- "loss": 0.0997,
1209
- "step": 50,
1210
- "step_time": 3.6702840139996624
1211
- },
1212
- {
1213
- "clip_ratio/high_max": 0.0,
1214
- "clip_ratio/high_mean": 0.0,
1215
- "clip_ratio/low_mean": 0.0,
1216
- "clip_ratio/low_min": 0.0,
1217
- "clip_ratio/region_mean": 0.0,
1218
- "completions/clipped_ratio": 0.84375,
1219
- "completions/max_length": 16.0,
1220
- "completions/max_terminated_length": 16.0,
1221
- "completions/mean_length": 15.0625,
1222
- "completions/mean_terminated_length": 10.0,
1223
- "completions/min_length": 3.0,
1224
- "completions/min_terminated_length": 3.0,
1225
- "entropy": 0.45311274379491806,
1226
- "epoch": 0.00102,
1227
- "frac_reward_zero_std": 0.0,
1228
- "grad_norm": 0.5940121412277222,
1229
- "kl": 0.0008269677782664075,
1230
- "learning_rate": 2.9999998750003373e-06,
1231
- "loss": 0.0528,
1232
- "num_tokens": 1378085.0,
1233
- "reward": -21.42279815673828,
1234
- "reward_std": 15.290786743164062,
1235
- "rewards/rollout_reward_func/mean": -21.42279815673828,
1236
- "rewards/rollout_reward_func/std": 15.290786743164062,
1237
- "sampling/importance_sampling_ratio/max": 1.2291712760925293,
1238
- "sampling/importance_sampling_ratio/mean": 1.0124390125274658,
1239
- "sampling/importance_sampling_ratio/min": 0.8846192359924316,
1240
- "sampling/sampling_logp_difference/max": 0.21129131317138672,
1241
- "sampling/sampling_logp_difference/mean": 0.009809674695134163,
1242
- "step": 51,
1243
- "step_time": 13.111581994000062
1244
- },
1245
- {
1246
- "clip_ratio/high_max": 0.0,
1247
- "clip_ratio/high_mean": 0.0,
1248
- "clip_ratio/low_mean": 0.0,
1249
- "clip_ratio/low_min": 0.0,
1250
- "clip_ratio/region_mean": 0.0,
1251
- "entropy": 0.45369481295347214,
1252
- "epoch": 0.00104,
1253
- "grad_norm": 0.5907822251319885,
1254
- "kl": 0.0009798892569961026,
1255
- "learning_rate": 2.999999857778162e-06,
1256
- "loss": 0.0518,
1257
- "step": 52,
1258
- "step_time": 3.608162464000088
1259
- },
1260
- {
1261
- "clip_ratio/high_max": 0.01315789483487606,
1262
- "clip_ratio/high_mean": 0.003289473708719015,
1263
- "clip_ratio/low_mean": 0.0,
1264
- "clip_ratio/low_min": 0.0,
1265
- "clip_ratio/region_mean": 0.003289473708719015,
1266
- "completions/clipped_ratio": 0.5625,
1267
- "completions/max_length": 16.0,
1268
- "completions/max_terminated_length": 14.0,
1269
- "completions/mean_length": 12.28125,
1270
- "completions/mean_terminated_length": 7.500000476837158,
1271
- "completions/min_length": 3.0,
1272
- "completions/min_terminated_length": 3.0,
1273
- "entropy": 0.443049892783165,
1274
- "epoch": 0.00106,
1275
- "frac_reward_zero_std": 0.0,
1276
- "grad_norm": 0.45799383521080017,
1277
- "kl": 0.0016540068900212646,
1278
- "learning_rate": 2.9999998394448782e-06,
1279
- "loss": 0.1527,
1280
- "num_tokens": 1429659.0,
1281
- "reward": -19.423545837402344,
1282
- "reward_std": 16.36824607849121,
1283
- "rewards/rollout_reward_func/mean": -19.423545837402344,
1284
- "rewards/rollout_reward_func/std": 16.36824607849121,
1285
- "sampling/importance_sampling_ratio/max": 1.430258870124817,
1286
- "sampling/importance_sampling_ratio/mean": 1.0030834674835205,
1287
- "sampling/importance_sampling_ratio/min": 0.7807595729827881,
1288
- "sampling/sampling_logp_difference/max": 0.17966920137405396,
1289
- "sampling/sampling_logp_difference/mean": 0.015100691467523575,
1290
- "step": 53,
1291
- "step_time": 12.377532298999995
1292
- },
1293
- {
1294
- "clip_ratio/high_max": 0.0,
1295
- "clip_ratio/high_mean": 0.0,
1296
- "clip_ratio/low_mean": 0.001953125,
1297
- "clip_ratio/low_min": 0.0,
1298
- "clip_ratio/region_mean": 0.001953125,
1299
- "entropy": 0.4446665346622467,
1300
- "epoch": 0.00108,
1301
- "grad_norm": 0.47425946593284607,
1302
- "kl": 0.001763768566888757,
1303
- "learning_rate": 2.9999998200004873e-06,
1304
- "loss": 0.1528,
1305
- "step": 54,
1306
- "step_time": 3.616540865999923
1307
- },
1308
- {
1309
- "clip_ratio/high_max": 0.0,
1310
- "clip_ratio/high_mean": 0.0,
1311
- "clip_ratio/low_mean": 0.0,
1312
- "clip_ratio/low_min": 0.0,
1313
- "clip_ratio/region_mean": 0.0,
1314
- "completions/clipped_ratio": 0.71875,
1315
- "completions/max_length": 16.0,
1316
- "completions/max_terminated_length": 11.0,
1317
- "completions/mean_length": 13.375,
1318
- "completions/mean_terminated_length": 6.666666507720947,
1319
- "completions/min_length": 3.0,
1320
- "completions/min_terminated_length": 3.0,
1321
- "entropy": 0.5619853883981705,
1322
- "epoch": 0.0011,
1323
- "frac_reward_zero_std": 0.0,
1324
- "grad_norm": 0.6419393420219421,
1325
- "kl": 0.00159555982099846,
1326
- "learning_rate": 2.9999997994449878e-06,
1327
- "loss": 0.0778,
1328
- "num_tokens": 1483948.0,
1329
- "reward": -20.401737213134766,
1330
- "reward_std": 16.437597274780273,
1331
- "rewards/rollout_reward_func/mean": -20.401737213134766,
1332
- "rewards/rollout_reward_func/std": 16.437597274780273,
1333
- "sampling/importance_sampling_ratio/max": 1.144453525543213,
1334
- "sampling/importance_sampling_ratio/mean": 0.9405947923660278,
1335
- "sampling/importance_sampling_ratio/min": 0.610896110534668,
1336
- "sampling/sampling_logp_difference/max": 0.264723539352417,
1337
- "sampling/sampling_logp_difference/mean": 0.016143541783094406,
1338
- "step": 55,
1339
- "step_time": 14.737511243999961
1340
- },
1341
- {
1342
- "clip_ratio/high_max": 0.0,
1343
- "clip_ratio/high_mean": 0.0,
1344
- "clip_ratio/low_mean": 0.0,
1345
- "clip_ratio/low_min": 0.0,
1346
- "clip_ratio/region_mean": 0.0,
1347
- "entropy": 0.5617516338825226,
1348
- "epoch": 0.00112,
1349
- "grad_norm": 0.6307651996612549,
1350
- "kl": 0.0013526002294383943,
1351
- "learning_rate": 2.9999997777783805e-06,
1352
- "loss": 0.0769,
1353
- "step": 56,
1354
- "step_time": 3.6947174689994426
1355
- },
1356
- {
1357
- "clip_ratio/high_max": 0.0,
1358
- "clip_ratio/high_mean": 0.0,
1359
- "clip_ratio/low_mean": 0.0,
1360
- "clip_ratio/low_min": 0.0,
1361
- "clip_ratio/region_mean": 0.0,
1362
- "completions/clipped_ratio": 0.65625,
1363
- "completions/max_length": 16.0,
1364
- "completions/max_terminated_length": 16.0,
1365
- "completions/mean_length": 13.96875,
1366
- "completions/mean_terminated_length": 10.090909004211426,
1367
- "completions/min_length": 3.0,
1368
- "completions/min_terminated_length": 3.0,
1369
- "entropy": 0.44592302292585373,
1370
- "epoch": 0.00114,
1371
- "frac_reward_zero_std": 0.0,
1372
- "grad_norm": 0.4100278317928314,
1373
- "kl": 0.0011417547648306936,
1374
- "learning_rate": 2.999999755000665e-06,
1375
- "loss": 0.1278,
1376
- "num_tokens": 1537342.0,
1377
- "reward": -19.665666580200195,
1378
- "reward_std": 15.70829963684082,
1379
- "rewards/rollout_reward_func/mean": -19.665666580200195,
1380
- "rewards/rollout_reward_func/std": 15.708298683166504,
1381
- "sampling/importance_sampling_ratio/max": 1.1984679698944092,
1382
- "sampling/importance_sampling_ratio/mean": 1.0152921676635742,
1383
- "sampling/importance_sampling_ratio/min": 0.6651595830917358,
1384
- "sampling/sampling_logp_difference/max": 0.22649931907653809,
1385
- "sampling/sampling_logp_difference/mean": 0.011754296720027924,
1386
- "step": 57,
1387
- "step_time": 13.501649334999911
1388
- },
1389
- {
1390
- "clip_ratio/high_max": 0.0,
1391
- "clip_ratio/high_mean": 0.0,
1392
- "clip_ratio/low_mean": 0.0,
1393
- "clip_ratio/low_min": 0.0,
1394
- "clip_ratio/region_mean": 0.0,
1395
- "entropy": 0.4463474452495575,
1396
- "epoch": 0.00116,
1397
- "grad_norm": 0.40949589014053345,
1398
- "kl": 0.0009677642083261162,
1399
- "learning_rate": 2.9999997311118423e-06,
1400
- "loss": 0.1279,
1401
- "step": 58,
1402
- "step_time": 3.6741236200000458
1403
- },
1404
- {
1405
- "clip_ratio/high_max": 0.0,
1406
- "clip_ratio/high_mean": 0.0,
1407
- "clip_ratio/low_mean": 0.0,
1408
- "clip_ratio/low_min": 0.0,
1409
- "clip_ratio/region_mean": 0.0,
1410
- "completions/clipped_ratio": 0.8125,
1411
- "completions/max_length": 16.0,
1412
- "completions/max_terminated_length": 13.0,
1413
- "completions/mean_length": 14.46875,
1414
- "completions/mean_terminated_length": 7.833333492279053,
1415
- "completions/min_length": 3.0,
1416
- "completions/min_terminated_length": 3.0,
1417
- "entropy": 0.42436008155345917,
1418
- "epoch": 0.00118,
1419
- "frac_reward_zero_std": 0.0,
1420
- "grad_norm": 0.43110400438308716,
1421
- "kl": 0.0018153611745219678,
1422
- "learning_rate": 2.9999997061119113e-06,
1423
- "loss": 0.0759,
1424
- "num_tokens": 1590549.0,
1425
- "reward": -23.231876373291016,
1426
- "reward_std": 14.007892608642578,
1427
- "rewards/rollout_reward_func/mean": -23.231876373291016,
1428
- "rewards/rollout_reward_func/std": 14.007891654968262,
1429
- "sampling/importance_sampling_ratio/max": 1.177204966545105,
1430
- "sampling/importance_sampling_ratio/mean": 0.9738996624946594,
1431
- "sampling/importance_sampling_ratio/min": 0.7864365577697754,
1432
- "sampling/sampling_logp_difference/max": 0.11908435821533203,
1433
- "sampling/sampling_logp_difference/mean": 0.010427574627101421,
1434
- "step": 59,
1435
- "step_time": 13.387355422999917
1436
- },
1437
- {
1438
- "clip_ratio/high_max": 0.0,
1439
- "clip_ratio/high_mean": 0.0,
1440
- "clip_ratio/low_mean": 0.0,
1441
- "clip_ratio/low_min": 0.0,
1442
- "clip_ratio/region_mean": 0.0,
1443
- "entropy": 0.42374272644519806,
1444
- "epoch": 0.0012,
1445
- "grad_norm": 0.4322911202907562,
1446
- "kl": 0.0018385612638667226,
1447
- "learning_rate": 2.999999680000872e-06,
1448
- "loss": 0.0772,
1449
- "step": 60,
1450
- "step_time": 3.6944392599998537
1451
- },
1452
- {
1453
- "clip_ratio/high_max": 0.0078125,
1454
- "clip_ratio/high_mean": 0.001953125,
1455
- "clip_ratio/low_mean": 0.0,
1456
- "clip_ratio/low_min": 0.0,
1457
- "clip_ratio/region_mean": 0.001953125,
1458
- "completions/clipped_ratio": 0.71875,
1459
- "completions/max_length": 16.0,
1460
- "completions/max_terminated_length": 16.0,
1461
- "completions/mean_length": 14.25,
1462
- "completions/mean_terminated_length": 9.777777671813965,
1463
- "completions/min_length": 4.0,
1464
- "completions/min_terminated_length": 4.0,
1465
- "entropy": 0.4840910881757736,
1466
- "epoch": 0.00122,
1467
- "frac_reward_zero_std": 0.125,
1468
- "grad_norm": 0.42527732253074646,
1469
- "kl": 0.001206837929203175,
1470
- "learning_rate": 2.9999996527787258e-06,
1471
- "loss": -0.0195,
1472
- "num_tokens": 1644232.0,
1473
- "reward": -20.776317596435547,
1474
- "reward_std": 15.025127410888672,
1475
- "rewards/rollout_reward_func/mean": -20.776317596435547,
1476
- "rewards/rollout_reward_func/std": 15.025128364562988,
1477
- "sampling/importance_sampling_ratio/max": 1.7264726161956787,
1478
- "sampling/importance_sampling_ratio/mean": 1.053358554840088,
1479
- "sampling/importance_sampling_ratio/min": 0.8105581402778625,
1480
- "sampling/sampling_logp_difference/max": 0.23974990844726562,
1481
- "sampling/sampling_logp_difference/mean": 0.014413169585168362,
1482
- "step": 61,
1483
- "step_time": 12.136578666999867
1484
- },
1485
- {
1486
- "clip_ratio/high_max": 0.0,
1487
- "clip_ratio/high_mean": 0.0,
1488
- "clip_ratio/low_mean": 0.0,
1489
- "clip_ratio/low_min": 0.0,
1490
- "clip_ratio/region_mean": 0.0,
1491
- "entropy": 0.4840751215815544,
1492
- "epoch": 0.00124,
1493
- "grad_norm": 0.4295765161514282,
1494
- "kl": 0.0011497176892589778,
1495
- "learning_rate": 2.9999996244454716e-06,
1496
- "loss": -0.0186,
1497
- "step": 62,
1498
- "step_time": 3.65384825800038
1499
- },
1500
- {
1501
- "clip_ratio/high_max": 0.0,
1502
- "clip_ratio/high_mean": 0.0,
1503
- "clip_ratio/low_mean": 0.0,
1504
- "clip_ratio/low_min": 0.0,
1505
- "clip_ratio/region_mean": 0.0,
1506
- "completions/clipped_ratio": 0.5625,
1507
- "completions/max_length": 16.0,
1508
- "completions/max_terminated_length": 15.0,
1509
- "completions/mean_length": 13.875,
1510
- "completions/mean_terminated_length": 11.142857551574707,
1511
- "completions/min_length": 3.0,
1512
- "completions/min_terminated_length": 3.0,
1513
- "entropy": 0.3919781446456909,
1514
- "epoch": 0.00126,
1515
- "frac_reward_zero_std": 0.0,
1516
- "grad_norm": 0.4190310835838318,
1517
- "kl": 0.00048243661876767874,
1518
- "learning_rate": 2.999999595001109e-06,
1519
- "loss": 0.1343,
1520
- "num_tokens": 1699341.0,
1521
- "reward": -14.78390884399414,
1522
- "reward_std": 15.05773639678955,
1523
- "rewards/rollout_reward_func/mean": -14.78390884399414,
1524
- "rewards/rollout_reward_func/std": 15.05773639678955,
1525
- "sampling/importance_sampling_ratio/max": 1.1515072584152222,
1526
- "sampling/importance_sampling_ratio/mean": 0.9987464547157288,
1527
- "sampling/importance_sampling_ratio/min": 0.8501996397972107,
1528
- "sampling/sampling_logp_difference/max": 0.20190739631652832,
1529
- "sampling/sampling_logp_difference/mean": 0.008220545016229153,
1530
- "step": 63,
1531
- "step_time": 12.735009469000488
1532
- },
1533
- {
1534
- "clip_ratio/high_max": 0.0,
1535
- "clip_ratio/high_mean": 0.0,
1536
- "clip_ratio/low_mean": 0.0,
1537
- "clip_ratio/low_min": 0.0,
1538
- "clip_ratio/region_mean": 0.0,
1539
- "entropy": 0.3920908346772194,
1540
- "epoch": 0.00128,
1541
- "grad_norm": 0.41324248909950256,
1542
- "kl": 0.000511926511535421,
1543
- "learning_rate": 2.9999995644456395e-06,
1544
- "loss": 0.1353,
1545
- "step": 64,
1546
- "step_time": 4.1289103880003495
1547
- },
1548
- {
1549
- "clip_ratio/high_max": 0.0,
1550
- "clip_ratio/high_mean": 0.0,
1551
- "clip_ratio/low_mean": 0.0,
1552
- "clip_ratio/low_min": 0.0,
1553
- "clip_ratio/region_mean": 0.0,
1554
- "completions/clipped_ratio": 0.875,
1555
- "completions/max_length": 16.0,
1556
- "completions/max_terminated_length": 6.0,
1557
- "completions/mean_length": 14.5625,
1558
- "completions/mean_terminated_length": 4.5,
1559
- "completions/min_length": 4.0,
1560
- "completions/min_terminated_length": 4.0,
1561
- "entropy": 0.4723147228360176,
1562
- "epoch": 0.0013,
1563
- "frac_reward_zero_std": 0.0,
1564
- "grad_norm": 0.521558403968811,
1565
- "kl": 0.0010458012475282885,
1566
- "learning_rate": 2.9999995327790616e-06,
1567
- "loss": 0.1397,
1568
- "num_tokens": 1752125.0,
1569
- "reward": -23.402069091796875,
1570
- "reward_std": 12.324657440185547,
1571
- "rewards/rollout_reward_func/mean": -23.402069091796875,
1572
- "rewards/rollout_reward_func/std": 12.324657440185547,
1573
- "sampling/importance_sampling_ratio/max": 1.294718623161316,
1574
- "sampling/importance_sampling_ratio/mean": 1.0054374933242798,
1575
- "sampling/importance_sampling_ratio/min": 0.7819744944572449,
1576
- "sampling/sampling_logp_difference/max": 0.2261056900024414,
1577
- "sampling/sampling_logp_difference/mean": 0.013052692636847496,
1578
- "step": 65,
1579
- "step_time": 12.858984567999869
1580
- },
1581
- {
1582
- "clip_ratio/high_max": 0.0,
1583
- "clip_ratio/high_mean": 0.0,
1584
- "clip_ratio/low_mean": 0.0,
1585
- "clip_ratio/low_min": 0.0,
1586
- "clip_ratio/region_mean": 0.0,
1587
- "entropy": 0.4741643965244293,
1588
- "epoch": 0.00132,
1589
- "grad_norm": 0.5274317860603333,
1590
- "kl": 0.001182250547572039,
1591
- "learning_rate": 2.9999995000013764e-06,
1592
- "loss": 0.1407,
1593
- "step": 66,
1594
- "step_time": 3.581743365999955
1595
- },
1596
- {
1597
- "clip_ratio/high_max": 0.0,
1598
- "clip_ratio/high_mean": 0.0,
1599
- "clip_ratio/low_mean": 0.0,
1600
- "clip_ratio/low_min": 0.0,
1601
- "clip_ratio/region_mean": 0.0,
1602
- "completions/clipped_ratio": 0.75,
1603
- "completions/max_length": 16.0,
1604
- "completions/max_terminated_length": 16.0,
1605
- "completions/mean_length": 14.34375,
1606
- "completions/mean_terminated_length": 9.375,
1607
- "completions/min_length": 3.0,
1608
- "completions/min_terminated_length": 3.0,
1609
- "entropy": 0.47390806674957275,
1610
- "epoch": 0.00134,
1611
- "frac_reward_zero_std": 0.0,
1612
- "grad_norm": 0.5683096051216125,
1613
- "kl": 0.0013981750817038119,
1614
- "learning_rate": 2.9999994661125834e-06,
1615
- "loss": 0.1355,
1616
- "num_tokens": 1806198.0,
1617
- "reward": -19.801048278808594,
1618
- "reward_std": 16.17534637451172,
1619
- "rewards/rollout_reward_func/mean": -19.801048278808594,
1620
- "rewards/rollout_reward_func/std": 16.17534637451172,
1621
- "sampling/importance_sampling_ratio/max": 1.4784481525421143,
1622
- "sampling/importance_sampling_ratio/mean": 1.0331366062164307,
1623
- "sampling/importance_sampling_ratio/min": 0.8605573177337646,
1624
- "sampling/sampling_logp_difference/max": 0.20436906814575195,
1625
- "sampling/sampling_logp_difference/mean": 0.01395871490240097,
1626
- "step": 67,
1627
- "step_time": 12.371656173999781
1628
- },
1629
- {
1630
- "clip_ratio/high_max": 0.0,
1631
- "clip_ratio/high_mean": 0.0,
1632
- "clip_ratio/low_mean": 0.0,
1633
- "clip_ratio/low_min": 0.0,
1634
- "clip_ratio/region_mean": 0.0,
1635
- "entropy": 0.47276531904935837,
1636
- "epoch": 0.00136,
1637
- "grad_norm": 0.5657866597175598,
1638
- "kl": 0.0015692545857746154,
1639
- "learning_rate": 2.999999431112683e-06,
1640
- "loss": 0.1327,
1641
- "step": 68,
1642
- "step_time": 3.63474406000023
1643
- },
1644
- {
1645
- "clip_ratio/high_max": 0.0,
1646
- "clip_ratio/high_mean": 0.0,
1647
- "clip_ratio/low_mean": 0.0,
1648
- "clip_ratio/low_min": 0.0,
1649
- "clip_ratio/region_mean": 0.0,
1650
- "completions/clipped_ratio": 0.65625,
1651
- "completions/max_length": 16.0,
1652
- "completions/max_terminated_length": 15.0,
1653
- "completions/mean_length": 13.21875,
1654
- "completions/mean_terminated_length": 7.909090995788574,
1655
- "completions/min_length": 3.0,
1656
- "completions/min_terminated_length": 3.0,
1657
- "entropy": 0.4877706617116928,
1658
- "epoch": 0.00138,
1659
- "frac_reward_zero_std": 0.0,
1660
- "grad_norm": 0.5023260712623596,
1661
- "kl": 0.0024651199055369943,
1662
- "learning_rate": 2.999999395001675e-06,
1663
- "loss": 0.0828,
1664
- "num_tokens": 1858004.0,
1665
- "reward": -18.188522338867188,
1666
- "reward_std": 19.19774627685547,
1667
- "rewards/rollout_reward_func/mean": -18.188522338867188,
1668
- "rewards/rollout_reward_func/std": 19.197744369506836,
1669
- "sampling/importance_sampling_ratio/max": 1.3357534408569336,
1670
- "sampling/importance_sampling_ratio/mean": 0.9936745166778564,
1671
- "sampling/importance_sampling_ratio/min": 0.7666462063789368,
1672
- "sampling/sampling_logp_difference/max": 0.20173311233520508,
1673
- "sampling/sampling_logp_difference/mean": 0.016002200543880463,
1674
- "step": 69,
1675
- "step_time": 12.922190936000334
1676
- },
1677
- {
1678
- "clip_ratio/high_max": 0.0,
1679
- "clip_ratio/high_mean": 0.0,
1680
- "clip_ratio/low_mean": 0.0,
1681
- "clip_ratio/low_min": 0.0,
1682
- "clip_ratio/region_mean": 0.0,
1683
- "entropy": 0.48947182297706604,
1684
- "epoch": 0.0014,
1685
- "grad_norm": 0.5027741193771362,
1686
- "kl": 0.002827476884704083,
1687
- "learning_rate": 2.9999993577795593e-06,
1688
- "loss": 0.0816,
1689
- "step": 70,
1690
- "step_time": 3.6084864500003277
1691
- },
1692
- {
1693
- "clip_ratio/high_max": 0.0,
1694
- "clip_ratio/high_mean": 0.0,
1695
- "clip_ratio/low_mean": 0.0,
1696
- "clip_ratio/low_min": 0.0,
1697
- "clip_ratio/region_mean": 0.0,
1698
- "completions/clipped_ratio": 0.65625,
1699
- "completions/max_length": 16.0,
1700
- "completions/max_terminated_length": 16.0,
1701
- "completions/mean_length": 13.15625,
1702
- "completions/mean_terminated_length": 7.727272987365723,
1703
- "completions/min_length": 4.0,
1704
- "completions/min_terminated_length": 4.0,
1705
- "entropy": 0.4982185810804367,
1706
- "epoch": 0.00142,
1707
- "frac_reward_zero_std": 0.0,
1708
- "grad_norm": 0.5128209590911865,
1709
- "kl": 0.004301712120650336,
1710
- "learning_rate": 2.999999319446336e-06,
1711
- "loss": 0.1722,
1712
- "num_tokens": 1912265.0,
1713
- "reward": -19.690006256103516,
1714
- "reward_std": 15.063920974731445,
1715
- "rewards/rollout_reward_func/mean": -19.690006256103516,
1716
- "rewards/rollout_reward_func/std": 15.063920974731445,
1717
- "sampling/importance_sampling_ratio/max": 1.3594207763671875,
1718
- "sampling/importance_sampling_ratio/mean": 1.0231478214263916,
1719
- "sampling/importance_sampling_ratio/min": 0.6679673790931702,
1720
- "sampling/sampling_logp_difference/max": 0.2407078742980957,
1721
- "sampling/sampling_logp_difference/mean": 0.013544058427214622,
1722
- "step": 71,
1723
- "step_time": 12.93631576200005
1724
- },
1725
- {
1726
- "clip_ratio/high_max": 0.0,
1727
- "clip_ratio/high_mean": 0.0,
1728
- "clip_ratio/low_mean": 0.0,
1729
- "clip_ratio/low_min": 0.0,
1730
- "clip_ratio/region_mean": 0.0,
1731
- "entropy": 0.49808724224567413,
1732
- "epoch": 0.00144,
1733
- "grad_norm": 0.4984101355075836,
1734
- "kl": 0.005028701314586215,
1735
- "learning_rate": 2.9999992800020054e-06,
1736
- "loss": 0.1724,
1737
- "step": 72,
1738
- "step_time": 3.6676567169997725
1739
- },
1740
- {
1741
- "clip_ratio/high_max": 0.0,
1742
- "clip_ratio/high_mean": 0.0,
1743
- "clip_ratio/low_mean": 0.0,
1744
- "clip_ratio/low_min": 0.0,
1745
- "clip_ratio/region_mean": 0.0,
1746
- "completions/clipped_ratio": 0.71875,
1747
- "completions/max_length": 16.0,
1748
- "completions/max_terminated_length": 14.0,
1749
- "completions/mean_length": 14.09375,
1750
- "completions/mean_terminated_length": 9.222222328186035,
1751
- "completions/min_length": 3.0,
1752
- "completions/min_terminated_length": 3.0,
1753
- "entropy": 0.4910426437854767,
1754
- "epoch": 0.00146,
1755
- "frac_reward_zero_std": 0.0,
1756
- "grad_norm": 0.5857475996017456,
1757
- "kl": 0.0031242658151313663,
1758
- "learning_rate": 2.9999992394465676e-06,
1759
- "loss": 0.0632,
1760
- "num_tokens": 1965150.0,
1761
- "reward": -18.6701717376709,
1762
- "reward_std": 15.390301704406738,
1763
- "rewards/rollout_reward_func/mean": -18.6701717376709,
1764
- "rewards/rollout_reward_func/std": 15.390301704406738,
1765
- "sampling/importance_sampling_ratio/max": 1.6061283349990845,
1766
- "sampling/importance_sampling_ratio/mean": 1.0126081705093384,
1767
- "sampling/importance_sampling_ratio/min": 0.7513319849967957,
1768
- "sampling/sampling_logp_difference/max": 0.38132286071777344,
1769
- "sampling/sampling_logp_difference/mean": 0.013925802893936634,
1770
- "step": 73,
1771
- "step_time": 13.05256272500037
1772
- },
1773
- {
1774
- "clip_ratio/high_max": 0.0,
1775
- "clip_ratio/high_mean": 0.0,
1776
- "clip_ratio/low_mean": 0.0,
1777
- "clip_ratio/low_min": 0.0,
1778
- "clip_ratio/region_mean": 0.0,
1779
- "entropy": 0.4915947765111923,
1780
- "epoch": 0.00148,
1781
- "grad_norm": 0.5674751996994019,
1782
- "kl": 0.003559437900548801,
1783
- "learning_rate": 2.9999991977800225e-06,
1784
- "loss": 0.0631,
1785
- "step": 74,
1786
- "step_time": 4.736504551000053
1787
- },
1788
- {
1789
- "clip_ratio/high_max": 0.0,
1790
- "clip_ratio/high_mean": 0.0,
1791
- "clip_ratio/low_mean": 0.0,
1792
- "clip_ratio/low_min": 0.0,
1793
- "clip_ratio/region_mean": 0.0,
1794
- "completions/clipped_ratio": 0.8125,
1795
- "completions/max_length": 16.0,
1796
- "completions/max_terminated_length": 14.0,
1797
- "completions/mean_length": 14.46875,
1798
- "completions/mean_terminated_length": 7.833333492279053,
1799
- "completions/min_length": 4.0,
1800
- "completions/min_terminated_length": 4.0,
1801
- "entropy": 0.4485122114419937,
1802
- "epoch": 0.0015,
1803
- "frac_reward_zero_std": 0.0,
1804
- "grad_norm": 0.5359442830085754,
1805
- "kl": 0.0026965075376210734,
1806
- "learning_rate": 2.9999991550023697e-06,
1807
- "loss": 0.0904,
1808
- "num_tokens": 2018404.0,
1809
- "reward": -22.368053436279297,
1810
- "reward_std": 13.008703231811523,
1811
- "rewards/rollout_reward_func/mean": -22.368053436279297,
1812
- "rewards/rollout_reward_func/std": 13.008703231811523,
1813
- "sampling/importance_sampling_ratio/max": 1.2978986501693726,
1814
- "sampling/importance_sampling_ratio/mean": 1.035149335861206,
1815
- "sampling/importance_sampling_ratio/min": 0.8059366941452026,
1816
- "sampling/sampling_logp_difference/max": 0.1702725887298584,
1817
- "sampling/sampling_logp_difference/mean": 0.009985161945223808,
1818
- "step": 75,
1819
- "step_time": 13.661403129999599
1820
  }
1821
  ],
1822
  "logging_steps": 1.0,
1823
  "max_steps": 100000,
1824
- "num_input_tokens_seen": 2018404,
1825
  "num_train_epochs": 2,
1826
  "save_steps": 500,
1827
  "stateful_callbacks": {
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.0002,
6
  "eval_steps": 500,
7
+ "global_step": 10,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
15
  "clip_ratio/low_mean": 0.0,
16
  "clip_ratio/low_min": 0.0,
17
  "clip_ratio/region_mean": 0.0,
18
+ "completions/clipped_ratio": 0.90625,
19
  "completions/max_length": 16.0,
20
+ "completions/max_terminated_length": 16.0,
21
+ "completions/mean_length": 15.59375,
22
+ "completions/mean_terminated_length": 11.666666984558105,
23
+ "completions/min_length": 9.0,
24
+ "completions/min_terminated_length": 9.0,
25
+ "entropy": 0.45986470580101013,
26
  "epoch": 2e-05,
27
  "frac_reward_zero_std": 0.0,
28
+ "grad_norm": 0.5555855631828308,
29
  "kl": 0.0,
30
  "learning_rate": 0.0,
31
+ "loss": -0.0036,
32
+ "num_tokens": 53302.0,
33
+ "reward": -20.565004348754883,
34
+ "reward_std": 12.108231544494629,
35
+ "rewards/rollout_reward_func/mean": -20.565004348754883,
36
+ "rewards/rollout_reward_func/std": 12.108230590820312,
37
+ "sampling/importance_sampling_ratio/max": 1.2069631814956665,
38
+ "sampling/importance_sampling_ratio/mean": 1.0164316892623901,
39
+ "sampling/importance_sampling_ratio/min": 0.691445529460907,
40
+ "sampling/sampling_logp_difference/max": 0.14298677444458008,
41
+ "sampling/sampling_logp_difference/mean": 0.01139700785279274,
42
  "step": 1,
43
+ "step_time": 13.593145795000055
44
  },
45
  {
46
  "clip_ratio/high_max": 0.0,
 
48
  "clip_ratio/low_mean": 0.0,
49
  "clip_ratio/low_min": 0.0,
50
  "clip_ratio/region_mean": 0.0,
51
+ "entropy": 0.45986470580101013,
52
  "epoch": 4e-05,
53
+ "grad_norm": 0.5478876829147339,
54
  "kl": 0.0,
55
+ "learning_rate": 8.571428571428574e-08,
56
+ "loss": -0.0036,
57
  "step": 2,
58
+ "step_time": 3.5665654679996805
59
  },
60
  {
61
  "clip_ratio/high_max": 0.0,
62
  "clip_ratio/high_mean": 0.0,
63
+ "clip_ratio/low_mean": 0.0031250000465661287,
64
  "clip_ratio/low_min": 0.0,
65
+ "clip_ratio/region_mean": 0.0031250000465661287,
66
  "completions/clipped_ratio": 0.75,
67
  "completions/max_length": 16.0,
68
+ "completions/max_terminated_length": 16.0,
69
+ "completions/mean_length": 14.4375,
70
+ "completions/mean_terminated_length": 9.75,
71
+ "completions/min_length": 4.0,
72
+ "completions/min_terminated_length": 4.0,
73
+ "entropy": 0.4573390781879425,
74
  "epoch": 6e-05,
75
  "frac_reward_zero_std": 0.0,
76
+ "grad_norm": 0.4110209047794342,
77
+ "kl": 0.00046882770402589813,
78
+ "learning_rate": 1.7142857142857149e-07,
79
+ "loss": 0.1188,
80
+ "num_tokens": 106349.0,
81
+ "reward": -22.559879302978516,
82
+ "reward_std": 14.563770294189453,
83
+ "rewards/rollout_reward_func/mean": -22.559879302978516,
84
+ "rewards/rollout_reward_func/std": 14.563770294189453,
85
+ "sampling/importance_sampling_ratio/max": 1.2753626108169556,
86
+ "sampling/importance_sampling_ratio/mean": 0.9817989468574524,
87
+ "sampling/importance_sampling_ratio/min": 0.8001569509506226,
88
+ "sampling/sampling_logp_difference/max": 0.2265012264251709,
89
+ "sampling/sampling_logp_difference/mean": 0.012659025378525257,
90
  "step": 3,
91
+ "step_time": 13.572802005000085
92
  },
93
  {
94
+ "clip_ratio/high_max": 0.0078125,
95
+ "clip_ratio/high_mean": 0.001953125,
96
  "clip_ratio/low_mean": 0.0,
97
  "clip_ratio/low_min": 0.0,
98
+ "clip_ratio/region_mean": 0.001953125,
99
+ "entropy": 0.45671091973781586,
100
  "epoch": 8e-05,
101
+ "grad_norm": 0.3884861469268799,
102
+ "kl": 0.00040267666918225586,
103
+ "learning_rate": 2.571428571428572e-07,
104
+ "loss": 0.1171,
105
  "step": 4,
106
+ "step_time": 3.672753089998878
107
  },
108
  {
109
+ "clip_ratio/high_max": 0.0,
110
+ "clip_ratio/high_mean": 0.0,
111
  "clip_ratio/low_mean": 0.0,
112
  "clip_ratio/low_min": 0.0,
113
+ "clip_ratio/region_mean": 0.0,
114
+ "completions/clipped_ratio": 0.78125,
115
  "completions/max_length": 16.0,
116
+ "completions/max_terminated_length": 13.0,
117
+ "completions/mean_length": 14.09375,
118
+ "completions/mean_terminated_length": 7.285714626312256,
119
  "completions/min_length": 4.0,
120
  "completions/min_terminated_length": 4.0,
121
+ "entropy": 0.4637308567762375,
122
  "epoch": 0.0001,
123
  "frac_reward_zero_std": 0.0,
124
+ "grad_norm": 0.5923936367034912,
125
+ "kl": 0.00045897291784058325,
126
+ "learning_rate": 3.4285714285714297e-07,
127
+ "loss": 0.1461,
128
+ "num_tokens": 158201.0,
129
+ "reward": -23.239702224731445,
130
+ "reward_std": 11.33220386505127,
131
+ "rewards/rollout_reward_func/mean": -23.239702224731445,
132
+ "rewards/rollout_reward_func/std": 11.332201957702637,
133
+ "sampling/importance_sampling_ratio/max": 1.179511308670044,
134
+ "sampling/importance_sampling_ratio/mean": 0.9810092449188232,
135
+ "sampling/importance_sampling_ratio/min": 0.7553079128265381,
136
+ "sampling/sampling_logp_difference/max": 0.19478893280029297,
137
+ "sampling/sampling_logp_difference/mean": 0.012187635526061058,
138
  "step": 5,
139
+ "step_time": 12.394922012998904
140
  },
141
  {
142
+ "epoch": 0.0001,
143
+ "eval_clip_ratio/high_max": 0.0,
144
+ "eval_clip_ratio/high_mean": 0.0,
145
+ "eval_clip_ratio/low_mean": 0.0,
146
+ "eval_clip_ratio/low_min": 0.0,
147
+ "eval_clip_ratio/region_mean": 0.0,
148
+ "eval_completions/clipped_ratio": 0.85,
149
+ "eval_completions/max_length": 16.0,
150
+ "eval_completions/max_terminated_length": 5.4,
151
+ "eval_completions/mean_length": 15.225,
152
+ "eval_completions/mean_terminated_length": 5.15,
153
+ "eval_completions/min_length": 12.9,
154
+ "eval_completions/min_terminated_length": 4.9,
155
+ "eval_entropy": 0.45741056799888613,
156
+ "eval_frac_reward_zero_std": 0.0,
157
+ "eval_kl": 0.00042195062269456685,
158
+ "eval_loss": 0.004154330585151911,
159
+ "eval_num_tokens": 158201.0,
160
+ "eval_reward": -25.101631736755373,
161
+ "eval_reward_std": 13.229044437408447,
162
+ "eval_rewards/rollout_reward_func/mean": -25.101631736755373,
163
+ "eval_rewards/rollout_reward_func/std": 13.2290442943573,
164
+ "eval_runtime": 13.4386,
165
+ "eval_samples_per_second": 0.744,
166
+ "eval_sampling/importance_sampling_ratio/max": 1.112263822555542,
167
+ "eval_sampling/importance_sampling_ratio/mean": 0.9942494869232178,
168
+ "eval_sampling/importance_sampling_ratio/min": 0.8756316006183624,
169
+ "eval_sampling/sampling_logp_difference/max": 0.1471207022666931,
170
+ "eval_sampling/sampling_logp_difference/mean": 0.013286880496889353,
171
+ "eval_steps_per_second": 0.223,
172
+ "step": 5
173
+ },
174
+ {
175
+ "clip_ratio/high_max": 0.0,
176
+ "clip_ratio/high_mean": 0.0,
177
+ "clip_ratio/low_mean": 0.0,
178
+ "clip_ratio/low_min": 0.0,
179
+ "clip_ratio/region_mean": 0.0,
180
+ "entropy": 0.46300478279590607,
181
  "epoch": 0.00012,
182
+ "grad_norm": 0.5754485726356506,
183
+ "kl": 0.0004794619817403145,
184
+ "learning_rate": 4.2857142857142867e-07,
185
+ "loss": 0.1461,
186
  "step": 6,
187
+ "step_time": 3.5247847910004566
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
188
  },
189
  {
190
  "clip_ratio/high_max": 0.0,
 
194
  "clip_ratio/region_mean": 0.0,
195
  "completions/clipped_ratio": 0.78125,
196
  "completions/max_length": 16.0,
197
+ "completions/max_terminated_length": 14.0,
198
  "completions/mean_length": 14.6875,
199
  "completions/mean_terminated_length": 10.0,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
200
  "completions/min_length": 3.0,
201
  "completions/min_terminated_length": 3.0,
202
+ "entropy": 0.4924817681312561,
203
+ "epoch": 0.00014,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
204
  "frac_reward_zero_std": 0.0,
205
+ "grad_norm": 0.5342097282409668,
206
+ "kl": 0.00047566716602887027,
207
+ "learning_rate": 5.142857142857144e-07,
208
+ "loss": 0.0788,
209
+ "num_tokens": 211022.0,
210
+ "reward": -21.68277359008789,
211
+ "reward_std": 14.98656940460205,
212
+ "rewards/rollout_reward_func/mean": -21.68277359008789,
213
+ "rewards/rollout_reward_func/std": 14.986568450927734,
214
+ "sampling/importance_sampling_ratio/max": 1.2211930751800537,
215
+ "sampling/importance_sampling_ratio/mean": 0.9879010319709778,
216
+ "sampling/importance_sampling_ratio/min": 0.7968342304229736,
217
+ "sampling/sampling_logp_difference/max": 0.2077500820159912,
218
+ "sampling/sampling_logp_difference/mean": 0.015078969299793243,
219
+ "step": 7,
220
+ "step_time": 13.125986247000583
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
221
  },
222
  {
223
  "clip_ratio/high_max": 0.0,
 
225
  "clip_ratio/low_mean": 0.001953125,
226
  "clip_ratio/low_min": 0.0,
227
  "clip_ratio/region_mean": 0.001953125,
228
+ "entropy": 0.4939192831516266,
229
+ "epoch": 0.00016,
230
+ "grad_norm": 0.5269240140914917,
231
+ "kl": 0.0005614961846731603,
232
+ "learning_rate": 6.000000000000002e-07,
233
+ "loss": 0.078,
234
+ "step": 8,
235
+ "step_time": 4.57110312200075
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
236
  },
237
  {
238
  "clip_ratio/high_max": 0.0,
 
240
  "clip_ratio/low_mean": 0.0,
241
  "clip_ratio/low_min": 0.0,
242
  "clip_ratio/region_mean": 0.0,
243
+ "completions/clipped_ratio": 0.84375,
244
  "completions/max_length": 16.0,
245
+ "completions/max_terminated_length": 16.0,
246
+ "completions/mean_length": 15.53125,
247
+ "completions/mean_terminated_length": 13.0,
248
+ "completions/min_length": 3.0,
249
+ "completions/min_terminated_length": 3.0,
250
+ "entropy": 0.4650999382138252,
251
+ "epoch": 0.00018,
252
  "frac_reward_zero_std": 0.0,
253
+ "grad_norm": 0.4413743019104004,
254
+ "kl": 0.0005437940853880718,
255
+ "learning_rate": 6.857142857142859e-07,
256
+ "loss": 0.0419,
257
+ "num_tokens": 262651.0,
258
+ "reward": -24.077335357666016,
259
+ "reward_std": 11.67958927154541,
260
+ "rewards/rollout_reward_func/mean": -24.077335357666016,
261
+ "rewards/rollout_reward_func/std": 11.67958927154541,
262
+ "sampling/importance_sampling_ratio/max": 1.2573319673538208,
263
+ "sampling/importance_sampling_ratio/mean": 1.0008736848831177,
264
+ "sampling/importance_sampling_ratio/min": 0.8466115593910217,
265
+ "sampling/sampling_logp_difference/max": 0.15867352485656738,
266
+ "sampling/sampling_logp_difference/mean": 0.011756965890526772,
267
+ "step": 9,
268
+ "step_time": 12.814324255000429
269
  },
270
  {
271
  "clip_ratio/high_max": 0.0,
 
273
  "clip_ratio/low_mean": 0.0,
274
  "clip_ratio/low_min": 0.0,
275
  "clip_ratio/region_mean": 0.0,
276
+ "entropy": 0.46368078887462616,
277
+ "epoch": 0.0002,
278
+ "grad_norm": 0.4458825886249542,
279
+ "kl": 0.0005188173963688314,
280
+ "learning_rate": 7.714285714285716e-07,
281
+ "loss": 0.0428,
282
+ "step": 10,
283
+ "step_time": 3.553372275999209
284
  },
285
  {
286
+ "epoch": 0.0002,
287
+ "eval_clip_ratio/high_max": 0.0,
288
+ "eval_clip_ratio/high_mean": 0.0,
289
+ "eval_clip_ratio/low_mean": 0.0,
290
+ "eval_clip_ratio/low_min": 0.0,
291
+ "eval_clip_ratio/region_mean": 0.0,
292
+ "eval_completions/clipped_ratio": 0.7,
293
+ "eval_completions/max_length": 16.0,
294
+ "eval_completions/max_terminated_length": 9.8,
295
+ "eval_completions/mean_length": 14.7,
296
+ "eval_completions/mean_terminated_length": 9.16666669845581,
297
+ "eval_completions/min_length": 11.8,
298
+ "eval_completions/min_terminated_length": 8.6,
299
+ "eval_entropy": 0.40578661262989046,
300
+ "eval_frac_reward_zero_std": 0.0,
301
+ "eval_kl": 0.00042531737562967464,
302
+ "eval_loss": 0.02340291067957878,
303
+ "eval_num_tokens": 262651.0,
304
+ "eval_reward": -18.063774490356444,
305
+ "eval_reward_std": 14.382386589050293,
306
+ "eval_rewards/rollout_reward_func/mean": -18.063774490356444,
307
+ "eval_rewards/rollout_reward_func/std": 14.382386589050293,
308
+ "eval_runtime": 13.98,
309
+ "eval_samples_per_second": 0.715,
310
+ "eval_sampling/importance_sampling_ratio/max": 1.1354158759117126,
311
+ "eval_sampling/importance_sampling_ratio/mean": 1.0028302788734436,
312
+ "eval_sampling/importance_sampling_ratio/min": 0.9048627257347107,
313
+ "eval_sampling/sampling_logp_difference/max": 0.13887375593185425,
314
+ "eval_sampling/sampling_logp_difference/mean": 0.011902038706466556,
315
+ "eval_steps_per_second": 0.215,
316
+ "step": 10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
317
  }
318
  ],
319
  "logging_steps": 1.0,
320
  "max_steps": 100000,
321
+ "num_input_tokens_seen": 262651,
322
  "num_train_epochs": 2,
323
  "save_steps": 500,
324
  "stateful_callbacks": {
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fb0e24b2a66d930ca7964cb444586aae8c29ce9e3b9f7dbedde1321db29e772e
3
  size 8081
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6ec21def335d91e767cfc8442100434c1b8f711cd3db15cf69c2a4331fa4d590
3
  size 8081