sonicdog00 committed on
Commit
53db09e
·
verified ·
1 Parent(s): a27716c

Model save

Browse files
Files changed (4) hide show
  1. README.md +2 -4
  2. all_results.json +4 -4
  3. train_results.json +4 -4
  4. trainer_state.json +886 -886
README.md CHANGED
@@ -1,11 +1,9 @@
1
  ---
2
  base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
3
- datasets: knoveleng/open-rs
4
  library_name: transformers
5
  model_name: OpenRS-GRPO
6
  tags:
7
  - generated_from_trainer
8
- - open-r1
9
  - trl
10
  - grpo
11
  licence: license
@@ -13,7 +11,7 @@ licence: license
13
 
14
  # Model Card for OpenRS-GRPO
15
 
16
- This model is a fine-tuned version of [deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B) on the [knoveleng/open-rs](https://huggingface.co/datasets/knoveleng/open-rs) dataset.
17
  It has been trained using [TRL](https://github.com/huggingface/trl).
18
 
19
  ## Quick start
@@ -29,7 +27,7 @@ print(output["generated_text"])
29
 
30
  ## Training procedure
31
 
32
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/sonicdog00-korea-university/openrs/runs/kdawmmgb)
33
 
34
 
35
  This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
 
1
  ---
2
  base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
 
3
  library_name: transformers
4
  model_name: OpenRS-GRPO
5
  tags:
6
  - generated_from_trainer
 
7
  - trl
8
  - grpo
9
  licence: license
 
11
 
12
  # Model Card for OpenRS-GRPO
13
 
14
+ This model is a fine-tuned version of [deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B).
15
  It has been trained using [TRL](https://github.com/huggingface/trl).
16
 
17
  ## Quick start
 
27
 
28
  ## Training procedure
29
 
30
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/sonicdog00-korea-university/openrs/runs/ez1iu4d5)
31
 
32
 
33
  This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.06513786550378427,
4
- "train_runtime": 17727.636,
5
  "train_samples": 7000,
6
- "train_samples_per_second": 0.406,
7
- "train_steps_per_second": 0.006
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 0.0719883194193244,
4
+ "train_runtime": 18385.1788,
5
  "train_samples": 7000,
6
+ "train_samples_per_second": 0.392,
7
+ "train_steps_per_second": 0.005
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.06513786550378427,
4
- "train_runtime": 17727.636,
5
  "train_samples": 7000,
6
- "train_samples_per_second": 0.406,
7
- "train_steps_per_second": 0.006
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 0.0719883194193244,
4
+ "train_runtime": 18385.1788,
5
  "train_samples": 7000,
6
+ "train_samples_per_second": 0.392,
7
+ "train_steps_per_second": 0.005
8
  }
trainer_state.json CHANGED
@@ -10,1612 +10,1612 @@
10
  "log_history": [
11
  {
12
  "clip_ratio": 0.0,
13
- "completion_length": 3135.291748046875,
14
  "epoch": 0.001713796058269066,
15
- "grad_norm": 0.1234593614935875,
16
  "kl": 0.0,
17
  "learning_rate": 1e-07,
18
- "loss": 0.0149,
19
- "reward": 0.23611111752688885,
20
- "reward_std": 0.30932011269032955,
21
- "rewards/accuracy_reward": 0.041666666977107525,
22
  "rewards/difficulty_following_reward": 0.0,
23
- "rewards/format_reward": 0.15277778171002865,
24
  "step": 1,
25
- "vanishing_advantage_ratio": 0.3333333432674408
26
  },
27
  {
28
  "clip_ratio": 0.0,
29
- "completion_length": 2893.9722900390625,
30
  "epoch": 0.003427592116538132,
31
- "grad_norm": 0.1797490417957306,
32
  "kl": 0.0,
33
  "learning_rate": 2e-07,
34
- "loss": 0.1056,
35
- "reward": 0.20833333767950535,
36
- "reward_std": 0.2490638718008995,
37
- "rewards/accuracy_reward": 0.02777777798473835,
38
  "rewards/difficulty_following_reward": 0.0,
39
- "rewards/format_reward": 0.15277778171002865,
40
  "step": 2,
41
- "vanishing_advantage_ratio": 0.2500000074505806
42
  },
43
  {
44
  "clip_ratio": 0.0,
45
- "completion_length": 3052.9862060546875,
46
  "epoch": 0.005141388174807198,
47
- "grad_norm": 0.1267692893743515,
48
- "kl": 4.1365623474121094e-05,
49
  "learning_rate": 3e-07,
50
- "loss": 0.0573,
51
- "reward": 0.243055556435138,
52
- "reward_std": 0.2783401180058718,
53
- "rewards/accuracy_reward": 0.06944444496184587,
54
  "rewards/difficulty_following_reward": 0.0,
55
- "rewards/format_reward": 0.10416666744276881,
56
  "step": 3,
57
- "vanishing_advantage_ratio": 0.5000000149011612
58
  },
59
  {
60
  "clip_ratio": 0.0,
61
- "completion_length": 2740.6944580078125,
62
  "epoch": 0.006855184233076264,
63
- "grad_norm": 0.17930592596530914,
64
- "kl": 3.7789344787597656e-05,
65
  "learning_rate": 4e-07,
66
- "loss": 0.0202,
67
- "reward": 0.32638889737427235,
68
- "reward_std": 0.43739713728427887,
69
- "rewards/accuracy_reward": 0.08333333395421505,
70
  "rewards/difficulty_following_reward": 0.0,
71
- "rewards/format_reward": 0.1597222238779068,
72
  "step": 4,
73
- "vanishing_advantage_ratio": 0.2500000074505806
74
  },
75
  {
76
  "clip_ratio": 0.0,
77
- "completion_length": 2730.8055419921875,
78
  "epoch": 0.00856898029134533,
79
- "grad_norm": 0.1377512514591217,
80
- "kl": 4.1604042053222656e-05,
81
  "learning_rate": 5e-07,
82
- "loss": 0.0621,
83
- "reward": 0.493055559694767,
84
- "reward_std": 0.5705233514308929,
85
- "rewards/accuracy_reward": 0.16666666883975267,
86
  "rewards/difficulty_following_reward": 0.0,
87
- "rewards/format_reward": 0.15972222574055195,
88
  "step": 5,
89
- "vanishing_advantage_ratio": 0.1666666716337204
90
  },
91
  {
92
  "clip_ratio": 0.0,
93
- "completion_length": 2965.138916015625,
94
  "epoch": 0.010282776349614395,
95
- "grad_norm": 0.14657363295555115,
96
- "kl": 5.21540641784668e-05,
97
  "learning_rate": 6e-07,
98
- "loss": 0.0363,
99
- "reward": 0.243055559694767,
100
- "reward_std": 0.38589781522750854,
101
- "rewards/accuracy_reward": 0.0555555559694767,
102
  "rewards/difficulty_following_reward": 0.0,
103
- "rewards/format_reward": 0.1319444477558136,
104
  "step": 6,
105
- "vanishing_advantage_ratio": 0.2777777910232544
106
  },
107
  {
108
  "clip_ratio": 0.0,
109
- "completion_length": 2290.0556030273438,
110
  "epoch": 0.011996572407883462,
111
- "grad_norm": 0.17396818101406097,
112
- "kl": 5.3763389587402344e-05,
113
  "learning_rate": 7e-07,
114
- "loss": 0.0676,
115
- "reward": 0.4305555671453476,
116
- "reward_std": 0.3960871510207653,
117
- "rewards/accuracy_reward": 0.09722222480922937,
118
  "rewards/difficulty_following_reward": 0.0,
119
- "rewards/format_reward": 0.2361111119389534,
120
  "step": 7,
121
  "vanishing_advantage_ratio": 0.3333333432674408
122
  },
123
  {
124
  "clip_ratio": 0.0,
125
- "completion_length": 2718.3056030273438,
126
  "epoch": 0.013710368466152529,
127
- "grad_norm": 0.13110554218292236,
128
- "kl": 3.319978713989258e-05,
129
  "learning_rate": 8e-07,
130
- "loss": 0.0839,
131
- "reward": 0.4722222238779068,
132
- "reward_std": 0.42657903023064137,
133
- "rewards/accuracy_reward": 0.1111111119389534,
134
  "rewards/difficulty_following_reward": 0.0,
135
- "rewards/format_reward": 0.2500000074505806,
136
  "step": 8,
137
- "vanishing_advantage_ratio": 0.4444444589316845
138
  },
139
  {
140
  "clip_ratio": 0.0,
141
- "completion_length": 2462.8055114746094,
142
  "epoch": 0.015424164524421594,
143
- "grad_norm": 0.14741061627864838,
144
- "kl": 3.781914710998535e-05,
145
  "learning_rate": 9e-07,
146
- "loss": 0.0689,
147
- "reward": 0.4583333358168602,
148
- "reward_std": 0.374173566699028,
149
- "rewards/accuracy_reward": 0.1250000037252903,
150
  "rewards/difficulty_following_reward": 0.0,
151
- "rewards/format_reward": 0.20833333767950535,
152
  "step": 9,
153
  "vanishing_advantage_ratio": 0.3333333432674408
154
  },
155
  {
156
  "clip_ratio": 0.0,
157
- "completion_length": 3018.6805725097656,
158
  "epoch": 0.01713796058269066,
159
- "grad_norm": 0.12218267470598221,
160
- "kl": 3.737211227416992e-05,
161
  "learning_rate": 1e-06,
162
- "loss": 0.0729,
163
- "reward": 0.5277777928858995,
164
- "reward_std": 0.5827038362622261,
165
- "rewards/accuracy_reward": 0.19444444496184587,
166
  "rewards/difficulty_following_reward": 0.0,
167
- "rewards/format_reward": 0.13888888899236917,
168
  "step": 10,
169
- "vanishing_advantage_ratio": 0.3333333432674408
170
  },
171
  {
172
  "clip_ratio": 0.0,
173
- "completion_length": 2796.763946533203,
174
  "epoch": 0.018851756640959727,
175
- "grad_norm": 0.19447727501392365,
176
- "kl": 3.8176774978637695e-05,
177
  "learning_rate": 9.997258721585931e-07,
178
- "loss": 0.0653,
179
- "reward": 0.3958333511836827,
180
- "reward_std": 0.3750236164778471,
181
- "rewards/accuracy_reward": 0.1111111156642437,
182
  "rewards/difficulty_following_reward": 0.0,
183
- "rewards/format_reward": 0.1736111124046147,
184
  "step": 11,
185
- "vanishing_advantage_ratio": 0.43055556155741215
186
  },
187
  {
188
  "clip_ratio": 0.0,
189
- "completion_length": 3165.5694580078125,
190
  "epoch": 0.02056555269922879,
191
- "grad_norm": 0.13901270925998688,
192
- "kl": 3.3974647521972656e-05,
193
  "learning_rate": 9.989038226169207e-07,
194
- "loss": 0.0748,
195
- "reward": 0.3263888922519982,
196
- "reward_std": 0.4248256888240576,
197
- "rewards/accuracy_reward": 0.09722222480922937,
198
  "rewards/difficulty_following_reward": 0.0,
199
- "rewards/format_reward": 0.1319444444961846,
200
  "step": 12,
201
- "vanishing_advantage_ratio": 0.2777777910232544
202
  },
203
  {
204
  "clip_ratio": 0.0,
205
- "completion_length": 2812.7083740234375,
206
  "epoch": 0.022279348757497857,
207
- "grad_norm": 0.18154460191726685,
208
- "kl": 2.600252628326416e-05,
209
  "learning_rate": 9.975348529157229e-07,
210
- "loss": 0.0766,
211
- "reward": 0.31944444961845875,
212
- "reward_std": 0.44224512577056885,
213
- "rewards/accuracy_reward": 0.08333333488553762,
214
  "rewards/difficulty_following_reward": 0.0,
215
- "rewards/format_reward": 0.1527777798473835,
216
  "step": 13,
217
- "vanishing_advantage_ratio": 0.1944444552063942
218
  },
219
  {
220
  "clip_ratio": 0.0,
221
- "completion_length": 2717.736114501953,
222
  "epoch": 0.023993144815766924,
223
- "grad_norm": 0.14561408758163452,
224
- "kl": 4.267692565917969e-05,
225
  "learning_rate": 9.956206309337066e-07,
226
- "loss": 0.0128,
227
- "reward": 0.3124999962747097,
228
- "reward_std": 0.3933219611644745,
229
- "rewards/accuracy_reward": 0.06944444589316845,
230
  "rewards/difficulty_following_reward": 0.0,
231
- "rewards/format_reward": 0.17361111380159855,
232
  "step": 14,
233
- "vanishing_advantage_ratio": 0.36111112125217915
234
  },
235
  {
236
  "clip_ratio": 0.0,
237
- "completion_length": 3323.8056030273438,
238
  "epoch": 0.02570694087403599,
239
- "grad_norm": 0.13735535740852356,
240
- "kl": 2.0116567611694336e-05,
241
  "learning_rate": 9.931634888554935e-07,
242
- "loss": 0.0895,
243
- "reward": 0.4027777872979641,
244
- "reward_std": 0.5426600202918053,
245
- "rewards/accuracy_reward": 0.15277778171002865,
246
  "rewards/difficulty_following_reward": 0.0,
247
- "rewards/format_reward": 0.09722222480922937,
248
  "step": 15,
249
- "vanishing_advantage_ratio": 0.1805555671453476
250
  },
251
  {
252
  "clip_ratio": 0.0,
253
- "completion_length": 2356.2500610351562,
254
  "epoch": 0.027420736932305057,
255
- "grad_norm": 0.12638089060783386,
256
- "kl": 3.668665885925293e-05,
257
  "learning_rate": 9.901664203302124e-07,
258
- "loss": 0.0365,
259
- "reward": 0.5902777761220932,
260
- "reward_std": 0.5737965255975723,
261
- "rewards/accuracy_reward": 0.15277778264135122,
262
  "rewards/difficulty_following_reward": 0.0,
263
- "rewards/format_reward": 0.28472222201526165,
264
  "step": 16,
265
- "vanishing_advantage_ratio": 0.347222238779068
266
  },
267
  {
268
  "clip_ratio": 0.0,
269
- "completion_length": 2843.3751220703125,
270
  "epoch": 0.02913453299057412,
271
- "grad_norm": 0.12914972007274628,
272
- "kl": 1.8578022718429565e-05,
273
  "learning_rate": 9.866330768241983e-07,
274
- "loss": 0.0311,
275
- "reward": 0.3333333400078118,
276
- "reward_std": 0.36667141877114773,
277
- "rewards/accuracy_reward": 0.11111111380159855,
278
  "rewards/difficulty_following_reward": 0.0,
279
- "rewards/format_reward": 0.11111111240461469,
280
  "step": 17,
281
- "vanishing_advantage_ratio": 0.5000000149011612
282
  },
283
  {
284
  "clip_ratio": 0.0,
285
- "completion_length": 3191.361083984375,
286
  "epoch": 0.030848329048843187,
287
- "grad_norm": 0.16445335745811462,
288
- "kl": 2.8409063816070557e-05,
289
  "learning_rate": 9.825677631722435e-07,
290
- "loss": 0.0738,
291
- "reward": 0.1736111156642437,
292
- "reward_std": 0.2155014369636774,
293
- "rewards/accuracy_reward": 0.0416666679084301,
294
  "rewards/difficulty_following_reward": 0.0,
295
- "rewards/format_reward": 0.09027778077870607,
296
  "step": 18,
297
- "vanishing_advantage_ratio": 0.5833333432674408
298
  },
299
  {
300
  "clip_ratio": 0.0,
301
- "completion_length": 3148.77783203125,
302
  "epoch": 0.032562125107112254,
303
- "grad_norm": 0.10606271028518677,
304
- "kl": 3.5881996154785156e-05,
305
  "learning_rate": 9.779754323328192e-07,
306
- "loss": 0.0617,
307
- "reward": 0.3194444444961846,
308
- "reward_std": 0.4261201396584511,
309
- "rewards/accuracy_reward": 0.09722222294658422,
310
  "rewards/difficulty_following_reward": 0.0,
311
- "rewards/format_reward": 0.12500000232830644,
312
  "step": 19,
313
- "vanishing_advantage_ratio": 0.3611111268401146
314
  },
315
  {
316
  "clip_ratio": 0.0,
317
- "completion_length": 3273.138916015625,
318
  "epoch": 0.03427592116538132,
319
- "grad_norm": 0.10688318312168121,
320
- "kl": 2.744048833847046e-05,
321
  "learning_rate": 9.728616793536587e-07,
322
- "loss": 0.0473,
323
- "reward": 0.1666666716337204,
324
- "reward_std": 0.24878324940800667,
325
- "rewards/accuracy_reward": 0.02777777798473835,
326
  "rewards/difficulty_following_reward": 0.0,
327
- "rewards/format_reward": 0.11111111426725984,
328
  "step": 20,
329
- "vanishing_advantage_ratio": 0.416666679084301
330
  },
331
  {
332
  "clip_ratio": 0.0,
333
- "completion_length": 2865.1250915527344,
334
  "epoch": 0.03598971722365039,
335
- "grad_norm": 0.1771811544895172,
336
- "kl": 6.73532485961914e-05,
337
  "learning_rate": 9.672327345550543e-07,
338
- "loss": 0.1162,
339
- "reward": 0.3680555671453476,
340
- "reward_std": 0.5005812719464302,
341
- "rewards/accuracy_reward": 0.0972222238779068,
342
  "rewards/difficulty_following_reward": 0.0,
343
- "rewards/format_reward": 0.17361111287027597,
344
  "step": 21,
345
- "vanishing_advantage_ratio": 0.2083333469927311
346
  },
347
  {
348
  "clip_ratio": 0.0,
349
- "completion_length": 2781.2222900390625,
350
  "epoch": 0.037703513281919454,
351
- "grad_norm": 0.14734819531440735,
352
- "kl": 6.908178329467773e-05,
353
  "learning_rate": 9.610954559391704e-07,
354
- "loss": 0.118,
355
- "reward": 0.38888889737427235,
356
- "reward_std": 0.3173447232693434,
357
- "rewards/accuracy_reward": 0.1111111156642437,
358
  "rewards/difficulty_following_reward": 0.0,
359
- "rewards/format_reward": 0.16666666977107525,
360
  "step": 22,
361
  "vanishing_advantage_ratio": 0.3333333432674408
362
  },
363
  {
364
  "clip_ratio": 0.0,
365
- "completion_length": 3032.0833740234375,
366
  "epoch": 0.03941730934018852,
367
- "grad_norm": 0.13921721279621124,
368
- "kl": 9.47713851928711e-05,
369
  "learning_rate": 9.54457320834625e-07,
370
- "loss": 0.0163,
371
- "reward": 0.3194444477558136,
372
- "reward_std": 0.47703739255666733,
373
- "rewards/accuracy_reward": 0.06944444496184587,
374
  "rewards/difficulty_following_reward": 0.0,
375
- "rewards/format_reward": 0.180555559694767,
376
  "step": 23,
377
- "vanishing_advantage_ratio": 0.26388889737427235
378
  },
379
  {
380
  "clip_ratio": 0.0,
381
- "completion_length": 2767.4861450195312,
382
  "epoch": 0.04113110539845758,
383
- "grad_norm": 0.13283671438694,
384
- "kl": 0.00014197826385498047,
385
  "learning_rate": 9.473264167865171e-07,
386
- "loss": 0.0132,
387
- "reward": 0.3194444514811039,
388
- "reward_std": 0.3195137083530426,
389
- "rewards/accuracy_reward": 0.06944444496184587,
390
  "rewards/difficulty_following_reward": 0.0,
391
- "rewards/format_reward": 0.180555559694767,
392
  "step": 24,
393
- "vanishing_advantage_ratio": 0.416666679084301
394
  },
395
  {
396
  "clip_ratio": 0.0,
397
- "completion_length": 2574.625,
398
  "epoch": 0.04284490145672665,
399
- "grad_norm": 0.12056349962949753,
400
- "kl": 0.00019240379333496094,
401
  "learning_rate": 9.397114317029974e-07,
402
- "loss": 0.0681,
403
- "reward": 0.5000000121071935,
404
- "reward_std": 0.3675909973680973,
405
- "rewards/accuracy_reward": 0.13888888899236917,
406
  "rewards/difficulty_following_reward": 0.0,
407
- "rewards/format_reward": 0.22222222853451967,
408
  "step": 25,
409
- "vanishing_advantage_ratio": 0.4583333544433117
410
  },
411
  {
412
  "clip_ratio": 0.0,
413
- "completion_length": 3157.513916015625,
414
  "epoch": 0.044558697514995714,
415
- "grad_norm": 0.11926140636205673,
416
- "kl": 0.00018858909606933594,
417
  "learning_rate": 9.316216432703916e-07,
418
- "loss": 0.0933,
419
- "reward": 0.2708333432674408,
420
- "reward_std": 0.41086217388510704,
421
- "rewards/accuracy_reward": 0.06944444589316845,
422
  "rewards/difficulty_following_reward": 0.0,
423
- "rewards/format_reward": 0.13194444868713617,
424
  "step": 26,
425
- "vanishing_advantage_ratio": 0.347222238779068
426
  },
427
  {
428
  "clip_ratio": 0.0,
429
- "completion_length": 2885.8333740234375,
430
  "epoch": 0.04627249357326478,
431
- "grad_norm": 0.15303786098957062,
432
- "kl": 0.0003833770751953125,
433
  "learning_rate": 9.230669076497687e-07,
434
- "loss": 0.1048,
435
- "reward": 0.3055555606260896,
436
- "reward_std": 0.38973100297152996,
437
- "rewards/accuracy_reward": 0.08333333488553762,
438
  "rewards/difficulty_following_reward": 0.0,
439
- "rewards/format_reward": 0.13888889271765947,
440
  "step": 27,
441
- "vanishing_advantage_ratio": 0.1666666716337204
442
  },
443
  {
444
  "clip_ratio": 0.0,
445
- "completion_length": 2201.6805419921875,
446
  "epoch": 0.04798628963153385,
447
- "grad_norm": 0.1955413967370987,
448
- "kl": 0.0005526542663574219,
449
  "learning_rate": 9.140576474687263e-07,
450
- "loss": 0.0602,
451
- "reward": 0.5625000074505806,
452
- "reward_std": 0.5387887954711914,
453
- "rewards/accuracy_reward": 0.13888889364898205,
454
  "rewards/difficulty_following_reward": 0.0,
455
- "rewards/format_reward": 0.2847222201526165,
456
  "step": 28,
457
- "vanishing_advantage_ratio": 0.2500000074505806
458
  },
459
  {
460
  "clip_ratio": 0.0,
461
- "completion_length": 2828.138916015625,
462
  "epoch": 0.049700085689802914,
463
- "grad_norm": 0.1482599526643753,
464
- "kl": 0.0004673004150390625,
465
  "learning_rate": 9.046048391230247e-07,
466
- "loss": 0.0391,
467
- "reward": 0.4513888955116272,
468
- "reward_std": 0.4079795628786087,
469
- "rewards/accuracy_reward": 0.13888889271765947,
470
  "rewards/difficulty_following_reward": 0.0,
471
- "rewards/format_reward": 0.1736111156642437,
472
  "step": 29,
473
- "vanishing_advantage_ratio": 0.4166666716337204
474
  },
475
  {
476
  "clip_ratio": 0.0,
477
- "completion_length": 2359.527801513672,
478
  "epoch": 0.05141388174807198,
479
- "grad_norm": 0.22218456864356995,
480
- "kl": 0.0006856918334960938,
481
  "learning_rate": 8.9471999940354e-07,
482
- "loss": 0.0802,
483
- "reward": 0.6527777761220932,
484
- "reward_std": 0.5185052454471588,
485
- "rewards/accuracy_reward": 0.18055555690079927,
486
  "rewards/difficulty_following_reward": 0.0,
487
- "rewards/format_reward": 0.29166667349636555,
488
  "step": 30,
489
- "vanishing_advantage_ratio": 0.1666666716337204
490
  },
491
  {
492
  "clip_ratio": 0.0,
493
- "completion_length": 2744.52783203125,
494
  "epoch": 0.05312767780634105,
495
- "grad_norm": 0.16492263972759247,
496
- "kl": 0.000579833984375,
497
  "learning_rate": 8.844151714648274e-07,
498
- "loss": 0.0452,
499
- "reward": 0.3750000037252903,
500
- "reward_std": 0.34963209368288517,
501
- "rewards/accuracy_reward": 0.0833333358168602,
502
  "rewards/difficulty_following_reward": 0.0,
503
- "rewards/format_reward": 0.2083333358168602,
504
  "step": 31,
505
- "vanishing_advantage_ratio": 0.416666679084301
506
  },
507
  {
508
  "clip_ratio": 0.0,
509
- "completion_length": 3013.625,
510
  "epoch": 0.054841473864610114,
511
- "grad_norm": 0.11328811198472977,
512
- "kl": 0.00067138671875,
513
  "learning_rate": 8.737029101523929e-07,
514
- "loss": 0.0737,
515
- "reward": 0.3750000037252903,
516
- "reward_std": 0.5095698311924934,
517
- "rewards/accuracy_reward": 0.12500000279396772,
518
  "rewards/difficulty_following_reward": 0.0,
519
- "rewards/format_reward": 0.12500000186264515,
520
  "step": 32,
521
- "vanishing_advantage_ratio": 0.2638889029622078
522
  },
523
  {
524
  "clip_ratio": 0.0,
525
- "completion_length": 2419.0416870117188,
526
  "epoch": 0.056555269922879174,
527
- "grad_norm": 0.1530231535434723,
528
- "kl": 0.0009984970092773438,
529
  "learning_rate": 8.625962667065487e-07,
530
- "loss": 0.1043,
531
- "reward": 0.5625000074505806,
532
- "reward_std": 0.5970685631036758,
533
- "rewards/accuracy_reward": 0.16666666977107525,
534
  "rewards/difficulty_following_reward": 0.0,
535
- "rewards/format_reward": 0.2291666679084301,
536
  "step": 33,
537
- "vanishing_advantage_ratio": 0.3333333432674408
538
  },
539
  {
540
  "clip_ratio": 0.0,
541
- "completion_length": 2907.861083984375,
542
  "epoch": 0.05826906598114824,
543
- "grad_norm": 0.12301065772771835,
544
- "kl": 0.0006427764892578125,
545
  "learning_rate": 8.511087728614862e-07,
546
- "loss": 0.0463,
547
- "reward": 0.26388888992369175,
548
- "reward_std": 0.4172779843211174,
549
- "rewards/accuracy_reward": 0.06944444589316845,
550
  "rewards/difficulty_following_reward": 0.0,
551
- "rewards/format_reward": 0.1250000037252903,
552
  "step": 34,
553
- "vanishing_advantage_ratio": 0.2500000074505806
554
  },
555
  {
556
  "clip_ratio": 0.0,
557
- "completion_length": 3100.763916015625,
558
  "epoch": 0.05998286203941731,
559
- "grad_norm": 0.12839922308921814,
560
- "kl": 0.0008440017700195312,
561
  "learning_rate": 8.392544243589427e-07,
562
- "loss": 0.0551,
563
- "reward": 0.16666666697710752,
564
- "reward_std": 0.2315547615289688,
565
- "rewards/accuracy_reward": 0.02777777798473835,
566
  "rewards/difficulty_following_reward": 0.0,
567
- "rewards/format_reward": 0.11111111473292112,
568
  "step": 35,
569
- "vanishing_advantage_ratio": 0.3333333432674408
570
  },
571
  {
572
  "clip_ratio": 0.0,
573
- "completion_length": 2822.0555419921875,
574
  "epoch": 0.061696658097686374,
575
- "grad_norm": 0.12107470631599426,
576
- "kl": 0.0008678436279296875,
577
  "learning_rate": 8.270476638965461e-07,
578
- "loss": 0.0289,
579
- "reward": 0.3124999972060323,
580
- "reward_std": 0.22435275837779045,
581
- "rewards/accuracy_reward": 0.0833333358168602,
582
  "rewards/difficulty_following_reward": 0.0,
583
- "rewards/format_reward": 0.14583333674818277,
584
  "step": 36,
585
  "vanishing_advantage_ratio": 0.5000000149011612
586
  },
587
  {
588
  "clip_ratio": 0.0,
589
- "completion_length": 3060.6527709960938,
590
  "epoch": 0.06341045415595545,
591
- "grad_norm": 0.09756799787282944,
592
- "kl": 0.0010995864868164062,
593
  "learning_rate": 8.145033635316128e-07,
594
- "loss": 0.0338,
595
- "reward": 0.1736111156642437,
596
- "reward_std": 0.17782795429229736,
597
- "rewards/accuracy_reward": 0.02777777798473835,
598
  "rewards/difficulty_following_reward": 0.0,
599
- "rewards/format_reward": 0.11805556016042829,
600
  "step": 37,
601
- "vanishing_advantage_ratio": 0.5833333432674408
602
  },
603
  {
604
  "clip_ratio": 0.0,
605
- "completion_length": 3165.7777709960938,
606
  "epoch": 0.06512425021422451,
607
- "grad_norm": 0.11750654131174088,
608
- "kl": 0.0009393692016601562,
609
  "learning_rate": 8.01636806561836e-07,
610
- "loss": 0.0565,
611
- "reward": 0.16666667256504297,
612
- "reward_std": 0.25459469109773636,
613
- "rewards/accuracy_reward": 0.013888888992369175,
614
  "rewards/difficulty_following_reward": 0.0,
615
- "rewards/format_reward": 0.13888889271765947,
616
  "step": 38,
617
- "vanishing_advantage_ratio": 0.1666666716337204
618
  },
619
  {
620
  "clip_ratio": 0.0,
621
- "completion_length": 3037.2222290039062,
622
  "epoch": 0.06683804627249357,
623
- "grad_norm": 0.1207585409283638,
624
- "kl": 0.0012197494506835938,
625
  "learning_rate": 7.884636689049422e-07,
626
- "loss": 0.0499,
627
- "reward": 0.4166666604578495,
628
- "reward_std": 0.4621501453220844,
629
- "rewards/accuracy_reward": 0.12500000465661287,
630
  "rewards/difficulty_following_reward": 0.0,
631
- "rewards/format_reward": 0.16666666930541396,
632
  "step": 39,
633
  "vanishing_advantage_ratio": 0.2500000074505806
634
  },
635
  {
636
  "clip_ratio": 0.0,
637
- "completion_length": 2825.666748046875,
638
  "epoch": 0.06855184233076264,
639
- "grad_norm": 0.1460999995470047,
640
- "kl": 0.0011749267578125,
641
  "learning_rate": 7.75e-07,
642
- "loss": 0.0277,
643
- "reward": 0.3402777835726738,
644
- "reward_std": 0.4121079109609127,
645
- "rewards/accuracy_reward": 0.08333333395421505,
646
  "rewards/difficulty_following_reward": 0.0,
647
- "rewards/format_reward": 0.1736111156642437,
648
  "step": 40,
649
  "vanishing_advantage_ratio": 0.1944444552063942
650
  },
651
  {
652
  "clip_ratio": 0.0,
653
- "completion_length": 2901.1111450195312,
654
  "epoch": 0.0702656383890317,
655
- "grad_norm": 0.1785588562488556,
656
- "kl": 0.001255035400390625,
657
  "learning_rate": 7.612622032536507e-07,
658
- "loss": 0.1139,
659
- "reward": 0.3194444486871362,
660
- "reward_std": 0.35272070951759815,
661
- "rewards/accuracy_reward": 0.06944444496184587,
662
  "rewards/difficulty_following_reward": 0.0,
663
  "rewards/format_reward": 0.18055556062608957,
664
  "step": 41,
665
- "vanishing_advantage_ratio": 0.2500000074505806
666
  },
667
  {
668
  "clip_ratio": 0.0,
669
- "completion_length": 2538.1250610351562,
670
  "epoch": 0.07197943444730077,
671
- "grad_norm": 0.12826786935329437,
672
- "kl": 0.0012569427490234375,
673
  "learning_rate": 7.472670160550848e-07,
674
- "loss": 0.0716,
675
- "reward": 0.4236111119389534,
676
- "reward_std": 0.4551766887307167,
677
  "rewards/accuracy_reward": 0.09722222294658422,
678
  "rewards/difficulty_following_reward": 0.0,
679
- "rewards/format_reward": 0.2291666716337204,
680
  "step": 42,
681
- "vanishing_advantage_ratio": 0.3611111268401146
682
  },
683
  {
684
  "clip_ratio": 0.0,
685
- "completion_length": 2633.791748046875,
686
  "epoch": 0.07369323050556983,
687
- "grad_norm": 0.16073498129844666,
688
- "kl": 0.001422882080078125,
689
  "learning_rate": 7.330314893841101e-07,
690
- "loss": 0.0611,
691
- "reward": 0.47222223225980997,
692
- "reward_std": 0.44562001153826714,
693
- "rewards/accuracy_reward": 0.13888889085501432,
694
  "rewards/difficulty_following_reward": 0.0,
695
- "rewards/format_reward": 0.19444444496184587,
696
  "step": 43,
697
- "vanishing_advantage_ratio": 0.2222222313284874
698
  },
699
  {
700
  "clip_ratio": 0.0,
701
- "completion_length": 2699.4166870117188,
702
  "epoch": 0.07540702656383891,
703
- "grad_norm": 0.12924349308013916,
704
- "kl": 0.0012722015380859375,
705
  "learning_rate": 7.185729670371604e-07,
706
- "loss": 0.0353,
707
- "reward": 0.5208333544433117,
708
- "reward_std": 0.49474263936281204,
709
- "rewards/accuracy_reward": 0.15277778077870607,
710
  "rewards/difficulty_following_reward": 0.0,
711
- "rewards/format_reward": 0.2152777835726738,
712
  "step": 44,
713
- "vanishing_advantage_ratio": 0.2500000074505806
714
  },
715
  {
716
  "clip_ratio": 0.0,
717
- "completion_length": 2845.7083740234375,
718
  "epoch": 0.07712082262210797,
719
- "grad_norm": 0.11390957236289978,
720
- "kl": 0.00176239013671875,
721
  "learning_rate": 7.039090644965509e-07,
722
- "loss": 0.0475,
723
- "reward": 0.45138888992369175,
724
- "reward_std": 0.46796466782689095,
725
- "rewards/accuracy_reward": 0.12500000093132257,
726
  "rewards/difficulty_following_reward": 0.0,
727
- "rewards/format_reward": 0.20138889364898205,
728
  "step": 45,
729
- "vanishing_advantage_ratio": 0.2500000074505806
730
  },
731
  {
732
  "clip_ratio": 0.0,
733
- "completion_length": 2379.652801513672,
734
  "epoch": 0.07883461868037704,
735
- "grad_norm": 0.19179578125476837,
736
- "kl": 0.001857757568359375,
737
  "learning_rate": 6.890576474687263e-07,
738
- "loss": 0.0782,
739
- "reward": 0.2916666753590107,
740
- "reward_std": 0.24928437173366547,
741
- "rewards/accuracy_reward": 0.02777777798473835,
742
  "rewards/difficulty_following_reward": 0.0,
743
- "rewards/format_reward": 0.236111119389534,
744
  "step": 46,
745
- "vanishing_advantage_ratio": 0.4166666716337204
746
  },
747
  {
748
  "clip_ratio": 0.0,
749
- "completion_length": 2883.013916015625,
750
  "epoch": 0.0805484147386461,
751
- "grad_norm": 0.1341867595911026,
752
- "kl": 0.0015201568603515625,
753
  "learning_rate": 6.740368101176495e-07,
754
- "loss": 0.0509,
755
- "reward": 0.3124999962747097,
756
- "reward_std": 0.3921421244740486,
757
- "rewards/accuracy_reward": 0.06944444496184587,
758
  "rewards/difficulty_following_reward": 0.0,
759
- "rewards/format_reward": 0.17361111287027597,
760
  "step": 47,
761
- "vanishing_advantage_ratio": 0.2500000074505806
762
  },
763
  {
764
  "clip_ratio": 0.0,
765
- "completion_length": 2353.513916015625,
766
  "epoch": 0.08226221079691516,
767
- "grad_norm": 0.17986062169075012,
768
- "kl": 0.001934051513671875,
769
  "learning_rate": 6.588648530198504e-07,
770
- "loss": 0.1147,
771
- "reward": 0.4305555559694767,
772
- "reward_std": 0.45186642557382584,
773
- "rewards/accuracy_reward": 0.0833333358168602,
774
  "rewards/difficulty_following_reward": 0.0,
775
- "rewards/format_reward": 0.2638888917863369,
776
  "step": 48,
777
- "vanishing_advantage_ratio": 0.0833333358168602
778
  },
779
  {
780
  "clip_ratio": 0.0,
781
- "completion_length": 2381.236083984375,
782
  "epoch": 0.08397600685518423,
783
- "grad_norm": 0.21353779733181,
784
- "kl": 0.0018062591552734375,
785
  "learning_rate": 6.435602608679916e-07,
786
- "loss": 0.1211,
787
- "reward": 0.5138888955116272,
788
- "reward_std": 0.4158581532537937,
789
- "rewards/accuracy_reward": 0.12500000279396772,
790
  "rewards/difficulty_following_reward": 0.0,
791
- "rewards/format_reward": 0.2638888917863369,
792
  "step": 49,
793
- "vanishing_advantage_ratio": 0.2500000074505806
794
  },
795
  {
796
  "clip_ratio": 0.0,
797
- "completion_length": 2708.1945190429688,
798
  "epoch": 0.0856898029134533,
799
- "grad_norm": 0.15710178017616272,
800
- "kl": 0.002422332763671875,
801
  "learning_rate": 6.281416799501187e-07,
802
- "loss": 0.0607,
803
- "reward": 0.3541666716337204,
804
- "reward_std": 0.32185615226626396,
805
- "rewards/accuracy_reward": 0.041666666977107525,
806
  "rewards/difficulty_following_reward": 0.0,
807
- "rewards/format_reward": 0.2708333358168602,
808
  "step": 50,
809
- "vanishing_advantage_ratio": 0.3333333432674408
810
  },
811
  {
812
  "clip_ratio": 0.0,
813
- "completion_length": 2687.1527709960938,
814
  "epoch": 0.08740359897172237,
815
- "grad_norm": 0.12324044853448868,
816
- "kl": 0.0019092559814453125,
817
  "learning_rate": 6.126278954320294e-07,
818
- "loss": 0.0835,
819
- "reward": 0.2986111082136631,
820
- "reward_std": 0.3335576541721821,
821
- "rewards/accuracy_reward": 0.06944444496184587,
822
  "rewards/difficulty_following_reward": 0.0,
823
- "rewards/format_reward": 0.15972222480922937,
824
  "step": 51,
825
- "vanishing_advantage_ratio": 0.3611111342906952
826
  },
827
  {
828
  "clip_ratio": 0.0,
829
- "completion_length": 3042.3333740234375,
830
  "epoch": 0.08911739502999143,
831
- "grad_norm": 0.13603372871875763,
832
- "kl": 0.0019130706787109375,
833
  "learning_rate": 5.97037808470444e-07,
834
- "loss": 0.0355,
835
- "reward": 0.5347222350537777,
836
- "reward_std": 0.5967377945780754,
837
- "rewards/accuracy_reward": 0.19444444868713617,
838
  "rewards/difficulty_following_reward": 0.0,
839
- "rewards/format_reward": 0.14583333767950535,
840
  "step": 52,
841
- "vanishing_advantage_ratio": 0.1666666716337204
842
  },
843
  {
844
  "clip_ratio": 0.0,
845
- "completion_length": 2828.40283203125,
846
  "epoch": 0.0908311910882605,
847
- "grad_norm": 0.15737178921699524,
848
- "kl": 0.0020122528076171875,
849
  "learning_rate": 5.813904131848564e-07,
850
- "loss": 0.0849,
851
- "reward": 0.3680555634200573,
852
- "reward_std": 0.5401013866066933,
853
- "rewards/accuracy_reward": 0.12500000279396772,
854
  "rewards/difficulty_following_reward": 0.0,
855
- "rewards/format_reward": 0.11805555690079927,
856
  "step": 53,
857
- "vanishing_advantage_ratio": 0.1944444552063942
858
  },
859
  {
860
  "clip_ratio": 0.0,
861
- "completion_length": 3081.236083984375,
862
  "epoch": 0.09254498714652956,
863
- "grad_norm": 0.14406447112560272,
864
- "kl": 0.0018634796142578125,
865
  "learning_rate": 5.657047735161255e-07,
866
- "loss": 0.0754,
867
- "reward": 0.2708333325572312,
868
- "reward_std": 0.36696687154471874,
869
- "rewards/accuracy_reward": 0.06944444589316845,
870
  "rewards/difficulty_following_reward": 0.0,
871
- "rewards/format_reward": 0.13194444635882974,
872
  "step": 54,
873
- "vanishing_advantage_ratio": 0.3333333432674408
874
  },
875
  {
876
  "clip_ratio": 0.0,
877
- "completion_length": 3035.236083984375,
878
  "epoch": 0.09425878320479864,
879
- "grad_norm": 0.15260501205921173,
880
- "kl": 0.002197265625,
881
  "learning_rate": 5.5e-07,
882
- "loss": 0.1049,
883
- "reward": 0.3888888992369175,
884
- "reward_std": 0.5262130200862885,
885
- "rewards/accuracy_reward": 0.1111111119389534,
886
  "rewards/difficulty_following_reward": 0.0,
887
- "rewards/format_reward": 0.16666666977107525,
888
  "step": 55,
889
- "vanishing_advantage_ratio": 0.2500000074505806
890
  },
891
  {
892
  "clip_ratio": 0.0,
893
- "completion_length": 2801.5416259765625,
894
  "epoch": 0.0959725792630677,
895
- "grad_norm": 0.17021320760250092,
896
- "kl": 0.0022754669189453125,
897
  "learning_rate": 5.342952264838747e-07,
898
- "loss": 0.0853,
899
- "reward": 0.4236111119389534,
900
- "reward_std": 0.45923004299402237,
901
- "rewards/accuracy_reward": 0.1250000037252903,
902
  "rewards/difficulty_following_reward": 0.0,
903
- "rewards/format_reward": 0.1736111119389534,
904
  "step": 56,
905
- "vanishing_advantage_ratio": 0.25
906
  },
907
  {
908
  "clip_ratio": 0.0,
909
- "completion_length": 2974.6805419921875,
910
  "epoch": 0.09768637532133675,
911
- "grad_norm": 0.11549308151006699,
912
- "kl": 0.0024566650390625,
913
  "learning_rate": 5.186095868151436e-07,
914
- "loss": 0.0758,
915
- "reward": 0.3680555671453476,
916
- "reward_std": 0.4127896688878536,
917
- "rewards/accuracy_reward": 0.1111111119389534,
918
  "rewards/difficulty_following_reward": 0.0,
919
- "rewards/format_reward": 0.14583333441987634,
920
  "step": 57,
921
- "vanishing_advantage_ratio": 0.2500000074505806
922
  },
923
  {
924
  "clip_ratio": 0.0,
925
- "completion_length": 3138.1806030273438,
926
  "epoch": 0.09940017137960583,
927
- "grad_norm": 0.1200040727853775,
928
- "kl": 0.002838134765625,
929
  "learning_rate": 5.02962191529556e-07,
930
- "loss": 0.0454,
931
- "reward": 0.2083333325572312,
932
- "reward_std": 0.2104328442364931,
933
- "rewards/accuracy_reward": 0.02777777798473835,
934
  "rewards/difficulty_following_reward": 0.0,
935
- "rewards/format_reward": 0.1527777803130448,
936
  "step": 58,
937
- "vanishing_advantage_ratio": 0.416666679084301
938
  },
939
  {
940
  "clip_ratio": 0.0,
941
- "completion_length": 2535.5416870117188,
942
  "epoch": 0.10111396743787489,
943
- "grad_norm": 0.1694697141647339,
944
- "kl": 0.003231048583984375,
945
  "learning_rate": 4.873721045679706e-07,
946
- "loss": 0.1082,
947
- "reward": 0.618055559694767,
948
- "reward_std": 0.6503396853804588,
949
- "rewards/accuracy_reward": 0.1944444477558136,
950
  "rewards/difficulty_following_reward": 0.0,
951
- "rewards/format_reward": 0.2291666716337204,
952
  "step": 59,
953
  "vanishing_advantage_ratio": 0.1666666716337204
954
  },
955
  {
956
  "clip_ratio": 0.0,
957
- "completion_length": 2990.6805419921875,
958
  "epoch": 0.10282776349614396,
959
- "grad_norm": 0.11708780378103256,
960
- "kl": 0.0026702880859375,
961
  "learning_rate": 4.7185832004988133e-07,
962
- "loss": 0.0607,
963
- "reward": 0.5208333432674408,
964
- "reward_std": 0.6233179718255997,
965
- "rewards/accuracy_reward": 0.1944444477558136,
966
  "rewards/difficulty_following_reward": 0.0,
967
- "rewards/format_reward": 0.1319444477558136,
968
  "step": 60,
969
- "vanishing_advantage_ratio": 0.25
970
  },
971
  {
972
  "clip_ratio": 0.0,
973
- "completion_length": 2945.2222900390625,
974
  "epoch": 0.10454155955441302,
975
- "grad_norm": 0.147248312830925,
976
- "kl": 0.002758026123046875,
977
  "learning_rate": 4.5643973913200837e-07,
978
- "loss": 0.0707,
979
- "reward": 0.2847222280688584,
980
- "reward_std": 0.3064969703555107,
981
- "rewards/accuracy_reward": 0.0694444477558136,
982
  "rewards/difficulty_following_reward": 0.0,
983
- "rewards/format_reward": 0.1458333362825215,
984
  "step": 61,
985
  "vanishing_advantage_ratio": 0.3333333432674408
986
  },
987
  {
988
  "clip_ratio": 0.0,
989
- "completion_length": 2515.6666870117188,
990
  "epoch": 0.1062553556126821,
991
- "grad_norm": 0.15092438459396362,
992
- "kl": 0.003406524658203125,
993
  "learning_rate": 4.4113514698014953e-07,
994
- "loss": 0.0271,
995
- "reward": 0.5000000074505806,
996
- "reward_std": 0.5652750581502914,
997
- "rewards/accuracy_reward": 0.13888888992369175,
998
  "rewards/difficulty_following_reward": 0.0,
999
- "rewards/format_reward": 0.2222222276031971,
1000
  "step": 62,
1001
- "vanishing_advantage_ratio": 0.1666666716337204
1002
  },
1003
  {
1004
  "clip_ratio": 0.0,
1005
- "completion_length": 3210.2916870117188,
1006
  "epoch": 0.10796915167095116,
1007
- "grad_norm": 0.14689189195632935,
1008
- "kl": 0.002899169921875,
1009
  "learning_rate": 4.2596318988235037e-07,
1010
- "loss": 0.0911,
1011
- "reward": 0.3263888992369175,
1012
- "reward_std": 0.4785457216203213,
1013
- "rewards/accuracy_reward": 0.09722222574055195,
1014
  "rewards/difficulty_following_reward": 0.0,
1015
- "rewards/format_reward": 0.13194444822147489,
1016
  "step": 63,
1017
- "vanishing_advantage_ratio": 0.18055556155741215
1018
  },
1019
  {
1020
  "clip_ratio": 0.0,
1021
- "completion_length": 2830.5833129882812,
1022
  "epoch": 0.10968294772922023,
1023
- "grad_norm": 0.13354940712451935,
1024
- "kl": 0.0031719207763671875,
1025
  "learning_rate": 4.1094235253127374e-07,
1026
- "loss": 0.0593,
1027
- "reward": 0.30555555829778314,
1028
- "reward_std": 0.26662319526076317,
1029
- "rewards/accuracy_reward": 0.09722222480922937,
1030
  "rewards/difficulty_following_reward": 0.0,
1031
- "rewards/format_reward": 0.11111111240461469,
1032
  "step": 64,
1033
- "vanishing_advantage_ratio": 0.5000000149011612
1034
  },
1035
  {
1036
  "clip_ratio": 0.0,
1037
- "completion_length": 3102.7361450195312,
1038
  "epoch": 0.11139674378748929,
1039
- "grad_norm": 0.1135522872209549,
1040
- "kl": 0.003261566162109375,
1041
  "learning_rate": 3.9609093550344907e-07,
1042
- "loss": 0.0753,
1043
- "reward": 0.18055555829778314,
1044
- "reward_std": 0.1522968877106905,
1045
- "rewards/accuracy_reward": 0.013888888992369175,
1046
  "rewards/difficulty_following_reward": 0.0,
1047
- "rewards/format_reward": 0.15277778217568994,
1048
  "step": 65,
1049
- "vanishing_advantage_ratio": 0.5000000149011612
1050
  },
1051
  {
1052
  "clip_ratio": 0.0,
1053
- "completion_length": 2707.6666870117188,
1054
  "epoch": 0.11311053984575835,
1055
- "grad_norm": 0.13425862789154053,
1056
- "kl": 0.003246307373046875,
1057
  "learning_rate": 3.8142703296283953e-07,
1058
- "loss": 0.0475,
1059
- "reward": 0.5208333395421505,
1060
- "reward_std": 0.4286172688007355,
1061
- "rewards/accuracy_reward": 0.180555559694767,
1062
  "rewards/difficulty_following_reward": 0.0,
1063
- "rewards/format_reward": 0.15972222574055195,
1064
  "step": 66,
1065
- "vanishing_advantage_ratio": 0.5000000074505806
1066
  },
1067
  {
1068
  "clip_ratio": 0.0,
1069
- "completion_length": 2820.361083984375,
1070
  "epoch": 0.11482433590402742,
1071
- "grad_norm": 0.18349036574363708,
1072
- "kl": 0.00356292724609375,
1073
  "learning_rate": 3.6696851061588994e-07,
1074
- "loss": 0.077,
1075
- "reward": 0.4583333507180214,
1076
- "reward_std": 0.6253736391663551,
1077
- "rewards/accuracy_reward": 0.12500000279396772,
1078
  "rewards/difficulty_following_reward": 0.0,
1079
- "rewards/format_reward": 0.2083333358168602,
1080
  "step": 67,
1081
- "vanishing_advantage_ratio": 0.0833333358168602
1082
  },
1083
  {
1084
  "clip_ratio": 0.0,
1085
- "completion_length": 2947.138916015625,
1086
  "epoch": 0.11653813196229648,
1087
- "grad_norm": 0.14458446204662323,
1088
- "kl": 0.0033721923828125,
1089
  "learning_rate": 3.5273298394491515e-07,
1090
- "loss": 0.0576,
1091
- "reward": 0.37500000558793545,
1092
- "reward_std": 0.24039746448397636,
1093
- "rewards/accuracy_reward": 0.09722222480922937,
1094
  "rewards/difficulty_following_reward": 0.0,
1095
- "rewards/format_reward": 0.18055555783212185,
1096
  "step": 68,
1097
- "vanishing_advantage_ratio": 0.2500000074505806
1098
  },
1099
  {
1100
  "clip_ratio": 0.0,
1101
- "completion_length": 2803.513916015625,
1102
  "epoch": 0.11825192802056556,
1103
- "grad_norm": 0.18413224816322327,
1104
- "kl": 0.003757476806640625,
1105
  "learning_rate": 3.387377967463493e-07,
1106
- "loss": 0.1572,
1107
- "reward": 0.5833333432674408,
1108
- "reward_std": 0.592583030462265,
1109
- "rewards/accuracy_reward": 0.1944444514811039,
1110
  "rewards/difficulty_following_reward": 0.0,
1111
- "rewards/format_reward": 0.1944444477558136,
1112
  "step": 69,
1113
- "vanishing_advantage_ratio": 0.09722222574055195
1114
  },
1115
  {
1116
  "clip_ratio": 0.0,
1117
- "completion_length": 2804.9305419921875,
1118
  "epoch": 0.11996572407883462,
1119
- "grad_norm": 0.1695493459701538,
1120
- "kl": 0.003505706787109375,
1121
  "learning_rate": 3.250000000000001e-07,
1122
- "loss": 0.002,
1123
- "reward": 0.3680555522441864,
1124
- "reward_std": 0.40423472225666046,
1125
- "rewards/accuracy_reward": 0.11111111287027597,
1126
  "rewards/difficulty_following_reward": 0.0,
1127
- "rewards/format_reward": 0.1458333395421505,
1128
  "step": 70,
1129
- "vanishing_advantage_ratio": 0.4166666716337204
1130
  },
1131
  {
1132
  "clip_ratio": 0.0,
1133
- "completion_length": 2884.7916870117188,
1134
  "epoch": 0.12167952013710369,
1135
- "grad_norm": 0.17039252817630768,
1136
- "kl": 0.00363922119140625,
1137
  "learning_rate": 3.115363310950578e-07,
1138
- "loss": 0.0761,
1139
- "reward": 0.4305555559694767,
1140
- "reward_std": 0.5452307164669037,
1141
- "rewards/accuracy_reward": 0.15277778077870607,
1142
  "rewards/difficulty_following_reward": 0.0,
1143
- "rewards/format_reward": 0.12500000279396772,
1144
  "step": 71,
1145
- "vanishing_advantage_ratio": 0.361111119389534
1146
  },
1147
  {
1148
  "clip_ratio": 0.0,
1149
- "completion_length": 2375.625,
1150
  "epoch": 0.12339331619537275,
1151
- "grad_norm": 0.16211162507534027,
1152
- "kl": 0.0038604736328125,
1153
  "learning_rate": 2.9836319343816397e-07,
1154
- "loss": 0.0321,
1155
- "reward": 0.4513888880610466,
1156
- "reward_std": 0.2963226269930601,
1157
- "rewards/accuracy_reward": 0.11111111473292112,
1158
  "rewards/difficulty_following_reward": 0.0,
1159
- "rewards/format_reward": 0.2291666679084301,
1160
  "step": 72,
1161
- "vanishing_advantage_ratio": 0.416666679084301
1162
  },
1163
  {
1164
  "clip_ratio": 0.0,
1165
- "completion_length": 2680.6805725097656,
1166
  "epoch": 0.12510711225364182,
1167
- "grad_norm": 0.18615099787712097,
1168
- "kl": 0.0038604736328125,
1169
  "learning_rate": 2.854966364683872e-07,
1170
- "loss": 0.0706,
1171
- "reward": 0.5833333432674408,
1172
- "reward_std": 0.47368447482585907,
1173
- "rewards/accuracy_reward": 0.18055555876344442,
1174
  "rewards/difficulty_following_reward": 0.0,
1175
- "rewards/format_reward": 0.22222222480922937,
1176
  "step": 73,
1177
  "vanishing_advantage_ratio": 0.2500000074505806
1178
  },
1179
  {
1180
  "clip_ratio": 0.0,
1181
- "completion_length": 2270.250030517578,
1182
  "epoch": 0.1268209083119109,
1183
- "grad_norm": 0.17723990976810455,
1184
- "kl": 0.0043487548828125,
1185
  "learning_rate": 2.729523361034538e-07,
1186
- "loss": 0.0446,
1187
- "reward": 0.4583333283662796,
1188
- "reward_std": 0.5667729154229164,
1189
- "rewards/accuracy_reward": 0.0972222238779068,
1190
  "rewards/difficulty_following_reward": 0.0,
1191
- "rewards/format_reward": 0.2638888880610466,
1192
  "step": 74,
1193
- "vanishing_advantage_ratio": 0.09722222574055195
1194
  },
1195
  {
1196
  "clip_ratio": 0.0,
1197
- "completion_length": 3122.8750610351562,
1198
  "epoch": 0.12853470437017994,
1199
- "grad_norm": 0.10203089565038681,
1200
- "kl": 0.003383636474609375,
1201
  "learning_rate": 2.6074557564105724e-07,
1202
- "loss": 0.0706,
1203
- "reward": 0.2708333395421505,
1204
- "reward_std": 0.3444611057639122,
1205
- "rewards/accuracy_reward": 0.06944444496184587,
1206
  "rewards/difficulty_following_reward": 0.0,
1207
- "rewards/format_reward": 0.13194444915279746,
1208
  "step": 75,
1209
- "vanishing_advantage_ratio": 0.416666679084301
1210
  },
1211
  {
1212
  "clip_ratio": 0.0,
1213
- "completion_length": 2106.3472900390625,
1214
  "epoch": 0.13024850042844902,
1215
- "grad_norm": 0.18112541735172272,
1216
- "kl": 0.00616455078125,
1217
  "learning_rate": 2.488912271385139e-07,
1218
- "loss": 0.0177,
1219
- "reward": 0.5833333358168602,
1220
- "reward_std": 0.47035834193229675,
1221
- "rewards/accuracy_reward": 0.13888889364898205,
1222
  "rewards/difficulty_following_reward": 0.0,
1223
- "rewards/format_reward": 0.3055555559694767,
1224
  "step": 76,
1225
- "vanishing_advantage_ratio": 0.3055555671453476
1226
  },
1227
  {
1228
  "clip_ratio": 0.0,
1229
- "completion_length": 2881.013916015625,
1230
  "epoch": 0.1319622964867181,
1231
- "grad_norm": 0.12448816746473312,
1232
- "kl": 0.00357818603515625,
1233
  "learning_rate": 2.374037332934512e-07,
1234
- "loss": 0.0967,
1235
- "reward": 0.5138888955116272,
1236
- "reward_std": 0.7071040645241737,
1237
- "rewards/accuracy_reward": 0.18055555783212185,
1238
  "rewards/difficulty_following_reward": 0.0,
1239
- "rewards/format_reward": 0.15277778077870607,
1240
  "step": 77,
1241
- "vanishing_advantage_ratio": 0.3055555708706379
1242
  },
1243
  {
1244
  "clip_ratio": 0.0,
1245
- "completion_length": 3254.3611450195312,
1246
  "epoch": 0.13367609254498714,
1247
- "grad_norm": 0.10527807474136353,
1248
- "kl": 0.003459930419921875,
1249
  "learning_rate": 2.2629708984760706e-07,
1250
- "loss": 0.0512,
1251
- "reward": 0.2500000041909516,
1252
- "reward_std": 0.36582790687680244,
1253
- "rewards/accuracy_reward": 0.08333333674818277,
1254
  "rewards/difficulty_following_reward": 0.0,
1255
- "rewards/format_reward": 0.08333333488553762,
1256
  "step": 78,
1257
- "vanishing_advantage_ratio": 0.416666679084301
1258
  },
1259
  {
1260
  "clip_ratio": 0.0,
1261
- "completion_length": 2438.5416259765625,
1262
  "epoch": 0.1353898886032562,
1263
- "grad_norm": 0.15114635229110718,
1264
- "kl": 0.0055389404296875,
1265
  "learning_rate": 2.1558482853517253e-07,
1266
- "loss": 0.0167,
1267
- "reward": 0.4513888955116272,
1268
- "reward_std": 0.27200620248913765,
1269
- "rewards/accuracy_reward": 0.1111111156642437,
1270
  "rewards/difficulty_following_reward": 0.0,
1271
- "rewards/format_reward": 0.2291666716337204,
1272
  "step": 79,
1273
- "vanishing_advantage_ratio": 0.3333333432674408
1274
  },
1275
  {
1276
  "clip_ratio": 0.0,
1277
- "completion_length": 3177.1666870117188,
1278
  "epoch": 0.13710368466152528,
1279
- "grad_norm": 0.13894222676753998,
1280
- "kl": 0.004375457763671875,
1281
  "learning_rate": 2.0528000059645995e-07,
1282
- "loss": 0.0528,
1283
- "reward": 0.3125000037252903,
1284
- "reward_std": 0.4319829046726227,
1285
- "rewards/accuracy_reward": 0.08333333488553762,
1286
  "rewards/difficulty_following_reward": 0.0,
1287
- "rewards/format_reward": 0.14583333767950535,
1288
  "step": 80,
1289
- "vanishing_advantage_ratio": 0.3333333432674408
1290
  },
1291
  {
1292
  "clip_ratio": 0.0,
1293
- "completion_length": 2965.4027709960938,
1294
  "epoch": 0.13881748071979436,
1295
- "grad_norm": 0.1256721317768097,
1296
- "kl": 0.004421234130859375,
1297
  "learning_rate": 1.9539516087697517e-07,
1298
- "loss": 0.0653,
1299
- "reward": 0.24305555736646056,
1300
- "reward_std": 0.3140085842460394,
1301
- "rewards/accuracy_reward": 0.06944444496184587,
1302
  "rewards/difficulty_following_reward": 0.0,
1303
- "rewards/format_reward": 0.10416666930541396,
1304
  "step": 81,
1305
- "vanishing_advantage_ratio": 0.416666679084301
1306
  },
1307
  {
1308
  "clip_ratio": 0.0,
1309
- "completion_length": 2896.1527709960938,
1310
  "epoch": 0.1405312767780634,
1311
- "grad_norm": 0.16527993977069855,
1312
- "kl": 0.00421142578125,
1313
  "learning_rate": 1.8594235253127372e-07,
1314
- "loss": 0.1779,
1315
- "reward": 0.381944440305233,
1316
- "reward_std": 0.508243752643466,
1317
- "rewards/accuracy_reward": 0.11111111473292112,
1318
  "rewards/difficulty_following_reward": 0.0,
1319
- "rewards/format_reward": 0.15972222574055195,
1320
  "step": 82,
1321
- "vanishing_advantage_ratio": 0.1666666716337204
1322
  },
1323
  {
1324
  "clip_ratio": 0.0,
1325
- "completion_length": 2593.5416870117188,
1326
  "epoch": 0.14224507283633248,
1327
- "grad_norm": 0.19563595950603485,
1328
- "kl": 0.00440216064453125,
1329
  "learning_rate": 1.7693309235023127e-07,
1330
- "loss": 0.0942,
1331
- "reward": 0.4583333358168602,
1332
- "reward_std": 0.5371576994657516,
1333
- "rewards/accuracy_reward": 0.12500000186264515,
1334
  "rewards/difficulty_following_reward": 0.0,
1335
- "rewards/format_reward": 0.2083333358168602,
1336
  "step": 83,
1337
- "vanishing_advantage_ratio": 0.1666666716337204
1338
  },
1339
  {
1340
  "clip_ratio": 0.0,
1341
- "completion_length": 2866.763885498047,
1342
  "epoch": 0.14395886889460155,
1343
- "grad_norm": 0.12824302911758423,
1344
- "kl": 0.00487518310546875,
1345
  "learning_rate": 1.6837835672960831e-07,
1346
- "loss": 0.0191,
1347
- "reward": 0.4930555745959282,
1348
- "reward_std": 0.49528179317712784,
1349
- "rewards/accuracy_reward": 0.15277778077870607,
1350
  "rewards/difficulty_following_reward": 0.0,
1351
- "rewards/format_reward": 0.18750000232830644,
1352
  "step": 84,
1353
- "vanishing_advantage_ratio": 0.458333358168602
1354
  },
1355
  {
1356
  "clip_ratio": 0.0,
1357
- "completion_length": 2863.5972900390625,
1358
  "epoch": 0.1456726649528706,
1359
- "grad_norm": 0.1436908096075058,
1360
- "kl": 0.0040435791015625,
1361
  "learning_rate": 1.6028856829700258e-07,
1362
- "loss": 0.0606,
1363
- "reward": 0.47222222574055195,
1364
- "reward_std": 0.517341261729598,
1365
- "rewards/accuracy_reward": 0.1666666716337204,
1366
  "rewards/difficulty_following_reward": 0.0,
1367
- "rewards/format_reward": 0.1388888917863369,
1368
  "step": 85,
1369
  "vanishing_advantage_ratio": 0.2500000074505806
1370
  },
1371
  {
1372
  "clip_ratio": 0.0,
1373
- "completion_length": 2785.736083984375,
1374
  "epoch": 0.14738646101113967,
1375
- "grad_norm": 0.17216697335243225,
1376
- "kl": 0.004482269287109375,
1377
  "learning_rate": 1.5267358321348285e-07,
1378
- "loss": 0.0824,
1379
- "reward": 0.41666666604578495,
1380
- "reward_std": 0.485262468457222,
1381
- "rewards/accuracy_reward": 0.12500000186264515,
1382
  "rewards/difficulty_following_reward": 0.0,
1383
- "rewards/format_reward": 0.1666666679084301,
1384
  "step": 86,
1385
- "vanishing_advantage_ratio": 0.1666666716337204
1386
  },
1387
  {
1388
  "clip_ratio": 0.0,
1389
- "completion_length": 3307.3194580078125,
1390
  "epoch": 0.14910025706940874,
1391
- "grad_norm": 0.10646296292543411,
1392
- "kl": 0.003597259521484375,
1393
  "learning_rate": 1.4554267916537495e-07,
1394
- "loss": 0.0495,
1395
- "reward": 0.43750001676380634,
1396
- "reward_std": 0.3348946310579777,
1397
- "rewards/accuracy_reward": 0.16666667349636555,
1398
  "rewards/difficulty_following_reward": 0.0,
1399
- "rewards/format_reward": 0.10416666883975267,
1400
  "step": 87,
1401
  "vanishing_advantage_ratio": 0.3333333432674408
1402
  },
1403
  {
1404
  "clip_ratio": 0.0,
1405
- "completion_length": 3317.1806030273438,
1406
  "epoch": 0.15081405312767782,
1407
- "grad_norm": 0.131212517619133,
1408
- "kl": 0.004451751708984375,
1409
  "learning_rate": 1.3890454406082956e-07,
1410
- "loss": 0.0764,
1411
- "reward": 0.3680555671453476,
1412
- "reward_std": 0.4501563832163811,
1413
- "rewards/accuracy_reward": 0.15277778077870607,
1414
  "rewards/difficulty_following_reward": 0.0,
1415
- "rewards/format_reward": 0.06250000186264515,
1416
  "step": 88,
1417
- "vanishing_advantage_ratio": 0.4444444552063942
1418
  },
1419
  {
1420
  "clip_ratio": 0.0,
1421
- "completion_length": 2650.8056030273438,
1422
  "epoch": 0.15252784918594686,
1423
- "grad_norm": 0.15206274390220642,
1424
- "kl": 0.00530242919921875,
1425
  "learning_rate": 1.3276726544494571e-07,
1426
- "loss": 0.0723,
1427
- "reward": 0.2847222238779068,
1428
- "reward_std": 0.39606497436761856,
1429
- "rewards/accuracy_reward": 0.0555555559694767,
1430
  "rewards/difficulty_following_reward": 0.0,
1431
- "rewards/format_reward": 0.1736111124046147,
1432
  "step": 89,
1433
- "vanishing_advantage_ratio": 0.3333333432674408
1434
  },
1435
  {
1436
  "clip_ratio": 0.0,
1437
- "completion_length": 2848.7500610351562,
1438
  "epoch": 0.15424164524421594,
1439
- "grad_norm": 0.16075682640075684,
1440
- "kl": 0.003780364990234375,
1441
  "learning_rate": 1.2713832064634125e-07,
1442
- "loss": 0.0582,
1443
- "reward": 0.31250000558793545,
1444
- "reward_std": 0.2296726070344448,
1445
  "rewards/accuracy_reward": 0.055555556900799274,
1446
  "rewards/difficulty_following_reward": 0.0,
1447
- "rewards/format_reward": 0.20138888992369175,
1448
  "step": 90,
1449
- "vanishing_advantage_ratio": 0.5000000074505806
1450
  },
1451
  {
1452
  "clip_ratio": 0.0,
1453
- "completion_length": 2962.125,
1454
  "epoch": 0.155955441302485,
1455
- "grad_norm": 0.14642441272735596,
1456
- "kl": 0.0050201416015625,
1457
  "learning_rate": 1.220245676671809e-07,
1458
- "loss": 0.0831,
1459
- "reward": 0.3680555489845574,
1460
- "reward_std": 0.40546936355531216,
1461
- "rewards/accuracy_reward": 0.1111111119389534,
1462
  "rewards/difficulty_following_reward": 0.0,
1463
- "rewards/format_reward": 0.1458333362825215,
1464
  "step": 91,
1465
- "vanishing_advantage_ratio": 0.3333333432674408
1466
  },
1467
  {
1468
  "clip_ratio": 0.0,
1469
- "completion_length": 2616.8472290039062,
1470
  "epoch": 0.15766923736075408,
1471
- "grad_norm": 0.1574823409318924,
1472
- "kl": 0.004451751708984375,
1473
  "learning_rate": 1.1743223682775649e-07,
1474
- "loss": 0.0517,
1475
- "reward": 0.28472222574055195,
1476
- "reward_std": 0.22857079841196537,
1477
- "rewards/accuracy_reward": 0.0416666679084301,
1478
  "rewards/difficulty_following_reward": 0.0,
1479
- "rewards/format_reward": 0.20138889364898205,
1480
  "step": 92,
1481
- "vanishing_advantage_ratio": 0.416666679084301
1482
  },
1483
  {
1484
  "clip_ratio": 0.0,
1485
- "completion_length": 2823.7639770507812,
1486
  "epoch": 0.15938303341902313,
1487
- "grad_norm": 0.164349764585495,
1488
- "kl": 0.0048828125,
1489
  "learning_rate": 1.1336692317580158e-07,
1490
- "loss": 0.0555,
1491
- "reward": 0.3958333325572312,
1492
- "reward_std": 0.33638707362115383,
1493
- "rewards/accuracy_reward": 0.11111111473292112,
1494
  "rewards/difficulty_following_reward": 0.0,
1495
- "rewards/format_reward": 0.17361111612990499,
1496
  "step": 93,
1497
- "vanishing_advantage_ratio": 0.2500000074505806
1498
  },
1499
  {
1500
  "clip_ratio": 0.0,
1501
- "completion_length": 2778.7916870117188,
1502
  "epoch": 0.1610968294772922,
1503
- "grad_norm": 0.14506115019321442,
1504
- "kl": 0.0046539306640625,
1505
  "learning_rate": 1.0983357966978745e-07,
1506
- "loss": 0.0633,
1507
- "reward": 0.2916666683740914,
1508
- "reward_std": 0.25966140627861023,
1509
- "rewards/accuracy_reward": 0.041666666977107525,
1510
  "rewards/difficulty_following_reward": 0.0,
1511
- "rewards/format_reward": 0.20833334000781178,
1512
  "step": 94,
1513
  "vanishing_advantage_ratio": 0.4166666716337204
1514
  },
1515
  {
1516
  "clip_ratio": 0.0,
1517
- "completion_length": 2717.3750610351562,
1518
  "epoch": 0.16281062553556128,
1519
- "grad_norm": 0.16493435204029083,
1520
- "kl": 0.00475311279296875,
1521
  "learning_rate": 1.068365111445064e-07,
1522
- "loss": 0.1016,
1523
- "reward": 0.5208333358168602,
1524
- "reward_std": 0.588125430047512,
1525
- "rewards/accuracy_reward": 0.15277778264135122,
1526
  "rewards/difficulty_following_reward": 0.0,
1527
- "rewards/format_reward": 0.2152777798473835,
1528
  "step": 95,
1529
  "vanishing_advantage_ratio": 0.0833333358168602
1530
  },
1531
  {
1532
  "clip_ratio": 0.0,
1533
- "completion_length": 2940.2638549804688,
1534
  "epoch": 0.16452442159383032,
1535
- "grad_norm": 0.14790275692939758,
1536
- "kl": 0.0041656494140625,
1537
  "learning_rate": 1.0437936906629334e-07,
1538
- "loss": 0.0765,
1539
- "reward": 0.6666666734963655,
1540
- "reward_std": 0.6499281954020262,
1541
- "rewards/accuracy_reward": 0.2500000037252903,
1542
  "rewards/difficulty_following_reward": 0.0,
1543
- "rewards/format_reward": 0.16666667349636555,
1544
  "step": 96,
1545
- "vanishing_advantage_ratio": 0.2500000074505806
1546
  },
1547
  {
1548
  "clip_ratio": 0.0,
1549
- "completion_length": 3096.1805419921875,
1550
  "epoch": 0.1662382176520994,
1551
- "grad_norm": 0.13928674161434174,
1552
- "kl": 0.00507354736328125,
1553
  "learning_rate": 1.0246514708427701e-07,
1554
- "loss": 0.1055,
1555
- "reward": 0.2638888917863369,
1556
- "reward_std": 0.436099786311388,
1557
- "rewards/accuracy_reward": 0.0555555559694767,
1558
  "rewards/difficulty_following_reward": 0.0,
1559
- "rewards/format_reward": 0.15277778171002865,
1560
  "step": 97,
1561
- "vanishing_advantage_ratio": 0.1666666716337204
1562
  },
1563
  {
1564
  "clip_ratio": 0.0,
1565
- "completion_length": 2866.5000610351562,
1566
  "epoch": 0.16795201371036847,
1567
- "grad_norm": 0.10994574427604675,
1568
- "kl": 0.0052490234375,
1569
  "learning_rate": 1.0109617738307911e-07,
1570
- "loss": 0.0385,
1571
- "reward": 0.28472222574055195,
1572
- "reward_std": 0.30150144174695015,
1573
- "rewards/accuracy_reward": 0.055555556900799274,
1574
  "rewards/difficulty_following_reward": 0.0,
1575
- "rewards/format_reward": 0.1736111156642437,
1576
  "step": 98,
1577
- "vanishing_advantage_ratio": 0.5000000074505806
1578
  },
1579
  {
1580
  "clip_ratio": 0.0,
1581
- "completion_length": 3153.5000610351562,
1582
  "epoch": 0.16966580976863754,
1583
- "grad_norm": 0.13096007704734802,
1584
- "kl": 0.0048980712890625,
1585
  "learning_rate": 1.002741278414069e-07,
1586
- "loss": 0.0808,
1587
- "reward": 0.2777777835726738,
1588
- "reward_std": 0.3955828621983528,
1589
- "rewards/accuracy_reward": 0.08333333395421505,
1590
  "rewards/difficulty_following_reward": 0.0,
1591
- "rewards/format_reward": 0.11111111287027597,
1592
  "step": 99,
1593
- "vanishing_advantage_ratio": 0.3472222238779068
1594
  },
1595
  {
1596
  "clip_ratio": 0.0,
1597
- "completion_length": 2818.7083740234375,
1598
  "epoch": 0.1713796058269066,
1599
- "grad_norm": 0.13920088112354279,
1600
- "kl": 0.00577545166015625,
1601
  "learning_rate": 1e-07,
1602
- "loss": 0.0032,
1603
- "reward": 0.30555555783212185,
1604
- "reward_std": 0.32475365325808525,
1605
- "rewards/accuracy_reward": 0.06944444496184587,
1606
  "rewards/difficulty_following_reward": 0.0,
1607
- "rewards/format_reward": 0.16666667349636555,
1608
  "step": 100,
1609
- "vanishing_advantage_ratio": 0.2500000074505806
1610
  },
1611
  {
1612
  "epoch": 0.1713796058269066,
1613
  "step": 100,
1614
  "total_flos": 0.0,
1615
- "train_loss": 0.06513786550378427,
1616
- "train_runtime": 17727.636,
1617
- "train_samples_per_second": 0.406,
1618
- "train_steps_per_second": 0.006
1619
  }
1620
  ],
1621
  "logging_steps": 1,
 
10
  "log_history": [
11
  {
12
  "clip_ratio": 0.0,
13
+ "completion_length": 2910.513916015625,
14
  "epoch": 0.001713796058269066,
15
+ "grad_norm": 0.07309982925653458,
16
  "kl": 0.0,
17
  "learning_rate": 1e-07,
18
+ "loss": 0.0596,
19
+ "reward": 0.11805555876344442,
20
+ "reward_std": 0.060043493285775185,
21
+ "rewards/accuracy_reward": 0.0,
22
  "rewards/difficulty_following_reward": 0.0,
23
+ "rewards/format_reward": 0.11805555876344442,
24
  "step": 1,
25
+ "vanishing_advantage_ratio": 0.7500000074505806
26
  },
27
  {
28
  "clip_ratio": 0.0,
29
+ "completion_length": 2873.4722900390625,
30
  "epoch": 0.003427592116538132,
31
+ "grad_norm": 0.13450193405151367,
32
  "kl": 0.0,
33
  "learning_rate": 2e-07,
34
+ "loss": 0.0299,
35
+ "reward": 0.17361111612990499,
36
+ "reward_std": 0.13970773108303547,
37
+ "rewards/accuracy_reward": 0.0,
38
  "rewards/difficulty_following_reward": 0.0,
39
+ "rewards/format_reward": 0.17361111612990499,
40
  "step": 2,
41
+ "vanishing_advantage_ratio": 0.5416666865348816
42
  },
43
  {
44
  "clip_ratio": 0.0,
45
+ "completion_length": 2874.8056030273438,
46
  "epoch": 0.005141388174807198,
47
+ "grad_norm": 0.1418725550174713,
48
+ "kl": 4.89354133605957e-05,
49
  "learning_rate": 3e-07,
50
+ "loss": 0.0777,
51
+ "reward": 0.1527777835726738,
52
+ "reward_std": 0.176795095205307,
53
+ "rewards/accuracy_reward": 0.0,
54
  "rewards/difficulty_following_reward": 0.0,
55
+ "rewards/format_reward": 0.1527777835726738,
56
  "step": 3,
57
+ "vanishing_advantage_ratio": 0.3333333432674408
58
  },
59
  {
60
  "clip_ratio": 0.0,
61
+ "completion_length": 2538.0694580078125,
62
  "epoch": 0.006855184233076264,
63
+ "grad_norm": 0.23347292840480804,
64
+ "kl": 6.121397018432617e-05,
65
  "learning_rate": 4e-07,
66
+ "loss": 0.1711,
67
+ "reward": 0.2500000074505806,
68
+ "reward_std": 0.23697294667363167,
69
+ "rewards/accuracy_reward": 0.0,
70
  "rewards/difficulty_following_reward": 0.0,
71
+ "rewards/format_reward": 0.2500000074505806,
72
  "step": 4,
73
+ "vanishing_advantage_ratio": 0.0833333358168602
74
  },
75
  {
76
  "clip_ratio": 0.0,
77
+ "completion_length": 2422.1666259765625,
78
  "epoch": 0.00856898029134533,
79
+ "grad_norm": 0.25554895401000977,
80
+ "kl": 6.949901580810547e-05,
81
  "learning_rate": 5e-07,
82
+ "loss": 0.1879,
83
+ "reward": 0.22916666977107525,
84
+ "reward_std": 0.21095014363527298,
85
+ "rewards/accuracy_reward": 0.0,
86
  "rewards/difficulty_following_reward": 0.0,
87
+ "rewards/format_reward": 0.22916666977107525,
88
  "step": 5,
89
+ "vanishing_advantage_ratio": 0.2500000074505806
90
  },
91
  {
92
  "clip_ratio": 0.0,
93
+ "completion_length": 2720.361083984375,
94
  "epoch": 0.010282776349614395,
95
+ "grad_norm": 0.1855655461549759,
96
+ "kl": 5.7816505432128906e-05,
97
  "learning_rate": 6e-07,
98
+ "loss": 0.1443,
99
+ "reward": 0.1666666716337204,
100
+ "reward_std": 0.2220027968287468,
101
+ "rewards/accuracy_reward": 0.0,
102
  "rewards/difficulty_following_reward": 0.0,
103
+ "rewards/format_reward": 0.1666666716337204,
104
  "step": 6,
105
+ "vanishing_advantage_ratio": 0.2500000074505806
106
  },
107
  {
108
  "clip_ratio": 0.0,
109
+ "completion_length": 2124.763885498047,
110
  "epoch": 0.011996572407883462,
111
+ "grad_norm": 0.19423389434814453,
112
+ "kl": 5.745887756347656e-05,
113
  "learning_rate": 7e-07,
114
+ "loss": 0.0686,
115
+ "reward": 0.25,
116
+ "reward_std": 0.17486263997852802,
117
+ "rewards/accuracy_reward": 0.0,
118
  "rewards/difficulty_following_reward": 0.0,
119
+ "rewards/format_reward": 0.25,
120
  "step": 7,
121
  "vanishing_advantage_ratio": 0.3333333432674408
122
  },
123
  {
124
  "clip_ratio": 0.0,
125
+ "completion_length": 2672.6527709960938,
126
  "epoch": 0.013710368466152529,
127
+ "grad_norm": 0.1445378065109253,
128
+ "kl": 3.981590270996094e-05,
129
  "learning_rate": 8e-07,
130
+ "loss": 0.0707,
131
+ "reward": 0.18055555783212185,
132
+ "reward_std": 0.20800206437706947,
133
+ "rewards/accuracy_reward": 0.0,
134
  "rewards/difficulty_following_reward": 0.0,
135
+ "rewards/format_reward": 0.18055555783212185,
136
  "step": 8,
137
+ "vanishing_advantage_ratio": 0.2500000074505806
138
  },
139
  {
140
  "clip_ratio": 0.0,
141
+ "completion_length": 2435.027801513672,
142
  "epoch": 0.015424164524421594,
143
+ "grad_norm": 0.2157757580280304,
144
+ "kl": 5.0008296966552734e-05,
145
  "learning_rate": 9e-07,
146
+ "loss": 0.1989,
147
+ "reward": 0.2152777798473835,
148
+ "reward_std": 0.16122430190443993,
149
+ "rewards/accuracy_reward": 0.0,
150
  "rewards/difficulty_following_reward": 0.0,
151
+ "rewards/format_reward": 0.2152777798473835,
152
  "step": 9,
153
  "vanishing_advantage_ratio": 0.3333333432674408
154
  },
155
  {
156
  "clip_ratio": 0.0,
157
+ "completion_length": 2983.874969482422,
158
  "epoch": 0.01713796058269066,
159
+ "grad_norm": 0.1537931114435196,
160
+ "kl": 3.7670135498046875e-05,
161
  "learning_rate": 1e-06,
162
+ "loss": 0.0572,
163
+ "reward": 0.1388888917863369,
164
+ "reward_std": 0.1325911059975624,
165
+ "rewards/accuracy_reward": 0.0,
166
  "rewards/difficulty_following_reward": 0.0,
167
+ "rewards/format_reward": 0.1388888917863369,
168
  "step": 10,
169
+ "vanishing_advantage_ratio": 0.416666679084301
170
  },
171
  {
172
  "clip_ratio": 0.0,
173
+ "completion_length": 2714.6666870117188,
174
  "epoch": 0.018851756640959727,
175
+ "grad_norm": 0.14581632614135742,
176
+ "kl": 4.884600639343262e-05,
177
  "learning_rate": 9.997258721585931e-07,
178
+ "loss": 0.0249,
179
+ "reward": 0.4097222276031971,
180
+ "reward_std": 0.5287230610847473,
181
+ "rewards/accuracy_reward": 0.13888889085501432,
182
  "rewards/difficulty_following_reward": 0.0,
183
+ "rewards/format_reward": 0.1319444477558136,
184
  "step": 11,
185
+ "vanishing_advantage_ratio": 0.3333333432674408
186
  },
187
  {
188
  "clip_ratio": 0.0,
189
+ "completion_length": 3190.5555419921875,
190
  "epoch": 0.02056555269922879,
191
+ "grad_norm": 0.12614919245243073,
192
+ "kl": 3.2901763916015625e-05,
193
  "learning_rate": 9.989038226169207e-07,
194
+ "loss": 0.0385,
195
+ "reward": 0.305555556435138,
196
+ "reward_std": 0.44778930954635143,
197
+ "rewards/accuracy_reward": 0.08333333488553762,
198
  "rewards/difficulty_following_reward": 0.0,
199
+ "rewards/format_reward": 0.13888889411464334,
200
  "step": 12,
201
+ "vanishing_advantage_ratio": 0.2638889029622078
202
  },
203
  {
204
  "clip_ratio": 0.0,
205
+ "completion_length": 2701.9583129882812,
206
  "epoch": 0.022279348757497857,
207
+ "grad_norm": 0.17424772679805756,
208
+ "kl": 0.00010010600090026855,
209
  "learning_rate": 9.975348529157229e-07,
210
+ "loss": 0.0723,
211
+ "reward": 0.486111119389534,
212
+ "reward_std": 0.6361183077096939,
213
+ "rewards/accuracy_reward": 0.15277778264135122,
214
  "rewards/difficulty_following_reward": 0.0,
215
+ "rewards/format_reward": 0.18055556016042829,
216
  "step": 13,
217
+ "vanishing_advantage_ratio": 0.1666666716337204
218
  },
219
  {
220
  "clip_ratio": 0.0,
221
+ "completion_length": 2698.0972595214844,
222
  "epoch": 0.023993144815766924,
223
+ "grad_norm": 0.13479822874069214,
224
+ "kl": 7.367134094238281e-05,
225
  "learning_rate": 9.956206309337066e-07,
226
+ "loss": 0.0696,
227
+ "reward": 0.38194445334374905,
228
+ "reward_std": 0.4227895326912403,
229
+ "rewards/accuracy_reward": 0.09722222294658422,
230
  "rewards/difficulty_following_reward": 0.0,
231
+ "rewards/format_reward": 0.1875,
232
  "step": 14,
233
+ "vanishing_advantage_ratio": 0.3611111268401146
234
  },
235
  {
236
  "clip_ratio": 0.0,
237
+ "completion_length": 3156.9722290039062,
238
  "epoch": 0.02570694087403599,
239
+ "grad_norm": 0.12347124516963959,
240
+ "kl": 4.169344902038574e-05,
241
  "learning_rate": 9.931634888554935e-07,
242
+ "loss": 0.0423,
243
+ "reward": 0.4791666679084301,
244
+ "reward_std": 0.5301840528845787,
245
+ "rewards/accuracy_reward": 0.16666666883975267,
246
  "rewards/difficulty_following_reward": 0.0,
247
+ "rewards/format_reward": 0.14583333488553762,
248
  "step": 15,
249
+ "vanishing_advantage_ratio": 0.2916666753590107
250
  },
251
  {
252
  "clip_ratio": 0.0,
253
+ "completion_length": 2310.138916015625,
254
  "epoch": 0.027420736932305057,
255
+ "grad_norm": 0.20043165981769562,
256
+ "kl": 0.0002892017364501953,
257
  "learning_rate": 9.901664203302124e-07,
258
+ "loss": 0.0751,
259
+ "reward": 0.5208333283662796,
260
+ "reward_std": 0.6124575287103653,
261
+ "rewards/accuracy_reward": 0.12500000279396772,
262
  "rewards/difficulty_following_reward": 0.0,
263
+ "rewards/format_reward": 0.27083333767950535,
264
  "step": 16,
265
+ "vanishing_advantage_ratio": 0.1944444552063942
266
  },
267
  {
268
  "clip_ratio": 0.0,
269
+ "completion_length": 2492.9583435058594,
270
  "epoch": 0.02913453299057412,
271
+ "grad_norm": 0.17674975097179413,
272
+ "kl": 0.00019311904907226562,
273
  "learning_rate": 9.866330768241983e-07,
274
+ "loss": 0.0971,
275
+ "reward": 0.4652777761220932,
276
+ "reward_std": 0.4706720970571041,
277
+ "rewards/accuracy_reward": 0.12500000186264515,
278
  "rewards/difficulty_following_reward": 0.0,
279
+ "rewards/format_reward": 0.2152777798473835,
280
  "step": 17,
281
+ "vanishing_advantage_ratio": 0.2500000074505806
282
  },
283
  {
284
  "clip_ratio": 0.0,
285
+ "completion_length": 2977.625,
286
  "epoch": 0.030848329048843187,
287
+ "grad_norm": 0.11323793232440948,
288
+ "kl": 0.00023287534713745117,
289
  "learning_rate": 9.825677631722435e-07,
290
+ "loss": 0.0141,
291
+ "reward": 0.35416666977107525,
292
+ "reward_std": 0.40337391197681427,
293
+ "rewards/accuracy_reward": 0.11111111473292112,
294
  "rewards/difficulty_following_reward": 0.0,
295
+ "rewards/format_reward": 0.1319444477558136,
296
  "step": 18,
297
+ "vanishing_advantage_ratio": 0.5000000074505806
298
  },
299
  {
300
  "clip_ratio": 0.0,
301
+ "completion_length": 2997.7222290039062,
302
  "epoch": 0.032562125107112254,
303
+ "grad_norm": 0.12171383947134018,
304
+ "kl": 0.00020396709442138672,
305
  "learning_rate": 9.779754323328192e-07,
306
+ "loss": 0.0399,
307
+ "reward": 0.3472222313284874,
308
+ "reward_std": 0.39615704864263535,
309
+ "rewards/accuracy_reward": 0.11111111473292112,
310
  "rewards/difficulty_following_reward": 0.0,
311
+ "rewards/format_reward": 0.1250000037252903,
312
  "step": 19,
313
+ "vanishing_advantage_ratio": 0.4444444514811039
314
  },
315
  {
316
  "clip_ratio": 0.0,
317
+ "completion_length": 3004.875,
318
  "epoch": 0.03427592116538132,
319
+ "grad_norm": 0.15534202754497528,
320
+ "kl": 0.0001380443572998047,
321
  "learning_rate": 9.728616793536587e-07,
322
+ "loss": 0.1373,
323
+ "reward": 0.3541666651144624,
324
+ "reward_std": 0.5004552379250526,
325
+ "rewards/accuracy_reward": 0.11111111287027597,
326
  "rewards/difficulty_following_reward": 0.0,
327
+ "rewards/format_reward": 0.13194444682449102,
328
  "step": 20,
329
+ "vanishing_advantage_ratio": 0.2500000074505806
330
  },
331
  {
332
  "clip_ratio": 0.0,
333
+ "completion_length": 2767.9166870117188,
334
  "epoch": 0.03598971722365039,
335
+ "grad_norm": 0.12289870530366898,
336
+ "kl": 0.0005383491516113281,
337
  "learning_rate": 9.672327345550543e-07,
338
+ "loss": 0.0652,
339
+ "reward": 0.4722222313284874,
340
+ "reward_std": 0.4577459469437599,
341
+ "rewards/accuracy_reward": 0.1250000037252903,
342
  "rewards/difficulty_following_reward": 0.0,
343
+ "rewards/format_reward": 0.2222222276031971,
344
  "step": 21,
345
+ "vanishing_advantage_ratio": 0.3333333432674408
346
  },
347
  {
348
  "clip_ratio": 0.0,
349
+ "completion_length": 2990.611083984375,
350
  "epoch": 0.037703513281919454,
351
+ "grad_norm": 0.1411183923482895,
352
+ "kl": 0.00030428171157836914,
353
  "learning_rate": 9.610954559391704e-07,
354
+ "loss": 0.0744,
355
+ "reward": 0.2152777803130448,
356
+ "reward_std": 0.3392846863716841,
357
+ "rewards/accuracy_reward": 0.0555555559694767,
358
  "rewards/difficulty_following_reward": 0.0,
359
+ "rewards/format_reward": 0.10416666883975267,
360
  "step": 22,
361
  "vanishing_advantage_ratio": 0.3333333432674408
362
  },
363
  {
364
  "clip_ratio": 0.0,
365
+ "completion_length": 3004.9583129882812,
366
  "epoch": 0.03941730934018852,
367
+ "grad_norm": 0.10731787979602814,
368
+ "kl": 0.0005457401275634766,
369
  "learning_rate": 9.54457320834625e-07,
370
+ "loss": 0.0293,
371
+ "reward": 0.20833333674818277,
372
+ "reward_std": 0.2911061681807041,
373
+ "rewards/accuracy_reward": 0.041666666977107525,
374
  "rewards/difficulty_following_reward": 0.0,
375
+ "rewards/format_reward": 0.12500000139698386,
376
  "step": 23,
377
+ "vanishing_advantage_ratio": 0.416666679084301
378
  },
379
  {
380
  "clip_ratio": 0.0,
381
+ "completion_length": 2584.2916870117188,
382
  "epoch": 0.04113110539845758,
383
+ "grad_norm": 0.15161892771720886,
384
+ "kl": 0.0003027915954589844,
385
  "learning_rate": 9.473264167865171e-07,
386
+ "loss": 0.06,
387
+ "reward": 0.4861111044883728,
388
+ "reward_std": 0.47357870638370514,
389
+ "rewards/accuracy_reward": 0.13888889364898205,
390
  "rewards/difficulty_following_reward": 0.0,
391
+ "rewards/format_reward": 0.2083333358168602,
392
  "step": 24,
393
+ "vanishing_advantage_ratio": 0.2500000074505806
394
  },
395
  {
396
  "clip_ratio": 0.0,
397
+ "completion_length": 2536.3195190429688,
398
  "epoch": 0.04284490145672665,
399
+ "grad_norm": 0.14601115882396698,
400
+ "kl": 0.0003859996795654297,
401
  "learning_rate": 9.397114317029974e-07,
402
+ "loss": 0.0375,
403
+ "reward": 0.5277777910232544,
404
+ "reward_std": 0.5513055957853794,
405
+ "rewards/accuracy_reward": 0.13888889085501432,
406
  "rewards/difficulty_following_reward": 0.0,
407
+ "rewards/format_reward": 0.25000000931322575,
408
  "step": 25,
409
+ "vanishing_advantage_ratio": 0.2083333432674408
410
  },
411
  {
412
  "clip_ratio": 0.0,
413
+ "completion_length": 2963.8333129882812,
414
  "epoch": 0.044558697514995714,
415
+ "grad_norm": 0.14391477406024933,
416
+ "kl": 0.00029397010803222656,
417
  "learning_rate": 9.316216432703916e-07,
418
+ "loss": 0.0872,
419
+ "reward": 0.31944445334374905,
420
+ "reward_std": 0.39608718268573284,
421
+ "rewards/accuracy_reward": 0.08333333395421505,
422
  "rewards/difficulty_following_reward": 0.0,
423
+ "rewards/format_reward": 0.15277778217568994,
424
  "step": 26,
425
+ "vanishing_advantage_ratio": 0.34722223319113255
426
  },
427
  {
428
  "clip_ratio": 0.0,
429
+ "completion_length": 2815.4861450195312,
430
  "epoch": 0.04627249357326478,
431
+ "grad_norm": 0.14442721009254456,
432
+ "kl": 0.00036406517028808594,
433
  "learning_rate": 9.230669076497687e-07,
434
+ "loss": 0.062,
435
+ "reward": 0.3333333358168602,
436
+ "reward_std": 0.37698132544755936,
437
+ "rewards/accuracy_reward": 0.09722222294658422,
438
  "rewards/difficulty_following_reward": 0.0,
439
+ "rewards/format_reward": 0.13888889364898205,
440
  "step": 27,
441
+ "vanishing_advantage_ratio": 0.5000000149011612
442
  },
443
  {
444
  "clip_ratio": 0.0,
445
+ "completion_length": 2111.8333740234375,
446
  "epoch": 0.04798628963153385,
447
+ "grad_norm": 0.20688049495220184,
448
+ "kl": 0.0007753372192382812,
449
  "learning_rate": 9.140576474687263e-07,
450
+ "loss": 0.0729,
451
+ "reward": 0.5416666716337204,
452
+ "reward_std": 0.5932498127222061,
453
+ "rewards/accuracy_reward": 0.15277778171002865,
454
  "rewards/difficulty_following_reward": 0.0,
455
+ "rewards/format_reward": 0.23611111007630825,
456
  "step": 28,
457
+ "vanishing_advantage_ratio": 0.1666666716337204
458
  },
459
  {
460
  "clip_ratio": 0.0,
461
+ "completion_length": 2817.1111450195312,
462
  "epoch": 0.049700085689802914,
463
+ "grad_norm": 0.1729794591665268,
464
+ "kl": 0.000820159912109375,
465
  "learning_rate": 9.046048391230247e-07,
466
+ "loss": 0.0789,
467
+ "reward": 0.3958333288319409,
468
+ "reward_std": 0.45582315884530544,
469
+ "rewards/accuracy_reward": 0.15277778171002865,
470
  "rewards/difficulty_following_reward": 0.0,
471
+ "rewards/format_reward": 0.09027778031304479,
472
  "step": 29,
473
+ "vanishing_advantage_ratio": 0.3333333432674408
474
  },
475
  {
476
  "clip_ratio": 0.0,
477
+ "completion_length": 2341.2222595214844,
478
  "epoch": 0.05141388174807198,
479
+ "grad_norm": 0.16700980067253113,
480
+ "kl": 0.000972747802734375,
481
  "learning_rate": 8.9471999940354e-07,
482
+ "loss": 0.0022,
483
+ "reward": 0.3958333358168602,
484
+ "reward_std": 0.4832002595067024,
485
+ "rewards/accuracy_reward": 0.08333333395421505,
486
  "rewards/difficulty_following_reward": 0.0,
487
+ "rewards/format_reward": 0.2291666679084301,
488
  "step": 30,
489
+ "vanishing_advantage_ratio": 0.20833334140479565
490
  },
491
  {
492
  "clip_ratio": 0.0,
493
+ "completion_length": 2607.0416259765625,
494
  "epoch": 0.05312767780634105,
495
+ "grad_norm": 0.13790923357009888,
496
+ "kl": 0.0006513595581054688,
497
  "learning_rate": 8.844151714648274e-07,
498
+ "loss": 0.0044,
499
+ "reward": 0.3263888880610466,
500
+ "reward_std": 0.33597812056541443,
501
+ "rewards/accuracy_reward": 0.06944444496184587,
502
  "rewards/difficulty_following_reward": 0.0,
503
+ "rewards/format_reward": 0.18750000558793545,
504
  "step": 31,
505
+ "vanishing_advantage_ratio": 0.3333333432674408
506
  },
507
  {
508
  "clip_ratio": 0.0,
509
+ "completion_length": 2935.6806030273438,
510
  "epoch": 0.054841473864610114,
511
+ "grad_norm": 0.13338877260684967,
512
+ "kl": 0.000720977783203125,
513
  "learning_rate": 8.737029101523929e-07,
514
+ "loss": 0.0753,
515
+ "reward": 0.5416666716337204,
516
+ "reward_std": 0.6741950437426567,
517
+ "rewards/accuracy_reward": 0.16666667070239782,
518
  "rewards/difficulty_following_reward": 0.0,
519
+ "rewards/format_reward": 0.2083333358168602,
520
  "step": 32,
521
+ "vanishing_advantage_ratio": 0.1111111156642437
522
  },
523
  {
524
  "clip_ratio": 0.0,
525
+ "completion_length": 2178.9861755371094,
526
  "epoch": 0.056555269922879174,
527
+ "grad_norm": 0.20104320347309113,
528
+ "kl": 0.0009927749633789062,
529
  "learning_rate": 8.625962667065487e-07,
530
+ "loss": 0.0859,
531
+ "reward": 0.6458333283662796,
532
+ "reward_std": 0.5696279220283031,
533
+ "rewards/accuracy_reward": 0.19444444589316845,
534
  "rewards/difficulty_following_reward": 0.0,
535
+ "rewards/format_reward": 0.2569444440305233,
536
  "step": 33,
537
+ "vanishing_advantage_ratio": 0.1944444514811039
538
  },
539
  {
540
  "clip_ratio": 0.0,
541
+ "completion_length": 2810.7222290039062,
542
  "epoch": 0.05826906598114824,
543
+ "grad_norm": 0.11122416704893112,
544
+ "kl": 0.0006494522094726562,
545
  "learning_rate": 8.511087728614862e-07,
546
+ "loss": 0.0723,
547
+ "reward": 0.3611111119389534,
548
+ "reward_std": 0.2897772639989853,
549
+ "rewards/accuracy_reward": 0.09722222574055195,
550
  "rewards/difficulty_following_reward": 0.0,
551
+ "rewards/format_reward": 0.16666666977107525,
552
  "step": 34,
553
+ "vanishing_advantage_ratio": 0.5416666809469461
554
  },
555
  {
556
  "clip_ratio": 0.0,
557
+ "completion_length": 3160.1249389648438,
558
  "epoch": 0.05998286203941731,
559
+ "grad_norm": 0.08742042630910873,
560
+ "kl": 0.0006799697875976562,
561
  "learning_rate": 8.392544243589427e-07,
562
+ "loss": 0.011,
563
+ "reward": 0.180555556435138,
564
+ "reward_std": 0.20691198110580444,
565
+ "rewards/accuracy_reward": 0.041666666977107525,
566
  "rewards/difficulty_following_reward": 0.0,
567
+ "rewards/format_reward": 0.09722222527489066,
568
  "step": 35,
569
+ "vanishing_advantage_ratio": 0.5833333432674408
570
  },
571
  {
572
  "clip_ratio": 0.0,
573
+ "completion_length": 2915.0556030273438,
574
  "epoch": 0.061696658097686374,
575
+ "grad_norm": 0.1445658802986145,
576
+ "kl": 0.0008459091186523438,
577
  "learning_rate": 8.270476638965461e-07,
578
+ "loss": 0.0597,
579
+ "reward": 0.19444444589316845,
580
+ "reward_std": 0.22136816568672657,
581
+ "rewards/accuracy_reward": 0.041666666977107525,
582
  "rewards/difficulty_following_reward": 0.0,
583
+ "rewards/format_reward": 0.11111111240461469,
584
  "step": 36,
585
  "vanishing_advantage_ratio": 0.5000000149011612
586
  },
587
  {
588
  "clip_ratio": 0.0,
589
+ "completion_length": 3040.625,
590
  "epoch": 0.06341045415595545,
591
+ "grad_norm": 0.10913708806037903,
592
+ "kl": 0.0008916854858398438,
593
  "learning_rate": 8.145033635316128e-07,
594
+ "loss": 0.0071,
595
+ "reward": 0.1597222238779068,
596
+ "reward_std": 0.17662217281758785,
597
+ "rewards/accuracy_reward": 0.041666666977107525,
598
  "rewards/difficulty_following_reward": 0.0,
599
+ "rewards/format_reward": 0.07638889225199819,
600
  "step": 37,
601
+ "vanishing_advantage_ratio": 0.7500000149011612
602
  },
603
  {
604
  "clip_ratio": 0.0,
605
+ "completion_length": 3170.638916015625,
606
  "epoch": 0.06512425021422451,
607
+ "grad_norm": 0.11745505779981613,
608
+ "kl": 0.0007038116455078125,
609
  "learning_rate": 8.01636806561836e-07,
610
+ "loss": 0.0956,
611
+ "reward": 0.19444444216787815,
612
+ "reward_std": 0.2374982163310051,
613
+ "rewards/accuracy_reward": 0.0555555559694767,
614
  "rewards/difficulty_following_reward": 0.0,
615
+ "rewards/format_reward": 0.0833333358168602,
616
  "step": 38,
617
+ "vanishing_advantage_ratio": 0.5000000074505806
618
  },
619
  {
620
  "clip_ratio": 0.0,
621
+ "completion_length": 2920.8472290039062,
622
  "epoch": 0.06683804627249357,
623
+ "grad_norm": 0.12855492532253265,
624
+ "kl": 0.0009975433349609375,
625
  "learning_rate": 7.884636689049422e-07,
626
+ "loss": 0.0913,
627
+ "reward": 0.4791666832752526,
628
+ "reward_std": 0.48048618994653225,
629
+ "rewards/accuracy_reward": 0.15277777891606092,
630
  "rewards/difficulty_following_reward": 0.0,
631
+ "rewards/format_reward": 0.17361111054196954,
632
  "step": 39,
633
  "vanishing_advantage_ratio": 0.2500000074505806
634
  },
635
  {
636
  "clip_ratio": 0.0,
637
+ "completion_length": 2656.6666870117188,
638
  "epoch": 0.06855184233076264,
639
+ "grad_norm": 0.17364995181560516,
640
+ "kl": 0.0011653900146484375,
641
  "learning_rate": 7.75e-07,
642
+ "loss": 0.0587,
643
+ "reward": 0.4444444552063942,
644
+ "reward_std": 0.5394208431243896,
645
+ "rewards/accuracy_reward": 0.13888889271765947,
646
  "rewards/difficulty_following_reward": 0.0,
647
+ "rewards/format_reward": 0.1666666679084301,
648
  "step": 40,
649
  "vanishing_advantage_ratio": 0.1944444552063942
650
  },
651
  {
652
  "clip_ratio": 0.0,
653
+ "completion_length": 2850.90283203125,
654
  "epoch": 0.0702656383890317,
655
+ "grad_norm": 0.15265439450740814,
656
+ "kl": 0.0010967254638671875,
657
  "learning_rate": 7.612622032536507e-07,
658
+ "loss": 0.0992,
659
+ "reward": 0.4305555606260896,
660
+ "reward_std": 0.4596329629421234,
661
+ "rewards/accuracy_reward": 0.12500000093132257,
662
  "rewards/difficulty_following_reward": 0.0,
663
  "rewards/format_reward": 0.18055556062608957,
664
  "step": 41,
665
+ "vanishing_advantage_ratio": 0.1666666716337204
666
  },
667
  {
668
  "clip_ratio": 0.0,
669
+ "completion_length": 2313.3612060546875,
670
  "epoch": 0.07197943444730077,
671
+ "grad_norm": 0.23461805284023285,
672
+ "kl": 0.001087188720703125,
673
  "learning_rate": 7.472670160550848e-07,
674
+ "loss": 0.1506,
675
+ "reward": 0.4305555634200573,
676
+ "reward_std": 0.33141833916306496,
677
  "rewards/accuracy_reward": 0.09722222294658422,
678
  "rewards/difficulty_following_reward": 0.0,
679
+ "rewards/format_reward": 0.2361111119389534,
680
  "step": 42,
681
+ "vanishing_advantage_ratio": 0.416666679084301
682
  },
683
  {
684
  "clip_ratio": 0.0,
685
+ "completion_length": 2540.5139770507812,
686
  "epoch": 0.07369323050556983,
687
+ "grad_norm": 0.1409270167350769,
688
+ "kl": 0.001255035400390625,
689
  "learning_rate": 7.330314893841101e-07,
690
+ "loss": 0.0387,
691
+ "reward": 0.645833345130086,
692
+ "reward_std": 0.4597787447273731,
693
+ "rewards/accuracy_reward": 0.22222222480922937,
694
  "rewards/difficulty_following_reward": 0.0,
695
+ "rewards/format_reward": 0.20138889085501432,
696
  "step": 43,
697
+ "vanishing_advantage_ratio": 0.3333333432674408
698
  },
699
  {
700
  "clip_ratio": 0.0,
701
+ "completion_length": 2670.291748046875,
702
  "epoch": 0.07540702656383891,
703
+ "grad_norm": 0.15075387060642242,
704
+ "kl": 0.001132965087890625,
705
  "learning_rate": 7.185729670371604e-07,
706
+ "loss": 0.0947,
707
+ "reward": 0.2500000074505806,
708
+ "reward_std": 0.2931553889065981,
709
+ "rewards/accuracy_reward": 0.02777777798473835,
710
  "rewards/difficulty_following_reward": 0.0,
711
+ "rewards/format_reward": 0.1944444477558136,
712
  "step": 44,
713
+ "vanishing_advantage_ratio": 0.3333333432674408
714
  },
715
  {
716
  "clip_ratio": 0.0,
717
+ "completion_length": 2781.5416870117188,
718
  "epoch": 0.07712082262210797,
719
+ "grad_norm": 0.13281068205833435,
720
+ "kl": 0.0013933181762695312,
721
  "learning_rate": 7.039090644965509e-07,
722
+ "loss": 0.0486,
723
+ "reward": 0.3819444486871362,
724
+ "reward_std": 0.3621416501700878,
725
+ "rewards/accuracy_reward": 0.1111111156642437,
726
  "rewards/difficulty_following_reward": 0.0,
727
+ "rewards/format_reward": 0.15972222480922937,
728
  "step": 45,
729
+ "vanishing_advantage_ratio": 0.3333333432674408
730
  },
731
  {
732
  "clip_ratio": 0.0,
733
+ "completion_length": 2386.4722290039062,
734
  "epoch": 0.07883461868037704,
735
+ "grad_norm": 0.125954732298851,
736
+ "kl": 0.00160980224609375,
737
  "learning_rate": 6.890576474687263e-07,
738
+ "loss": 0.0362,
739
+ "reward": 0.555555559694767,
740
+ "reward_std": 0.49604532122612,
741
+ "rewards/accuracy_reward": 0.15277778264135122,
742
  "rewards/difficulty_following_reward": 0.0,
743
+ "rewards/format_reward": 0.25000000186264515,
744
  "step": 46,
745
+ "vanishing_advantage_ratio": 0.43055556900799274
746
  },
747
  {
748
  "clip_ratio": 0.0,
749
+ "completion_length": 2793.875,
750
  "epoch": 0.0805484147386461,
751
+ "grad_norm": 0.15522564947605133,
752
+ "kl": 0.001346588134765625,
753
  "learning_rate": 6.740368101176495e-07,
754
+ "loss": 0.1015,
755
+ "reward": 0.29166666977107525,
756
+ "reward_std": 0.34645168110728264,
757
+ "rewards/accuracy_reward": 0.0555555559694767,
758
  "rewards/difficulty_following_reward": 0.0,
759
+ "rewards/format_reward": 0.18055555783212185,
760
  "step": 47,
761
+ "vanishing_advantage_ratio": 0.1666666716337204
762
  },
763
  {
764
  "clip_ratio": 0.0,
765
+ "completion_length": 2424.4306030273438,
766
  "epoch": 0.08226221079691516,
767
+ "grad_norm": 0.162226602435112,
768
+ "kl": 0.002109527587890625,
769
  "learning_rate": 6.588648530198504e-07,
770
+ "loss": 0.0622,
771
+ "reward": 0.486111119389534,
772
+ "reward_std": 0.49431629478931427,
773
+ "rewards/accuracy_reward": 0.12500000186264515,
774
  "rewards/difficulty_following_reward": 0.0,
775
+ "rewards/format_reward": 0.2361111119389534,
776
  "step": 48,
777
+ "vanishing_advantage_ratio": 0.1666666716337204
778
  },
779
  {
780
  "clip_ratio": 0.0,
781
+ "completion_length": 2259.90283203125,
782
  "epoch": 0.08397600685518423,
783
+ "grad_norm": 0.19547973573207855,
784
+ "kl": 0.00144195556640625,
785
  "learning_rate": 6.435602608679916e-07,
786
+ "loss": 0.0995,
787
+ "reward": 0.6180555671453476,
788
+ "reward_std": 0.4886632487177849,
789
+ "rewards/accuracy_reward": 0.16666667256504297,
790
  "rewards/difficulty_following_reward": 0.0,
791
+ "rewards/format_reward": 0.2847222276031971,
792
  "step": 49,
793
+ "vanishing_advantage_ratio": 0.1666666716337204
794
  },
795
  {
796
  "clip_ratio": 0.0,
797
+ "completion_length": 2402.8333435058594,
798
  "epoch": 0.0856898029134533,
799
+ "grad_norm": 0.18848426640033722,
800
+ "kl": 0.002063751220703125,
801
  "learning_rate": 6.281416799501187e-07,
802
+ "loss": 0.0181,
803
+ "reward": 0.3472222201526165,
804
+ "reward_std": 0.407356571406126,
805
+ "rewards/accuracy_reward": 0.0555555559694767,
806
  "rewards/difficulty_following_reward": 0.0,
807
+ "rewards/format_reward": 0.2361111156642437,
808
  "step": 50,
809
+ "vanishing_advantage_ratio": 0.0833333358168602
810
  },
811
  {
812
  "clip_ratio": 0.0,
813
+ "completion_length": 2527.8333740234375,
814
  "epoch": 0.08740359897172237,
815
+ "grad_norm": 0.16279014945030212,
816
+ "kl": 0.00167083740234375,
817
  "learning_rate": 6.126278954320294e-07,
818
+ "loss": 0.0806,
819
+ "reward": 0.6458333432674408,
820
+ "reward_std": 0.522053524851799,
821
+ "rewards/accuracy_reward": 0.2083333358168602,
822
  "rewards/difficulty_following_reward": 0.0,
823
+ "rewards/format_reward": 0.22916667349636555,
824
  "step": 51,
825
+ "vanishing_advantage_ratio": 0.3333333432674408
826
  },
827
  {
828
  "clip_ratio": 0.0,
829
+ "completion_length": 2555.5555725097656,
830
  "epoch": 0.08911739502999143,
831
+ "grad_norm": 0.19262254238128662,
832
+ "kl": 0.001934051513671875,
833
  "learning_rate": 5.97037808470444e-07,
834
+ "loss": 0.1354,
835
+ "reward": 0.5763888992369175,
836
+ "reward_std": 0.6699204966425896,
837
+ "rewards/accuracy_reward": 0.18055556062608957,
838
  "rewards/difficulty_following_reward": 0.0,
839
+ "rewards/format_reward": 0.21527778171002865,
840
  "step": 52,
841
+ "vanishing_advantage_ratio": 0.1111111156642437
842
  },
843
  {
844
  "clip_ratio": 0.0,
845
+ "completion_length": 2766.388916015625,
846
  "epoch": 0.0908311910882605,
847
+ "grad_norm": 0.185680091381073,
848
+ "kl": 0.001705169677734375,
849
  "learning_rate": 5.813904131848564e-07,
850
+ "loss": 0.1234,
851
+ "reward": 0.28472222574055195,
852
+ "reward_std": 0.2922345921397209,
853
+ "rewards/accuracy_reward": 0.055555556900799274,
854
  "rewards/difficulty_following_reward": 0.0,
855
+ "rewards/format_reward": 0.1736111119389534,
856
  "step": 53,
857
+ "vanishing_advantage_ratio": 0.2777777910232544
858
  },
859
  {
860
  "clip_ratio": 0.0,
861
+ "completion_length": 2823.0,
862
  "epoch": 0.09254498714652956,
863
+ "grad_norm": 0.10592877119779587,
864
+ "kl": 0.001628875732421875,
865
  "learning_rate": 5.657047735161255e-07,
866
+ "loss": 0.002,
867
+ "reward": 0.31249999441206455,
868
+ "reward_std": 0.2363711018115282,
869
+ "rewards/accuracy_reward": 0.08333333395421505,
870
  "rewards/difficulty_following_reward": 0.0,
871
+ "rewards/format_reward": 0.14583333767950535,
872
  "step": 54,
873
+ "vanishing_advantage_ratio": 0.416666679084301
874
  },
875
  {
876
  "clip_ratio": 0.0,
877
+ "completion_length": 3061.5834350585938,
878
  "epoch": 0.09425878320479864,
879
+ "grad_norm": 0.13768742978572845,
880
+ "kl": 0.00177764892578125,
881
  "learning_rate": 5.5e-07,
882
+ "loss": 0.1519,
883
+ "reward": 0.29166667722165585,
884
+ "reward_std": 0.4167081881314516,
885
+ "rewards/accuracy_reward": 0.06944444589316845,
886
  "rewards/difficulty_following_reward": 0.0,
887
+ "rewards/format_reward": 0.1527777798473835,
888
  "step": 55,
889
+ "vanishing_advantage_ratio": 0.26388889737427235
890
  },
891
  {
892
  "clip_ratio": 0.0,
893
+ "completion_length": 2675.4584045410156,
894
  "epoch": 0.0959725792630677,
895
+ "grad_norm": 0.15516793727874756,
896
+ "kl": 0.0018901824951171875,
897
  "learning_rate": 5.342952264838747e-07,
898
+ "loss": 0.0732,
899
+ "reward": 0.3888888880610466,
900
+ "reward_std": 0.3425787817686796,
901
+ "rewards/accuracy_reward": 0.11111111380159855,
902
  "rewards/difficulty_following_reward": 0.0,
903
+ "rewards/format_reward": 0.1666666679084301,
904
  "step": 56,
905
+ "vanishing_advantage_ratio": 0.4166666716337204
906
  },
907
  {
908
  "clip_ratio": 0.0,
909
+ "completion_length": 2887.638916015625,
910
  "epoch": 0.09768637532133675,
911
+ "grad_norm": 0.13576894998550415,
912
+ "kl": 0.0018768310546875,
913
  "learning_rate": 5.186095868151436e-07,
914
+ "loss": 0.0397,
915
+ "reward": 0.5138888955116272,
916
+ "reward_std": 0.4866417311131954,
917
+ "rewards/accuracy_reward": 0.16666667349636555,
918
  "rewards/difficulty_following_reward": 0.0,
919
+ "rewards/format_reward": 0.18055556155741215,
920
  "step": 57,
921
+ "vanishing_advantage_ratio": 0.3333333432674408
922
  },
923
  {
924
  "clip_ratio": 0.0,
925
+ "completion_length": 2637.7083129882812,
926
  "epoch": 0.09940017137960583,
927
+ "grad_norm": 0.17882922291755676,
928
+ "kl": 0.00240325927734375,
929
  "learning_rate": 5.02962191529556e-07,
930
+ "loss": 0.1296,
931
+ "reward": 0.34722222946584225,
932
+ "reward_std": 0.39691098034381866,
933
+ "rewards/accuracy_reward": 0.06944444496184587,
934
  "rewards/difficulty_following_reward": 0.0,
935
+ "rewards/format_reward": 0.20833333767950535,
936
  "step": 58,
937
+ "vanishing_advantage_ratio": 0.2500000074505806
938
  },
939
  {
940
  "clip_ratio": 0.0,
941
+ "completion_length": 2386.9444274902344,
942
  "epoch": 0.10111396743787489,
943
+ "grad_norm": 0.15917612612247467,
944
+ "kl": 0.0022869110107421875,
945
  "learning_rate": 4.873721045679706e-07,
946
+ "loss": 0.0046,
947
+ "reward": 0.8402777761220932,
948
+ "reward_std": 0.6729260385036469,
949
+ "rewards/accuracy_reward": 0.2916666716337204,
950
  "rewards/difficulty_following_reward": 0.0,
951
+ "rewards/format_reward": 0.2569444477558136,
952
  "step": 59,
953
  "vanishing_advantage_ratio": 0.1666666716337204
954
  },
955
  {
956
  "clip_ratio": 0.0,
957
+ "completion_length": 3012.5000610351562,
958
  "epoch": 0.10282776349614396,
959
+ "grad_norm": 0.13718292117118835,
960
+ "kl": 0.0018711090087890625,
961
  "learning_rate": 4.7185832004988133e-07,
962
+ "loss": 0.0949,
963
+ "reward": 0.5069444496184587,
964
+ "reward_std": 0.5694549642503262,
965
+ "rewards/accuracy_reward": 0.180555559694767,
966
  "rewards/difficulty_following_reward": 0.0,
967
+ "rewards/format_reward": 0.14583333767950535,
968
  "step": 60,
969
+ "vanishing_advantage_ratio": 0.2500000074505806
970
  },
971
  {
972
  "clip_ratio": 0.0,
973
+ "completion_length": 2862.763916015625,
974
  "epoch": 0.10454155955441302,
975
+ "grad_norm": 0.15042024850845337,
976
+ "kl": 0.002170562744140625,
977
  "learning_rate": 4.5643973913200837e-07,
978
+ "loss": 0.1462,
979
+ "reward": 0.29861111007630825,
980
+ "reward_std": 0.30184047669172287,
981
+ "rewards/accuracy_reward": 0.0555555559694767,
982
  "rewards/difficulty_following_reward": 0.0,
983
+ "rewards/format_reward": 0.18750000558793545,
984
  "step": 61,
985
  "vanishing_advantage_ratio": 0.3333333432674408
986
  },
987
  {
988
  "clip_ratio": 0.0,
989
+ "completion_length": 2483.3195190429688,
990
  "epoch": 0.1062553556126821,
991
+ "grad_norm": 0.1718958169221878,
992
+ "kl": 0.0028839111328125,
993
  "learning_rate": 4.4113514698014953e-07,
994
+ "loss": 0.0138,
995
+ "reward": 0.4583333358168602,
996
+ "reward_std": 0.45289405435323715,
997
+ "rewards/accuracy_reward": 0.1388888917863369,
998
  "rewards/difficulty_following_reward": 0.0,
999
+ "rewards/format_reward": 0.180555559694767,
1000
  "step": 62,
1001
+ "vanishing_advantage_ratio": 0.26388889737427235
1002
  },
1003
  {
1004
  "clip_ratio": 0.0,
1005
+ "completion_length": 3120.638916015625,
1006
  "epoch": 0.10796915167095116,
1007
+ "grad_norm": 0.15143321454524994,
1008
+ "kl": 0.002471923828125,
1009
  "learning_rate": 4.2596318988235037e-07,
1010
+ "loss": 0.0997,
1011
+ "reward": 0.4444444519467652,
1012
+ "reward_std": 0.5317340567708015,
1013
+ "rewards/accuracy_reward": 0.16666667349636555,
1014
  "rewards/difficulty_following_reward": 0.0,
1015
+ "rewards/format_reward": 0.11111111147329211,
1016
  "step": 63,
1017
+ "vanishing_advantage_ratio": 0.2500000074505806
1018
  },
1019
  {
1020
  "clip_ratio": 0.0,
1021
+ "completion_length": 2806.263885498047,
1022
  "epoch": 0.10968294772922023,
1023
+ "grad_norm": 0.18540050089359283,
1024
+ "kl": 0.00319671630859375,
1025
  "learning_rate": 4.1094235253127374e-07,
1026
+ "loss": 0.0537,
1027
+ "reward": 0.3055555569007993,
1028
+ "reward_std": 0.346462732180953,
1029
+ "rewards/accuracy_reward": 0.055555556900799274,
1030
  "rewards/difficulty_following_reward": 0.0,
1031
+ "rewards/format_reward": 0.19444445054978132,
1032
  "step": 64,
1033
+ "vanishing_advantage_ratio": 0.1944444514811039
1034
  },
1035
  {
1036
  "clip_ratio": 0.0,
1037
+ "completion_length": 2921.3333740234375,
1038
  "epoch": 0.11139674378748929,
1039
+ "grad_norm": 0.1683017462491989,
1040
+ "kl": 0.00262451171875,
1041
  "learning_rate": 3.9609093550344907e-07,
1042
+ "loss": 0.1098,
1043
+ "reward": 0.3055555638857186,
1044
+ "reward_std": 0.3952417355030775,
1045
+ "rewards/accuracy_reward": 0.06944444496184587,
1046
  "rewards/difficulty_following_reward": 0.0,
1047
+ "rewards/format_reward": 0.1666666683740914,
1048
  "step": 65,
1049
+ "vanishing_advantage_ratio": 0.3333333432674408
1050
  },
1051
  {
1052
  "clip_ratio": 0.0,
1053
+ "completion_length": 2583.1666870117188,
1054
  "epoch": 0.11311053984575835,
1055
+ "grad_norm": 0.21984979510307312,
1056
+ "kl": 0.002838134765625,
1057
  "learning_rate": 3.8142703296283953e-07,
1058
+ "loss": 0.1025,
1059
+ "reward": 0.6250000149011612,
1060
+ "reward_std": 0.46652648597955704,
1061
+ "rewards/accuracy_reward": 0.236111119389534,
1062
  "rewards/difficulty_following_reward": 0.0,
1063
+ "rewards/format_reward": 0.15277778171002865,
1064
  "step": 66,
1065
+ "vanishing_advantage_ratio": 0.5000000149011612
1066
  },
1067
  {
1068
  "clip_ratio": 0.0,
1069
+ "completion_length": 2952.7083740234375,
1070
  "epoch": 0.11482433590402742,
1071
+ "grad_norm": 0.1412336826324463,
1072
+ "kl": 0.003108978271484375,
1073
  "learning_rate": 3.6696851061588994e-07,
1074
+ "loss": 0.0266,
1075
+ "reward": 0.2569444477558136,
1076
+ "reward_std": 0.35272519290447235,
1077
+ "rewards/accuracy_reward": 0.0555555559694767,
1078
  "rewards/difficulty_following_reward": 0.0,
1079
+ "rewards/format_reward": 0.1458333395421505,
1080
  "step": 67,
1081
+ "vanishing_advantage_ratio": 0.3333333432674408
1082
  },
1083
  {
1084
  "clip_ratio": 0.0,
1085
+ "completion_length": 2923.541748046875,
1086
  "epoch": 0.11653813196229648,
1087
+ "grad_norm": 0.13442060351371765,
1088
+ "kl": 0.003047943115234375,
1089
  "learning_rate": 3.5273298394491515e-07,
1090
+ "loss": -0.002,
1091
+ "reward": 0.4027777723968029,
1092
+ "reward_std": 0.4289739280939102,
1093
+ "rewards/accuracy_reward": 0.12500000279396772,
1094
  "rewards/difficulty_following_reward": 0.0,
1095
+ "rewards/format_reward": 0.15277778171002865,
1096
  "step": 68,
1097
+ "vanishing_advantage_ratio": 0.3333333432674408
1098
  },
1099
  {
1100
  "clip_ratio": 0.0,
1101
+ "completion_length": 2733.9444580078125,
1102
  "epoch": 0.11825192802056556,
1103
+ "grad_norm": 0.14998486638069153,
1104
+ "kl": 0.0030517578125,
1105
  "learning_rate": 3.387377967463493e-07,
1106
+ "loss": 0.0939,
1107
+ "reward": 0.5763889029622078,
1108
+ "reward_std": 0.5283073335886002,
1109
+ "rewards/accuracy_reward": 0.18055555876344442,
1110
  "rewards/difficulty_following_reward": 0.0,
1111
+ "rewards/format_reward": 0.21527778171002865,
1112
  "step": 69,
1113
+ "vanishing_advantage_ratio": 0.1666666716337204
1114
  },
1115
  {
1116
  "clip_ratio": 0.0,
1117
+ "completion_length": 2523.2361450195312,
1118
  "epoch": 0.11996572407883462,
1119
+ "grad_norm": 0.1801416426897049,
1120
+ "kl": 0.0034027099609375,
1121
  "learning_rate": 3.250000000000001e-07,
1122
+ "loss": 0.0797,
1123
+ "reward": 0.36111112125217915,
1124
+ "reward_std": 0.3477918654680252,
1125
+ "rewards/accuracy_reward": 0.06944444496184587,
1126
  "rewards/difficulty_following_reward": 0.0,
1127
+ "rewards/format_reward": 0.22222222574055195,
1128
  "step": 70,
1129
+ "vanishing_advantage_ratio": 0.2500000074505806
1130
  },
1131
  {
1132
  "clip_ratio": 0.0,
1133
+ "completion_length": 2869.7361450195312,
1134
  "epoch": 0.12167952013710369,
1135
+ "grad_norm": 0.13626869022846222,
1136
+ "kl": 0.002971649169921875,
1137
  "learning_rate": 3.115363310950578e-07,
1138
+ "loss": -0.0103,
1139
+ "reward": 0.618055559694767,
1140
+ "reward_std": 0.49544404074549675,
1141
+ "rewards/accuracy_reward": 0.20833333767950535,
1142
  "rewards/difficulty_following_reward": 0.0,
1143
+ "rewards/format_reward": 0.2013888917863369,
1144
  "step": 71,
1145
+ "vanishing_advantage_ratio": 0.34722223319113255
1146
  },
1147
  {
1148
  "clip_ratio": 0.0,
1149
+ "completion_length": 2333.15283203125,
1150
  "epoch": 0.12339331619537275,
1151
+ "grad_norm": 0.22007954120635986,
1152
+ "kl": 0.003650665283203125,
1153
  "learning_rate": 2.9836319343816397e-07,
1154
+ "loss": 0.0953,
1155
+ "reward": 0.5486111231148243,
1156
+ "reward_std": 0.38520985655486584,
1157
+ "rewards/accuracy_reward": 0.16666666883975267,
1158
  "rewards/difficulty_following_reward": 0.0,
1159
+ "rewards/format_reward": 0.2152777798473835,
1160
  "step": 72,
1161
+ "vanishing_advantage_ratio": 0.3333333432674408
1162
  },
1163
  {
1164
  "clip_ratio": 0.0,
1165
+ "completion_length": 2609.166748046875,
1166
  "epoch": 0.12510711225364182,
1167
+ "grad_norm": 0.14889562129974365,
1168
+ "kl": 0.00350189208984375,
1169
  "learning_rate": 2.854966364683872e-07,
1170
+ "loss": 0.0133,
1171
+ "reward": 0.5972222238779068,
1172
+ "reward_std": 0.49283862113952637,
1173
+ "rewards/accuracy_reward": 0.18055556248873472,
1174
  "rewards/difficulty_following_reward": 0.0,
1175
+ "rewards/format_reward": 0.23611111287027597,
1176
  "step": 73,
1177
  "vanishing_advantage_ratio": 0.2500000074505806
1178
  },
1179
  {
1180
  "clip_ratio": 0.0,
1181
+ "completion_length": 1885.9722290039062,
1182
  "epoch": 0.1268209083119109,
1183
+ "grad_norm": 0.18624606728553772,
1184
+ "kl": 0.003803253173828125,
1185
  "learning_rate": 2.729523361034538e-07,
1186
+ "loss": 0.0368,
1187
+ "reward": 0.4791666641831398,
1188
+ "reward_std": 0.41904643177986145,
1189
+ "rewards/accuracy_reward": 0.11111111473292112,
1190
  "rewards/difficulty_following_reward": 0.0,
1191
+ "rewards/format_reward": 0.2569444514811039,
1192
  "step": 74,
1193
+ "vanishing_advantage_ratio": 0.2500000074505806
1194
  },
1195
  {
1196
  "clip_ratio": 0.0,
1197
+ "completion_length": 3035.8333740234375,
1198
  "epoch": 0.12853470437017994,
1199
+ "grad_norm": 0.13360603153705597,
1200
+ "kl": 0.003421783447265625,
1201
  "learning_rate": 2.6074557564105724e-07,
1202
+ "loss": 0.081,
1203
+ "reward": 0.4583333432674408,
1204
+ "reward_std": 0.5805492922663689,
1205
+ "rewards/accuracy_reward": 0.13888889085501432,
1206
  "rewards/difficulty_following_reward": 0.0,
1207
+ "rewards/format_reward": 0.180555559694767,
1208
  "step": 75,
1209
+ "vanishing_advantage_ratio": 0.1666666716337204
1210
  },
1211
  {
1212
  "clip_ratio": 0.0,
1213
+ "completion_length": 2006.5000305175781,
1214
  "epoch": 0.13024850042844902,
1215
+ "grad_norm": 0.186117023229599,
1216
+ "kl": 0.003589630126953125,
1217
  "learning_rate": 2.488912271385139e-07,
1218
+ "loss": 0.0093,
1219
+ "reward": 0.5555555522441864,
1220
+ "reward_std": 0.5216063931584358,
1221
+ "rewards/accuracy_reward": 0.13888888992369175,
1222
  "rewards/difficulty_following_reward": 0.0,
1223
+ "rewards/format_reward": 0.2777777835726738,
1224
  "step": 76,
1225
+ "vanishing_advantage_ratio": 0.1666666716337204
1226
  },
1227
  {
1228
  "clip_ratio": 0.0,
1229
+ "completion_length": 2852.1111450195312,
1230
  "epoch": 0.1319622964867181,
1231
+ "grad_norm": 0.14473779499530792,
1232
+ "kl": 0.003143310546875,
1233
  "learning_rate": 2.374037332934512e-07,
1234
+ "loss": 0.094,
1235
+ "reward": 0.4930555522441864,
1236
+ "reward_std": 0.5302782356739044,
1237
+ "rewards/accuracy_reward": 0.16666667070239782,
1238
  "rewards/difficulty_following_reward": 0.0,
1239
+ "rewards/format_reward": 0.15972222900018096,
1240
  "step": 77,
1241
+ "vanishing_advantage_ratio": 0.2500000074505806
1242
  },
1243
  {
1244
  "clip_ratio": 0.0,
1245
+ "completion_length": 3228.5277709960938,
1246
  "epoch": 0.13367609254498714,
1247
+ "grad_norm": 0.12343769520521164,
1248
+ "kl": 0.003009796142578125,
1249
  "learning_rate": 2.2629708984760706e-07,
1250
+ "loss": -0.016,
1251
+ "reward": 0.2638888843357563,
1252
+ "reward_std": 0.3488573133945465,
1253
+ "rewards/accuracy_reward": 0.08333333488553762,
1254
  "rewards/difficulty_following_reward": 0.0,
1255
+ "rewards/format_reward": 0.09722222248092294,
1256
  "step": 78,
1257
+ "vanishing_advantage_ratio": 0.5000000074505806
1258
  },
1259
  {
1260
  "clip_ratio": 0.0,
1261
+ "completion_length": 2360.986083984375,
1262
  "epoch": 0.1353898886032562,
1263
+ "grad_norm": 0.18473461270332336,
1264
+ "kl": 0.0040740966796875,
1265
  "learning_rate": 2.1558482853517253e-07,
1266
+ "loss": 0.0521,
1267
+ "reward": 0.4027777761220932,
1268
+ "reward_std": 0.3384508527815342,
1269
+ "rewards/accuracy_reward": 0.06944444496184587,
1270
  "rewards/difficulty_following_reward": 0.0,
1271
+ "rewards/format_reward": 0.2638888955116272,
1272
  "step": 79,
1273
+ "vanishing_advantage_ratio": 0.1805555671453476
1274
  },
1275
  {
1276
  "clip_ratio": 0.0,
1277
+ "completion_length": 3112.1111450195312,
1278
  "epoch": 0.13710368466152528,
1279
+ "grad_norm": 0.12150842696428299,
1280
+ "kl": 0.003692626953125,
1281
  "learning_rate": 2.0528000059645995e-07,
1282
+ "loss": 0.072,
1283
+ "reward": 0.3125000009313226,
1284
+ "reward_std": 0.3745912276208401,
1285
+ "rewards/accuracy_reward": 0.08333333395421505,
1286
  "rewards/difficulty_following_reward": 0.0,
1287
+ "rewards/format_reward": 0.14583333674818277,
1288
  "step": 80,
1289
+ "vanishing_advantage_ratio": 0.416666679084301
1290
  },
1291
  {
1292
  "clip_ratio": 0.0,
1293
+ "completion_length": 2774.513916015625,
1294
  "epoch": 0.13881748071979436,
1295
+ "grad_norm": 0.1500415951013565,
1296
+ "kl": 0.003559112548828125,
1297
  "learning_rate": 1.9539516087697517e-07,
1298
+ "loss": 0.0482,
1299
+ "reward": 0.41666668001562357,
1300
+ "reward_std": 0.4251419398933649,
1301
+ "rewards/accuracy_reward": 0.1388888955116272,
1302
  "rewards/difficulty_following_reward": 0.0,
1303
+ "rewards/format_reward": 0.13888889271765947,
1304
  "step": 81,
1305
+ "vanishing_advantage_ratio": 0.3333333432674408
1306
  },
1307
  {
1308
  "clip_ratio": 0.0,
1309
+ "completion_length": 2676.375,
1310
  "epoch": 0.1405312767780634,
1311
+ "grad_norm": 0.1790565550327301,
1312
+ "kl": 0.003726959228515625,
1313
  "learning_rate": 1.8594235253127372e-07,
1314
+ "loss": 0.0556,
1315
+ "reward": 0.506944440305233,
1316
+ "reward_std": 0.4319311436265707,
1317
+ "rewards/accuracy_reward": 0.13888889271765947,
1318
  "rewards/difficulty_following_reward": 0.0,
1319
+ "rewards/format_reward": 0.2291666716337204,
1320
  "step": 82,
1321
+ "vanishing_advantage_ratio": 0.2777777872979641
1322
  },
1323
  {
1324
  "clip_ratio": 0.0,
1325
+ "completion_length": 2491.4861755371094,
1326
  "epoch": 0.14224507283633248,
1327
+ "grad_norm": 0.16082800924777985,
1328
+ "kl": 0.0035247802734375,
1329
  "learning_rate": 1.7693309235023127e-07,
1330
+ "loss": 0.0628,
1331
+ "reward": 0.4444444477558136,
1332
+ "reward_std": 0.41663530841469765,
1333
+ "rewards/accuracy_reward": 0.11111111287027597,
1334
  "rewards/difficulty_following_reward": 0.0,
1335
+ "rewards/format_reward": 0.2222222276031971,
1336
  "step": 83,
1337
+ "vanishing_advantage_ratio": 0.34722223319113255
1338
  },
1339
  {
1340
  "clip_ratio": 0.0,
1341
+ "completion_length": 2634.1805725097656,
1342
  "epoch": 0.14395886889460155,
1343
+ "grad_norm": 0.15972262620925903,
1344
+ "kl": 0.003818511962890625,
1345
  "learning_rate": 1.6837835672960831e-07,
1346
+ "loss": 0.0673,
1347
+ "reward": 0.4375000074505806,
1348
+ "reward_std": 0.48190969228744507,
1349
+ "rewards/accuracy_reward": 0.09722222294658422,
1350
  "rewards/difficulty_following_reward": 0.0,
1351
+ "rewards/format_reward": 0.24305556062608957,
1352
  "step": 84,
1353
+ "vanishing_advantage_ratio": 0.2500000074505806
1354
  },
1355
  {
1356
  "clip_ratio": 0.0,
1357
+ "completion_length": 2900.65283203125,
1358
  "epoch": 0.1456726649528706,
1359
+ "grad_norm": 0.15328432619571686,
1360
+ "kl": 0.003406524658203125,
1361
  "learning_rate": 1.6028856829700258e-07,
1362
+ "loss": 0.0828,
1363
+ "reward": 0.4861111044883728,
1364
+ "reward_std": 0.4899628609418869,
1365
+ "rewards/accuracy_reward": 0.1666666679084301,
1366
  "rewards/difficulty_following_reward": 0.0,
1367
+ "rewards/format_reward": 0.15277778171002865,
1368
  "step": 85,
1369
  "vanishing_advantage_ratio": 0.2500000074505806
1370
  },
1371
  {
1372
  "clip_ratio": 0.0,
1373
+ "completion_length": 2665.902801513672,
1374
  "epoch": 0.14738646101113967,
1375
+ "grad_norm": 0.16509687900543213,
1376
+ "kl": 0.0034637451171875,
1377
  "learning_rate": 1.5267358321348285e-07,
1378
+ "loss": 0.0968,
1379
+ "reward": 0.24999999813735485,
1380
+ "reward_std": 0.30046750232577324,
1381
+ "rewards/accuracy_reward": 0.02777777798473835,
1382
  "rewards/difficulty_following_reward": 0.0,
1383
+ "rewards/format_reward": 0.19444444589316845,
1384
  "step": 86,
1385
+ "vanishing_advantage_ratio": 0.26388889737427235
1386
  },
1387
  {
1388
  "clip_ratio": 0.0,
1389
+ "completion_length": 3268.4306030273438,
1390
  "epoch": 0.14910025706940874,
1391
+ "grad_norm": 0.10825953632593155,
1392
+ "kl": 0.003009796142578125,
1393
  "learning_rate": 1.4554267916537495e-07,
1394
+ "loss": 0.0666,
1395
+ "reward": 0.4027777696028352,
1396
+ "reward_std": 0.3645976707339287,
1397
+ "rewards/accuracy_reward": 0.15277778171002865,
1398
  "rewards/difficulty_following_reward": 0.0,
1399
+ "rewards/format_reward": 0.09722222574055195,
1400
  "step": 87,
1401
  "vanishing_advantage_ratio": 0.3333333432674408
1402
  },
1403
  {
1404
  "clip_ratio": 0.0,
1405
+ "completion_length": 3164.0277709960938,
1406
  "epoch": 0.15081405312767782,
1407
+ "grad_norm": 0.1394396275281906,
1408
+ "kl": 0.003810882568359375,
1409
  "learning_rate": 1.3890454406082956e-07,
1410
+ "loss": 0.0838,
1411
+ "reward": 0.31250000558793545,
1412
+ "reward_std": 0.385826725512743,
1413
+ "rewards/accuracy_reward": 0.09722222574055195,
1414
  "rewards/difficulty_following_reward": 0.0,
1415
+ "rewards/format_reward": 0.11805555876344442,
1416
  "step": 88,
1417
+ "vanishing_advantage_ratio": 0.416666679084301
1418
  },
1419
  {
1420
  "clip_ratio": 0.0,
1421
+ "completion_length": 2483.1527709960938,
1422
  "epoch": 0.15252784918594686,
1423
+ "grad_norm": 0.15645141899585724,
1424
+ "kl": 0.00453948974609375,
1425
  "learning_rate": 1.3276726544494571e-07,
1426
+ "loss": 0.0977,
1427
+ "reward": 0.34722223225980997,
1428
+ "reward_std": 0.3350673224776983,
1429
+ "rewards/accuracy_reward": 0.055555556900799274,
1430
  "rewards/difficulty_following_reward": 0.0,
1431
+ "rewards/format_reward": 0.23611111659556627,
1432
  "step": 89,
1433
+ "vanishing_advantage_ratio": 0.4166666716337204
1434
  },
1435
  {
1436
  "clip_ratio": 0.0,
1437
+ "completion_length": 2722.499969482422,
1438
  "epoch": 0.15424164524421594,
1439
+ "grad_norm": 0.150289386510849,
1440
+ "kl": 0.003353118896484375,
1441
  "learning_rate": 1.2713832064634125e-07,
1442
+ "loss": 0.0805,
1443
+ "reward": 0.27777778171002865,
1444
+ "reward_std": 0.3167807497084141,
1445
  "rewards/accuracy_reward": 0.055555556900799274,
1446
  "rewards/difficulty_following_reward": 0.0,
1447
+ "rewards/format_reward": 0.16666666697710752,
1448
  "step": 90,
1449
+ "vanishing_advantage_ratio": 0.5138888955116272
1450
  },
1451
  {
1452
  "clip_ratio": 0.0,
1453
+ "completion_length": 2716.0833129882812,
1454
  "epoch": 0.155955441302485,
1455
+ "grad_norm": 0.17992275953292847,
1456
+ "kl": 0.004726409912109375,
1457
  "learning_rate": 1.220245676671809e-07,
1458
+ "loss": 0.0844,
1459
+ "reward": 0.604166679084301,
1460
+ "reward_std": 0.6643020883202553,
1461
+ "rewards/accuracy_reward": 0.18055555690079927,
1462
  "rewards/difficulty_following_reward": 0.0,
1463
+ "rewards/format_reward": 0.2430555559694767,
1464
  "step": 91,
1465
+ "vanishing_advantage_ratio": 0.0833333358168602
1466
  },
1467
  {
1468
  "clip_ratio": 0.0,
1469
+ "completion_length": 2535.277801513672,
1470
  "epoch": 0.15766923736075408,
1471
+ "grad_norm": 0.15826836228370667,
1472
+ "kl": 0.00363922119140625,
1473
  "learning_rate": 1.1743223682775649e-07,
1474
+ "loss": 0.087,
1475
+ "reward": 0.3472222238779068,
1476
+ "reward_std": 0.333256833255291,
1477
+ "rewards/accuracy_reward": 0.08333333395421505,
1478
  "rewards/difficulty_following_reward": 0.0,
1479
+ "rewards/format_reward": 0.180555559694767,
1480
  "step": 92,
1481
+ "vanishing_advantage_ratio": 0.4166666716337204
1482
  },
1483
  {
1484
  "clip_ratio": 0.0,
1485
+ "completion_length": 2722.6806030273438,
1486
  "epoch": 0.15938303341902313,
1487
+ "grad_norm": 0.15519429743289948,
1488
+ "kl": 0.003940582275390625,
1489
  "learning_rate": 1.1336692317580158e-07,
1490
+ "loss": 0.0829,
1491
+ "reward": 0.5763888955116272,
1492
+ "reward_std": 0.48380210250616074,
1493
+ "rewards/accuracy_reward": 0.20833334047347307,
1494
  "rewards/difficulty_following_reward": 0.0,
1495
+ "rewards/format_reward": 0.1597222243435681,
1496
  "step": 93,
1497
+ "vanishing_advantage_ratio": 0.1666666716337204
1498
  },
1499
  {
1500
  "clip_ratio": 0.0,
1501
+ "completion_length": 2634.277801513672,
1502
  "epoch": 0.1610968294772922,
1503
+ "grad_norm": 0.1622547060251236,
1504
+ "kl": 0.004276275634765625,
1505
  "learning_rate": 1.0983357966978745e-07,
1506
+ "loss": 0.0776,
1507
+ "reward": 0.3888888955116272,
1508
+ "reward_std": 0.31960033625364304,
1509
+ "rewards/accuracy_reward": 0.1111111119389534,
1510
  "rewards/difficulty_following_reward": 0.0,
1511
+ "rewards/format_reward": 0.1666666679084301,
1512
  "step": 94,
1513
  "vanishing_advantage_ratio": 0.4166666716337204
1514
  },
1515
  {
1516
  "clip_ratio": 0.0,
1517
+ "completion_length": 2685.6666564941406,
1518
  "epoch": 0.16281062553556128,
1519
+ "grad_norm": 0.19985106587409973,
1520
+ "kl": 0.00397491455078125,
1521
  "learning_rate": 1.068365111445064e-07,
1522
+ "loss": 0.0964,
1523
+ "reward": 0.4166666641831398,
1524
+ "reward_std": 0.5311803258955479,
1525
+ "rewards/accuracy_reward": 0.11111111473292112,
1526
  "rewards/difficulty_following_reward": 0.0,
1527
+ "rewards/format_reward": 0.19444444589316845,
1528
  "step": 95,
1529
  "vanishing_advantage_ratio": 0.0833333358168602
1530
  },
1531
  {
1532
  "clip_ratio": 0.0,
1533
+ "completion_length": 2594.0972290039062,
1534
  "epoch": 0.16452442159383032,
1535
+ "grad_norm": 0.16063077747821808,
1536
+ "kl": 0.0034027099609375,
1537
  "learning_rate": 1.0437936906629334e-07,
1538
+ "loss": 0.0895,
1539
+ "reward": 0.5902777835726738,
1540
+ "reward_std": 0.5667251460254192,
1541
+ "rewards/accuracy_reward": 0.2083333320915699,
1542
  "rewards/difficulty_following_reward": 0.0,
1543
+ "rewards/format_reward": 0.17361111380159855,
1544
  "step": 96,
1545
+ "vanishing_advantage_ratio": 0.2500000111758709
1546
  },
1547
  {
1548
  "clip_ratio": 0.0,
1549
+ "completion_length": 3230.77783203125,
1550
  "epoch": 0.1662382176520994,
1551
+ "grad_norm": 0.19083616137504578,
1552
+ "kl": 0.00445556640625,
1553
  "learning_rate": 1.0246514708427701e-07,
1554
+ "loss": 0.0987,
1555
+ "reward": 0.29861112125217915,
1556
+ "reward_std": 0.4562392868101597,
1557
+ "rewards/accuracy_reward": 0.08333333488553762,
1558
  "rewards/difficulty_following_reward": 0.0,
1559
+ "rewards/format_reward": 0.13194444589316845,
1560
  "step": 97,
1561
+ "vanishing_advantage_ratio": 0.0
1562
  },
1563
  {
1564
  "clip_ratio": 0.0,
1565
+ "completion_length": 2813.527801513672,
1566
  "epoch": 0.16795201371036847,
1567
+ "grad_norm": 0.1406376212835312,
1568
+ "kl": 0.00418853759765625,
1569
  "learning_rate": 1.0109617738307911e-07,
1570
+ "loss": 0.0603,
1571
+ "reward": 0.4027777872979641,
1572
+ "reward_std": 0.40550027787685394,
1573
+ "rewards/accuracy_reward": 0.12500000186264515,
1574
  "rewards/difficulty_following_reward": 0.0,
1575
+ "rewards/format_reward": 0.15277777891606092,
1576
  "step": 98,
1577
+ "vanishing_advantage_ratio": 0.3333333432674408
1578
  },
1579
  {
1580
  "clip_ratio": 0.0,
1581
+ "completion_length": 2933.4166870117188,
1582
  "epoch": 0.16966580976863754,
1583
+ "grad_norm": 0.1805686354637146,
1584
+ "kl": 0.0044708251953125,
1585
  "learning_rate": 1.002741278414069e-07,
1586
+ "loss": 0.1209,
1587
+ "reward": 0.4236111082136631,
1588
+ "reward_std": 0.5723845213651657,
1589
+ "rewards/accuracy_reward": 0.1250000037252903,
1590
  "rewards/difficulty_following_reward": 0.0,
1591
+ "rewards/format_reward": 0.1736111156642437,
1592
  "step": 99,
1593
+ "vanishing_advantage_ratio": 0.0833333358168602
1594
  },
1595
  {
1596
  "clip_ratio": 0.0,
1597
+ "completion_length": 2795.6805419921875,
1598
  "epoch": 0.1713796058269066,
1599
+ "grad_norm": 0.16270823776721954,
1600
+ "kl": 0.004749298095703125,
1601
  "learning_rate": 1e-07,
1602
+ "loss": 0.1155,
1603
+ "reward": 0.24999999720603228,
1604
+ "reward_std": 0.32094707898795605,
1605
+ "rewards/accuracy_reward": 0.02777777798473835,
1606
  "rewards/difficulty_following_reward": 0.0,
1607
+ "rewards/format_reward": 0.19444444309920073,
1608
  "step": 100,
1609
+ "vanishing_advantage_ratio": 0.416666679084301
1610
  },
1611
  {
1612
  "epoch": 0.1713796058269066,
1613
  "step": 100,
1614
  "total_flos": 0.0,
1615
+ "train_loss": 0.0719883194193244,
1616
+ "train_runtime": 18385.1788,
1617
+ "train_samples_per_second": 0.392,
1618
+ "train_steps_per_second": 0.005
1619
  }
1620
  ],
1621
  "logging_steps": 1,