wlzhou commited on
Commit
7f9b019
·
verified ·
1 Parent(s): 481c277

Model save

Browse files
Files changed (4) hide show
  1. README.md +1 -1
  2. all_results.json +4 -4
  3. train_results.json +4 -4
  4. trainer_state.json +468 -460
README.md CHANGED
@@ -27,7 +27,7 @@ print(output["generated_text"])
27
 
28
  ## Training procedure
29
 
30
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/need-ai/open-r1-basic/runs/vg6arpjq)
31
 
32
 
33
  This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
 
27
 
28
  ## Training procedure
29
 
30
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/need-ai/open-r1-basic/runs/9iazxyaj)
31
 
32
 
33
  This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.0,
4
- "train_runtime": 0.8416,
5
  "train_samples": 20,
6
- "train_samples_per_second": 237.641,
7
- "train_steps_per_second": 59.41
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 62.185223487904295,
4
+ "train_runtime": 1153.4055,
5
  "train_samples": 20,
6
+ "train_samples_per_second": 0.173,
7
+ "train_steps_per_second": 0.043
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.0,
4
- "train_runtime": 0.8416,
5
  "train_samples": 20,
6
- "train_samples_per_second": 237.641,
7
- "train_steps_per_second": 59.41
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 62.185223487904295,
4
+ "train_runtime": 1153.4055,
5
  "train_samples": 20,
6
+ "train_samples_per_second": 0.173,
7
+ "train_steps_per_second": 0.043
8
  }
trainer_state.json CHANGED
@@ -10,802 +10,810 @@
10
  "log_history": [
11
  {
12
  "clip_ratio": 0.0,
13
- "completion_length": 452.3125,
14
  "epoch": 0.2,
15
- "grad_norm": 2.0023154714436275,
16
  "kl": 0.0,
17
  "learning_rate": 4.000000000000001e-06,
18
- "loss": 0.1503,
19
- "reward": 0.375,
20
- "reward_std": 0.49297715723514557,
21
  "rewards/accuracy_reward_staging": 0.0,
22
- "rewards/format_reward": 0.375,
23
  "rewards/format_reward_staging": 0.0,
24
  "step": 1
25
  },
26
  {
27
  "clip_ratio": 0.0,
28
- "completion_length": 413.15625,
29
  "epoch": 0.4,
30
- "grad_norm": 2.341925597361157,
31
  "kl": 0.0,
32
  "learning_rate": 8.000000000000001e-06,
33
- "loss": 0.1794,
34
- "reward": 2.078125,
35
- "reward_std": 6.655139595270157,
36
  "rewards/accuracy_reward_staging": 0.015625,
37
- "rewards/format_reward": 0.5,
38
  "rewards/format_reward_staging": 0.015625,
39
  "step": 2
40
  },
41
  {
42
  "clip_ratio": 0.0,
43
- "completion_length": 457.78125,
44
  "epoch": 0.6,
45
- "grad_norm": 2.1132901651276517,
46
- "kl": 0.00373077392578125,
47
  "learning_rate": 1.2e-05,
48
- "loss": 0.2012,
49
- "reward": 0.265625,
50
- "reward_std": 0.42685678601264954,
51
- "rewards/accuracy_reward_staging": 0.0,
52
- "rewards/format_reward": 0.265625,
53
  "rewards/format_reward_staging": 0.0,
54
  "step": 3
55
  },
56
  {
57
  "clip_ratio": 0.0,
58
- "completion_length": 392.5,
59
  "epoch": 0.8,
60
- "grad_norm": 1.9147042924475104,
61
- "kl": 0.0198211669921875,
62
  "learning_rate": 1.6000000000000003e-05,
63
- "loss": 0.1991,
64
- "reward": 0.4375,
65
- "reward_std": 0.5018647164106369,
66
- "rewards/accuracy_reward_staging": 0.0,
67
- "rewards/format_reward": 0.4375,
68
  "rewards/format_reward_staging": 0.0,
69
  "step": 4
70
  },
71
  {
72
  "clip_ratio": 0.0,
73
- "completion_length": 395.9375,
74
  "epoch": 1.0,
75
- "grad_norm": 1.740620186312659,
76
- "kl": 0.1175537109375,
77
  "learning_rate": 2e-05,
78
- "loss": 0.2106,
79
- "reward": 0.75,
80
- "reward_std": 0.4110434949398041,
81
- "rewards/accuracy_reward_staging": 0.0,
82
- "rewards/format_reward": 0.75,
83
  "rewards/format_reward_staging": 0.0,
84
  "step": 5
85
  },
86
  {
87
  "clip_ratio": 0.0,
88
- "completion_length": 223.25,
89
  "epoch": 1.2,
90
- "grad_norm": 2.378166325570676,
91
- "kl": 0.38330078125,
92
  "learning_rate": 1.9975640502598243e-05,
93
- "loss": 0.093,
94
- "reward": 0.96875,
95
- "reward_std": 0.125,
96
- "rewards/accuracy_reward_staging": 0.0,
97
- "rewards/format_reward": 0.9375,
98
- "rewards/format_reward_staging": 0.03125,
99
  "step": 6
100
  },
101
  {
102
  "clip_ratio": 0.0,
103
- "completion_length": 219.921875,
104
  "epoch": 1.4,
105
- "grad_norm": 365.05292792030957,
106
- "kl": 29.92138671875,
107
  "learning_rate": 1.9902680687415704e-05,
108
- "loss": 0.0155,
109
- "reward": 4.109375,
110
- "reward_std": 8.601625442504883,
111
- "rewards/accuracy_reward_staging": 0.03125,
112
- "rewards/format_reward": 0.984375,
113
- "rewards/format_reward_staging": 0.0,
114
  "step": 7
115
  },
116
  {
117
  "clip_ratio": 0.0,
118
- "completion_length": 311.140625,
119
  "epoch": 1.6,
120
- "grad_norm": 4.278634190070862,
121
- "kl": 0.9921875,
122
  "learning_rate": 1.9781476007338058e-05,
123
- "loss": 0.3001,
124
- "reward": 0.875,
125
- "reward_std": 0.3340607285499573,
126
- "rewards/accuracy_reward_staging": 0.0,
127
  "rewards/format_reward": 0.84375,
128
- "rewards/format_reward_staging": 0.03125,
129
  "step": 8
130
  },
131
  {
132
  "clip_ratio": 0.0,
133
- "completion_length": 273.140625,
134
  "epoch": 1.8,
135
- "grad_norm": 2.7938623716418176,
136
- "kl": 1.033203125,
137
  "learning_rate": 1.961261695938319e-05,
138
- "loss": 0.1128,
139
- "reward": 0.96875,
140
- "reward_std": 0.08539125323295593,
141
- "rewards/accuracy_reward_staging": 0.0,
142
- "rewards/format_reward": 0.96875,
143
- "rewards/format_reward_staging": 0.0,
144
  "step": 9
145
  },
146
  {
147
  "epoch": 2.0,
148
- "grad_norm": 45.33623454848651,
149
  "learning_rate": 1.9396926207859085e-05,
150
- "loss": 0.0778,
151
  "step": 10
152
  },
153
  {
154
  "epoch": 2.0,
155
  "eval_clip_ratio": 0.0,
156
- "eval_completion_length": 216.453125,
157
- "eval_kl": 0.86875,
158
- "eval_loss": 0.32041892409324646,
159
- "eval_reward": 0.909375,
160
- "eval_reward_std": 0.25501468777656555,
161
- "eval_rewards/accuracy_reward_staging": 0.0,
162
- "eval_rewards/format_reward": 0.89375,
163
- "eval_rewards/format_reward_staging": 0.015625,
164
- "eval_runtime": 99.1923,
165
- "eval_samples_per_second": 0.202,
166
- "eval_steps_per_second": 0.02,
167
  "step": 10
168
  },
169
  {
170
  "clip_ratio": 0.0,
171
- "completion_length": 179.9765625,
172
  "epoch": 2.2,
173
- "grad_norm": 4.279708471245034,
174
- "kl": 5.43017578125,
175
  "learning_rate": 1.913545457642601e-05,
176
- "loss": 0.219,
177
- "reward": 0.9296875,
178
- "reward_std": 0.19234732538461685,
179
- "rewards/accuracy_reward_staging": 0.0,
180
- "rewards/format_reward": 0.890625,
181
- "rewards/format_reward_staging": 0.0390625,
182
  "step": 11
183
  },
184
  {
185
  "clip_ratio": 0.0,
186
- "completion_length": 215.078125,
187
  "epoch": 2.4,
188
- "grad_norm": 2.3327242968897277,
189
- "kl": 1.5615234375,
190
  "learning_rate": 1.8829475928589272e-05,
191
- "loss": 0.1519,
192
- "reward": 0.859375,
193
- "reward_std": 0.21347813308238983,
194
- "rewards/accuracy_reward_staging": 0.0,
195
- "rewards/format_reward": 0.84375,
196
- "rewards/format_reward_staging": 0.015625,
197
  "step": 12
198
  },
199
  {
200
  "clip_ratio": 0.0,
201
- "completion_length": 209.71875,
202
  "epoch": 2.6,
203
- "grad_norm": 2.299255809407369,
204
- "kl": 0.724609375,
205
  "learning_rate": 1.848048096156426e-05,
206
- "loss": 0.4741,
207
- "reward": 0.875,
208
- "reward_std": 0.28258590400218964,
209
- "rewards/accuracy_reward_staging": 0.0,
210
- "rewards/format_reward": 0.875,
211
- "rewards/format_reward_staging": 0.0,
212
  "step": 13
213
  },
214
  {
215
  "clip_ratio": 0.0,
216
- "completion_length": 139.953125,
217
  "epoch": 2.8,
218
- "grad_norm": 2.6130897341791766,
219
- "kl": 0.94921875,
220
  "learning_rate": 1.8090169943749477e-05,
221
- "loss": 0.2378,
222
- "reward": 0.78125,
223
- "reward_std": 0.42516323924064636,
224
- "rewards/accuracy_reward_staging": 0.0,
225
- "rewards/format_reward": 0.75,
226
- "rewards/format_reward_staging": 0.03125,
227
  "step": 14
228
  },
229
  {
230
  "clip_ratio": 0.0,
231
- "completion_length": 188.875,
232
  "epoch": 3.0,
233
- "grad_norm": 2.5218002560154344,
234
- "kl": 0.8828125,
235
  "learning_rate": 1.766044443118978e-05,
236
- "loss": 0.255,
237
- "reward": 0.75,
238
- "reward_std": 0.416047140955925,
239
- "rewards/accuracy_reward_staging": 0.0,
240
- "rewards/format_reward": 0.75,
241
- "rewards/format_reward_staging": 0.0,
242
  "step": 15
243
  },
244
  {
245
  "clip_ratio": 0.0,
246
- "completion_length": 121.71875,
247
  "epoch": 3.2,
248
- "grad_norm": 3973.2143623390243,
249
- "kl": 140.78125,
250
  "learning_rate": 1.7193398003386514e-05,
251
- "loss": 4.5241,
252
- "reward": 0.78125,
253
- "reward_std": 0.3996476083993912,
254
- "rewards/accuracy_reward_staging": 0.0,
255
- "rewards/format_reward": 0.765625,
256
- "rewards/format_reward_staging": 0.015625,
257
  "step": 16
258
  },
259
  {
260
  "clip_ratio": 0.0,
261
- "completion_length": 95.203125,
262
  "epoch": 3.4,
263
- "grad_norm": 3.1402184707771608,
264
- "kl": 1.0966796875,
265
  "learning_rate": 1.6691306063588583e-05,
266
- "loss": 0.1211,
267
- "reward": 0.90625,
268
- "reward_std": 0.2561737596988678,
269
- "rewards/accuracy_reward_staging": 0.0,
270
- "rewards/format_reward": 0.90625,
271
- "rewards/format_reward_staging": 0.0,
272
  "step": 17
273
  },
274
  {
275
  "clip_ratio": 0.0,
276
- "completion_length": 117.96875,
277
  "epoch": 3.6,
278
- "grad_norm": 18.543717938463967,
279
- "kl": 2.734375,
280
  "learning_rate": 1.6156614753256583e-05,
281
- "loss": 0.0948,
282
- "reward": 0.78125,
283
- "reward_std": 0.37276527285575867,
284
- "rewards/accuracy_reward_staging": 0.0,
285
- "rewards/format_reward": 0.765625,
286
- "rewards/format_reward_staging": 0.015625,
287
  "step": 18
288
  },
289
  {
290
  "clip_ratio": 0.0,
291
- "completion_length": 122.09375,
292
  "epoch": 3.8,
293
- "grad_norm": 85.01856100880318,
294
- "kl": 2.21484375,
295
  "learning_rate": 1.5591929034707468e-05,
296
- "loss": 0.2628,
297
- "reward": 0.75,
298
- "reward_std": 0.43655143678188324,
299
- "rewards/accuracy_reward_staging": 0.0,
300
- "rewards/format_reward": 0.75,
301
- "rewards/format_reward_staging": 0.0,
302
  "step": 19
303
  },
304
  {
305
  "epoch": 4.0,
306
- "grad_norm": 7.534201508284574,
307
  "learning_rate": 1.5000000000000002e-05,
308
- "loss": 0.3282,
309
  "step": 20
310
  },
311
  {
312
  "epoch": 4.0,
313
  "eval_clip_ratio": 0.0,
314
- "eval_completion_length": 117.20625,
315
- "eval_kl": 1.8859375,
316
- "eval_loss": 0.4934101700782776,
317
- "eval_reward": 0.865625,
318
- "eval_reward_std": 0.3061607271432877,
319
- "eval_rewards/accuracy_reward_staging": 0.0,
320
- "eval_rewards/format_reward": 0.8625,
321
- "eval_rewards/format_reward_staging": 0.003125,
322
- "eval_runtime": 82.791,
323
- "eval_samples_per_second": 0.242,
324
  "eval_steps_per_second": 0.024,
325
  "step": 20
326
  },
327
  {
328
  "clip_ratio": 0.0,
329
- "completion_length": 142.3046875,
330
  "epoch": 4.2,
331
- "grad_norm": 3.431865738435327,
332
- "kl": 1.1591796875,
333
  "learning_rate": 1.4383711467890776e-05,
334
- "loss": 0.5598,
335
- "reward": 0.8828125,
336
- "reward_std": 0.23504295200109482,
337
- "rewards/accuracy_reward_staging": 0.0,
338
- "rewards/format_reward": 0.8828125,
339
- "rewards/format_reward_staging": 0.0,
340
  "step": 21
341
  },
342
  {
343
  "clip_ratio": 0.0,
344
- "completion_length": 108.421875,
345
  "epoch": 4.4,
346
- "grad_norm": 28.582562125442966,
347
- "kl": 2.015625,
348
  "learning_rate": 1.3746065934159123e-05,
349
- "loss": 0.3805,
350
- "reward": 0.890625,
351
- "reward_std": 0.3186737596988678,
352
- "rewards/accuracy_reward_staging": 0.0,
353
- "rewards/format_reward": 0.890625,
354
- "rewards/format_reward_staging": 0.0,
355
  "step": 22
356
  },
357
  {
358
  "clip_ratio": 0.0,
359
- "completion_length": 160.375,
360
  "epoch": 4.6,
361
- "grad_norm": 182.12056251763224,
362
- "kl": 3.80859375,
363
  "learning_rate": 1.3090169943749475e-05,
364
- "loss": 0.5022,
365
- "reward": 0.8125,
366
- "reward_std": 0.240902841091156,
367
- "rewards/accuracy_reward_staging": 0.0,
368
- "rewards/format_reward": 0.8125,
369
- "rewards/format_reward_staging": 0.0,
370
  "step": 23
371
  },
372
  {
373
  "clip_ratio": 0.0,
374
- "completion_length": 104.640625,
375
  "epoch": 4.8,
376
- "grad_norm": 7.658648872736992,
377
- "kl": 1.234375,
378
  "learning_rate": 1.2419218955996677e-05,
379
- "loss": 0.3389,
380
- "reward": 0.953125,
381
- "reward_std": 0.10077822208404541,
382
- "rewards/accuracy_reward_staging": 0.0,
383
- "rewards/format_reward": 0.953125,
384
- "rewards/format_reward_staging": 0.0,
385
  "step": 24
386
  },
387
  {
388
  "clip_ratio": 0.0,
389
- "completion_length": 336.75,
390
  "epoch": 5.0,
391
- "grad_norm": 1298.4886281780803,
392
- "kl": 59.49609375,
393
  "learning_rate": 1.1736481776669307e-05,
394
- "loss": 1.2749,
395
- "reward": 0.46875,
396
- "reward_std": 0.2561737596988678,
397
- "rewards/accuracy_reward_staging": 0.0,
398
- "rewards/format_reward": 0.46875,
399
- "rewards/format_reward_staging": 0.0,
400
  "step": 25
401
  },
402
  {
403
  "clip_ratio": 0.0,
404
- "completion_length": 486.5,
405
  "epoch": 5.2,
406
- "grad_norm": 2.214221470988038,
407
- "kl": 1.287109375,
408
  "learning_rate": 1.1045284632676535e-05,
409
- "loss": 0.1667,
410
- "reward": 0.328125,
411
- "reward_std": 0.11967839300632477,
412
- "rewards/accuracy_reward_staging": 0.0,
413
  "rewards/format_reward": 0.328125,
414
- "rewards/format_reward_staging": 0.0,
415
  "step": 26
416
  },
417
  {
418
  "clip_ratio": 0.0,
419
- "completion_length": 143.078125,
420
  "epoch": 5.4,
421
- "grad_norm": 18.952394918879374,
422
- "kl": 1.109375,
423
  "learning_rate": 1.0348994967025012e-05,
424
- "loss": 0.6906,
425
- "reward": 0.875,
426
- "reward_std": 0.2675696462392807,
427
- "rewards/accuracy_reward_staging": 0.0,
428
- "rewards/format_reward": 0.875,
429
- "rewards/format_reward_staging": 0.0,
430
  "step": 27
431
  },
432
  {
433
  "clip_ratio": 0.0,
434
- "completion_length": 274.375,
435
  "epoch": 5.6,
436
- "grad_norm": 53.50801061354546,
437
- "kl": 1.34375,
438
  "learning_rate": 9.651005032974994e-06,
439
- "loss": 0.5985,
440
- "reward": 0.53125,
441
- "reward_std": 0.4533684104681015,
442
- "rewards/accuracy_reward_staging": 0.0,
443
- "rewards/format_reward": 0.5,
444
- "rewards/format_reward_staging": 0.03125,
445
  "step": 28
446
  },
447
  {
448
  "clip_ratio": 0.0,
449
- "completion_length": 291.953125,
450
  "epoch": 5.8,
451
- "grad_norm": 4827.971254295739,
452
- "kl": 102.302734375,
453
  "learning_rate": 8.954715367323468e-06,
454
- "loss": 10.8379,
455
- "reward": 0.546875,
456
- "reward_std": 0.48989027738571167,
457
- "rewards/accuracy_reward_staging": 0.0,
458
- "rewards/format_reward": 0.53125,
459
- "rewards/format_reward_staging": 0.015625,
460
  "step": 29
461
  },
462
  {
463
  "epoch": 6.0,
464
- "grad_norm": 217.63607240320385,
465
  "learning_rate": 8.263518223330698e-06,
466
- "loss": 1.7709,
467
  "step": 30
468
  },
469
  {
470
  "epoch": 6.0,
471
  "eval_clip_ratio": 0.0,
472
- "eval_completion_length": 166.153125,
473
- "eval_kl": 2.5828125,
474
- "eval_loss": 0.5345276594161987,
475
- "eval_reward": 0.621875,
476
- "eval_reward_std": 0.3898551195859909,
477
- "eval_rewards/accuracy_reward_staging": 0.0,
478
- "eval_rewards/format_reward": 0.609375,
479
- "eval_rewards/format_reward_staging": 0.0125,
480
- "eval_runtime": 81.8428,
481
- "eval_samples_per_second": 0.244,
482
- "eval_steps_per_second": 0.024,
483
  "step": 30
484
  },
485
  {
486
  "clip_ratio": 0.0,
487
- "completion_length": 301.828125,
488
  "epoch": 6.2,
489
- "grad_norm": 46.971884241153774,
490
- "kl": 3.76806640625,
491
  "learning_rate": 7.580781044003324e-06,
492
- "loss": 0.4557,
493
- "reward": 0.5703125,
494
- "reward_std": 0.43463166058063507,
495
- "rewards/accuracy_reward_staging": 0.0,
496
- "rewards/format_reward": 0.546875,
497
- "rewards/format_reward_staging": 0.0234375,
498
  "step": 31
499
  },
500
  {
501
  "clip_ratio": 0.0,
502
- "completion_length": 147.0625,
503
  "epoch": 6.4,
504
- "grad_norm": 28.499738440121313,
505
- "kl": 1.439453125,
506
  "learning_rate": 6.909830056250527e-06,
507
- "loss": 0.3441,
508
- "reward": 0.65625,
509
- "reward_std": 0.40097813308238983,
510
- "rewards/accuracy_reward_staging": 0.0,
511
- "rewards/format_reward": 0.625,
512
- "rewards/format_reward_staging": 0.03125,
513
  "step": 32
514
  },
515
  {
516
  "clip_ratio": 0.0,
517
- "completion_length": 99.34375,
518
  "epoch": 6.6,
519
- "grad_norm": 870.0830446579233,
520
- "kl": 9.68359375,
521
  "learning_rate": 6.25393406584088e-06,
522
- "loss": 0.7721,
523
- "reward": 0.78125,
524
- "reward_std": 0.23680339753627777,
525
  "rewards/accuracy_reward_staging": 0.0,
526
- "rewards/format_reward": 0.765625,
527
- "rewards/format_reward_staging": 0.015625,
528
  "step": 33
529
  },
530
  {
531
  "clip_ratio": 0.0,
532
- "completion_length": 124.046875,
533
  "epoch": 6.8,
534
- "grad_norm": 2684.174855841622,
535
- "kl": 64.169921875,
536
  "learning_rate": 5.616288532109225e-06,
537
- "loss": 6.76,
538
- "reward": 0.5,
539
- "reward_std": 0.4433557987213135,
540
- "rewards/accuracy_reward_staging": 0.0,
541
- "rewards/format_reward": 0.5,
542
- "rewards/format_reward_staging": 0.0,
543
  "step": 34
544
  },
545
  {
546
  "clip_ratio": 0.0,
547
- "completion_length": 94.3125,
548
  "epoch": 7.0,
549
- "grad_norm": 127.36379864799719,
550
- "kl": 8.89453125,
551
  "learning_rate": 5.000000000000003e-06,
552
- "loss": 1.0328,
553
- "reward": 0.5625,
554
- "reward_std": 0.3604728728532791,
555
- "rewards/accuracy_reward_staging": 0.0,
556
- "rewards/format_reward": 0.5625,
557
- "rewards/format_reward_staging": 0.0,
558
  "step": 35
559
  },
560
  {
561
  "clip_ratio": 0.0,
562
- "completion_length": 108.578125,
563
  "epoch": 7.2,
564
- "grad_norm": 84.6244853693584,
565
- "kl": 1.953125,
566
  "learning_rate": 4.408070965292534e-06,
567
- "loss": 0.712,
568
- "reward": 0.6875,
569
- "reward_std": 0.3723389506340027,
570
- "rewards/accuracy_reward_staging": 0.0,
571
- "rewards/format_reward": 0.671875,
572
- "rewards/format_reward_staging": 0.015625,
573
  "step": 36
574
  },
575
  {
576
  "clip_ratio": 0.0,
577
- "completion_length": 74.03125,
578
  "epoch": 7.4,
579
- "grad_norm": 20.011312482681333,
580
- "kl": 2.0625,
581
  "learning_rate": 3.8433852467434175e-06,
582
- "loss": 0.1286,
583
- "reward": 0.46875,
584
- "reward_std": 0.45565588772296906,
585
- "rewards/accuracy_reward_staging": 0.0,
586
- "rewards/format_reward": 0.421875,
587
- "rewards/format_reward_staging": 0.046875,
588
  "step": 37
589
  },
590
  {
591
  "clip_ratio": 0.0,
592
- "completion_length": 115.84375,
593
  "epoch": 7.6,
594
- "grad_norm": 176.37444967265372,
595
- "kl": 6.32421875,
596
  "learning_rate": 3.308693936411421e-06,
597
- "loss": 0.9311,
598
- "reward": 0.59375,
599
- "reward_std": 0.426059752702713,
600
- "rewards/accuracy_reward_staging": 0.0,
601
- "rewards/format_reward": 0.5625,
602
- "rewards/format_reward_staging": 0.03125,
603
  "step": 38
604
  },
605
  {
606
  "clip_ratio": 0.0,
607
- "completion_length": 83.109375,
608
  "epoch": 7.8,
609
- "grad_norm": 36.50985813199302,
610
- "kl": 2.599609375,
611
  "learning_rate": 2.8066019966134907e-06,
612
- "loss": 0.2542,
613
- "reward": 0.59375,
614
- "reward_std": 0.40758590400218964,
615
- "rewards/accuracy_reward_staging": 0.0,
616
- "rewards/format_reward": 0.59375,
617
- "rewards/format_reward_staging": 0.0,
618
  "step": 39
619
  },
620
  {
621
  "epoch": 8.0,
622
- "grad_norm": 46.6545525065277,
623
  "learning_rate": 2.339555568810221e-06,
624
- "loss": 0.3509,
625
  "step": 40
626
  },
627
  {
628
  "epoch": 8.0,
629
  "eval_clip_ratio": 0.0,
630
- "eval_completion_length": 88.775,
631
- "eval_kl": 19.08515625,
632
- "eval_loss": 0.28077805042266846,
633
- "eval_reward": 0.871875,
634
- "eval_reward_std": 1.6184641629457475,
635
- "eval_rewards/accuracy_reward_staging": 0.003125,
636
- "eval_rewards/format_reward": 0.55,
637
- "eval_rewards/format_reward_staging": 0.009375,
638
- "eval_runtime": 43.7519,
639
- "eval_samples_per_second": 0.457,
640
- "eval_steps_per_second": 0.046,
641
  "step": 40
642
  },
643
  {
644
  "clip_ratio": 0.0,
645
- "completion_length": 84.8984375,
646
  "epoch": 8.2,
647
- "grad_norm": 12.769162966798541,
648
- "kl": 3.1865234375,
649
  "learning_rate": 1.9098300562505266e-06,
650
- "loss": 0.192,
651
- "reward": 0.546875,
652
- "reward_std": 0.41088978946208954,
653
- "rewards/accuracy_reward_staging": 0.0,
654
- "rewards/format_reward": 0.5390625,
655
- "rewards/format_reward_staging": 0.0078125,
656
  "step": 41
657
  },
658
  {
659
  "clip_ratio": 0.0,
660
- "completion_length": 107.328125,
661
  "epoch": 8.4,
662
- "grad_norm": 18.888797406982157,
663
- "kl": 2.732421875,
664
  "learning_rate": 1.5195190384357405e-06,
665
- "loss": 0.3458,
666
- "reward": 0.40625,
667
- "reward_std": 0.3996476083993912,
668
- "rewards/accuracy_reward_staging": 0.0,
669
- "rewards/format_reward": 0.40625,
670
- "rewards/format_reward_staging": 0.0,
671
  "step": 42
672
  },
673
  {
674
  "clip_ratio": 0.0,
675
- "completion_length": 144.5625,
676
  "epoch": 8.6,
677
- "grad_norm": 35.47858284655087,
678
- "kl": 3.396484375,
679
  "learning_rate": 1.1705240714107301e-06,
680
- "loss": 0.7257,
681
- "reward": 0.640625,
682
- "reward_std": 0.414138063788414,
683
  "rewards/accuracy_reward_staging": 0.0,
684
- "rewards/format_reward": 0.59375,
685
- "rewards/format_reward_staging": 0.046875,
686
  "step": 43
687
  },
688
  {
689
  "clip_ratio": 0.0,
690
- "completion_length": 82.984375,
691
  "epoch": 8.8,
692
- "grad_norm": 17.28412490205481,
693
- "kl": 2.9765625,
694
  "learning_rate": 8.645454235739903e-07,
695
- "loss": 0.3137,
696
- "reward": 0.546875,
697
- "reward_std": 0.42867644131183624,
698
- "rewards/accuracy_reward_staging": 0.0,
699
- "rewards/format_reward": 0.53125,
700
- "rewards/format_reward_staging": 0.015625,
701
  "step": 44
702
  },
703
  {
704
  "clip_ratio": 0.0,
705
- "completion_length": 62.4375,
706
  "epoch": 9.0,
707
- "grad_norm": 27.53481619242555,
708
- "kl": 4.017578125,
709
  "learning_rate": 6.030737921409169e-07,
710
- "loss": -0.0146,
711
- "reward": 0.46875,
712
- "reward_std": 0.48456867039203644,
713
- "rewards/accuracy_reward_staging": 0.0,
714
- "rewards/format_reward": 0.46875,
715
- "rewards/format_reward_staging": 0.0,
716
  "step": 45
717
  },
718
  {
719
  "clip_ratio": 0.0,
720
- "completion_length": 98.09375,
721
  "epoch": 9.2,
722
- "grad_norm": 21.386737012957905,
723
- "kl": 4.35546875,
724
  "learning_rate": 3.8738304061681107e-07,
725
- "loss": 0.2661,
726
- "reward": 0.421875,
727
- "reward_std": 0.39476001262664795,
728
- "rewards/accuracy_reward_staging": 0.0,
729
- "rewards/format_reward": 0.421875,
730
- "rewards/format_reward_staging": 0.0,
731
  "step": 46
732
  },
733
  {
734
  "clip_ratio": 0.0,
735
- "completion_length": 85.625,
736
  "epoch": 9.4,
737
- "grad_norm": 46.899263555551805,
738
- "kl": 2.951171875,
739
  "learning_rate": 2.1852399266194312e-07,
740
- "loss": 0.5289,
741
- "reward": 0.75,
742
- "reward_std": 0.35296089947223663,
743
- "rewards/accuracy_reward_staging": 0.0,
744
- "rewards/format_reward": 0.75,
745
- "rewards/format_reward_staging": 0.0,
746
  "step": 47
747
  },
748
  {
749
  "clip_ratio": 0.0,
750
- "completion_length": 68.90625,
751
  "epoch": 9.6,
752
- "grad_norm": 23.059473156975322,
753
- "kl": 4.28515625,
754
  "learning_rate": 9.731931258429638e-08,
755
- "loss": 0.123,
756
- "reward": 0.53125,
757
- "reward_std": 0.4704566150903702,
758
- "rewards/accuracy_reward_staging": 0.0,
759
- "rewards/format_reward": 0.515625,
760
- "rewards/format_reward_staging": 0.015625,
761
  "step": 48
762
  },
763
  {
764
  "clip_ratio": 0.0,
765
- "completion_length": 92.140625,
766
  "epoch": 9.8,
767
- "grad_norm": 10.728234621998174,
768
- "kl": 2.490234375,
769
  "learning_rate": 2.4359497401758026e-08,
770
- "loss": 0.2611,
771
- "reward": 0.609375,
772
- "reward_std": 0.4152062386274338,
773
- "rewards/accuracy_reward_staging": 0.0,
774
- "rewards/format_reward": 0.609375,
775
- "rewards/format_reward_staging": 0.0,
776
  "step": 49
777
  },
778
  {
779
  "epoch": 10.0,
780
- "grad_norm": 19.523610245136258,
781
  "learning_rate": 0.0,
782
- "loss": 0.3147,
783
  "step": 50
784
  },
785
  {
786
  "epoch": 10.0,
787
  "eval_clip_ratio": 0.0,
788
- "eval_completion_length": 81.3625,
789
- "eval_kl": 2.358984375,
790
- "eval_loss": 0.19067667424678802,
791
- "eval_reward": 0.6,
792
- "eval_reward_std": 0.40582795441150665,
793
- "eval_rewards/accuracy_reward_staging": 0.0,
794
- "eval_rewards/format_reward": 0.5875,
795
- "eval_rewards/format_reward_staging": 0.0125,
796
- "eval_runtime": 40.4318,
797
- "eval_samples_per_second": 0.495,
798
- "eval_steps_per_second": 0.049,
799
  "step": 50
800
  },
801
  {
 
 
802
  "epoch": 10.0,
 
 
 
 
 
 
803
  "step": 50,
804
  "total_flos": 0.0,
805
- "train_loss": 0.0,
806
- "train_runtime": 0.8416,
807
- "train_samples_per_second": 237.641,
808
- "train_steps_per_second": 59.41
809
  }
810
  ],
811
  "logging_steps": 1,
 
10
  "log_history": [
11
  {
12
  "clip_ratio": 0.0,
13
+ "completion_length": 328.953125,
14
  "epoch": 0.2,
15
+ "grad_norm": 2.233631036923167,
16
  "kl": 0.0,
17
  "learning_rate": 4.000000000000001e-06,
18
+ "loss": 0.0599,
19
+ "reward": 0.734375,
20
+ "reward_std": 0.44938501715660095,
21
  "rewards/accuracy_reward_staging": 0.0,
22
+ "rewards/format_reward": 0.734375,
23
  "rewards/format_reward_staging": 0.0,
24
  "step": 1
25
  },
26
  {
27
  "clip_ratio": 0.0,
28
+ "completion_length": 296.46875,
29
  "epoch": 0.4,
30
+ "grad_norm": 2.4039124983770694,
31
  "kl": 0.0,
32
  "learning_rate": 8.000000000000001e-06,
33
+ "loss": 0.0574,
34
+ "reward": 2.359375,
35
+ "reward_std": 6.557279825210571,
36
  "rewards/accuracy_reward_staging": 0.015625,
37
+ "rewards/format_reward": 0.78125,
38
  "rewards/format_reward_staging": 0.015625,
39
  "step": 2
40
  },
41
  {
42
  "clip_ratio": 0.0,
43
+ "completion_length": 369.703125,
44
  "epoch": 0.6,
45
+ "grad_norm": 1.8186478314322454,
46
+ "kl": 0.0063934326171875,
47
  "learning_rate": 1.2e-05,
48
+ "loss": 0.1678,
49
+ "reward": 20.875,
50
+ "reward_std": 37.550247088074684,
51
+ "rewards/accuracy_reward_staging": 0.203125,
52
+ "rewards/format_reward": 0.5625,
53
  "rewards/format_reward_staging": 0.0,
54
  "step": 3
55
  },
56
  {
57
  "clip_ratio": 0.0,
58
+ "completion_length": 289.875,
59
  "epoch": 0.8,
60
+ "grad_norm": 1.8738553295630938,
61
+ "kl": 0.040130615234375,
62
  "learning_rate": 1.6000000000000003e-05,
63
+ "loss": -0.0018,
64
+ "reward": 17.96875,
65
+ "reward_std": 29.60683871805668,
66
+ "rewards/accuracy_reward_staging": 0.171875,
67
+ "rewards/format_reward": 0.78125,
68
  "rewards/format_reward_staging": 0.0,
69
  "step": 4
70
  },
71
  {
72
  "clip_ratio": 0.0,
73
+ "completion_length": 360.125,
74
  "epoch": 1.0,
75
+ "grad_norm": 2.096161007439382,
76
+ "kl": 0.15673828125,
77
  "learning_rate": 2e-05,
78
+ "loss": 0.2871,
79
+ "reward": 18.109375,
80
+ "reward_std": 24.160783976316452,
81
+ "rewards/accuracy_reward_staging": 0.171875,
82
+ "rewards/format_reward": 0.921875,
83
  "rewards/format_reward_staging": 0.0,
84
  "step": 5
85
  },
86
  {
87
  "clip_ratio": 0.0,
88
+ "completion_length": 377.890625,
89
  "epoch": 1.2,
90
+ "grad_norm": 1.9460066827379818,
91
+ "kl": 0.34521484375,
92
  "learning_rate": 1.9975640502598243e-05,
93
+ "loss": 0.1768,
94
+ "reward": 19.546875,
95
+ "reward_std": 28.07823872566223,
96
+ "rewards/accuracy_reward_staging": 0.1875,
97
+ "rewards/format_reward": 0.796875,
98
+ "rewards/format_reward_staging": 0.0,
99
  "step": 6
100
  },
101
  {
102
  "clip_ratio": 0.0,
103
+ "completion_length": 398.328125,
104
  "epoch": 1.4,
105
+ "grad_norm": 1.9345332551264938,
106
+ "kl": 0.47216796875,
107
  "learning_rate": 1.9902680687415704e-05,
108
+ "loss": 0.1841,
109
+ "reward": 13.375,
110
+ "reward_std": 22.561845421791077,
111
+ "rewards/accuracy_reward_staging": 0.125,
112
+ "rewards/format_reward": 0.796875,
113
+ "rewards/format_reward_staging": 0.078125,
114
  "step": 7
115
  },
116
  {
117
  "clip_ratio": 0.0,
118
+ "completion_length": 376.203125,
119
  "epoch": 1.6,
120
+ "grad_norm": 28.852778290018875,
121
+ "kl": 0.48193359375,
122
  "learning_rate": 1.9781476007338058e-05,
123
+ "loss": 0.1949,
124
+ "reward": 55.59375,
125
+ "reward_std": 41.964348047971725,
126
+ "rewards/accuracy_reward_staging": 0.546875,
127
  "rewards/format_reward": 0.84375,
128
+ "rewards/format_reward_staging": 0.0625,
129
  "step": 8
130
  },
131
  {
132
  "clip_ratio": 0.0,
133
+ "completion_length": 290.296875,
134
  "epoch": 1.8,
135
+ "grad_norm": 20.955640611081066,
136
+ "kl": 1.462890625,
137
  "learning_rate": 1.961261695938319e-05,
138
+ "loss": -0.0393,
139
+ "reward": 11.90625,
140
+ "reward_std": 18.741955757141113,
141
+ "rewards/accuracy_reward_staging": 0.109375,
142
+ "rewards/format_reward": 0.875,
143
+ "rewards/format_reward_staging": 0.09375,
144
  "step": 9
145
  },
146
  {
147
  "epoch": 2.0,
148
+ "grad_norm": 22.010242853054393,
149
  "learning_rate": 1.9396926207859085e-05,
150
+ "loss": 0.0349,
151
  "step": 10
152
  },
153
  {
154
  "epoch": 2.0,
155
  "eval_clip_ratio": 0.0,
156
+ "eval_completion_length": 318.5375,
157
+ "eval_kl": 13.2837890625,
158
+ "eval_loss": 0.5425169467926025,
159
+ "eval_reward": 32.7875,
160
+ "eval_reward_std": 24.585004723072053,
161
+ "eval_rewards/accuracy_reward_staging": 0.31875,
162
+ "eval_rewards/format_reward": 0.828125,
163
+ "eval_rewards/format_reward_staging": 0.084375,
164
+ "eval_runtime": 111.4589,
165
+ "eval_samples_per_second": 0.179,
166
+ "eval_steps_per_second": 0.018,
167
  "step": 10
168
  },
169
  {
170
  "clip_ratio": 0.0,
171
+ "completion_length": 358.484375,
172
  "epoch": 2.2,
173
+ "grad_norm": 5.685479817813176,
174
+ "kl": 1.0234375,
175
  "learning_rate": 1.913545457642601e-05,
176
+ "loss": 0.0814,
177
+ "reward": 36.921875,
178
+ "reward_std": 27.1712064743042,
179
+ "rewards/accuracy_reward_staging": 0.359375,
180
+ "rewards/format_reward": 0.9140625,
181
+ "rewards/format_reward_staging": 0.0703125,
182
  "step": 11
183
  },
184
  {
185
  "clip_ratio": 0.0,
186
+ "completion_length": 296.0,
187
  "epoch": 2.4,
188
+ "grad_norm": 1.8976148262148187,
189
+ "kl": 0.6943359375,
190
  "learning_rate": 1.8829475928589272e-05,
191
+ "loss": 0.1344,
192
+ "reward": 25.9375,
193
+ "reward_std": 0.17078250646591187,
194
+ "rewards/accuracy_reward_staging": 0.25,
195
+ "rewards/format_reward": 0.875,
196
+ "rewards/format_reward_staging": 0.0625,
197
  "step": 12
198
  },
199
  {
200
  "clip_ratio": 0.0,
201
+ "completion_length": 257.9375,
202
  "epoch": 2.6,
203
+ "grad_norm": 46.008358117472916,
204
+ "kl": 2.7431640625,
205
  "learning_rate": 1.848048096156426e-05,
206
+ "loss": 0.2127,
207
+ "reward": 16.421875,
208
+ "reward_std": 27.048566073179245,
209
+ "rewards/accuracy_reward_staging": 0.15625,
210
+ "rewards/format_reward": 0.71875,
211
+ "rewards/format_reward_staging": 0.078125,
212
  "step": 13
213
  },
214
  {
215
  "clip_ratio": 0.0,
216
+ "completion_length": 224.984375,
217
  "epoch": 2.8,
218
+ "grad_norm": 2.4693533285774656,
219
+ "kl": 0.796875,
220
  "learning_rate": 1.8090169943749477e-05,
221
+ "loss": 0.0197,
222
+ "reward": 19.375,
223
+ "reward_std": 24.386408746242523,
224
+ "rewards/accuracy_reward_staging": 0.1875,
225
+ "rewards/format_reward": 0.546875,
226
+ "rewards/format_reward_staging": 0.078125,
227
  "step": 14
228
  },
229
  {
230
  "clip_ratio": 0.0,
231
+ "completion_length": 188.8125,
232
  "epoch": 3.0,
233
+ "grad_norm": 2.2982453846918136,
234
+ "kl": 0.865234375,
235
  "learning_rate": 1.766044443118978e-05,
236
+ "loss": 0.0649,
237
+ "reward": 24.25,
238
+ "reward_std": 26.21424473822117,
239
+ "rewards/accuracy_reward_staging": 0.234375,
240
+ "rewards/format_reward": 0.765625,
241
+ "rewards/format_reward_staging": 0.046875,
242
  "step": 15
243
  },
244
  {
245
  "clip_ratio": 0.0,
246
+ "completion_length": 224.65625,
247
  "epoch": 3.2,
248
+ "grad_norm": 2.2466155648528727,
249
+ "kl": 0.814453125,
250
  "learning_rate": 1.7193398003386514e-05,
251
+ "loss": 0.1391,
252
+ "reward": 16.40625,
253
+ "reward_std": 12.839739605784416,
254
+ "rewards/accuracy_reward_staging": 0.15625,
255
+ "rewards/format_reward": 0.671875,
256
+ "rewards/format_reward_staging": 0.109375,
257
  "step": 16
258
  },
259
  {
260
  "clip_ratio": 0.0,
261
+ "completion_length": 225.8125,
262
  "epoch": 3.4,
263
+ "grad_norm": 5.112571418015419,
264
+ "kl": 1.255859375,
265
  "learning_rate": 1.6691306063588583e-05,
266
+ "loss": 0.2407,
267
+ "reward": 2.359375,
268
+ "reward_std": 6.558102443814278,
269
+ "rewards/accuracy_reward_staging": 0.015625,
270
+ "rewards/format_reward": 0.765625,
271
+ "rewards/format_reward_staging": 0.03125,
272
  "step": 17
273
  },
274
  {
275
  "clip_ratio": 0.0,
276
+ "completion_length": 185.21875,
277
  "epoch": 3.6,
278
+ "grad_norm": 72.26378452951813,
279
+ "kl": 1.720703125,
280
  "learning_rate": 1.6156614753256583e-05,
281
+ "loss": 0.0449,
282
+ "reward": 16.46875,
283
+ "reward_std": 23.07971879839897,
284
+ "rewards/accuracy_reward_staging": 0.15625,
285
+ "rewards/format_reward": 0.703125,
286
+ "rewards/format_reward_staging": 0.140625,
287
  "step": 18
288
  },
289
  {
290
  "clip_ratio": 0.0,
291
+ "completion_length": 184.390625,
292
  "epoch": 3.8,
293
+ "grad_norm": 2.4970793879830975,
294
+ "kl": 0.9013671875,
295
  "learning_rate": 1.5591929034707468e-05,
296
+ "loss": 0.0522,
297
+ "reward": 77.203125,
298
+ "reward_std": 60.76833724975586,
299
+ "rewards/accuracy_reward_staging": 0.765625,
300
+ "rewards/format_reward": 0.515625,
301
+ "rewards/format_reward_staging": 0.125,
302
  "step": 19
303
  },
304
  {
305
  "epoch": 4.0,
306
+ "grad_norm": 2.744480736140323,
307
  "learning_rate": 1.5000000000000002e-05,
308
+ "loss": 0.1602,
309
  "step": 20
310
  },
311
  {
312
  "epoch": 4.0,
313
  "eval_clip_ratio": 0.0,
314
+ "eval_completion_length": 210.384375,
315
+ "eval_kl": 1.496484375,
316
+ "eval_loss": 0.15775302052497864,
317
+ "eval_reward": 25.140625,
318
+ "eval_reward_std": 20.779218792915344,
319
+ "eval_rewards/accuracy_reward_staging": 0.24375,
320
+ "eval_rewards/format_reward": 0.70625,
321
+ "eval_rewards/format_reward_staging": 0.059375,
322
+ "eval_runtime": 83.568,
323
+ "eval_samples_per_second": 0.239,
324
  "eval_steps_per_second": 0.024,
325
  "step": 20
326
  },
327
  {
328
  "clip_ratio": 0.0,
329
+ "completion_length": 210.4765625,
330
  "epoch": 4.2,
331
+ "grad_norm": 2.5604095864012844,
332
+ "kl": 1.318359375,
333
  "learning_rate": 1.4383711467890776e-05,
334
+ "loss": 0.0975,
335
+ "reward": 18.765625,
336
+ "reward_std": 16.231166645884514,
337
+ "rewards/accuracy_reward_staging": 0.1796875,
338
+ "rewards/format_reward": 0.734375,
339
+ "rewards/format_reward_staging": 0.0625,
340
  "step": 21
341
  },
342
  {
343
  "clip_ratio": 0.0,
344
+ "completion_length": 201.328125,
345
  "epoch": 4.4,
346
+ "grad_norm": 3.3796422509493653,
347
+ "kl": 1.326171875,
348
  "learning_rate": 1.3746065934159123e-05,
349
+ "loss": 0.0767,
350
+ "reward": 44.421875,
351
+ "reward_std": 40.881056517362595,
352
+ "rewards/accuracy_reward_staging": 0.4375,
353
+ "rewards/format_reward": 0.53125,
354
+ "rewards/format_reward_staging": 0.140625,
355
  "step": 22
356
  },
357
  {
358
  "clip_ratio": 0.0,
359
+ "completion_length": 216.640625,
360
  "epoch": 4.6,
361
+ "grad_norm": 82.74335209390387,
362
+ "kl": 2.109375,
363
  "learning_rate": 1.3090169943749475e-05,
364
+ "loss": 0.4923,
365
+ "reward": 35.171875,
366
+ "reward_std": 23.166345581412315,
367
+ "rewards/accuracy_reward_staging": 0.34375,
368
+ "rewards/format_reward": 0.578125,
369
+ "rewards/format_reward_staging": 0.21875,
370
  "step": 23
371
  },
372
  {
373
  "clip_ratio": 0.0,
374
+ "completion_length": 176.5625,
375
  "epoch": 4.8,
376
+ "grad_norm": 2.383806116929922,
377
+ "kl": 1.21484375,
378
  "learning_rate": 1.2419218955996677e-05,
379
+ "loss": 0.075,
380
+ "reward": 50.8125,
381
+ "reward_std": 33.22144624590874,
382
+ "rewards/accuracy_reward_staging": 0.5,
383
+ "rewards/format_reward": 0.6875,
384
+ "rewards/format_reward_staging": 0.125,
385
  "step": 24
386
  },
387
  {
388
  "clip_ratio": 0.0,
389
+ "completion_length": 165.625,
390
  "epoch": 5.0,
391
+ "grad_norm": 7.110162206929474,
392
+ "kl": 1.2626953125,
393
  "learning_rate": 1.1736481776669307e-05,
394
+ "loss": 0.1023,
395
+ "reward": 5.453125,
396
+ "reward_std": 10.403940886259079,
397
+ "rewards/accuracy_reward_staging": 0.046875,
398
+ "rewards/format_reward": 0.4375,
399
+ "rewards/format_reward_staging": 0.328125,
400
  "step": 25
401
  },
402
  {
403
  "clip_ratio": 0.0,
404
+ "completion_length": 128.234375,
405
  "epoch": 5.2,
406
+ "grad_norm": 43.694580176383475,
407
+ "kl": 2.83203125,
408
  "learning_rate": 1.1045284632676535e-05,
409
+ "loss": 0.0801,
410
+ "reward": 39.828125,
411
+ "reward_std": 34.28479705750942,
412
+ "rewards/accuracy_reward_staging": 0.390625,
413
  "rewards/format_reward": 0.328125,
414
+ "rewards/format_reward_staging": 0.4375,
415
  "step": 26
416
  },
417
  {
418
  "clip_ratio": 0.0,
419
+ "completion_length": 134.453125,
420
  "epoch": 5.4,
421
+ "grad_norm": 9.162574909416069,
422
+ "kl": 2.01953125,
423
  "learning_rate": 1.0348994967025012e-05,
424
+ "loss": 0.0665,
425
+ "reward": 32.03125,
426
+ "reward_std": 18.438117399811745,
427
+ "rewards/accuracy_reward_staging": 0.3125,
428
+ "rewards/format_reward": 0.359375,
429
+ "rewards/format_reward_staging": 0.421875,
430
  "step": 27
431
  },
432
  {
433
  "clip_ratio": 0.0,
434
+ "completion_length": 118.28125,
435
  "epoch": 5.6,
436
+ "grad_norm": 3.2683542850086518,
437
+ "kl": 1.1953125,
438
  "learning_rate": 9.651005032974994e-06,
439
+ "loss": 0.0622,
440
+ "reward": 24.34375,
441
+ "reward_std": 19.54595375061035,
442
+ "rewards/accuracy_reward_staging": 0.234375,
443
+ "rewards/format_reward": 0.296875,
444
+ "rewards/format_reward_staging": 0.609375,
445
  "step": 28
446
  },
447
  {
448
  "clip_ratio": 0.0,
449
+ "completion_length": 108.875,
450
  "epoch": 5.8,
451
+ "grad_norm": 12.531126144185139,
452
+ "kl": 2.03515625,
453
  "learning_rate": 8.954715367323468e-06,
454
+ "loss": 0.049,
455
+ "reward": 36.578125,
456
+ "reward_std": 34.67615723609924,
457
+ "rewards/accuracy_reward_staging": 0.359375,
458
+ "rewards/format_reward": 0.09375,
459
+ "rewards/format_reward_staging": 0.546875,
460
  "step": 29
461
  },
462
  {
463
  "epoch": 6.0,
464
+ "grad_norm": 7.540837741697303,
465
  "learning_rate": 8.263518223330698e-06,
466
+ "loss": 0.0011,
467
  "step": 30
468
  },
469
  {
470
  "epoch": 6.0,
471
  "eval_clip_ratio": 0.0,
472
+ "eval_completion_length": 100.296875,
473
+ "eval_kl": 3.015625,
474
+ "eval_loss": 0.11326652020215988,
475
+ "eval_reward": 25.828125,
476
+ "eval_reward_std": 27.24666577577591,
477
+ "eval_rewards/accuracy_reward_staging": 0.25,
478
+ "eval_rewards/format_reward": 0.096875,
479
+ "eval_rewards/format_reward_staging": 0.73125,
480
+ "eval_runtime": 37.6008,
481
+ "eval_samples_per_second": 0.532,
482
+ "eval_steps_per_second": 0.053,
483
  "step": 30
484
  },
485
  {
486
  "clip_ratio": 0.0,
487
+ "completion_length": 103.6328125,
488
  "epoch": 6.2,
489
+ "grad_norm": 2399452.1498753084,
490
+ "kl": 21377.13134765625,
491
  "learning_rate": 7.580781044003324e-06,
492
+ "loss": 3099.2456,
493
+ "reward": 32.890625,
494
+ "reward_std": 27.955890655517578,
495
+ "rewards/accuracy_reward_staging": 0.3203125,
496
+ "rewards/format_reward": 0.140625,
497
+ "rewards/format_reward_staging": 0.71875,
498
  "step": 31
499
  },
500
  {
501
  "clip_ratio": 0.0,
502
+ "completion_length": 99.6875,
503
  "epoch": 6.4,
504
+ "grad_norm": 2.865078068885676,
505
+ "kl": 1.111328125,
506
  "learning_rate": 6.909830056250527e-06,
507
+ "loss": 0.0568,
508
+ "reward": 39.9375,
509
+ "reward_std": 37.32955229282379,
510
+ "rewards/accuracy_reward_staging": 0.390625,
511
+ "rewards/format_reward": 0.078125,
512
+ "rewards/format_reward_staging": 0.796875,
513
  "step": 32
514
  },
515
  {
516
  "clip_ratio": 0.0,
517
+ "completion_length": 92.28125,
518
  "epoch": 6.6,
519
+ "grad_norm": 1415.8467105124043,
520
+ "kl": 77.443359375,
521
  "learning_rate": 6.25393406584088e-06,
522
+ "loss": 4.1061,
523
+ "reward": 0.875,
524
+ "reward_std": 0.3340607285499573,
525
  "rewards/accuracy_reward_staging": 0.0,
526
+ "rewards/format_reward": 0.0625,
527
+ "rewards/format_reward_staging": 0.8125,
528
  "step": 33
529
  },
530
  {
531
  "clip_ratio": 0.0,
532
+ "completion_length": 102.671875,
533
  "epoch": 6.8,
534
+ "grad_norm": 13.762426532249034,
535
+ "kl": 1.7431640625,
536
  "learning_rate": 5.616288532109225e-06,
537
+ "loss": 0.0994,
538
+ "reward": 22.8125,
539
+ "reward_std": 34.106568336486816,
540
+ "rewards/accuracy_reward_staging": 0.21875,
541
+ "rewards/format_reward": 0.1875,
542
+ "rewards/format_reward_staging": 0.75,
543
  "step": 34
544
  },
545
  {
546
  "clip_ratio": 0.0,
547
+ "completion_length": 95.3125,
548
  "epoch": 7.0,
549
+ "grad_norm": 2.6063236920960606,
550
+ "kl": 0.986328125,
551
  "learning_rate": 5.000000000000003e-06,
552
+ "loss": 0.0339,
553
+ "reward": 43.078125,
554
+ "reward_std": 44.85216808319092,
555
+ "rewards/accuracy_reward_staging": 0.421875,
556
+ "rewards/format_reward": 0.0625,
557
+ "rewards/format_reward_staging": 0.828125,
558
  "step": 35
559
  },
560
  {
561
  "clip_ratio": 0.0,
562
+ "completion_length": 99.4375,
563
  "epoch": 7.2,
564
+ "grad_norm": 3.5202361717965345,
565
+ "kl": 1.115234375,
566
  "learning_rate": 4.408070965292534e-06,
567
+ "loss": 0.0006,
568
+ "reward": 22.6875,
569
+ "reward_std": 22.75158140063286,
570
+ "rewards/accuracy_reward_staging": 0.21875,
571
+ "rewards/format_reward": 0.0625,
572
+ "rewards/format_reward_staging": 0.75,
573
  "step": 36
574
  },
575
  {
576
  "clip_ratio": 0.0,
577
+ "completion_length": 97.71875,
578
  "epoch": 7.4,
579
+ "grad_norm": 466.43821565088575,
580
+ "kl": 9.513671875,
581
  "learning_rate": 3.8433852467434175e-06,
582
+ "loss": 0.6714,
583
+ "reward": 52.484375,
584
+ "reward_std": 31.5599946975708,
585
+ "rewards/accuracy_reward_staging": 0.515625,
586
+ "rewards/format_reward": 0.078125,
587
+ "rewards/format_reward_staging": 0.84375,
588
  "step": 37
589
  },
590
  {
591
  "clip_ratio": 0.0,
592
+ "completion_length": 98.75,
593
  "epoch": 7.6,
594
+ "grad_norm": 17.64734775020204,
595
+ "kl": 0.958984375,
596
  "learning_rate": 3.308693936411421e-06,
597
+ "loss": 0.1583,
598
+ "reward": 30.59375,
599
+ "reward_std": 28.57139226794243,
600
+ "rewards/accuracy_reward_staging": 0.296875,
601
+ "rewards/format_reward": 0.09375,
602
+ "rewards/format_reward_staging": 0.8125,
603
  "step": 38
604
  },
605
  {
606
  "clip_ratio": 0.0,
607
+ "completion_length": 99.640625,
608
  "epoch": 7.8,
609
+ "grad_norm": 2.4780466685026976,
610
+ "kl": 0.8603515625,
611
  "learning_rate": 2.8066019966134907e-06,
612
+ "loss": 0.0571,
613
+ "reward": 22.8125,
614
+ "reward_std": 23.837719172239304,
615
+ "rewards/accuracy_reward_staging": 0.21875,
616
+ "rewards/format_reward": 0.125,
617
+ "rewards/format_reward_staging": 0.8125,
618
  "step": 39
619
  },
620
  {
621
  "epoch": 8.0,
622
+ "grad_norm": 2.372614998661594,
623
  "learning_rate": 2.339555568810221e-06,
624
+ "loss": 0.0938,
625
  "step": 40
626
  },
627
  {
628
  "epoch": 8.0,
629
  "eval_clip_ratio": 0.0,
630
+ "eval_completion_length": 96.296875,
631
+ "eval_kl": 1.165625,
632
+ "eval_loss": 0.08083952218294144,
633
+ "eval_reward": 36.484375,
634
+ "eval_reward_std": 30.610754093527795,
635
+ "eval_rewards/accuracy_reward_staging": 0.35625,
636
+ "eval_rewards/format_reward": 0.059375,
637
+ "eval_rewards/format_reward_staging": 0.8,
638
+ "eval_runtime": 40.1823,
639
+ "eval_samples_per_second": 0.498,
640
+ "eval_steps_per_second": 0.05,
641
  "step": 40
642
  },
643
  {
644
  "clip_ratio": 0.0,
645
+ "completion_length": 99.84375,
646
  "epoch": 8.2,
647
+ "grad_norm": 3.8984904347848293,
648
+ "kl": 1.18994140625,
649
  "learning_rate": 1.9098300562505266e-06,
650
+ "loss": 0.182,
651
+ "reward": 35.9765625,
652
+ "reward_std": 32.3325060531497,
653
+ "rewards/accuracy_reward_staging": 0.3515625,
654
+ "rewards/format_reward": 0.0703125,
655
+ "rewards/format_reward_staging": 0.75,
656
  "step": 41
657
  },
658
  {
659
  "clip_ratio": 0.0,
660
+ "completion_length": 96.28125,
661
  "epoch": 8.4,
662
+ "grad_norm": 1.9843890762929741,
663
+ "kl": 0.8955078125,
664
  "learning_rate": 1.5195190384357405e-06,
665
+ "loss": 0.0792,
666
+ "reward": 58.625,
667
+ "reward_std": 38.564720049500465,
668
+ "rewards/accuracy_reward_staging": 0.578125,
669
+ "rewards/format_reward": 0.03125,
670
+ "rewards/format_reward_staging": 0.78125,
671
  "step": 42
672
  },
673
  {
674
  "clip_ratio": 0.0,
675
+ "completion_length": 89.109375,
676
  "epoch": 8.6,
677
+ "grad_norm": 5.357700004694207,
678
+ "kl": 1.40625,
679
  "learning_rate": 1.1705240714107301e-06,
680
+ "loss": 0.0795,
681
+ "reward": 0.875,
682
+ "reward_std": 0.3221946507692337,
683
  "rewards/accuracy_reward_staging": 0.0,
684
+ "rewards/format_reward": 0.046875,
685
+ "rewards/format_reward_staging": 0.828125,
686
  "step": 43
687
  },
688
  {
689
  "clip_ratio": 0.0,
690
+ "completion_length": 90.609375,
691
  "epoch": 8.8,
692
+ "grad_norm": 2.7303368647900483,
693
+ "kl": 0.94921875,
694
  "learning_rate": 8.645454235739903e-07,
695
+ "loss": 0.0761,
696
+ "reward": 41.484375,
697
+ "reward_std": 41.522899970412254,
698
+ "rewards/accuracy_reward_staging": 0.40625,
699
+ "rewards/format_reward": 0.03125,
700
+ "rewards/format_reward_staging": 0.828125,
701
  "step": 44
702
  },
703
  {
704
  "clip_ratio": 0.0,
705
+ "completion_length": 90.1875,
706
  "epoch": 9.0,
707
+ "grad_norm": 78.21470135974528,
708
+ "kl": 6.1640625,
709
  "learning_rate": 6.030737921409169e-07,
710
+ "loss": 0.1884,
711
+ "reward": 60.3125,
712
+ "reward_std": 25.90641689300537,
713
+ "rewards/accuracy_reward_staging": 0.59375,
714
+ "rewards/format_reward": 0.03125,
715
+ "rewards/format_reward_staging": 0.90625,
716
  "step": 45
717
  },
718
  {
719
  "clip_ratio": 0.0,
720
+ "completion_length": 91.8125,
721
  "epoch": 9.2,
722
+ "grad_norm": 130.51998459418078,
723
+ "kl": 8.439453125,
724
  "learning_rate": 3.8738304061681107e-07,
725
+ "loss": 0.5403,
726
+ "reward": 55.53125,
727
+ "reward_std": 25.359338760375977,
728
+ "rewards/accuracy_reward_staging": 0.546875,
729
+ "rewards/format_reward": 0.015625,
730
+ "rewards/format_reward_staging": 0.828125,
731
  "step": 46
732
  },
733
  {
734
  "clip_ratio": 0.0,
735
+ "completion_length": 87.453125,
736
  "epoch": 9.4,
737
+ "grad_norm": 4.1866163482406575,
738
+ "kl": 1.2099609375,
739
  "learning_rate": 2.1852399266194312e-07,
740
+ "loss": 0.0669,
741
+ "reward": 32.046875,
742
+ "reward_std": 17.416973531246185,
743
+ "rewards/accuracy_reward_staging": 0.3125,
744
+ "rewards/format_reward": 0.0625,
745
+ "rewards/format_reward_staging": 0.734375,
746
  "step": 47
747
  },
748
  {
749
  "clip_ratio": 0.0,
750
+ "completion_length": 85.34375,
751
  "epoch": 9.6,
752
+ "grad_norm": 4.835456441041561,
753
+ "kl": 1.0126953125,
754
  "learning_rate": 9.731931258429638e-08,
755
+ "loss": 0.1102,
756
+ "reward": 32.125,
757
+ "reward_std": 17.38801845908165,
758
+ "rewards/accuracy_reward_staging": 0.3125,
759
+ "rewards/format_reward": 0.0,
760
+ "rewards/format_reward_staging": 0.875,
761
  "step": 48
762
  },
763
  {
764
  "clip_ratio": 0.0,
765
+ "completion_length": 91.359375,
766
  "epoch": 9.8,
767
+ "grad_norm": 5.14395144365364,
768
+ "kl": 1.2841796875,
769
  "learning_rate": 2.4359497401758026e-08,
770
+ "loss": 0.0001,
771
+ "reward": 36.75,
772
+ "reward_std": 36.8648544549942,
773
+ "rewards/accuracy_reward_staging": 0.359375,
774
+ "rewards/format_reward": 0.09375,
775
+ "rewards/format_reward_staging": 0.71875,
776
  "step": 49
777
  },
778
  {
779
  "epoch": 10.0,
780
+ "grad_norm": 3.2937172350545656,
781
  "learning_rate": 0.0,
782
+ "loss": 0.0404,
783
  "step": 50
784
  },
785
  {
786
  "epoch": 10.0,
787
  "eval_clip_ratio": 0.0,
788
+ "eval_completion_length": 94.065625,
789
+ "eval_kl": 3.14765625,
790
+ "eval_loss": 0.24544629454612732,
791
+ "eval_reward": 38.0125,
792
+ "eval_reward_std": 26.67088475525379,
793
+ "eval_rewards/accuracy_reward_staging": 0.371875,
794
+ "eval_rewards/format_reward": 0.01875,
795
+ "eval_rewards/format_reward_staging": 0.80625,
796
+ "eval_runtime": 39.9739,
797
+ "eval_samples_per_second": 0.5,
798
+ "eval_steps_per_second": 0.05,
799
  "step": 50
800
  },
801
  {
802
+ "clip_ratio": 0.0,
803
+ "completion_length": 85.3125,
804
  "epoch": 10.0,
805
+ "kl": 1.142578125,
806
+ "reward": 46.140625,
807
+ "reward_std": 24.36823770403862,
808
+ "rewards/accuracy_reward_staging": 0.453125,
809
+ "rewards/format_reward": 0.03125,
810
+ "rewards/format_reward_staging": 0.796875,
811
  "step": 50,
812
  "total_flos": 0.0,
813
+ "train_loss": 62.185223487904295,
814
+ "train_runtime": 1153.4055,
815
+ "train_samples_per_second": 0.173,
816
+ "train_steps_per_second": 0.043
817
  }
818
  ],
819
  "logging_steps": 1,