atutej committed on
Commit
3b2dcaa
·
verified ·
1 Parent(s): 825dec2

Model save

Browse files
Files changed (4) hide show
  1. README.md +1 -1
  2. all_results.json +4 -4
  3. train_results.json +4 -4
  4. trainer_state.json +281 -281
README.md CHANGED
@@ -27,7 +27,7 @@ print(output["generated_text"])
27
 
28
  ## Training procedure
29
 
30
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/atutej/huggingface/runs/4e5ri02h)
31
 
32
 
33
  This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
 
27
 
28
  ## Training procedure
29
 
30
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/atutej/huggingface/runs/ek4mz2ft)
31
 
32
 
33
  This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
all_results.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.04260323340275202,
4
- "train_runtime": 3381.4576,
5
- "train_samples_per_second": 0.296,
6
- "train_steps_per_second": 0.049
7
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": -0.017725162387612355,
4
+ "train_runtime": 3869.3247,
5
+ "train_samples_per_second": 0.258,
6
+ "train_steps_per_second": 0.043
7
  }
train_results.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.04260323340275202,
4
- "train_runtime": 3381.4576,
5
- "train_samples_per_second": 0.296,
6
- "train_steps_per_second": 0.049
7
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": -0.017725162387612355,
4
+ "train_runtime": 3869.3247,
5
+ "train_samples_per_second": 0.258,
6
+ "train_steps_per_second": 0.043
7
  }
trainer_state.json CHANGED
@@ -16,25 +16,25 @@
16
  "clip_ratio/low_min": 0.0,
17
  "clip_ratio/region_mean": 0.0,
18
  "completions/clipped_ratio": 0.002083333333333337,
19
- "completions/max_length": 728.8,
20
- "completions/max_terminated_length": 700.9,
21
- "completions/mean_length": 237.7083427429199,
22
- "completions/mean_terminated_length": 236.154793548584,
23
- "completions/min_length": 55.3,
24
- "completions/min_terminated_length": 55.3,
25
  "epoch": 0.05997001499250375,
26
- "frac_reward_zero_std": 0.01666666716337204,
27
- "grad_norm": 0.453125,
28
- "kl": 0.05377130508422852,
29
  "learning_rate": 1.9855293386108995e-05,
30
- "loss": 0.0039,
31
- "num_tokens": 232036.0,
32
- "reward": 0.9150718182325364,
33
- "reward_std": 0.31903862953186035,
34
- "rewards/_accuracy_reward/mean": 0.29215513318777087,
35
- "rewards/_accuracy_reward/std": 0.20658667236566544,
36
- "rewards/_format_reward/mean": 0.6229166716337204,
37
- "rewards/_format_reward/std": 0.28700760900974276,
38
  "step": 10
39
  },
40
  {
@@ -44,25 +44,25 @@
44
  "clip_ratio/low_min": 0.0,
45
  "clip_ratio/region_mean": 0.0,
46
  "completions/clipped_ratio": 0.0,
47
- "completions/max_length": 706.2,
48
- "completions/max_terminated_length": 706.2,
49
- "completions/mean_length": 312.7083450317383,
50
- "completions/mean_terminated_length": 312.7083450317383,
51
- "completions/min_length": 92.1,
52
- "completions/min_terminated_length": 92.1,
53
  "epoch": 0.1199400299850075,
54
  "frac_reward_zero_std": 0.0,
55
- "grad_norm": 0.326171875,
56
- "kl": 0.069390869140625,
57
  "learning_rate": 1.936044737814273e-05,
58
- "loss": 0.0615,
59
- "num_tokens": 500288.0,
60
- "reward": 1.3227388501167296,
61
- "reward_std": 0.23705578446388245,
62
- "rewards/_accuracy_reward/mean": 0.35398882925510405,
63
- "rewards/_accuracy_reward/std": 0.18871113955974578,
64
- "rewards/_format_reward/mean": 0.96875,
65
- "rewards/_format_reward/std": 0.1373054191470146,
66
  "step": 20
67
  },
68
  {
@@ -72,25 +72,25 @@
72
  "clip_ratio/low_min": 0.0,
73
  "clip_ratio/region_mean": 0.0,
74
  "completions/clipped_ratio": 0.0,
75
- "completions/max_length": 639.0,
76
- "completions/max_terminated_length": 639.0,
77
- "completions/mean_length": 240.73542327880858,
78
- "completions/mean_terminated_length": 240.73542327880858,
79
- "completions/min_length": 75.9,
80
- "completions/min_terminated_length": 75.9,
81
  "epoch": 0.17991004497751126,
82
  "frac_reward_zero_std": 0.0,
83
- "grad_norm": 0.57421875,
84
- "kl": 0.099114990234375,
85
  "learning_rate": 1.8531342035272768e-05,
86
- "loss": 0.0922,
87
- "num_tokens": 734305.0,
88
- "reward": 1.4282155513763428,
89
- "reward_std": 0.16938417106866838,
90
- "rewards/_accuracy_reward/mean": 0.43863220810890197,
91
- "rewards/_accuracy_reward/std": 0.1596881665289402,
92
- "rewards/_format_reward/mean": 0.9895833373069763,
93
- "rewards/_format_reward/std": 0.06349536329507828,
94
  "step": 30
95
  },
96
  {
@@ -100,25 +100,25 @@
100
  "clip_ratio/low_min": 0.0,
101
  "clip_ratio/region_mean": 0.0,
102
  "completions/clipped_ratio": 0.0,
103
- "completions/max_length": 322.9,
104
- "completions/max_terminated_length": 322.9,
105
- "completions/mean_length": 115.66041946411133,
106
- "completions/mean_terminated_length": 115.66041946411133,
107
- "completions/min_length": 58.9,
108
- "completions/min_terminated_length": 58.9,
109
  "epoch": 0.239880059970015,
110
  "frac_reward_zero_std": 0.0,
111
  "grad_norm": 0.63671875,
112
- "kl": 0.198193359375,
113
  "learning_rate": 1.7397584510798208e-05,
114
- "loss": 0.0363,
115
- "num_tokens": 907950.0,
116
- "reward": 1.4457251667976379,
117
- "reward_std": 0.1396712526679039,
118
- "rewards/_accuracy_reward/mean": 0.4478084534406662,
119
- "rewards/_accuracy_reward/std": 0.1528099738061428,
120
- "rewards/_format_reward/mean": 0.9979166686534882,
121
- "rewards/_format_reward/std": 0.014433756470680237,
122
  "step": 40
123
  },
124
  {
@@ -128,24 +128,24 @@
128
  "clip_ratio/low_min": 0.0,
129
  "clip_ratio/region_mean": 0.0,
130
  "completions/clipped_ratio": 0.0,
131
- "completions/max_length": 344.4,
132
- "completions/max_terminated_length": 344.4,
133
- "completions/mean_length": 133.3541702270508,
134
- "completions/mean_terminated_length": 133.3541702270508,
135
- "completions/min_length": 68.4,
136
- "completions/min_terminated_length": 68.4,
137
  "epoch": 0.29985007496251875,
138
  "frac_reward_zero_std": 0.0,
139
- "grad_norm": 0.55859375,
140
- "kl": 0.16903076171875,
141
  "learning_rate": 1.5999661014486956e-05,
142
- "loss": 0.0222,
143
- "num_tokens": 1090352.0,
144
- "reward": 1.477199375629425,
145
- "reward_std": 0.12433719113469124,
146
- "rewards/_accuracy_reward/mean": 0.4771993726491928,
147
- "rewards/_accuracy_reward/std": 0.14259819611907004,
148
- "rewards/_format_reward/mean": 1.0,
149
  "rewards/_format_reward/std": 0.0,
150
  "step": 50
151
  },
@@ -155,26 +155,26 @@
155
  "clip_ratio/low_mean": 0.0,
156
  "clip_ratio/low_min": 0.0,
157
  "clip_ratio/region_mean": 0.0,
158
- "completions/clipped_ratio": 0.0,
159
- "completions/max_length": 441.4,
160
- "completions/max_terminated_length": 441.4,
161
- "completions/mean_length": 162.48541870117188,
162
- "completions/mean_terminated_length": 162.48541870117188,
163
- "completions/min_length": 73.3,
164
- "completions/min_terminated_length": 73.3,
165
  "epoch": 0.3598200899550225,
166
  "frac_reward_zero_std": 0.0,
167
- "grad_norm": 0.51953125,
168
- "kl": 0.134783935546875,
169
  "learning_rate": 1.4387491059717653e-05,
170
- "loss": 0.034,
171
- "num_tokens": 1286305.0,
172
- "reward": 1.4927098155021667,
173
- "reward_std": 0.1404846042394638,
174
- "rewards/_accuracy_reward/mean": 0.494793102145195,
175
- "rewards/_accuracy_reward/std": 0.15004281625151633,
176
- "rewards/_format_reward/mean": 0.9979166686534882,
177
- "rewards/_format_reward/std": 0.014433756470680237,
178
  "step": 60
179
  },
180
  {
@@ -183,26 +183,26 @@
183
  "clip_ratio/low_mean": 0.0,
184
  "clip_ratio/low_min": 0.0,
185
  "clip_ratio/region_mean": 0.0,
186
- "completions/clipped_ratio": 0.002083333333333337,
187
- "completions/max_length": 528.9,
188
- "completions/max_terminated_length": 504.8,
189
- "completions/mean_length": 158.65208740234374,
190
- "completions/mean_terminated_length": 156.81835327148437,
191
- "completions/min_length": 67.8,
192
- "completions/min_terminated_length": 67.8,
193
  "epoch": 0.4197901049475262,
194
  "frac_reward_zero_std": 0.0,
195
- "grad_norm": 0.49609375,
196
- "kl": 0.13594970703125,
197
  "learning_rate": 1.2618644849608068e-05,
198
- "loss": 0.0934,
199
- "num_tokens": 1480586.0,
200
- "reward": 1.4793070673942565,
201
- "reward_std": 0.14201814979314803,
202
- "rewards/_accuracy_reward/mean": 0.4897237092256546,
203
- "rewards/_accuracy_reward/std": 0.14367412701249122,
204
- "rewards/_format_reward/mean": 0.9895833373069763,
205
- "rewards/_format_reward/std": 0.06349536329507828,
206
  "step": 70
207
  },
208
  {
@@ -212,25 +212,25 @@
212
  "clip_ratio/low_min": 0.0,
213
  "clip_ratio/region_mean": 0.0,
214
  "completions/clipped_ratio": 0.0,
215
- "completions/max_length": 451.0,
216
- "completions/max_terminated_length": 451.0,
217
- "completions/mean_length": 140.0562545776367,
218
- "completions/mean_terminated_length": 140.0562545776367,
219
- "completions/min_length": 66.8,
220
- "completions/min_terminated_length": 66.8,
221
  "epoch": 0.47976011994003,
222
  "frac_reward_zero_std": 0.0,
223
- "grad_norm": 0.494140625,
224
- "kl": 0.13839111328125,
225
  "learning_rate": 1.075628745884457e-05,
226
- "loss": 0.0383,
227
- "num_tokens": 1665965.0,
228
- "reward": 1.5421025276184082,
229
- "reward_std": 0.15527970492839813,
230
- "rewards/_accuracy_reward/mean": 0.550435796380043,
231
- "rewards/_accuracy_reward/std": 0.14859750047326087,
232
- "rewards/_format_reward/mean": 0.9916666746139526,
233
- "rewards/_format_reward/std": 0.05773502588272095,
234
  "step": 80
235
  },
236
  {
@@ -240,25 +240,25 @@
240
  "clip_ratio/low_min": 0.0,
241
  "clip_ratio/region_mean": 0.0,
242
  "completions/clipped_ratio": 0.0,
243
- "completions/max_length": 357.0,
244
- "completions/max_terminated_length": 357.0,
245
- "completions/mean_length": 139.4875045776367,
246
- "completions/mean_terminated_length": 139.4875045776367,
247
- "completions/min_length": 65.1,
248
- "completions/min_terminated_length": 65.1,
249
  "epoch": 0.5397301349325337,
250
  "frac_reward_zero_std": 0.0,
251
- "grad_norm": 0.458984375,
252
- "kl": 0.109674072265625,
253
  "learning_rate": 8.866923223987303e-06,
254
- "loss": 0.0515,
255
- "num_tokens": 1850903.0,
256
- "reward": 1.4895017743110657,
257
- "reward_std": 0.1271946720778942,
258
- "rewards/_accuracy_reward/mean": 0.49366843402385713,
259
- "rewards/_accuracy_reward/std": 0.14131565093994142,
260
- "rewards/_format_reward/mean": 0.9958333373069763,
261
- "rewards/_format_reward/std": 0.028867512941360474,
262
  "step": 90
263
  },
264
  {
@@ -268,25 +268,25 @@
268
  "clip_ratio/low_min": 0.0,
269
  "clip_ratio/region_mean": 0.0,
270
  "completions/clipped_ratio": 0.0,
271
- "completions/max_length": 409.8,
272
- "completions/max_terminated_length": 409.8,
273
- "completions/mean_length": 147.96250610351564,
274
- "completions/mean_terminated_length": 147.96250610351564,
275
- "completions/min_length": 73.5,
276
- "completions/min_terminated_length": 73.5,
277
  "epoch": 0.5997001499250375,
278
  "frac_reward_zero_std": 0.0,
279
- "grad_norm": 0.515625,
280
- "kl": 0.10118408203125,
281
  "learning_rate": 7.018020889533348e-06,
282
- "loss": 0.0394,
283
- "num_tokens": 2039933.0,
284
- "reward": 1.497954213619232,
285
- "reward_std": 0.1677745535969734,
286
- "rewards/_accuracy_reward/mean": 0.5062875241041184,
287
- "rewards/_accuracy_reward/std": 0.1678355909883976,
288
- "rewards/_format_reward/mean": 0.9916666746139526,
289
- "rewards/_format_reward/std": 0.05773502588272095,
290
  "step": 100
291
  },
292
  {
@@ -296,25 +296,25 @@
296
  "clip_ratio/low_min": 0.0,
297
  "clip_ratio/region_mean": 0.0,
298
  "completions/clipped_ratio": 0.0,
299
- "completions/max_length": 390.7,
300
- "completions/max_terminated_length": 390.7,
301
- "completions/mean_length": 137.69375457763672,
302
- "completions/mean_terminated_length": 137.69375457763672,
303
- "completions/min_length": 64.9,
304
- "completions/min_terminated_length": 64.9,
305
  "epoch": 0.6596701649175413,
306
  "frac_reward_zero_std": 0.0,
307
- "grad_norm": 0.5234375,
308
- "kl": 0.090032958984375,
309
  "learning_rate": 5.2756043152032934e-06,
310
- "loss": 0.0155,
311
- "num_tokens": 2224082.0,
312
- "reward": 1.49248765707016,
313
- "reward_std": 0.14827005118131636,
314
- "rewards/_accuracy_reward/mean": 0.49873766899108884,
315
- "rewards/_accuracy_reward/std": 0.16313461735844612,
316
- "rewards/_format_reward/mean": 0.9937500059604645,
317
- "rewards/_format_reward/std": 0.04330126941204071,
318
  "step": 110
319
  },
320
  {
@@ -323,26 +323,26 @@
323
  "clip_ratio/low_mean": 0.0,
324
  "clip_ratio/low_min": 0.0,
325
  "clip_ratio/region_mean": 0.0,
326
- "completions/clipped_ratio": 0.0,
327
- "completions/max_length": 396.7,
328
- "completions/max_terminated_length": 396.7,
329
- "completions/mean_length": 147.02917022705077,
330
- "completions/mean_terminated_length": 147.02917022705077,
331
- "completions/min_length": 66.4,
332
- "completions/min_terminated_length": 66.4,
333
  "epoch": 0.719640179910045,
334
- "frac_reward_zero_std": 0.0,
335
- "grad_norm": 0.40234375,
336
- "kl": 0.082232666015625,
337
  "learning_rate": 3.7018947797172864e-06,
338
- "loss": 0.0188,
339
- "num_tokens": 2412640.0,
340
- "reward": 1.5127413272857666,
341
- "reward_std": 0.1542006738483906,
342
- "rewards/_accuracy_reward/mean": 0.5210746347904205,
343
- "rewards/_accuracy_reward/std": 0.16292970031499862,
344
- "rewards/_format_reward/mean": 0.9916666746139526,
345
- "rewards/_format_reward/std": 0.05773502588272095,
346
  "step": 120
347
  },
348
  {
@@ -352,25 +352,25 @@
352
  "clip_ratio/low_min": 0.0,
353
  "clip_ratio/region_mean": 0.0,
354
  "completions/clipped_ratio": 0.0,
355
- "completions/max_length": 439.5,
356
- "completions/max_terminated_length": 439.5,
357
- "completions/mean_length": 151.95000457763672,
358
- "completions/mean_terminated_length": 151.95000457763672,
359
- "completions/min_length": 58.6,
360
- "completions/min_terminated_length": 58.6,
361
  "epoch": 0.7796101949025487,
362
  "frac_reward_zero_std": 0.0,
363
- "grad_norm": 0.4765625,
364
- "kl": 0.08944091796875,
365
  "learning_rate": 2.353089073828255e-06,
366
- "loss": 0.0402,
367
- "num_tokens": 2604064.0,
368
- "reward": 1.5172916531562806,
369
- "reward_std": 0.13390944600105287,
370
- "rewards/_accuracy_reward/mean": 0.5214582800865173,
371
- "rewards/_accuracy_reward/std": 0.14149208813905717,
372
- "rewards/_format_reward/mean": 0.9958333373069763,
373
- "rewards/_format_reward/std": 0.028867512941360474,
374
  "step": 130
375
  },
376
  {
@@ -379,26 +379,26 @@
379
  "clip_ratio/low_mean": 0.0,
380
  "clip_ratio/low_min": 0.0,
381
  "clip_ratio/region_mean": 0.0,
382
- "completions/clipped_ratio": 0.0,
383
- "completions/max_length": 459.0,
384
- "completions/max_terminated_length": 459.0,
385
- "completions/mean_length": 143.82083587646486,
386
- "completions/mean_terminated_length": 143.82083587646486,
387
- "completions/min_length": 57.1,
388
- "completions/min_terminated_length": 57.1,
389
  "epoch": 0.8395802098950524,
390
- "frac_reward_zero_std": 0.0,
391
- "grad_norm": 0.486328125,
392
- "kl": 0.09305419921875,
393
  "learning_rate": 1.2773527263780626e-06,
394
- "loss": 0.066,
395
- "num_tokens": 2791442.0,
396
- "reward": 1.4838216304779053,
397
- "reward_std": 0.1543775752186775,
398
- "rewards/_accuracy_reward/mean": 0.4942382574081421,
399
- "rewards/_accuracy_reward/std": 0.1497928135097027,
400
- "rewards/_format_reward/mean": 0.9895833373069763,
401
- "rewards/_format_reward/std": 0.06349536329507828,
402
  "step": 140
403
  },
404
  {
@@ -407,25 +407,25 @@
407
  "clip_ratio/low_mean": 0.0,
408
  "clip_ratio/low_min": 0.0,
409
  "clip_ratio/region_mean": 0.0,
410
- "completions/clipped_ratio": 0.0,
411
- "completions/max_length": 428.9,
412
- "completions/max_terminated_length": 428.9,
413
- "completions/mean_length": 145.2979248046875,
414
- "completions/mean_terminated_length": 145.2979248046875,
415
- "completions/min_length": 57.3,
416
- "completions/min_terminated_length": 57.3,
417
  "epoch": 0.8995502248875562,
418
  "frac_reward_zero_std": 0.0,
419
- "grad_norm": 0.484375,
420
- "kl": 0.106024169921875,
421
  "learning_rate": 5.131000247938367e-07,
422
- "loss": 0.0329,
423
- "num_tokens": 2979625.0,
424
- "reward": 1.534533643722534,
425
- "reward_std": 0.1303482674062252,
426
- "rewards/_accuracy_reward/mean": 0.5345336198806763,
427
- "rewards/_accuracy_reward/std": 0.1466048091650009,
428
- "rewards/_format_reward/mean": 1.0,
429
  "rewards/_format_reward/std": 0.0,
430
  "step": 150
431
  },
@@ -435,26 +435,26 @@
435
  "clip_ratio/low_mean": 0.0,
436
  "clip_ratio/low_min": 0.0,
437
  "clip_ratio/region_mean": 0.0,
438
- "completions/clipped_ratio": 0.0,
439
- "completions/max_length": 383.6,
440
- "completions/max_terminated_length": 383.6,
441
- "completions/mean_length": 150.10000457763672,
442
- "completions/mean_terminated_length": 150.10000457763672,
443
- "completions/min_length": 58.6,
444
- "completions/min_terminated_length": 58.6,
445
  "epoch": 0.95952023988006,
446
  "frac_reward_zero_std": 0.0,
447
- "grad_norm": 2.046875,
448
- "kl": 0.114117431640625,
449
  "learning_rate": 8.762225008062675e-08,
450
- "loss": 0.0142,
451
- "num_tokens": 3169825.0,
452
- "reward": 1.4770540237426757,
453
- "reward_std": 0.1398945815861225,
454
- "rewards/_accuracy_reward/mean": 0.4812206119298935,
455
- "rewards/_accuracy_reward/std": 0.1583762623369694,
456
- "rewards/_format_reward/mean": 0.9958333373069763,
457
- "rewards/_format_reward/std": 0.028867512941360474,
458
  "step": 160
459
  },
460
  {
@@ -464,33 +464,33 @@
464
  "clip_ratio/low_min": 0.0,
465
  "clip_ratio/region_mean": 0.0,
466
  "completions/clipped_ratio": 0.0,
467
- "completions/max_length": 433.5,
468
- "completions/max_terminated_length": 433.5,
469
- "completions/mean_length": 161.9201431274414,
470
- "completions/mean_terminated_length": 161.9201431274414,
471
- "completions/min_length": 59.166666666666664,
472
- "completions/min_terminated_length": 59.166666666666664,
473
  "epoch": 0.9955022488755623,
474
  "frac_reward_zero_std": 0.0,
475
- "kl": 0.0867919921875,
476
- "num_tokens": 3287402.0,
477
- "reward": 1.5356033047040303,
478
- "reward_std": 0.14081149299939474,
479
- "rewards/_accuracy_reward/mean": 0.5460198571284612,
480
- "rewards/_accuracy_reward/std": 0.13319105903307596,
481
- "rewards/_format_reward/mean": 0.9895833333333334,
482
- "rewards/_format_reward/std": 0.05771308392286301,
483
  "step": 166,
484
  "total_flos": 0.0,
485
- "train_loss": 0.04260323340275202,
486
- "train_runtime": 3381.4576,
487
- "train_samples_per_second": 0.296,
488
- "train_steps_per_second": 0.049
489
  }
490
  ],
491
  "logging_steps": 10,
492
  "max_steps": 166,
493
- "num_input_tokens_seen": 3287402,
494
  "num_train_epochs": 1,
495
  "save_steps": 500,
496
  "stateful_callbacks": {
 
16
  "clip_ratio/low_min": 0.0,
17
  "clip_ratio/region_mean": 0.0,
18
  "completions/clipped_ratio": 0.002083333333333337,
19
+ "completions/max_length": 414.3,
20
+ "completions/max_terminated_length": 390.7,
21
+ "completions/mean_length": 96.92292022705078,
22
+ "completions/mean_terminated_length": 95.08196258544922,
23
+ "completions/min_length": 43.4,
24
+ "completions/min_terminated_length": 43.4,
25
  "epoch": 0.05997001499250375,
26
+ "frac_reward_zero_std": 0.10000000149011612,
27
+ "grad_norm": 0.875,
28
+ "kl": 0.04828977584838867,
29
  "learning_rate": 1.9855293386108995e-05,
30
+ "loss": -0.0894,
31
+ "num_tokens": 164459.0,
32
+ "reward": 0.25649446398019793,
33
+ "reward_std": 0.12709882631897926,
34
+ "rewards/_accuracy_reward/mean": 0.25024444460868833,
35
+ "rewards/_accuracy_reward/std": 0.16934245973825454,
36
+ "rewards/_format_reward/mean": 0.00625,
37
+ "rewards/_format_reward/std": 0.02446230351924896,
38
  "step": 10
39
  },
40
  {
 
44
  "clip_ratio/low_min": 0.0,
45
  "clip_ratio/region_mean": 0.0,
46
  "completions/clipped_ratio": 0.0,
47
+ "completions/max_length": 405.9,
48
+ "completions/max_terminated_length": 405.9,
49
+ "completions/mean_length": 115.59583740234375,
50
+ "completions/mean_terminated_length": 115.59583740234375,
51
+ "completions/min_length": 65.6,
52
+ "completions/min_terminated_length": 65.6,
53
  "epoch": 0.1199400299850075,
54
  "frac_reward_zero_std": 0.0,
55
+ "grad_norm": 0.8359375,
56
+ "kl": 0.0818756103515625,
57
  "learning_rate": 1.936044737814273e-05,
58
+ "loss": 0.0581,
59
+ "num_tokens": 338097.0,
60
+ "reward": 0.36916170418262484,
61
+ "reward_std": 0.13012803941965104,
62
+ "rewards/_accuracy_reward/mean": 0.36916169822216033,
63
+ "rewards/_accuracy_reward/std": 0.1557157054543495,
64
+ "rewards/_format_reward/mean": 0.0,
65
+ "rewards/_format_reward/std": 0.0,
66
  "step": 20
67
  },
68
  {
 
72
  "clip_ratio/low_min": 0.0,
73
  "clip_ratio/region_mean": 0.0,
74
  "completions/clipped_ratio": 0.0,
75
+ "completions/max_length": 269.2,
76
+ "completions/max_terminated_length": 269.2,
77
+ "completions/mean_length": 85.4083351135254,
78
+ "completions/mean_terminated_length": 85.4083351135254,
79
+ "completions/min_length": 57.7,
80
+ "completions/min_terminated_length": 57.7,
81
  "epoch": 0.17991004497751126,
82
  "frac_reward_zero_std": 0.0,
83
+ "grad_norm": 0.6484375,
84
+ "kl": 0.102984619140625,
85
  "learning_rate": 1.8531342035272768e-05,
86
+ "loss": 0.0225,
87
+ "num_tokens": 497557.0,
88
+ "reward": 0.4350260511040688,
89
+ "reward_std": 0.14344265162944794,
90
+ "rewards/_accuracy_reward/mean": 0.43502604514360427,
91
+ "rewards/_accuracy_reward/std": 0.1680240161716938,
92
+ "rewards/_format_reward/mean": 0.0,
93
+ "rewards/_format_reward/std": 0.0,
94
  "step": 30
95
  },
96
  {
 
100
  "clip_ratio/low_min": 0.0,
101
  "clip_ratio/region_mean": 0.0,
102
  "completions/clipped_ratio": 0.0,
103
+ "completions/max_length": 198.7,
104
+ "completions/max_terminated_length": 198.7,
105
+ "completions/mean_length": 76.06458435058593,
106
+ "completions/mean_terminated_length": 76.06458435058593,
107
+ "completions/min_length": 44.7,
108
+ "completions/min_terminated_length": 44.7,
109
  "epoch": 0.239880059970015,
110
  "frac_reward_zero_std": 0.0,
111
  "grad_norm": 0.63671875,
112
+ "kl": 0.11517333984375,
113
  "learning_rate": 1.7397584510798208e-05,
114
+ "loss": 0.0064,
115
+ "num_tokens": 652196.0,
116
+ "reward": 0.4085416719317436,
117
+ "reward_std": 0.13231892064213752,
118
+ "rewards/_accuracy_reward/mean": 0.4085416689515114,
119
+ "rewards/_accuracy_reward/std": 0.1608368895947933,
120
+ "rewards/_format_reward/mean": 0.0,
121
+ "rewards/_format_reward/std": 0.0,
122
  "step": 40
123
  },
124
  {
 
128
  "clip_ratio/low_min": 0.0,
129
  "clip_ratio/region_mean": 0.0,
130
  "completions/clipped_ratio": 0.0,
131
+ "completions/max_length": 265.6,
132
+ "completions/max_terminated_length": 265.6,
133
+ "completions/mean_length": 89.46666946411133,
134
+ "completions/mean_terminated_length": 89.46666946411133,
135
+ "completions/min_length": 62.4,
136
+ "completions/min_terminated_length": 62.4,
137
  "epoch": 0.29985007496251875,
138
  "frac_reward_zero_std": 0.0,
139
+ "grad_norm": 0.392578125,
140
+ "kl": 0.118402099609375,
141
  "learning_rate": 1.5999661014486956e-05,
142
+ "loss": 0.0506,
143
+ "num_tokens": 813532.0,
144
+ "reward": 0.5038281351327896,
145
+ "reward_std": 0.12968316152691842,
146
+ "rewards/_accuracy_reward/mean": 0.5038281202316284,
147
+ "rewards/_accuracy_reward/std": 0.16127740293741227,
148
+ "rewards/_format_reward/mean": 0.0,
149
  "rewards/_format_reward/std": 0.0,
150
  "step": 50
151
  },
 
155
  "clip_ratio/low_mean": 0.0,
156
  "clip_ratio/low_min": 0.0,
157
  "clip_ratio/region_mean": 0.0,
158
+ "completions/clipped_ratio": 0.002083333333333337,
159
+ "completions/max_length": 364.6,
160
+ "completions/max_terminated_length": 304.4,
161
+ "completions/mean_length": 93.76042022705079,
162
+ "completions/mean_terminated_length": 91.8341781616211,
163
+ "completions/min_length": 62.7,
164
+ "completions/min_terminated_length": 62.7,
165
  "epoch": 0.3598200899550225,
166
  "frac_reward_zero_std": 0.0,
167
+ "grad_norm": 0.455078125,
168
+ "kl": 0.288519287109375,
169
  "learning_rate": 1.4387491059717653e-05,
170
+ "loss": 0.0338,
171
+ "num_tokens": 976497.0,
172
+ "reward": 0.4925865650177002,
173
+ "reward_std": 0.12617484703660012,
174
+ "rewards/_accuracy_reward/mean": 0.492586562037468,
175
+ "rewards/_accuracy_reward/std": 0.15444535091519357,
176
+ "rewards/_format_reward/mean": 0.0,
177
+ "rewards/_format_reward/std": 0.0,
178
  "step": 60
179
  },
180
  {
 
183
  "clip_ratio/low_mean": 0.0,
184
  "clip_ratio/low_min": 0.0,
185
  "clip_ratio/region_mean": 0.0,
186
+ "completions/clipped_ratio": 0.0,
187
+ "completions/max_length": 265.3,
188
+ "completions/max_terminated_length": 265.3,
189
+ "completions/mean_length": 85.07291946411132,
190
+ "completions/mean_terminated_length": 85.07291946411132,
191
+ "completions/min_length": 70.0,
192
+ "completions/min_terminated_length": 70.0,
193
  "epoch": 0.4197901049475262,
194
  "frac_reward_zero_std": 0.0,
195
+ "grad_norm": 0.451171875,
196
+ "kl": 0.089068603515625,
197
  "learning_rate": 1.2618644849608068e-05,
198
+ "loss": 0.0247,
199
+ "num_tokens": 1135460.0,
200
+ "reward": 0.4750963538885117,
201
+ "reward_std": 0.12951767966151237,
202
+ "rewards/_accuracy_reward/mean": 0.4750963240861893,
203
+ "rewards/_accuracy_reward/std": 0.1608477719128132,
204
+ "rewards/_format_reward/mean": 0.0,
205
+ "rewards/_format_reward/std": 0.0,
206
  "step": 70
207
  },
208
  {
 
212
  "clip_ratio/low_min": 0.0,
213
  "clip_ratio/region_mean": 0.0,
214
  "completions/clipped_ratio": 0.0,
215
+ "completions/max_length": 284.3,
216
+ "completions/max_terminated_length": 284.3,
217
+ "completions/mean_length": 87.73958587646484,
218
+ "completions/mean_terminated_length": 87.73958587646484,
219
+ "completions/min_length": 69.4,
220
+ "completions/min_terminated_length": 69.4,
221
  "epoch": 0.47976011994003,
222
  "frac_reward_zero_std": 0.0,
223
+ "grad_norm": 0.4140625,
224
+ "kl": 0.07869873046875,
225
  "learning_rate": 1.075628745884457e-05,
226
+ "loss": 0.0442,
227
+ "num_tokens": 1295727.0,
228
+ "reward": 0.5100168704986572,
229
+ "reward_std": 0.15252956375479698,
230
+ "rewards/_accuracy_reward/mean": 0.5100168436765671,
231
+ "rewards/_accuracy_reward/std": 0.17259212732315063,
232
+ "rewards/_format_reward/mean": 0.0,
233
+ "rewards/_format_reward/std": 0.0,
234
  "step": 80
235
  },
236
  {
 
240
  "clip_ratio/low_min": 0.0,
241
  "clip_ratio/region_mean": 0.0,
242
  "completions/clipped_ratio": 0.0,
243
+ "completions/max_length": 259.8,
244
+ "completions/max_terminated_length": 259.8,
245
+ "completions/mean_length": 88.41250228881836,
246
+ "completions/mean_terminated_length": 88.41250228881836,
247
+ "completions/min_length": 64.1,
248
+ "completions/min_terminated_length": 64.1,
249
  "epoch": 0.5397301349325337,
250
  "frac_reward_zero_std": 0.0,
251
+ "grad_norm": 0.66796875,
252
+ "kl": 0.17242431640625,
253
  "learning_rate": 8.866923223987303e-06,
254
+ "loss": 0.027,
255
+ "num_tokens": 1456149.0,
256
+ "reward": 0.4691749334335327,
257
+ "reward_std": 0.11657274290919303,
258
+ "rewards/_accuracy_reward/mean": 0.4691749155521393,
259
+ "rewards/_accuracy_reward/std": 0.13906535133719444,
260
+ "rewards/_format_reward/mean": 0.0,
261
+ "rewards/_format_reward/std": 0.0,
262
  "step": 90
263
  },
264
  {
 
268
  "clip_ratio/low_min": 0.0,
269
  "clip_ratio/region_mean": 0.0,
270
  "completions/clipped_ratio": 0.0,
271
+ "completions/max_length": 338.8,
272
+ "completions/max_terminated_length": 338.8,
273
+ "completions/mean_length": 94.3062530517578,
274
+ "completions/mean_terminated_length": 94.3062530517578,
275
+ "completions/min_length": 59.2,
276
+ "completions/min_terminated_length": 59.2,
277
  "epoch": 0.5997001499250375,
278
  "frac_reward_zero_std": 0.0,
279
+ "grad_norm": 0.59375,
280
+ "kl": 0.106280517578125,
281
  "learning_rate": 7.018020889533348e-06,
282
+ "loss": -0.0246,
283
+ "num_tokens": 1619424.0,
284
+ "reward": 0.5008379817008972,
285
+ "reward_std": 0.1251222789287567,
286
+ "rewards/_accuracy_reward/mean": 0.5008379787206649,
287
+ "rewards/_accuracy_reward/std": 0.14559592306613922,
288
+ "rewards/_format_reward/mean": 0.0,
289
+ "rewards/_format_reward/std": 0.0,
290
  "step": 100
291
  },
292
  {
 
296
  "clip_ratio/low_min": 0.0,
297
  "clip_ratio/region_mean": 0.0,
298
  "completions/clipped_ratio": 0.0,
299
+ "completions/max_length": 404.7,
300
+ "completions/max_terminated_length": 404.7,
301
+ "completions/mean_length": 115.0875030517578,
302
+ "completions/mean_terminated_length": 115.0875030517578,
303
+ "completions/min_length": 51.4,
304
+ "completions/min_terminated_length": 51.4,
305
  "epoch": 0.6596701649175413,
306
  "frac_reward_zero_std": 0.0,
307
+ "grad_norm": 0.353515625,
308
+ "kl": 0.0727294921875,
309
  "learning_rate": 5.2756043152032934e-06,
310
+ "loss": -0.0641,
311
+ "num_tokens": 1792722.0,
312
+ "reward": 0.4739863067865372,
313
+ "reward_std": 0.13204658553004264,
314
+ "rewards/_accuracy_reward/mean": 0.47398627996444703,
315
+ "rewards/_accuracy_reward/std": 0.15842494517564773,
316
+ "rewards/_format_reward/mean": 0.0,
317
+ "rewards/_format_reward/std": 0.0,
318
  "step": 110
319
  },
320
  {
 
323
  "clip_ratio/low_mean": 0.0,
324
  "clip_ratio/low_min": 0.0,
325
  "clip_ratio/region_mean": 0.0,
326
+ "completions/clipped_ratio": 0.002083333333333337,
327
+ "completions/max_length": 517.4,
328
+ "completions/max_terminated_length": 435.3,
329
+ "completions/mean_length": 136.24375381469727,
330
+ "completions/mean_terminated_length": 134.30962219238282,
331
+ "completions/min_length": 51.6,
332
+ "completions/min_terminated_length": 51.6,
333
  "epoch": 0.719640179910045,
334
+ "frac_reward_zero_std": 0.01666666716337204,
335
+ "grad_norm": 0.279296875,
336
+ "kl": 0.0670654296875,
337
  "learning_rate": 3.7018947797172864e-06,
338
+ "loss": -0.0695,
339
+ "num_tokens": 1976103.0,
340
+ "reward": 0.5253316760063171,
341
+ "reward_std": 0.12880267389118671,
342
+ "rewards/_accuracy_reward/mean": 0.5253316521644592,
343
+ "rewards/_accuracy_reward/std": 0.17076537311077117,
344
+ "rewards/_format_reward/mean": 0.0,
345
+ "rewards/_format_reward/std": 0.0,
346
  "step": 120
347
  },
348
  {
 
352
  "clip_ratio/low_min": 0.0,
353
  "clip_ratio/region_mean": 0.0,
354
  "completions/clipped_ratio": 0.0,
355
+ "completions/max_length": 522.1,
356
+ "completions/max_terminated_length": 522.1,
357
+ "completions/mean_length": 152.3375045776367,
358
+ "completions/mean_terminated_length": 152.3375045776367,
359
+ "completions/min_length": 46.7,
360
+ "completions/min_terminated_length": 46.7,
361
  "epoch": 0.7796101949025487,
362
  "frac_reward_zero_std": 0.0,
363
+ "grad_norm": 0.298828125,
364
+ "kl": 0.063446044921875,
365
  "learning_rate": 2.353089073828255e-06,
366
+ "loss": -0.0654,
367
+ "num_tokens": 2167713.0,
368
+ "reward": 0.48647034764289854,
369
+ "reward_std": 0.1461639277637005,
370
+ "rewards/_accuracy_reward/mean": 0.48647033274173734,
371
+ "rewards/_accuracy_reward/std": 0.17206955328583717,
372
+ "rewards/_format_reward/mean": 0.0,
373
+ "rewards/_format_reward/std": 0.0,
374
  "step": 130
375
  },
376
  {
 
379
  "clip_ratio/low_mean": 0.0,
380
  "clip_ratio/low_min": 0.0,
381
  "clip_ratio/region_mean": 0.0,
382
+ "completions/clipped_ratio": 0.002083333333333337,
383
+ "completions/max_length": 562.7,
384
+ "completions/max_terminated_length": 500.5,
385
+ "completions/mean_length": 158.33542022705078,
386
+ "completions/mean_terminated_length": 156.5459243774414,
387
+ "completions/min_length": 50.7,
388
+ "completions/min_terminated_length": 50.7,
389
  "epoch": 0.8395802098950524,
390
+ "frac_reward_zero_std": 0.03333333432674408,
391
+ "grad_norm": 0.5,
392
+ "kl": 0.06429443359375,
393
  "learning_rate": 1.2773527263780626e-06,
394
+ "loss": -0.0735,
395
+ "num_tokens": 2362058.0,
396
+ "reward": 0.4694186806678772,
397
+ "reward_std": 0.13372117429971694,
398
+ "rewards/_accuracy_reward/mean": 0.46941866278648375,
399
+ "rewards/_accuracy_reward/std": 0.1658677004277706,
400
+ "rewards/_format_reward/mean": 0.0,
401
+ "rewards/_format_reward/std": 0.0,
402
  "step": 140
403
  },
404
  {
 
407
  "clip_ratio/low_mean": 0.0,
408
  "clip_ratio/low_min": 0.0,
409
  "clip_ratio/region_mean": 0.0,
410
+ "completions/clipped_ratio": 0.004166666666666674,
411
+ "completions/max_length": 636.7,
412
+ "completions/max_terminated_length": 532.8,
413
+ "completions/mean_length": 168.87708892822266,
414
+ "completions/mean_terminated_length": 165.28546447753905,
415
+ "completions/min_length": 42.2,
416
+ "completions/min_terminated_length": 42.2,
417
  "epoch": 0.8995502248875562,
418
  "frac_reward_zero_std": 0.0,
419
+ "grad_norm": 0.232421875,
420
+ "kl": 0.085772705078125,
421
  "learning_rate": 5.131000247938367e-07,
422
+ "loss": -0.074,
423
+ "num_tokens": 2561559.0,
424
+ "reward": 0.5166842222213746,
425
+ "reward_std": 0.15563009977340697,
426
+ "rewards/_accuracy_reward/mean": 0.5166841924190522,
427
+ "rewards/_accuracy_reward/std": 0.1897404298186302,
428
+ "rewards/_format_reward/mean": 0.0,
429
  "rewards/_format_reward/std": 0.0,
430
  "step": 150
431
  },
 
435
  "clip_ratio/low_mean": 0.0,
436
  "clip_ratio/low_min": 0.0,
437
  "clip_ratio/region_mean": 0.0,
438
+ "completions/clipped_ratio": 0.004166666666666674,
439
+ "completions/max_length": 578.3,
440
+ "completions/max_terminated_length": 469.4,
441
+ "completions/mean_length": 176.32083740234376,
442
+ "completions/mean_terminated_length": 172.7707046508789,
443
+ "completions/min_length": 57.6,
444
+ "completions/min_terminated_length": 57.6,
445
  "epoch": 0.95952023988006,
446
  "frac_reward_zero_std": 0.0,
447
+ "grad_norm": 0.310546875,
448
+ "kl": 0.0620758056640625,
449
  "learning_rate": 8.762225008062675e-08,
450
+ "loss": -0.0572,
451
+ "num_tokens": 2764345.0,
452
+ "reward": 0.4916923582553864,
453
+ "reward_std": 0.14141732677817345,
454
+ "rewards/_accuracy_reward/mean": 0.49169233739376067,
455
+ "rewards/_accuracy_reward/std": 0.17603871822357178,
456
+ "rewards/_format_reward/mean": 0.0,
457
+ "rewards/_format_reward/std": 0.0,
458
  "step": 160
459
  },
460
  {
 
464
  "clip_ratio/low_min": 0.0,
465
  "clip_ratio/region_mean": 0.0,
466
  "completions/clipped_ratio": 0.0,
467
+ "completions/max_length": 447.6666666666667,
468
+ "completions/max_terminated_length": 447.6666666666667,
469
+ "completions/mean_length": 160.1840337117513,
470
+ "completions/mean_terminated_length": 160.1840337117513,
471
+ "completions/min_length": 49.333333333333336,
472
+ "completions/min_terminated_length": 49.333333333333336,
473
  "epoch": 0.9955022488755623,
474
  "frac_reward_zero_std": 0.0,
475
+ "kl": 0.069488525390625,
476
+ "num_tokens": 2881422.0,
477
+ "reward": 0.49811801811059314,
478
+ "reward_std": 0.12225283433993657,
479
+ "rewards/_accuracy_reward/mean": 0.4981180081764857,
480
+ "rewards/_accuracy_reward/std": 0.14967897906899452,
481
+ "rewards/_format_reward/mean": 0.0,
482
+ "rewards/_format_reward/std": 0.0,
483
  "step": 166,
484
  "total_flos": 0.0,
485
+ "train_loss": -0.017725162387612355,
486
+ "train_runtime": 3869.3247,
487
+ "train_samples_per_second": 0.258,
488
+ "train_steps_per_second": 0.043
489
  }
490
  ],
491
  "logging_steps": 10,
492
  "max_steps": 166,
493
+ "num_input_tokens_seen": 2881422,
494
  "num_train_epochs": 1,
495
  "save_steps": 500,
496
  "stateful_callbacks": {