Blancy committed on
Commit
38e5449
·
verified ·
1 Parent(s): cee4e2e

Model save

Browse files
Files changed (4) hide show
  1. README.md +2 -4
  2. all_results.json +3 -3
  3. train_results.json +3 -3
  4. trainer_state.json +1341 -333
README.md CHANGED
@@ -1,11 +1,9 @@
1
  ---
2
  base_model: Qwen/Qwen3-1.7B
3
- datasets: Blancy/1ktestfrom10kwithdifficultyclasses-without-difficult
4
  library_name: transformers
5
  model_name: Qwen3-1.7B-Open-R1-GRPO
6
  tags:
7
  - generated_from_trainer
8
- - open-r1
9
  - trl
10
  - grpo
11
  licence: license
@@ -13,7 +11,7 @@ licence: license
13
 
14
  # Model Card for Qwen3-1.7B-Open-R1-GRPO
15
 
16
- This model is a fine-tuned version of [Qwen/Qwen3-1.7B](https://huggingface.co/Qwen/Qwen3-1.7B) on the [Blancy/1ktestfrom10kwithdifficultyclasses-without-difficult](https://huggingface.co/datasets/Blancy/1ktestfrom10kwithdifficultyclasses-without-difficult) dataset.
17
  It has been trained using [TRL](https://github.com/huggingface/trl).
18
 
19
  ## Quick start
@@ -29,7 +27,7 @@ print(output["generated_text"])
29
 
30
  ## Training procedure
31
 
32
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/224015062-chinese-university-of-hong-kong-shenzhen/huggingface/runs/7kqetcgc)
33
 
34
 
35
  This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
 
1
  ---
2
  base_model: Qwen/Qwen3-1.7B
 
3
  library_name: transformers
4
  model_name: Qwen3-1.7B-Open-R1-GRPO
5
  tags:
6
  - generated_from_trainer
 
7
  - trl
8
  - grpo
9
  licence: license
 
11
 
12
  # Model Card for Qwen3-1.7B-Open-R1-GRPO
13
 
14
+ This model is a fine-tuned version of [Qwen/Qwen3-1.7B](https://huggingface.co/Qwen/Qwen3-1.7B).
15
  It has been trained using [TRL](https://github.com/huggingface/trl).
16
 
17
  ## Quick start
 
27
 
28
  ## Training procedure
29
 
30
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/224015062-chinese-university-of-hong-kong-shenzhen/huggingface/runs/qvxw0mg7)
31
 
32
 
33
  This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.004764942314540437,
4
- "train_runtime": 16049.8199,
5
  "train_samples": 1000,
6
- "train_samples_per_second": 0.062,
7
  "train_steps_per_second": 0.002
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 0.2708671883675033,
4
+ "train_runtime": 44954.8681,
5
  "train_samples": 1000,
6
+ "train_samples_per_second": 0.067,
7
  "train_steps_per_second": 0.002
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.004764942314540437,
4
- "train_runtime": 16049.8199,
5
  "train_samples": 1000,
6
- "train_samples_per_second": 0.062,
7
  "train_steps_per_second": 0.002
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 0.2708671883675033,
4
+ "train_runtime": 44954.8681,
5
  "train_samples": 1000,
6
+ "train_samples_per_second": 0.067,
7
  "train_steps_per_second": 0.002
8
  }
trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.0,
6
  "eval_steps": 500,
7
- "global_step": 36,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -12,7 +12,7 @@
12
  {
13
  "completion_length": 3512.2977294921875,
14
  "epoch": 0.027972027972027972,
15
- "grad_norm": 0.8605042695999146,
16
  "kl": 0.0,
17
  "learning_rate": 0.0,
18
  "loss": 0.0,
@@ -26,9 +26,9 @@
26
  {
27
  "completion_length": 3691.6666870117188,
28
  "epoch": 0.055944055944055944,
29
- "grad_norm": 0.378974974155426,
30
  "kl": 0.0,
31
- "learning_rate": 2.5e-06,
32
  "loss": 0.0,
33
  "reward": 0.4680059477686882,
34
  "reward_std": 0.1798743773251772,
@@ -38,495 +38,1503 @@
38
  "step": 2
39
  },
40
  {
41
- "completion_length": 3411.9434814453125,
42
  "epoch": 0.08391608391608392,
43
- "grad_norm": 0.5123236775398254,
44
- "kl": 0.000850677490234375,
45
- "learning_rate": 5e-06,
46
  "loss": 0.0,
47
- "reward": 0.5870535895228386,
48
- "reward_std": 0.15364255383610725,
49
- "rewards/accuracy_reward": 0.15773810120299459,
50
  "rewards/format_reward": 0.0,
51
- "rewards/tag_count_reward": 0.4293154776096344,
52
  "step": 3
53
  },
54
  {
55
- "completion_length": 3326.3631591796875,
56
  "epoch": 0.11188811188811189,
57
- "grad_norm": 0.6567738056182861,
58
- "kl": 0.0010671615600585938,
59
- "learning_rate": 7.500000000000001e-06,
60
  "loss": 0.0,
61
- "reward": 0.5238095447421074,
62
- "reward_std": 0.12606673501431942,
63
- "rewards/accuracy_reward": 0.11309523973613977,
64
  "rewards/format_reward": 0.0,
65
- "rewards/tag_count_reward": 0.4107142984867096,
66
  "step": 4
67
  },
68
  {
69
- "completion_length": 3617.0536499023438,
70
  "epoch": 0.13986013986013987,
71
- "grad_norm": 0.5138365030288696,
72
- "kl": 0.00595855712890625,
73
- "learning_rate": 1e-05,
74
- "loss": 0.0002,
75
- "reward": 0.5572916716337204,
76
- "reward_std": 0.17760900035500526,
77
- "rewards/accuracy_reward": 0.13690476398915052,
78
  "rewards/format_reward": 0.0,
79
- "rewards/tag_count_reward": 0.4203869104385376,
80
  "step": 5
81
  },
82
  {
83
- "completion_length": 3649.2351684570312,
84
  "epoch": 0.16783216783216784,
85
- "grad_norm": 10.930187225341797,
86
- "kl": 0.11383056640625,
87
- "learning_rate": 9.978331270024887e-06,
88
- "loss": 0.0045,
89
- "reward": 0.5364583432674408,
90
- "reward_std": 0.24191416427493095,
91
- "rewards/accuracy_reward": 0.05952381109818816,
92
- "rewards/format_reward": 0.0,
93
- "rewards/tag_count_reward": 0.4769345298409462,
94
  "step": 6
95
  },
96
  {
97
- "completion_length": 3823.256103515625,
98
  "epoch": 0.1958041958041958,
99
- "grad_norm": 0.22635069489479065,
100
- "kl": 0.02630615234375,
101
- "learning_rate": 9.913533761814537e-06,
102
- "loss": 0.0011,
103
- "reward": 0.5312499925494194,
104
- "reward_std": 0.16136238723993301,
105
- "rewards/accuracy_reward": 0.14880952797830105,
106
- "rewards/format_reward": 0.0,
107
- "rewards/tag_count_reward": 0.382440485060215,
108
  "step": 7
109
  },
110
  {
111
- "completion_length": 3570.291748046875,
112
  "epoch": 0.22377622377622378,
113
- "grad_norm": 0.19483903050422668,
114
- "kl": 0.03973388671875,
115
- "learning_rate": 9.80623151079494e-06,
116
- "loss": 0.0016,
117
- "reward": 0.4813988208770752,
118
- "reward_std": 0.15597185026854277,
119
- "rewards/accuracy_reward": 0.09226190880872309,
120
  "rewards/format_reward": 0.0,
121
- "rewards/tag_count_reward": 0.3891369104385376,
122
  "step": 8
123
  },
124
  {
125
- "completion_length": 3669.8363647460938,
126
  "epoch": 0.2517482517482518,
127
- "grad_norm": 0.3098791539669037,
128
- "kl": 0.0445556640625,
129
- "learning_rate": 9.65745789630079e-06,
130
- "loss": 0.0018,
131
- "reward": 0.5543154776096344,
132
- "reward_std": 0.1814326737076044,
133
- "rewards/accuracy_reward": 0.16369047993794084,
134
- "rewards/format_reward": 0.0,
135
- "rewards/tag_count_reward": 0.3906250149011612,
136
  "step": 9
137
  },
138
  {
139
- "completion_length": 3507.3483276367188,
140
  "epoch": 0.27972027972027974,
141
- "grad_norm": 0.23944704234600067,
142
- "kl": 0.04766845703125,
143
- "learning_rate": 9.468645689567599e-06,
144
- "loss": 0.0019,
145
- "reward": 0.6175595223903656,
146
- "reward_std": 0.20797590538859367,
147
- "rewards/accuracy_reward": 0.1994047649204731,
148
- "rewards/format_reward": 0.0,
149
- "rewards/tag_count_reward": 0.418154776096344,
150
  "step": 10
151
  },
152
  {
153
- "completion_length": 3486.0625610351562,
154
  "epoch": 0.3076923076923077,
155
- "grad_norm": 0.2252836674451828,
156
- "kl": 0.0560302734375,
157
- "learning_rate": 9.241613255361455e-06,
158
- "loss": 0.0022,
159
- "reward": 0.5111607238650322,
160
- "reward_std": 0.14905241318047047,
161
- "rewards/accuracy_reward": 0.1250000037252903,
162
  "rewards/format_reward": 0.0,
163
- "rewards/tag_count_reward": 0.3861607164144516,
164
  "step": 11
165
  },
166
  {
167
- "completion_length": 3626.5327758789062,
168
  "epoch": 0.3356643356643357,
169
- "grad_norm": 0.3740616738796234,
170
- "kl": 0.06890869140625,
171
- "learning_rate": 8.978547040132317e-06,
172
- "loss": 0.0028,
173
- "reward": 0.4546131119132042,
174
- "reward_std": 0.10877569299191236,
175
- "rewards/accuracy_reward": 0.07142857369035482,
176
- "rewards/format_reward": 0.0,
177
- "rewards/tag_count_reward": 0.3831845298409462,
178
  "step": 12
179
  },
180
  {
181
- "completion_length": 3609.3452758789062,
182
  "epoch": 0.36363636363636365,
183
- "grad_norm": 0.24548310041427612,
184
- "kl": 0.073486328125,
185
- "learning_rate": 8.681980515339464e-06,
186
- "loss": 0.0029,
187
- "reward": 0.7008928880095482,
188
- "reward_std": 0.1800659578293562,
189
- "rewards/accuracy_reward": 0.3184523917734623,
190
- "rewards/format_reward": 0.0,
191
- "rewards/tag_count_reward": 0.382440485060215,
192
  "step": 13
193
  },
194
  {
195
- "completion_length": 3744.2798461914062,
196
  "epoch": 0.3916083916083916,
197
- "grad_norm": 0.2982732057571411,
198
- "kl": 0.0880126953125,
199
- "learning_rate": 8.354769778736407e-06,
200
- "loss": 0.0035,
201
- "reward": 0.7291666865348816,
202
- "reward_std": 0.3022393621504307,
203
- "rewards/accuracy_reward": 0.3660714328289032,
204
- "rewards/format_reward": 0.0,
205
- "rewards/tag_count_reward": 0.3630952388048172,
206
  "step": 14
207
  },
208
  {
209
- "completion_length": 4129.886962890625,
210
  "epoch": 0.4195804195804196,
211
- "grad_norm": 0.21393992006778717,
212
- "kl": 0.09912109375,
213
- "learning_rate": 8.00006604858821e-06,
214
- "loss": 0.004,
215
- "reward": 0.5193452388048172,
216
- "reward_std": 0.18354132398962975,
217
- "rewards/accuracy_reward": 0.20833333395421505,
218
- "rewards/format_reward": 0.0,
219
- "rewards/tag_count_reward": 0.3110119104385376,
220
  "step": 15
221
  },
222
  {
223
- "completion_length": 4105.565612792969,
224
  "epoch": 0.44755244755244755,
225
- "grad_norm": 0.2648344337940216,
226
- "kl": 0.111572265625,
227
- "learning_rate": 7.621285315716991e-06,
228
- "loss": 0.0045,
229
- "reward": 0.5394345372915268,
230
- "reward_std": 0.20268973521888256,
231
- "rewards/accuracy_reward": 0.22023809514939785,
232
- "rewards/format_reward": 0.0,
233
- "rewards/tag_count_reward": 0.3191964328289032,
234
  "step": 16
235
  },
236
  {
237
- "completion_length": 4122.520812988281,
238
  "epoch": 0.4755244755244755,
239
- "grad_norm": 0.18669383227825165,
240
- "kl": 0.135986328125,
241
- "learning_rate": 7.222075445642904e-06,
242
- "loss": 0.0054,
243
- "reward": 0.538690485060215,
244
- "reward_std": 0.12965587340295315,
245
- "rewards/accuracy_reward": 0.22619048180058599,
246
- "rewards/format_reward": 0.0,
247
- "rewards/tag_count_reward": 0.3125000074505806,
248
  "step": 17
249
  },
250
  {
251
- "completion_length": 4098.357116699219,
252
  "epoch": 0.5034965034965035,
253
- "grad_norm": 0.12457378208637238,
254
- "kl": 0.130126953125,
255
- "learning_rate": 6.80628104764508e-06,
256
- "loss": 0.0052,
257
- "reward": 0.5104166865348816,
258
- "reward_std": 0.1256360486149788,
259
- "rewards/accuracy_reward": 0.1994047632906586,
260
- "rewards/format_reward": 0.0,
261
- "rewards/tag_count_reward": 0.3110119104385376,
262
  "step": 18
263
  },
264
  {
265
- "completion_length": 3781.9761962890625,
266
  "epoch": 0.5314685314685315,
267
- "grad_norm": 0.1671716421842575,
268
- "kl": 0.1495361328125,
269
- "learning_rate": 6.377906449072578e-06,
270
- "loss": 0.006,
271
- "reward": 0.70238097012043,
272
- "reward_std": 0.2050741408020258,
273
- "rewards/accuracy_reward": 0.354166679084301,
274
- "rewards/format_reward": 0.0,
275
- "rewards/tag_count_reward": 0.348214291036129,
276
  "step": 19
277
  },
278
  {
279
- "completion_length": 4104.199401855469,
280
  "epoch": 0.5594405594405595,
281
- "grad_norm": 0.12906776368618011,
282
- "kl": 0.164306640625,
283
- "learning_rate": 5.9410771314830255e-06,
284
- "loss": 0.0066,
285
- "reward": 0.4784226268529892,
286
- "reward_std": 0.11611867044121027,
287
- "rewards/accuracy_reward": 0.18154762499034405,
288
  "rewards/format_reward": 0.0,
289
- "rewards/tag_count_reward": 0.296875,
290
  "step": 20
291
  },
292
  {
293
- "completion_length": 4042.2559814453125,
294
  "epoch": 0.5874125874125874,
295
- "grad_norm": 0.26009050011634827,
296
- "kl": 0.157470703125,
297
- "learning_rate": 5.500000000000001e-06,
298
- "loss": 0.0063,
299
- "reward": 0.5096726417541504,
300
- "reward_std": 0.19976547360420227,
301
- "rewards/accuracy_reward": 0.19642857555299997,
302
  "rewards/format_reward": 0.0,
303
  "rewards/tag_count_reward": 0.3132440522313118,
304
  "step": 21
305
  },
306
  {
307
- "completion_length": 4085.7352905273438,
308
  "epoch": 0.6153846153846154,
309
- "grad_norm": 0.1436339169740677,
310
- "kl": 0.16455078125,
311
- "learning_rate": 5.0589228685169776e-06,
312
- "loss": 0.0066,
313
- "reward": 0.5535714402794838,
314
- "reward_std": 0.15361771546304226,
315
- "rewards/accuracy_reward": 0.2380952388048172,
316
  "rewards/format_reward": 0.0,
317
- "rewards/tag_count_reward": 0.315476194024086,
318
  "step": 22
319
  },
320
  {
321
- "completion_length": 3802.232177734375,
322
  "epoch": 0.6433566433566433,
323
- "grad_norm": 0.5061047673225403,
324
- "kl": 0.18212890625,
325
- "learning_rate": 4.622093550927423e-06,
326
- "loss": 0.0073,
327
- "reward": 0.693452388048172,
328
- "reward_std": 0.22774529829621315,
329
- "rewards/accuracy_reward": 0.3511904813349247,
330
- "rewards/format_reward": 0.0,
331
- "rewards/tag_count_reward": 0.3422619104385376,
332
  "step": 23
333
  },
334
  {
335
- "completion_length": 3550.8185424804688,
336
  "epoch": 0.6713286713286714,
337
- "grad_norm": 0.14333468675613403,
338
- "kl": 0.171875,
339
- "learning_rate": 4.193718952354921e-06,
340
- "loss": 0.0069,
341
- "reward": 0.7083333432674408,
342
- "reward_std": 0.1849147491157055,
343
- "rewards/accuracy_reward": 0.3511904813349247,
344
  "rewards/format_reward": 0.0,
345
- "rewards/tag_count_reward": 0.3571428582072258,
346
  "step": 24
347
  },
348
  {
349
- "completion_length": 3492.514892578125,
350
  "epoch": 0.6993006993006993,
351
- "grad_norm": 0.14843598008155823,
352
- "kl": 0.174072265625,
353
- "learning_rate": 3.777924554357096e-06,
354
- "loss": 0.007,
355
- "reward": 0.8318452537059784,
356
- "reward_std": 0.24168619140982628,
357
- "rewards/accuracy_reward": 0.4494047686457634,
358
  "rewards/format_reward": 0.0,
359
- "rewards/tag_count_reward": 0.382440485060215,
360
  "step": 25
361
  },
362
  {
363
- "completion_length": 3533.2828369140625,
364
  "epoch": 0.7272727272727273,
365
- "grad_norm": 0.13751864433288574,
366
- "kl": 0.18212890625,
367
- "learning_rate": 3.378714684283011e-06,
368
- "loss": 0.0073,
369
- "reward": 0.737351194024086,
370
- "reward_std": 0.16352506168186665,
371
- "rewards/accuracy_reward": 0.380952388048172,
372
- "rewards/format_reward": 0.0,
373
- "rewards/tag_count_reward": 0.3563988134264946,
374
  "step": 26
375
  },
376
  {
377
- "completion_length": 3208.8630981445312,
378
  "epoch": 0.7552447552447552,
379
- "grad_norm": 0.47788581252098083,
380
- "kl": 0.20458984375,
381
- "learning_rate": 2.9999339514117913e-06,
382
- "loss": 0.0082,
383
- "reward": 0.7946428656578064,
384
- "reward_std": 0.18282870575785637,
385
- "rewards/accuracy_reward": 0.3898809514939785,
386
- "rewards/format_reward": 0.0,
387
- "rewards/tag_count_reward": 0.4047619104385376,
388
  "step": 27
389
  },
390
  {
391
- "completion_length": 3646.8929443359375,
392
  "epoch": 0.7832167832167832,
393
- "grad_norm": 0.1616424322128296,
394
- "kl": 0.191162109375,
395
- "learning_rate": 2.645230221263596e-06,
396
- "loss": 0.0076,
397
- "reward": 0.5208333432674408,
398
- "reward_std": 0.13056958466768265,
399
- "rewards/accuracy_reward": 0.16071428847499192,
400
- "rewards/format_reward": 0.0,
401
- "rewards/tag_count_reward": 0.3601190522313118,
402
  "step": 28
403
  },
404
  {
405
- "completion_length": 3597.824462890625,
406
  "epoch": 0.8111888111888111,
407
- "grad_norm": 0.2858404815196991,
408
- "kl": 0.168212890625,
409
- "learning_rate": 2.3180194846605367e-06,
410
- "loss": 0.0067,
411
- "reward": 0.6830357164144516,
412
- "reward_std": 0.22093605995178223,
413
- "rewards/accuracy_reward": 0.2946428619325161,
414
  "rewards/format_reward": 0.0,
415
- "rewards/tag_count_reward": 0.3883928582072258,
416
  "step": 29
417
  },
418
  {
419
- "completion_length": 3639.58935546875,
420
  "epoch": 0.8391608391608392,
421
- "grad_norm": 0.18465252220630646,
422
- "kl": 0.197021484375,
423
- "learning_rate": 2.021452959867684e-06,
424
- "loss": 0.0079,
425
- "reward": 0.5840773954987526,
426
- "reward_std": 0.2365283314138651,
427
- "rewards/accuracy_reward": 0.20535714458674192,
428
- "rewards/format_reward": 0.0,
429
- "rewards/tag_count_reward": 0.3787202388048172,
430
  "step": 30
431
  },
432
  {
433
- "completion_length": 3431.7291870117188,
434
  "epoch": 0.8671328671328671,
435
- "grad_norm": 0.18510738015174866,
436
- "kl": 0.185302734375,
437
- "learning_rate": 1.7583867446385461e-06,
438
- "loss": 0.0074,
439
- "reward": 0.7075892835855484,
440
- "reward_std": 0.22301654145121574,
441
- "rewards/accuracy_reward": 0.3065476305782795,
442
  "rewards/format_reward": 0.0,
443
- "rewards/tag_count_reward": 0.4010416716337204,
444
  "step": 31
445
  },
446
  {
447
- "completion_length": 3274.8601684570312,
448
  "epoch": 0.8951048951048951,
449
- "grad_norm": 0.15279972553253174,
450
- "kl": 0.181396484375,
451
- "learning_rate": 1.531354310432403e-06,
452
- "loss": 0.0073,
453
- "reward": 0.6845238208770752,
454
- "reward_std": 0.08698987402021885,
455
- "rewards/accuracy_reward": 0.2886904813349247,
456
  "rewards/format_reward": 0.0,
457
- "rewards/tag_count_reward": 0.3958333432674408,
458
  "step": 32
459
  },
460
  {
461
- "completion_length": 3458.9524536132812,
462
  "epoch": 0.9230769230769231,
463
- "grad_norm": 0.18014280498027802,
464
- "kl": 0.208984375,
465
- "learning_rate": 1.3425421036992098e-06,
466
- "loss": 0.0083,
467
- "reward": 0.7135416865348816,
468
- "reward_std": 0.207016596570611,
469
- "rewards/accuracy_reward": 0.3154761977493763,
470
  "rewards/format_reward": 0.0,
471
- "rewards/tag_count_reward": 0.398065485060215,
472
  "step": 33
473
  },
474
  {
475
- "completion_length": 3083.3065795898438,
476
  "epoch": 0.951048951048951,
477
- "grad_norm": 0.5020937323570251,
478
- "kl": 0.207275390625,
479
- "learning_rate": 1.1937684892050606e-06,
480
- "loss": 0.0083,
481
- "reward": 0.7477678805589676,
482
- "reward_std": 0.21528683602809906,
483
- "rewards/accuracy_reward": 0.3214285746216774,
484
  "rewards/format_reward": 0.0,
485
- "rewards/tag_count_reward": 0.4263392984867096,
486
  "step": 34
487
  },
488
  {
489
- "completion_length": 3171.6221313476562,
490
  "epoch": 0.9790209790209791,
491
- "grad_norm": 0.17938590049743652,
492
- "kl": 0.200439453125,
493
- "learning_rate": 1.0864662381854632e-06,
494
- "loss": 0.008,
495
- "reward": 0.721726194024086,
496
- "reward_std": 0.15224073268473148,
497
- "rewards/accuracy_reward": 0.31845238991081715,
498
  "rewards/format_reward": 0.0,
499
- "rewards/tag_count_reward": 0.4032738208770752,
500
  "step": 35
501
  },
502
  {
503
- "completion_length": 3707.666748046875,
504
  "epoch": 1.0,
505
- "grad_norm": 0.17938590049743652,
506
- "kl": 0.20670572916666666,
507
- "learning_rate": 1.0216687299751146e-06,
508
- "loss": 0.0062,
509
- "reward": 0.460317462682724,
510
- "reward_std": 0.09649834036827087,
511
- "rewards/accuracy_reward": 0.11111111473292112,
512
- "rewards/format_reward": 0.0,
513
- "rewards/tag_count_reward": 0.3492063581943512,
514
  "step": 36
515
  },
516
  {
517
- "epoch": 1.0,
518
- "step": 36,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
519
  "total_flos": 0.0,
520
- "train_loss": 0.004764942314540437,
521
- "train_runtime": 16049.8199,
522
- "train_samples_per_second": 0.062,
523
  "train_steps_per_second": 0.002
524
  }
525
  ],
526
  "logging_steps": 1,
527
- "max_steps": 36,
528
  "num_input_tokens_seen": 0,
529
- "num_train_epochs": 1,
530
  "save_steps": 500,
531
  "stateful_callbacks": {
532
  "TrainerControl": {
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 3.0,
6
  "eval_steps": 500,
7
+ "global_step": 108,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
12
  {
13
  "completion_length": 3512.2977294921875,
14
  "epoch": 0.027972027972027972,
15
+ "grad_norm": 0.8601931929588318,
16
  "kl": 0.0,
17
  "learning_rate": 0.0,
18
  "loss": 0.0,
 
26
  {
27
  "completion_length": 3691.6666870117188,
28
  "epoch": 0.055944055944055944,
29
+ "grad_norm": 0.37887123227119446,
30
  "kl": 0.0,
31
+ "learning_rate": 9.090909090909091e-07,
32
  "loss": 0.0,
33
  "reward": 0.4680059477686882,
34
  "reward_std": 0.1798743773251772,
 
38
  "step": 2
39
  },
40
  {
41
+ "completion_length": 3442.3929443359375,
42
  "epoch": 0.08391608391608392,
43
+ "grad_norm": 0.6051873564720154,
44
+ "kl": 0.0008230209350585938,
45
+ "learning_rate": 1.8181818181818183e-06,
46
  "loss": 0.0,
47
+ "reward": 0.5766369253396988,
48
+ "reward_std": 0.12715193443000317,
49
+ "rewards/accuracy_reward": 0.1547619104385376,
50
  "rewards/format_reward": 0.0,
51
+ "rewards/tag_count_reward": 0.4218750074505806,
52
  "step": 3
53
  },
54
  {
55
+ "completion_length": 3409.6935424804688,
56
  "epoch": 0.11188811188811189,
57
+ "grad_norm": 1.0704518556594849,
58
+ "kl": 0.0009069442749023438,
59
+ "learning_rate": 2.7272727272727272e-06,
60
  "loss": 0.0,
61
+ "reward": 0.5037202537059784,
62
+ "reward_std": 0.12088673282414675,
63
+ "rewards/accuracy_reward": 0.10416666883975267,
64
  "rewards/format_reward": 0.0,
65
+ "rewards/tag_count_reward": 0.3995535746216774,
66
  "step": 4
67
  },
68
  {
69
+ "completion_length": 3750.7233276367188,
70
  "epoch": 0.13986013986013987,
71
+ "grad_norm": 1.2529385089874268,
72
+ "kl": 0.001476287841796875,
73
+ "learning_rate": 3.6363636363636366e-06,
74
+ "loss": 0.0001,
75
+ "reward": 0.495535708963871,
76
+ "reward_std": 0.149957662448287,
77
+ "rewards/accuracy_reward": 0.12202381482347846,
78
  "rewards/format_reward": 0.0,
79
+ "rewards/tag_count_reward": 0.373511902987957,
80
  "step": 5
81
  },
82
  {
83
+ "completion_length": 3797.9166870117188,
84
  "epoch": 0.16783216783216784,
85
+ "grad_norm": 0.43379437923431396,
86
+ "kl": 0.00420379638671875,
87
+ "learning_rate": 4.5454545454545455e-06,
88
+ "loss": 0.0002,
89
+ "reward": 0.4501488134264946,
90
+ "reward_std": 0.15580809116363525,
91
+ "rewards/accuracy_reward": 0.07738095289096236,
92
+ "rewards/format_reward": 0.0,
93
+ "rewards/tag_count_reward": 0.3727678656578064,
94
  "step": 6
95
  },
96
  {
97
+ "completion_length": 3904.8424072265625,
98
  "epoch": 0.1958041958041958,
99
+ "grad_norm": 0.2574431300163269,
100
+ "kl": 0.0105133056640625,
101
+ "learning_rate": 5.4545454545454545e-06,
102
+ "loss": 0.0004,
103
+ "reward": 0.4895833358168602,
104
+ "reward_std": 0.18993620201945305,
105
+ "rewards/accuracy_reward": 0.11607143236324191,
106
+ "rewards/format_reward": 0.0,
107
+ "rewards/tag_count_reward": 0.373511902987957,
108
  "step": 7
109
  },
110
  {
111
+ "completion_length": 3614.2798461914062,
112
  "epoch": 0.22377622377622378,
113
+ "grad_norm": 0.6184343695640564,
114
+ "kl": 0.018280029296875,
115
+ "learning_rate": 6.363636363636364e-06,
116
+ "loss": 0.0007,
117
+ "reward": 0.524553582072258,
118
+ "reward_std": 0.20281112380325794,
119
+ "rewards/accuracy_reward": 0.09226190717890859,
120
  "rewards/format_reward": 0.0,
121
+ "rewards/tag_count_reward": 0.432291679084301,
122
  "step": 8
123
  },
124
  {
125
+ "completion_length": 3627.4553833007812,
126
  "epoch": 0.2517482517482518,
127
+ "grad_norm": 0.2969047725200653,
128
+ "kl": 0.024139404296875,
129
+ "learning_rate": 7.272727272727273e-06,
130
+ "loss": 0.001,
131
+ "reward": 0.5773809626698494,
132
+ "reward_std": 0.21523480489850044,
133
+ "rewards/accuracy_reward": 0.16369048040360212,
134
+ "rewards/format_reward": 0.0,
135
+ "rewards/tag_count_reward": 0.4136904776096344,
136
  "step": 9
137
  },
138
  {
139
+ "completion_length": 3487.21435546875,
140
  "epoch": 0.27972027972027974,
141
+ "grad_norm": 0.24078179895877838,
142
+ "kl": 0.024017333984375,
143
+ "learning_rate": 8.181818181818183e-06,
144
+ "loss": 0.001,
145
+ "reward": 0.6346726417541504,
146
+ "reward_std": 0.2543078139424324,
147
+ "rewards/accuracy_reward": 0.1904761977493763,
148
+ "rewards/format_reward": 0.0,
149
+ "rewards/tag_count_reward": 0.4441964402794838,
150
  "step": 10
151
  },
152
  {
153
+ "completion_length": 3455.8631591796875,
154
  "epoch": 0.3076923076923077,
155
+ "grad_norm": 0.4435509741306305,
156
+ "kl": 0.033843994140625,
157
+ "learning_rate": 9.090909090909091e-06,
158
+ "loss": 0.0014,
159
+ "reward": 0.5729166865348816,
160
+ "reward_std": 0.18820499069988728,
161
+ "rewards/accuracy_reward": 0.14285714365541935,
162
  "rewards/format_reward": 0.0,
163
+ "rewards/tag_count_reward": 0.4300595372915268,
164
  "step": 11
165
  },
166
  {
167
+ "completion_length": 3493.541748046875,
168
  "epoch": 0.3356643356643357,
169
+ "grad_norm": 0.23296485841274261,
170
+ "kl": 0.0333251953125,
171
+ "learning_rate": 1e-05,
172
+ "loss": 0.0013,
173
+ "reward": 0.5052083358168602,
174
+ "reward_std": 0.16948581114411354,
175
+ "rewards/accuracy_reward": 0.09821428847499192,
176
+ "rewards/format_reward": 0.0,
177
+ "rewards/tag_count_reward": 0.4069940596818924,
178
  "step": 12
179
  },
180
  {
181
+ "completion_length": 3424.3751220703125,
182
  "epoch": 0.36363636363636365,
183
+ "grad_norm": 131.0524139404297,
184
+ "kl": 0.958984375,
185
+ "learning_rate": 9.997640060704818e-06,
186
+ "loss": 0.0384,
187
+ "reward": 0.7269345447421074,
188
+ "reward_std": 0.20091417245566845,
189
+ "rewards/accuracy_reward": 0.3273809589445591,
190
+ "rewards/format_reward": 0.0,
191
+ "rewards/tag_count_reward": 0.3995535746216774,
192
  "step": 13
193
  },
194
  {
195
+ "completion_length": 3556.96435546875,
196
  "epoch": 0.3916083916083916,
197
+ "grad_norm": 0.4128544330596924,
198
+ "kl": 0.051025390625,
199
+ "learning_rate": 9.990562718069703e-06,
200
+ "loss": 0.002,
201
+ "reward": 0.7678571492433548,
202
+ "reward_std": 0.29729820415377617,
203
+ "rewards/accuracy_reward": 0.3750000186264515,
204
+ "rewards/format_reward": 0.0,
205
+ "rewards/tag_count_reward": 0.3928571492433548,
206
  "step": 14
207
  },
208
  {
209
+ "completion_length": 3937.729248046875,
210
  "epoch": 0.4195804195804196,
211
+ "grad_norm": 0.3086971938610077,
212
+ "kl": 0.0550537109375,
213
+ "learning_rate": 9.978775395249763e-06,
214
+ "loss": 0.0022,
215
+ "reward": 0.5550595372915268,
216
+ "reward_std": 0.16711290925741196,
217
+ "rewards/accuracy_reward": 0.2291666716337204,
218
+ "rewards/format_reward": 0.0,
219
+ "rewards/tag_count_reward": 0.3258928656578064,
220
  "step": 15
221
  },
222
  {
223
+ "completion_length": 3943.59228515625,
224
  "epoch": 0.44755244755244755,
225
+ "grad_norm": 0.543189525604248,
226
+ "kl": 0.064453125,
227
+ "learning_rate": 9.962290455518914e-06,
228
+ "loss": 0.0026,
229
+ "reward": 0.584077388048172,
230
+ "reward_std": 0.2332012578845024,
231
+ "rewards/accuracy_reward": 0.24107143003493547,
232
+ "rewards/format_reward": 0.0,
233
+ "rewards/tag_count_reward": 0.3430059626698494,
234
  "step": 16
235
  },
236
  {
237
+ "completion_length": 4002.994140625,
238
  "epoch": 0.4755244755244755,
239
+ "grad_norm": 0.17314046621322632,
240
+ "kl": 0.08203125,
241
+ "learning_rate": 9.941125189302508e-06,
242
+ "loss": 0.0033,
243
+ "reward": 0.5959821566939354,
244
+ "reward_std": 0.18275887705385685,
245
+ "rewards/accuracy_reward": 0.2619047621265054,
246
+ "rewards/format_reward": 0.0,
247
+ "rewards/tag_count_reward": 0.3340773805975914,
248
  "step": 17
249
  },
250
  {
251
+ "completion_length": 4070.3750610351562,
252
  "epoch": 0.5034965034965035,
253
+ "grad_norm": 0.18910881876945496,
254
+ "kl": 0.0848388671875,
255
+ "learning_rate": 9.915301796042076e-06,
256
+ "loss": 0.0034,
257
+ "reward": 0.5171131044626236,
258
+ "reward_std": 0.16867330251261592,
259
+ "rewards/accuracy_reward": 0.19940475933253765,
260
+ "rewards/format_reward": 0.0,
261
+ "rewards/tag_count_reward": 0.3177083358168602,
262
  "step": 18
263
  },
264
  {
265
+ "completion_length": 3723.9761962890625,
266
  "epoch": 0.5314685314685315,
267
+ "grad_norm": 0.233817920088768,
268
+ "kl": 0.1026611328125,
269
+ "learning_rate": 9.884847360911168e-06,
270
+ "loss": 0.0041,
271
+ "reward": 0.7380952388048172,
272
+ "reward_std": 0.1809748988598585,
273
+ "rewards/accuracy_reward": 0.377976194024086,
274
+ "rewards/format_reward": 0.0,
275
+ "rewards/tag_count_reward": 0.3601190596818924,
276
  "step": 19
277
  },
278
  {
279
+ "completion_length": 4125.550720214844,
280
  "epoch": 0.5594405594405595,
281
+ "grad_norm": 0.12808525562286377,
282
+ "kl": 0.12451171875,
283
+ "learning_rate": 9.849793826406752e-06,
284
+ "loss": 0.005,
285
+ "reward": 0.4702381044626236,
286
+ "reward_std": 0.15858174674212933,
287
+ "rewards/accuracy_reward": 0.17261905409395695,
288
  "rewards/format_reward": 0.0,
289
+ "rewards/tag_count_reward": 0.2976190596818924,
290
  "step": 20
291
  },
292
  {
293
+ "completion_length": 4082.4910888671875,
294
  "epoch": 0.5874125874125874,
295
+ "grad_norm": 0.2625791132450104,
296
+ "kl": 0.1243896484375,
297
+ "learning_rate": 9.810177958845942e-06,
298
+ "loss": 0.005,
299
+ "reward": 0.494791679084301,
300
+ "reward_std": 0.1791169587522745,
301
+ "rewards/accuracy_reward": 0.18154762033373117,
302
  "rewards/format_reward": 0.0,
303
  "rewards/tag_count_reward": 0.3132440522313118,
304
  "step": 21
305
  },
306
  {
307
+ "completion_length": 4112.172607421875,
308
  "epoch": 0.6153846153846154,
309
+ "grad_norm": 0.2053973376750946,
310
+ "kl": 0.13818359375,
311
+ "learning_rate": 9.766041309803218e-06,
312
+ "loss": 0.0055,
313
+ "reward": 0.5200892984867096,
314
+ "reward_std": 0.11136791668832302,
315
+ "rewards/accuracy_reward": 0.21130952797830105,
316
  "rewards/format_reward": 0.0,
317
+ "rewards/tag_count_reward": 0.3087797611951828,
318
  "step": 22
319
  },
320
  {
321
+ "completion_length": 3917.4761962890625,
322
  "epoch": 0.6433566433566433,
323
+ "grad_norm": 0.19135095179080963,
324
+ "kl": 0.160400390625,
325
+ "learning_rate": 9.717430172528548e-06,
326
+ "loss": 0.0064,
327
+ "reward": 0.6279762163758278,
328
+ "reward_std": 0.19610902667045593,
329
+ "rewards/accuracy_reward": 0.2976190522313118,
330
+ "rewards/format_reward": 0.0,
331
+ "rewards/tag_count_reward": 0.3303571417927742,
332
  "step": 23
333
  },
334
  {
335
+ "completion_length": 3670.77099609375,
336
  "epoch": 0.6713286713286714,
337
+ "grad_norm": 0.16754572093486786,
338
+ "kl": 0.16552734375,
339
+ "learning_rate": 9.66439553339217e-06,
340
+ "loss": 0.0066,
341
+ "reward": 0.67113097012043,
342
+ "reward_std": 0.1715894378721714,
343
+ "rewards/accuracy_reward": 0.3244047649204731,
344
  "rewards/format_reward": 0.0,
345
+ "rewards/tag_count_reward": 0.3467261865735054,
346
  "step": 24
347
  },
348
  {
349
+ "completion_length": 3596.1072387695312,
350
  "epoch": 0.6993006993006993,
351
+ "grad_norm": 0.2030419558286667,
352
+ "kl": 0.1728515625,
353
+ "learning_rate": 9.606993018406931e-06,
354
+ "loss": 0.0069,
355
+ "reward": 0.7254464328289032,
356
+ "reward_std": 0.26019805669784546,
357
+ "rewards/accuracy_reward": 0.351190485060215,
358
  "rewards/format_reward": 0.0,
359
+ "rewards/tag_count_reward": 0.3742559626698494,
360
  "step": 25
361
  },
362
  {
363
+ "completion_length": 3640.6488037109375,
364
  "epoch": 0.7272727272727273,
365
+ "grad_norm": 61.024208068847656,
366
+ "kl": 1.417724609375,
367
+ "learning_rate": 9.54528283488428e-06,
368
+ "loss": 0.0568,
369
+ "reward": 0.7016369253396988,
370
+ "reward_std": 0.12191555928438902,
371
+ "rewards/accuracy_reward": 0.3571428656578064,
372
+ "rewards/format_reward": 0.0,
373
+ "rewards/tag_count_reward": 0.3444940522313118,
374
  "step": 26
375
  },
376
  {
377
+ "completion_length": 3402.327392578125,
378
  "epoch": 0.7552447552447552,
379
+ "grad_norm": 0.19034799933433533,
380
+ "kl": 0.213623046875,
381
+ "learning_rate": 9.479329708285107e-06,
382
+ "loss": 0.0085,
383
+ "reward": 0.7440476268529892,
384
+ "reward_std": 0.22099306993186474,
385
+ "rewards/accuracy_reward": 0.3482142873108387,
386
+ "rewards/format_reward": 0.0,
387
+ "rewards/tag_count_reward": 0.3958333432674408,
388
  "step": 27
389
  },
390
  {
391
+ "completion_length": 3721.1815795898438,
392
  "epoch": 0.7832167832167832,
393
+ "grad_norm": 0.41007938981056213,
394
+ "kl": 0.23095703125,
395
+ "learning_rate": 9.40920281433168e-06,
396
+ "loss": 0.0092,
397
+ "reward": 0.4918154776096344,
398
+ "reward_std": 0.14000625722110271,
399
+ "rewards/accuracy_reward": 0.13988095335662365,
400
+ "rewards/format_reward": 0.0,
401
+ "rewards/tag_count_reward": 0.3519345298409462,
402
  "step": 28
403
  },
404
  {
405
+ "completion_length": 3638.3036499023438,
406
  "epoch": 0.8111888111888111,
407
+ "grad_norm": 0.16842831671237946,
408
+ "kl": 0.206298828125,
409
+ "learning_rate": 9.334975706451863e-06,
410
+ "loss": 0.0083,
411
+ "reward": 0.6986607238650322,
412
+ "reward_std": 0.2177316304296255,
413
+ "rewards/accuracy_reward": 0.3035714365541935,
414
  "rewards/format_reward": 0.0,
415
+ "rewards/tag_count_reward": 0.3950892984867096,
416
  "step": 29
417
  },
418
  {
419
+ "completion_length": 3669.9852294921875,
420
  "epoch": 0.8391608391608392,
421
+ "grad_norm": 0.3207129240036011,
422
+ "kl": 0.263671875,
423
+ "learning_rate": 9.256726238631721e-06,
424
+ "loss": 0.0105,
425
+ "reward": 0.5602678805589676,
426
+ "reward_std": 0.17984714545309544,
427
+ "rewards/accuracy_reward": 0.19345238246023655,
428
+ "rewards/format_reward": 0.0,
429
+ "rewards/tag_count_reward": 0.3668154776096344,
430
  "step": 30
431
  },
432
  {
433
+ "completion_length": 3451.3572387695312,
434
  "epoch": 0.8671328671328671,
435
+ "grad_norm": 0.2888355553150177,
436
+ "kl": 0.24609375,
437
+ "learning_rate": 9.174536483757449e-06,
438
+ "loss": 0.0099,
439
+ "reward": 0.6800595372915268,
440
+ "reward_std": 0.22582022473216057,
441
+ "rewards/accuracy_reward": 0.2767857201397419,
442
  "rewards/format_reward": 0.0,
443
+ "rewards/tag_count_reward": 0.4032738208770752,
444
  "step": 31
445
  },
446
  {
447
+ "completion_length": 3316.4137573242188,
448
  "epoch": 0.8951048951048951,
449
+ "grad_norm": 0.1782606989145279,
450
+ "kl": 0.2578125,
451
+ "learning_rate": 9.088492647532244e-06,
452
+ "loss": 0.0103,
453
+ "reward": 0.6897321492433548,
454
+ "reward_std": 0.14058080688118935,
455
+ "rewards/accuracy_reward": 0.2976190559566021,
456
  "rewards/format_reward": 0.0,
457
+ "rewards/tag_count_reward": 0.3921131044626236,
458
  "step": 32
459
  },
460
  {
461
+ "completion_length": 3457.824462890625,
462
  "epoch": 0.9230769230769231,
463
+ "grad_norm": 0.20152917504310608,
464
+ "kl": 0.3046875,
465
+ "learning_rate": 8.998684978058423e-06,
466
+ "loss": 0.0122,
467
+ "reward": 0.7008928656578064,
468
+ "reward_std": 0.23007170855998993,
469
+ "rewards/accuracy_reward": 0.3035714328289032,
470
  "rewards/format_reward": 0.0,
471
+ "rewards/tag_count_reward": 0.3973214402794838,
472
  "step": 33
473
  },
474
  {
475
+ "completion_length": 3092.1190795898438,
476
  "epoch": 0.951048951048951,
477
+ "grad_norm": 0.20092906057834625,
478
+ "kl": 0.3056640625,
479
+ "learning_rate": 8.905207671179629e-06,
480
+ "loss": 0.0122,
481
+ "reward": 0.7663690745830536,
482
+ "reward_std": 0.22053436189889908,
483
+ "rewards/accuracy_reward": 0.3333333358168602,
484
  "rewards/format_reward": 0.0,
485
+ "rewards/tag_count_reward": 0.4330357238650322,
486
  "step": 34
487
  },
488
  {
489
+ "completion_length": 3940.1637573242188,
490
  "epoch": 0.9790209790209791,
491
+ "grad_norm": 0.1926574409008026,
492
+ "kl": 0.3583984375,
493
+ "learning_rate": 8.808158771682402e-06,
494
+ "loss": 0.0143,
495
+ "reward": 0.4360119104385376,
496
+ "reward_std": 0.14251151774078608,
497
+ "rewards/accuracy_reward": 0.11607143329456449,
498
  "rewards/format_reward": 0.0,
499
+ "rewards/tag_count_reward": 0.319940485060215,
500
  "step": 35
501
  },
502
  {
503
+ "completion_length": 4359.97802734375,
504
  "epoch": 1.0,
505
+ "grad_norm": 0.1926574409008026,
506
+ "kl": 0.400390625,
507
+ "learning_rate": 8.707640070460733e-06,
508
+ "loss": 0.012,
509
+ "reward": 0.3730158706506093,
510
+ "reward_std": 0.13879583527644476,
511
+ "rewards/accuracy_reward": 0.07936508022248745,
512
+ "rewards/format_reward": 0.0,
513
+ "rewards/tag_count_reward": 0.2936507960160573,
514
  "step": 36
515
  },
516
  {
517
+ "completion_length": 3477.6101684570312,
518
+ "epoch": 1.027972027972028,
519
+ "grad_norm": 0.21748770773410797,
520
+ "kl": 0.3212890625,
521
+ "learning_rate": 8.60375699775147e-06,
522
+ "loss": 0.0129,
523
+ "reward": 0.4784226194024086,
524
+ "reward_std": 0.12951527908444405,
525
+ "rewards/accuracy_reward": 0.09523809631355107,
526
+ "rewards/format_reward": 0.0,
527
+ "rewards/tag_count_reward": 0.3831845372915268,
528
+ "step": 37
529
+ },
530
+ {
531
+ "completion_length": 3679.9732666015625,
532
+ "epoch": 1.055944055944056,
533
+ "grad_norm": 0.16126495599746704,
534
+ "kl": 0.36572265625,
535
+ "learning_rate": 8.496618512552566e-06,
536
+ "loss": 0.0146,
537
+ "reward": 0.4895833432674408,
538
+ "reward_std": 0.15181067399680614,
539
+ "rewards/accuracy_reward": 0.11011904990300536,
540
+ "rewards/format_reward": 0.0,
541
+ "rewards/tag_count_reward": 0.3794642984867096,
542
+ "step": 38
543
+ },
544
+ {
545
+ "completion_length": 3395.2559814453125,
546
+ "epoch": 1.083916083916084,
547
+ "grad_norm": 0.31568872928619385,
548
+ "kl": 0.35546875,
549
+ "learning_rate": 8.38633698834013e-06,
550
+ "loss": 0.0142,
551
+ "reward": 0.563988097012043,
552
+ "reward_std": 0.16420039534568787,
553
+ "rewards/accuracy_reward": 0.13988095615059137,
554
+ "rewards/format_reward": 0.0,
555
+ "rewards/tag_count_reward": 0.4241071492433548,
556
+ "step": 39
557
+ },
558
+ {
559
+ "completion_length": 3251.02978515625,
560
+ "epoch": 1.1118881118881119,
561
+ "grad_norm": 0.28431281447410583,
562
+ "kl": 0.376953125,
563
+ "learning_rate": 8.273028095204174e-06,
564
+ "loss": 0.015,
565
+ "reward": 0.5565476268529892,
566
+ "reward_std": 0.11341512016952038,
567
+ "rewards/accuracy_reward": 0.1398809552192688,
568
+ "rewards/format_reward": 0.0,
569
+ "rewards/tag_count_reward": 0.416666679084301,
570
+ "step": 40
571
+ },
572
+ {
573
+ "completion_length": 3567.4495239257812,
574
+ "epoch": 1.1398601398601398,
575
+ "grad_norm": 0.28001177310943604,
576
+ "kl": 2.77392578125,
577
+ "learning_rate": 8.156810678526652e-06,
578
+ "loss": 0.1111,
579
+ "reward": 0.5498512089252472,
580
+ "reward_std": 0.1434172596782446,
581
+ "rewards/accuracy_reward": 0.16071428917348385,
582
+ "rewards/format_reward": 0.0,
583
+ "rewards/tag_count_reward": 0.3891369104385376,
584
+ "step": 41
585
+ },
586
+ {
587
+ "completion_length": 3581.7559814453125,
588
+ "epoch": 1.167832167832168,
589
+ "grad_norm": 164.8837127685547,
590
+ "kl": 0.478515625,
591
+ "learning_rate": 8.037806634329079e-06,
592
+ "loss": 0.0191,
593
+ "reward": 0.501488097012043,
594
+ "reward_std": 0.196690171957016,
595
+ "rewards/accuracy_reward": 0.10416666930541396,
596
+ "rewards/format_reward": 0.0,
597
+ "rewards/tag_count_reward": 0.3973214328289032,
598
+ "step": 42
599
+ },
600
+ {
601
+ "completion_length": 3735.9702758789062,
602
+ "epoch": 1.1958041958041958,
603
+ "grad_norm": 0.45882004499435425,
604
+ "kl": 0.5,
605
+ "learning_rate": 7.916140781420428e-06,
606
+ "loss": 0.02,
607
+ "reward": 0.5550595223903656,
608
+ "reward_std": 0.20235663652420044,
609
+ "rewards/accuracy_reward": 0.16666666977107525,
610
+ "rewards/format_reward": 0.0,
611
+ "rewards/tag_count_reward": 0.3883928582072258,
612
+ "step": 43
613
+ },
614
+ {
615
+ "completion_length": 3458.1281127929688,
616
+ "epoch": 1.2237762237762237,
617
+ "grad_norm": 0.7228833436965942,
618
+ "kl": 0.5859375,
619
+ "learning_rate": 7.791940730479435e-06,
620
+ "loss": 0.0234,
621
+ "reward": 0.5483631044626236,
622
+ "reward_std": 0.20634656865149736,
623
+ "rewards/accuracy_reward": 0.12797619216144085,
624
+ "rewards/format_reward": 0.0,
625
+ "rewards/tag_count_reward": 0.4203869104385376,
626
+ "step": 44
627
+ },
628
+ {
629
+ "completion_length": 3490.0357666015625,
630
+ "epoch": 1.2517482517482517,
631
+ "grad_norm": 0.43375658988952637,
632
+ "kl": 0.744140625,
633
+ "learning_rate": 7.665336750208624e-06,
634
+ "loss": 0.0297,
635
+ "reward": 0.5811012014746666,
636
+ "reward_std": 0.21103684417903423,
637
+ "rewards/accuracy_reward": 0.18452381365932524,
638
+ "rewards/format_reward": 0.0,
639
+ "rewards/tag_count_reward": 0.3965773954987526,
640
+ "step": 45
641
+ },
642
+ {
643
+ "completion_length": 3487.4435424804688,
644
+ "epoch": 1.2797202797202798,
645
+ "grad_norm": 1.1383850574493408,
646
+ "kl": 0.9482421875,
647
+ "learning_rate": 7.536461630700426e-06,
648
+ "loss": 0.0379,
649
+ "reward": 0.5952381193637848,
650
+ "reward_std": 0.24980730190873146,
651
+ "rewards/accuracy_reward": 0.16071428917348385,
652
+ "rewards/format_reward": 0.0,
653
+ "rewards/tag_count_reward": 0.4345238134264946,
654
+ "step": 46
655
+ },
656
+ {
657
+ "completion_length": 3268.75,
658
+ "epoch": 1.3076923076923077,
659
+ "grad_norm": 1.0813361406326294,
660
+ "kl": 1.525390625,
661
+ "learning_rate": 7.4054505441587075e-06,
662
+ "loss": 0.061,
663
+ "reward": 0.6049107313156128,
664
+ "reward_std": 0.17533794045448303,
665
+ "rewards/accuracy_reward": 0.11607143003493547,
666
+ "rewards/format_reward": 0.0,
667
+ "rewards/tag_count_reward": 0.488839291036129,
668
+ "step": 47
669
+ },
670
+ {
671
+ "completion_length": 2660.7113037109375,
672
+ "epoch": 1.3356643356643356,
673
+ "grad_norm": 9.25616455078125,
674
+ "kl": 1.716796875,
675
+ "learning_rate": 7.272440903121792e-06,
676
+ "loss": 0.0687,
677
+ "reward": 0.6532738357782364,
678
+ "reward_std": 0.17452342063188553,
679
+ "rewards/accuracy_reward": 0.13095238571986556,
680
+ "rewards/format_reward": 0.0,
681
+ "rewards/tag_count_reward": 0.522321417927742,
682
+ "step": 48
683
+ },
684
+ {
685
+ "completion_length": 2249.235137939453,
686
+ "epoch": 1.3636363636363638,
687
+ "grad_norm": 2.210108995437622,
688
+ "kl": 581.240234375,
689
+ "learning_rate": 7.137572216335695e-06,
690
+ "loss": 23.2541,
691
+ "reward": 0.7261904925107956,
692
+ "reward_std": 0.21592015214264393,
693
+ "rewards/accuracy_reward": 0.2172619104385376,
694
+ "rewards/format_reward": 0.0,
695
+ "rewards/tag_count_reward": 0.508928582072258,
696
+ "step": 49
697
+ },
698
+ {
699
+ "completion_length": 2210.3809814453125,
700
+ "epoch": 1.3916083916083917,
701
+ "grad_norm": 24608.181640625,
702
+ "kl": 1.755859375,
703
+ "learning_rate": 7.000985942428694e-06,
704
+ "loss": 0.0702,
705
+ "reward": 0.8735119253396988,
706
+ "reward_std": 0.29524174705147743,
707
+ "rewards/accuracy_reward": 0.3690476268529892,
708
+ "rewards/format_reward": 0.0,
709
+ "rewards/tag_count_reward": 0.5044643059372902,
710
+ "step": 50
711
+ },
712
+ {
713
+ "completion_length": 2277.0952758789062,
714
+ "epoch": 1.4195804195804196,
715
+ "grad_norm": 1.4279229640960693,
716
+ "kl": 2.052734375,
717
+ "learning_rate": 6.862825341540779e-06,
718
+ "loss": 0.0821,
719
+ "reward": 0.7269345223903656,
720
+ "reward_std": 0.25008974969387054,
721
+ "rewards/accuracy_reward": 0.21130953170359135,
722
+ "rewards/format_reward": 0.0,
723
+ "rewards/tag_count_reward": 0.515625,
724
+ "step": 51
725
+ },
726
+ {
727
+ "completion_length": 2418.7025146484375,
728
+ "epoch": 1.4475524475524475,
729
+ "grad_norm": 16.061054229736328,
730
+ "kl": 1.5625,
731
+ "learning_rate": 6.723235325063544e-06,
732
+ "loss": 0.0625,
733
+ "reward": 0.8162202537059784,
734
+ "reward_std": 0.2910061627626419,
735
+ "rewards/accuracy_reward": 0.3065476268529892,
736
+ "rewards/format_reward": 0.0,
737
+ "rewards/tag_count_reward": 0.509672611951828,
738
+ "step": 52
739
+ },
740
+ {
741
+ "completion_length": 2389.8839721679688,
742
+ "epoch": 1.4755244755244754,
743
+ "grad_norm": 1.9534995555877686,
744
+ "kl": 1.697265625,
745
+ "learning_rate": 6.582362303648142e-06,
746
+ "loss": 0.0679,
747
+ "reward": 0.82738097012043,
748
+ "reward_std": 0.27743372321128845,
749
+ "rewards/accuracy_reward": 0.3244047611951828,
750
+ "rewards/format_reward": 0.0,
751
+ "rewards/tag_count_reward": 0.5029762089252472,
752
+ "step": 53
753
+ },
754
+ {
755
+ "completion_length": 2719.7142944335938,
756
+ "epoch": 1.5034965034965035,
757
+ "grad_norm": 1.6721709966659546,
758
+ "kl": 1.46875,
759
+ "learning_rate": 6.440354033640739e-06,
760
+ "loss": 0.0588,
761
+ "reward": 0.7924107164144516,
762
+ "reward_std": 0.2318180948495865,
763
+ "rewards/accuracy_reward": 0.27976191882044077,
764
+ "rewards/format_reward": 0.0,
765
+ "rewards/tag_count_reward": 0.5126488208770752,
766
+ "step": 54
767
+ },
768
+ {
769
+ "completion_length": 2739.5684814453125,
770
+ "epoch": 1.5314685314685315,
771
+ "grad_norm": 4.0345778465271,
772
+ "kl": 1.296875,
773
+ "learning_rate": 6.297359462106504e-06,
774
+ "loss": 0.0519,
775
+ "reward": 0.9494047909975052,
776
+ "reward_std": 0.33292248100042343,
777
+ "rewards/accuracy_reward": 0.4226190596818924,
778
+ "rewards/format_reward": 0.0,
779
+ "rewards/tag_count_reward": 0.526785708963871,
780
+ "step": 55
781
+ },
782
+ {
783
+ "completion_length": 3403.4136962890625,
784
+ "epoch": 1.5594405594405596,
785
+ "grad_norm": 0.4623602032661438,
786
+ "kl": 1.232421875,
787
+ "learning_rate": 6.1535285706047075e-06,
788
+ "loss": 0.0493,
789
+ "reward": 0.7641369104385376,
790
+ "reward_std": 0.2736235670745373,
791
+ "rewards/accuracy_reward": 0.2767857164144516,
792
+ "rewards/format_reward": 0.0,
793
+ "rewards/tag_count_reward": 0.4873512014746666,
794
+ "step": 56
795
+ },
796
+ {
797
+ "completion_length": 3603.52392578125,
798
+ "epoch": 1.5874125874125875,
799
+ "grad_norm": 0.5890073776245117,
800
+ "kl": 1.009765625,
801
+ "learning_rate": 6.00901221787878e-06,
802
+ "loss": 0.0404,
803
+ "reward": 0.8504464477300644,
804
+ "reward_std": 0.328320749104023,
805
+ "rewards/accuracy_reward": 0.3898809552192688,
806
+ "rewards/format_reward": 0.0,
807
+ "rewards/tag_count_reward": 0.4605654925107956,
808
+ "step": 57
809
+ },
810
+ {
811
+ "completion_length": 3894.0327758789062,
812
+ "epoch": 1.6153846153846154,
813
+ "grad_norm": 1.1042261123657227,
814
+ "kl": 1.07421875,
815
+ "learning_rate": 5.863961981626321e-06,
816
+ "loss": 0.043,
817
+ "reward": 0.677827388048172,
818
+ "reward_std": 0.29943743720650673,
819
+ "rewards/accuracy_reward": 0.27678571082651615,
820
+ "rewards/format_reward": 0.0,
821
+ "rewards/tag_count_reward": 0.4010416716337204,
822
+ "step": 58
823
+ },
824
+ {
825
+ "completion_length": 3703.5418090820312,
826
+ "epoch": 1.6433566433566433,
827
+ "grad_norm": 0.8175861239433289,
828
+ "kl": 0.8017578125,
829
+ "learning_rate": 5.718529999515018e-06,
830
+ "loss": 0.032,
831
+ "reward": 0.7596726417541504,
832
+ "reward_std": 0.31634173914790154,
833
+ "rewards/accuracy_reward": 0.348214291036129,
834
+ "rewards/format_reward": 0.0,
835
+ "rewards/tag_count_reward": 0.4114583432674408,
836
+ "step": 59
837
+ },
838
+ {
839
+ "completion_length": 3456.7232666015625,
840
+ "epoch": 1.6713286713286712,
841
+ "grad_norm": 0.6864073872566223,
842
+ "kl": 0.88671875,
843
+ "learning_rate": 5.572868809611258e-06,
844
+ "loss": 0.0355,
845
+ "reward": 0.7306547909975052,
846
+ "reward_std": 0.3101131170988083,
847
+ "rewards/accuracy_reward": 0.3244047649204731,
848
+ "rewards/format_reward": 0.0,
849
+ "rewards/tag_count_reward": 0.4062500074505806,
850
+ "step": 60
851
+ },
852
+ {
853
+ "completion_length": 3348.4910888671875,
854
+ "epoch": 1.6993006993006992,
855
+ "grad_norm": 1.040555715560913,
856
+ "kl": 0.91796875,
857
+ "learning_rate": 5.427131190388743e-06,
858
+ "loss": 0.0367,
859
+ "reward": 0.8162202388048172,
860
+ "reward_std": 0.31024138629436493,
861
+ "rewards/accuracy_reward": 0.3720238134264946,
862
+ "rewards/format_reward": 0.0,
863
+ "rewards/tag_count_reward": 0.4441964402794838,
864
+ "step": 61
865
+ },
866
+ {
867
+ "completion_length": 3336.732177734375,
868
+ "epoch": 1.7272727272727273,
869
+ "grad_norm": 1.47223699092865,
870
+ "kl": 1.1650390625,
871
+ "learning_rate": 5.281470000484985e-06,
872
+ "loss": 0.0466,
873
+ "reward": 0.76488097012043,
874
+ "reward_std": 0.2751483619213104,
875
+ "rewards/accuracy_reward": 0.33630953542888165,
876
+ "rewards/format_reward": 0.0,
877
+ "rewards/tag_count_reward": 0.4285714328289032,
878
+ "step": 62
879
+ },
880
+ {
881
+ "completion_length": 2883.4136962890625,
882
+ "epoch": 1.7552447552447552,
883
+ "grad_norm": 12.537298202514648,
884
+ "kl": 0.833984375,
885
+ "learning_rate": 5.136038018373682e-06,
886
+ "loss": 0.0333,
887
+ "reward": 0.8363095223903656,
888
+ "reward_std": 0.20866946317255497,
889
+ "rewards/accuracy_reward": 0.3630952425301075,
890
+ "rewards/format_reward": 0.0,
891
+ "rewards/tag_count_reward": 0.4732142984867096,
892
+ "step": 63
893
+ },
894
+ {
895
+ "completion_length": 3351.5625610351562,
896
+ "epoch": 1.7832167832167833,
897
+ "grad_norm": 1.4591819047927856,
898
+ "kl": 38.779296875,
899
+ "learning_rate": 4.9909877821212215e-06,
900
+ "loss": 1.5543,
901
+ "reward": 0.6175595447421074,
902
+ "reward_std": 0.17579150572419167,
903
+ "rewards/accuracy_reward": 0.1666666679084301,
904
+ "rewards/format_reward": 0.0,
905
+ "rewards/tag_count_reward": 0.4508928656578064,
906
+ "step": 64
907
+ },
908
+ {
909
+ "completion_length": 3067.014892578125,
910
+ "epoch": 1.8111888111888113,
911
+ "grad_norm": 1026.2440185546875,
912
+ "kl": 1.087890625,
913
+ "learning_rate": 4.8464714293952956e-06,
914
+ "loss": 0.0436,
915
+ "reward": 0.8541666865348816,
916
+ "reward_std": 0.2841528169810772,
917
+ "rewards/accuracy_reward": 0.3779761977493763,
918
+ "rewards/format_reward": 0.0,
919
+ "rewards/tag_count_reward": 0.476190485060215,
920
+ "step": 65
921
+ },
922
+ {
923
+ "completion_length": 2978.4346313476562,
924
+ "epoch": 1.8391608391608392,
925
+ "grad_norm": 0.6207138299942017,
926
+ "kl": 1.2548828125,
927
+ "learning_rate": 4.702640537893498e-06,
928
+ "loss": 0.0502,
929
+ "reward": 0.7068452537059784,
930
+ "reward_std": 0.21755698695778847,
931
+ "rewards/accuracy_reward": 0.22916667396202683,
932
+ "rewards/format_reward": 0.0,
933
+ "rewards/tag_count_reward": 0.4776785746216774,
934
+ "step": 66
935
+ },
936
+ {
937
+ "completion_length": 2767.7798461914062,
938
+ "epoch": 1.867132867132867,
939
+ "grad_norm": 1.02067232131958,
940
+ "kl": 1.326171875,
941
+ "learning_rate": 4.559645966359263e-06,
942
+ "loss": 0.0531,
943
+ "reward": 0.7477678656578064,
944
+ "reward_std": 0.24407314509153366,
945
+ "rewards/accuracy_reward": 0.2678571455180645,
946
+ "rewards/format_reward": 0.0,
947
+ "rewards/tag_count_reward": 0.4799107238650322,
948
+ "step": 67
949
+ },
950
+ {
951
+ "completion_length": 2605.0506591796875,
952
+ "epoch": 1.895104895104895,
953
+ "grad_norm": 1.0706675052642822,
954
+ "kl": 1.21484375,
955
+ "learning_rate": 4.417637696351861e-06,
956
+ "loss": 0.0485,
957
+ "reward": 0.7604166865348816,
958
+ "reward_std": 0.16445039585232735,
959
+ "rewards/accuracy_reward": 0.2767857201397419,
960
+ "rewards/format_reward": 0.0,
961
+ "rewards/tag_count_reward": 0.4836309626698494,
962
+ "step": 68
963
+ },
964
+ {
965
+ "completion_length": 2840.5625610351562,
966
+ "epoch": 1.9230769230769231,
967
+ "grad_norm": 2.3417489528656006,
968
+ "kl": 1.453125,
969
+ "learning_rate": 4.2767646749364574e-06,
970
+ "loss": 0.0582,
971
+ "reward": 0.7202381193637848,
972
+ "reward_std": 0.2862440012395382,
973
+ "rewards/accuracy_reward": 0.2589285746216774,
974
+ "rewards/format_reward": 0.0,
975
+ "rewards/tag_count_reward": 0.4613095298409462,
976
+ "step": 69
977
+ },
978
+ {
979
+ "completion_length": 2714.1785888671875,
980
+ "epoch": 1.951048951048951,
981
+ "grad_norm": 1.3657090663909912,
982
+ "kl": 1.0849609375,
983
+ "learning_rate": 4.137174658459223e-06,
984
+ "loss": 0.0434,
985
+ "reward": 0.8623512089252472,
986
+ "reward_std": 0.26481272652745247,
987
+ "rewards/accuracy_reward": 0.3630952388048172,
988
+ "rewards/format_reward": 0.0,
989
+ "rewards/tag_count_reward": 0.4992559552192688,
990
+ "step": 70
991
+ },
992
+ {
993
+ "completion_length": 3436.5238647460938,
994
+ "epoch": 1.9790209790209792,
995
+ "grad_norm": 1.1092079877853394,
996
+ "kl": 1.720703125,
997
+ "learning_rate": 3.999014057571309e-06,
998
+ "loss": 0.069,
999
+ "reward": 0.6227678656578064,
1000
+ "reward_std": 0.25541481748223305,
1001
+ "rewards/accuracy_reward": 0.18750000558793545,
1002
+ "rewards/format_reward": 0.0,
1003
+ "rewards/tag_count_reward": 0.4352678582072258,
1004
+ "step": 71
1005
+ },
1006
+ {
1007
+ "completion_length": 4095.6446940104165,
1008
+ "epoch": 2.0,
1009
+ "grad_norm": 1.8023747205734253,
1010
+ "kl": 1.6927083333333333,
1011
+ "learning_rate": 3.862427783664306e-06,
1012
+ "loss": 0.0507,
1013
+ "reward": 0.4900793731212616,
1014
+ "reward_std": 0.1970900297164917,
1015
+ "rewards/accuracy_reward": 0.09920635198553403,
1016
+ "rewards/format_reward": 0.0,
1017
+ "rewards/tag_count_reward": 0.3908730248610179,
1018
+ "step": 72
1019
+ },
1020
+ {
1021
+ "completion_length": 3306.2947387695312,
1022
+ "epoch": 2.027972027972028,
1023
+ "grad_norm": 1.4027706384658813,
1024
+ "kl": 1.6396484375,
1025
+ "learning_rate": 3.7275590968782092e-06,
1026
+ "loss": 0.0656,
1027
+ "reward": 0.5691964253783226,
1028
+ "reward_std": 0.21935388445854187,
1029
+ "rewards/accuracy_reward": 0.11309524020180106,
1030
+ "rewards/format_reward": 0.0,
1031
+ "rewards/tag_count_reward": 0.4561011865735054,
1032
+ "step": 73
1033
+ },
1034
+ {
1035
+ "completion_length": 3458.6339721679688,
1036
+ "epoch": 2.055944055944056,
1037
+ "grad_norm": 0.5813905596733093,
1038
+ "kl": 1.58984375,
1039
+ "learning_rate": 3.5945494558412964e-06,
1040
+ "loss": 0.0637,
1041
+ "reward": 0.5885416716337204,
1042
+ "reward_std": 0.19907395914196968,
1043
+ "rewards/accuracy_reward": 0.1428571450524032,
1044
+ "rewards/format_reward": 0.0,
1045
+ "rewards/tag_count_reward": 0.4456845298409462,
1046
+ "step": 74
1047
+ },
1048
+ {
1049
+ "completion_length": 3205.52685546875,
1050
+ "epoch": 2.0839160839160837,
1051
+ "grad_norm": 0.4965643286705017,
1052
+ "kl": 1.177734375,
1053
+ "learning_rate": 3.463538369299576e-06,
1054
+ "loss": 0.0472,
1055
+ "reward": 0.6235119104385376,
1056
+ "reward_std": 0.23606937006115913,
1057
+ "rewards/accuracy_reward": 0.14285714365541935,
1058
+ "rewards/format_reward": 0.0,
1059
+ "rewards/tag_count_reward": 0.4806547686457634,
1060
+ "step": 75
1061
+ },
1062
+ {
1063
+ "completion_length": 3080.65771484375,
1064
+ "epoch": 2.111888111888112,
1065
+ "grad_norm": 5.0962443351745605,
1066
+ "kl": 1.013671875,
1067
+ "learning_rate": 3.334663249791378e-06,
1068
+ "loss": 0.0406,
1069
+ "reward": 0.5773809552192688,
1070
+ "reward_std": 0.19794748164713383,
1071
+ "rewards/accuracy_reward": 0.1220238134264946,
1072
+ "rewards/format_reward": 0.0,
1073
+ "rewards/tag_count_reward": 0.4553571492433548,
1074
+ "step": 76
1075
+ },
1076
+ {
1077
+ "completion_length": 3481.1190795898438,
1078
+ "epoch": 2.13986013986014,
1079
+ "grad_norm": 477.3974914550781,
1080
+ "kl": 9.28515625,
1081
+ "learning_rate": 3.208059269520568e-06,
1082
+ "loss": 0.3739,
1083
+ "reward": 0.5937500149011612,
1084
+ "reward_std": 0.18974663130939007,
1085
+ "rewards/accuracy_reward": 0.1815476231276989,
1086
+ "rewards/format_reward": 0.0,
1087
+ "rewards/tag_count_reward": 0.412202388048172,
1088
+ "step": 77
1089
+ },
1090
+ {
1091
+ "completion_length": 3660.8959350585938,
1092
+ "epoch": 2.167832167832168,
1093
+ "grad_norm": 1.8242417573928833,
1094
+ "kl": 0.9853515625,
1095
+ "learning_rate": 3.0838592185795733e-06,
1096
+ "loss": 0.0394,
1097
+ "reward": 0.537202388048172,
1098
+ "reward_std": 0.24809184670448303,
1099
+ "rewards/accuracy_reward": 0.12797619309276342,
1100
+ "rewards/format_reward": 0.0,
1101
+ "rewards/tag_count_reward": 0.409226194024086,
1102
+ "step": 78
1103
+ },
1104
+ {
1105
+ "completion_length": 3652.916748046875,
1106
+ "epoch": 2.195804195804196,
1107
+ "grad_norm": 2.0212318897247314,
1108
+ "kl": 1.2041015625,
1109
+ "learning_rate": 2.962193365670921e-06,
1110
+ "loss": 0.0482,
1111
+ "reward": 0.5483631044626236,
1112
+ "reward_std": 0.21292699500918388,
1113
+ "rewards/accuracy_reward": 0.14583333488553762,
1114
+ "rewards/format_reward": 0.0,
1115
+ "rewards/tag_count_reward": 0.402529776096344,
1116
+ "step": 79
1117
+ },
1118
+ {
1119
+ "completion_length": 3400.9554443359375,
1120
+ "epoch": 2.2237762237762237,
1121
+ "grad_norm": 2.087148427963257,
1122
+ "kl": 1.166015625,
1123
+ "learning_rate": 2.84318932147335e-06,
1124
+ "loss": 0.0467,
1125
+ "reward": 0.5111607238650322,
1126
+ "reward_std": 0.19660818576812744,
1127
+ "rewards/accuracy_reward": 0.08928571827709675,
1128
+ "rewards/format_reward": 0.0,
1129
+ "rewards/tag_count_reward": 0.4218750074505806,
1130
+ "step": 80
1131
+ },
1132
+ {
1133
+ "completion_length": 3498.4970703125,
1134
+ "epoch": 2.2517482517482517,
1135
+ "grad_norm": 1.5509693622589111,
1136
+ "kl": 1.2734375,
1137
+ "learning_rate": 2.726971904795827e-06,
1138
+ "loss": 0.0509,
1139
+ "reward": 0.5863095372915268,
1140
+ "reward_std": 0.2618163302540779,
1141
+ "rewards/accuracy_reward": 0.16964286286383867,
1142
+ "rewards/format_reward": 0.0,
1143
+ "rewards/tag_count_reward": 0.4166666716337204,
1144
+ "step": 81
1145
+ },
1146
+ {
1147
+ "completion_length": 3320.2977294921875,
1148
+ "epoch": 2.2797202797202796,
1149
+ "grad_norm": 1.4535397291183472,
1150
+ "kl": 1.3671875,
1151
+ "learning_rate": 2.6136630116598715e-06,
1152
+ "loss": 0.0547,
1153
+ "reward": 0.633928582072258,
1154
+ "reward_std": 0.2568051181733608,
1155
+ "rewards/accuracy_reward": 0.19345238618552685,
1156
+ "rewards/format_reward": 0.0,
1157
+ "rewards/tag_count_reward": 0.4404762014746666,
1158
+ "step": 82
1159
+ },
1160
+ {
1161
+ "completion_length": 3347.7708740234375,
1162
+ "epoch": 2.3076923076923075,
1163
+ "grad_norm": 1.1822701692581177,
1164
+ "kl": 1.642578125,
1165
+ "learning_rate": 2.503381487447436e-06,
1166
+ "loss": 0.0657,
1167
+ "reward": 0.603422611951828,
1168
+ "reward_std": 0.18616832420229912,
1169
+ "rewards/accuracy_reward": 0.16369047947227955,
1170
+ "rewards/format_reward": 0.0,
1171
+ "rewards/tag_count_reward": 0.4397321566939354,
1172
+ "step": 83
1173
+ },
1174
+ {
1175
+ "completion_length": 3295.8721313476562,
1176
+ "epoch": 2.335664335664336,
1177
+ "grad_norm": 0.6564051508903503,
1178
+ "kl": 1.583984375,
1179
+ "learning_rate": 2.396243002248531e-06,
1180
+ "loss": 0.0634,
1181
+ "reward": 0.5677083432674408,
1182
+ "reward_std": 0.21577723138034344,
1183
+ "rewards/accuracy_reward": 0.1220238097012043,
1184
+ "rewards/format_reward": 0.0,
1185
+ "rewards/tag_count_reward": 0.4456845372915268,
1186
+ "step": 84
1187
+ },
1188
+ {
1189
+ "completion_length": 3335.4048461914062,
1190
+ "epoch": 2.3636363636363638,
1191
+ "grad_norm": 0.6571016907691956,
1192
+ "kl": 1.974609375,
1193
+ "learning_rate": 2.29235992953927e-06,
1194
+ "loss": 0.0789,
1195
+ "reward": 0.6979166716337204,
1196
+ "reward_std": 0.298696992918849,
1197
+ "rewards/accuracy_reward": 0.279761902987957,
1198
+ "rewards/format_reward": 0.0,
1199
+ "rewards/tag_count_reward": 0.4181547686457634,
1200
+ "step": 85
1201
+ },
1202
+ {
1203
+ "completion_length": 3272.0983276367188,
1204
+ "epoch": 2.3916083916083917,
1205
+ "grad_norm": 0.7644421458244324,
1206
+ "kl": 1.79296875,
1207
+ "learning_rate": 2.1918412283175996e-06,
1208
+ "loss": 0.0716,
1209
+ "reward": 0.8958333432674408,
1210
+ "reward_std": 0.3616176173090935,
1211
+ "rewards/accuracy_reward": 0.4761904776096344,
1212
+ "rewards/format_reward": 0.0,
1213
+ "rewards/tag_count_reward": 0.4196428656578064,
1214
+ "step": 86
1215
+ },
1216
+ {
1217
+ "completion_length": 3789.7709350585938,
1218
+ "epoch": 2.4195804195804196,
1219
+ "grad_norm": 1.8745439052581787,
1220
+ "kl": 2.90625,
1221
+ "learning_rate": 2.0947923288203713e-06,
1222
+ "loss": 0.1163,
1223
+ "reward": 0.6034226417541504,
1224
+ "reward_std": 0.2801675796508789,
1225
+ "rewards/accuracy_reward": 0.23511904664337635,
1226
+ "rewards/format_reward": 0.0,
1227
+ "rewards/tag_count_reward": 0.368303582072258,
1228
+ "step": 87
1229
+ },
1230
+ {
1231
+ "completion_length": 3644.3690795898438,
1232
+ "epoch": 2.4475524475524475,
1233
+ "grad_norm": 1.7979298830032349,
1234
+ "kl": 1.87109375,
1235
+ "learning_rate": 2.0013150219415796e-06,
1236
+ "loss": 0.0749,
1237
+ "reward": 0.7566964477300644,
1238
+ "reward_std": 0.34903113171458244,
1239
+ "rewards/accuracy_reward": 0.345238097012043,
1240
+ "rewards/format_reward": 0.0,
1241
+ "rewards/tag_count_reward": 0.4114583432674408,
1242
+ "step": 88
1243
+ },
1244
+ {
1245
+ "completion_length": 3629.369140625,
1246
+ "epoch": 2.4755244755244754,
1247
+ "grad_norm": 0.8993220329284668,
1248
+ "kl": 2.6484375,
1249
+ "learning_rate": 1.9115073524677572e-06,
1250
+ "loss": 0.1058,
1251
+ "reward": 0.6696428805589676,
1252
+ "reward_std": 0.3975626900792122,
1253
+ "rewards/accuracy_reward": 0.2827381007373333,
1254
+ "rewards/format_reward": 0.0,
1255
+ "rewards/tag_count_reward": 0.386904776096344,
1256
+ "step": 89
1257
+ },
1258
+ {
1259
+ "completion_length": 3657.0298461914062,
1260
+ "epoch": 2.5034965034965033,
1261
+ "grad_norm": 2.3261046409606934,
1262
+ "kl": 2.380859375,
1263
+ "learning_rate": 1.8254635162425506e-06,
1264
+ "loss": 0.0952,
1265
+ "reward": 0.6808035746216774,
1266
+ "reward_std": 0.27695968747138977,
1267
+ "rewards/accuracy_reward": 0.27976190857589245,
1268
+ "rewards/format_reward": 0.0,
1269
+ "rewards/tag_count_reward": 0.4010416716337204,
1270
+ "step": 90
1271
+ },
1272
+ {
1273
+ "completion_length": 3351.8870239257812,
1274
+ "epoch": 2.5314685314685317,
1275
+ "grad_norm": 1.173328161239624,
1276
+ "kl": 1.958984375,
1277
+ "learning_rate": 1.743273761368281e-06,
1278
+ "loss": 0.0784,
1279
+ "reward": 0.8422619253396988,
1280
+ "reward_std": 0.3281675949692726,
1281
+ "rewards/accuracy_reward": 0.4255952462553978,
1282
+ "rewards/format_reward": 0.0,
1283
+ "rewards/tag_count_reward": 0.416666679084301,
1284
+ "step": 91
1285
+ },
1286
+ {
1287
+ "completion_length": 3638.806640625,
1288
+ "epoch": 2.5594405594405596,
1289
+ "grad_norm": 0.6548011898994446,
1290
+ "kl": 1.91015625,
1291
+ "learning_rate": 1.665024293548139e-06,
1292
+ "loss": 0.0763,
1293
+ "reward": 0.6852678656578064,
1294
+ "reward_std": 0.273727435618639,
1295
+ "rewards/accuracy_reward": 0.2886904776096344,
1296
+ "rewards/format_reward": 0.0,
1297
+ "rewards/tag_count_reward": 0.3965773954987526,
1298
+ "step": 92
1299
+ },
1300
+ {
1301
+ "completion_length": 3484.2113647460938,
1302
+ "epoch": 2.5874125874125875,
1303
+ "grad_norm": 0.9616138935089111,
1304
+ "kl": 1.80859375,
1305
+ "learning_rate": 1.5907971856683201e-06,
1306
+ "loss": 0.0722,
1307
+ "reward": 0.7834821343421936,
1308
+ "reward_std": 0.32932380586862564,
1309
+ "rewards/accuracy_reward": 0.3779762014746666,
1310
+ "rewards/format_reward": 0.0,
1311
+ "rewards/tag_count_reward": 0.4055059626698494,
1312
+ "step": 93
1313
+ },
1314
+ {
1315
+ "completion_length": 3699.3363037109375,
1316
+ "epoch": 2.6153846153846154,
1317
+ "grad_norm": 0.8441236615180969,
1318
+ "kl": 1.95703125,
1319
+ "learning_rate": 1.5206702917148948e-06,
1320
+ "loss": 0.0782,
1321
+ "reward": 0.751488097012043,
1322
+ "reward_std": 0.3020743727684021,
1323
+ "rewards/accuracy_reward": 0.35416667722165585,
1324
+ "rewards/format_reward": 0.0,
1325
+ "rewards/tag_count_reward": 0.3973214402794838,
1326
+ "step": 94
1327
+ },
1328
+ {
1329
+ "completion_length": 3259.8780517578125,
1330
+ "epoch": 2.6433566433566433,
1331
+ "grad_norm": 0.8067697882652283,
1332
+ "kl": 1.578125,
1333
+ "learning_rate": 1.4547171651157216e-06,
1334
+ "loss": 0.063,
1335
+ "reward": 0.8683035969734192,
1336
+ "reward_std": 0.3106134869158268,
1337
+ "rewards/accuracy_reward": 0.4494047686457634,
1338
+ "rewards/format_reward": 0.0,
1339
+ "rewards/tag_count_reward": 0.4188988134264946,
1340
+ "step": 95
1341
+ },
1342
+ {
1343
+ "completion_length": 3130.3958740234375,
1344
+ "epoch": 2.6713286713286712,
1345
+ "grad_norm": 0.9448930025100708,
1346
+ "kl": 2.51953125,
1347
+ "learning_rate": 1.3930069815930699e-06,
1348
+ "loss": 0.1008,
1349
+ "reward": 0.845238134264946,
1350
+ "reward_std": 0.3288194462656975,
1351
+ "rewards/accuracy_reward": 0.413690485060215,
1352
+ "rewards/format_reward": 0.0,
1353
+ "rewards/tag_count_reward": 0.4315476194024086,
1354
+ "step": 96
1355
+ },
1356
+ {
1357
+ "completion_length": 2959.2113037109375,
1358
+ "epoch": 2.699300699300699,
1359
+ "grad_norm": 86.35860443115234,
1360
+ "kl": 1.66796875,
1361
+ "learning_rate": 1.3356044666078316e-06,
1362
+ "loss": 0.0667,
1363
+ "reward": 0.9196428656578064,
1364
+ "reward_std": 0.31560714542865753,
1365
+ "rewards/accuracy_reward": 0.4702381044626236,
1366
+ "rewards/format_reward": 0.0,
1367
+ "rewards/tag_count_reward": 0.449404776096344,
1368
+ "step": 97
1369
+ },
1370
+ {
1371
+ "completion_length": 3174.482177734375,
1372
+ "epoch": 2.7272727272727275,
1373
+ "grad_norm": 1.08586847782135,
1374
+ "kl": 2.1953125,
1375
+ "learning_rate": 1.2825698274714542e-06,
1376
+ "loss": 0.0879,
1377
+ "reward": 0.7790178656578064,
1378
+ "reward_std": 0.2925742566585541,
1379
+ "rewards/accuracy_reward": 0.36011906154453754,
1380
+ "rewards/format_reward": 0.0,
1381
+ "rewards/tag_count_reward": 0.4188988134264946,
1382
+ "step": 98
1383
+ },
1384
+ {
1385
+ "completion_length": 2669.029815673828,
1386
+ "epoch": 2.755244755244755,
1387
+ "grad_norm": 1.5489476919174194,
1388
+ "kl": 1.087890625,
1389
+ "learning_rate": 1.2339586901967831e-06,
1390
+ "loss": 0.0435,
1391
+ "reward": 0.82738097012043,
1392
+ "reward_std": 0.1871817335486412,
1393
+ "rewards/accuracy_reward": 0.3750000037252903,
1394
+ "rewards/format_reward": 0.0,
1395
+ "rewards/tag_count_reward": 0.4523809552192688,
1396
+ "step": 99
1397
+ },
1398
+ {
1399
+ "completion_length": 3378.9107666015625,
1400
+ "epoch": 2.7832167832167833,
1401
+ "grad_norm": 2.6001334190368652,
1402
+ "kl": 1.6171875,
1403
+ "learning_rate": 1.1898220411540584e-06,
1404
+ "loss": 0.0646,
1405
+ "reward": 0.5625000074505806,
1406
+ "reward_std": 0.17016959004104137,
1407
+ "rewards/accuracy_reward": 0.15476190857589245,
1408
+ "rewards/format_reward": 0.0,
1409
+ "rewards/tag_count_reward": 0.4077381044626236,
1410
+ "step": 100
1411
+ },
1412
+ {
1413
+ "completion_length": 3032.5596313476562,
1414
+ "epoch": 2.8111888111888113,
1415
+ "grad_norm": 0.5906639695167542,
1416
+ "kl": 1.033203125,
1417
+ "learning_rate": 1.15020617359325e-06,
1418
+ "loss": 0.0413,
1419
+ "reward": 0.8377976268529892,
1420
+ "reward_std": 0.27019889280200005,
1421
+ "rewards/accuracy_reward": 0.3750000074505806,
1422
+ "rewards/format_reward": 0.0,
1423
+ "rewards/tag_count_reward": 0.4627976268529892,
1424
+ "step": 101
1425
+ },
1426
+ {
1427
+ "completion_length": 3194.2351684570312,
1428
+ "epoch": 2.839160839160839,
1429
+ "grad_norm": 2.6414685249328613,
1430
+ "kl": 1.2294921875,
1431
+ "learning_rate": 1.1151526390888332e-06,
1432
+ "loss": 0.0492,
1433
+ "reward": 0.6569940447807312,
1434
+ "reward_std": 0.21223976835608482,
1435
+ "rewards/accuracy_reward": 0.2321428603027016,
1436
+ "rewards/format_reward": 0.0,
1437
+ "rewards/tag_count_reward": 0.4248512089252472,
1438
+ "step": 102
1439
+ },
1440
+ {
1441
+ "completion_length": 3116.6101684570312,
1442
+ "epoch": 2.867132867132867,
1443
+ "grad_norm": 1.5984132289886475,
1444
+ "kl": 1.490234375,
1445
+ "learning_rate": 1.0846982039579245e-06,
1446
+ "loss": 0.0596,
1447
+ "reward": 0.7373512089252472,
1448
+ "reward_std": 0.23233365640044212,
1449
+ "rewards/accuracy_reward": 0.2886904813349247,
1450
+ "rewards/format_reward": 0.0,
1451
+ "rewards/tag_count_reward": 0.4486607238650322,
1452
+ "step": 103
1453
+ },
1454
+ {
1455
+ "completion_length": 2812.1041870117188,
1456
+ "epoch": 2.895104895104895,
1457
+ "grad_norm": 2.6789627075195312,
1458
+ "kl": 1.0205078125,
1459
+ "learning_rate": 1.0588748106974919e-06,
1460
+ "loss": 0.0408,
1461
+ "reward": 0.7633928805589676,
1462
+ "reward_std": 0.1795758679509163,
1463
+ "rewards/accuracy_reward": 0.3125000037252903,
1464
+ "rewards/format_reward": 0.0,
1465
+ "rewards/tag_count_reward": 0.450892873108387,
1466
+ "step": 104
1467
+ },
1468
+ {
1469
+ "completion_length": 3047.0625,
1470
+ "epoch": 2.9230769230769234,
1471
+ "grad_norm": 2.337068796157837,
1472
+ "kl": 1.31640625,
1473
+ "learning_rate": 1.0377095444810873e-06,
1474
+ "loss": 0.0526,
1475
+ "reward": 0.7068452537059784,
1476
+ "reward_std": 0.2549416534602642,
1477
+ "rewards/accuracy_reward": 0.2678571417927742,
1478
+ "rewards/format_reward": 0.0,
1479
+ "rewards/tag_count_reward": 0.4389881044626236,
1480
+ "step": 105
1481
+ },
1482
+ {
1483
+ "completion_length": 2715.509033203125,
1484
+ "epoch": 2.951048951048951,
1485
+ "grad_norm": 1.1716344356536865,
1486
+ "kl": 0.943359375,
1487
+ "learning_rate": 1.0212246047502374e-06,
1488
+ "loss": 0.0378,
1489
+ "reward": 0.8489583432674408,
1490
+ "reward_std": 0.2508317902684212,
1491
+ "rewards/accuracy_reward": 0.3690476343035698,
1492
+ "rewards/format_reward": 0.0,
1493
+ "rewards/tag_count_reward": 0.4799107238650322,
1494
+ "step": 106
1495
+ },
1496
+ {
1497
+ "completion_length": 3682.6786499023438,
1498
+ "epoch": 2.979020979020979,
1499
+ "grad_norm": 1.8174355030059814,
1500
+ "kl": 1.59375,
1501
+ "learning_rate": 1.0094372819302978e-06,
1502
+ "loss": 0.0637,
1503
+ "reward": 0.5141369104385376,
1504
+ "reward_std": 0.207677461206913,
1505
+ "rewards/accuracy_reward": 0.1398809556849301,
1506
+ "rewards/format_reward": 0.0,
1507
+ "rewards/tag_count_reward": 0.3742559552192688,
1508
+ "step": 107
1509
+ },
1510
+ {
1511
+ "completion_length": 4012.118408203125,
1512
+ "epoch": 3.0,
1513
+ "grad_norm": 1.1122511625289917,
1514
+ "kl": 1.6484375,
1515
+ "learning_rate": 1.002359939295183e-06,
1516
+ "loss": 0.0495,
1517
+ "reward": 0.4861111243565877,
1518
+ "reward_std": 0.16074346005916595,
1519
+ "rewards/accuracy_reward": 0.10714286069075267,
1520
+ "rewards/format_reward": 0.0,
1521
+ "rewards/tag_count_reward": 0.3789682586987813,
1522
+ "step": 108
1523
+ },
1524
+ {
1525
+ "epoch": 3.0,
1526
+ "step": 108,
1527
  "total_flos": 0.0,
1528
+ "train_loss": 0.2708671883675033,
1529
+ "train_runtime": 44954.8681,
1530
+ "train_samples_per_second": 0.067,
1531
  "train_steps_per_second": 0.002
1532
  }
1533
  ],
1534
  "logging_steps": 1,
1535
+ "max_steps": 108,
1536
  "num_input_tokens_seen": 0,
1537
+ "num_train_epochs": 3,
1538
  "save_steps": 500,
1539
  "stateful_callbacks": {
1540
  "TrainerControl": {