Blancy committed on
Commit
de56e62
·
verified ·
1 Parent(s): 3608d77

Model save

Browse files
Files changed (4) hide show
  1. README.md +2 -4
  2. all_results.json +4 -4
  3. train_results.json +4 -4
  4. trainer_state.json +287 -287
README.md CHANGED
@@ -1,11 +1,9 @@
1
  ---
2
  base_model: Qwen/Qwen3-1.7B
3
- datasets: Blancy/secondfiltered-math220k-difficulty_stratified_10k_filtered_only_medium_difficulty
4
  library_name: transformers
5
  model_name: Qwen3-1.7B-Open-R1-GRPO
6
  tags:
7
  - generated_from_trainer
8
- - open-r1
9
  - trl
10
  - grpo
11
  licence: license
@@ -13,7 +11,7 @@ licence: license
13
 
14
  # Model Card for Qwen3-1.7B-Open-R1-GRPO
15
 
16
- This model is a fine-tuned version of [Qwen/Qwen3-1.7B](https://huggingface.co/Qwen/Qwen3-1.7B) on the [Blancy/secondfiltered-math220k-difficulty_stratified_10k_filtered_only_medium_difficulty](https://huggingface.co/datasets/Blancy/secondfiltered-math220k-difficulty_stratified_10k_filtered_only_medium_difficulty) dataset.
17
  It has been trained using [TRL](https://github.com/huggingface/trl).
18
 
19
  ## Quick start
@@ -29,7 +27,7 @@ print(output["generated_text"])
29
 
30
  ## Training procedure
31
 
32
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/224015062-chinese-university-of-hong-kong-shenzhen/huggingface/runs/wcdxh0jk)
33
 
34
 
35
  This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
 
1
  ---
2
  base_model: Qwen/Qwen3-1.7B
 
3
  library_name: transformers
4
  model_name: Qwen3-1.7B-Open-R1-GRPO
5
  tags:
6
  - generated_from_trainer
 
7
  - trl
8
  - grpo
9
  licence: license
 
11
 
12
  # Model Card for Qwen3-1.7B-Open-R1-GRPO
13
 
14
+ This model is a fine-tuned version of [Qwen/Qwen3-1.7B](https://huggingface.co/Qwen/Qwen3-1.7B).
15
  It has been trained using [TRL](https://github.com/huggingface/trl).
16
 
17
  ## Quick start
 
27
 
28
  ## Training procedure
29
 
30
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/224015062-chinese-university-of-hong-kong-shenzhen/huggingface/runs/iqlnk5h1)
31
 
32
 
33
  This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.8157487940822636,
4
- "train_runtime": 11133.5212,
5
  "train_samples": 1000,
6
- "train_samples_per_second": 0.09,
7
- "train_steps_per_second": 0.003
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 0.005675484693879993,
4
+ "train_runtime": 16137.0184,
5
  "train_samples": 1000,
6
+ "train_samples_per_second": 0.062,
7
+ "train_steps_per_second": 0.002
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.8157487940822636,
4
- "train_runtime": 11133.5212,
5
  "train_samples": 1000,
6
- "train_samples_per_second": 0.09,
7
- "train_steps_per_second": 0.003
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 0.005675484693879993,
4
+ "train_runtime": 16137.0184,
5
  "train_samples": 1000,
6
+ "train_samples_per_second": 0.062,
7
+ "train_steps_per_second": 0.002
8
  }
trainer_state.json CHANGED
@@ -10,517 +10,517 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "completion_length": 3740.9702758789062,
14
  "epoch": 0.027972027972027972,
15
- "grad_norm": 2.5353879928588867,
16
  "kl": 0.0,
17
  "learning_rate": 0.0,
18
  "loss": 0.0,
19
- "reward": 0.5535714328289032,
20
- "reward_std": 0.23917681723833084,
21
- "rewards/accuracy_reward": 0.11011904943734407,
22
  "rewards/format_reward": 0.0,
23
- "rewards/tag_count_reward": 0.443452388048172,
24
  "step": 1
25
  },
26
  {
27
- "completion_length": 3610.02685546875,
28
  "epoch": 0.055944055944055944,
29
- "grad_norm": 1.1924161911010742,
30
  "kl": 0.0,
31
  "learning_rate": 2.5e-06,
32
  "loss": 0.0,
33
- "reward": 0.6622024029493332,
34
- "reward_std": 0.2192090731114149,
35
- "rewards/accuracy_reward": 0.2113095261156559,
36
  "rewards/format_reward": 0.0,
37
- "rewards/tag_count_reward": 0.450892873108387,
38
  "step": 2
39
  },
40
  {
41
- "completion_length": 3514.4406127929688,
42
  "epoch": 0.08391608391608392,
43
- "grad_norm": 1.2518582344055176,
44
- "kl": 0.001033782958984375,
45
  "learning_rate": 5e-06,
46
  "loss": 0.0,
47
- "reward": 0.5863095298409462,
48
- "reward_std": 0.2138436622917652,
49
- "rewards/accuracy_reward": 0.13690476515330374,
50
  "rewards/format_reward": 0.0,
51
- "rewards/tag_count_reward": 0.4494047611951828,
52
  "step": 3
53
  },
54
  {
55
- "completion_length": 3706.636962890625,
56
  "epoch": 0.11188811188811189,
57
- "grad_norm": 1.1967724561691284,
58
- "kl": 0.004276275634765625,
59
  "learning_rate": 7.500000000000001e-06,
60
- "loss": 0.0002,
61
- "reward": 0.6011904999613762,
62
- "reward_std": 0.1717666843906045,
63
- "rewards/accuracy_reward": 0.1726190522313118,
64
  "rewards/format_reward": 0.0,
65
- "rewards/tag_count_reward": 0.4285714402794838,
66
  "step": 4
67
  },
68
  {
69
- "completion_length": 3468.27978515625,
70
  "epoch": 0.13986013986013987,
71
- "grad_norm": 1.1130006313323975,
72
- "kl": 0.02197265625,
73
  "learning_rate": 1e-05,
74
- "loss": 0.0009,
75
- "reward": 0.6629464626312256,
76
- "reward_std": 0.26501624286174774,
77
- "rewards/accuracy_reward": 0.2380952388048172,
78
  "rewards/format_reward": 0.0,
79
- "rewards/tag_count_reward": 0.4248512014746666,
80
  "step": 5
81
  },
82
  {
83
- "completion_length": 3647.2382202148438,
84
  "epoch": 0.16783216783216784,
85
- "grad_norm": 0.6197645664215088,
86
- "kl": 0.04852294921875,
87
  "learning_rate": 9.978331270024887e-06,
88
- "loss": 0.0019,
89
- "reward": 0.5558035895228386,
90
- "reward_std": 0.19512668810784817,
91
- "rewards/accuracy_reward": 0.1011904813349247,
92
  "rewards/format_reward": 0.0,
93
- "rewards/tag_count_reward": 0.4546131044626236,
94
  "step": 6
95
  },
96
  {
97
- "completion_length": 3054.0327758789062,
98
  "epoch": 0.1958041958041958,
99
- "grad_norm": 0.3058634102344513,
100
- "kl": 0.1123046875,
101
  "learning_rate": 9.913533761814537e-06,
102
- "loss": 0.0045,
103
- "reward": 0.9293155074119568,
104
- "reward_std": 0.32718629390001297,
105
- "rewards/accuracy_reward": 0.2589285708963871,
106
  "rewards/format_reward": 0.0,
107
- "rewards/tag_count_reward": 0.6703869253396988,
108
  "step": 7
109
  },
110
  {
111
- "completion_length": 3840.2678833007812,
112
  "epoch": 0.22377622377622378,
113
- "grad_norm": 0.20688477158546448,
114
- "kl": 0.1109619140625,
115
  "learning_rate": 9.80623151079494e-06,
116
- "loss": 0.0044,
117
- "reward": 0.9114583432674408,
118
- "reward_std": 0.45086005330085754,
119
- "rewards/accuracy_reward": 0.4523809552192688,
120
  "rewards/format_reward": 0.0,
121
- "rewards/tag_count_reward": 0.459077388048172,
122
  "step": 8
123
  },
124
  {
125
- "completion_length": 3944.8988647460938,
126
  "epoch": 0.2517482517482518,
127
- "grad_norm": 0.22741489112377167,
128
- "kl": 0.127685546875,
129
  "learning_rate": 9.65745789630079e-06,
130
- "loss": 0.0051,
131
- "reward": 0.7061012089252472,
132
- "reward_std": 0.36991604790091515,
133
- "rewards/accuracy_reward": 0.3005952350795269,
134
  "rewards/format_reward": 0.0,
135
- "rewards/tag_count_reward": 0.4055059552192688,
136
  "step": 9
137
  },
138
  {
139
- "completion_length": 3999.8125610351562,
140
  "epoch": 0.27972027972027974,
141
- "grad_norm": 0.15535762906074524,
142
- "kl": 0.155029296875,
143
  "learning_rate": 9.468645689567599e-06,
144
- "loss": 0.0062,
145
- "reward": 0.836309552192688,
146
- "reward_std": 0.403655294328928,
147
- "rewards/accuracy_reward": 0.4345238208770752,
148
  "rewards/format_reward": 0.0,
149
- "rewards/tag_count_reward": 0.4017857164144516,
150
  "step": 10
151
  },
152
  {
153
- "completion_length": 4248.943420410156,
154
  "epoch": 0.3076923076923077,
155
- "grad_norm": 0.16839423775672913,
156
- "kl": 0.216064453125,
157
  "learning_rate": 9.241613255361455e-06,
158
- "loss": 0.0087,
159
- "reward": 0.5141369104385376,
160
- "reward_std": 0.3293060325086117,
161
- "rewards/accuracy_reward": 0.19940476678311825,
162
  "rewards/format_reward": 0.0,
163
- "rewards/tag_count_reward": 0.3147321417927742,
164
  "step": 11
165
  },
166
  {
167
- "completion_length": 4181.681579589844,
168
  "epoch": 0.3356643356643357,
169
- "grad_norm": 0.21270643174648285,
170
- "kl": 0.28564453125,
171
  "learning_rate": 8.978547040132317e-06,
172
- "loss": 0.0114,
173
- "reward": 0.560267873108387,
174
- "reward_std": 0.32143769413232803,
175
- "rewards/accuracy_reward": 0.22619048319756985,
176
  "rewards/format_reward": 0.0,
177
- "rewards/tag_count_reward": 0.334077388048172,
178
  "step": 12
179
  },
180
  {
181
- "completion_length": 4254.375,
182
  "epoch": 0.36363636363636365,
183
- "grad_norm": 0.20891696214675903,
184
- "kl": 0.341796875,
185
  "learning_rate": 8.681980515339464e-06,
186
- "loss": 0.0137,
187
- "reward": 0.5074404925107956,
188
- "reward_std": 0.3785836473107338,
189
- "rewards/accuracy_reward": 0.1994047649204731,
190
  "rewards/format_reward": 0.0,
191
- "rewards/tag_count_reward": 0.3080357238650322,
192
  "step": 13
193
  },
194
  {
195
- "completion_length": 2929.9107666015625,
196
  "epoch": 0.3916083916083916,
197
- "grad_norm": 7.643617153167725,
198
- "kl": 0.830078125,
199
  "learning_rate": 8.354769778736407e-06,
200
- "loss": 0.0332,
201
- "reward": 0.758928582072258,
202
- "reward_std": 0.3943299725651741,
203
- "rewards/accuracy_reward": 0.2708333395421505,
204
  "rewards/format_reward": 0.0,
205
- "rewards/tag_count_reward": 0.4880952462553978,
206
  "step": 14
207
  },
208
  {
209
- "completion_length": 1188.7470397949219,
210
  "epoch": 0.4195804195804196,
211
- "grad_norm": 28577.451171875,
212
- "kl": 512.5,
213
  "learning_rate": 8.00006604858821e-06,
214
- "loss": 20.5117,
215
- "reward": 0.9099702686071396,
216
- "reward_std": 0.3528708219528198,
217
- "rewards/accuracy_reward": 0.23214286379516125,
218
  "rewards/format_reward": 0.0,
219
- "rewards/tag_count_reward": 0.677827388048172,
220
  "step": 15
221
  },
222
  {
223
- "completion_length": 1025.5803833007812,
224
  "epoch": 0.44755244755244755,
225
- "grad_norm": 7.379767894744873,
226
- "kl": 2.345703125,
227
  "learning_rate": 7.621285315716991e-06,
228
- "loss": 0.0937,
229
- "reward": 0.8757440596818924,
230
- "reward_std": 0.3126339688897133,
231
- "rewards/accuracy_reward": 0.16666666883975267,
232
  "rewards/format_reward": 0.0,
233
- "rewards/tag_count_reward": 0.709077388048172,
234
  "step": 16
235
  },
236
  {
237
- "completion_length": 1385.9315795898438,
238
  "epoch": 0.4755244755244755,
239
- "grad_norm": 1.5446727275848389,
240
- "kl": 0.7958984375,
241
  "learning_rate": 7.222075445642904e-06,
242
- "loss": 0.0318,
243
- "reward": 0.8377976417541504,
244
- "reward_std": 0.4007321819663048,
245
- "rewards/accuracy_reward": 0.2648809626698494,
246
  "rewards/format_reward": 0.0,
247
- "rewards/tag_count_reward": 0.5729166641831398,
248
  "step": 17
249
  },
250
  {
251
- "completion_length": 1099.3571472167969,
252
  "epoch": 0.5034965034965035,
253
- "grad_norm": 2.3834547996520996,
254
- "kl": 1.29296875,
255
  "learning_rate": 6.80628104764508e-06,
256
- "loss": 0.0518,
257
- "reward": 0.7760416716337204,
258
- "reward_std": 0.37692204862833023,
259
- "rewards/accuracy_reward": 0.16666666977107525,
260
  "rewards/format_reward": 0.0,
261
- "rewards/tag_count_reward": 0.6093750298023224,
262
  "step": 18
263
  },
264
  {
265
- "completion_length": 705.9643096923828,
266
  "epoch": 0.5314685314685315,
267
- "grad_norm": 1.268905758857727,
268
- "kl": 2.6015625,
269
  "learning_rate": 6.377906449072578e-06,
270
- "loss": 0.104,
271
- "reward": 1.0922618806362152,
272
- "reward_std": 0.2668025754392147,
273
- "rewards/accuracy_reward": 0.19940476678311825,
274
  "rewards/format_reward": 0.0,
275
- "rewards/tag_count_reward": 0.8928571492433548,
276
  "step": 19
277
  },
278
  {
279
- "completion_length": 626.8452758789062,
280
  "epoch": 0.5594405594405595,
281
- "grad_norm": 1.6185439825057983,
282
- "kl": 2.73828125,
283
  "learning_rate": 5.9410771314830255e-06,
284
- "loss": 0.1096,
285
- "reward": 1.064732164144516,
286
- "reward_std": 0.2738974019885063,
287
- "rewards/accuracy_reward": 0.1607142873108387,
288
  "rewards/format_reward": 0.0,
289
- "rewards/tag_count_reward": 0.9040178805589676,
290
  "step": 20
291
  },
292
  {
293
- "completion_length": 654.1041717529297,
294
  "epoch": 0.5874125874125874,
295
- "grad_norm": 0.9367347359657288,
296
- "kl": 2.76171875,
297
  "learning_rate": 5.500000000000001e-06,
298
- "loss": 0.1106,
299
- "reward": 0.9836309850215912,
300
- "reward_std": 0.2889478802680969,
301
- "rewards/accuracy_reward": 0.08630952518433332,
302
  "rewards/format_reward": 0.0,
303
- "rewards/tag_count_reward": 0.8973214328289032,
304
  "step": 21
305
  },
306
  {
307
- "completion_length": 613.5803680419922,
308
  "epoch": 0.6153846153846154,
309
- "grad_norm": 2.1199615001678467,
310
- "kl": 2.8359375,
311
  "learning_rate": 5.0589228685169776e-06,
312
- "loss": 0.1135,
313
- "reward": 1.02976194024086,
314
- "reward_std": 0.263757660984993,
315
- "rewards/accuracy_reward": 0.1220238134264946,
316
  "rewards/format_reward": 0.0,
317
- "rewards/tag_count_reward": 0.9077381044626236,
318
  "step": 22
319
  },
320
  {
321
- "completion_length": 627.8363342285156,
322
  "epoch": 0.6433566433566433,
323
- "grad_norm": 0.6018344163894653,
324
- "kl": 2.61328125,
325
  "learning_rate": 4.622093550927423e-06,
326
- "loss": 0.1043,
327
- "reward": 1.0096726268529892,
328
- "reward_std": 0.23419346287846565,
329
- "rewards/accuracy_reward": 0.10416666883975267,
330
  "rewards/format_reward": 0.0,
331
- "rewards/tag_count_reward": 0.90550597012043,
332
  "step": 23
333
  },
334
  {
335
- "completion_length": 550.7351455688477,
336
  "epoch": 0.6713286713286714,
337
- "grad_norm": 1.0858573913574219,
338
- "kl": 2.90234375,
339
  "learning_rate": 4.193718952354921e-06,
340
- "loss": 0.116,
341
- "reward": 1.1629464626312256,
342
- "reward_std": 0.2896396704018116,
343
- "rewards/accuracy_reward": 0.2321428619325161,
344
  "rewards/format_reward": 0.0,
345
- "rewards/tag_count_reward": 0.930803582072258,
346
  "step": 24
347
  },
348
  {
349
- "completion_length": 529.0982208251953,
350
  "epoch": 0.6993006993006993,
351
- "grad_norm": 1.6778327226638794,
352
- "kl": 3.05859375,
353
  "learning_rate": 3.777924554357096e-06,
354
- "loss": 0.1223,
355
- "reward": 1.1614583432674408,
356
- "reward_std": 0.28430329635739326,
357
- "rewards/accuracy_reward": 0.2291666679084301,
358
  "rewards/format_reward": 0.0,
359
- "rewards/tag_count_reward": 0.9322916716337204,
360
  "step": 25
361
  },
362
  {
363
- "completion_length": 686.3035736083984,
364
  "epoch": 0.7272727272727273,
365
- "grad_norm": 1.1516579389572144,
366
- "kl": 1.87890625,
367
  "learning_rate": 3.378714684283011e-06,
368
- "loss": 0.075,
369
- "reward": 1.0297619253396988,
370
- "reward_std": 0.27740226313471794,
371
- "rewards/accuracy_reward": 0.1279761923942715,
372
  "rewards/format_reward": 0.0,
373
- "rewards/tag_count_reward": 0.9017857313156128,
374
  "step": 26
375
  },
376
  {
377
- "completion_length": 625.6250305175781,
378
  "epoch": 0.7552447552447552,
379
- "grad_norm": 1.0151987075805664,
380
- "kl": 2.1953125,
381
  "learning_rate": 2.9999339514117913e-06,
382
- "loss": 0.0878,
383
- "reward": 0.995535746216774,
384
- "reward_std": 0.2647087723016739,
385
- "rewards/accuracy_reward": 0.0922619067132473,
386
  "rewards/format_reward": 0.0,
387
- "rewards/tag_count_reward": 0.9032738208770752,
388
  "step": 27
389
  },
390
  {
391
- "completion_length": 743.3601226806641,
392
  "epoch": 0.7832167832167832,
393
- "grad_norm": 1.265594720840454,
394
- "kl": 2.052734375,
395
  "learning_rate": 2.645230221263596e-06,
396
- "loss": 0.0821,
397
- "reward": 0.9337797909975052,
398
- "reward_std": 0.2643532156944275,
399
- "rewards/accuracy_reward": 0.041666666977107525,
400
  "rewards/format_reward": 0.0,
401
- "rewards/tag_count_reward": 0.8921131044626236,
402
  "step": 28
403
  },
404
  {
405
- "completion_length": 704.5833587646484,
406
  "epoch": 0.8111888111888111,
407
- "grad_norm": 1.292297601699829,
408
- "kl": 1.8984375,
409
  "learning_rate": 2.3180194846605367e-06,
410
- "loss": 0.0758,
411
- "reward": 0.9635417014360428,
412
- "reward_std": 0.2751614972949028,
413
- "rewards/accuracy_reward": 0.06845238362438977,
414
  "rewards/format_reward": 0.0,
415
- "rewards/tag_count_reward": 0.8950892835855484,
416
  "step": 29
417
  },
418
  {
419
- "completion_length": 726.8244323730469,
420
  "epoch": 0.8391608391608392,
421
- "grad_norm": 1.4452919960021973,
422
- "kl": 1.96875,
423
  "learning_rate": 2.021452959867684e-06,
424
- "loss": 0.0786,
425
- "reward": 0.9531250149011612,
426
- "reward_std": 0.27541816234588623,
427
- "rewards/accuracy_reward": 0.05357143096625805,
428
  "rewards/format_reward": 0.0,
429
- "rewards/tag_count_reward": 0.899553582072258,
430
  "step": 30
431
  },
432
  {
433
- "completion_length": 729.4553527832031,
434
  "epoch": 0.8671328671328671,
435
- "grad_norm": 1.028757929801941,
436
- "kl": 2.0859375,
437
  "learning_rate": 1.7583867446385461e-06,
438
- "loss": 0.0835,
439
- "reward": 0.9769345372915268,
440
- "reward_std": 0.2937622144818306,
441
- "rewards/accuracy_reward": 0.08333333488553762,
442
  "rewards/format_reward": 0.0,
443
- "rewards/tag_count_reward": 0.8936012089252472,
444
  "step": 31
445
  },
446
  {
447
- "completion_length": 755.1101379394531,
448
  "epoch": 0.8951048951048951,
449
- "grad_norm": 1.4110631942749023,
450
- "kl": 2.0,
451
  "learning_rate": 1.531354310432403e-06,
452
- "loss": 0.08,
453
- "reward": 0.9494047909975052,
454
- "reward_std": 0.2776322588324547,
455
- "rewards/accuracy_reward": 0.05357142956927419,
456
  "rewards/format_reward": 0.0,
457
- "rewards/tag_count_reward": 0.8958333432674408,
458
  "step": 32
459
  },
460
  {
461
- "completion_length": 725.3601226806641,
462
  "epoch": 0.9230769230769231,
463
- "grad_norm": 14314.6650390625,
464
- "kl": 174.45703125,
465
  "learning_rate": 1.3425421036992098e-06,
466
- "loss": 6.9931,
467
- "reward": 0.9568452686071396,
468
- "reward_std": 0.27130044624209404,
469
- "rewards/accuracy_reward": 0.059523812029510736,
470
  "rewards/format_reward": 0.0,
471
- "rewards/tag_count_reward": 0.8973214477300644,
472
  "step": 33
473
  },
474
  {
475
- "completion_length": 645.8035736083984,
476
  "epoch": 0.951048951048951,
477
- "grad_norm": 4.487875938415527,
478
- "kl": 2.46484375,
479
  "learning_rate": 1.1937684892050606e-06,
480
- "loss": 0.0984,
481
- "reward": 0.9687500149011612,
482
- "reward_std": 0.23901110514998436,
483
- "rewards/accuracy_reward": 0.06845238292589784,
484
  "rewards/format_reward": 0.0,
485
- "rewards/tag_count_reward": 0.9002976417541504,
486
  "step": 34
487
  },
488
  {
489
- "completion_length": 708.7559509277344,
490
  "epoch": 0.9790209790209791,
491
- "grad_norm": 1.2720866203308105,
492
- "kl": 2.10546875,
493
  "learning_rate": 1.0864662381854632e-06,
494
- "loss": 0.0842,
495
- "reward": 0.9687500298023224,
496
- "reward_std": 0.28124209865927696,
497
- "rewards/accuracy_reward": 0.06547619262710214,
498
  "rewards/format_reward": 0.0,
499
- "rewards/tag_count_reward": 0.9032738208770752,
500
  "step": 35
501
  },
502
  {
503
- "completion_length": 167.14473724365234,
504
  "epoch": 1.0,
505
- "grad_norm": 1.2720866203308105,
506
- "kl": 2.2916666666666665,
507
  "learning_rate": 1.0216687299751146e-06,
508
- "loss": 0.0688,
509
- "reward": 0.9960317611694336,
510
- "reward_std": 0.026725949719548225,
511
- "rewards/accuracy_reward": 0.003968254042168458,
512
  "rewards/format_reward": 0.0,
513
- "rewards/tag_count_reward": 0.9920635024706522,
514
  "step": 36
515
  },
516
  {
517
  "epoch": 1.0,
518
  "step": 36,
519
  "total_flos": 0.0,
520
- "train_loss": 0.8157487940822636,
521
- "train_runtime": 11133.5212,
522
- "train_samples_per_second": 0.09,
523
- "train_steps_per_second": 0.003
524
  }
525
  ],
526
  "logging_steps": 1,
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "completion_length": 3512.2977294921875,
14
  "epoch": 0.027972027972027972,
15
+ "grad_norm": 0.8604350090026855,
16
  "kl": 0.0,
17
  "learning_rate": 0.0,
18
  "loss": 0.0,
19
+ "reward": 0.4687500149011612,
20
+ "reward_std": 0.12072492484003305,
21
+ "rewards/accuracy_reward": 0.08333333441987634,
22
  "rewards/format_reward": 0.0,
23
+ "rewards/tag_count_reward": 0.3854166716337204,
24
  "step": 1
25
  },
26
  {
27
+ "completion_length": 3691.6666870117188,
28
  "epoch": 0.055944055944055944,
29
+ "grad_norm": 0.3791728615760803,
30
  "kl": 0.0,
31
  "learning_rate": 2.5e-06,
32
  "loss": 0.0,
33
+ "reward": 0.4680059477686882,
34
+ "reward_std": 0.1798743773251772,
35
+ "rewards/accuracy_reward": 0.09226190531626344,
36
  "rewards/format_reward": 0.0,
37
+ "rewards/tag_count_reward": 0.3757440596818924,
38
  "step": 2
39
  },
40
  {
41
+ "completion_length": 3429.5089721679688,
42
  "epoch": 0.08391608391608392,
43
+ "grad_norm": 0.5100764036178589,
44
+ "kl": 0.0008497238159179688,
45
  "learning_rate": 5e-06,
46
  "loss": 0.0,
47
+ "reward": 0.5654762014746666,
48
+ "reward_std": 0.14018218219280243,
49
+ "rewards/accuracy_reward": 0.1428571455180645,
50
  "rewards/format_reward": 0.0,
51
+ "rewards/tag_count_reward": 0.4226190596818924,
52
  "step": 3
53
  },
54
  {
55
+ "completion_length": 3350.5059814453125,
56
  "epoch": 0.11188811188811189,
57
+ "grad_norm": 0.5230619311332703,
58
+ "kl": 0.001438140869140625,
59
  "learning_rate": 7.500000000000001e-06,
60
+ "loss": 0.0001,
61
+ "reward": 0.5230654776096344,
62
+ "reward_std": 0.1375708756968379,
63
+ "rewards/accuracy_reward": 0.11011904943734407,
64
  "rewards/format_reward": 0.0,
65
+ "rewards/tag_count_reward": 0.4129464328289032,
66
  "step": 4
67
  },
68
  {
69
+ "completion_length": 3646.4851684570312,
70
  "epoch": 0.13986013986013987,
71
+ "grad_norm": 0.8575068712234497,
72
+ "kl": 0.00823211669921875,
73
  "learning_rate": 1e-05,
74
+ "loss": 0.0003,
75
+ "reward": 0.5424107164144516,
76
+ "reward_std": 0.20411022752523422,
77
+ "rewards/accuracy_reward": 0.12500000279396772,
78
  "rewards/format_reward": 0.0,
79
+ "rewards/tag_count_reward": 0.4174107238650322,
80
  "step": 5
81
  },
82
  {
83
+ "completion_length": 3599.0298461914062,
84
  "epoch": 0.16783216783216784,
85
+ "grad_norm": 38.98723602294922,
86
+ "kl": 0.415771484375,
87
  "learning_rate": 9.978331270024887e-06,
88
+ "loss": 0.0166,
89
+ "reward": 0.5632440447807312,
90
+ "reward_std": 0.2680988386273384,
91
+ "rewards/accuracy_reward": 0.06547619216144085,
92
  "rewards/format_reward": 0.0,
93
+ "rewards/tag_count_reward": 0.497767873108387,
94
  "step": 6
95
  },
96
  {
97
+ "completion_length": 3781.4287109375,
98
  "epoch": 0.1958041958041958,
99
+ "grad_norm": 0.22663027048110962,
100
+ "kl": 0.03509521484375,
101
  "learning_rate": 9.913533761814537e-06,
102
+ "loss": 0.0014,
103
+ "reward": 0.529017873108387,
104
+ "reward_std": 0.16097233816981316,
105
+ "rewards/accuracy_reward": 0.1458333358168602,
106
  "rewards/format_reward": 0.0,
107
+ "rewards/tag_count_reward": 0.3831845298409462,
108
  "step": 7
109
  },
110
  {
111
+ "completion_length": 3490.544677734375,
112
  "epoch": 0.22377622377622378,
113
+ "grad_norm": 0.6074256300926208,
114
+ "kl": 0.0587158203125,
115
  "learning_rate": 9.80623151079494e-06,
116
+ "loss": 0.0023,
117
+ "reward": 0.5052083507180214,
118
+ "reward_std": 0.21143596433103085,
119
+ "rewards/accuracy_reward": 0.10416666883975267,
120
  "rewards/format_reward": 0.0,
121
+ "rewards/tag_count_reward": 0.4010416716337204,
122
  "step": 8
123
  },
124
  {
125
+ "completion_length": 3470.9791870117188,
126
  "epoch": 0.2517482517482518,
127
+ "grad_norm": 0.22321581840515137,
128
+ "kl": 0.058837890625,
129
  "learning_rate": 9.65745789630079e-06,
130
+ "loss": 0.0024,
131
+ "reward": 0.6011904776096344,
132
+ "reward_std": 0.23324432224035263,
133
+ "rewards/accuracy_reward": 0.19940476678311825,
134
  "rewards/format_reward": 0.0,
135
+ "rewards/tag_count_reward": 0.4017857238650322,
136
  "step": 9
137
  },
138
  {
139
+ "completion_length": 3337.8363647460938,
140
  "epoch": 0.27972027972027974,
141
+ "grad_norm": 0.23314569890499115,
142
+ "kl": 0.059814453125,
143
  "learning_rate": 9.468645689567599e-06,
144
+ "loss": 0.0024,
145
+ "reward": 0.6495535969734192,
146
+ "reward_std": 0.19363375008106232,
147
+ "rewards/accuracy_reward": 0.2291666679084301,
148
  "rewards/format_reward": 0.0,
149
+ "rewards/tag_count_reward": 0.4203869178891182,
150
  "step": 10
151
  },
152
  {
153
+ "completion_length": 3455.556640625,
154
  "epoch": 0.3076923076923077,
155
+ "grad_norm": 0.16831204295158386,
156
+ "kl": 0.06622314453125,
157
  "learning_rate": 9.241613255361455e-06,
158
+ "loss": 0.0027,
159
+ "reward": 0.5208333432674408,
160
+ "reward_std": 0.12359252013266087,
161
+ "rewards/accuracy_reward": 0.13392857275903225,
162
  "rewards/format_reward": 0.0,
163
+ "rewards/tag_count_reward": 0.3869047686457634,
164
  "step": 11
165
  },
166
  {
167
+ "completion_length": 3545.419677734375,
168
  "epoch": 0.3356643356643357,
169
+ "grad_norm": 0.20184171199798584,
170
+ "kl": 0.07110595703125,
171
  "learning_rate": 8.978547040132317e-06,
172
+ "loss": 0.0028,
173
+ "reward": 0.4813988134264946,
174
+ "reward_std": 0.15685281716287136,
175
+ "rewards/accuracy_reward": 0.10416667070239782,
176
  "rewards/format_reward": 0.0,
177
+ "rewards/tag_count_reward": 0.3772321492433548,
178
  "step": 12
179
  },
180
  {
181
+ "completion_length": 3602.4286499023438,
182
  "epoch": 0.36363636363636365,
183
+ "grad_norm": 0.6997456550598145,
184
+ "kl": 0.0869140625,
185
  "learning_rate": 8.681980515339464e-06,
186
+ "loss": 0.0035,
187
+ "reward": 0.6718750074505806,
188
+ "reward_std": 0.1674486780539155,
189
+ "rewards/accuracy_reward": 0.3005952499806881,
190
  "rewards/format_reward": 0.0,
191
+ "rewards/tag_count_reward": 0.3712797686457634,
192
  "step": 13
193
  },
194
  {
195
+ "completion_length": 3812.681640625,
196
  "epoch": 0.3916083916083916,
197
+ "grad_norm": 0.19167593121528625,
198
+ "kl": 0.0965576171875,
199
  "learning_rate": 8.354769778736407e-06,
200
+ "loss": 0.0039,
201
+ "reward": 0.693452388048172,
202
+ "reward_std": 0.24091331334784627,
203
+ "rewards/accuracy_reward": 0.3392857201397419,
204
  "rewards/format_reward": 0.0,
205
+ "rewards/tag_count_reward": 0.3541666716337204,
206
  "step": 14
207
  },
208
  {
209
+ "completion_length": 4161.53271484375,
210
  "epoch": 0.4195804195804196,
211
+ "grad_norm": 0.22708754241466522,
212
+ "kl": 0.108642578125,
213
  "learning_rate": 8.00006604858821e-06,
214
+ "loss": 0.0043,
215
+ "reward": 0.4962797611951828,
216
+ "reward_std": 0.17475327849388123,
217
+ "rewards/accuracy_reward": 0.190476194024086,
218
  "rewards/format_reward": 0.0,
219
+ "rewards/tag_count_reward": 0.305803582072258,
220
  "step": 15
221
  },
222
  {
223
+ "completion_length": 4106.3929443359375,
224
  "epoch": 0.44755244755244755,
225
+ "grad_norm": 0.20696592330932617,
226
+ "kl": 0.120361328125,
227
  "learning_rate": 7.621285315716991e-06,
228
+ "loss": 0.0048,
229
+ "reward": 0.538690485060215,
230
+ "reward_std": 0.1706959567964077,
231
+ "rewards/accuracy_reward": 0.22023809887468815,
232
  "rewards/format_reward": 0.0,
233
+ "rewards/tag_count_reward": 0.318452388048172,
234
  "step": 16
235
  },
236
  {
237
+ "completion_length": 4136.211486816406,
238
  "epoch": 0.4755244755244755,
239
+ "grad_norm": 0.2171836495399475,
240
+ "kl": 0.145751953125,
241
  "learning_rate": 7.222075445642904e-06,
242
+ "loss": 0.0058,
243
+ "reward": 0.5014881044626236,
244
+ "reward_std": 0.15721454098820686,
245
+ "rewards/accuracy_reward": 0.19642857275903225,
246
  "rewards/format_reward": 0.0,
247
+ "rewards/tag_count_reward": 0.3050595223903656,
248
  "step": 17
249
  },
250
  {
251
+ "completion_length": 4130.928527832031,
252
  "epoch": 0.5034965034965035,
253
+ "grad_norm": 0.12546850740909576,
254
+ "kl": 0.1390380859375,
255
  "learning_rate": 6.80628104764508e-06,
256
+ "loss": 0.0056,
257
+ "reward": 0.4866071492433548,
258
+ "reward_std": 0.14047462213784456,
259
+ "rewards/accuracy_reward": 0.18154762126505375,
260
  "rewards/format_reward": 0.0,
261
+ "rewards/tag_count_reward": 0.3050595298409462,
262
  "step": 18
263
  },
264
  {
265
+ "completion_length": 3794.3424072265625,
266
  "epoch": 0.5314685314685315,
267
+ "grad_norm": 0.1713448166847229,
268
+ "kl": 0.15673828125,
269
  "learning_rate": 6.377906449072578e-06,
270
+ "loss": 0.0063,
271
+ "reward": 0.6927083432674408,
272
+ "reward_std": 0.22955949790775776,
273
+ "rewards/accuracy_reward": 0.3482142947614193,
274
  "rewards/format_reward": 0.0,
275
+ "rewards/tag_count_reward": 0.3444940596818924,
276
  "step": 19
277
  },
278
  {
279
+ "completion_length": 4126.431640625,
280
  "epoch": 0.5594405594405595,
281
+ "grad_norm": 0.13459710776805878,
282
+ "kl": 0.175537109375,
283
  "learning_rate": 5.9410771314830255e-06,
284
+ "loss": 0.007,
285
+ "reward": 0.4598214402794838,
286
+ "reward_std": 0.13734917528927326,
287
+ "rewards/accuracy_reward": 0.16369048226624727,
288
  "rewards/format_reward": 0.0,
289
+ "rewards/tag_count_reward": 0.2961309477686882,
290
  "step": 20
291
  },
292
  {
293
+ "completion_length": 4043.2351684570312,
294
  "epoch": 0.5874125874125874,
295
+ "grad_norm": 0.28960075974464417,
296
+ "kl": 0.16748046875,
297
  "learning_rate": 5.500000000000001e-06,
298
+ "loss": 0.0067,
299
+ "reward": 0.5342261865735054,
300
+ "reward_std": 0.22838787361979485,
301
+ "rewards/accuracy_reward": 0.21428572200238705,
302
  "rewards/format_reward": 0.0,
303
+ "rewards/tag_count_reward": 0.319940485060215,
304
  "step": 21
305
  },
306
  {
307
+ "completion_length": 4076.5684814453125,
308
  "epoch": 0.6153846153846154,
309
+ "grad_norm": 0.15393978357315063,
310
+ "kl": 0.172607421875,
311
  "learning_rate": 5.0589228685169776e-06,
312
+ "loss": 0.0069,
313
+ "reward": 0.5535714477300644,
314
+ "reward_std": 0.1490439474582672,
315
+ "rewards/accuracy_reward": 0.2410714328289032,
316
  "rewards/format_reward": 0.0,
317
+ "rewards/tag_count_reward": 0.3125000074505806,
318
  "step": 22
319
  },
320
  {
321
+ "completion_length": 3821.038818359375,
322
  "epoch": 0.6433566433566433,
323
+ "grad_norm": 0.21775129437446594,
324
+ "kl": 0.189697265625,
325
  "learning_rate": 4.622093550927423e-06,
326
+ "loss": 0.0076,
327
+ "reward": 0.7075892984867096,
328
+ "reward_std": 0.2531384788453579,
329
+ "rewards/accuracy_reward": 0.3630952499806881,
330
  "rewards/format_reward": 0.0,
331
+ "rewards/tag_count_reward": 0.3444940596818924,
332
  "step": 23
333
  },
334
  {
335
+ "completion_length": 3507.6101684570312,
336
  "epoch": 0.6713286713286714,
337
+ "grad_norm": 0.14808182418346405,
338
+ "kl": 0.18017578125,
339
  "learning_rate": 4.193718952354921e-06,
340
+ "loss": 0.0072,
341
+ "reward": 0.7343750149011612,
342
+ "reward_std": 0.18677468597888947,
343
+ "rewards/accuracy_reward": 0.3690476231276989,
344
  "rewards/format_reward": 0.0,
345
+ "rewards/tag_count_reward": 0.3653273805975914,
346
  "step": 24
347
  },
348
  {
349
+ "completion_length": 3405.7262573242188,
350
  "epoch": 0.6993006993006993,
351
+ "grad_norm": 0.1778838038444519,
352
+ "kl": 0.1806640625,
353
  "learning_rate": 3.777924554357096e-06,
354
+ "loss": 0.0072,
355
+ "reward": 0.8072916865348816,
356
+ "reward_std": 0.28540152311325073,
357
+ "rewards/accuracy_reward": 0.4196428582072258,
358
  "rewards/format_reward": 0.0,
359
+ "rewards/tag_count_reward": 0.3876488208770752,
360
  "step": 25
361
  },
362
  {
363
+ "completion_length": 3539.7589721679688,
364
  "epoch": 0.7272727272727273,
365
+ "grad_norm": 10.079139709472656,
366
+ "kl": 0.380615234375,
367
  "learning_rate": 3.378714684283011e-06,
368
+ "loss": 0.0152,
369
+ "reward": 0.7224702537059784,
370
+ "reward_std": 0.17052935622632504,
371
+ "rewards/accuracy_reward": 0.3690476268529892,
372
  "rewards/format_reward": 0.0,
373
+ "rewards/tag_count_reward": 0.3534226268529892,
374
  "step": 26
375
  },
376
  {
377
+ "completion_length": 3090.5238647460938,
378
  "epoch": 0.7552447552447552,
379
+ "grad_norm": 0.18040025234222412,
380
+ "kl": 0.198486328125,
381
  "learning_rate": 2.9999339514117913e-06,
382
+ "loss": 0.0079,
383
+ "reward": 0.8050595223903656,
384
+ "reward_std": 0.17971179634332657,
385
+ "rewards/accuracy_reward": 0.3928571529686451,
386
  "rewards/format_reward": 0.0,
387
+ "rewards/tag_count_reward": 0.412202388048172,
388
  "step": 27
389
  },
390
  {
391
+ "completion_length": 3675.3035888671875,
392
  "epoch": 0.7832167832167832,
393
+ "grad_norm": 0.2067694365978241,
394
+ "kl": 0.204345703125,
395
  "learning_rate": 2.645230221263596e-06,
396
+ "loss": 0.0082,
397
+ "reward": 0.5119047686457634,
398
+ "reward_std": 0.16108463145792484,
399
+ "rewards/accuracy_reward": 0.14285714365541935,
400
  "rewards/format_reward": 0.0,
401
+ "rewards/tag_count_reward": 0.3690476268529892,
402
  "step": 28
403
  },
404
  {
405
+ "completion_length": 3554.83935546875,
406
  "epoch": 0.8111888111888111,
407
+ "grad_norm": 0.1498899906873703,
408
+ "kl": 0.177734375,
409
  "learning_rate": 2.3180194846605367e-06,
410
+ "loss": 0.0071,
411
+ "reward": 0.7105654925107956,
412
+ "reward_std": 0.21086332947015762,
413
+ "rewards/accuracy_reward": 0.3035714291036129,
414
  "rewards/format_reward": 0.0,
415
+ "rewards/tag_count_reward": 0.4069940522313118,
416
  "step": 29
417
  },
418
  {
419
+ "completion_length": 3624.2232055664062,
420
  "epoch": 0.8391608391608392,
421
+ "grad_norm": 0.19065174460411072,
422
+ "kl": 0.209716796875,
423
  "learning_rate": 2.021452959867684e-06,
424
+ "loss": 0.0084,
425
+ "reward": 0.5654762014746666,
426
+ "reward_std": 0.18950912356376648,
427
+ "rewards/accuracy_reward": 0.18452381528913975,
428
  "rewards/format_reward": 0.0,
429
+ "rewards/tag_count_reward": 0.3809523805975914,
430
  "step": 30
431
  },
432
  {
433
+ "completion_length": 3379.0714721679688,
434
  "epoch": 0.8671328671328671,
435
+ "grad_norm": 0.19460894167423248,
436
+ "kl": 0.196044921875,
437
  "learning_rate": 1.7583867446385461e-06,
438
+ "loss": 0.0079,
439
+ "reward": 0.7239583432674408,
440
+ "reward_std": 0.23103895224630833,
441
+ "rewards/accuracy_reward": 0.3125000074505806,
442
  "rewards/format_reward": 0.0,
443
+ "rewards/tag_count_reward": 0.4114583358168602,
444
  "step": 31
445
  },
446
  {
447
+ "completion_length": 3222.9405517578125,
448
  "epoch": 0.8951048951048951,
449
+ "grad_norm": 0.19526593387126923,
450
+ "kl": 0.1943359375,
451
  "learning_rate": 1.531354310432403e-06,
452
+ "loss": 0.0078,
453
+ "reward": 0.6793154776096344,
454
+ "reward_std": 0.13999886438250542,
455
+ "rewards/accuracy_reward": 0.279761902987957,
456
  "rewards/format_reward": 0.0,
457
+ "rewards/tag_count_reward": 0.3995535746216774,
458
  "step": 32
459
  },
460
  {
461
+ "completion_length": 3435.9376220703125,
462
  "epoch": 0.9230769230769231,
463
+ "grad_norm": 0.16224603354930878,
464
+ "kl": 0.225341796875,
465
  "learning_rate": 1.3425421036992098e-06,
466
+ "loss": 0.009,
467
+ "reward": 0.7209821492433548,
468
+ "reward_std": 0.18096437118947506,
469
+ "rewards/accuracy_reward": 0.3244047649204731,
470
  "rewards/format_reward": 0.0,
471
+ "rewards/tag_count_reward": 0.396577388048172,
472
  "step": 33
473
  },
474
  {
475
+ "completion_length": 2996.5952758789062,
476
  "epoch": 0.951048951048951,
477
+ "grad_norm": 0.20968052744865417,
478
+ "kl": 0.20849609375,
479
  "learning_rate": 1.1937684892050606e-06,
480
+ "loss": 0.0083,
481
+ "reward": 0.8229166865348816,
482
+ "reward_std": 0.2238299883902073,
483
+ "rewards/accuracy_reward": 0.380952388048172,
484
  "rewards/format_reward": 0.0,
485
+ "rewards/tag_count_reward": 0.4419642984867096,
486
  "step": 34
487
  },
488
  {
489
+ "completion_length": 3923.5328979492188,
490
  "epoch": 0.9790209790209791,
491
+ "grad_norm": 0.2057952880859375,
492
+ "kl": 0.232666015625,
493
  "learning_rate": 1.0864662381854632e-06,
494
+ "loss": 0.0093,
495
+ "reward": 0.4434523954987526,
496
+ "reward_std": 0.1376960724592209,
497
+ "rewards/accuracy_reward": 0.11904762173071504,
498
  "rewards/format_reward": 0.0,
499
+ "rewards/tag_count_reward": 0.3244047686457634,
500
  "step": 35
501
  },
502
  {
503
+ "completion_length": 4366.71484375,
504
  "epoch": 1.0,
505
+ "grad_norm": 0.2057952880859375,
506
+ "kl": 0.24381510416666666,
507
  "learning_rate": 1.0216687299751146e-06,
508
+ "loss": 0.0073,
509
+ "reward": 0.3928571442763011,
510
+ "reward_std": 0.11071915179491043,
511
+ "rewards/accuracy_reward": 0.09920635198553403,
512
  "rewards/format_reward": 0.0,
513
+ "rewards/tag_count_reward": 0.2936507960160573,
514
  "step": 36
515
  },
516
  {
517
  "epoch": 1.0,
518
  "step": 36,
519
  "total_flos": 0.0,
520
+ "train_loss": 0.005675484693879993,
521
+ "train_runtime": 16137.0184,
522
+ "train_samples_per_second": 0.062,
523
+ "train_steps_per_second": 0.002
524
  }
525
  ],
526
  "logging_steps": 1,