QYWH commited on
Commit
24964ce
·
verified ·
1 Parent(s): 61284b2

Model save

Browse files
README.md CHANGED
@@ -27,14 +27,14 @@ print(output["generated_text"])
27
 
28
  ## Training procedure
29
 
30
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/2495479412-/huggingface/runs/ll8pde88)
31
 
32
 
33
  This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
34
 
35
  ### Framework versions
36
 
37
- - TRL: 0.16.0.dev0
38
  - Transformers: 4.49.0.dev0
39
  - Pytorch: 2.5.1
40
  - Datasets: 3.3.0
 
27
 
28
  ## Training procedure
29
 
30
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/2495479412-/huggingface/runs/jq7at7mf)
31
 
32
 
33
  This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
34
 
35
  ### Framework versions
36
 
37
+ - TRL: 0.15.1
38
  - Transformers: 4.49.0.dev0
39
  - Pytorch: 2.5.1
40
  - Datasets: 3.3.0
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.036479350634126606,
4
- "train_runtime": 34004.8654,
5
- "train_samples": 1300,
6
- "train_samples_per_second": 1.529,
7
- "train_steps_per_second": 0.006
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 0.011576007895314433,
4
+ "train_runtime": 33226.465,
5
+ "train_samples": 1000,
6
+ "train_samples_per_second": 0.602,
7
+ "train_steps_per_second": 0.005
8
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1e51f376edd035aeedab94a3ff434d2f503da5f1e58c828fe0ffe97e3728bb6f
3
  size 3554214752
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f5202ee4d11bf12fa59e49685898173c59ed7b46ea6aea39ad2e5e1bd0277c10
3
  size 3554214752
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.036479350634126606,
4
- "train_runtime": 34004.8654,
5
- "train_samples": 1300,
6
- "train_samples_per_second": 1.529,
7
- "train_steps_per_second": 0.006
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 0.011576007895314433,
4
+ "train_runtime": 33226.465,
5
+ "train_samples": 1000,
6
+ "train_samples_per_second": 0.602,
7
+ "train_steps_per_second": 0.005
8
  }
trainer_state.json CHANGED
@@ -1,575 +1,475 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 33.390243902439025,
5
- "eval_steps": 100,
6
- "global_step": 200,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "completion_length": 393.713187789917,
13
- "epoch": 0.975609756097561,
14
- "grad_norm": 0.2263876348733902,
15
- "kl": 0.00019991397857666016,
16
- "learning_rate": 5e-06,
17
  "loss": 0.0,
18
- "reward": 0.3574329398572445,
19
- "reward_std": 0.317268418520689,
20
- "rewards/accuracy_reward_word": 0.3574329435825348,
21
- "rewards/format_reward": 0.0,
 
22
  "step": 5
23
  },
24
  {
25
- "completion_length": 359.7960117224491,
26
- "epoch": 1.7804878048780488,
27
- "grad_norm": 0.4675009250640869,
28
- "kl": 0.029177347819010418,
29
- "learning_rate": 1e-05,
30
- "loss": 0.001,
31
- "reward": 0.4508711948539271,
32
- "reward_std": 0.29290521957657556,
33
- "rewards/accuracy_reward_word": 0.45087120072408154,
34
- "rewards/format_reward": 0.0,
 
35
  "step": 10
36
  },
37
  {
38
- "completion_length": 302.4170865145597,
39
- "epoch": 2.5853658536585367,
40
- "grad_norm": 1.2330620288848877,
41
- "kl": 0.06524658203125,
42
- "learning_rate": 1.5000000000000002e-05,
43
- "loss": 0.0022,
44
- "reward": 0.5415759321415063,
45
- "reward_std": 0.24065017248644974,
46
- "rewards/accuracy_reward_word": 0.5415759276260029,
47
- "rewards/format_reward": 0.0,
 
48
  "step": 15
49
  },
50
  {
51
- "completion_length": 272.7953320127545,
52
- "epoch": 3.3902439024390243,
53
- "grad_norm": 0.30344322323799133,
54
- "kl": 0.09641335227272728,
55
- "learning_rate": 2e-05,
56
- "loss": 0.0032,
57
- "reward": 0.5741085495912668,
58
- "reward_std": 0.2243843080871033,
59
- "rewards/accuracy_reward_word": 0.5741085464304144,
60
- "rewards/format_reward": 0.0,
 
61
  "step": 20
62
  },
63
  {
64
- "completion_length": 281.6178417783795,
65
- "epoch": 4.195121951219512,
66
- "grad_norm": 0.3025609254837036,
67
- "kl": 0.16150272253787878,
68
- "learning_rate": 1.9961946980917457e-05,
69
- "loss": 0.0053,
70
- "reward": 0.5691162316185056,
71
- "reward_std": 0.24541335485198282,
72
- "rewards/accuracy_reward_word": 0.569116218975096,
73
- "rewards/format_reward": 0.0,
 
74
  "step": 25
75
  },
76
  {
77
- "completion_length": 292.8073728156812,
78
- "epoch": 5.0,
79
- "grad_norm": 0.8864567279815674,
80
- "kl": 0.36754261363636365,
81
- "learning_rate": 1.9848077530122083e-05,
82
- "loss": 0.0122,
83
- "reward": 0.6015709214138262,
84
- "reward_std": 0.24950947499636447,
85
- "rewards/accuracy_reward_word": 0.6015709196076249,
86
- "rewards/format_reward": 0.0,
 
87
  "step": 30
88
  },
89
  {
90
- "completion_length": 175.47266426086426,
91
- "epoch": 5.975609756097561,
92
- "grad_norm": 0.7226300835609436,
93
- "kl": 0.44776611328125,
94
- "learning_rate": 1.9659258262890683e-05,
95
- "loss": 0.0179,
96
- "reward": 0.6118914943188429,
97
- "reward_std": 0.1827767850831151,
98
- "rewards/accuracy_reward_word": 0.6118914864957332,
99
- "rewards/format_reward": 0.0,
 
100
  "step": 35
101
  },
102
  {
103
- "completion_length": 131.66680838844994,
104
- "epoch": 6.780487804878049,
105
- "grad_norm": 18.362396240234375,
106
- "kl": 0.5860558712121212,
107
- "learning_rate": 1.9396926207859085e-05,
108
- "loss": 0.0194,
109
- "reward": 0.6025556867772882,
110
- "reward_std": 0.16030028168902252,
111
- "rewards/accuracy_reward_word": 0.6025556894865903,
112
- "rewards/format_reward": 0.0,
 
113
  "step": 40
114
  },
115
  {
116
- "completion_length": 166.34443363998875,
117
- "epoch": 7.585365853658536,
118
- "grad_norm": 0.8004411458969116,
119
- "kl": 0.518850615530303,
120
- "learning_rate": 1.9063077870366504e-05,
121
- "loss": 0.0171,
122
- "reward": 0.6216023144396868,
123
- "reward_std": 0.20047903162511913,
124
- "rewards/accuracy_reward_word": 0.6216023090210828,
125
- "rewards/format_reward": 0.0,
 
126
  "step": 45
127
  },
128
  {
129
- "completion_length": 264.6024202289003,
130
- "epoch": 8.390243902439025,
131
- "grad_norm": 0.4066580832004547,
132
- "kl": 0.6091086647727273,
133
- "learning_rate": 1.866025403784439e-05,
134
- "loss": 0.0201,
135
- "reward": 0.6114482121034102,
136
- "reward_std": 0.2318851254654653,
137
- "rewards/accuracy_reward_word": 0.6114482130065109,
138
- "rewards/format_reward": 0.0,
 
139
  "step": 50
140
  },
141
  {
142
- "completion_length": 190.6026874889027,
143
- "epoch": 9.195121951219512,
144
- "grad_norm": 0.17414061725139618,
145
- "kl": 0.6156486742424242,
146
- "learning_rate": 1.819152044288992e-05,
147
- "loss": 0.0203,
148
- "reward": 0.6352746134454553,
149
- "reward_std": 0.17895382462125836,
150
- "rewards/accuracy_reward_word": 0.6352746274435159,
151
- "rewards/format_reward": 0.0,
 
152
  "step": 55
153
  },
154
  {
155
- "completion_length": 203.3459234526663,
156
- "epoch": 10.0,
157
- "grad_norm": 0.20065073668956757,
158
- "kl": 0.6055279356060606,
159
- "learning_rate": 1.766044443118978e-05,
160
- "loss": 0.02,
161
- "reward": 0.6350559790929159,
162
- "reward_std": 0.17616610262881627,
163
- "rewards/accuracy_reward_word": 0.6350559863177213,
164
- "rewards/format_reward": 0.0,
 
165
  "step": 60
166
  },
167
  {
168
- "completion_length": 185.47132530212403,
169
- "epoch": 10.975609756097562,
170
- "grad_norm": 0.14110271632671356,
171
- "kl": 0.5395751953125,
172
- "learning_rate": 1.7071067811865477e-05,
173
- "loss": 0.0216,
174
- "reward": 0.6301242753863334,
175
- "reward_std": 0.16279728673398494,
176
- "rewards/accuracy_reward_word": 0.6301242724061012,
177
- "rewards/format_reward": 0.0,
 
178
  "step": 65
179
  },
180
  {
181
- "completion_length": 151.86269656094638,
182
- "epoch": 11.78048780487805,
183
- "grad_norm": 0.15463927388191223,
184
- "kl": 0.5140269886363636,
185
- "learning_rate": 1.6427876096865394e-05,
186
- "loss": 0.017,
187
- "reward": 0.6122245905977307,
188
- "reward_std": 0.13943401035485845,
189
- "rewards/accuracy_reward_word": 0.61222458969463,
190
- "rewards/format_reward": 0.0,
 
191
  "step": 70
192
  },
193
  {
194
- "completion_length": 178.9215455488725,
195
- "epoch": 12.585365853658537,
196
- "grad_norm": 0.18595437705516815,
197
- "kl": 0.42743844696969696,
198
- "learning_rate": 1.573576436351046e-05,
199
- "loss": 0.0141,
200
- "reward": 0.6210114558537801,
201
- "reward_std": 0.14690511865597783,
202
- "rewards/accuracy_reward_word": 0.6210114535960284,
203
- "rewards/format_reward": 0.0,
 
204
  "step": 75
205
  },
206
  {
207
- "completion_length": 198.75474432742956,
208
- "epoch": 13.390243902439025,
209
- "grad_norm": 0.14770109951496124,
210
- "kl": 0.3772194602272727,
211
- "learning_rate": 1.5000000000000002e-05,
212
- "loss": 0.0125,
213
- "reward": 0.6053550211769162,
214
- "reward_std": 0.15184391656834068,
215
- "rewards/accuracy_reward_word": 0.6053550184676142,
216
- "rewards/format_reward": 0.0,
 
217
  "step": 80
218
  },
219
  {
220
- "completion_length": 184.9996028090968,
221
- "epoch": 14.195121951219512,
222
- "grad_norm": 0.13854625821113586,
223
- "kl": 0.3505267518939394,
224
- "learning_rate": 1.4226182617406996e-05,
225
- "loss": 0.0116,
226
- "reward": 0.6068314526117209,
227
- "reward_std": 0.1331417437529925,
228
- "rewards/accuracy_reward_word": 0.6068314548694727,
229
- "rewards/format_reward": 0.0,
 
230
  "step": 85
231
  },
232
  {
233
- "completion_length": 178.54586884469697,
234
- "epoch": 15.0,
235
- "grad_norm": 0.5907576680183411,
236
- "kl": 0.4736032196969697,
237
- "learning_rate": 1.342020143325669e-05,
238
- "loss": 0.0157,
239
- "reward": 0.606158793424115,
240
- "reward_std": 0.13893395368800018,
241
- "rewards/accuracy_reward_word": 0.6061588015520212,
242
- "rewards/format_reward": 0.0,
 
243
  "step": 90
244
  },
245
  {
246
- "completion_length": 171.66697311401367,
247
- "epoch": 15.975609756097562,
248
- "grad_norm": 53.438209533691406,
249
- "kl": 13.038330078125,
250
- "learning_rate": 1.2588190451025209e-05,
251
- "loss": 0.5218,
252
- "reward": 0.5597633935511113,
253
- "reward_std": 0.10934587656520307,
254
- "rewards/accuracy_reward_word": 0.5597633916884661,
255
- "rewards/format_reward": 0.0,
 
256
  "step": 95
257
  },
258
  {
259
- "completion_length": 176.56074940074575,
260
- "epoch": 16.78048780487805,
261
- "grad_norm": 0.5583186745643616,
262
- "kl": 0.4050662878787879,
263
- "learning_rate": 1.1736481776669307e-05,
264
- "loss": 0.0134,
265
- "reward": 0.5518235546169858,
266
- "reward_std": 0.10686868465872425,
267
- "rewards/accuracy_reward_word": 0.55182356003559,
268
- "rewards/format_reward": 0.0,
269
- "step": 100
270
- },
271
- {
272
- "epoch": 16.78048780487805,
273
- "eval_completion_length": 190.2759769984654,
274
- "eval_kl": 0.32310267857142855,
275
- "eval_loss": 0.014612293802201748,
276
- "eval_reward": 0.5988541117736271,
277
- "eval_reward_std": 0.07925501785108022,
278
- "eval_rewards/accuracy_reward_word": 0.5988540819713047,
279
- "eval_rewards/format_reward": 0.0,
280
- "eval_runtime": 96.8362,
281
- "eval_samples_per_second": 2.065,
282
- "eval_steps_per_second": 0.01,
283
  "step": 100
284
  },
285
  {
286
- "completion_length": 218.19211023504084,
287
- "epoch": 17.585365853658537,
288
- "grad_norm": 54.766273498535156,
289
- "kl": 8.253255208333334,
290
- "learning_rate": 1.0871557427476585e-05,
291
- "loss": 0.2714,
292
- "reward": 0.5985936960487654,
293
- "reward_std": 0.14751893265003507,
294
- "rewards/accuracy_reward_word": 0.5985937005642689,
295
- "rewards/format_reward": 0.0,
 
296
  "step": 105
297
  },
298
  {
299
- "completion_length": 254.85891122529,
300
- "epoch": 18.390243902439025,
301
- "grad_norm": 3.6129367351531982,
302
- "kl": 5.241092566287879,
303
- "learning_rate": 1e-05,
304
- "loss": 0.1729,
305
- "reward": 0.629414488420342,
306
- "reward_std": 0.18181673561533293,
307
- "rewards/accuracy_reward_word": 0.6294144843563889,
308
- "rewards/format_reward": 0.0,
 
309
  "step": 110
310
  },
311
  {
312
- "completion_length": 263.85566780783915,
313
- "epoch": 19.195121951219512,
314
- "grad_norm": 1.826474905014038,
315
- "kl": 0.8953006628787878,
316
- "learning_rate": 9.128442572523418e-06,
317
- "loss": 0.0296,
318
- "reward": 0.6560273572350993,
319
- "reward_std": 0.2060928950932893,
320
- "rewards/accuracy_reward_word": 0.6560273635568041,
321
- "rewards/format_reward": 0.0,
 
322
  "step": 115
323
  },
324
  {
325
- "completion_length": 203.282883384011,
326
- "epoch": 20.0,
327
- "grad_norm": 0.5897179841995239,
328
- "kl": 1.3379794034090908,
329
- "learning_rate": 8.263518223330698e-06,
330
- "loss": 0.0442,
331
- "reward": 0.6160052671576991,
332
- "reward_std": 0.1589522831367724,
333
- "rewards/accuracy_reward_word": 0.6160052716732025,
334
- "rewards/format_reward": 0.0,
 
335
  "step": 120
336
  },
337
  {
338
- "completion_length": 161.04855613708497,
339
- "epoch": 20.975609756097562,
340
- "grad_norm": 0.2893044650554657,
341
- "kl": 0.331640625,
342
- "learning_rate": 7.411809548974792e-06,
343
- "loss": 0.0133,
344
- "reward": 0.5946122907102108,
345
- "reward_std": 0.12153792111203074,
346
- "rewards/accuracy_reward_word": 0.5946122877299785,
347
- "rewards/format_reward": 0.0,
 
348
  "step": 125
349
  },
350
  {
351
- "completion_length": 154.29532727328214,
352
- "epoch": 21.78048780487805,
353
- "grad_norm": 0.17429320514202118,
354
- "kl": 0.28111683238636365,
355
- "learning_rate": 6.579798566743314e-06,
356
- "loss": 0.0093,
357
- "reward": 0.577382534290805,
358
- "reward_std": 0.11978449254776492,
359
- "rewards/accuracy_reward_word": 0.5773825306784023,
360
- "rewards/format_reward": 0.0,
 
361
  "step": 130
362
  },
363
  {
364
- "completion_length": 159.07292406486744,
365
- "epoch": 22.585365853658537,
366
- "grad_norm": 0.32492542266845703,
367
- "kl": 0.2540838068181818,
368
- "learning_rate": 5.773817382593008e-06,
369
- "loss": 0.0084,
370
- "reward": 0.5901522681568608,
371
- "reward_std": 0.11932384177590862,
372
- "rewards/accuracy_reward_word": 0.5901522708661628,
373
- "rewards/format_reward": 0.0,
 
374
  "step": 135
375
  },
376
  {
377
- "completion_length": 175.563588922674,
378
- "epoch": 23.390243902439025,
379
- "grad_norm": 1.4657223224639893,
380
- "kl": 0.2944040009469697,
381
- "learning_rate": 5.000000000000003e-06,
382
- "loss": 0.0098,
383
- "reward": 0.5877998123566309,
384
- "reward_std": 0.12781856005842035,
385
- "rewards/accuracy_reward_word": 0.5877998132597316,
386
- "rewards/format_reward": 0.0,
 
387
  "step": 140
388
  },
389
  {
390
- "completion_length": 196.80899186567828,
391
- "epoch": 24.195121951219512,
392
- "grad_norm": 0.38781723380088806,
393
- "kl": 0.3389707623106061,
394
- "learning_rate": 4.264235636489542e-06,
395
- "loss": 0.0112,
396
- "reward": 0.6222013411196795,
397
- "reward_std": 0.1575910769628756,
398
- "rewards/accuracy_reward_word": 0.6222013402165789,
399
- "rewards/format_reward": 0.0,
 
400
  "step": 145
401
  },
402
  {
403
- "completion_length": 212.34213649865353,
404
- "epoch": 25.0,
405
- "grad_norm": 0.2279985398054123,
406
- "kl": 0.3564157196969697,
407
- "learning_rate": 3.5721239031346067e-06,
408
- "loss": 0.0118,
409
- "reward": 0.6341576734275529,
410
- "reward_std": 0.16485171076474767,
411
- "rewards/accuracy_reward_word": 0.6341576770399556,
412
- "rewards/format_reward": 0.0,
 
413
  "step": 150
414
  },
415
  {
416
- "completion_length": 198.5949857711792,
417
- "epoch": 25.975609756097562,
418
- "grad_norm": 0.14645038545131683,
419
- "kl": 0.32952880859375,
420
- "learning_rate": 2.9289321881345257e-06,
421
- "loss": 0.0132,
422
- "reward": 0.6285704858601093,
423
- "reward_std": 0.16049452810548245,
424
- "rewards/accuracy_reward_word": 0.6285704836249352,
425
- "rewards/format_reward": 0.0,
 
426
  "step": 155
427
  },
428
  {
429
- "completion_length": 189.9249274513938,
430
- "epoch": 26.78048780487805,
431
- "grad_norm": 0.13024762272834778,
432
- "kl": 0.3255800189393939,
433
- "learning_rate": 2.339555568810221e-06,
434
- "loss": 0.0108,
435
- "reward": 0.6150937617728205,
436
- "reward_std": 0.14573924322471474,
437
- "rewards/accuracy_reward_word": 0.6150937717069279,
438
- "rewards/format_reward": 0.0,
 
439
  "step": 160
440
  },
441
  {
442
- "completion_length": 174.02354685465494,
443
- "epoch": 27.585365853658537,
444
- "grad_norm": 0.13969573378562927,
445
- "kl": 0.3170572916666667,
446
- "learning_rate": 1.808479557110081e-06,
447
- "loss": 0.0105,
448
- "reward": 0.6073169595364368,
449
- "reward_std": 0.12986301354160815,
450
- "rewards/accuracy_reward_word": 0.607316970373645,
451
- "rewards/format_reward": 0.0,
452
- "step": 165
453
- },
454
- {
455
- "completion_length": 181.3133195819277,
456
- "epoch": 28.390243902439025,
457
- "grad_norm": 0.21579568088054657,
458
- "kl": 0.3319720643939394,
459
- "learning_rate": 1.339745962155613e-06,
460
- "loss": 0.011,
461
- "reward": 0.6149095554243434,
462
- "reward_std": 0.13368092341856522,
463
- "rewards/accuracy_reward_word": 0.6149095631006992,
464
- "rewards/format_reward": 0.0,
465
- "step": 170
466
- },
467
- {
468
- "completion_length": 179.86851408987334,
469
- "epoch": 29.195121951219512,
470
- "grad_norm": 0.1424599587917328,
471
- "kl": 0.32353811553030304,
472
- "learning_rate": 9.369221296335007e-07,
473
- "loss": 0.0107,
474
- "reward": 0.6067391426274271,
475
- "reward_std": 0.14061830226670613,
476
- "rewards/accuracy_reward_word": 0.6067391435305277,
477
- "rewards/format_reward": 0.0,
478
- "step": 175
479
- },
480
- {
481
- "completion_length": 177.73133919455788,
482
- "epoch": 30.0,
483
- "grad_norm": 0.12455170601606369,
484
- "kl": 0.32202888257575757,
485
- "learning_rate": 6.030737921409169e-07,
486
- "loss": 0.0107,
487
- "reward": 0.6030513856447104,
488
- "reward_std": 0.13355868055739187,
489
- "rewards/accuracy_reward_word": 0.6030513928695158,
490
- "rewards/format_reward": 0.0,
491
- "step": 180
492
- },
493
- {
494
- "completion_length": 178.40078887939453,
495
- "epoch": 30.975609756097562,
496
- "grad_norm": 0.14259420335292816,
497
- "kl": 0.315283203125,
498
- "learning_rate": 3.4074173710931804e-07,
499
- "loss": 0.0127,
500
- "reward": 0.6099405620247126,
501
- "reward_std": 0.13809194271452724,
502
- "rewards/accuracy_reward_word": 0.6099405620247126,
503
- "rewards/format_reward": 0.0,
504
- "step": 185
505
- },
506
- {
507
- "completion_length": 174.6302841648911,
508
- "epoch": 31.78048780487805,
509
- "grad_norm": 0.1332584172487259,
510
- "kl": 0.3048354640151515,
511
- "learning_rate": 1.519224698779198e-07,
512
- "loss": 0.0101,
513
- "reward": 0.5993030166084116,
514
- "reward_std": 0.1328924118795178,
515
- "rewards/accuracy_reward_word": 0.5993030138991096,
516
- "rewards/format_reward": 0.0,
517
- "step": 190
518
- },
519
- {
520
- "completion_length": 184.39110634543678,
521
- "epoch": 32.58536585365854,
522
- "grad_norm": 0.13871651887893677,
523
- "kl": 0.31865530303030304,
524
- "learning_rate": 3.805301908254455e-08,
525
- "loss": 0.0106,
526
- "reward": 0.6105609360066327,
527
- "reward_std": 0.14098199039246095,
528
- "rewards/accuracy_reward_word": 0.610560937812834,
529
- "rewards/format_reward": 0.0,
530
- "step": 195
531
- },
532
- {
533
- "completion_length": 184.56710722952178,
534
- "epoch": 33.390243902439025,
535
- "grad_norm": 0.2092735767364502,
536
- "kl": 0.3228574810606061,
537
- "learning_rate": 0.0,
538
- "loss": 0.0107,
539
- "reward": 0.6148976295283346,
540
- "reward_std": 0.1421807540975737,
541
- "rewards/accuracy_reward_word": 0.614897631334536,
542
- "rewards/format_reward": 0.0,
543
- "step": 200
544
- },
545
- {
546
- "epoch": 33.390243902439025,
547
- "eval_completion_length": 183.40406145368303,
548
- "eval_kl": 0.3189174107142857,
549
- "eval_loss": 0.014444979839026928,
550
- "eval_reward": 0.6251552956444877,
551
- "eval_reward_std": 0.1085951988186155,
552
- "eval_rewards/accuracy_reward_word": 0.6251552700996399,
553
- "eval_rewards/format_reward": 0.0,
554
- "eval_runtime": 95.3621,
555
- "eval_samples_per_second": 2.097,
556
- "eval_steps_per_second": 0.01,
557
- "step": 200
558
- },
559
- {
560
- "epoch": 33.390243902439025,
561
- "step": 200,
562
  "total_flos": 0.0,
563
- "train_loss": 0.036479350634126606,
564
- "train_runtime": 34004.8654,
565
- "train_samples_per_second": 1.529,
566
- "train_steps_per_second": 0.006
567
  }
568
  ],
569
  "logging_steps": 5,
570
- "max_steps": 200,
571
  "num_input_tokens_seen": 0,
572
- "num_train_epochs": 40,
573
  "save_steps": 500,
574
  "stateful_callbacks": {
575
  "TrainerControl": {
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 19.895104895104897,
5
+ "eval_steps": 1000,
6
+ "global_step": 160,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "completion_length": 364.37423477172854,
13
+ "epoch": 0.5594405594405595,
14
+ "grad_norm": 0.27259644865989685,
15
+ "kl": 0.00020606517791748046,
16
+ "learning_rate": 6.25e-06,
17
  "loss": 0.0,
18
+ "reward": 1.2981241554021836,
19
+ "reward_std": 0.34615046456456183,
20
+ "rewards/accuracy_reward_word": 0.32314828839153054,
21
+ "rewards/format_reward": 0.9845982514321804,
22
+ "rewards/repetition_penalty_reward": -0.009622399129875702,
23
  "step": 5
24
  },
25
  {
26
+ "completion_length": 300.59944620999426,
27
+ "epoch": 1.2237762237762237,
28
+ "grad_norm": 0.6720463037490845,
29
+ "kl": 0.054093274203213776,
30
+ "learning_rate": 1.25e-05,
31
+ "loss": 0.0024,
32
+ "reward": 1.4319009455767544,
33
+ "reward_std": 0.3311943354254419,
34
+ "rewards/accuracy_reward_word": 0.44601600854234263,
35
+ "rewards/format_reward": 0.992999211631038,
36
+ "rewards/repetition_penalty_reward": -0.007114298457467653,
37
  "step": 10
38
  },
39
  {
40
+ "completion_length": 217.79476432800294,
41
+ "epoch": 1.7832167832167833,
42
+ "grad_norm": 0.29593613743782043,
43
+ "kl": 0.110693359375,
44
+ "learning_rate": 1.8750000000000002e-05,
45
+ "loss": 0.0044,
46
+ "reward": 1.5967296287417412,
47
+ "reward_std": 0.3098897160962224,
48
+ "rewards/accuracy_reward_word": 0.6012545950710774,
49
+ "rewards/format_reward": 0.9985491126775742,
50
+ "rewards/repetition_penalty_reward": -0.003074125228158664,
51
  "step": 15
52
  },
53
  {
54
+ "completion_length": 200.27791127291593,
55
+ "epoch": 2.4475524475524475,
56
+ "grad_norm": 0.26520445942878723,
57
+ "kl": 0.1642123135653409,
58
+ "learning_rate": 1.9961946980917457e-05,
59
+ "loss": 0.0072,
60
+ "reward": 1.6224368851293216,
61
+ "reward_std": 0.27555874311788514,
62
+ "rewards/accuracy_reward_word": 0.6239803387698802,
63
+ "rewards/format_reward": 0.998985393480821,
64
+ "rewards/repetition_penalty_reward": -0.000528900803601573,
65
  "step": 20
66
  },
67
  {
68
+ "completion_length": 270.1645825125954,
69
+ "epoch": 3.111888111888112,
70
+ "grad_norm": 0.34845927357673645,
71
+ "kl": 0.24596058238636365,
72
+ "learning_rate": 1.9807852804032306e-05,
73
+ "loss": 0.0108,
74
+ "reward": 1.5945834151723168,
75
+ "reward_std": 0.3396115639162334,
76
+ "rewards/accuracy_reward_word": 0.6209050552411512,
77
+ "rewards/format_reward": 0.9739245474338531,
78
+ "rewards/repetition_penalty_reward": -0.0002461845716342478,
79
  "step": 25
80
  },
81
  {
82
+ "completion_length": 235.43092441558838,
83
+ "epoch": 3.6713286713286712,
84
+ "grad_norm": 3.628526449203491,
85
+ "kl": 1.1632080078125,
86
+ "learning_rate": 1.953716950748227e-05,
87
+ "loss": 0.0466,
88
+ "reward": 1.3556241348385811,
89
+ "reward_std": 0.18557674251496792,
90
+ "rewards/accuracy_reward_word": 0.36838247990235684,
91
+ "rewards/format_reward": 0.9909598484635354,
92
+ "rewards/repetition_penalty_reward": -0.003718209845078491,
93
  "step": 30
94
  },
95
  {
96
+ "completion_length": 246.95547086542302,
97
+ "epoch": 4.335664335664336,
98
+ "grad_norm": 1.1314058303833008,
99
+ "kl": 0.3246515447443182,
100
+ "learning_rate": 1.9153114791194475e-05,
101
+ "loss": 0.0143,
102
+ "reward": 1.3994574384255842,
103
+ "reward_std": 0.20727728540077806,
104
+ "rewards/accuracy_reward_word": 0.41514577995985746,
105
+ "rewards/format_reward": 0.9877232407981699,
106
+ "rewards/repetition_penalty_reward": -0.003411588459336847,
107
  "step": 35
108
  },
109
  {
110
+ "completion_length": 295.24443359375,
111
+ "epoch": 4.895104895104895,
112
+ "grad_norm": 0.7106680274009705,
113
+ "kl": 0.45169677734375,
114
+ "learning_rate": 1.866025403784439e-05,
115
+ "loss": 0.0181,
116
+ "reward": 1.5822205603122712,
117
+ "reward_std": 0.3901990856975317,
118
+ "rewards/accuracy_reward_word": 0.6376858472824096,
119
+ "rewards/format_reward": 0.9446428999304771,
120
+ "rewards/repetition_penalty_reward": -0.0001081747655689469,
121
  "step": 40
122
  },
123
  {
124
+ "completion_length": 227.82925120267,
125
+ "epoch": 5.559440559440559,
126
+ "grad_norm": 0.8820157051086426,
127
+ "kl": 0.4667746803977273,
128
+ "learning_rate": 1.806444604267483e-05,
129
+ "loss": 0.0205,
130
+ "reward": 1.4798817797140642,
131
+ "reward_std": 0.2851217135533013,
132
+ "rewards/accuracy_reward_word": 0.5120031014931473,
133
+ "rewards/format_reward": 0.9679383540695364,
134
+ "rewards/repetition_penalty_reward": -5.967220460495975e-05,
135
  "step": 45
136
  },
137
  {
138
+ "completion_length": 215.02862150018865,
139
+ "epoch": 6.223776223776224,
140
+ "grad_norm": 0.19495409727096558,
141
+ "kl": 0.41028941761363635,
142
+ "learning_rate": 1.737277336810124e-05,
143
+ "loss": 0.0181,
144
+ "reward": 1.5078922672705217,
145
+ "reward_std": 0.27444807008247485,
146
+ "rewards/accuracy_reward_word": 0.5324522018093955,
147
+ "rewards/format_reward": 0.9754464673725042,
148
+ "rewards/repetition_penalty_reward": -6.40446375076532e-06,
149
  "step": 50
150
  },
151
  {
152
+ "completion_length": 253.25693054199218,
153
+ "epoch": 6.783216783216783,
154
+ "grad_norm": 0.161370187997818,
155
+ "kl": 0.3687744140625,
156
+ "learning_rate": 1.659345815100069e-05,
157
+ "loss": 0.0148,
158
+ "reward": 1.617374736070633,
159
+ "reward_std": 0.3065785804763436,
160
+ "rewards/accuracy_reward_word": 0.6409355964511633,
161
+ "rewards/format_reward": 0.9764509290456772,
162
+ "rewards/repetition_penalty_reward": -1.1780754721257836e-05,
163
  "step": 55
164
  },
165
  {
166
+ "completion_length": 275.0440474423495,
167
+ "epoch": 7.4475524475524475,
168
+ "grad_norm": 0.2272227257490158,
169
+ "kl": 0.35566850142045453,
170
+ "learning_rate": 1.573576436351046e-05,
171
+ "loss": 0.0156,
172
+ "reward": 1.5927915437655016,
173
+ "reward_std": 0.30714009304276924,
174
+ "rewards/accuracy_reward_word": 0.6195999278940938,
175
+ "rewards/format_reward": 0.97321432557973,
176
+ "rewards/repetition_penalty_reward": -2.271822495458764e-05,
177
  "step": 60
178
  },
179
  {
180
+ "completion_length": 262.15382610667837,
181
+ "epoch": 8.111888111888112,
182
+ "grad_norm": 0.19355681538581848,
183
+ "kl": 0.3338068181818182,
184
+ "learning_rate": 1.4809887689193878e-05,
185
+ "loss": 0.0147,
186
+ "reward": 1.4948042712428353,
187
+ "reward_std": 0.2743698521940546,
188
+ "rewards/accuracy_reward_word": 0.5210835107348182,
189
+ "rewards/format_reward": 0.9737216295166449,
190
+ "rewards/repetition_penalty_reward": -8.746641965858131e-07,
191
  "step": 65
192
  },
193
  {
194
+ "completion_length": 250.4502347946167,
195
+ "epoch": 8.671328671328672,
196
+ "grad_norm": 0.1308055967092514,
197
+ "kl": 0.31865234375,
198
+ "learning_rate": 1.3826834323650899e-05,
199
+ "loss": 0.0127,
200
+ "reward": 1.4026295721530915,
201
+ "reward_std": 0.23001151392236352,
202
+ "rewards/accuracy_reward_word": 0.4245518417446874,
203
+ "rewards/format_reward": 0.9781250409781933,
204
+ "rewards/repetition_penalty_reward": -4.732433080789633e-05,
205
  "step": 70
206
  },
207
  {
208
+ "completion_length": 257.2775072617964,
209
+ "epoch": 9.335664335664335,
210
+ "grad_norm": 0.1448088437318802,
211
+ "kl": 0.2882302024147727,
212
+ "learning_rate": 1.2798290140309924e-05,
213
+ "loss": 0.0127,
214
+ "reward": 1.4172937978397717,
215
+ "reward_std": 0.23249881604517048,
216
+ "rewards/accuracy_reward_word": 0.4396151855418628,
217
+ "rewards/format_reward": 0.9776786098426039,
218
+ "rewards/repetition_penalty_reward": 0.0,
219
  "step": 75
220
  },
221
  {
222
+ "completion_length": 259.0001234054565,
223
+ "epoch": 9.895104895104895,
224
+ "grad_norm": 0.1194038838148117,
225
+ "kl": 0.26318359375,
226
+ "learning_rate": 1.1736481776669307e-05,
227
+ "loss": 0.0105,
228
+ "reward": 1.4497434869408607,
229
+ "reward_std": 0.25042697712779044,
230
+ "rewards/accuracy_reward_word": 0.47421625480055807,
231
+ "rewards/format_reward": 0.9755580753087998,
232
+ "rewards/repetition_penalty_reward": -3.085259668296203e-05,
233
  "step": 80
234
  },
235
  {
236
+ "completion_length": 251.73012508045542,
237
+ "epoch": 10.55944055944056,
238
+ "grad_norm": 0.12483782321214676,
239
+ "kl": 0.25578169389204547,
240
+ "learning_rate": 1.0654031292301432e-05,
241
+ "loss": 0.0113,
242
+ "reward": 1.477828394282948,
243
+ "reward_std": 0.24647284172136674,
244
+ "rewards/accuracy_reward_word": 0.49977607000619173,
245
+ "rewards/format_reward": 0.978084458546205,
246
+ "rewards/repetition_penalty_reward": -3.212933171942661e-05,
247
  "step": 85
248
  },
249
  {
250
+ "completion_length": 243.65271100130948,
251
+ "epoch": 11.223776223776223,
252
+ "grad_norm": 0.4139878749847412,
253
+ "kl": 0.2525967684659091,
254
+ "learning_rate": 9.563806126346643e-06,
255
+ "loss": 0.0111,
256
+ "reward": 1.4612440195950596,
257
+ "reward_std": 0.23823331762105227,
258
+ "rewards/accuracy_reward_word": 0.4819643903862346,
259
+ "rewards/format_reward": 0.9793019870465453,
260
+ "rewards/repetition_penalty_reward": -2.236067350416058e-05,
261
  "step": 90
262
  },
263
  {
264
+ "completion_length": 246.18003330230712,
265
+ "epoch": 11.783216783216783,
266
+ "grad_norm": 0.11199598014354706,
267
+ "kl": 0.23426513671875,
268
+ "learning_rate": 8.478766138100834e-06,
269
+ "loss": 0.0094,
270
+ "reward": 1.4039153650403022,
271
+ "reward_std": 0.20266931243240832,
272
+ "rewards/accuracy_reward_word": 0.4193565859692171,
273
+ "rewards/format_reward": 0.9845982529222965,
274
+ "rewards/repetition_penalty_reward": -3.9484114859078544e-05,
275
  "step": 95
276
  },
277
  {
278
+ "completion_length": 256.24219929088247,
279
+ "epoch": 12.447552447552448,
280
+ "grad_norm": 0.23437026143074036,
281
+ "kl": 0.24050071022727273,
282
+ "learning_rate": 7.411809548974792e-06,
283
+ "loss": 0.0106,
284
+ "reward": 1.3849081302231008,
285
+ "reward_std": 0.20749073509465565,
286
+ "rewards/accuracy_reward_word": 0.4026952323249795,
287
+ "rewards/format_reward": 0.9822443513707682,
288
+ "rewards/repetition_penalty_reward": -3.145250816900939e-05,
 
 
 
 
 
 
 
 
 
 
 
 
 
289
  "step": 100
290
  },
291
  {
292
+ "completion_length": 257.46277410333806,
293
+ "epoch": 13.111888111888112,
294
+ "grad_norm": 0.11270800977945328,
295
+ "kl": 0.21811745383522727,
296
+ "learning_rate": 6.375619617162985e-06,
297
+ "loss": 0.0096,
298
+ "reward": 1.4131186618046327,
299
+ "reward_std": 0.2085759434002367,
300
+ "rewards/accuracy_reward_word": 0.4291051478040489,
301
+ "rewards/format_reward": 0.9840706532651727,
302
+ "rewards/repetition_penalty_reward": -5.714496695542369e-05,
303
  "step": 105
304
  },
305
  {
306
+ "completion_length": 260.4148557662964,
307
+ "epoch": 13.671328671328672,
308
+ "grad_norm": 0.1118827760219574,
309
+ "kl": 0.21689453125,
310
+ "learning_rate": 5.382513867649663e-06,
311
+ "loss": 0.0087,
312
+ "reward": 1.413018099963665,
313
+ "reward_std": 0.20751294190995395,
314
+ "rewards/accuracy_reward_word": 0.4284829759038985,
315
+ "rewards/format_reward": 0.9845982521772385,
316
+ "rewards/repetition_penalty_reward": -6.31413837254513e-05,
317
  "step": 110
318
  },
319
  {
320
+ "completion_length": 255.0886882435192,
321
+ "epoch": 14.335664335664335,
322
+ "grad_norm": 0.1052035540342331,
323
+ "kl": 0.21266867897727273,
324
+ "learning_rate": 4.444297669803981e-06,
325
+ "loss": 0.0094,
326
+ "reward": 1.3777457651766865,
327
+ "reward_std": 0.19213764581151985,
328
+ "rewards/accuracy_reward_word": 0.3917047695934095,
329
+ "rewards/format_reward": 0.9860998696901582,
330
+ "rewards/repetition_penalty_reward": -5.887894752215123e-05,
331
  "step": 115
332
  },
333
  {
334
+ "completion_length": 250.88360347747803,
335
+ "epoch": 14.895104895104895,
336
+ "grad_norm": 0.1148873046040535,
337
+ "kl": 0.20421142578125,
338
+ "learning_rate": 3.5721239031346067e-06,
339
+ "loss": 0.0082,
340
+ "reward": 1.4046544060111046,
341
+ "reward_std": 0.194459034409374,
342
+ "rewards/accuracy_reward_word": 0.41711079380474986,
343
+ "rewards/format_reward": 0.9876116372644901,
344
+ "rewards/repetition_penalty_reward": -6.803718715673313e-05,
345
  "step": 120
346
  },
347
  {
348
+ "completion_length": 249.46257175098765,
349
+ "epoch": 15.55944055944056,
350
+ "grad_norm": 0.11614679545164108,
351
+ "kl": 0.20160466974431818,
352
+ "learning_rate": 2.776360379402445e-06,
353
+ "loss": 0.0089,
354
+ "reward": 1.4239109510725194,
355
+ "reward_std": 0.20904017239809036,
356
+ "rewards/accuracy_reward_word": 0.4381290450692177,
357
+ "rewards/format_reward": 0.985795489766381,
358
+ "rewards/repetition_penalty_reward": -1.3588532800151205e-05,
359
  "step": 125
360
  },
361
  {
362
+ "completion_length": 252.77699990706012,
363
+ "epoch": 16.223776223776223,
364
+ "grad_norm": 0.11460437625646591,
365
+ "kl": 0.20647638494318182,
366
+ "learning_rate": 2.0664665970876496e-06,
367
+ "loss": 0.0091,
368
+ "reward": 1.4404897730458865,
369
+ "reward_std": 0.22268841487609528,
370
+ "rewards/accuracy_reward_word": 0.45830493445762177,
371
+ "rewards/format_reward": 0.9822443588213488,
372
+ "rewards/repetition_penalty_reward": -5.952174582158808e-05,
373
  "step": 130
374
  },
375
  {
376
+ "completion_length": 256.87255535125735,
377
+ "epoch": 16.783216783216783,
378
+ "grad_norm": 0.10793906450271606,
379
+ "kl": 0.20732421875,
380
+ "learning_rate": 1.4508812932705364e-06,
381
+ "loss": 0.0083,
382
+ "reward": 1.478071430325508,
383
+ "reward_std": 0.22782372254878283,
384
+ "rewards/accuracy_reward_word": 0.49404887384735047,
385
+ "rewards/format_reward": 0.9840402141213417,
386
+ "rewards/repetition_penalty_reward": -1.7666907024249667e-05,
387
  "step": 135
388
  },
389
  {
390
+ "completion_length": 254.8349351015958,
391
+ "epoch": 17.447552447552447,
392
+ "grad_norm": 0.13680703938007355,
393
+ "kl": 0.19947398792613635,
394
+ "learning_rate": 9.369221296335007e-07,
395
+ "loss": 0.0088,
396
+ "reward": 1.4661599763415076,
397
+ "reward_std": 0.22828074849464677,
398
+ "rewards/accuracy_reward_word": 0.4821980211206458,
399
+ "rewards/format_reward": 0.9839691946452315,
400
+ "rewards/repetition_penalty_reward": -7.247217581607401e-06,
401
  "step": 140
402
  },
403
  {
404
+ "completion_length": 254.28024777499112,
405
+ "epoch": 18.111888111888113,
406
+ "grad_norm": 0.12113064527511597,
407
+ "kl": 0.2043124112215909,
408
+ "learning_rate": 5.306987050489442e-07,
409
+ "loss": 0.009,
410
+ "reward": 1.4483650543472983,
411
+ "reward_std": 0.222187442493371,
412
+ "rewards/accuracy_reward_word": 0.4643406889147379,
413
+ "rewards/format_reward": 0.9840706539424983,
414
+ "rewards/repetition_penalty_reward": -4.6301358824249206e-05,
415
  "step": 145
416
  },
417
  {
418
+ "completion_length": 254.92579154968263,
419
+ "epoch": 18.67132867132867,
420
+ "grad_norm": 0.12059218436479568,
421
+ "kl": 0.2029052734375,
422
+ "learning_rate": 2.370399288006664e-07,
423
+ "loss": 0.0081,
424
+ "reward": 1.4487672820687294,
425
+ "reward_std": 0.228369791386649,
426
+ "rewards/accuracy_reward_word": 0.46667207330465316,
427
+ "rewards/format_reward": 0.9821428969502449,
428
+ "rewards/repetition_penalty_reward": -4.7683547745691615e-05,
429
  "step": 150
430
  },
431
  {
432
+ "completion_length": 255.3859702023593,
433
+ "epoch": 19.335664335664337,
434
+ "grad_norm": 0.12672924995422363,
435
+ "kl": 0.20241477272727273,
436
+ "learning_rate": 5.943661777680354e-08,
437
+ "loss": 0.0089,
438
+ "reward": 1.4418584392829374,
439
+ "reward_std": 0.22563406520269133,
440
+ "rewards/accuracy_reward_word": 0.45739410241896455,
441
+ "rewards/format_reward": 0.9844764979048208,
442
+ "rewards/repetition_penalty_reward": -1.2175325537100434e-05,
443
  "step": 155
444
  },
445
  {
446
+ "completion_length": 250.57400856018066,
447
+ "epoch": 19.895104895104897,
448
+ "grad_norm": 0.12355446815490723,
449
+ "kl": 0.19569091796875,
450
+ "learning_rate": 0.0,
451
+ "loss": 0.0078,
452
+ "reward": 1.4596911922097207,
453
+ "reward_std": 0.21876802388578653,
454
+ "rewards/accuracy_reward_word": 0.4751062370836735,
455
+ "rewards/format_reward": 0.9845982559025288,
456
+ "rewards/repetition_penalty_reward": -1.3301288072398166e-05,
457
  "step": 160
458
  },
459
  {
460
+ "epoch": 19.895104895104897,
461
+ "step": 160,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
462
  "total_flos": 0.0,
463
+ "train_loss": 0.011576007895314433,
464
+ "train_runtime": 33226.465,
465
+ "train_samples_per_second": 0.602,
466
+ "train_steps_per_second": 0.005
467
  }
468
  ],
469
  "logging_steps": 5,
470
+ "max_steps": 160,
471
  "num_input_tokens_seen": 0,
472
+ "num_train_epochs": 20,
473
  "save_steps": 500,
474
  "stateful_callbacks": {
475
  "TrainerControl": {
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:515f14bdee751293bb315b8e548550983d747382833c2d5a27a5780db6fe0b7e
3
- size 7544
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d0b151708dbfb039fab3363622568c9f229a742f720cf88aa4b879c5da30f5f8
3
+ size 7608