chenggong committed on
Commit
0bf609e
·
verified ·
1 Parent(s): 727de28

Model save

Browse files
README.md ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen2.5-Math-7B
3
+ library_name: transformers
4
+ model_name: Qwen-2.5-Math-7B-Max-v3
5
+ tags:
6
+ - generated_from_trainer
7
+ - trl
8
+ - grpo
9
+ licence: license
10
+ ---
11
+
12
+ # Model Card for Qwen-2.5-Math-7B-Max-v3
13
+
14
+ This model is a fine-tuned version of [Qwen/Qwen2.5-Math-7B](https://huggingface.co/Qwen/Qwen2.5-Math-7B).
15
+ It has been trained using [TRL](https://github.com/huggingface/trl).
16
+
17
+ ## Quick start
18
+
19
+ ```python
20
+ from transformers import pipeline
21
+
22
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
23
+ generator = pipeline("text-generation", model="chenggong1995/Qwen-2.5-Math-7B-Max-v3", device="cuda")
24
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
25
+ print(output["generated_text"])
26
+ ```
27
+
28
+ ## Training procedure
29
+
30
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/gongc1995-city-university-of-hong-kong/huggingface/runs/v5cfvn9b)
31
+
32
+
33
+ This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
34
+
35
+ ### Framework versions
36
+
37
+ - TRL: 0.16.0.dev0
38
+ - Transformers: 4.49.0
39
+ - PyTorch: 2.5.1
40
+ - Datasets: 3.3.2
41
+ - Tokenizers: 0.21.0
42
+
43
+ ## Citations
44
+
45
+ Cite GRPO as:
46
+
47
+ ```bibtex
48
+ @article{zhihong2024deepseekmath,
49
+ title = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}},
50
+ author = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. Wu and Daya Guo},
51
+ year = 2024,
52
+ eprint = {arXiv:2402.03300},
53
+ }
54
+
55
+ ```
56
+
57
+ Cite TRL as:
58
+
59
+ ```bibtex
60
+ @misc{vonwerra2022trl,
61
+ title = {{TRL: Transformer Reinforcement Learning}},
62
+ author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallouédec},
63
+ year = 2020,
64
+ journal = {GitHub repository},
65
+ publisher = {GitHub},
66
+ howpublished = {\url{https://github.com/huggingface/trl}}
67
+ }
68
+ ```
all_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "total_flos": 0.0,
3
+ "train_loss": 0.03207497191133684,
4
+ "train_runtime": 41076.4058,
5
+ "train_samples": 7500,
6
+ "train_samples_per_second": 0.548,
7
+ "train_steps_per_second": 0.004
8
+ }
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "eos_token_id": 151643,
4
+ "max_new_tokens": 2048,
5
+ "transformers_version": "4.49.0"
6
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "total_flos": 0.0,
3
+ "train_loss": 0.03207497191133684,
4
+ "train_runtime": 41076.4058,
5
+ "train_samples": 7500,
6
+ "train_samples_per_second": 0.548,
7
+ "train_steps_per_second": 0.004
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,519 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 2.9893390191897655,
5
+ "eval_steps": 60,
6
+ "global_step": 174,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "clip_ratio": 0.0,
13
+ "completion_length": 705.7877807617188,
14
+ "epoch": 0.017057569296375266,
15
+ "grad_norm": 0.2958298623561859,
16
+ "kl": 0.0,
17
+ "learning_rate": 1e-06,
18
+ "loss": 0.0834,
19
+ "reward": 0.47402435541152954,
20
+ "reward_std": 0.3045174852013588,
21
+ "rewards/cosine_scaled_reward": 0.47402435541152954,
22
+ "step": 1
23
+ },
24
+ {
25
+ "clip_ratio": 0.0,
26
+ "completion_length": 692.6982593536377,
27
+ "epoch": 0.08528784648187633,
28
+ "grad_norm": 0.17230483889579773,
29
+ "kl": 0.00200594961643219,
30
+ "learning_rate": 1e-06,
31
+ "loss": 0.1076,
32
+ "reward": 0.4518069000914693,
33
+ "reward_std": 0.2899208152666688,
34
+ "rewards/cosine_scaled_reward": 0.4518069000914693,
35
+ "step": 5
36
+ },
37
+ {
38
+ "clip_ratio": 0.0,
39
+ "completion_length": 685.9979393005372,
40
+ "epoch": 0.17057569296375266,
41
+ "grad_norm": 0.16429604589939117,
42
+ "kl": 0.00012786388397216796,
43
+ "learning_rate": 1e-06,
44
+ "loss": 0.1077,
45
+ "reward": 0.4576279394328594,
46
+ "reward_std": 0.2894581612199545,
47
+ "rewards/cosine_scaled_reward": 0.4576279394328594,
48
+ "step": 10
49
+ },
50
+ {
51
+ "clip_ratio": 0.0,
52
+ "completion_length": 664.4338722229004,
53
+ "epoch": 0.255863539445629,
54
+ "grad_norm": 0.29537156224250793,
55
+ "kl": 0.0002796053886413574,
56
+ "learning_rate": 1e-06,
57
+ "loss": 0.0932,
58
+ "reward": 0.48066430613398553,
59
+ "reward_std": 0.2709693659096956,
60
+ "rewards/cosine_scaled_reward": 0.48066430613398553,
61
+ "step": 15
62
+ },
63
+ {
64
+ "clip_ratio": 0.0,
65
+ "completion_length": 661.8659011840821,
66
+ "epoch": 0.3411513859275053,
67
+ "grad_norm": 0.17067702114582062,
68
+ "kl": 0.0004772186279296875,
69
+ "learning_rate": 1e-06,
70
+ "loss": 0.0929,
71
+ "reward": 0.4877126231789589,
72
+ "reward_std": 0.26678171902894976,
73
+ "rewards/cosine_scaled_reward": 0.4877126231789589,
74
+ "step": 20
75
+ },
76
+ {
77
+ "clip_ratio": 0.0,
78
+ "completion_length": 686.9898681640625,
79
+ "epoch": 0.42643923240938164,
80
+ "grad_norm": 0.22597813606262207,
81
+ "kl": 0.00075836181640625,
82
+ "learning_rate": 1e-06,
83
+ "loss": 0.0922,
84
+ "reward": 0.4946805603802204,
85
+ "reward_std": 0.26594343446195123,
86
+ "rewards/cosine_scaled_reward": 0.4946805603802204,
87
+ "step": 25
88
+ },
89
+ {
90
+ "clip_ratio": 0.0,
91
+ "completion_length": 700.4682479858399,
92
+ "epoch": 0.511727078891258,
93
+ "grad_norm": 0.1372915506362915,
94
+ "kl": 0.0009161949157714844,
95
+ "learning_rate": 1e-06,
96
+ "loss": 0.0724,
97
+ "reward": 0.49349360913038254,
98
+ "reward_std": 0.2691137969493866,
99
+ "rewards/cosine_scaled_reward": 0.49349360913038254,
100
+ "step": 30
101
+ },
102
+ {
103
+ "clip_ratio": 0.0,
104
+ "completion_length": 678.8015800476074,
105
+ "epoch": 0.5970149253731343,
106
+ "grad_norm": 0.11743893474340439,
107
+ "kl": 0.0013933181762695312,
108
+ "learning_rate": 1e-06,
109
+ "loss": 0.0516,
110
+ "reward": 0.49990383088588713,
111
+ "reward_std": 0.2354368444532156,
112
+ "rewards/cosine_scaled_reward": 0.49990383088588713,
113
+ "step": 35
114
+ },
115
+ {
116
+ "clip_ratio": 0.0,
117
+ "completion_length": 670.113818359375,
118
+ "epoch": 0.6823027718550106,
119
+ "grad_norm": 0.1322937160730362,
120
+ "kl": 0.001470184326171875,
121
+ "learning_rate": 1e-06,
122
+ "loss": 0.0423,
123
+ "reward": 0.4925546832382679,
124
+ "reward_std": 0.23934022188186646,
125
+ "rewards/cosine_scaled_reward": 0.4925546832382679,
126
+ "step": 40
127
+ },
128
+ {
129
+ "clip_ratio": 0.0,
130
+ "completion_length": 693.9797073364258,
131
+ "epoch": 0.767590618336887,
132
+ "grad_norm": 0.11684294044971466,
133
+ "kl": 0.0016462326049804688,
134
+ "learning_rate": 1e-06,
135
+ "loss": 0.0472,
136
+ "reward": 0.5110804848372936,
137
+ "reward_std": 0.24233248196542262,
138
+ "rewards/cosine_scaled_reward": 0.5110804848372936,
139
+ "step": 45
140
+ },
141
+ {
142
+ "clip_ratio": 0.0,
143
+ "completion_length": 703.8312713623047,
144
+ "epoch": 0.8528784648187633,
145
+ "grad_norm": 0.13510337471961975,
146
+ "kl": 0.0022981643676757814,
147
+ "learning_rate": 1e-06,
148
+ "loss": 0.0378,
149
+ "reward": 0.5143898174166679,
150
+ "reward_std": 0.24455392695963382,
151
+ "rewards/cosine_scaled_reward": 0.5143898174166679,
152
+ "step": 50
153
+ },
154
+ {
155
+ "clip_ratio": 0.0,
156
+ "completion_length": 676.6164245605469,
157
+ "epoch": 0.9381663113006397,
158
+ "grad_norm": 0.12922672927379608,
159
+ "kl": 0.0027494430541992188,
160
+ "learning_rate": 1e-06,
161
+ "loss": 0.0368,
162
+ "reward": 0.538255549967289,
163
+ "reward_std": 0.23528089113533496,
164
+ "rewards/cosine_scaled_reward": 0.538255549967289,
165
+ "step": 55
166
+ },
167
+ {
168
+ "epoch": 1.0341151385927505,
169
+ "grad_norm": 0.1454162746667862,
170
+ "learning_rate": 1e-06,
171
+ "loss": 0.0173,
172
+ "step": 60
173
+ },
174
+ {
175
+ "epoch": 1.0341151385927505,
176
+ "eval_clip_ratio": 0.0,
177
+ "eval_completion_length": 693.2279481887817,
178
+ "eval_kl": 0.004572391510009766,
179
+ "eval_loss": 0.017638780176639557,
180
+ "eval_reward": 0.4241527561098337,
181
+ "eval_reward_std": 0.27252104552462697,
182
+ "eval_rewards/cosine_scaled_reward": 0.4241527561098337,
183
+ "eval_runtime": 731.0086,
184
+ "eval_samples_per_second": 0.684,
185
+ "eval_steps_per_second": 0.008,
186
+ "step": 60
187
+ },
188
+ {
189
+ "clip_ratio": 0.0,
190
+ "completion_length": 658.7437713623046,
191
+ "epoch": 1.1194029850746268,
192
+ "grad_norm": 0.14889299869537354,
193
+ "kl": 0.004099464416503907,
194
+ "learning_rate": 1e-06,
195
+ "loss": 0.0233,
196
+ "reward": 0.5276085119694471,
197
+ "reward_std": 0.22797267828136683,
198
+ "rewards/cosine_scaled_reward": 0.5276085119694471,
199
+ "step": 65
200
+ },
201
+ {
202
+ "clip_ratio": 0.0,
203
+ "completion_length": 661.8356964111329,
204
+ "epoch": 1.2046908315565032,
205
+ "grad_norm": 0.13085317611694336,
206
+ "kl": 0.0031612396240234377,
207
+ "learning_rate": 1e-06,
208
+ "loss": 0.0077,
209
+ "reward": 0.5444076530635357,
210
+ "reward_std": 0.22460445892065764,
211
+ "rewards/cosine_scaled_reward": 0.5444076530635357,
212
+ "step": 70
213
+ },
214
+ {
215
+ "clip_ratio": 0.0,
216
+ "completion_length": 702.0494995117188,
217
+ "epoch": 1.2899786780383795,
218
+ "grad_norm": 0.15662160515785217,
219
+ "kl": 0.003662109375,
220
+ "learning_rate": 1e-06,
221
+ "loss": 0.0192,
222
+ "reward": 0.5370461963117122,
223
+ "reward_std": 0.23422690220177173,
224
+ "rewards/cosine_scaled_reward": 0.5370461963117122,
225
+ "step": 75
226
+ },
227
+ {
228
+ "clip_ratio": 0.0,
229
+ "completion_length": 675.3036636352539,
230
+ "epoch": 1.375266524520256,
231
+ "grad_norm": 0.14926958084106445,
232
+ "kl": 0.004243087768554687,
233
+ "learning_rate": 1e-06,
234
+ "loss": 0.0059,
235
+ "reward": 0.5500567473471165,
236
+ "reward_std": 0.2124465636909008,
237
+ "rewards/cosine_scaled_reward": 0.5500567473471165,
238
+ "step": 80
239
+ },
240
+ {
241
+ "clip_ratio": 0.0,
242
+ "completion_length": 690.0687698364258,
243
+ "epoch": 1.4605543710021323,
244
+ "grad_norm": 0.12932759523391724,
245
+ "kl": 0.004961395263671875,
246
+ "learning_rate": 1e-06,
247
+ "loss": 0.0239,
248
+ "reward": 0.5469569325447082,
249
+ "reward_std": 0.22115669399499893,
250
+ "rewards/cosine_scaled_reward": 0.5469569325447082,
251
+ "step": 85
252
+ },
253
+ {
254
+ "clip_ratio": 0.0,
255
+ "completion_length": 704.1914291381836,
256
+ "epoch": 1.5458422174840085,
257
+ "grad_norm": 0.27263641357421875,
258
+ "kl": 0.011516571044921875,
259
+ "learning_rate": 1e-06,
260
+ "loss": 0.0134,
261
+ "reward": 0.540603245049715,
262
+ "reward_std": 0.220616265386343,
263
+ "rewards/cosine_scaled_reward": 0.540603245049715,
264
+ "step": 90
265
+ },
266
+ {
267
+ "clip_ratio": 0.0,
268
+ "completion_length": 705.2396011352539,
269
+ "epoch": 1.6311300639658848,
270
+ "grad_norm": 0.1261100471019745,
271
+ "kl": 0.0060760498046875,
272
+ "learning_rate": 1e-06,
273
+ "loss": 0.0069,
274
+ "reward": 0.5562940575182438,
275
+ "reward_std": 0.22738375030457975,
276
+ "rewards/cosine_scaled_reward": 0.5562940575182438,
277
+ "step": 95
278
+ },
279
+ {
280
+ "clip_ratio": 0.0,
281
+ "completion_length": 708.0935134887695,
282
+ "epoch": 1.716417910447761,
283
+ "grad_norm": 0.20003671944141388,
284
+ "kl": 0.0080291748046875,
285
+ "learning_rate": 1e-06,
286
+ "loss": 0.01,
287
+ "reward": 0.5645768508315087,
288
+ "reward_std": 0.22589275762438774,
289
+ "rewards/cosine_scaled_reward": 0.5645768508315087,
290
+ "step": 100
291
+ },
292
+ {
293
+ "clip_ratio": 0.0,
294
+ "completion_length": 730.0916885375976,
295
+ "epoch": 1.8017057569296375,
296
+ "grad_norm": 0.14709459245204926,
297
+ "kl": 0.0078338623046875,
298
+ "learning_rate": 1e-06,
299
+ "loss": 0.0176,
300
+ "reward": 0.5447159253060818,
301
+ "reward_std": 0.2277662731707096,
302
+ "rewards/cosine_scaled_reward": 0.5447159253060818,
303
+ "step": 105
304
+ },
305
+ {
306
+ "clip_ratio": 0.0,
307
+ "completion_length": 733.7476760864258,
308
+ "epoch": 1.886993603411514,
309
+ "grad_norm": 0.14343297481536865,
310
+ "kl": 0.009282684326171875,
311
+ "learning_rate": 1e-06,
312
+ "loss": 0.0221,
313
+ "reward": 0.5597066521644593,
314
+ "reward_std": 0.23097761012613774,
315
+ "rewards/cosine_scaled_reward": 0.5597066521644593,
316
+ "step": 110
317
+ },
318
+ {
319
+ "clip_ratio": 0.0,
320
+ "completion_length": 745.3560104370117,
321
+ "epoch": 1.9722814498933903,
322
+ "grad_norm": 0.21631674468517303,
323
+ "kl": 0.011328125,
324
+ "learning_rate": 1e-06,
325
+ "loss": 0.0179,
326
+ "reward": 0.5601713679730892,
327
+ "reward_std": 0.2243567120283842,
328
+ "rewards/cosine_scaled_reward": 0.5601713679730892,
329
+ "step": 115
330
+ },
331
+ {
332
+ "epoch": 2.068230277185501,
333
+ "grad_norm": 0.22386282682418823,
334
+ "learning_rate": 1e-06,
335
+ "loss": 0.0123,
336
+ "step": 120
337
+ },
338
+ {
339
+ "epoch": 2.068230277185501,
340
+ "eval_clip_ratio": 0.0,
341
+ "eval_completion_length": 754.1520328521729,
342
+ "eval_kl": 0.016979217529296875,
343
+ "eval_loss": 0.012520050629973412,
344
+ "eval_reward": 0.4722373131662607,
345
+ "eval_reward_std": 0.2587718339636922,
346
+ "eval_rewards/cosine_scaled_reward": 0.4722373131662607,
347
+ "eval_runtime": 724.6498,
348
+ "eval_samples_per_second": 0.69,
349
+ "eval_steps_per_second": 0.008,
350
+ "step": 120
351
+ },
352
+ {
353
+ "clip_ratio": 0.0,
354
+ "completion_length": 749.0557495117188,
355
+ "epoch": 2.1535181236673773,
356
+ "grad_norm": 0.17756131291389465,
357
+ "kl": 0.0161590576171875,
358
+ "learning_rate": 1e-06,
359
+ "loss": 0.0042,
360
+ "reward": 0.5662507023662329,
361
+ "reward_std": 0.22460255604237317,
362
+ "rewards/cosine_scaled_reward": 0.5662507023662329,
363
+ "step": 125
364
+ },
365
+ {
366
+ "clip_ratio": 0.0,
367
+ "completion_length": 757.4320526123047,
368
+ "epoch": 2.2388059701492535,
369
+ "grad_norm": 0.24769122898578644,
370
+ "kl": 0.02352294921875,
371
+ "learning_rate": 1e-06,
372
+ "loss": 0.0179,
373
+ "reward": 0.5909504756331444,
374
+ "reward_std": 0.22746318429708481,
375
+ "rewards/cosine_scaled_reward": 0.5909504756331444,
376
+ "step": 130
377
+ },
378
+ {
379
+ "clip_ratio": 0.0,
380
+ "completion_length": 778.7044509887695,
381
+ "epoch": 2.3240938166311302,
382
+ "grad_norm": 0.25195103883743286,
383
+ "kl": 0.02833251953125,
384
+ "learning_rate": 1e-06,
385
+ "loss": 0.0091,
386
+ "reward": 0.5315394312143326,
387
+ "reward_std": 0.23098385594785215,
388
+ "rewards/cosine_scaled_reward": 0.5315394312143326,
389
+ "step": 135
390
+ },
391
+ {
392
+ "clip_ratio": 0.0,
393
+ "completion_length": 780.8755416870117,
394
+ "epoch": 2.4093816631130065,
395
+ "grad_norm": 0.3621465265750885,
396
+ "kl": 0.0350982666015625,
397
+ "learning_rate": 1e-06,
398
+ "loss": 0.0015,
399
+ "reward": 0.5728602990508079,
400
+ "reward_std": 0.24233178310096265,
401
+ "rewards/cosine_scaled_reward": 0.5728602990508079,
402
+ "step": 140
403
+ },
404
+ {
405
+ "clip_ratio": 0.0,
406
+ "completion_length": 777.0599136352539,
407
+ "epoch": 2.4946695095948828,
408
+ "grad_norm": 0.2630998492240906,
409
+ "kl": 0.053851318359375,
410
+ "learning_rate": 1e-06,
411
+ "loss": 0.0229,
412
+ "reward": 0.5749510392546654,
413
+ "reward_std": 0.25164939016103743,
414
+ "rewards/cosine_scaled_reward": 0.5749510392546654,
415
+ "step": 145
416
+ },
417
+ {
418
+ "clip_ratio": 0.0,
419
+ "completion_length": 806.2307495117187,
420
+ "epoch": 2.579957356076759,
421
+ "grad_norm": 0.7152215242385864,
422
+ "kl": 0.075592041015625,
423
+ "learning_rate": 1e-06,
424
+ "loss": 0.0256,
425
+ "reward": 0.5316810458898544,
426
+ "reward_std": 0.24946709722280502,
427
+ "rewards/cosine_scaled_reward": 0.5316810458898544,
428
+ "step": 150
429
+ },
430
+ {
431
+ "clip_ratio": 0.0,
432
+ "completion_length": 807.8872589111328,
433
+ "epoch": 2.6652452025586353,
434
+ "grad_norm": 0.9139208197593689,
435
+ "kl": 0.1150634765625,
436
+ "learning_rate": 1e-06,
437
+ "loss": 0.0312,
438
+ "reward": 0.4877690590918064,
439
+ "reward_std": 0.28349833004176617,
440
+ "rewards/cosine_scaled_reward": 0.4877690590918064,
441
+ "step": 155
442
+ },
443
+ {
444
+ "clip_ratio": 0.0,
445
+ "completion_length": 926.7297088623047,
446
+ "epoch": 2.750533049040512,
447
+ "grad_norm": 1.1683220863342285,
448
+ "kl": 0.2032470703125,
449
+ "learning_rate": 1e-06,
450
+ "loss": 0.0749,
451
+ "reward": 0.2821820305660367,
452
+ "reward_std": 0.3167072061449289,
453
+ "rewards/cosine_scaled_reward": 0.2821820305660367,
454
+ "step": 160
455
+ },
456
+ {
457
+ "clip_ratio": 0.0,
458
+ "completion_length": 932.1104385375977,
459
+ "epoch": 2.835820895522388,
460
+ "grad_norm": 6.447605609893799,
461
+ "kl": 0.3660888671875,
462
+ "learning_rate": 1e-06,
463
+ "loss": 0.0324,
464
+ "reward": 0.02865399098955095,
465
+ "reward_std": 0.30335349403321743,
466
+ "rewards/cosine_scaled_reward": 0.02865399098955095,
467
+ "step": 165
468
+ },
469
+ {
470
+ "clip_ratio": 0.0,
471
+ "completion_length": 664.3466369628907,
472
+ "epoch": 2.9211087420042645,
473
+ "grad_norm": 26.408769607543945,
474
+ "kl": 0.82470703125,
475
+ "learning_rate": 1e-06,
476
+ "loss": -0.0468,
477
+ "reward": -0.12603640989982523,
478
+ "reward_std": 0.2628266651183367,
479
+ "rewards/cosine_scaled_reward": -0.12603640989982523,
480
+ "step": 170
481
+ },
482
+ {
483
+ "clip_ratio": 0.0,
484
+ "completion_length": 562.9258012771606,
485
+ "epoch": 2.9893390191897655,
486
+ "kl": 0.9171142578125,
487
+ "reward": -0.19254306121729314,
488
+ "reward_std": 0.23406662652269006,
489
+ "rewards/cosine_scaled_reward": -0.19254306121729314,
490
+ "step": 174,
491
+ "total_flos": 0.0,
492
+ "train_loss": 0.03207497191133684,
493
+ "train_runtime": 41076.4058,
494
+ "train_samples_per_second": 0.548,
495
+ "train_steps_per_second": 0.004
496
+ }
497
+ ],
498
+ "logging_steps": 5,
499
+ "max_steps": 174,
500
+ "num_input_tokens_seen": 0,
501
+ "num_train_epochs": 3,
502
+ "save_steps": 500,
503
+ "stateful_callbacks": {
504
+ "TrainerControl": {
505
+ "args": {
506
+ "should_epoch_stop": false,
507
+ "should_evaluate": false,
508
+ "should_log": false,
509
+ "should_save": true,
510
+ "should_training_stop": true
511
+ },
512
+ "attributes": {}
513
+ }
514
+ },
515
+ "total_flos": 0.0,
516
+ "train_batch_size": 16,
517
+ "trial_name": null,
518
+ "trial_params": null
519
+ }