NTQuoc committed on
Commit
257567e
·
verified ·
1 Parent(s): d54d0e7

Model save

Browse files
README.md CHANGED
@@ -1,11 +1,9 @@
1
  ---
2
- base_model: Qwen/Qwen3.5-0.8B
3
- datasets: knoveleng/open-rs
4
  library_name: transformers
5
  model_name: OpenRS-GRPO
6
  tags:
7
  - generated_from_trainer
8
- - open-r1
9
  - trl
10
  - grpo
11
  licence: license
@@ -13,7 +11,7 @@ licence: license
13
 
14
  # Model Card for OpenRS-GRPO
15
 
16
- This model is a fine-tuned version of [Qwen/Qwen3.5-0.8B](https://huggingface.co/Qwen/Qwen3.5-0.8B) on the [knoveleng/open-rs](https://huggingface.co/datasets/knoveleng/open-rs) dataset.
17
  It has been trained using [TRL](https://github.com/huggingface/trl).
18
 
19
  ## Quick start
 
1
  ---
2
+ base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
 
3
  library_name: transformers
4
  model_name: OpenRS-GRPO
5
  tags:
6
  - generated_from_trainer
 
7
  - trl
8
  - grpo
9
  licence: license
 
11
 
12
  # Model Card for OpenRS-GRPO
13
 
14
+ This model is a fine-tuned version of [deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B).
15
  It has been trained using [TRL](https://github.com/huggingface/trl).
16
 
17
  ## Quick start
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.010430806130170823,
4
- "train_runtime": 9458.0577,
5
  "train_samples": 7000,
6
- "train_samples_per_second": 0.034,
7
- "train_steps_per_second": 0.002
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": -3.18021047860384e-06,
4
+ "train_runtime": 16301.2017,
5
  "train_samples": 7000,
6
+ "train_samples_per_second": 0.02,
7
+ "train_steps_per_second": 0.001
8
  }
step_metrics.csv CHANGED
@@ -1,22 +1,22 @@
1
  step,epoch,loss,learning_rate,grad_norm,rewards/format_reward,rewards/cosine_scaled_reward,reward,reward_std,gpu_mem_alloc_mb,gpu_mem_peak_mb,step_time_sec
2
- 1,0.0006,-1.2665987014770508e-07,0.0,,0.0,-0.3879377990961075,-0.7758755832910538,0.2861072635278106,1565.2,3646.7,476.39
3
- 2,0.0011,2.2351741790771484e-08,5e-07,,0.0,-0.34302495419979095,-0.6860499083995819,0.3919920399785042,1565.2,3663.9,469.44
4
- 3,0.0017,0.04199279844760895,1e-06,,0.0,-0.39361898601055145,-0.7872379571199417,0.28899660520255566,1565.2,3665.2,464.19
5
- 4,0.0023,0.00015985965728759766,9.931634888554935e-07,,0.0,-0.39139123260974884,-0.7827824652194977,0.30724803544580936,1565.2,3665.5,473.28
6
- 5,0.0029,0.016923315823078156,9.728616793536587e-07,,0.0,-0.3417773097753525,-0.6835546344518661,0.4394468888640404,1565.2,3666.6,475.57
7
- 6,0.0034,0.00023673847317695618,9.397114317029974e-07,,0.0,-0.43544115871191025,-0.8708823472261429,0.1456776731647551,1565.2,3669.9,476.37
8
- 7,0.004,5.0827860832214355e-05,8.9471999940354e-07,,0.0,-0.46910375356674194,-0.9382074922323227,0.08606540504842997,1565.2,3669.9,481.67
9
- 8,0.0046,7.636845111846924e-05,8.392544243589427e-07,,0.0,-0.3953063264489174,-0.7906126528978348,0.18317685835063457,1565.2,3669.9,481.42
10
- 9,0.0051,7.21365213394165e-05,7.75e-07,,0.0,-0.4495018497109413,-0.8990036994218826,0.1606585686095059,1565.2,3669.9,479.75
11
- 10,0.0057,0.00016715750098228455,7.039090644965509e-07,,0.0,-0.4300354793667793,-0.860070988535881,0.17052607703953981,1565.2,3669.9,466.26
12
- 11,0.0063,0.00013599544763565063,6.281416799501187e-07,,0.0,-0.4115590825676918,-0.8231181800365448,0.1259385095909238,1565.2,3669.9,472.08
13
- 12,0.0069,5.13419508934021e-05,5.5e-07,,0.0,-0.4723722040653229,-0.9447444081306458,0.08061030774842948,1565.2,3669.9,471.88
14
- 13,0.0074,0.07869705557823181,4.7185832004988133e-07,,0.0,-0.44672856479883194,-0.8934571295976639,0.15774485282599926,1565.2,3669.9,466.09
15
- 14,0.008,0.027497582137584686,3.9609093550344907e-07,,0.0,-0.3920762911438942,-0.7841525673866272,0.2220854666084051,1565.2,3669.9,468.75
16
- 15,0.0086,0.0004888176918029785,3.250000000000001e-07,,0.0,-0.3608057275414467,-0.7216114401817322,0.3453192347660661,1565.2,3669.9,462.49
17
- 16,0.0091,0.00015526264905929565,2.6074557564105724e-07,,0.0,-0.4132692217826843,-0.8265384286642075,0.25778803089633584,1565.2,3669.9,476.11
18
- 17,0.0097,0.02472507953643799,2.0528000059645995e-07,,0.0,-0.4350534752011299,-0.8701069504022598,0.18937412789091468,1565.2,3669.9,469.71
19
- 18,0.0103,0.00023746490478515625,1.6028856829700258e-07,,0.0,-0.41003918647766113,-0.8200783580541611,0.26157089229673147,1565.2,3669.9,472.28
20
- 19,0.0109,0.016675502061843872,1.2713832064634125e-07,,0.0,-0.41168487817049026,-0.8233697563409805,0.20013628248125315,1565.2,3669.9,477.29
21
- 20,0.0114,0.00027292221784591675,1.068365111445064e-07,,0.0,-0.4022079259157181,-0.8044158518314362,0.2423506089253351,1565.2,3669.9,473.18
22
- 20,0.0114,,,,,,,,1565.2,3669.9,477.0
 
1
  step,epoch,loss,learning_rate,grad_norm,rewards/format_reward,rewards/cosine_scaled_reward,reward,reward_std,gpu_mem_alloc_mb,gpu_mem_peak_mb,step_time_sec
2
+ 1,0.0006,2.980232238769531e-07,0.0,,0.0,-0.4581816643476486,-0.9163633286952972,0.07046629022806883,3645.2,6086.5,810.9
3
+ 2,0.0011,4.6566128730773926e-07,5e-07,,0.0,-0.47419849038124084,-0.9483969509601593,0.06443409714847803,3645.2,6119.0,813.84
4
+ 3,0.0017,-6.4373016357421875e-06,1e-06,,0.0,-0.47469519078731537,-0.9493903964757919,0.02546792710199952,3645.2,6120.5,814.88
5
+ 4,0.0023,-6.161630153656006e-06,9.931634888554935e-07,,0.0,-0.4800366908311844,-0.9600733816623688,0.03750546649098396,3645.2,6122.9,815.73
6
+ 5,0.0029,-1.1235475540161133e-05,9.728616793536587e-07,,0.0,-0.4610184580087662,-0.9220369160175323,0.05609214352443814,3645.2,6122.9,815.84
7
+ 6,0.0034,-5.6587159633636475e-06,9.397114317029974e-07,,0.0,-0.43115096539258957,-0.8623019307851791,0.19040754111483693,3645.2,6129.7,815.34
8
+ 7,0.004,-6.24731183052063e-06,8.9471999940354e-07,,0.0,-0.4677419885993004,-0.9354839473962784,0.056630742736160755,3645.2,6129.7,814.51
9
+ 8,0.0046,-5.133450031280518e-06,8.392544243589427e-07,,0.0,-0.4351673647761345,-0.8703347146511078,0.10057847108691931,3645.2,6129.7,815.12
10
+ 9,0.0051,-4.664063453674316e-06,7.75e-07,,0.0,-0.4807252585887909,-0.9614505171775818,0.0432720510289073,3645.2,6129.7,815.01
11
+ 10,0.0057,-2.2277235984802246e-06,7.039090644965509e-07,,0.0,-0.38343894481658936,-0.7668778896331787,0.3067741859704256,3645.2,6129.7,813.54
12
+ 11,0.0063,-1.996755599975586e-06,6.281416799501187e-07,,0.0,-0.2307990826666355,-0.461598165333271,0.3550597131252289,3645.2,6129.7,814.71
13
+ 12,0.0069,-1.2740492820739746e-06,5.5e-07,,0.0,-0.19471427984535694,-0.3894285596907139,0.27436650544404984,3645.2,6129.7,814.77
14
+ 13,0.0074,-2.5406479835510254e-06,4.7185832004988133e-07,,0.0,-0.2408045493066311,-0.4816090911626816,0.3208252266049385,3645.2,6129.7,814.34
15
+ 14,0.008,-1.5497207641601562e-06,3.9609093550344907e-07,,0.0,-0.35031646490097046,-0.7006329447031021,0.3634557966142893,3645.2,6129.7,814.91
16
+ 15,0.0086,-1.7210841178894043e-06,3.250000000000001e-07,,0.0,-0.2049925960600376,-0.40998518466949463,0.3375362530350685,3645.2,6129.7,815.43
17
+ 16,0.0091,-1.9073486328125e-06,2.6074557564105724e-07,,0.0,-0.17782340943813324,-0.3556468114256859,0.38361550495028496,3645.2,6129.7,816.11
18
+ 17,0.0097,-2.1010637283325195e-06,2.0528000059645995e-07,,0.0,-0.28106561303138733,-0.5621312409639359,0.371349073946476,3645.2,6129.7,815.13
19
+ 18,0.0103,-1.7527490854263306e-06,1.6028856829700258e-07,,0.0,-0.25671521946787834,-0.5134304240345955,0.34325383603572845,3645.2,6129.7,815.46
20
+ 19,0.0109,-2.980232238769531e-07,1.2713832064634125e-07,,0.0,-0.20568780414760113,-0.41137560456991196,0.28929552249610424,3645.2,6129.7,816.18
21
+ 20,0.0114,-1.460779458284378e-06,1.068365111445064e-07,,0.0,-0.31505120918154716,-0.6301024332642555,0.33287271670997143,3645.2,6129.7,815.67
22
+ 20,0.0114,,,,,,,,3645.2,6129.7,819.4
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.010430806130170823,
4
- "train_runtime": 9458.0577,
5
  "train_samples": 7000,
6
- "train_samples_per_second": 0.034,
7
- "train_steps_per_second": 0.002
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": -3.18021047860384e-06,
4
+ "train_runtime": 16301.2017,
5
  "train_samples": 7000,
6
+ "train_samples_per_second": 0.02,
7
+ "train_steps_per_second": 0.001
8
  }
trainer_state.json CHANGED
@@ -11,261 +11,261 @@
11
  "log_history": [
12
  {
13
  "clip_ratio": 0.0,
14
- "completion_length": 453.625,
15
  "epoch": 0.0005714285714285715,
16
  "kl": 0.0,
17
  "learning_rate": 0.0,
18
- "loss": -1.2665987014770508e-07,
19
- "reward": -0.7758755832910538,
20
- "reward_std": 0.2861072635278106,
21
- "rewards/cosine_scaled_reward": -0.3879377990961075,
22
  "rewards/format_reward": 0.0,
23
  "step": 1
24
  },
25
  {
26
  "clip_ratio": 0.0,
27
- "completion_length": 512.0,
28
  "epoch": 0.001142857142857143,
29
  "kl": 0.0,
30
  "learning_rate": 5e-07,
31
- "loss": 2.2351741790771484e-08,
32
- "reward": -0.6860499083995819,
33
- "reward_std": 0.3919920399785042,
34
- "rewards/cosine_scaled_reward": -0.34302495419979095,
35
  "rewards/format_reward": 0.0,
36
  "step": 2
37
  },
38
  {
39
  "clip_ratio": 0.0,
40
- "completion_length": 494.9375,
41
  "epoch": 0.0017142857142857142,
42
- "kl": 0.00760650634765625,
43
  "learning_rate": 1e-06,
44
- "loss": 0.04199279844760895,
45
- "reward": -0.7872379571199417,
46
- "reward_std": 0.28899660520255566,
47
- "rewards/cosine_scaled_reward": -0.39361898601055145,
48
  "rewards/format_reward": 0.0,
49
  "step": 3
50
  },
51
  {
52
  "clip_ratio": 0.0,
53
- "completion_length": 484.875,
54
  "epoch": 0.002285714285714286,
55
- "kl": 0.00399017333984375,
56
  "learning_rate": 9.931634888554935e-07,
57
- "loss": 0.00015985965728759766,
58
- "reward": -0.7827824652194977,
59
- "reward_std": 0.30724803544580936,
60
- "rewards/cosine_scaled_reward": -0.39139123260974884,
61
  "rewards/format_reward": 0.0,
62
  "step": 4
63
  },
64
  {
65
  "clip_ratio": 0.0,
66
- "completion_length": 498.125,
67
  "epoch": 0.002857142857142857,
68
- "kl": 0.010009765625,
69
  "learning_rate": 9.728616793536587e-07,
70
- "loss": 0.016923315823078156,
71
- "reward": -0.6835546344518661,
72
- "reward_std": 0.4394468888640404,
73
- "rewards/cosine_scaled_reward": -0.3417773097753525,
74
  "rewards/format_reward": 0.0,
75
  "step": 5
76
  },
77
  {
78
  "clip_ratio": 0.0,
79
- "completion_length": 512.0,
80
  "epoch": 0.0034285714285714284,
81
- "kl": 0.0059185028076171875,
82
  "learning_rate": 9.397114317029974e-07,
83
- "loss": 0.00023673847317695618,
84
- "reward": -0.8708823472261429,
85
- "reward_std": 0.1456776731647551,
86
- "rewards/cosine_scaled_reward": -0.43544115871191025,
87
  "rewards/format_reward": 0.0,
88
  "step": 6
89
  },
90
  {
91
  "clip_ratio": 0.0,
92
- "completion_length": 512.0,
93
  "epoch": 0.004,
94
- "kl": 0.001270294189453125,
95
  "learning_rate": 8.9471999940354e-07,
96
- "loss": 5.0827860832214355e-05,
97
- "reward": -0.9382074922323227,
98
- "reward_std": 0.08606540504842997,
99
- "rewards/cosine_scaled_reward": -0.46910375356674194,
100
  "rewards/format_reward": 0.0,
101
  "step": 7
102
  },
103
  {
104
  "clip_ratio": 0.0,
105
- "completion_length": 480.625,
106
  "epoch": 0.004571428571428572,
107
- "kl": 0.0019054412841796875,
108
  "learning_rate": 8.392544243589427e-07,
109
- "loss": 7.636845111846924e-05,
110
- "reward": -0.7906126528978348,
111
- "reward_std": 0.18317685835063457,
112
- "rewards/cosine_scaled_reward": -0.3953063264489174,
113
  "rewards/format_reward": 0.0,
114
  "step": 8
115
  },
116
  {
117
  "clip_ratio": 0.0,
118
- "completion_length": 512.0,
119
  "epoch": 0.005142857142857143,
120
- "kl": 0.0017995834350585938,
121
  "learning_rate": 7.75e-07,
122
- "loss": 7.21365213394165e-05,
123
- "reward": -0.8990036994218826,
124
- "reward_std": 0.1606585686095059,
125
- "rewards/cosine_scaled_reward": -0.4495018497109413,
126
  "rewards/format_reward": 0.0,
127
  "step": 9
128
  },
129
  {
130
  "clip_ratio": 0.0,
131
- "completion_length": 512.0,
132
  "epoch": 0.005714285714285714,
133
- "kl": 0.0041675567626953125,
134
  "learning_rate": 7.039090644965509e-07,
135
- "loss": 0.00016715750098228455,
136
- "reward": -0.860070988535881,
137
- "reward_std": 0.17052607703953981,
138
- "rewards/cosine_scaled_reward": -0.4300354793667793,
139
  "rewards/format_reward": 0.0,
140
  "step": 10
141
  },
142
  {
143
  "clip_ratio": 0.0,
144
- "completion_length": 477.125,
145
  "epoch": 0.006285714285714286,
146
- "kl": 0.003407001495361328,
147
  "learning_rate": 6.281416799501187e-07,
148
- "loss": 0.00013599544763565063,
149
- "reward": -0.8231181800365448,
150
- "reward_std": 0.1259385095909238,
151
- "rewards/cosine_scaled_reward": -0.4115590825676918,
152
  "rewards/format_reward": 0.0,
153
  "step": 11
154
  },
155
  {
156
  "clip_ratio": 0.0,
157
- "completion_length": 512.0,
158
  "epoch": 0.006857142857142857,
159
- "kl": 0.0012102127075195312,
160
  "learning_rate": 5.5e-07,
161
- "loss": 5.13419508934021e-05,
162
- "reward": -0.9447444081306458,
163
- "reward_std": 0.08061030774842948,
164
- "rewards/cosine_scaled_reward": -0.4723722040653229,
165
  "rewards/format_reward": 0.0,
166
  "step": 12
167
  },
168
  {
169
  "clip_ratio": 0.0,
170
- "completion_length": 487.9375,
171
  "epoch": 0.0074285714285714285,
172
- "kl": 0.00119781494140625,
173
  "learning_rate": 4.7185832004988133e-07,
174
- "loss": 0.07869705557823181,
175
- "reward": -0.8934571295976639,
176
- "reward_std": 0.15774485282599926,
177
- "rewards/cosine_scaled_reward": -0.44672856479883194,
178
  "rewards/format_reward": 0.0,
179
  "step": 13
180
  },
181
  {
182
  "clip_ratio": 0.0,
183
- "completion_length": 466.625,
184
  "epoch": 0.008,
185
- "kl": 0.004004955291748047,
186
  "learning_rate": 3.9609093550344907e-07,
187
- "loss": 0.027497582137584686,
188
- "reward": -0.7841525673866272,
189
- "reward_std": 0.2220854666084051,
190
- "rewards/cosine_scaled_reward": -0.3920762911438942,
191
  "rewards/format_reward": 0.0,
192
  "step": 14
193
  },
194
  {
195
  "clip_ratio": 0.0,
196
- "completion_length": 497.5,
197
  "epoch": 0.008571428571428572,
198
- "kl": 0.012208938598632812,
199
  "learning_rate": 3.250000000000001e-07,
200
- "loss": 0.0004888176918029785,
201
- "reward": -0.7216114401817322,
202
- "reward_std": 0.3453192347660661,
203
- "rewards/cosine_scaled_reward": -0.3608057275414467,
204
  "rewards/format_reward": 0.0,
205
  "step": 15
206
  },
207
  {
208
  "clip_ratio": 0.0,
209
- "completion_length": 499.0,
210
  "epoch": 0.009142857142857144,
211
- "kl": 0.003875732421875,
212
  "learning_rate": 2.6074557564105724e-07,
213
- "loss": 0.00015526264905929565,
214
- "reward": -0.8265384286642075,
215
- "reward_std": 0.25778803089633584,
216
- "rewards/cosine_scaled_reward": -0.4132692217826843,
217
  "rewards/format_reward": 0.0,
218
  "step": 16
219
  },
220
  {
221
  "clip_ratio": 0.0,
222
- "completion_length": 501.5,
223
  "epoch": 0.009714285714285713,
224
- "kl": 0.004252910614013672,
225
  "learning_rate": 2.0528000059645995e-07,
226
- "loss": 0.02472507953643799,
227
- "reward": -0.8701069504022598,
228
- "reward_std": 0.18937412789091468,
229
- "rewards/cosine_scaled_reward": -0.4350534752011299,
230
  "rewards/format_reward": 0.0,
231
  "step": 17
232
  },
233
  {
234
  "clip_ratio": 0.0,
235
- "completion_length": 506.625,
236
  "epoch": 0.010285714285714285,
237
- "kl": 0.005939483642578125,
238
  "learning_rate": 1.6028856829700258e-07,
239
- "loss": 0.00023746490478515625,
240
- "reward": -0.8200783580541611,
241
- "reward_std": 0.26157089229673147,
242
- "rewards/cosine_scaled_reward": -0.41003918647766113,
243
  "rewards/format_reward": 0.0,
244
  "step": 18
245
  },
246
  {
247
  "clip_ratio": 0.0,
248
- "completion_length": 446.25,
249
  "epoch": 0.010857142857142857,
250
- "kl": 0.002822399139404297,
251
  "learning_rate": 1.2713832064634125e-07,
252
- "loss": 0.016675502061843872,
253
- "reward": -0.8233697563409805,
254
- "reward_std": 0.20013628248125315,
255
- "rewards/cosine_scaled_reward": -0.41168487817049026,
256
  "rewards/format_reward": 0.0,
257
  "step": 19
258
  },
259
  {
260
  "clip_ratio": 0.0,
261
- "completion_length": 496.125,
262
  "epoch": 0.011428571428571429,
263
- "kl": 0.00678253173828125,
264
  "learning_rate": 1.068365111445064e-07,
265
- "loss": 0.00027292221784591675,
266
- "reward": -0.8044158518314362,
267
- "reward_std": 0.2423506089253351,
268
- "rewards/cosine_scaled_reward": -0.4022079259157181,
269
  "rewards/format_reward": 0.0,
270
  "step": 20
271
  },
@@ -273,10 +273,10 @@
273
  "epoch": 0.011428571428571429,
274
  "step": 20,
275
  "total_flos": 0.0,
276
- "train_loss": 0.010430806130170823,
277
- "train_runtime": 9458.0577,
278
- "train_samples_per_second": 0.034,
279
- "train_steps_per_second": 0.002
280
  }
281
  ],
282
  "logging_steps": 1,
 
11
  "log_history": [
12
  {
13
  "clip_ratio": 0.0,
14
+ "completion_length": 1024.0,
15
  "epoch": 0.0005714285714285715,
16
  "kl": 0.0,
17
  "learning_rate": 0.0,
18
+ "loss": 2.980232238769531e-07,
19
+ "reward": -0.9163633286952972,
20
+ "reward_std": 0.07046629022806883,
21
+ "rewards/cosine_scaled_reward": -0.4581816643476486,
22
  "rewards/format_reward": 0.0,
23
  "step": 1
24
  },
25
  {
26
  "clip_ratio": 0.0,
27
+ "completion_length": 1024.0,
28
  "epoch": 0.001142857142857143,
29
  "kl": 0.0,
30
  "learning_rate": 5e-07,
31
+ "loss": 4.6566128730773926e-07,
32
+ "reward": -0.9483969509601593,
33
+ "reward_std": 0.06443409714847803,
34
+ "rewards/cosine_scaled_reward": -0.47419849038124084,
35
  "rewards/format_reward": 0.0,
36
  "step": 2
37
  },
38
  {
39
  "clip_ratio": 0.0,
40
+ "completion_length": 1024.0,
41
  "epoch": 0.0017142857142857142,
42
+ "kl": -0.00019288063049316406,
43
  "learning_rate": 1e-06,
44
+ "loss": -6.4373016357421875e-06,
45
+ "reward": -0.9493903964757919,
46
+ "reward_std": 0.02546792710199952,
47
+ "rewards/cosine_scaled_reward": -0.47469519078731537,
48
  "rewards/format_reward": 0.0,
49
  "step": 3
50
  },
51
  {
52
  "clip_ratio": 0.0,
53
+ "completion_length": 1024.0,
54
  "epoch": 0.002285714285714286,
55
+ "kl": -0.00016021728515625,
56
  "learning_rate": 9.931634888554935e-07,
57
+ "loss": -6.161630153656006e-06,
58
+ "reward": -0.9600733816623688,
59
+ "reward_std": 0.03750546649098396,
60
+ "rewards/cosine_scaled_reward": -0.4800366908311844,
61
  "rewards/format_reward": 0.0,
62
  "step": 4
63
  },
64
  {
65
  "clip_ratio": 0.0,
66
+ "completion_length": 1024.0,
67
  "epoch": 0.002857142857142857,
68
+ "kl": -0.00026488304138183594,
69
  "learning_rate": 9.728616793536587e-07,
70
+ "loss": -1.1235475540161133e-05,
71
+ "reward": -0.9220369160175323,
72
+ "reward_std": 0.05609214352443814,
73
+ "rewards/cosine_scaled_reward": -0.4610184580087662,
74
  "rewards/format_reward": 0.0,
75
  "step": 5
76
  },
77
  {
78
  "clip_ratio": 0.0,
79
+ "completion_length": 1024.0,
80
  "epoch": 0.0034285714285714284,
81
+ "kl": -0.000133514404296875,
82
  "learning_rate": 9.397114317029974e-07,
83
+ "loss": -5.6587159633636475e-06,
84
+ "reward": -0.8623019307851791,
85
+ "reward_std": 0.19040754111483693,
86
+ "rewards/cosine_scaled_reward": -0.43115096539258957,
87
  "rewards/format_reward": 0.0,
88
  "step": 6
89
  },
90
  {
91
  "clip_ratio": 0.0,
92
+ "completion_length": 1024.0,
93
  "epoch": 0.004,
94
+ "kl": -0.00017333030700683594,
95
  "learning_rate": 8.9471999940354e-07,
96
+ "loss": -6.24731183052063e-06,
97
+ "reward": -0.9354839473962784,
98
+ "reward_std": 0.056630742736160755,
99
+ "rewards/cosine_scaled_reward": -0.4677419885993004,
100
  "rewards/format_reward": 0.0,
101
  "step": 7
102
  },
103
  {
104
  "clip_ratio": 0.0,
105
+ "completion_length": 1024.0,
106
  "epoch": 0.004571428571428572,
107
+ "kl": -0.00012612342834472656,
108
  "learning_rate": 8.392544243589427e-07,
109
+ "loss": -5.133450031280518e-06,
110
+ "reward": -0.8703347146511078,
111
+ "reward_std": 0.10057847108691931,
112
+ "rewards/cosine_scaled_reward": -0.4351673647761345,
113
  "rewards/format_reward": 0.0,
114
  "step": 8
115
  },
116
  {
117
  "clip_ratio": 0.0,
118
+ "completion_length": 1024.0,
119
  "epoch": 0.005142857142857143,
120
+ "kl": -9.72747802734375e-05,
121
  "learning_rate": 7.75e-07,
122
+ "loss": -4.664063453674316e-06,
123
+ "reward": -0.9614505171775818,
124
+ "reward_std": 0.0432720510289073,
125
+ "rewards/cosine_scaled_reward": -0.4807252585887909,
126
  "rewards/format_reward": 0.0,
127
  "step": 9
128
  },
129
  {
130
  "clip_ratio": 0.0,
131
+ "completion_length": 1024.0,
132
  "epoch": 0.005714285714285714,
133
+ "kl": -5.555152893066406e-05,
134
  "learning_rate": 7.039090644965509e-07,
135
+ "loss": -2.2277235984802246e-06,
136
+ "reward": -0.7668778896331787,
137
+ "reward_std": 0.3067741859704256,
138
+ "rewards/cosine_scaled_reward": -0.38343894481658936,
139
  "rewards/format_reward": 0.0,
140
  "step": 10
141
  },
142
  {
143
  "clip_ratio": 0.0,
144
+ "completion_length": 1024.0,
145
  "epoch": 0.006285714285714286,
146
+ "kl": -4.9114227294921875e-05,
147
  "learning_rate": 6.281416799501187e-07,
148
+ "loss": -1.996755599975586e-06,
149
+ "reward": -0.461598165333271,
150
+ "reward_std": 0.3550597131252289,
151
+ "rewards/cosine_scaled_reward": -0.2307990826666355,
152
  "rewards/format_reward": 0.0,
153
  "step": 11
154
  },
155
  {
156
  "clip_ratio": 0.0,
157
+ "completion_length": 1024.0,
158
  "epoch": 0.006857142857142857,
159
+ "kl": -3.147125244140625e-05,
160
  "learning_rate": 5.5e-07,
161
+ "loss": -1.2740492820739746e-06,
162
+ "reward": -0.3894285596907139,
163
+ "reward_std": 0.27436650544404984,
164
+ "rewards/cosine_scaled_reward": -0.19471427984535694,
165
  "rewards/format_reward": 0.0,
166
  "step": 12
167
  },
168
  {
169
  "clip_ratio": 0.0,
170
+ "completion_length": 1024.0,
171
  "epoch": 0.0074285714285714285,
172
+ "kl": -6.198883056640625e-05,
173
  "learning_rate": 4.7185832004988133e-07,
174
+ "loss": -2.5406479835510254e-06,
175
+ "reward": -0.4816090911626816,
176
+ "reward_std": 0.3208252266049385,
177
+ "rewards/cosine_scaled_reward": -0.2408045493066311,
178
  "rewards/format_reward": 0.0,
179
  "step": 13
180
  },
181
  {
182
  "clip_ratio": 0.0,
183
+ "completion_length": 1024.0,
184
  "epoch": 0.008,
185
+ "kl": -4.029273986816406e-05,
186
  "learning_rate": 3.9609093550344907e-07,
187
+ "loss": -1.5497207641601562e-06,
188
+ "reward": -0.7006329447031021,
189
+ "reward_std": 0.3634557966142893,
190
+ "rewards/cosine_scaled_reward": -0.35031646490097046,
191
  "rewards/format_reward": 0.0,
192
  "step": 14
193
  },
194
  {
195
  "clip_ratio": 0.0,
196
+ "completion_length": 1024.0,
197
  "epoch": 0.008571428571428572,
198
+ "kl": -4.410743713378906e-05,
199
  "learning_rate": 3.250000000000001e-07,
200
+ "loss": -1.7210841178894043e-06,
201
+ "reward": -0.40998518466949463,
202
+ "reward_std": 0.3375362530350685,
203
+ "rewards/cosine_scaled_reward": -0.2049925960600376,
204
  "rewards/format_reward": 0.0,
205
  "step": 15
206
  },
207
  {
208
  "clip_ratio": 0.0,
209
+ "completion_length": 1024.0,
210
  "epoch": 0.009142857142857144,
211
+ "kl": -4.8160552978515625e-05,
212
  "learning_rate": 2.6074557564105724e-07,
213
+ "loss": -1.9073486328125e-06,
214
+ "reward": -0.3556468114256859,
215
+ "reward_std": 0.38361550495028496,
216
+ "rewards/cosine_scaled_reward": -0.17782340943813324,
217
  "rewards/format_reward": 0.0,
218
  "step": 16
219
  },
220
  {
221
  "clip_ratio": 0.0,
222
+ "completion_length": 1024.0,
223
  "epoch": 0.009714285714285713,
224
+ "kl": -5.2928924560546875e-05,
225
  "learning_rate": 2.0528000059645995e-07,
226
+ "loss": -2.1010637283325195e-06,
227
+ "reward": -0.5621312409639359,
228
+ "reward_std": 0.371349073946476,
229
+ "rewards/cosine_scaled_reward": -0.28106561303138733,
230
  "rewards/format_reward": 0.0,
231
  "step": 17
232
  },
233
  {
234
  "clip_ratio": 0.0,
235
+ "completion_length": 1024.0,
236
  "epoch": 0.010285714285714285,
237
+ "kl": -4.3392181396484375e-05,
238
  "learning_rate": 1.6028856829700258e-07,
239
+ "loss": -1.7527490854263306e-06,
240
+ "reward": -0.5134304240345955,
241
+ "reward_std": 0.34325383603572845,
242
+ "rewards/cosine_scaled_reward": -0.25671521946787834,
243
  "rewards/format_reward": 0.0,
244
  "step": 18
245
  },
246
  {
247
  "clip_ratio": 0.0,
248
+ "completion_length": 1024.0,
249
  "epoch": 0.010857142857142857,
250
+ "kl": -7.867813110351562e-06,
251
  "learning_rate": 1.2713832064634125e-07,
252
+ "loss": -2.980232238769531e-07,
253
+ "reward": -0.41137560456991196,
254
+ "reward_std": 0.28929552249610424,
255
+ "rewards/cosine_scaled_reward": -0.20568780414760113,
256
  "rewards/format_reward": 0.0,
257
  "step": 19
258
  },
259
  {
260
  "clip_ratio": 0.0,
261
+ "completion_length": 1024.0,
262
  "epoch": 0.011428571428571429,
263
+ "kl": -3.4809112548828125e-05,
264
  "learning_rate": 1.068365111445064e-07,
265
+ "loss": -1.460779458284378e-06,
266
+ "reward": -0.6301024332642555,
267
+ "reward_std": 0.33287271670997143,
268
+ "rewards/cosine_scaled_reward": -0.31505120918154716,
269
  "rewards/format_reward": 0.0,
270
  "step": 20
271
  },
 
273
  "epoch": 0.011428571428571429,
274
  "step": 20,
275
  "total_flos": 0.0,
276
+ "train_loss": -3.18021047860384e-06,
277
+ "train_runtime": 16301.2017,
278
+ "train_samples_per_second": 0.02,
279
+ "train_steps_per_second": 0.001
280
  }
281
  ],
282
  "logging_steps": 1,
training_metrics.txt CHANGED
@@ -1,6 +1,6 @@
1
- total_size_before (MB): 1455.72
2
- total_size_after (MB): 1445.40
3
- total_time (seconds): 9466.33
4
- ram_peak (MB): 3499.90
5
- ram_consump (MB): 1492.71
6
- disk_storage (MB): 616.56
 
1
+ total_size_before (MB): 3424.75
2
+ total_size_after (MB): 3407.14
3
+ total_time (seconds): 16314.67
4
+ ram_peak (MB): 5845.76
5
+ ram_consump (MB): 3476.35
6
+ disk_storage (MB): 180.81