chchen commited on
Commit
b4d414b
·
verified ·
1 Parent(s): 9b7b739

End of training

Browse files
README.md CHANGED
@@ -3,9 +3,10 @@ library_name: peft
3
  license: llama3.1
4
  base_model: meta-llama/Meta-Llama-3.1-8B-Instruct
5
  tags:
 
 
6
  - trl
7
  - kto
8
- - llama-factory
9
  - generated_from_trainer
10
  model-index:
11
  - name: Llama-3.1-8B-Instruct-KTO-400
@@ -17,7 +18,7 @@ should probably proofread and complete it, then remove this comment. -->
17
 
18
  # Llama-3.1-8B-Instruct-KTO-400
19
 
20
- This model is a fine-tuned version of [meta-llama/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct) on an unknown dataset.
21
  It achieves the following results on the evaluation set:
22
  - Loss: 0.2541
23
  - Rewards/chosen: 0.0309
 
3
  license: llama3.1
4
  base_model: meta-llama/Meta-Llama-3.1-8B-Instruct
5
  tags:
6
+ - llama-factory
7
+ - lora
8
  - trl
9
  - kto
 
10
  - generated_from_trainer
11
  model-index:
12
  - name: Llama-3.1-8B-Instruct-KTO-400
 
18
 
19
  # Llama-3.1-8B-Instruct-KTO-400
20
 
21
+ This model is a fine-tuned version of [meta-llama/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct) on the bct_non_cot_kto_400 dataset.
22
  It achieves the following results on the evaluation set:
23
  - Loss: 0.2541
24
  - Rewards/chosen: 0.0309
all_results.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 10.0,
3
+ "eval_logits/chosen": -5221032.228571429,
4
+ "eval_logits/rejected": -5284203.377777778,
5
+ "eval_logps/chosen": -16.849844796316965,
6
+ "eval_logps/rejected": -62.744411892361114,
7
+ "eval_loss": 0.2541462779045105,
8
+ "eval_rewards/chosen": 0.030902576446533204,
9
+ "eval_rewards/margins": 4.341355186038547,
10
+ "eval_rewards/rejected": -4.310452609592014,
11
+ "eval_runtime": 17.1784,
12
+ "eval_samples_per_second": 4.657,
13
+ "eval_steps_per_second": 2.329,
14
+ "kl": 0.0,
15
+ "total_flos": 3.939427351343923e+16,
16
+ "train_loss": 0.30883768532011246,
17
+ "train_runtime": 2497.4829,
18
+ "train_samples_per_second": 2.883,
19
+ "train_steps_per_second": 0.18
20
+ }
eval_results.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 10.0,
3
+ "eval_logits/chosen": -5221032.228571429,
4
+ "eval_logits/rejected": -5284203.377777778,
5
+ "eval_logps/chosen": -16.849844796316965,
6
+ "eval_logps/rejected": -62.744411892361114,
7
+ "eval_loss": 0.2541462779045105,
8
+ "eval_rewards/chosen": 0.030902576446533204,
9
+ "eval_rewards/margins": 4.341355186038547,
10
+ "eval_rewards/rejected": -4.310452609592014,
11
+ "eval_runtime": 17.1784,
12
+ "eval_samples_per_second": 4.657,
13
+ "eval_steps_per_second": 2.329,
14
+ "kl": 0.0
15
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 10.0,
3
+ "total_flos": 3.939427351343923e+16,
4
+ "train_loss": 0.30883768532011246,
5
+ "train_runtime": 2497.4829,
6
+ "train_samples_per_second": 2.883,
7
+ "train_steps_per_second": 0.18
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,861 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.2541462779045105,
3
+ "best_model_checkpoint": "saves/sycophancy/Llama-8B-3.1-Instruct/kto-400/checkpoint-450",
4
+ "epoch": 10.0,
5
+ "eval_steps": 50,
6
+ "global_step": 450,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.2222222222222222,
13
+ "grad_norm": 0.8755188584327698,
14
+ "kl": 3.4999122619628906,
15
+ "learning_rate": 1.111111111111111e-06,
16
+ "logits/chosen": -6424420.894117647,
17
+ "logits/rejected": -5984302.08,
18
+ "logps/chosen": -17.799721392463237,
19
+ "logps/rejected": -18.852799479166666,
20
+ "loss": 0.5001,
21
+ "rewards/chosen": -0.0005647187723832972,
22
+ "rewards/margins": 0.00044351912596646476,
23
+ "rewards/rejected": -0.001008237898349762,
24
+ "step": 10
25
+ },
26
+ {
27
+ "epoch": 0.4444444444444444,
28
+ "grad_norm": 0.924457311630249,
29
+ "kl": 5.80185604095459,
30
+ "learning_rate": 2.222222222222222e-06,
31
+ "logits/chosen": -6300276.363636363,
32
+ "logits/rejected": -8026647.111111111,
33
+ "logps/chosen": -17.250682484019887,
34
+ "logps/rejected": -21.154300265842014,
35
+ "loss": 0.501,
36
+ "rewards/chosen": -0.004935111511837353,
37
+ "rewards/margins": -0.007736933182435807,
38
+ "rewards/rejected": 0.002801821670598454,
39
+ "step": 20
40
+ },
41
+ {
42
+ "epoch": 0.6666666666666666,
43
+ "grad_norm": 0.6622861623764038,
44
+ "kl": 5.680029392242432,
45
+ "learning_rate": 3.3333333333333333e-06,
46
+ "logits/chosen": -6518788.860759494,
47
+ "logits/rejected": -7602240.790123457,
48
+ "logps/chosen": -17.539696029469937,
49
+ "logps/rejected": -20.295280550733025,
50
+ "loss": 0.5001,
51
+ "rewards/chosen": 0.004383341798299476,
52
+ "rewards/margins": 0.0011355700642266377,
53
+ "rewards/rejected": 0.003247771734072838,
54
+ "step": 30
55
+ },
56
+ {
57
+ "epoch": 0.8888888888888888,
58
+ "grad_norm": 0.7431591153144836,
59
+ "kl": 6.487192630767822,
60
+ "learning_rate": 4.444444444444444e-06,
61
+ "logits/chosen": -5974229.333333333,
62
+ "logits/rejected": -7606852.363636363,
63
+ "logps/chosen": -16.686967637803818,
64
+ "logps/rejected": -18.974927035245027,
65
+ "loss": 0.4992,
66
+ "rewards/chosen": 0.0100140952401691,
67
+ "rewards/margins": 0.00425971449926646,
68
+ "rewards/rejected": 0.00575438074090264,
69
+ "step": 40
70
+ },
71
+ {
72
+ "epoch": 1.1111111111111112,
73
+ "grad_norm": 0.7480154633522034,
74
+ "kl": 5.691668510437012,
75
+ "learning_rate": 4.998119881260576e-06,
76
+ "logits/chosen": -5965018.285714285,
77
+ "logits/rejected": -7548874.947368421,
78
+ "logps/chosen": -17.54410371326265,
79
+ "logps/rejected": -21.369326942845394,
80
+ "loss": 0.4989,
81
+ "rewards/chosen": 0.018393808887118383,
82
+ "rewards/margins": 0.005255397810673055,
83
+ "rewards/rejected": 0.013138411076445328,
84
+ "step": 50
85
+ },
86
+ {
87
+ "epoch": 1.1111111111111112,
88
+ "eval_logits/chosen": -6615963.428571428,
89
+ "eval_logits/rejected": -7226443.377777778,
90
+ "eval_logps/chosen": -16.866085379464284,
91
+ "eval_logps/rejected": -19.409551323784722,
92
+ "eval_loss": 0.4995289444923401,
93
+ "eval_rewards/chosen": 0.02927844183785575,
94
+ "eval_rewards/margins": 0.006244459984794497,
95
+ "eval_rewards/rejected": 0.023033981853061252,
96
+ "eval_runtime": 17.4874,
97
+ "eval_samples_per_second": 4.575,
98
+ "eval_steps_per_second": 2.287,
99
+ "kl": 3.9760074615478516,
100
+ "step": 50
101
+ },
102
+ {
103
+ "epoch": 1.3333333333333333,
104
+ "grad_norm": 0.7908056378364563,
105
+ "kl": 6.587210178375244,
106
+ "learning_rate": 4.983095894354858e-06,
107
+ "logits/chosen": -6344583.441860465,
108
+ "logits/rejected": -7122381.405405405,
109
+ "logps/chosen": -16.889287904251454,
110
+ "logps/rejected": -20.947681323902028,
111
+ "loss": 0.4962,
112
+ "rewards/chosen": 0.045913998470749964,
113
+ "rewards/margins": 0.025308867695195206,
114
+ "rewards/rejected": 0.02060513077555476,
115
+ "step": 60
116
+ },
117
+ {
118
+ "epoch": 1.5555555555555556,
119
+ "grad_norm": 0.8089656233787537,
120
+ "kl": 12.094493865966797,
121
+ "learning_rate": 4.953138276568462e-06,
122
+ "logits/chosen": -5830952.205128205,
123
+ "logits/rejected": -7457981.658536585,
124
+ "logps/chosen": -17.15894493689904,
125
+ "logps/rejected": -19.85860089557927,
126
+ "loss": 0.4933,
127
+ "rewards/chosen": 0.09406566619873047,
128
+ "rewards/margins": 0.053329967870944885,
129
+ "rewards/rejected": 0.040735698327785584,
130
+ "step": 70
131
+ },
132
+ {
133
+ "epoch": 1.7777777777777777,
134
+ "grad_norm": 1.1325657367706299,
135
+ "kl": 13.553695678710938,
136
+ "learning_rate": 4.908427196539701e-06,
137
+ "logits/chosen": -5969041.090909091,
138
+ "logits/rejected": -8059607.111111111,
139
+ "logps/chosen": -15.58839277787642,
140
+ "logps/rejected": -18.098347981770832,
141
+ "loss": 0.484,
142
+ "rewards/chosen": 0.14953918890519577,
143
+ "rewards/margins": 0.10617925031016573,
144
+ "rewards/rejected": 0.04335993859503004,
145
+ "step": 80
146
+ },
147
+ {
148
+ "epoch": 2.0,
149
+ "grad_norm": 1.212666630744934,
150
+ "kl": 19.860130310058594,
151
+ "learning_rate": 4.849231551964771e-06,
152
+ "logits/chosen": -6533466.514285714,
153
+ "logits/rejected": -6697638.4,
154
+ "logps/chosen": -13.433639090401785,
155
+ "logps/rejected": -18.823931206597223,
156
+ "loss": 0.4773,
157
+ "rewards/chosen": 0.25568812234061106,
158
+ "rewards/margins": 0.21222362593998986,
159
+ "rewards/rejected": 0.0434644964006212,
160
+ "step": 90
161
+ },
162
+ {
163
+ "epoch": 2.2222222222222223,
164
+ "grad_norm": 1.1951372623443604,
165
+ "kl": 18.933120727539062,
166
+ "learning_rate": 4.775907352415367e-06,
167
+ "logits/chosen": -5909972.053333334,
168
+ "logits/rejected": -7999505.3176470585,
169
+ "logps/chosen": -13.5289892578125,
170
+ "logps/rejected": -18.93715676700368,
171
+ "loss": 0.4593,
172
+ "rewards/chosen": 0.3267870839436849,
173
+ "rewards/margins": 0.3485112210816028,
174
+ "rewards/rejected": -0.02172413713791791,
175
+ "step": 100
176
+ },
177
+ {
178
+ "epoch": 2.2222222222222223,
179
+ "eval_logits/chosen": -6310909.257142857,
180
+ "eval_logits/rejected": -7175433.955555555,
181
+ "eval_logps/chosen": -14.033848353794642,
182
+ "eval_logps/rejected": -19.490941026475696,
183
+ "eval_loss": 0.46696051955223083,
184
+ "eval_rewards/chosen": 0.3125021525791713,
185
+ "eval_rewards/margins": 0.29760706311180474,
186
+ "eval_rewards/rejected": 0.014895089467366536,
187
+ "eval_runtime": 17.481,
188
+ "eval_samples_per_second": 4.576,
189
+ "eval_steps_per_second": 2.288,
190
+ "kl": 7.871938228607178,
191
+ "step": 100
192
+ },
193
+ {
194
+ "epoch": 2.4444444444444446,
195
+ "grad_norm": 1.7072739601135254,
196
+ "kl": 5.876051902770996,
197
+ "learning_rate": 4.688895578255228e-06,
198
+ "logits/chosen": -4674539.2,
199
+ "logits/rejected": -7180268.0,
200
+ "logps/chosen": -11.73232650756836,
201
+ "logps/rejected": -21.302752685546874,
202
+ "loss": 0.4337,
203
+ "rewards/chosen": 0.4022516250610352,
204
+ "rewards/margins": 0.5421271562576294,
205
+ "rewards/rejected": -0.13987553119659424,
206
+ "step": 110
207
+ },
208
+ {
209
+ "epoch": 2.6666666666666665,
210
+ "grad_norm": 1.409259557723999,
211
+ "kl": 15.80307674407959,
212
+ "learning_rate": 4.588719528532342e-06,
213
+ "logits/chosen": -5999085.176470588,
214
+ "logits/rejected": -6937394.773333333,
215
+ "logps/chosen": -15.016993623621323,
216
+ "logps/rejected": -23.880426432291667,
217
+ "loss": 0.4161,
218
+ "rewards/chosen": 0.3759063271915211,
219
+ "rewards/margins": 0.6798269204532399,
220
+ "rewards/rejected": -0.3039205932617188,
221
+ "step": 120
222
+ },
223
+ {
224
+ "epoch": 2.888888888888889,
225
+ "grad_norm": 1.5020368099212646,
226
+ "kl": 4.061548233032227,
227
+ "learning_rate": 4.475981673796899e-06,
228
+ "logits/chosen": -5263017.316455696,
229
+ "logits/rejected": -6793347.160493827,
230
+ "logps/chosen": -13.852985623516615,
231
+ "logps/rejected": -24.90943287037037,
232
+ "loss": 0.4033,
233
+ "rewards/chosen": 0.3698826560491248,
234
+ "rewards/margins": 0.8227180216271051,
235
+ "rewards/rejected": -0.4528353655779803,
236
+ "step": 130
237
+ },
238
+ {
239
+ "epoch": 3.111111111111111,
240
+ "grad_norm": 1.3534297943115234,
241
+ "kl": 10.132991790771484,
242
+ "learning_rate": 4.351360032772512e-06,
243
+ "logits/chosen": -5211370.06741573,
244
+ "logits/rejected": -6893736.563380281,
245
+ "logps/chosen": -13.59007417485955,
246
+ "logps/rejected": -28.269847601232396,
247
+ "loss": 0.3634,
248
+ "rewards/chosen": 0.4416037141607049,
249
+ "rewards/margins": 1.272281283912713,
250
+ "rewards/rejected": -0.8306775697520081,
251
+ "step": 140
252
+ },
253
+ {
254
+ "epoch": 3.3333333333333335,
255
+ "grad_norm": 1.6547423601150513,
256
+ "kl": 4.284663677215576,
257
+ "learning_rate": 4.215604094671835e-06,
258
+ "logits/chosen": -4668295.724137931,
259
+ "logits/rejected": -6887783.890410959,
260
+ "logps/chosen": -13.891624012212644,
261
+ "logps/rejected": -28.655340325342465,
262
+ "loss": 0.3701,
263
+ "rewards/chosen": 0.3276262392942933,
264
+ "rewards/margins": 1.2273214397571572,
265
+ "rewards/rejected": -0.8996952004628639,
266
+ "step": 150
267
+ },
268
+ {
269
+ "epoch": 3.3333333333333335,
270
+ "eval_logits/chosen": -5641927.314285714,
271
+ "eval_logits/rejected": -6731665.777777778,
272
+ "eval_logps/chosen": -14.361015101841518,
273
+ "eval_logps/rejected": -29.413007269965277,
274
+ "eval_loss": 0.36061567068099976,
275
+ "eval_rewards/chosen": 0.27978545597621374,
276
+ "eval_rewards/margins": 1.257096823434981,
277
+ "eval_rewards/rejected": -0.9773113674587673,
278
+ "eval_runtime": 17.4799,
279
+ "eval_samples_per_second": 4.577,
280
+ "eval_steps_per_second": 2.288,
281
+ "kl": 0.0,
282
+ "step": 150
283
+ },
284
+ {
285
+ "epoch": 3.5555555555555554,
286
+ "grad_norm": 1.6560704708099365,
287
+ "kl": 0.0,
288
+ "learning_rate": 4.069530311680247e-06,
289
+ "logits/chosen": -4197209.6,
290
+ "logits/rejected": -6773993.411764706,
291
+ "logps/chosen": -9.26181640625,
292
+ "logps/rejected": -33.01460248161764,
293
+ "loss": 0.3012,
294
+ "rewards/chosen": 0.6008858235677084,
295
+ "rewards/margins": 1.9487138695810355,
296
+ "rewards/rejected": -1.3478280460133272,
297
+ "step": 160
298
+ },
299
+ {
300
+ "epoch": 3.7777777777777777,
301
+ "grad_norm": 4.158433437347412,
302
+ "kl": 6.74577522277832,
303
+ "learning_rate": 3.914017188716347e-06,
304
+ "logits/chosen": -4469396.0,
305
+ "logits/rejected": -6473352.0,
306
+ "logps/chosen": -14.897393798828125,
307
+ "logps/rejected": -32.791632080078124,
308
+ "loss": 0.3442,
309
+ "rewards/chosen": 0.3283118724822998,
310
+ "rewards/margins": 1.6118808269500733,
311
+ "rewards/rejected": -1.2835689544677735,
312
+ "step": 170
313
+ },
314
+ {
315
+ "epoch": 4.0,
316
+ "grad_norm": 1.2808359861373901,
317
+ "kl": 6.034703254699707,
318
+ "learning_rate": 3.7500000000000005e-06,
319
+ "logits/chosen": -5142258.4,
320
+ "logits/rejected": -6658021.6,
321
+ "logps/chosen": -12.736177062988281,
322
+ "logps/rejected": -38.654931640625,
323
+ "loss": 0.2877,
324
+ "rewards/chosen": 0.5321792602539063,
325
+ "rewards/margins": 2.298000717163086,
326
+ "rewards/rejected": -1.7658214569091797,
327
+ "step": 180
328
+ },
329
+ {
330
+ "epoch": 4.222222222222222,
331
+ "grad_norm": 4.354324817657471,
332
+ "kl": 9.844001770019531,
333
+ "learning_rate": 3.578465164203134e-06,
334
+ "logits/chosen": -4434924.873563218,
335
+ "logits/rejected": -6367553.7534246575,
336
+ "logps/chosen": -10.876662681842673,
337
+ "logps/rejected": -37.46076359160959,
338
+ "loss": 0.2869,
339
+ "rewards/chosen": 0.6059916704550556,
340
+ "rewards/margins": 2.4776278390525515,
341
+ "rewards/rejected": -1.8716361685974958,
342
+ "step": 190
343
+ },
344
+ {
345
+ "epoch": 4.444444444444445,
346
+ "grad_norm": 1.9371322393417358,
347
+ "kl": 0.0,
348
+ "learning_rate": 3.400444312011776e-06,
349
+ "logits/chosen": -5190637.948717949,
350
+ "logits/rejected": -6783854.048780488,
351
+ "logps/chosen": -11.751712114383013,
352
+ "logps/rejected": -40.603593035442074,
353
+ "loss": 0.281,
354
+ "rewards/chosen": 0.5476302122458433,
355
+ "rewards/margins": 2.449139008751655,
356
+ "rewards/rejected": -1.9015087965058117,
357
+ "step": 200
358
+ },
359
+ {
360
+ "epoch": 4.444444444444445,
361
+ "eval_logits/chosen": -5451904.914285715,
362
+ "eval_logits/rejected": -6268727.466666667,
363
+ "eval_logps/chosen": -15.457679966517857,
364
+ "eval_logps/rejected": -40.028545464409724,
365
+ "eval_loss": 0.3004380762577057,
366
+ "eval_rewards/chosen": 0.17011917659214565,
367
+ "eval_rewards/margins": 2.2089846686711385,
368
+ "eval_rewards/rejected": -2.038865492078993,
369
+ "eval_runtime": 17.4949,
370
+ "eval_samples_per_second": 4.573,
371
+ "eval_steps_per_second": 2.286,
372
+ "kl": 0.0,
373
+ "step": 200
374
+ },
375
+ {
376
+ "epoch": 4.666666666666667,
377
+ "grad_norm": 2.124127149581909,
378
+ "kl": 0.34170961380004883,
379
+ "learning_rate": 3.217008081777726e-06,
380
+ "logits/chosen": -5209803.317073171,
381
+ "logits/rejected": -6119242.666666667,
382
+ "logps/chosen": -12.961345393483231,
383
+ "logps/rejected": -43.778980744190704,
384
+ "loss": 0.2759,
385
+ "rewards/chosen": 0.4929830969833746,
386
+ "rewards/margins": 2.855773718227365,
387
+ "rewards/rejected": -2.3627906212439904,
388
+ "step": 210
389
+ },
390
+ {
391
+ "epoch": 4.888888888888889,
392
+ "grad_norm": 5.048134803771973,
393
+ "kl": 0.299468994140625,
394
+ "learning_rate": 3.0292596805735275e-06,
395
+ "logits/chosen": -3677977.6,
396
+ "logits/rejected": -7151626.4,
397
+ "logps/chosen": -11.91136474609375,
398
+ "logps/rejected": -44.98524780273438,
399
+ "loss": 0.2552,
400
+ "rewards/chosen": 0.5031857490539551,
401
+ "rewards/margins": 3.0173224449157714,
402
+ "rewards/rejected": -2.5141366958618163,
403
+ "step": 220
404
+ },
405
+ {
406
+ "epoch": 5.111111111111111,
407
+ "grad_norm": 1.2531408071517944,
408
+ "kl": 0.0,
409
+ "learning_rate": 2.8383282493753282e-06,
410
+ "logits/chosen": -4280043.220779221,
411
+ "logits/rejected": -6925918.072289157,
412
+ "logps/chosen": -12.665734229149756,
413
+ "logps/rejected": -43.957440112010545,
414
+ "loss": 0.2576,
415
+ "rewards/chosen": 0.7539182390485492,
416
+ "rewards/margins": 3.1649977377072465,
417
+ "rewards/rejected": -2.4110794986586974,
418
+ "step": 230
419
+ },
420
+ {
421
+ "epoch": 5.333333333333333,
422
+ "grad_norm": 5.549612998962402,
423
+ "kl": 0.0,
424
+ "learning_rate": 2.6453620722761897e-06,
425
+ "logits/chosen": -5263511.717647059,
426
+ "logits/rejected": -5916718.08,
427
+ "logps/chosen": -9.854262408088236,
428
+ "logps/rejected": -46.6882421875,
429
+ "loss": 0.2567,
430
+ "rewards/chosen": 0.6036793877096737,
431
+ "rewards/margins": 3.234525131524778,
432
+ "rewards/rejected": -2.630845743815104,
433
+ "step": 240
434
+ },
435
+ {
436
+ "epoch": 5.555555555555555,
437
+ "grad_norm": 1.6388943195343018,
438
+ "kl": 0.0,
439
+ "learning_rate": 2.4515216705704396e-06,
440
+ "logits/chosen": -4496943.157894737,
441
+ "logits/rejected": -6177542.857142857,
442
+ "logps/chosen": -10.795360364412007,
443
+ "logps/rejected": -54.87718563988095,
444
+ "loss": 0.2051,
445
+ "rewards/chosen": 0.5903302242881373,
446
+ "rewards/margins": 4.18927874839993,
447
+ "rewards/rejected": -3.598948524111793,
448
+ "step": 250
449
+ },
450
+ {
451
+ "epoch": 5.555555555555555,
452
+ "eval_logits/chosen": -5351382.857142857,
453
+ "eval_logits/rejected": -5877686.755555555,
454
+ "eval_logps/chosen": -15.197433035714285,
455
+ "eval_logps/rejected": -48.05066189236111,
456
+ "eval_loss": 0.27403125166893005,
457
+ "eval_rewards/chosen": 0.19614369528634207,
458
+ "eval_rewards/margins": 3.0372208640688942,
459
+ "eval_rewards/rejected": -2.8410771687825522,
460
+ "eval_runtime": 17.5075,
461
+ "eval_samples_per_second": 4.569,
462
+ "eval_steps_per_second": 2.285,
463
+ "kl": 0.0,
464
+ "step": 250
465
+ },
466
+ {
467
+ "epoch": 5.777777777777778,
468
+ "grad_norm": 2.7054665088653564,
469
+ "kl": 1.0151314735412598,
470
+ "learning_rate": 2.2579728232420524e-06,
471
+ "logits/chosen": -4953812.819277109,
472
+ "logits/rejected": -6552532.779220779,
473
+ "logps/chosen": -11.884703119117093,
474
+ "logps/rejected": -51.183222783076296,
475
+ "loss": 0.2612,
476
+ "rewards/chosen": 0.3994415467043957,
477
+ "rewards/margins": 3.453126722612472,
478
+ "rewards/rejected": -3.0536851759080763,
479
+ "step": 260
480
+ },
481
+ {
482
+ "epoch": 6.0,
483
+ "grad_norm": 0.8331411480903625,
484
+ "kl": 0.0,
485
+ "learning_rate": 2.0658795558326745e-06,
486
+ "logits/chosen": -4686625.951219512,
487
+ "logits/rejected": -7812360.205128205,
488
+ "logps/chosen": -9.980894507431403,
489
+ "logps/rejected": -53.11843950320513,
490
+ "loss": 0.221,
491
+ "rewards/chosen": 0.8428270293445121,
492
+ "rewards/margins": 4.1761818814829335,
493
+ "rewards/rejected": -3.3333548521384215,
494
+ "step": 270
495
+ },
496
+ {
497
+ "epoch": 6.222222222222222,
498
+ "grad_norm": 1.117002248764038,
499
+ "kl": 15.995762825012207,
500
+ "learning_rate": 1.876397139855047e-06,
501
+ "logits/chosen": -4876782.12987013,
502
+ "logits/rejected": -6101197.879518072,
503
+ "logps/chosen": -11.705315131645698,
504
+ "logps/rejected": -58.6199171686747,
505
+ "loss": 0.2091,
506
+ "rewards/chosen": 0.6778353901652546,
507
+ "rewards/margins": 4.467821138823952,
508
+ "rewards/rejected": -3.7899857486586974,
509
+ "step": 280
510
+ },
511
+ {
512
+ "epoch": 6.444444444444445,
513
+ "grad_norm": 6.495964527130127,
514
+ "kl": 4.720248699188232,
515
+ "learning_rate": 1.6906651448541977e-06,
516
+ "logits/chosen": -3970153.0256410255,
517
+ "logits/rejected": -7171078.243902439,
518
+ "logps/chosen": -6.301290658804087,
519
+ "logps/rejected": -57.83571122332317,
520
+ "loss": 0.1688,
521
+ "rewards/chosen": 1.1230317140236878,
522
+ "rewards/margins": 4.893083995845334,
523
+ "rewards/rejected": -3.770052281821646,
524
+ "step": 290
525
+ },
526
+ {
527
+ "epoch": 6.666666666666667,
528
+ "grad_norm": 0.7937259674072266,
529
+ "kl": 0.0,
530
+ "learning_rate": 1.509800584902108e-06,
531
+ "logits/chosen": -5307403.341772152,
532
+ "logits/rejected": -6794199.703703703,
533
+ "logps/chosen": -16.510799359671676,
534
+ "logps/rejected": -55.48344666280864,
535
+ "loss": 0.2724,
536
+ "rewards/chosen": 0.0943850988074194,
537
+ "rewards/margins": 3.622630812783114,
538
+ "rewards/rejected": -3.5282457139756946,
539
+ "step": 300
540
+ },
541
+ {
542
+ "epoch": 6.666666666666667,
543
+ "eval_logits/chosen": -5272125.257142857,
544
+ "eval_logits/rejected": -5524427.377777778,
545
+ "eval_logps/chosen": -16.101872907366072,
546
+ "eval_logps/rejected": -56.35105251736111,
547
+ "eval_loss": 0.2627750039100647,
548
+ "eval_rewards/chosen": 0.10569986615862165,
549
+ "eval_rewards/margins": 3.776815656631712,
550
+ "eval_rewards/rejected": -3.6711157904730904,
551
+ "eval_runtime": 17.4909,
552
+ "eval_samples_per_second": 4.574,
553
+ "eval_steps_per_second": 2.287,
554
+ "kl": 0.0,
555
+ "step": 300
556
+ },
557
+ {
558
+ "epoch": 6.888888888888889,
559
+ "grad_norm": 1.1168758869171143,
560
+ "kl": 0.0,
561
+ "learning_rate": 1.3348912007436538e-06,
562
+ "logits/chosen": -3931129.379310345,
563
+ "logits/rejected": -6447021.150684931,
564
+ "logps/chosen": -6.620521282327586,
565
+ "logps/rejected": -63.137895976027394,
566
+ "loss": 0.1898,
567
+ "rewards/chosen": 1.0009118441877694,
568
+ "rewards/margins": 5.434586277647744,
569
+ "rewards/rejected": -4.4336744334599745,
570
+ "step": 310
571
+ },
572
+ {
573
+ "epoch": 7.111111111111111,
574
+ "grad_norm": 3.610934019088745,
575
+ "kl": 0.0,
576
+ "learning_rate": 1.1669889179957725e-06,
577
+ "logits/chosen": -4449015.636363637,
578
+ "logits/rejected": -6512209.333333333,
579
+ "logps/chosen": -9.530960083007812,
580
+ "logps/rejected": -60.69569227430556,
581
+ "loss": 0.2499,
582
+ "rewards/chosen": 0.6914540204134855,
583
+ "rewards/margins": 4.681384067342739,
584
+ "rewards/rejected": -3.9899300469292536,
585
+ "step": 320
586
+ },
587
+ {
588
+ "epoch": 7.333333333333333,
589
+ "grad_norm": 5.70353889465332,
590
+ "kl": 7.343207359313965,
591
+ "learning_rate": 1.0071035207430352e-06,
592
+ "logits/chosen": -4714173.5696202535,
593
+ "logits/rejected": -5926990.222222222,
594
+ "logps/chosen": -9.35969717291337,
595
+ "logps/rejected": -61.245768229166664,
596
+ "loss": 0.1972,
597
+ "rewards/chosen": 0.7530057882960839,
598
+ "rewards/margins": 4.901618858560358,
599
+ "rewards/rejected": -4.1486130702642745,
600
+ "step": 330
601
+ },
602
+ {
603
+ "epoch": 7.555555555555555,
604
+ "grad_norm": 0.47463521361351013,
605
+ "kl": 0.0,
606
+ "learning_rate": 8.561965785773413e-07,
607
+ "logits/chosen": -3589752.0,
608
+ "logits/rejected": -6144573.333333333,
609
+ "logps/chosen": -10.513381264426492,
610
+ "logps/rejected": -67.19074164496527,
611
+ "loss": 0.2194,
612
+ "rewards/chosen": 0.5275679935108532,
613
+ "rewards/margins": 5.252333771098744,
614
+ "rewards/rejected": -4.724765777587891,
615
+ "step": 340
616
+ },
617
+ {
618
+ "epoch": 7.777777777777778,
619
+ "grad_norm": 0.7246691584587097,
620
+ "kl": 9.73458194732666,
621
+ "learning_rate": 7.151756636052529e-07,
622
+ "logits/chosen": -4579355.654320988,
623
+ "logits/rejected": -5925367.898734177,
624
+ "logps/chosen": -13.632544246720679,
625
+ "logps/rejected": -63.80702383306962,
626
+ "loss": 0.2237,
627
+ "rewards/chosen": 0.650694199550299,
628
+ "rewards/margins": 4.9783826822935735,
629
+ "rewards/rejected": -4.327688482743275,
630
+ "step": 350
631
+ },
632
+ {
633
+ "epoch": 7.777777777777778,
634
+ "eval_logits/chosen": -5216298.057142857,
635
+ "eval_logits/rejected": -5306349.866666666,
636
+ "eval_logps/chosen": -16.67706298828125,
637
+ "eval_logps/rejected": -61.12718098958333,
638
+ "eval_loss": 0.2568877041339874,
639
+ "eval_rewards/chosen": 0.0481806891305106,
640
+ "eval_rewards/margins": 4.196909462459503,
641
+ "eval_rewards/rejected": -4.148728773328993,
642
+ "eval_runtime": 17.4735,
643
+ "eval_samples_per_second": 4.578,
644
+ "eval_steps_per_second": 2.289,
645
+ "kl": 0.0,
646
+ "step": 350
647
+ },
648
+ {
649
+ "epoch": 8.0,
650
+ "grad_norm": 1.7622766494750977,
651
+ "kl": 0.0,
652
+ "learning_rate": 5.848888922025553e-07,
653
+ "logits/chosen": -4720167.890410959,
654
+ "logits/rejected": -5690909.425287357,
655
+ "logps/chosen": -7.494754320954623,
656
+ "logps/rejected": -63.13269463900862,
657
+ "loss": 0.1865,
658
+ "rewards/chosen": 0.9349731027263485,
659
+ "rewards/margins": 5.348946505429294,
660
+ "rewards/rejected": -4.413973402702945,
661
+ "step": 360
662
+ },
663
+ {
664
+ "epoch": 8.222222222222221,
665
+ "grad_norm": 1.9263681173324585,
666
+ "kl": 0.0,
667
+ "learning_rate": 4.661198243425813e-07,
668
+ "logits/chosen": -4662120.0,
669
+ "logits/rejected": -6811490.4,
670
+ "logps/chosen": -11.226513671875,
671
+ "logps/rejected": -60.887017822265626,
672
+ "loss": 0.2197,
673
+ "rewards/chosen": 0.5319944381713867,
674
+ "rewards/margins": 4.648596000671386,
675
+ "rewards/rejected": -4.1166015625,
676
+ "step": 370
677
+ },
678
+ {
679
+ "epoch": 8.444444444444445,
680
+ "grad_norm": 2.094463348388672,
681
+ "kl": 1.3769874572753906,
682
+ "learning_rate": 3.595827511743341e-07,
683
+ "logits/chosen": -4225483.377777778,
684
+ "logits/rejected": -6771605.485714286,
685
+ "logps/chosen": -10.853142632378471,
686
+ "logps/rejected": -65.53819754464286,
687
+ "loss": 0.232,
688
+ "rewards/chosen": 0.6328274197048611,
689
+ "rewards/margins": 5.1289795890687,
690
+ "rewards/rejected": -4.496152169363839,
691
+ "step": 380
692
+ },
693
+ {
694
+ "epoch": 8.666666666666666,
695
+ "grad_norm": 1.4835810661315918,
696
+ "kl": 0.0,
697
+ "learning_rate": 2.6591839919146963e-07,
698
+ "logits/chosen": -4116534.222222222,
699
+ "logits/rejected": -5742267.636363637,
700
+ "logps/chosen": -7.460245768229167,
701
+ "logps/rejected": -67.84841086647727,
702
+ "loss": 0.1502,
703
+ "rewards/chosen": 1.145952648586697,
704
+ "rewards/margins": 6.005312389797634,
705
+ "rewards/rejected": -4.8593597412109375,
706
+ "step": 390
707
+ },
708
+ {
709
+ "epoch": 8.88888888888889,
710
+ "grad_norm": 0.8003972172737122,
711
+ "kl": 0.0,
712
+ "learning_rate": 1.8569007682777417e-07,
713
+ "logits/chosen": -4092200.459770115,
714
+ "logits/rejected": -6279503.780821918,
715
+ "logps/chosen": -10.580308234554598,
716
+ "logps/rejected": -65.71065657106165,
717
+ "loss": 0.2291,
718
+ "rewards/chosen": 0.5882207936254041,
719
+ "rewards/margins": 5.146064145642741,
720
+ "rewards/rejected": -4.557843352017337,
721
+ "step": 400
722
+ },
723
+ {
724
+ "epoch": 8.88888888888889,
725
+ "eval_logits/chosen": -5214656.914285715,
726
+ "eval_logits/rejected": -5268033.422222222,
727
+ "eval_logps/chosen": -16.733164760044644,
728
+ "eval_logps/rejected": -62.435932074652776,
729
+ "eval_loss": 0.254804790019989,
730
+ "eval_rewards/chosen": 0.042570604596819196,
731
+ "eval_rewards/margins": 4.32217536805168,
732
+ "eval_rewards/rejected": -4.279604763454861,
733
+ "eval_runtime": 17.4792,
734
+ "eval_samples_per_second": 4.577,
735
+ "eval_steps_per_second": 2.288,
736
+ "kl": 0.0,
737
+ "step": 400
738
+ },
739
+ {
740
+ "epoch": 9.11111111111111,
741
+ "grad_norm": 4.520019054412842,
742
+ "kl": 2.1843814849853516,
743
+ "learning_rate": 1.1938028665396172e-07,
744
+ "logits/chosen": -4234930.849315069,
745
+ "logits/rejected": -5565993.931034483,
746
+ "logps/chosen": -15.401979211258562,
747
+ "logps/rejected": -69.01152792744253,
748
+ "loss": 0.2192,
749
+ "rewards/chosen": 0.15699117477626018,
750
+ "rewards/margins": 5.1128171558287026,
751
+ "rewards/rejected": -4.955825981052443,
752
+ "step": 410
753
+ },
754
+ {
755
+ "epoch": 9.333333333333334,
756
+ "grad_norm": 0.8408989310264587,
757
+ "kl": 1.832779884338379,
758
+ "learning_rate": 6.738782355044048e-08,
759
+ "logits/chosen": -5072888.558139535,
760
+ "logits/rejected": -4679876.324324325,
761
+ "logps/chosen": -13.259429221929507,
762
+ "logps/rejected": -67.32681191934121,
763
+ "loss": 0.2468,
764
+ "rewards/chosen": 0.3328412743501885,
765
+ "rewards/margins": 4.958795567266151,
766
+ "rewards/rejected": -4.625954292915963,
767
+ "step": 420
768
+ },
769
+ {
770
+ "epoch": 9.555555555555555,
771
+ "grad_norm": 0.5843256711959839,
772
+ "kl": 0.0,
773
+ "learning_rate": 3.0025376307977474e-08,
774
+ "logits/chosen": -4235825.34939759,
775
+ "logits/rejected": -6855219.532467533,
776
+ "logps/chosen": -12.172502264919052,
777
+ "logps/rejected": -70.21881341314935,
778
+ "loss": 0.1936,
779
+ "rewards/chosen": 0.7041290007441877,
780
+ "rewards/margins": 5.708083603270568,
781
+ "rewards/rejected": -5.00395460252638,
782
+ "step": 430
783
+ },
784
+ {
785
+ "epoch": 9.777777777777779,
786
+ "grad_norm": 1.3815499544143677,
787
+ "kl": 0.0,
788
+ "learning_rate": 7.517647080519941e-09,
789
+ "logits/chosen": -4151235.011764706,
790
+ "logits/rejected": -7243480.746666667,
791
+ "logps/chosen": -5.613843132467831,
792
+ "logps/rejected": -63.06897135416666,
793
+ "loss": 0.1925,
794
+ "rewards/chosen": 1.112983793370864,
795
+ "rewards/margins": 5.333501371495864,
796
+ "rewards/rejected": -4.220517578125,
797
+ "step": 440
798
+ },
799
+ {
800
+ "epoch": 10.0,
801
+ "grad_norm": 0.7369117736816406,
802
+ "kl": 0.0,
803
+ "learning_rate": 0.0,
804
+ "logits/chosen": -4828750.702702703,
805
+ "logits/rejected": -5816776.930232558,
806
+ "logps/chosen": -6.564434979413007,
807
+ "logps/rejected": -65.85319767441861,
808
+ "loss": 0.1677,
809
+ "rewards/chosen": 1.0981009199812606,
810
+ "rewards/margins": 5.781813901449284,
811
+ "rewards/rejected": -4.683712981468023,
812
+ "step": 450
813
+ },
814
+ {
815
+ "epoch": 10.0,
816
+ "eval_logits/chosen": -5221032.228571429,
817
+ "eval_logits/rejected": -5284203.377777778,
818
+ "eval_logps/chosen": -16.849844796316965,
819
+ "eval_logps/rejected": -62.744411892361114,
820
+ "eval_loss": 0.2541462779045105,
821
+ "eval_rewards/chosen": 0.030902576446533204,
822
+ "eval_rewards/margins": 4.341355186038547,
823
+ "eval_rewards/rejected": -4.310452609592014,
824
+ "eval_runtime": 17.4751,
825
+ "eval_samples_per_second": 4.578,
826
+ "eval_steps_per_second": 2.289,
827
+ "kl": 0.0,
828
+ "step": 450
829
+ },
830
+ {
831
+ "epoch": 10.0,
832
+ "step": 450,
833
+ "total_flos": 3.939427351343923e+16,
834
+ "train_loss": 0.30883768532011246,
835
+ "train_runtime": 2497.4829,
836
+ "train_samples_per_second": 2.883,
837
+ "train_steps_per_second": 0.18
838
+ }
839
+ ],
840
+ "logging_steps": 10,
841
+ "max_steps": 450,
842
+ "num_input_tokens_seen": 0,
843
+ "num_train_epochs": 10,
844
+ "save_steps": 50,
845
+ "stateful_callbacks": {
846
+ "TrainerControl": {
847
+ "args": {
848
+ "should_epoch_stop": false,
849
+ "should_evaluate": false,
850
+ "should_log": false,
851
+ "should_save": true,
852
+ "should_training_stop": true
853
+ },
854
+ "attributes": {}
855
+ }
856
+ },
857
+ "total_flos": 3.939427351343923e+16,
858
+ "train_batch_size": 2,
859
+ "trial_name": null,
860
+ "trial_params": null
861
+ }
training_eval_loss.png ADDED
training_loss.png ADDED
training_rewards_chosen.png ADDED