chchen committed on
Commit
459cf58
·
verified ·
1 Parent(s): e3a10f8

End of training

Browse files
README.md CHANGED
@@ -3,9 +3,10 @@ library_name: peft
3
  license: llama3.1
4
  base_model: meta-llama/Meta-Llama-3.1-8B-Instruct
5
  tags:
 
 
6
  - trl
7
  - kto
8
- - llama-factory
9
  - generated_from_trainer
10
  model-index:
11
  - name: Llama-3.1-8B-Instruct-KTO-300
@@ -17,7 +18,7 @@ should probably proofread and complete it, then remove this comment. -->
17
 
18
  # Llama-3.1-8B-Instruct-KTO-300
19
 
20
- This model is a fine-tuned version of [meta-llama/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct) on an unknown dataset.
21
  It achieves the following results on the evaluation set:
22
  - Loss: 0.2807
23
  - Rewards/chosen: 0.7524
 
3
  license: llama3.1
4
  base_model: meta-llama/Meta-Llama-3.1-8B-Instruct
5
  tags:
6
+ - llama-factory
7
+ - lora
8
  - trl
9
  - kto
 
10
  - generated_from_trainer
11
  model-index:
12
  - name: Llama-3.1-8B-Instruct-KTO-300
 
18
 
19
  # Llama-3.1-8B-Instruct-KTO-300
20
 
21
+ This model is a fine-tuned version of [meta-llama/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct) on the bct_non_cot_kto_300 dataset.
22
  It achieves the following results on the evaluation set:
23
  - Loss: 0.2807
24
  - Rewards/chosen: 0.7524
all_results.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 9.844444444444445,
3
+ "eval_logits/chosen": -3652706.0,
4
+ "eval_logits/rejected": -7997593.777777778,
5
+ "eval_logps/chosen": -12.900739034016928,
6
+ "eval_logps/rejected": -37.902974446614586,
7
+ "eval_loss": 0.2807026207447052,
8
+ "eval_rewards/chosen": 0.7524267832438151,
9
+ "eval_rewards/margins": 2.2908316718207464,
10
+ "eval_rewards/rejected": -1.5384048885769315,
11
+ "eval_runtime": 13.115,
12
+ "eval_samples_per_second": 4.575,
13
+ "eval_steps_per_second": 2.287,
14
+ "kl": 0.663330078125,
15
+ "total_flos": 2.9519334394822656e+16,
16
+ "train_loss": 0.3735494534174601,
17
+ "train_runtime": 1879.5093,
18
+ "train_samples_per_second": 2.873,
19
+ "train_steps_per_second": 0.176
20
+ }
eval_results.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 9.844444444444445,
3
+ "eval_logits/chosen": -3652706.0,
4
+ "eval_logits/rejected": -7997593.777777778,
5
+ "eval_logps/chosen": -12.900739034016928,
6
+ "eval_logps/rejected": -37.902974446614586,
7
+ "eval_loss": 0.2807026207447052,
8
+ "eval_rewards/chosen": 0.7524267832438151,
9
+ "eval_rewards/margins": 2.2908316718207464,
10
+ "eval_rewards/rejected": -1.5384048885769315,
11
+ "eval_runtime": 13.115,
12
+ "eval_samples_per_second": 4.575,
13
+ "eval_steps_per_second": 2.287,
14
+ "kl": 0.663330078125
15
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 9.844444444444445,
3
+ "total_flos": 2.9519334394822656e+16,
4
+ "train_loss": 0.3735494534174601,
5
+ "train_runtime": 1879.5093,
6
+ "train_samples_per_second": 2.873,
7
+ "train_steps_per_second": 0.176
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,633 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.2807026207447052,
3
+ "best_model_checkpoint": "saves/sycophancy/Llama-8B-3.1-Instruct/kto-300/checkpoint-300",
4
+ "epoch": 9.844444444444445,
5
+ "eval_steps": 50,
6
+ "global_step": 330,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.2962962962962963,
13
+ "grad_norm": 0.6046058535575867,
14
+ "kl": 4.538087844848633,
15
+ "learning_rate": 1.5151515151515152e-06,
16
+ "logits/chosen": -5854173.364705882,
17
+ "logits/rejected": -6722548.053333334,
18
+ "logps/chosen": -16.62459357766544,
19
+ "logps/rejected": -19.6134912109375,
20
+ "loss": 0.5008,
21
+ "rewards/chosen": -0.001938680690877578,
22
+ "rewards/margins": -0.006729768795125625,
23
+ "rewards/rejected": 0.004791088104248047,
24
+ "step": 10
25
+ },
26
+ {
27
+ "epoch": 0.5925925925925926,
28
+ "grad_norm": 0.5337051153182983,
29
+ "kl": 3.5176525115966797,
30
+ "learning_rate": 3.0303030303030305e-06,
31
+ "logits/chosen": -5356078.12987013,
32
+ "logits/rejected": -8177576.096385542,
33
+ "logps/chosen": -16.870456447849026,
34
+ "logps/rejected": -19.49406708866717,
35
+ "loss": 0.4995,
36
+ "rewards/chosen": 0.0029797325660656025,
37
+ "rewards/margins": 0.004846458838143048,
38
+ "rewards/rejected": -0.0018667262720774456,
39
+ "step": 20
40
+ },
41
+ {
42
+ "epoch": 0.8888888888888888,
43
+ "grad_norm": 1.3099695444107056,
44
+ "kl": 5.179463863372803,
45
+ "learning_rate": 4.5454545454545455e-06,
46
+ "logits/chosen": -6825626.482758621,
47
+ "logits/rejected": -7491633.095890411,
48
+ "logps/chosen": -17.327173693426722,
49
+ "logps/rejected": -19.826569857662673,
50
+ "loss": 0.4998,
51
+ "rewards/chosen": 0.005853150767841558,
52
+ "rewards/margins": 0.0036841889799645406,
53
+ "rewards/rejected": 0.0021689617878770176,
54
+ "step": 30
55
+ },
56
+ {
57
+ "epoch": 1.1925925925925926,
58
+ "grad_norm": 0.740423321723938,
59
+ "kl": 6.535464763641357,
60
+ "learning_rate": 4.993149937871306e-06,
61
+ "logits/chosen": -6410937.173333333,
62
+ "logits/rejected": -6861577.035294117,
63
+ "logps/chosen": -17.7991552734375,
64
+ "logps/rejected": -19.352366727941178,
65
+ "loss": 0.4997,
66
+ "rewards/chosen": 0.014202831586201985,
67
+ "rewards/margins": 0.004197974579007018,
68
+ "rewards/rejected": 0.010004857007194968,
69
+ "step": 40
70
+ },
71
+ {
72
+ "epoch": 1.488888888888889,
73
+ "grad_norm": 0.7895795702934265,
74
+ "kl": 6.706003189086914,
75
+ "learning_rate": 4.959688949822748e-06,
76
+ "logits/chosen": -5263670.325581395,
77
+ "logits/rejected": -7425049.945945946,
78
+ "logps/chosen": -17.420151821402616,
79
+ "logps/rejected": -20.32705276076858,
80
+ "loss": 0.4973,
81
+ "rewards/chosen": 0.03406415983688,
82
+ "rewards/margins": 0.018683696752220938,
83
+ "rewards/rejected": 0.015380463084659061,
84
+ "step": 50
85
+ },
86
+ {
87
+ "epoch": 1.488888888888889,
88
+ "eval_logits/chosen": -5131056.333333333,
89
+ "eval_logits/rejected": -8269521.777777778,
90
+ "eval_logps/chosen": -20.01836903889974,
91
+ "eval_logps/rejected": -22.1998291015625,
92
+ "eval_loss": 0.49953240156173706,
93
+ "eval_rewards/chosen": 0.04066378871599833,
94
+ "eval_rewards/margins": 0.008754322926203408,
95
+ "eval_rewards/rejected": 0.03190946578979492,
96
+ "eval_runtime": 13.3665,
97
+ "eval_samples_per_second": 4.489,
98
+ "eval_steps_per_second": 2.244,
99
+ "kl": 4.248124122619629,
100
+ "step": 50
101
+ },
102
+ {
103
+ "epoch": 1.7851851851851852,
104
+ "grad_norm": 1.0223308801651,
105
+ "kl": 10.039009094238281,
106
+ "learning_rate": 4.8987324340362445e-06,
107
+ "logits/chosen": -6110996.8,
108
+ "logits/rejected": -6743119.2,
109
+ "logps/chosen": -15.2028076171875,
110
+ "logps/rejected": -17.942332458496093,
111
+ "loss": 0.4959,
112
+ "rewards/chosen": 0.060177081823349,
113
+ "rewards/margins": 0.03066384792327881,
114
+ "rewards/rejected": 0.029513233900070192,
115
+ "step": 60
116
+ },
117
+ {
118
+ "epoch": 2.088888888888889,
119
+ "grad_norm": 0.8759526014328003,
120
+ "kl": 11.317947387695312,
121
+ "learning_rate": 4.810961790316731e-06,
122
+ "logits/chosen": -6347121.116279069,
123
+ "logits/rejected": -7174790.918918919,
124
+ "logps/chosen": -17.81520257994186,
125
+ "logps/rejected": -20.214766218855573,
126
+ "loss": 0.4929,
127
+ "rewards/chosen": 0.10434515531672987,
128
+ "rewards/margins": 0.05541059575239717,
129
+ "rewards/rejected": 0.0489345595643327,
130
+ "step": 70
131
+ },
132
+ {
133
+ "epoch": 2.3851851851851853,
134
+ "grad_norm": 0.9635448455810547,
135
+ "kl": 14.32188606262207,
136
+ "learning_rate": 4.697358159051549e-06,
137
+ "logits/chosen": -6680833.215189873,
138
+ "logits/rejected": -6814345.481481481,
139
+ "logps/chosen": -14.122315998318829,
140
+ "logps/rejected": -18.411060474537038,
141
+ "loss": 0.4833,
142
+ "rewards/chosen": 0.18152446988262708,
143
+ "rewards/margins": 0.1343007253136257,
144
+ "rewards/rejected": 0.04722374456900137,
145
+ "step": 80
146
+ },
147
+ {
148
+ "epoch": 2.6814814814814816,
149
+ "grad_norm": 1.1595202684402466,
150
+ "kl": 22.486248016357422,
151
+ "learning_rate": 4.559191453574582e-06,
152
+ "logits/chosen": -5439367.191011236,
153
+ "logits/rejected": -7333265.126760564,
154
+ "logps/chosen": -15.327505047401685,
155
+ "logps/rejected": -19.205719423965668,
156
+ "loss": 0.4725,
157
+ "rewards/chosen": 0.27227440308988765,
158
+ "rewards/margins": 0.2085925856993383,
159
+ "rewards/rejected": 0.06368181739054934,
160
+ "step": 90
161
+ },
162
+ {
163
+ "epoch": 2.977777777777778,
164
+ "grad_norm": 1.3290456533432007,
165
+ "kl": 32.22382354736328,
166
+ "learning_rate": 4.398006164494358e-06,
167
+ "logits/chosen": -6466890.520547945,
168
+ "logits/rejected": -7224656.91954023,
169
+ "logps/chosen": -12.733187740796232,
170
+ "logps/rejected": -18.703230233028016,
171
+ "loss": 0.4683,
172
+ "rewards/chosen": 0.3788376377053457,
173
+ "rewards/margins": 0.31561340427533824,
174
+ "rewards/rejected": 0.06322423343000741,
175
+ "step": 100
176
+ },
177
+ {
178
+ "epoch": 2.977777777777778,
179
+ "eval_logits/chosen": -4839914.0,
180
+ "eval_logits/rejected": -8194471.111111111,
181
+ "eval_logps/chosen": -16.739178975423176,
182
+ "eval_logps/rejected": -22.005423651801216,
183
+ "eval_loss": 0.4692111313343048,
184
+ "eval_rewards/chosen": 0.36858288447062176,
185
+ "eval_rewards/margins": 0.3172328074773153,
186
+ "eval_rewards/rejected": 0.05135007699330648,
187
+ "eval_runtime": 13.3695,
188
+ "eval_samples_per_second": 4.488,
189
+ "eval_steps_per_second": 2.244,
190
+ "kl": 12.38624095916748,
191
+ "step": 100
192
+ },
193
+ {
194
+ "epoch": 3.2814814814814817,
195
+ "grad_norm": 2.0513880252838135,
196
+ "kl": 9.37839412689209,
197
+ "learning_rate": 4.215604094671835e-06,
198
+ "logits/chosen": -4592428.631578947,
199
+ "logits/rejected": -7237347.047619048,
200
+ "logps/chosen": -12.766847309313322,
201
+ "logps/rejected": -21.028982979910715,
202
+ "loss": 0.4456,
203
+ "rewards/chosen": 0.38844979436774,
204
+ "rewards/margins": 0.4612913705352554,
205
+ "rewards/rejected": -0.07284157616751534,
206
+ "step": 110
207
+ },
208
+ {
209
+ "epoch": 3.5777777777777775,
210
+ "grad_norm": 1.8086739778518677,
211
+ "kl": 4.136274337768555,
212
+ "learning_rate": 4.014024217844167e-06,
213
+ "logits/chosen": -5455928.847058823,
214
+ "logits/rejected": -6883678.72,
215
+ "logps/chosen": -12.671711282169118,
216
+ "logps/rejected": -21.248759765625,
217
+ "loss": 0.4288,
218
+ "rewards/chosen": 0.4551482256721048,
219
+ "rewards/margins": 0.5749030042162129,
220
+ "rewards/rejected": -0.11975477854410807,
221
+ "step": 120
222
+ },
223
+ {
224
+ "epoch": 3.8740740740740742,
225
+ "grad_norm": 1.9404373168945312,
226
+ "kl": 6.438965797424316,
227
+ "learning_rate": 3.7955198860439892e-06,
228
+ "logits/chosen": -6478493.268292683,
229
+ "logits/rejected": -6934217.025641026,
230
+ "logps/chosen": -13.174736209032012,
231
+ "logps/rejected": -20.87556653145032,
232
+ "loss": 0.4103,
233
+ "rewards/chosen": 0.4654149311344798,
234
+ "rewards/margins": 0.7423959914559942,
235
+ "rewards/rejected": -0.2769810603215144,
236
+ "step": 130
237
+ },
238
+ {
239
+ "epoch": 4.177777777777778,
240
+ "grad_norm": 1.794080376625061,
241
+ "kl": 3.989973306655884,
242
+ "learning_rate": 3.5625336406000752e-06,
243
+ "logits/chosen": -5239293.155555556,
244
+ "logits/rejected": -6854407.314285714,
245
+ "logps/chosen": -12.975401475694444,
246
+ "logps/rejected": -24.620901925223215,
247
+ "loss": 0.3852,
248
+ "rewards/chosen": 0.4861636691623264,
249
+ "rewards/margins": 0.9952043926905072,
250
+ "rewards/rejected": -0.5090407235281807,
251
+ "step": 140
252
+ },
253
+ {
254
+ "epoch": 4.474074074074074,
255
+ "grad_norm": 1.1039807796478271,
256
+ "kl": 6.615535736083984,
257
+ "learning_rate": 3.3176699082935546e-06,
258
+ "logits/chosen": -5551139.764705882,
259
+ "logits/rejected": -6892623.36,
260
+ "logps/chosen": -12.373483455882353,
261
+ "logps/rejected": -26.899697265625,
262
+ "loss": 0.3709,
263
+ "rewards/chosen": 0.5205115823184743,
264
+ "rewards/margins": 1.1443341124291515,
265
+ "rewards/rejected": -0.6238225301106771,
266
+ "step": 150
267
+ },
268
+ {
269
+ "epoch": 4.474074074074074,
270
+ "eval_logits/chosen": -4248186.666666667,
271
+ "eval_logits/rejected": -8021952.0,
272
+ "eval_logps/chosen": -14.744425455729166,
273
+ "eval_logps/rejected": -28.396257188585068,
274
+ "eval_loss": 0.3691816031932831,
275
+ "eval_rewards/chosen": 0.568058172861735,
276
+ "eval_rewards/margins": 1.155791653527154,
277
+ "eval_rewards/rejected": -0.5877334806654189,
278
+ "eval_runtime": 13.365,
279
+ "eval_samples_per_second": 4.489,
280
+ "eval_steps_per_second": 2.245,
281
+ "kl": 2.0649871826171875,
282
+ "step": 150
283
+ },
284
+ {
285
+ "epoch": 4.770370370370371,
286
+ "grad_norm": 1.5495309829711914,
287
+ "kl": 5.70380163192749,
288
+ "learning_rate": 3.0636658878845116e-06,
289
+ "logits/chosen": -3638242.086956522,
290
+ "logits/rejected": -7199811.516483516,
291
+ "logps/chosen": -13.276876698369565,
292
+ "logps/rejected": -26.58016665951236,
293
+ "loss": 0.3675,
294
+ "rewards/chosen": 0.40610935377038043,
295
+ "rewards/margins": 1.2144003593346404,
296
+ "rewards/rejected": -0.8082910055642599,
297
+ "step": 160
298
+ },
299
+ {
300
+ "epoch": 5.074074074074074,
301
+ "grad_norm": 3.531503200531006,
302
+ "kl": 3.2827072143554688,
303
+ "learning_rate": 2.803360952452705e-06,
304
+ "logits/chosen": -5859259.2,
305
+ "logits/rejected": -5897748.8,
306
+ "logps/chosen": -10.869898986816406,
307
+ "logps/rejected": -31.441567993164064,
308
+ "loss": 0.3254,
309
+ "rewards/chosen": 0.5347123622894288,
310
+ "rewards/margins": 1.6758206844329835,
311
+ "rewards/rejected": -1.1411083221435547,
312
+ "step": 170
313
+ },
314
+ {
315
+ "epoch": 5.37037037037037,
316
+ "grad_norm": 1.9355641603469849,
317
+ "kl": 6.933535575866699,
318
+ "learning_rate": 2.53966490958702e-06,
319
+ "logits/chosen": -5240330.541176471,
320
+ "logits/rejected": -6676705.28,
321
+ "logps/chosen": -14.63643008961397,
322
+ "logps/rejected": -31.603450520833334,
323
+ "loss": 0.3377,
324
+ "rewards/chosen": 0.4823792401482077,
325
+ "rewards/margins": 1.6440772381950826,
326
+ "rewards/rejected": -1.161697998046875,
327
+ "step": 180
328
+ },
329
+ {
330
+ "epoch": 5.666666666666667,
331
+ "grad_norm": 2.1604580879211426,
332
+ "kl": 0.7298488616943359,
333
+ "learning_rate": 2.275525474225771e-06,
334
+ "logits/chosen": -3473293.7721518986,
335
+ "logits/rejected": -6992762.469135802,
336
+ "logps/chosen": -12.097724238528482,
337
+ "logps/rejected": -32.88131148726852,
338
+ "loss": 0.3295,
339
+ "rewards/chosen": 0.3602371698693384,
340
+ "rewards/margins": 1.7327642099505982,
341
+ "rewards/rejected": -1.3725270400812597,
342
+ "step": 190
343
+ },
344
+ {
345
+ "epoch": 5.962962962962963,
346
+ "grad_norm": 1.2311595678329468,
347
+ "kl": 0.0,
348
+ "learning_rate": 2.013895317751323e-06,
349
+ "logits/chosen": -5824722.75862069,
350
+ "logits/rejected": -6419050.95890411,
351
+ "logps/chosen": -12.355808301903735,
352
+ "logps/rejected": -34.44381421232877,
353
+ "loss": 0.3219,
354
+ "rewards/chosen": 0.4446083156541846,
355
+ "rewards/margins": 1.962539692072207,
356
+ "rewards/rejected": -1.5179313764180222,
357
+ "step": 200
358
+ },
359
+ {
360
+ "epoch": 5.962962962962963,
361
+ "eval_logits/chosen": -3768132.0,
362
+ "eval_logits/rejected": -7958945.777777778,
363
+ "eval_logps/chosen": -14.012957255045572,
364
+ "eval_logps/rejected": -34.88158840603299,
365
+ "eval_loss": 0.30443304777145386,
366
+ "eval_rewards/chosen": 0.6412049929300944,
367
+ "eval_rewards/margins": 1.8774711290995278,
368
+ "eval_rewards/rejected": -1.2362661361694336,
369
+ "eval_runtime": 13.3919,
370
+ "eval_samples_per_second": 4.48,
371
+ "eval_steps_per_second": 2.24,
372
+ "kl": 1.4612131118774414,
373
+ "step": 200
374
+ },
375
+ {
376
+ "epoch": 6.266666666666667,
377
+ "grad_norm": 0.925562858581543,
378
+ "kl": 0.0,
379
+ "learning_rate": 1.7576990616793139e-06,
380
+ "logits/chosen": -4411608.094117647,
381
+ "logits/rejected": -6808450.986666666,
382
+ "logps/chosen": -11.909748391544118,
383
+ "logps/rejected": -36.503896484375,
384
+ "loss": 0.3074,
385
+ "rewards/chosen": 0.45500959508559286,
386
+ "rewards/margins": 2.139190096387676,
387
+ "rewards/rejected": -1.6841805013020834,
388
+ "step": 210
389
+ },
390
+ {
391
+ "epoch": 6.562962962962963,
392
+ "grad_norm": 1.7327585220336914,
393
+ "kl": 0.9251070022583008,
394
+ "learning_rate": 1.509800584902108e-06,
395
+ "logits/chosen": -4351900.049382716,
396
+ "logits/rejected": -7359590.075949367,
397
+ "logps/chosen": -11.922060366030093,
398
+ "logps/rejected": -35.884592563291136,
399
+ "loss": 0.3064,
400
+ "rewards/chosen": 0.49753245600947626,
401
+ "rewards/margins": 2.070165450842348,
402
+ "rewards/rejected": -1.5726329948328719,
403
+ "step": 220
404
+ },
405
+ {
406
+ "epoch": 6.859259259259259,
407
+ "grad_norm": 4.674618721008301,
408
+ "kl": 4.636810302734375,
409
+ "learning_rate": 1.2729710099410802e-06,
410
+ "logits/chosen": -5216737.129411764,
411
+ "logits/rejected": -5893650.773333333,
412
+ "logps/chosen": -11.36463551240809,
413
+ "logps/rejected": -36.05122395833333,
414
+ "loss": 0.2968,
415
+ "rewards/chosen": 0.5570346607881433,
416
+ "rewards/margins": 2.3463927743949142,
417
+ "rewards/rejected": -1.7893581136067709,
418
+ "step": 230
419
+ },
420
+ {
421
+ "epoch": 7.162962962962963,
422
+ "grad_norm": 1.30046808719635,
423
+ "kl": 7.6185832023620605,
424
+ "learning_rate": 1.049857726072005e-06,
425
+ "logits/chosen": -5359469.894736842,
426
+ "logits/rejected": -6327644.952380952,
427
+ "logps/chosen": -12.688995361328125,
428
+ "logps/rejected": -39.67555454799107,
429
+ "loss": 0.2733,
430
+ "rewards/chosen": 0.538983194451583,
431
+ "rewards/margins": 2.463327938452699,
432
+ "rewards/rejected": -1.924344744001116,
433
+ "step": 240
434
+ },
435
+ {
436
+ "epoch": 7.459259259259259,
437
+ "grad_norm": 1.7867010831832886,
438
+ "kl": 0.22487592697143555,
439
+ "learning_rate": 8.4295479559726e-07,
440
+ "logits/chosen": -5227298.285714285,
441
+ "logits/rejected": -6265175.578947368,
442
+ "logps/chosen": -12.42079089936756,
443
+ "logps/rejected": -38.650663677014805,
444
+ "loss": 0.2882,
445
+ "rewards/chosen": 0.5529203414916992,
446
+ "rewards/margins": 2.4198797627499227,
447
+ "rewards/rejected": -1.8669594212582237,
448
+ "step": 250
449
+ },
450
+ {
451
+ "epoch": 7.459259259259259,
452
+ "eval_logits/chosen": -3685730.3333333335,
453
+ "eval_logits/rejected": -7991116.444444444,
454
+ "eval_logps/chosen": -13.086176554361979,
455
+ "eval_logps/rejected": -37.038543701171875,
456
+ "eval_loss": 0.2863730192184448,
457
+ "eval_rewards/chosen": 0.7338830629984537,
458
+ "eval_rewards/margins": 2.1858451101515026,
459
+ "eval_rewards/rejected": -1.451962047153049,
460
+ "eval_runtime": 13.3948,
461
+ "eval_samples_per_second": 4.479,
462
+ "eval_steps_per_second": 2.24,
463
+ "kl": 0.8725299835205078,
464
+ "step": 250
465
+ },
466
+ {
467
+ "epoch": 7.7555555555555555,
468
+ "grad_norm": 9.230957984924316,
469
+ "kl": 9.669639587402344,
470
+ "learning_rate": 6.545750740770338e-07,
471
+ "logits/chosen": -4307272.481927711,
472
+ "logits/rejected": -6674962.285714285,
473
+ "logps/chosen": -9.58886277532003,
474
+ "logps/rejected": -38.9914741020698,
475
+ "loss": 0.2772,
476
+ "rewards/chosen": 0.610903268837067,
477
+ "rewards/margins": 2.649724798869535,
478
+ "rewards/rejected": -2.0388215300324677,
479
+ "step": 260
480
+ },
481
+ {
482
+ "epoch": 8.059259259259258,
483
+ "grad_norm": 1.193448781967163,
484
+ "kl": 3.7342185974121094,
485
+ "learning_rate": 4.868243561723535e-07,
486
+ "logits/chosen": -5346473.643835616,
487
+ "logits/rejected": -6204017.287356322,
488
+ "logps/chosen": -14.12191647046233,
489
+ "logps/rejected": -39.27419517780172,
490
+ "loss": 0.2837,
491
+ "rewards/chosen": 0.40948598678797893,
492
+ "rewards/margins": 2.4313709488479645,
493
+ "rewards/rejected": -2.0218849620599855,
494
+ "step": 270
495
+ },
496
+ {
497
+ "epoch": 8.355555555555556,
498
+ "grad_norm": 1.557085633277893,
499
+ "kl": 0.0,
500
+ "learning_rate": 3.4157783610952263e-07,
501
+ "logits/chosen": -5714791.696202531,
502
+ "logits/rejected": -6967618.37037037,
503
+ "logps/chosen": -12.57650254647943,
504
+ "logps/rejected": -41.757679880401234,
505
+ "loss": 0.2867,
506
+ "rewards/chosen": 0.3982359004926078,
507
+ "rewards/margins": 2.571001561661589,
508
+ "rewards/rejected": -2.1727656611689814,
509
+ "step": 280
510
+ },
511
+ {
512
+ "epoch": 8.651851851851852,
513
+ "grad_norm": 1.3718146085739136,
514
+ "kl": 2.2029647827148438,
515
+ "learning_rate": 2.2045914590165252e-07,
516
+ "logits/chosen": -4378275.720930233,
517
+ "logits/rejected": -6295708.972972973,
518
+ "logps/chosen": -12.955159032067588,
519
+ "logps/rejected": -36.683788402660475,
520
+ "loss": 0.2996,
521
+ "rewards/chosen": 0.5802536010742188,
522
+ "rewards/margins": 2.36948992754962,
523
+ "rewards/rejected": -1.7892363264754012,
524
+ "step": 290
525
+ },
526
+ {
527
+ "epoch": 8.948148148148148,
528
+ "grad_norm": 2.0397913455963135,
529
+ "kl": 0.0,
530
+ "learning_rate": 1.2482220564763669e-07,
531
+ "logits/chosen": -5213185.488372093,
532
+ "logits/rejected": -6215361.72972973,
533
+ "logps/chosen": -9.879340593204942,
534
+ "logps/rejected": -41.936137431376686,
535
+ "loss": 0.2574,
536
+ "rewards/chosen": 0.6615725228952807,
537
+ "rewards/margins": 2.8659850347723923,
538
+ "rewards/rejected": -2.2044125118771114,
539
+ "step": 300
540
+ },
541
+ {
542
+ "epoch": 8.948148148148148,
543
+ "eval_logits/chosen": -3652706.0,
544
+ "eval_logits/rejected": -7997593.777777778,
545
+ "eval_logps/chosen": -12.900739034016928,
546
+ "eval_logps/rejected": -37.902974446614586,
547
+ "eval_loss": 0.2807026207447052,
548
+ "eval_rewards/chosen": 0.7524267832438151,
549
+ "eval_rewards/margins": 2.2908316718207464,
550
+ "eval_rewards/rejected": -1.5384048885769315,
551
+ "eval_runtime": 13.382,
552
+ "eval_samples_per_second": 4.484,
553
+ "eval_steps_per_second": 2.242,
554
+ "kl": 0.663330078125,
555
+ "step": 300
556
+ },
557
+ {
558
+ "epoch": 9.251851851851852,
559
+ "grad_norm": 1.2545690536499023,
560
+ "kl": 6.414087772369385,
561
+ "learning_rate": 5.573608879422876e-08,
562
+ "logits/chosen": -4187727.36,
563
+ "logits/rejected": -7287492.517647059,
564
+ "logps/chosen": -12.690235188802083,
565
+ "logps/rejected": -43.10070657169118,
566
+ "loss": 0.2534,
567
+ "rewards/chosen": 0.5371046956380209,
568
+ "rewards/margins": 2.892158586090686,
569
+ "rewards/rejected": -2.355053890452665,
570
+ "step": 310
571
+ },
572
+ {
573
+ "epoch": 9.548148148148147,
574
+ "grad_norm": 2.7359890937805176,
575
+ "kl": 0.0,
576
+ "learning_rate": 1.3973071544233219e-08,
577
+ "logits/chosen": -4982654.379746836,
578
+ "logits/rejected": -7114389.333333333,
579
+ "logps/chosen": -9.849223849139635,
580
+ "logps/rejected": -36.88050672743056,
581
+ "loss": 0.2753,
582
+ "rewards/chosen": 0.7713173492045342,
583
+ "rewards/margins": 2.553770721359241,
584
+ "rewards/rejected": -1.782453372154707,
585
+ "step": 320
586
+ },
587
+ {
588
+ "epoch": 9.844444444444445,
589
+ "grad_norm": 1.730713129043579,
590
+ "kl": 0.3280200958251953,
591
+ "learning_rate": 0.0,
592
+ "logits/chosen": -4839422.528735632,
593
+ "logits/rejected": -6777751.671232876,
594
+ "logps/chosen": -11.5740931719199,
595
+ "logps/rejected": -41.290196248929796,
596
+ "loss": 0.2887,
597
+ "rewards/chosen": 0.46058505705033226,
598
+ "rewards/margins": 2.5596534766243906,
599
+ "rewards/rejected": -2.099068419574058,
600
+ "step": 330
601
+ },
602
+ {
603
+ "epoch": 9.844444444444445,
604
+ "step": 330,
605
+ "total_flos": 2.9519334394822656e+16,
606
+ "train_loss": 0.3735494534174601,
607
+ "train_runtime": 1879.5093,
608
+ "train_samples_per_second": 2.873,
609
+ "train_steps_per_second": 0.176
610
+ }
611
+ ],
612
+ "logging_steps": 10,
613
+ "max_steps": 330,
614
+ "num_input_tokens_seen": 0,
615
+ "num_train_epochs": 10,
616
+ "save_steps": 50,
617
+ "stateful_callbacks": {
618
+ "TrainerControl": {
619
+ "args": {
620
+ "should_epoch_stop": false,
621
+ "should_evaluate": false,
622
+ "should_log": false,
623
+ "should_save": true,
624
+ "should_training_stop": true
625
+ },
626
+ "attributes": {}
627
+ }
628
+ },
629
+ "total_flos": 2.9519334394822656e+16,
630
+ "train_batch_size": 2,
631
+ "trial_name": null,
632
+ "trial_params": null
633
+ }
training_eval_loss.png ADDED
training_loss.png ADDED
training_rewards_chosen.png ADDED