ggbetz commited on
Commit
f039fdc
·
verified ·
1 Parent(s): d6ed6cd

Model save

Browse files
README.md CHANGED
@@ -3,8 +3,8 @@ library_name: transformers
3
  model_name: Phi-4-Argunaut-1-SPIN-dev1
4
  tags:
5
  - generated_from_trainer
6
- - dpo
7
  - trl
 
8
  licence: license
9
  ---
10
 
@@ -26,7 +26,7 @@ print(output["generated_text"])
26
 
27
  ## Training procedure
28
 
29
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/ggbetz/argunauts-training/runs/lp42hqok)
30
 
31
 
32
  This model was trained with DPO, a method introduced in [Direct Preference Optimization: Your Language Model is Secretly a Reward Model](https://huggingface.co/papers/2305.18290).
 
3
  model_name: Phi-4-Argunaut-1-SPIN-dev1
4
  tags:
5
  - generated_from_trainer
 
6
  - trl
7
+ - dpo
8
  licence: license
9
  ---
10
 
 
26
 
27
  ## Training procedure
28
 
29
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/ggbetz/argunauts-training/runs/1z2x9t5q)
30
 
31
 
32
  This model was trained with DPO, a method introduced in [Direct Preference Optimization: Your Language Model is Secretly a Reward Model](https://huggingface.co/papers/2305.18290).
all_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 2.0,
3
  "total_flos": 0.0,
4
- "train_loss": 0.2171997334594014,
5
- "train_runtime": 3612.0435,
6
- "train_samples": 5539,
7
- "train_samples_per_second": 3.067,
8
- "train_steps_per_second": 0.096
9
  }
 
1
  {
2
  "epoch": 2.0,
3
  "total_flos": 0.0,
4
+ "train_loss": 0.2867361557407257,
5
+ "train_runtime": 4280.6612,
6
+ "train_samples": 4989,
7
+ "train_samples_per_second": 2.331,
8
+ "train_steps_per_second": 0.073
9
  }
model-00001-of-00006.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:65af75d35af07125e0b3bf017ec4b398b2c9067b98bc203ee5bd5a5530d92dde
3
  size 4933658528
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b0ea56085ffd706522d837bfb07f83c14f5c8f44b6a9d0b7bd8a8e3415ecd23
3
  size 4933658528
model-00002-of-00006.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a57a273238f9c42f1e4337b634935fa7ae3333eac99365e9421f99aa39f669cb
3
  size 4954693112
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:01b353ed4c7422d6295216f7e1051e831472098cfb6791449260aafef46f81be
3
  size 4954693112
model-00003-of-00006.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9ab8312ba59f2e7ab3e2f2f7c5d016179601e7860f6886555ec50eb95c548e3c
3
  size 4902243992
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f2aae183aabcb2eb1314dc5d0edad88afe52180500c5e6dd25fa6724f9d652ec
3
  size 4902243992
model-00004-of-00006.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fa6495e2b7b3faabfb2521d489d3e1fed352d9d566a514e89bb9ae96b8cd14c7
3
  size 4954672440
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d3f14dc985389b4cc36d84d5ace751de00be19b2f1797b535d3aa423b1918cc
3
  size 4954672440
model-00005-of-00006.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c295a408d94561912d38721aaf824b7d5691438239174cd46cb106894ae64447
3
  size 4954672432
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b43e6befb4e7258089513d8d10dd095913be27ab8012a0e7b5fafdffebf0f8f2
3
  size 4954672432
model-00006-of-00006.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5e270bbcc9a2da497d6b4a6e7c2c2fd1f1fd2c370e229639cf240ddc36df6491
3
  size 4619116224
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c20917ca384f86d677f242d644c35343daa7effe81cdb5cda39ad5262a22e1f7
3
  size 4619116224
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 2.0,
3
  "total_flos": 0.0,
4
- "train_loss": 0.2171997334594014,
5
- "train_runtime": 3612.0435,
6
- "train_samples": 5539,
7
- "train_samples_per_second": 3.067,
8
- "train_steps_per_second": 0.096
9
  }
 
1
  {
2
  "epoch": 2.0,
3
  "total_flos": 0.0,
4
+ "train_loss": 0.2867361557407257,
5
+ "train_runtime": 4280.6612,
6
+ "train_samples": 4989,
7
+ "train_samples_per_second": 2.331,
8
+ "train_steps_per_second": 0.073
9
  }
trainer_state.json CHANGED
@@ -4,1058 +4,953 @@
4
  "best_model_checkpoint": null,
5
  "epoch": 2.0,
6
  "eval_steps": 500,
7
- "global_step": 348,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "epoch": 0.02888086642599278,
14
- "grad_norm": 11.422099862509954,
15
- "learning_rate": 1.111111111111111e-07,
16
- "logits/chosen": -2.3837890625,
17
- "logits/rejected": -2.0541014671325684,
18
- "logps/chosen": -307.1343688964844,
19
- "logps/rejected": -332.64373779296875,
20
- "loss": 0.3475,
21
- "rewards/accuracies": 0.8125,
22
- "rewards/chosen": 2.523876905441284,
23
- "rewards/margins": 4.429638862609863,
24
- "rewards/rejected": -1.906103491783142,
25
  "step": 5
26
  },
27
  {
28
- "epoch": 0.05776173285198556,
29
- "grad_norm": 9.346708877153914,
30
- "learning_rate": 2.5e-07,
31
- "logits/chosen": -2.3408203125,
32
- "logits/rejected": -2.0601563453674316,
33
- "logps/chosen": -318.1000061035156,
34
- "logps/rejected": -406.9624938964844,
35
- "loss": 0.3921,
36
  "rewards/accuracies": 0.7875000238418579,
37
- "rewards/chosen": 2.3999266624450684,
38
- "rewards/margins": 3.9242920875549316,
39
- "rewards/rejected": -1.5220947265625,
40
  "step": 10
41
  },
42
  {
43
- "epoch": 0.08664259927797834,
44
- "grad_norm": 21.1645613752464,
45
- "learning_rate": 3.888888888888889e-07,
46
- "logits/chosen": -2.418164014816284,
47
- "logits/rejected": -2.087695360183716,
48
- "logps/chosen": -357.48748779296875,
49
- "logps/rejected": -464.01873779296875,
50
- "loss": 0.3932,
51
- "rewards/accuracies": 0.793749988079071,
52
- "rewards/chosen": 1.938085913658142,
53
- "rewards/margins": 3.9061522483825684,
54
- "rewards/rejected": -1.965234398841858,
55
  "step": 15
56
  },
57
  {
58
- "epoch": 0.11552346570397112,
59
- "grad_norm": 12.894608491427922,
60
- "learning_rate": 4.984848484848485e-07,
61
- "logits/chosen": -2.3189454078674316,
62
- "logits/rejected": -2.0580077171325684,
63
- "logps/chosen": -341.4125061035156,
64
- "logps/rejected": -354.140625,
65
- "loss": 0.3856,
66
  "rewards/accuracies": 0.800000011920929,
67
- "rewards/chosen": 2.5682129859924316,
68
- "rewards/margins": 4.517675876617432,
69
- "rewards/rejected": -1.949121117591858,
70
  "step": 20
71
  },
72
  {
73
- "epoch": 0.1444043321299639,
74
- "grad_norm": 12.760881593467248,
75
- "learning_rate": 4.909090909090909e-07,
76
- "logits/chosen": -2.3501954078674316,
77
- "logits/rejected": -2.049023389816284,
78
- "logps/chosen": -316.30938720703125,
79
- "logps/rejected": -418.25,
80
- "loss": 0.2819,
81
- "rewards/accuracies": 0.84375,
82
- "rewards/chosen": 2.112988233566284,
83
- "rewards/margins": 4.386523246765137,
84
- "rewards/rejected": -2.271484375,
85
  "step": 25
86
  },
87
  {
88
- "epoch": 0.17328519855595667,
89
- "grad_norm": 12.368541660564347,
90
- "learning_rate": 4.833333333333333e-07,
91
- "logits/chosen": -2.4140625,
92
- "logits/rejected": -2.116992235183716,
93
- "logps/chosen": -292.2124938964844,
94
- "logps/rejected": -485.0062561035156,
95
- "loss": 0.3626,
96
- "rewards/accuracies": 0.7875000238418579,
97
- "rewards/chosen": 2.282482862472534,
98
- "rewards/margins": 4.206860542297363,
99
- "rewards/rejected": -1.926367163658142,
100
  "step": 30
101
  },
102
  {
103
- "epoch": 0.20216606498194944,
104
- "grad_norm": 17.755478161153214,
105
- "learning_rate": 4.7575757575757574e-07,
106
- "logits/chosen": -2.3724608421325684,
107
- "logits/rejected": -2.0888671875,
108
- "logps/chosen": -338.65313720703125,
109
- "logps/rejected": -413.77813720703125,
110
- "loss": 0.3419,
111
- "rewards/accuracies": 0.7875000238418579,
112
- "rewards/chosen": 2.4981446266174316,
113
- "rewards/margins": 4.247143745422363,
114
- "rewards/rejected": -1.750341773033142,
115
  "step": 35
116
  },
117
  {
118
- "epoch": 0.23104693140794225,
119
- "grad_norm": 26.014183301393516,
120
- "learning_rate": 4.681818181818182e-07,
121
- "logits/chosen": -2.3197264671325684,
122
- "logits/rejected": -2.044921875,
123
- "logps/chosen": -310.12420654296875,
124
- "logps/rejected": -373.03436279296875,
125
- "loss": 0.3871,
126
- "rewards/accuracies": 0.831250011920929,
127
- "rewards/chosen": 2.195751905441284,
128
- "rewards/margins": 4.500244140625,
129
- "rewards/rejected": -2.305126905441284,
130
  "step": 40
131
  },
132
  {
133
- "epoch": 0.259927797833935,
134
- "grad_norm": 38.57908256706837,
135
- "learning_rate": 4.606060606060606e-07,
136
- "logits/chosen": -2.3687500953674316,
137
- "logits/rejected": -2.128124952316284,
138
- "logps/chosen": -303.8999938964844,
139
- "logps/rejected": -405.3687438964844,
140
- "loss": 0.3248,
141
- "rewards/accuracies": 0.8500000238418579,
142
- "rewards/chosen": 2.470898389816284,
143
- "rewards/margins": 4.250390529632568,
144
- "rewards/rejected": -1.774694800376892,
145
  "step": 45
146
  },
147
  {
148
- "epoch": 0.2888086642599278,
149
- "grad_norm": 32.89523848493287,
150
- "learning_rate": 4.53030303030303e-07,
151
- "logits/chosen": -2.338085889816284,
152
- "logits/rejected": -2.116406202316284,
153
- "logps/chosen": -323.234375,
154
- "logps/rejected": -559.8218994140625,
155
- "loss": 0.2769,
156
- "rewards/accuracies": 0.856249988079071,
157
- "rewards/chosen": 2.2285523414611816,
158
- "rewards/margins": 4.183837890625,
159
- "rewards/rejected": -1.955078125,
160
  "step": 50
161
  },
162
  {
163
- "epoch": 0.3176895306859206,
164
- "grad_norm": 11.920071723095447,
165
- "learning_rate": 4.4545454545454544e-07,
166
- "logits/chosen": -2.395312547683716,
167
- "logits/rejected": -2.0990233421325684,
168
- "logps/chosen": -304.8843688964844,
169
- "logps/rejected": -417.2437438964844,
170
- "loss": 0.3615,
171
- "rewards/accuracies": 0.84375,
172
- "rewards/chosen": 2.8320069313049316,
173
- "rewards/margins": 6.729589939117432,
174
- "rewards/rejected": -3.8976807594299316,
175
  "step": 55
176
  },
177
  {
178
- "epoch": 0.34657039711191334,
179
- "grad_norm": 10.026225637915674,
180
- "learning_rate": 4.3787878787878784e-07,
181
- "logits/chosen": -2.390820264816284,
182
- "logits/rejected": -2.187304735183716,
183
- "logps/chosen": -341.2906188964844,
184
- "logps/rejected": -386.2421875,
185
- "loss": 0.3895,
186
- "rewards/accuracies": 0.8812500238418579,
187
- "rewards/chosen": 2.069775342941284,
188
- "rewards/margins": 4.311621189117432,
189
- "rewards/rejected": -2.2381834983825684,
190
  "step": 60
191
  },
192
  {
193
- "epoch": 0.37545126353790614,
194
- "grad_norm": 65.80915152647563,
195
- "learning_rate": 4.303030303030303e-07,
196
- "logits/chosen": -2.3089842796325684,
197
- "logits/rejected": -2.0843749046325684,
198
- "logps/chosen": -377.59375,
199
- "logps/rejected": -469.63751220703125,
200
- "loss": 0.2278,
201
- "rewards/accuracies": 0.918749988079071,
202
- "rewards/chosen": 1.940588355064392,
203
- "rewards/margins": 5.038866996765137,
204
- "rewards/rejected": -3.0979981422424316,
205
  "step": 65
206
  },
207
  {
208
- "epoch": 0.4043321299638989,
209
- "grad_norm": 19.49794529070635,
210
- "learning_rate": 4.227272727272727e-07,
211
- "logits/chosen": -2.387500047683716,
212
- "logits/rejected": -2.1294922828674316,
213
- "logps/chosen": -283.0062561035156,
214
- "logps/rejected": -691.9249877929688,
215
- "loss": 0.3085,
216
- "rewards/accuracies": 0.893750011920929,
217
- "rewards/chosen": 1.929785132408142,
218
- "rewards/margins": 4.166113376617432,
219
- "rewards/rejected": -2.23785400390625,
220
  "step": 70
221
  },
222
  {
223
- "epoch": 0.4332129963898917,
224
- "grad_norm": 11.28128140051103,
225
- "learning_rate": 4.1515151515151513e-07,
226
- "logits/chosen": -2.400195360183716,
227
- "logits/rejected": -2.0947265625,
228
- "logps/chosen": -291.79766845703125,
229
- "logps/rejected": -405.3843688964844,
230
- "loss": 0.2438,
231
- "rewards/accuracies": 0.9125000238418579,
232
- "rewards/chosen": 2.5198731422424316,
233
- "rewards/margins": 4.618750095367432,
234
- "rewards/rejected": -2.0975584983825684,
235
  "step": 75
236
  },
237
  {
238
- "epoch": 0.4620938628158845,
239
- "grad_norm": 19.355642911850794,
240
- "learning_rate": 4.075757575757576e-07,
241
- "logits/chosen": -2.400390625,
242
- "logits/rejected": -2.083789110183716,
243
- "logps/chosen": -331.4624938964844,
244
- "logps/rejected": -432.2875061035156,
245
- "loss": 0.1975,
246
- "rewards/accuracies": 0.9312499761581421,
247
- "rewards/chosen": 2.271533250808716,
248
- "rewards/margins": 5.00537109375,
249
- "rewards/rejected": -2.7354493141174316,
250
  "step": 80
251
  },
252
  {
253
- "epoch": 0.49097472924187724,
254
- "grad_norm": 22.848861871157133,
255
- "learning_rate": 4e-07,
256
- "logits/chosen": -2.3587889671325684,
257
- "logits/rejected": -2.143749952316284,
258
- "logps/chosen": -339.46563720703125,
259
- "logps/rejected": -404.6968688964844,
260
- "loss": 0.2871,
261
- "rewards/accuracies": 0.862500011920929,
262
- "rewards/chosen": 2.1012206077575684,
263
- "rewards/margins": 4.184765815734863,
264
- "rewards/rejected": -2.082202196121216,
265
  "step": 85
266
  },
267
  {
268
- "epoch": 0.51985559566787,
269
- "grad_norm": 17.73928799440602,
270
- "learning_rate": 3.924242424242424e-07,
271
- "logits/chosen": -2.3271484375,
272
- "logits/rejected": -2.0726561546325684,
273
- "logps/chosen": -289.8374938964844,
274
- "logps/rejected": -502.09063720703125,
275
- "loss": 0.196,
276
- "rewards/accuracies": 0.918749988079071,
277
- "rewards/chosen": 2.123095750808716,
278
- "rewards/margins": 4.687890529632568,
279
- "rewards/rejected": -2.5654053688049316,
280
  "step": 90
281
  },
282
  {
283
- "epoch": 0.5487364620938628,
284
- "grad_norm": 44.35578228072781,
285
- "learning_rate": 3.8484848484848483e-07,
286
- "logits/chosen": -2.315624952316284,
287
- "logits/rejected": -2.096874952316284,
288
- "logps/chosen": -390.10626220703125,
289
- "logps/rejected": -399.70001220703125,
290
- "loss": 0.3248,
291
- "rewards/accuracies": 0.893750011920929,
292
- "rewards/chosen": 2.130859375,
293
- "rewards/margins": 4.269433498382568,
294
- "rewards/rejected": -2.1373534202575684,
295
  "step": 95
296
  },
297
  {
298
- "epoch": 0.5776173285198556,
299
- "grad_norm": 13.134481184446855,
300
- "learning_rate": 3.7727272727272723e-07,
301
- "logits/chosen": -2.3763670921325684,
302
- "logits/rejected": -2.126953125,
303
- "logps/chosen": -337.42498779296875,
304
- "logps/rejected": -402.796875,
305
- "loss": 0.2493,
306
- "rewards/accuracies": 0.9125000238418579,
307
- "rewards/chosen": 2.370715379714966,
308
- "rewards/margins": 4.451855659484863,
309
- "rewards/rejected": -2.0814452171325684,
310
  "step": 100
311
  },
312
  {
313
- "epoch": 0.6064981949458483,
314
- "grad_norm": 15.148568274887294,
315
- "learning_rate": 3.696969696969697e-07,
316
- "logits/chosen": -2.375,
317
- "logits/rejected": -2.1224608421325684,
318
- "logps/chosen": -331.1499938964844,
319
- "logps/rejected": -426.40936279296875,
320
- "loss": 0.2124,
321
- "rewards/accuracies": 0.918749988079071,
322
- "rewards/chosen": 2.0746092796325684,
323
- "rewards/margins": 4.863574028015137,
324
- "rewards/rejected": -2.7891478538513184,
325
  "step": 105
326
  },
327
  {
328
- "epoch": 0.6353790613718412,
329
- "grad_norm": 13.710557758939832,
330
- "learning_rate": 3.6212121212121213e-07,
331
- "logits/chosen": -2.283203125,
332
- "logits/rejected": -2.1001954078674316,
333
- "logps/chosen": -349.61248779296875,
334
- "logps/rejected": -440.45623779296875,
335
- "loss": 0.2573,
336
- "rewards/accuracies": 0.90625,
337
- "rewards/chosen": 1.9659302234649658,
338
- "rewards/margins": 4.624316215515137,
339
- "rewards/rejected": -2.6556639671325684,
340
  "step": 110
341
  },
342
  {
343
- "epoch": 0.6642599277978339,
344
- "grad_norm": 64.69545993471166,
345
- "learning_rate": 3.545454545454545e-07,
346
- "logits/chosen": -2.366406202316284,
347
- "logits/rejected": -2.0941405296325684,
348
- "logps/chosen": -316.7437438964844,
349
- "logps/rejected": -610.2750244140625,
350
- "loss": 0.2322,
351
- "rewards/accuracies": 0.9125000238418579,
352
- "rewards/chosen": 1.97076416015625,
353
- "rewards/margins": 4.702880859375,
354
- "rewards/rejected": -2.729736328125,
355
  "step": 115
356
  },
357
  {
358
- "epoch": 0.6931407942238267,
359
- "grad_norm": 28.158605692193273,
360
- "learning_rate": 3.46969696969697e-07,
361
- "logits/chosen": -2.3326172828674316,
362
- "logits/rejected": -2.194140672683716,
363
- "logps/chosen": -422.3999938964844,
364
- "logps/rejected": -384.75,
365
- "loss": 0.3542,
366
- "rewards/accuracies": 0.8999999761581421,
367
- "rewards/chosen": 2.1262450218200684,
368
- "rewards/margins": 4.404687404632568,
369
- "rewards/rejected": -2.278759717941284,
370
  "step": 120
371
  },
372
  {
373
- "epoch": 0.7220216606498195,
374
- "grad_norm": 54.048997407012784,
375
- "learning_rate": 3.393939393939394e-07,
376
- "logits/chosen": -2.353515625,
377
- "logits/rejected": -2.118945360183716,
378
- "logps/chosen": -304.53125,
379
- "logps/rejected": -315.09375,
380
- "loss": 0.2045,
381
- "rewards/accuracies": 0.949999988079071,
382
- "rewards/chosen": 2.14715576171875,
383
- "rewards/margins": 4.737890720367432,
384
- "rewards/rejected": -2.5897459983825684,
385
  "step": 125
386
  },
387
  {
388
- "epoch": 0.7509025270758123,
389
- "grad_norm": 11.276984222127565,
390
- "learning_rate": 3.318181818181818e-07,
391
- "logits/chosen": -2.3589844703674316,
392
- "logits/rejected": -2.185546875,
393
- "logps/chosen": -403.38751220703125,
394
- "logps/rejected": -515.4000244140625,
395
- "loss": 0.3345,
396
- "rewards/accuracies": 0.8812500238418579,
397
- "rewards/chosen": 1.581787109375,
398
- "rewards/margins": 4.064501762390137,
399
- "rewards/rejected": -2.4795165061950684,
400
  "step": 130
401
  },
402
  {
403
- "epoch": 0.779783393501805,
404
- "grad_norm": 14.89194313601513,
405
- "learning_rate": 3.242424242424242e-07,
406
- "logits/chosen": -2.359570264816284,
407
- "logits/rejected": -2.1517577171325684,
408
- "logps/chosen": -338.0874938964844,
409
- "logps/rejected": -509.6312561035156,
410
- "loss": 0.2248,
411
- "rewards/accuracies": 0.9125000238418579,
412
- "rewards/chosen": 2.08935546875,
413
- "rewards/margins": 4.466210842132568,
414
- "rewards/rejected": -2.376904249191284,
415
  "step": 135
416
  },
417
  {
418
- "epoch": 0.8086642599277978,
419
- "grad_norm": 14.736139465045111,
420
- "learning_rate": 3.166666666666666e-07,
421
- "logits/chosen": -2.36328125,
422
- "logits/rejected": -2.017773389816284,
423
- "logps/chosen": -294.9203186035156,
424
- "logps/rejected": -433.78125,
425
- "loss": 0.166,
426
- "rewards/accuracies": 0.956250011920929,
427
- "rewards/chosen": 2.3936524391174316,
428
- "rewards/margins": 5.378954887390137,
429
- "rewards/rejected": -2.98095703125,
430
  "step": 140
431
  },
432
  {
433
- "epoch": 0.8375451263537906,
434
- "grad_norm": 16.93771836336577,
435
- "learning_rate": 3.0909090909090907e-07,
436
- "logits/chosen": -2.420117139816284,
437
- "logits/rejected": -2.133984327316284,
438
- "logps/chosen": -326.1499938964844,
439
- "logps/rejected": -514.8312377929688,
440
- "loss": 0.1949,
441
- "rewards/accuracies": 0.956250011920929,
442
- "rewards/chosen": 2.119824171066284,
443
- "rewards/margins": 4.917089939117432,
444
- "rewards/rejected": -2.796398878097534,
445
  "step": 145
446
  },
447
  {
448
- "epoch": 0.8664259927797834,
449
- "grad_norm": 11.49584268590893,
450
- "learning_rate": 3.015151515151515e-07,
451
- "logits/chosen": -2.2669920921325684,
452
- "logits/rejected": -2.0160155296325684,
453
- "logps/chosen": -373.2562561035156,
454
- "logps/rejected": -530.3125,
455
- "loss": 0.1704,
456
- "rewards/accuracies": 0.925000011920929,
457
- "rewards/chosen": 2.597363233566284,
458
- "rewards/margins": 6.016992092132568,
459
- "rewards/rejected": -3.421142578125,
460
  "step": 150
461
  },
462
  {
463
- "epoch": 0.8953068592057761,
464
- "grad_norm": 13.443716803588206,
465
- "learning_rate": 2.939393939393939e-07,
466
- "logits/chosen": -2.3427734375,
467
- "logits/rejected": -2.0787110328674316,
468
- "logps/chosen": -363.0249938964844,
469
- "logps/rejected": -542.6749877929688,
470
- "loss": 0.1253,
471
- "rewards/accuracies": 0.956250011920929,
472
- "rewards/chosen": 1.879052758216858,
473
- "rewards/margins": 5.240624904632568,
474
- "rewards/rejected": -3.363940477371216,
475
  "step": 155
476
  },
477
  {
478
- "epoch": 0.924187725631769,
479
- "grad_norm": 34.79241720252881,
480
- "learning_rate": 2.8636363636363637e-07,
481
- "logits/chosen": -2.349414110183716,
482
- "logits/rejected": -2.1187500953674316,
483
- "logps/chosen": -329.6875,
484
- "logps/rejected": -474.125,
485
- "loss": 0.2088,
486
- "rewards/accuracies": 0.949999988079071,
487
- "rewards/chosen": 2.0616211891174316,
488
- "rewards/margins": 4.8037109375,
489
- "rewards/rejected": -2.7415528297424316,
490
  "step": 160
491
  },
492
  {
493
- "epoch": 0.9530685920577617,
494
- "grad_norm": 17.4218945141771,
495
- "learning_rate": 2.787878787878788e-07,
496
- "logits/chosen": -2.4146485328674316,
497
- "logits/rejected": -2.116015672683716,
498
- "logps/chosen": -353.875,
499
- "logps/rejected": -560.4375,
500
- "loss": 0.1459,
501
- "rewards/accuracies": 0.949999988079071,
502
- "rewards/chosen": 2.0111327171325684,
503
- "rewards/margins": 5.4267578125,
504
- "rewards/rejected": -3.4139647483825684,
505
  "step": 165
506
  },
507
  {
508
- "epoch": 0.9819494584837545,
509
- "grad_norm": 7.763570540153691,
510
- "learning_rate": 2.712121212121212e-07,
511
- "logits/chosen": -2.291796922683716,
512
- "logits/rejected": -2.0296874046325684,
513
- "logps/chosen": -390.41876220703125,
514
- "logps/rejected": -468.45623779296875,
515
- "loss": 0.1389,
516
- "rewards/accuracies": 0.949999988079071,
517
- "rewards/chosen": 2.46728515625,
518
- "rewards/margins": 6.066796779632568,
519
- "rewards/rejected": -3.6032471656799316,
520
  "step": 170
521
  },
522
  {
523
- "epoch": 1.0057761732851986,
524
- "grad_norm": 5.250963625120634,
525
- "learning_rate": 2.636363636363636e-07,
526
- "logits/chosen": -2.3120265007019043,
527
- "logits/rejected": -2.04237699508667,
528
- "logps/chosen": -427.3636474609375,
529
- "logps/rejected": -674.1060791015625,
530
- "loss": 0.1323,
531
- "rewards/accuracies": 0.9217172265052795,
532
- "rewards/chosen": 2.187159776687622,
533
- "rewards/margins": 6.69081449508667,
534
- "rewards/rejected": -4.504527568817139,
535
  "step": 175
536
  },
537
  {
538
- "epoch": 1.0346570397111914,
539
- "grad_norm": 7.717889023853925,
540
- "learning_rate": 2.56060606060606e-07,
541
- "logits/chosen": -2.347460985183716,
542
- "logits/rejected": -2.016796827316284,
543
- "logps/chosen": -360.2875061035156,
544
- "logps/rejected": -712.875,
545
- "loss": 0.1385,
546
- "rewards/accuracies": 0.949999988079071,
547
- "rewards/chosen": 1.92120361328125,
548
- "rewards/margins": 6.072070121765137,
549
- "rewards/rejected": -4.151269435882568,
550
  "step": 180
551
  },
552
  {
553
- "epoch": 1.0635379061371841,
554
- "grad_norm": 8.349982709565937,
555
- "learning_rate": 2.4848484848484846e-07,
556
- "logits/chosen": -2.275195360183716,
557
- "logits/rejected": -2.0523438453674316,
558
- "logps/chosen": -386.3671875,
559
- "logps/rejected": -524.875,
560
- "loss": 0.2285,
561
- "rewards/accuracies": 0.9375,
562
- "rewards/chosen": 2.014880418777466,
563
- "rewards/margins": 5.687792778015137,
564
- "rewards/rejected": -3.669323682785034,
565
  "step": 185
566
  },
567
  {
568
- "epoch": 1.0924187725631769,
569
- "grad_norm": 32.542684480595675,
570
- "learning_rate": 2.409090909090909e-07,
571
- "logits/chosen": -2.368945360183716,
572
- "logits/rejected": -2.0582032203674316,
573
- "logps/chosen": -374.5289001464844,
574
- "logps/rejected": -360.23126220703125,
575
- "loss": 0.2207,
576
- "rewards/accuracies": 0.9125000238418579,
577
- "rewards/chosen": 1.8182861804962158,
578
- "rewards/margins": 5.199414253234863,
579
- "rewards/rejected": -3.3819823265075684,
580
  "step": 190
581
  },
582
  {
583
- "epoch": 1.1212996389891696,
584
- "grad_norm": 5.443641635105693,
585
- "learning_rate": 2.3333333333333333e-07,
586
- "logits/chosen": -2.3607420921325684,
587
- "logits/rejected": -2.0083985328674316,
588
- "logps/chosen": -354.98126220703125,
589
- "logps/rejected": -578.9874877929688,
590
- "loss": 0.1568,
591
- "rewards/accuracies": 0.9437500238418579,
592
- "rewards/chosen": 2.081469774246216,
593
- "rewards/margins": 6.055468559265137,
594
- "rewards/rejected": -3.973706007003784,
595
  "step": 195
596
  },
597
  {
598
- "epoch": 1.1501805054151624,
599
- "grad_norm": 29.210296481306564,
600
- "learning_rate": 2.2575757575757576e-07,
601
- "logits/chosen": -2.4625000953674316,
602
- "logits/rejected": -2.1500000953674316,
603
- "logps/chosen": -270.30938720703125,
604
- "logps/rejected": -522.21875,
605
- "loss": 0.1728,
606
- "rewards/accuracies": 0.918749988079071,
607
- "rewards/chosen": 1.7206542491912842,
608
- "rewards/margins": 5.392187595367432,
609
- "rewards/rejected": -3.6748046875,
610
  "step": 200
611
  },
612
  {
613
- "epoch": 1.1790613718411551,
614
- "grad_norm": 8.507315599940252,
615
- "learning_rate": 2.1818181818181815e-07,
616
- "logits/chosen": -2.3212890625,
617
- "logits/rejected": -2.050585985183716,
618
- "logps/chosen": -347.82501220703125,
619
- "logps/rejected": -477.4375,
620
- "loss": 0.238,
621
- "rewards/accuracies": 0.956250011920929,
622
- "rewards/chosen": 2.0087647438049316,
623
- "rewards/margins": 5.121484279632568,
624
- "rewards/rejected": -3.111254930496216,
625
  "step": 205
626
  },
627
  {
628
- "epoch": 1.207942238267148,
629
- "grad_norm": 11.12133440056853,
630
- "learning_rate": 2.106060606060606e-07,
631
- "logits/chosen": -2.299609422683716,
632
- "logits/rejected": -2.0796875953674316,
633
- "logps/chosen": -352.25469970703125,
634
- "logps/rejected": -428.83905029296875,
635
- "loss": 0.2224,
636
- "rewards/accuracies": 0.887499988079071,
637
- "rewards/chosen": 1.560888648033142,
638
- "rewards/margins": 4.144775390625,
639
- "rewards/rejected": -2.5865235328674316,
640
  "step": 210
641
  },
642
  {
643
- "epoch": 1.2368231046931408,
644
- "grad_norm": 7.196990053018469,
645
- "learning_rate": 2.0303030303030303e-07,
646
- "logits/chosen": -2.340625047683716,
647
- "logits/rejected": -2.0328125953674316,
648
- "logps/chosen": -375.5093688964844,
649
- "logps/rejected": -503.96563720703125,
650
- "loss": 0.1673,
651
- "rewards/accuracies": 0.956250011920929,
652
- "rewards/chosen": 2.0465941429138184,
653
- "rewards/margins": 5.524023532867432,
654
- "rewards/rejected": -3.48193359375,
655
  "step": 215
656
  },
657
  {
658
- "epoch": 1.2657039711191336,
659
- "grad_norm": 30.16546271206658,
660
- "learning_rate": 1.9545454545454545e-07,
661
- "logits/chosen": -2.3626952171325684,
662
- "logits/rejected": -2.0751953125,
663
- "logps/chosen": -377.1937561035156,
664
- "logps/rejected": -472.3374938964844,
665
- "loss": 0.1338,
666
- "rewards/accuracies": 0.9750000238418579,
667
- "rewards/chosen": 1.931860327720642,
668
- "rewards/margins": 5.258203029632568,
669
- "rewards/rejected": -3.324414014816284,
670
  "step": 220
671
  },
672
  {
673
- "epoch": 1.2945848375451263,
674
- "grad_norm": 16.65751244106967,
675
- "learning_rate": 1.8787878787878785e-07,
676
- "logits/chosen": -2.429492235183716,
677
- "logits/rejected": -2.100390672683716,
678
- "logps/chosen": -269.0625,
679
- "logps/rejected": -362.60626220703125,
680
- "loss": 0.1772,
681
- "rewards/accuracies": 0.9624999761581421,
682
- "rewards/chosen": 2.430957078933716,
683
- "rewards/margins": 4.759179592132568,
684
- "rewards/rejected": -2.3294434547424316,
685
  "step": 225
686
  },
687
  {
688
- "epoch": 1.323465703971119,
689
- "grad_norm": 7.639249240996568,
690
- "learning_rate": 1.803030303030303e-07,
691
- "logits/chosen": -2.4078125953674316,
692
- "logits/rejected": -2.1470704078674316,
693
- "logps/chosen": -328.29998779296875,
694
- "logps/rejected": -392.125,
695
- "loss": 0.147,
696
- "rewards/accuracies": 0.9375,
697
- "rewards/chosen": 2.2931885719299316,
698
- "rewards/margins": 5.653124809265137,
699
- "rewards/rejected": -3.358081102371216,
700
  "step": 230
701
  },
702
  {
703
- "epoch": 1.352346570397112,
704
- "grad_norm": 11.235853450673325,
705
- "learning_rate": 1.7272727272727272e-07,
706
- "logits/chosen": -2.305859327316284,
707
- "logits/rejected": -2.067187547683716,
708
- "logps/chosen": -335.57501220703125,
709
- "logps/rejected": -458.25,
710
- "loss": 0.2442,
711
- "rewards/accuracies": 0.918749988079071,
712
- "rewards/chosen": 2.34521484375,
713
- "rewards/margins": 5.45166015625,
714
- "rewards/rejected": -3.107348680496216,
715
  "step": 235
716
  },
717
  {
718
- "epoch": 1.3812274368231048,
719
- "grad_norm": 24.18415545353277,
720
- "learning_rate": 1.6515151515151515e-07,
721
- "logits/chosen": -2.288281202316284,
722
- "logits/rejected": -1.989843726158142,
723
- "logps/chosen": -374.20001220703125,
724
- "logps/rejected": -513.2125244140625,
725
- "loss": 0.1973,
726
- "rewards/accuracies": 0.925000011920929,
727
- "rewards/chosen": 2.0380616188049316,
728
- "rewards/margins": 6.026757717132568,
729
- "rewards/rejected": -3.9879393577575684,
730
  "step": 240
731
  },
732
  {
733
- "epoch": 1.4101083032490975,
734
- "grad_norm": 14.131437193197566,
735
- "learning_rate": 1.5757575757575757e-07,
736
- "logits/chosen": -2.3443360328674316,
737
- "logits/rejected": -2.125781297683716,
738
- "logps/chosen": -314.6875,
739
- "logps/rejected": -335.6000061035156,
740
- "loss": 0.1387,
741
- "rewards/accuracies": 0.956250011920929,
742
- "rewards/chosen": 2.135449171066284,
743
- "rewards/margins": 4.864843845367432,
744
- "rewards/rejected": -2.727935791015625,
745
  "step": 245
746
  },
747
  {
748
- "epoch": 1.4389891696750903,
749
- "grad_norm": 56.26784093160422,
750
- "learning_rate": 1.5e-07,
751
- "logits/chosen": -2.2816405296325684,
752
- "logits/rejected": -2.0853514671325684,
753
- "logps/chosen": -376.4281311035156,
754
- "logps/rejected": -458.8812561035156,
755
- "loss": 0.2482,
756
- "rewards/accuracies": 0.949999988079071,
757
- "rewards/chosen": 1.89215087890625,
758
- "rewards/margins": 4.9853515625,
759
- "rewards/rejected": -3.094287157058716,
760
  "step": 250
761
  },
762
  {
763
- "epoch": 1.467870036101083,
764
- "grad_norm": 4.313742509893692,
765
- "learning_rate": 1.4242424242424242e-07,
766
- "logits/chosen": -2.323046922683716,
767
- "logits/rejected": -2.084765672683716,
768
- "logps/chosen": -301.89453125,
769
- "logps/rejected": -363.7875061035156,
770
- "loss": 0.1321,
771
- "rewards/accuracies": 0.96875,
772
- "rewards/chosen": 2.201855421066284,
773
- "rewards/margins": 5.726758003234863,
774
- "rewards/rejected": -3.5247559547424316,
775
  "step": 255
776
  },
777
  {
778
- "epoch": 1.4967509025270758,
779
- "grad_norm": 5.9087018367088024,
780
- "learning_rate": 1.3484848484848484e-07,
781
- "logits/chosen": -2.372265577316284,
782
- "logits/rejected": -2.153515577316284,
783
- "logps/chosen": -309.9937438964844,
784
- "logps/rejected": -372.3374938964844,
785
- "loss": 0.1357,
786
- "rewards/accuracies": 0.9437500238418579,
787
- "rewards/chosen": 2.295703172683716,
788
- "rewards/margins": 5.535937309265137,
789
- "rewards/rejected": -3.2354493141174316,
790
  "step": 260
791
  },
792
  {
793
- "epoch": 1.5256317689530685,
794
- "grad_norm": 13.209590602028339,
795
- "learning_rate": 1.2727272727272726e-07,
796
- "logits/chosen": -2.3701171875,
797
- "logits/rejected": -2.107226610183716,
798
- "logps/chosen": -297.04998779296875,
799
- "logps/rejected": -433.0,
800
- "loss": 0.1328,
801
- "rewards/accuracies": 0.956250011920929,
802
- "rewards/chosen": 2.1886353492736816,
803
- "rewards/margins": 5.202734470367432,
804
- "rewards/rejected": -3.012500047683716,
805
  "step": 265
806
  },
807
  {
808
- "epoch": 1.5545126353790613,
809
- "grad_norm": 4.27775101978357,
810
- "learning_rate": 1.196969696969697e-07,
811
- "logits/chosen": -2.3277344703674316,
812
- "logits/rejected": -2.1044921875,
813
- "logps/chosen": -388.7593688964844,
814
- "logps/rejected": -445.0531311035156,
815
- "loss": 0.1726,
816
- "rewards/accuracies": 0.9375,
817
- "rewards/chosen": 1.5277831554412842,
818
- "rewards/margins": 5.085351467132568,
819
- "rewards/rejected": -3.555957078933716,
820
  "step": 270
821
  },
822
  {
823
- "epoch": 1.583393501805054,
824
- "grad_norm": 18.286296338783334,
825
- "learning_rate": 1.1212121212121211e-07,
826
- "logits/chosen": -2.396484375,
827
- "logits/rejected": -2.110156297683716,
828
- "logps/chosen": -309.97186279296875,
829
- "logps/rejected": -359.92498779296875,
830
- "loss": 0.158,
831
- "rewards/accuracies": 0.9624999761581421,
832
- "rewards/chosen": 2.2788939476013184,
833
- "rewards/margins": 5.2080078125,
834
- "rewards/rejected": -2.934521436691284,
835
  "step": 275
836
  },
837
  {
838
- "epoch": 1.612274368231047,
839
- "grad_norm": 2.669702691885131,
840
- "learning_rate": 1.0454545454545454e-07,
841
- "logits/chosen": -2.372851610183716,
842
- "logits/rejected": -2.131640672683716,
843
- "logps/chosen": -348.1578063964844,
844
- "logps/rejected": -405.65936279296875,
845
- "loss": 0.2322,
846
- "rewards/accuracies": 0.918749988079071,
847
- "rewards/chosen": 2.2171874046325684,
848
- "rewards/margins": 5.343847751617432,
849
- "rewards/rejected": -3.1302733421325684,
850
  "step": 280
851
  },
852
  {
853
- "epoch": 1.6411552346570397,
854
- "grad_norm": 13.710710847207622,
855
- "learning_rate": 9.696969696969696e-08,
856
- "logits/chosen": -2.2705078125,
857
- "logits/rejected": -2.0806641578674316,
858
- "logps/chosen": -350.35626220703125,
859
- "logps/rejected": -502.07501220703125,
860
- "loss": 0.1197,
861
- "rewards/accuracies": 0.981249988079071,
862
- "rewards/chosen": 1.909570336341858,
863
- "rewards/margins": 5.671875,
864
- "rewards/rejected": -3.759765625,
865
  "step": 285
866
  },
867
  {
868
- "epoch": 1.6700361010830325,
869
- "grad_norm": 7.551122285737169,
870
- "learning_rate": 8.93939393939394e-08,
871
- "logits/chosen": -2.268749952316284,
872
- "logits/rejected": -2.033203125,
873
- "logps/chosen": -352.2875061035156,
874
- "logps/rejected": -430.5687561035156,
875
- "loss": 0.1557,
876
- "rewards/accuracies": 0.9437500238418579,
877
- "rewards/chosen": 2.1177978515625,
878
- "rewards/margins": 5.108984470367432,
879
- "rewards/rejected": -2.992236375808716,
880
  "step": 290
881
  },
882
  {
883
- "epoch": 1.6989169675090254,
884
- "grad_norm": 6.684658943158315,
885
- "learning_rate": 8.181818181818182e-08,
886
- "logits/chosen": -2.3121094703674316,
887
- "logits/rejected": -2.0816407203674316,
888
- "logps/chosen": -346.4624938964844,
889
- "logps/rejected": -449.046875,
890
- "loss": 0.1549,
891
- "rewards/accuracies": 0.949999988079071,
892
- "rewards/chosen": 2.4569334983825684,
893
- "rewards/margins": 6.220312595367432,
894
- "rewards/rejected": -3.7589354515075684,
895
  "step": 295
896
  },
897
  {
898
- "epoch": 1.7277978339350182,
899
- "grad_norm": 15.106479425607633,
900
- "learning_rate": 7.424242424242424e-08,
901
- "logits/chosen": -2.363476514816284,
902
- "logits/rejected": -2.167773485183716,
903
- "logps/chosen": -351.73748779296875,
904
- "logps/rejected": -432.375,
905
- "loss": 0.1492,
906
- "rewards/accuracies": 0.949999988079071,
907
- "rewards/chosen": 2.024670362472534,
908
- "rewards/margins": 5.391992092132568,
909
- "rewards/rejected": -3.364550828933716,
910
  "step": 300
911
  },
912
  {
913
- "epoch": 1.756678700361011,
914
- "grad_norm": 17.071782434502218,
915
- "learning_rate": 6.666666666666667e-08,
916
- "logits/chosen": -2.3306641578674316,
917
- "logits/rejected": -2.131640672683716,
918
- "logps/chosen": -343.89373779296875,
919
- "logps/rejected": -500.7593688964844,
920
- "loss": 0.1674,
921
- "rewards/accuracies": 0.918749988079071,
922
- "rewards/chosen": 2.3846678733825684,
923
- "rewards/margins": 5.525000095367432,
924
- "rewards/rejected": -3.142895460128784,
925
  "step": 305
926
  },
927
  {
928
- "epoch": 1.7855595667870037,
929
- "grad_norm": 15.209800869884038,
930
- "learning_rate": 5.9090909090909085e-08,
931
- "logits/chosen": -2.3880858421325684,
932
- "logits/rejected": -2.153515577316284,
933
- "logps/chosen": -325.6499938964844,
934
- "logps/rejected": -414.95623779296875,
935
- "loss": 0.1211,
936
- "rewards/accuracies": 0.9624999761581421,
937
- "rewards/chosen": 1.990502953529358,
938
- "rewards/margins": 5.307031154632568,
939
- "rewards/rejected": -3.316601514816284,
940
  "step": 310
941
  },
942
- {
943
- "epoch": 1.8144404332129964,
944
- "grad_norm": 18.27572794541311,
945
- "learning_rate": 5.151515151515151e-08,
946
- "logits/chosen": -2.367968797683716,
947
- "logits/rejected": -2.103710889816284,
948
- "logps/chosen": -356.52813720703125,
949
- "logps/rejected": -500.12188720703125,
950
- "loss": 0.163,
951
- "rewards/accuracies": 0.956250011920929,
952
- "rewards/chosen": 2.038317918777466,
953
- "rewards/margins": 5.278710842132568,
954
- "rewards/rejected": -3.2340455055236816,
955
- "step": 315
956
- },
957
- {
958
- "epoch": 1.8433212996389892,
959
- "grad_norm": 7.0037563817409,
960
- "learning_rate": 4.393939393939393e-08,
961
- "logits/chosen": -2.3832030296325684,
962
- "logits/rejected": -2.078125,
963
- "logps/chosen": -257.1625061035156,
964
- "logps/rejected": -554.7000122070312,
965
- "loss": 0.1503,
966
- "rewards/accuracies": 0.9437500238418579,
967
- "rewards/chosen": 2.0913329124450684,
968
- "rewards/margins": 5.219140529632568,
969
- "rewards/rejected": -3.128173828125,
970
- "step": 320
971
- },
972
- {
973
- "epoch": 1.872202166064982,
974
- "grad_norm": 6.3925450697454,
975
- "learning_rate": 3.636363636363636e-08,
976
- "logits/chosen": -2.378710985183716,
977
- "logits/rejected": -2.157031297683716,
978
- "logps/chosen": -321.14373779296875,
979
- "logps/rejected": -385.79998779296875,
980
- "loss": 0.1651,
981
- "rewards/accuracies": 0.918749988079071,
982
- "rewards/chosen": 2.0872559547424316,
983
- "rewards/margins": 5.511328220367432,
984
- "rewards/rejected": -3.424072265625,
985
- "step": 325
986
- },
987
- {
988
- "epoch": 1.9010830324909747,
989
- "grad_norm": 8.51178304792889,
990
- "learning_rate": 2.8787878787878787e-08,
991
- "logits/chosen": -2.3662109375,
992
- "logits/rejected": -2.1107420921325684,
993
- "logps/chosen": -349.765625,
994
- "logps/rejected": -494.3999938964844,
995
- "loss": 0.1741,
996
- "rewards/accuracies": 0.956250011920929,
997
- "rewards/chosen": 1.6572997570037842,
998
- "rewards/margins": 5.140038967132568,
999
- "rewards/rejected": -3.4827637672424316,
1000
- "step": 330
1001
- },
1002
- {
1003
- "epoch": 1.9299638989169674,
1004
- "grad_norm": 16.678414426541924,
1005
- "learning_rate": 2.1212121212121214e-08,
1006
- "logits/chosen": -2.3955078125,
1007
- "logits/rejected": -2.0966796875,
1008
- "logps/chosen": -327.1000061035156,
1009
- "logps/rejected": -492.73748779296875,
1010
- "loss": 0.1567,
1011
- "rewards/accuracies": 0.949999988079071,
1012
- "rewards/chosen": 2.02783203125,
1013
- "rewards/margins": 5.256445407867432,
1014
- "rewards/rejected": -3.228686571121216,
1015
- "step": 335
1016
- },
1017
- {
1018
- "epoch": 1.9588447653429601,
1019
- "grad_norm": 4.987506994026094,
1020
- "learning_rate": 1.3636363636363635e-08,
1021
- "logits/chosen": -2.355273485183716,
1022
- "logits/rejected": -1.9865233898162842,
1023
- "logps/chosen": -340.0,
1024
- "logps/rejected": -424.0687561035156,
1025
- "loss": 0.1081,
1026
- "rewards/accuracies": 0.9624999761581421,
1027
- "rewards/chosen": 2.090686082839966,
1028
- "rewards/margins": 5.492578029632568,
1029
- "rewards/rejected": -3.401123046875,
1030
- "step": 340
1031
- },
1032
- {
1033
- "epoch": 1.9877256317689531,
1034
- "grad_norm": 5.558593262711838,
1035
- "learning_rate": 6.06060606060606e-09,
1036
- "logits/chosen": -2.360156297683716,
1037
- "logits/rejected": -2.049023389816284,
1038
- "logps/chosen": -311.01251220703125,
1039
- "logps/rejected": -635.7062377929688,
1040
- "loss": 0.1415,
1041
- "rewards/accuracies": 0.956250011920929,
1042
- "rewards/chosen": 1.959832787513733,
1043
- "rewards/margins": 5.430273532867432,
1044
- "rewards/rejected": -3.4686522483825684,
1045
- "step": 345
1046
- },
1047
  {
1048
  "epoch": 2.0,
1049
- "step": 348,
1050
  "total_flos": 0.0,
1051
- "train_loss": 0.2171997334594014,
1052
- "train_runtime": 3612.0435,
1053
- "train_samples_per_second": 3.067,
1054
- "train_steps_per_second": 0.096
1055
  }
1056
  ],
1057
  "logging_steps": 5,
1058
- "max_steps": 348,
1059
  "num_input_tokens_seen": 0,
1060
  "num_train_epochs": 2,
1061
  "save_steps": 50,
 
4
  "best_model_checkpoint": null,
5
  "epoch": 2.0,
6
  "eval_steps": 500,
7
+ "global_step": 312,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "epoch": 0.03205128205128205,
14
+ "grad_norm": 17.482923590326937,
15
+ "learning_rate": 1.25e-07,
16
+ "logits/chosen": -2.0185546875,
17
+ "logits/rejected": -1.881250023841858,
18
+ "logps/chosen": -235.671875,
19
+ "logps/rejected": -302.26873779296875,
20
+ "loss": 0.38,
21
+ "rewards/accuracies": 0.8187500238418579,
22
+ "rewards/chosen": 1.7926757335662842,
23
+ "rewards/margins": 3.30548095703125,
24
+ "rewards/rejected": -1.5131103992462158,
25
  "step": 5
26
  },
27
  {
28
+ "epoch": 0.0641025641025641,
29
+ "grad_norm": 10.731624225914716,
30
+ "learning_rate": 2.8125e-07,
31
+ "logits/chosen": -2.0074219703674316,
32
+ "logits/rejected": -1.779687523841858,
33
+ "logps/chosen": -220.52969360351562,
34
+ "logps/rejected": -415.6499938964844,
35
+ "loss": 0.4128,
36
  "rewards/accuracies": 0.7875000238418579,
37
+ "rewards/chosen": 1.885888695716858,
38
+ "rewards/margins": 3.5706787109375,
39
+ "rewards/rejected": -1.6814696788787842,
40
  "step": 10
41
  },
42
  {
43
+ "epoch": 0.09615384615384616,
44
+ "grad_norm": 12.984171783098084,
45
+ "learning_rate": 4.375e-07,
46
+ "logits/chosen": -2.097851514816284,
47
+ "logits/rejected": -1.916015625,
48
+ "logps/chosen": -212.4031219482422,
49
+ "logps/rejected": -295.92657470703125,
50
+ "loss": 0.4187,
51
+ "rewards/accuracies": 0.78125,
52
+ "rewards/chosen": 1.880639672279358,
53
+ "rewards/margins": 3.52978515625,
54
+ "rewards/rejected": -1.648584008216858,
55
  "step": 15
56
  },
57
  {
58
+ "epoch": 0.1282051282051282,
59
+ "grad_norm": 39.351112208545295,
60
+ "learning_rate": 4.949324324324325e-07,
61
+ "logits/chosen": -2.023632764816284,
62
+ "logits/rejected": -1.8396484851837158,
63
+ "logps/chosen": -228.0656280517578,
64
+ "logps/rejected": -375.046875,
65
+ "loss": 0.5019,
66
  "rewards/accuracies": 0.800000011920929,
67
+ "rewards/chosen": 1.77685546875,
68
+ "rewards/margins": 3.207226514816284,
69
+ "rewards/rejected": -1.429071068763733,
70
  "step": 20
71
  },
72
  {
73
+ "epoch": 0.16025641025641027,
74
+ "grad_norm": 8.14663574686874,
75
+ "learning_rate": 4.864864864864865e-07,
76
+ "logits/chosen": -1.9912109375,
77
+ "logits/rejected": -1.8039062023162842,
78
+ "logps/chosen": -231.5593719482422,
79
+ "logps/rejected": -515.8937377929688,
80
+ "loss": 0.4089,
81
+ "rewards/accuracies": 0.8062499761581421,
82
+ "rewards/chosen": 1.800073266029358,
83
+ "rewards/margins": 3.571337938308716,
84
+ "rewards/rejected": -1.771520972251892,
85
  "step": 25
86
  },
87
  {
88
+ "epoch": 0.19230769230769232,
89
+ "grad_norm": 16.42433782374998,
90
+ "learning_rate": 4.780405405405405e-07,
91
+ "logits/chosen": -1.983789086341858,
92
+ "logits/rejected": -1.7492187023162842,
93
+ "logps/chosen": -250.9375,
94
+ "logps/rejected": -518.8781127929688,
95
+ "loss": 0.3187,
96
+ "rewards/accuracies": 0.8687499761581421,
97
+ "rewards/chosen": 1.736535668373108,
98
+ "rewards/margins": 3.2618165016174316,
99
+ "rewards/rejected": -1.525964379310608,
100
  "step": 30
101
  },
102
  {
103
+ "epoch": 0.22435897435897437,
104
+ "grad_norm": 16.266393432601383,
105
+ "learning_rate": 4.695945945945946e-07,
106
+ "logits/chosen": -2.0166015625,
107
+ "logits/rejected": -1.91015625,
108
+ "logps/chosen": -209.7781219482422,
109
+ "logps/rejected": -289.6625061035156,
110
+ "loss": 0.3953,
111
+ "rewards/accuracies": 0.8125,
112
+ "rewards/chosen": 2.050332546234131,
113
+ "rewards/margins": 3.6033051013946533,
114
+ "rewards/rejected": -1.550323486328125,
115
  "step": 35
116
  },
117
  {
118
+ "epoch": 0.2564102564102564,
119
+ "grad_norm": 25.97441760074817,
120
+ "learning_rate": 4.611486486486486e-07,
121
+ "logits/chosen": -1.963476538658142,
122
+ "logits/rejected": -1.8292968273162842,
123
+ "logps/chosen": -283.6156311035156,
124
+ "logps/rejected": -328.79766845703125,
125
+ "loss": 0.4126,
126
+ "rewards/accuracies": 0.8125,
127
+ "rewards/chosen": 1.7740600109100342,
128
+ "rewards/margins": 3.731884717941284,
129
+ "rewards/rejected": -1.9563720226287842,
130
  "step": 40
131
  },
132
  {
133
+ "epoch": 0.28846153846153844,
134
+ "grad_norm": 34.9734844548034,
135
+ "learning_rate": 4.5270270270270264e-07,
136
+ "logits/chosen": -2.0054688453674316,
137
+ "logits/rejected": -1.7921874523162842,
138
+ "logps/chosen": -274.5687561035156,
139
+ "logps/rejected": -334.1656188964844,
140
+ "loss": 0.377,
141
+ "rewards/accuracies": 0.8187500238418579,
142
+ "rewards/chosen": 2.108358860015869,
143
+ "rewards/margins": 4.065283298492432,
144
+ "rewards/rejected": -1.953271508216858,
145
  "step": 45
146
  },
147
  {
148
+ "epoch": 0.32051282051282054,
149
+ "grad_norm": 11.913243260486984,
150
+ "learning_rate": 4.442567567567567e-07,
151
+ "logits/chosen": -2.0777344703674316,
152
+ "logits/rejected": -1.947656273841858,
153
+ "logps/chosen": -225.84219360351562,
154
+ "logps/rejected": -250.46249389648438,
155
+ "loss": 0.3373,
156
+ "rewards/accuracies": 0.824999988079071,
157
+ "rewards/chosen": 2.101611375808716,
158
+ "rewards/margins": 3.5291504859924316,
159
+ "rewards/rejected": -1.426367163658142,
160
  "step": 50
161
  },
162
  {
163
+ "epoch": 0.3525641025641026,
164
+ "grad_norm": 13.672519743198338,
165
+ "learning_rate": 4.3581081081081076e-07,
166
+ "logits/chosen": -2.114453077316284,
167
+ "logits/rejected": -1.878320336341858,
168
+ "logps/chosen": -331.421875,
169
+ "logps/rejected": -381.27264404296875,
170
+ "loss": 0.3941,
171
+ "rewards/accuracies": 0.824999988079071,
172
+ "rewards/chosen": 1.777099609375,
173
+ "rewards/margins": 3.0150146484375,
174
+ "rewards/rejected": -1.236975073814392,
175
  "step": 55
176
  },
177
  {
178
+ "epoch": 0.38461538461538464,
179
+ "grad_norm": 11.373635937771688,
180
+ "learning_rate": 4.2736486486486484e-07,
181
+ "logits/chosen": -2.139843702316284,
182
+ "logits/rejected": -1.938867211341858,
183
+ "logps/chosen": -233.5578155517578,
184
+ "logps/rejected": -377.8140563964844,
185
+ "loss": 0.3037,
186
+ "rewards/accuracies": 0.8374999761581421,
187
+ "rewards/chosen": 2.1447510719299316,
188
+ "rewards/margins": 3.566601514816284,
189
+ "rewards/rejected": -1.420263648033142,
190
  "step": 60
191
  },
192
  {
193
+ "epoch": 0.4166666666666667,
194
+ "grad_norm": 14.605720279749862,
195
+ "learning_rate": 4.189189189189189e-07,
196
+ "logits/chosen": -1.915624976158142,
197
+ "logits/rejected": -1.8369140625,
198
+ "logps/chosen": -196.640625,
199
+ "logps/rejected": -297.3812561035156,
200
+ "loss": 0.3993,
201
+ "rewards/accuracies": 0.8062499761581421,
202
+ "rewards/chosen": 1.6602294445037842,
203
+ "rewards/margins": 2.8565430641174316,
204
+ "rewards/rejected": -1.1953613758087158,
205
  "step": 65
206
  },
207
  {
208
+ "epoch": 0.44871794871794873,
209
+ "grad_norm": 12.129690582617949,
210
+ "learning_rate": 4.1047297297297296e-07,
211
+ "logits/chosen": -2.043750047683716,
212
+ "logits/rejected": -1.8582031726837158,
213
+ "logps/chosen": -269.55780029296875,
214
+ "logps/rejected": -349.8812561035156,
215
+ "loss": 0.2719,
216
+ "rewards/accuracies": 0.90625,
217
+ "rewards/chosen": 2.2075562477111816,
218
+ "rewards/margins": 3.8485350608825684,
219
+ "rewards/rejected": -1.6388671398162842,
220
  "step": 70
221
  },
222
  {
223
+ "epoch": 0.4807692307692308,
224
+ "grad_norm": 10.894196057080642,
225
+ "learning_rate": 4.02027027027027e-07,
226
+ "logits/chosen": -2.0833983421325684,
227
+ "logits/rejected": -1.8416016101837158,
228
+ "logps/chosen": -218.09375,
229
+ "logps/rejected": -379.48126220703125,
230
+ "loss": 0.3121,
231
+ "rewards/accuracies": 0.8687499761581421,
232
+ "rewards/chosen": 2.30419921875,
233
+ "rewards/margins": 3.564453125,
234
+ "rewards/rejected": -1.260766625404358,
235
  "step": 75
236
  },
237
  {
238
+ "epoch": 0.5128205128205128,
239
+ "grad_norm": 10.28529818518839,
240
+ "learning_rate": 3.935810810810811e-07,
241
+ "logits/chosen": -2.027539014816284,
242
+ "logits/rejected": -1.8759765625,
243
+ "logps/chosen": -280.046875,
244
+ "logps/rejected": -328.8125,
245
+ "loss": 0.2987,
246
+ "rewards/accuracies": 0.887499988079071,
247
+ "rewards/chosen": 2.126843214035034,
248
+ "rewards/margins": 4.263671875,
249
+ "rewards/rejected": -2.134082078933716,
250
  "step": 80
251
  },
252
  {
253
+ "epoch": 0.5448717948717948,
254
+ "grad_norm": 10.046587710863294,
255
+ "learning_rate": 3.851351351351351e-07,
256
+ "logits/chosen": -2.0047850608825684,
257
+ "logits/rejected": -1.8369140625,
258
+ "logps/chosen": -249.7734375,
259
+ "logps/rejected": -267.5843811035156,
260
+ "loss": 0.2905,
261
+ "rewards/accuracies": 0.893750011920929,
262
+ "rewards/chosen": 2.1821961402893066,
263
+ "rewards/margins": 3.7508788108825684,
264
+ "rewards/rejected": -1.568945288658142,
265
  "step": 85
266
  },
267
  {
268
+ "epoch": 0.5769230769230769,
269
+ "grad_norm": 15.815810554242594,
270
+ "learning_rate": 3.766891891891892e-07,
271
+ "logits/chosen": -1.9933593273162842,
272
+ "logits/rejected": -1.7882812023162842,
273
+ "logps/chosen": -285.0328063964844,
274
+ "logps/rejected": -244.2062530517578,
275
+ "loss": 0.2709,
276
+ "rewards/accuracies": 0.925000011920929,
277
+ "rewards/chosen": 2.258129835128784,
278
+ "rewards/margins": 3.865039110183716,
279
+ "rewards/rejected": -1.611413598060608,
280
  "step": 90
281
  },
282
  {
283
+ "epoch": 0.6089743589743589,
284
+ "grad_norm": 33.74436261855397,
285
+ "learning_rate": 3.682432432432432e-07,
286
+ "logits/chosen": -2.1357421875,
287
+ "logits/rejected": -1.894140601158142,
288
+ "logps/chosen": -245.30624389648438,
289
+ "logps/rejected": -478.70001220703125,
290
+ "loss": 0.2975,
291
+ "rewards/accuracies": 0.8500000238418579,
292
+ "rewards/chosen": 1.964135766029358,
293
+ "rewards/margins": 3.7166991233825684,
294
+ "rewards/rejected": -1.751123070716858,
295
  "step": 95
296
  },
297
  {
298
+ "epoch": 0.6410256410256411,
299
+ "grad_norm": 9.910094853594128,
300
+ "learning_rate": 3.597972972972973e-07,
301
+ "logits/chosen": -2.024609327316284,
302
+ "logits/rejected": -1.865625023841858,
303
+ "logps/chosen": -264.28436279296875,
304
+ "logps/rejected": -300.07342529296875,
305
+ "loss": 0.3171,
306
+ "rewards/accuracies": 0.893750011920929,
307
+ "rewards/chosen": 2.304931640625,
308
+ "rewards/margins": 3.8041014671325684,
309
+ "rewards/rejected": -1.499169945716858,
310
  "step": 100
311
  },
312
  {
313
+ "epoch": 0.6730769230769231,
314
+ "grad_norm": 11.870080752446105,
315
+ "learning_rate": 3.5135135135135134e-07,
316
+ "logits/chosen": -2.0267577171325684,
317
+ "logits/rejected": -1.8447265625,
318
+ "logps/chosen": -259.40936279296875,
319
+ "logps/rejected": -498.8890686035156,
320
+ "loss": 0.2647,
321
+ "rewards/accuracies": 0.893750011920929,
322
+ "rewards/chosen": 2.0042967796325684,
323
+ "rewards/margins": 3.547900438308716,
324
+ "rewards/rejected": -1.542810082435608,
325
  "step": 105
326
  },
327
  {
328
+ "epoch": 0.7051282051282052,
329
+ "grad_norm": 8.196509121053467,
330
+ "learning_rate": 3.429054054054054e-07,
331
+ "logits/chosen": -2.0044922828674316,
332
+ "logits/rejected": -1.820703148841858,
333
+ "logps/chosen": -221.078125,
334
+ "logps/rejected": -435.2406311035156,
335
+ "loss": 0.3216,
336
+ "rewards/accuracies": 0.8812500238418579,
337
+ "rewards/chosen": 2.1091065406799316,
338
+ "rewards/margins": 3.4317383766174316,
339
+ "rewards/rejected": -1.322973608970642,
340
  "step": 110
341
  },
342
  {
343
+ "epoch": 0.7371794871794872,
344
+ "grad_norm": 17.275173620118444,
345
+ "learning_rate": 3.3445945945945946e-07,
346
+ "logits/chosen": -2.083203077316284,
347
+ "logits/rejected": -1.875585913658142,
348
+ "logps/chosen": -243.6531219482422,
349
+ "logps/rejected": -477.7124938964844,
350
+ "loss": 0.2858,
351
+ "rewards/accuracies": 0.893750011920929,
352
+ "rewards/chosen": 2.115771532058716,
353
+ "rewards/margins": 3.8758788108825684,
354
+ "rewards/rejected": -1.757867455482483,
355
  "step": 115
356
  },
357
  {
358
+ "epoch": 0.7692307692307693,
359
+ "grad_norm": 6.190361827721572,
360
+ "learning_rate": 3.260135135135135e-07,
361
+ "logits/chosen": -2.025390625,
362
+ "logits/rejected": -1.809960961341858,
363
+ "logps/chosen": -276.1343688964844,
364
+ "logps/rejected": -320.890625,
365
+ "loss": 0.2334,
366
+ "rewards/accuracies": 0.918749988079071,
367
+ "rewards/chosen": 2.201000928878784,
368
+ "rewards/margins": 4.690966606140137,
369
+ "rewards/rejected": -2.4933104515075684,
370
  "step": 120
371
  },
372
  {
373
+ "epoch": 0.8012820512820513,
374
+ "grad_norm": 17.015787309272795,
375
+ "learning_rate": 3.175675675675675e-07,
376
+ "logits/chosen": -1.984960913658142,
377
+ "logits/rejected": -1.833593726158142,
378
+ "logps/chosen": -248.43905639648438,
379
+ "logps/rejected": -294.2093811035156,
380
+ "loss": 0.3588,
381
+ "rewards/accuracies": 0.875,
382
+ "rewards/chosen": 2.134960889816284,
383
+ "rewards/margins": 3.898571729660034,
384
+ "rewards/rejected": -1.761315941810608,
385
  "step": 125
386
  },
387
  {
388
+ "epoch": 0.8333333333333334,
389
+ "grad_norm": 9.916096150406192,
390
+ "learning_rate": 3.091216216216216e-07,
391
+ "logits/chosen": -2.0804686546325684,
392
+ "logits/rejected": -1.899999976158142,
393
+ "logps/chosen": -237.94686889648438,
394
+ "logps/rejected": -357.84063720703125,
395
+ "loss": 0.2721,
396
+ "rewards/accuracies": 0.887499988079071,
397
+ "rewards/chosen": 2.0490965843200684,
398
+ "rewards/margins": 3.5159668922424316,
399
+ "rewards/rejected": -1.4671142101287842,
400
  "step": 130
401
  },
402
  {
403
+ "epoch": 0.8653846153846154,
404
+ "grad_norm": 19.43204229304952,
405
+ "learning_rate": 3.0067567567567564e-07,
406
+ "logits/chosen": -1.9519531726837158,
407
+ "logits/rejected": -1.7833983898162842,
408
+ "logps/chosen": -268.28436279296875,
409
+ "logps/rejected": -376.12188720703125,
410
+ "loss": 0.2836,
411
+ "rewards/accuracies": 0.84375,
412
+ "rewards/chosen": 2.1514039039611816,
413
+ "rewards/margins": 4.179858207702637,
414
+ "rewards/rejected": -2.028857469558716,
415
  "step": 135
416
  },
417
  {
418
+ "epoch": 0.8974358974358975,
419
+ "grad_norm": 14.35891063544634,
420
+ "learning_rate": 2.922297297297297e-07,
421
+ "logits/chosen": -2.075976610183716,
422
+ "logits/rejected": -1.883203148841858,
423
+ "logps/chosen": -203.46875,
424
+ "logps/rejected": -299.484375,
425
+ "loss": 0.2292,
426
+ "rewards/accuracies": 0.90625,
427
+ "rewards/chosen": 2.165087938308716,
428
+ "rewards/margins": 3.933666944503784,
429
+ "rewards/rejected": -1.7722899913787842,
430
  "step": 140
431
  },
432
  {
433
+ "epoch": 0.9294871794871795,
434
+ "grad_norm": 15.571301633489812,
435
+ "learning_rate": 2.8378378378378376e-07,
436
+ "logits/chosen": -2.001757860183716,
437
+ "logits/rejected": -1.8171875476837158,
438
+ "logps/chosen": -257.5218811035156,
439
+ "logps/rejected": -395.0625,
440
+ "loss": 0.2054,
441
+ "rewards/accuracies": 0.9312499761581421,
442
+ "rewards/chosen": 2.296875,
443
+ "rewards/margins": 4.208886623382568,
444
+ "rewards/rejected": -1.913354516029358,
445
  "step": 145
446
  },
447
  {
448
+ "epoch": 0.9615384615384616,
449
+ "grad_norm": 10.241466724079272,
450
+ "learning_rate": 2.7533783783783784e-07,
451
+ "logits/chosen": -2.010937452316284,
452
+ "logits/rejected": -1.7804687023162842,
453
+ "logps/chosen": -263.1890563964844,
454
+ "logps/rejected": -553.5797119140625,
455
+ "loss": 0.2288,
456
+ "rewards/accuracies": 0.9375,
457
+ "rewards/chosen": 2.049511671066284,
458
+ "rewards/margins": 4.341113090515137,
459
+ "rewards/rejected": -2.291332960128784,
460
  "step": 150
461
  },
462
  {
463
+ "epoch": 0.9935897435897436,
464
+ "grad_norm": 8.858764442710157,
465
+ "learning_rate": 2.6689189189189187e-07,
466
+ "logits/chosen": -2.089062452316284,
467
+ "logits/rejected": -1.937890648841858,
468
+ "logps/chosen": -220.54843139648438,
469
+ "logps/rejected": -315.1640625,
470
+ "loss": 0.2811,
471
+ "rewards/accuracies": 0.8687499761581421,
472
+ "rewards/chosen": 2.0038084983825684,
473
+ "rewards/margins": 3.595703125,
474
+ "rewards/rejected": -1.5892822742462158,
475
  "step": 155
476
  },
477
  {
478
+ "epoch": 1.0256410256410255,
479
+ "grad_norm": 9.684185372221096,
480
+ "learning_rate": 2.5844594594594596e-07,
481
+ "logits/chosen": -2.1142578125,
482
+ "logits/rejected": -1.8634765148162842,
483
+ "logps/chosen": -242.419921875,
484
+ "logps/rejected": -744.0179443359375,
485
+ "loss": 0.2283,
486
+ "rewards/accuracies": 0.918749988079071,
487
+ "rewards/chosen": 2.2816405296325684,
488
+ "rewards/margins": 4.6513671875,
489
+ "rewards/rejected": -2.3667969703674316,
490
  "step": 160
491
  },
492
  {
493
+ "epoch": 1.0576923076923077,
494
+ "grad_norm": 16.39607167381079,
495
+ "learning_rate": 2.5e-07,
496
+ "logits/chosen": -2.0179686546325684,
497
+ "logits/rejected": -1.838281273841858,
498
+ "logps/chosen": -285.15155029296875,
499
+ "logps/rejected": -555.5343627929688,
500
+ "loss": 0.2506,
501
+ "rewards/accuracies": 0.90625,
502
+ "rewards/chosen": 2.1651368141174316,
503
+ "rewards/margins": 3.9991211891174316,
504
+ "rewards/rejected": -1.832617163658142,
505
  "step": 165
506
  },
507
  {
508
+ "epoch": 1.0897435897435896,
509
+ "grad_norm": 12.71417077582037,
510
+ "learning_rate": 2.41554054054054e-07,
511
+ "logits/chosen": -1.9617187976837158,
512
+ "logits/rejected": -1.7705078125,
513
+ "logps/chosen": -229.18124389648438,
514
+ "logps/rejected": -388.0687561035156,
515
+ "loss": 0.2361,
516
+ "rewards/accuracies": 0.8999999761581421,
517
+ "rewards/chosen": 2.22900390625,
518
+ "rewards/margins": 4.506933689117432,
519
+ "rewards/rejected": -2.277844190597534,
520
  "step": 170
521
  },
522
  {
523
+ "epoch": 1.1217948717948718,
524
+ "grad_norm": 12.537778856529926,
525
+ "learning_rate": 2.331081081081081e-07,
526
+ "logits/chosen": -2.107226610183716,
527
+ "logits/rejected": -1.90234375,
528
+ "logps/chosen": -215.9656219482422,
529
+ "logps/rejected": -324.7749938964844,
530
+ "loss": 0.2086,
531
+ "rewards/accuracies": 0.918749988079071,
532
+ "rewards/chosen": 2.130053758621216,
533
+ "rewards/margins": 4.331640720367432,
534
+ "rewards/rejected": -2.203198194503784,
535
  "step": 175
536
  },
537
  {
538
+ "epoch": 1.1538461538461537,
539
+ "grad_norm": 13.46928418539436,
540
+ "learning_rate": 2.2466216216216216e-07,
541
+ "logits/chosen": -1.991601586341858,
542
+ "logits/rejected": -1.790429711341858,
543
+ "logps/chosen": -239.49063110351562,
544
+ "logps/rejected": -397.62811279296875,
545
+ "loss": 0.2422,
546
+ "rewards/accuracies": 0.8999999761581421,
547
+ "rewards/chosen": 2.213427782058716,
548
+ "rewards/margins": 4.355273246765137,
549
+ "rewards/rejected": -2.143115282058716,
550
  "step": 180
551
  },
552
  {
553
+ "epoch": 1.185897435897436,
554
+ "grad_norm": 16.721244760339083,
555
+ "learning_rate": 2.1621621621621622e-07,
556
+ "logits/chosen": -2.0091795921325684,
557
+ "logits/rejected": -1.8250000476837158,
558
+ "logps/chosen": -260.0874938964844,
559
+ "logps/rejected": -433.359375,
560
+ "loss": 0.3083,
561
+ "rewards/accuracies": 0.875,
562
+ "rewards/chosen": 2.1787109375,
563
+ "rewards/margins": 3.8218750953674316,
564
+ "rewards/rejected": -1.6440918445587158,
565
  "step": 185
566
  },
567
  {
568
+ "epoch": 1.217948717948718,
569
+ "grad_norm": 6.4545177104485845,
570
+ "learning_rate": 2.0777027027027025e-07,
571
+ "logits/chosen": -2.0591797828674316,
572
+ "logits/rejected": -1.8468749523162842,
573
+ "logps/chosen": -235.1687469482422,
574
+ "logps/rejected": -273.58123779296875,
575
+ "loss": 0.2293,
576
+ "rewards/accuracies": 0.9312499761581421,
577
+ "rewards/chosen": 2.180835008621216,
578
+ "rewards/margins": 3.9361329078674316,
579
+ "rewards/rejected": -1.753662109375,
580
  "step": 190
581
  },
582
  {
583
+ "epoch": 1.25,
584
+ "grad_norm": 13.390863695555577,
585
+ "learning_rate": 1.993243243243243e-07,
586
+ "logits/chosen": -2.015429735183716,
587
+ "logits/rejected": -1.857812523841858,
588
+ "logps/chosen": -247.94686889648438,
589
+ "logps/rejected": -330.03436279296875,
590
+ "loss": 0.2265,
591
+ "rewards/accuracies": 0.9125000238418579,
592
+ "rewards/chosen": 2.378588914871216,
593
+ "rewards/margins": 4.5875244140625,
594
+ "rewards/rejected": -2.206225633621216,
595
  "step": 195
596
  },
597
  {
598
+ "epoch": 1.282051282051282,
599
+ "grad_norm": 7.467973287614187,
600
+ "learning_rate": 1.9087837837837837e-07,
601
+ "logits/chosen": -1.9660155773162842,
602
+ "logits/rejected": -1.7736327648162842,
603
+ "logps/chosen": -242.78125,
604
+ "logps/rejected": -278.73126220703125,
605
+ "loss": 0.3117,
606
+ "rewards/accuracies": 0.8812500238418579,
607
+ "rewards/chosen": 2.247753858566284,
608
+ "rewards/margins": 3.825390577316284,
609
+ "rewards/rejected": -1.5797607898712158,
610
  "step": 200
611
  },
612
  {
613
+ "epoch": 1.314102564102564,
614
+ "grad_norm": 17.275598489252985,
615
+ "learning_rate": 1.8243243243243243e-07,
616
+ "logits/chosen": -2.0658202171325684,
617
+ "logits/rejected": -1.8517577648162842,
618
+ "logps/chosen": -226.86563110351562,
619
+ "logps/rejected": -353.4937438964844,
620
+ "loss": 0.254,
621
+ "rewards/accuracies": 0.862500011920929,
622
+ "rewards/chosen": 2.29681396484375,
623
+ "rewards/margins": 4.391015529632568,
624
+ "rewards/rejected": -2.097705125808716,
625
  "step": 205
626
  },
627
  {
628
+ "epoch": 1.3461538461538463,
629
+ "grad_norm": 10.44378100815255,
630
+ "learning_rate": 1.739864864864865e-07,
631
+ "logits/chosen": -2.025390625,
632
+ "logits/rejected": -1.800390601158142,
633
+ "logps/chosen": -193.6984405517578,
634
+ "logps/rejected": -289.08282470703125,
635
+ "loss": 0.2395,
636
+ "rewards/accuracies": 0.893750011920929,
637
+ "rewards/chosen": 2.071337938308716,
638
+ "rewards/margins": 3.8559813499450684,
639
+ "rewards/rejected": -1.7841675281524658,
640
  "step": 210
641
  },
642
  {
643
+ "epoch": 1.3782051282051282,
644
+ "grad_norm": 97.68377543207546,
645
+ "learning_rate": 1.6554054054054055e-07,
646
+ "logits/chosen": -1.9826171398162842,
647
+ "logits/rejected": -1.755859375,
648
+ "logps/chosen": -355.3062438964844,
649
+ "logps/rejected": -485.71563720703125,
650
+ "loss": 0.2301,
651
+ "rewards/accuracies": 0.925000011920929,
652
+ "rewards/chosen": 2.012890577316284,
653
+ "rewards/margins": 4.411523342132568,
654
+ "rewards/rejected": -2.397656202316284,
655
  "step": 215
656
  },
657
  {
658
+ "epoch": 1.4102564102564101,
659
+ "grad_norm": 10.308990703109377,
660
+ "learning_rate": 1.570945945945946e-07,
661
+ "logits/chosen": -2.0074219703674316,
662
+ "logits/rejected": -1.796484351158142,
663
+ "logps/chosen": -291.6031188964844,
664
+ "logps/rejected": -320.95001220703125,
665
+ "loss": 0.2796,
666
+ "rewards/accuracies": 0.8687499761581421,
667
+ "rewards/chosen": 1.8628418445587158,
668
+ "rewards/margins": 3.919921875,
669
+ "rewards/rejected": -2.0587158203125,
670
  "step": 220
671
  },
672
  {
673
+ "epoch": 1.4423076923076923,
674
+ "grad_norm": 7.8418596148572455,
675
+ "learning_rate": 1.4864864864864866e-07,
676
+ "logits/chosen": -2.051953077316284,
677
+ "logits/rejected": -1.8517577648162842,
678
+ "logps/chosen": -221.51171875,
679
+ "logps/rejected": -234.59375,
680
+ "loss": 0.2821,
681
+ "rewards/accuracies": 0.84375,
682
+ "rewards/chosen": 1.9519531726837158,
683
+ "rewards/margins": 3.46875,
684
+ "rewards/rejected": -1.521032691001892,
685
  "step": 225
686
  },
687
  {
688
+ "epoch": 1.4743589743589745,
689
+ "grad_norm": 19.99054898354689,
690
+ "learning_rate": 1.402027027027027e-07,
691
+ "logits/chosen": -1.9873046875,
692
+ "logits/rejected": -1.816015601158142,
693
+ "logps/chosen": -234.86874389648438,
694
+ "logps/rejected": -471.40625,
695
+ "loss": 0.2533,
696
+ "rewards/accuracies": 0.918749988079071,
697
+ "rewards/chosen": 1.637182593345642,
698
+ "rewards/margins": 3.6805663108825684,
699
+ "rewards/rejected": -2.044677734375,
700
  "step": 230
701
  },
702
  {
703
+ "epoch": 1.5064102564102564,
704
+ "grad_norm": 6.927398842550873,
705
+ "learning_rate": 1.3175675675675673e-07,
706
+ "logits/chosen": -2.116406202316284,
707
+ "logits/rejected": -1.890234351158142,
708
+ "logps/chosen": -236.1374969482422,
709
+ "logps/rejected": -326.1812438964844,
710
+ "loss": 0.2292,
711
+ "rewards/accuracies": 0.9125000238418579,
712
+ "rewards/chosen": 1.9124755859375,
713
+ "rewards/margins": 3.916796922683716,
714
+ "rewards/rejected": -2.0068116188049316,
715
  "step": 235
716
  },
717
  {
718
+ "epoch": 1.5384615384615383,
719
+ "grad_norm": 19.26565095128112,
720
+ "learning_rate": 1.233108108108108e-07,
721
+ "logits/chosen": -1.9998047351837158,
722
+ "logits/rejected": -1.8044922351837158,
723
+ "logps/chosen": -215.8718719482422,
724
+ "logps/rejected": -273.55780029296875,
725
+ "loss": 0.2959,
726
+ "rewards/accuracies": 0.8999999761581421,
727
+ "rewards/chosen": 1.870080590248108,
728
+ "rewards/margins": 3.464648485183716,
729
+ "rewards/rejected": -1.59326171875,
730
  "step": 240
731
  },
732
  {
733
+ "epoch": 1.5705128205128205,
734
+ "grad_norm": 23.53473342051176,
735
+ "learning_rate": 1.1486486486486487e-07,
736
+ "logits/chosen": -2.0589842796325684,
737
+ "logits/rejected": -1.871484398841858,
738
+ "logps/chosen": -288.2890625,
739
+ "logps/rejected": -367.5843811035156,
740
+ "loss": 0.2964,
741
+ "rewards/accuracies": 0.875,
742
+ "rewards/chosen": 1.398657202720642,
743
+ "rewards/margins": 3.667285203933716,
744
+ "rewards/rejected": -2.267504930496216,
745
  "step": 245
746
  },
747
  {
748
+ "epoch": 1.6025641025641026,
749
+ "grad_norm": 9.487545014290102,
750
+ "learning_rate": 1.0641891891891891e-07,
751
+ "logits/chosen": -2.122851610183716,
752
+ "logits/rejected": -1.9580078125,
753
+ "logps/chosen": -285.9078063964844,
754
+ "logps/rejected": -379.8843688964844,
755
+ "loss": 0.3032,
756
+ "rewards/accuracies": 0.918749988079071,
757
+ "rewards/chosen": 1.5487792491912842,
758
+ "rewards/margins": 3.930615186691284,
759
+ "rewards/rejected": -2.3807129859924316,
760
  "step": 250
761
  },
762
  {
763
+ "epoch": 1.6346153846153846,
764
+ "grad_norm": 34.335674181311,
765
+ "learning_rate": 9.797297297297297e-08,
766
+ "logits/chosen": -1.9865233898162842,
767
+ "logits/rejected": -1.8146483898162842,
768
+ "logps/chosen": -303.875,
769
+ "logps/rejected": -383.5625,
770
+ "loss": 0.2268,
771
+ "rewards/accuracies": 0.918749988079071,
772
+ "rewards/chosen": 2.1684327125549316,
773
+ "rewards/margins": 4.968652248382568,
774
+ "rewards/rejected": -2.7986207008361816,
775
  "step": 255
776
  },
777
  {
778
+ "epoch": 1.6666666666666665,
779
+ "grad_norm": 12.79380674096048,
780
+ "learning_rate": 8.952702702702702e-08,
781
+ "logits/chosen": -1.9738280773162842,
782
+ "logits/rejected": -1.810156226158142,
783
+ "logps/chosen": -224.6687469482422,
784
+ "logps/rejected": -420.2437438964844,
785
+ "loss": 0.2571,
786
+ "rewards/accuracies": 0.9375,
787
+ "rewards/chosen": 1.805883765220642,
788
+ "rewards/margins": 3.4756011962890625,
789
+ "rewards/rejected": -1.672705054283142,
790
  "step": 260
791
  },
792
  {
793
+ "epoch": 1.6987179487179487,
794
+ "grad_norm": 5.288061215805156,
795
+ "learning_rate": 8.108108108108108e-08,
796
+ "logits/chosen": -2.00390625,
797
+ "logits/rejected": -1.875585913658142,
798
+ "logps/chosen": -238.39688110351562,
799
+ "logps/rejected": -330.8125,
800
+ "loss": 0.198,
801
+ "rewards/accuracies": 0.925000011920929,
802
+ "rewards/chosen": 2.1185302734375,
803
+ "rewards/margins": 4.246289253234863,
804
+ "rewards/rejected": -2.1275634765625,
805
  "step": 265
806
  },
807
  {
808
+ "epoch": 1.7307692307692308,
809
+ "grad_norm": 15.41954334878335,
810
+ "learning_rate": 7.263513513513512e-08,
811
+ "logits/chosen": -2.037890672683716,
812
+ "logits/rejected": -1.8224608898162842,
813
+ "logps/chosen": -284.765625,
814
+ "logps/rejected": -533.375,
815
+ "loss": 0.2335,
816
+ "rewards/accuracies": 0.893750011920929,
817
+ "rewards/chosen": 1.4149596691131592,
818
+ "rewards/margins": 4.398095607757568,
819
+ "rewards/rejected": -2.9843382835388184,
820
  "step": 270
821
  },
822
  {
823
+ "epoch": 1.7628205128205128,
824
+ "grad_norm": 7.763713271113468,
825
+ "learning_rate": 6.418918918918918e-08,
826
+ "logits/chosen": -2.0380859375,
827
+ "logits/rejected": -1.8134765625,
828
+ "logps/chosen": -225.25,
829
+ "logps/rejected": -430.89373779296875,
830
+ "loss": 0.2135,
831
+ "rewards/accuracies": 0.925000011920929,
832
+ "rewards/chosen": 1.960363745689392,
833
+ "rewards/margins": 4.0166015625,
834
+ "rewards/rejected": -2.0557618141174316,
835
  "step": 275
836
  },
837
  {
838
+ "epoch": 1.7948717948717947,
839
+ "grad_norm": 4.521033115765149,
840
+ "learning_rate": 5.574324324324324e-08,
841
+ "logits/chosen": -2.044140577316284,
842
+ "logits/rejected": -1.811914086341858,
843
+ "logps/chosen": -144.3640594482422,
844
+ "logps/rejected": -276.8843688964844,
845
+ "loss": 0.2559,
846
+ "rewards/accuracies": 0.8812500238418579,
847
+ "rewards/chosen": 2.030041456222534,
848
+ "rewards/margins": 3.295703172683716,
849
+ "rewards/rejected": -1.2666351795196533,
850
  "step": 280
851
  },
852
  {
853
+ "epoch": 1.8269230769230769,
854
+ "grad_norm": 9.52459220706363,
855
+ "learning_rate": 4.72972972972973e-08,
856
+ "logits/chosen": -2.127734422683716,
857
+ "logits/rejected": -1.897070288658142,
858
+ "logps/chosen": -280.30157470703125,
859
+ "logps/rejected": -316.5171813964844,
860
+ "loss": 0.2486,
861
+ "rewards/accuracies": 0.887499988079071,
862
+ "rewards/chosen": 1.485009789466858,
863
+ "rewards/margins": 3.428417921066284,
864
+ "rewards/rejected": -1.9419434070587158,
865
  "step": 285
866
  },
867
  {
868
+ "epoch": 1.858974358974359,
869
+ "grad_norm": 8.906979076376345,
870
+ "learning_rate": 3.885135135135135e-08,
871
+ "logits/chosen": -2.0869140625,
872
+ "logits/rejected": -1.8507812023162842,
873
+ "logps/chosen": -217.7156219482422,
874
+ "logps/rejected": -319.4203186035156,
875
+ "loss": 0.2271,
876
+ "rewards/accuracies": 0.8812500238418579,
877
+ "rewards/chosen": 2.053515672683716,
878
+ "rewards/margins": 4.0621337890625,
879
+ "rewards/rejected": -2.010498046875,
880
  "step": 290
881
  },
882
  {
883
+ "epoch": 1.891025641025641,
884
+ "grad_norm": 9.745814847980592,
885
+ "learning_rate": 3.040540540540541e-08,
886
+ "logits/chosen": -2.052539110183716,
887
+ "logits/rejected": -1.8537108898162842,
888
+ "logps/chosen": -241.68905639648438,
889
+ "logps/rejected": -401.51251220703125,
890
+ "loss": 0.2465,
891
+ "rewards/accuracies": 0.90625,
892
+ "rewards/chosen": 1.9554443359375,
893
+ "rewards/margins": 3.714062452316284,
894
+ "rewards/rejected": -1.7587372064590454,
895
  "step": 295
896
  },
897
  {
898
+ "epoch": 1.9230769230769231,
899
+ "grad_norm": 10.279258989880054,
900
+ "learning_rate": 2.195945945945946e-08,
901
+ "logits/chosen": -1.91796875,
902
+ "logits/rejected": -1.7571289539337158,
903
+ "logps/chosen": -250.7375030517578,
904
+ "logps/rejected": -312.2203063964844,
905
+ "loss": 0.1891,
906
+ "rewards/accuracies": 0.9624999761581421,
907
+ "rewards/chosen": 2.004687547683716,
908
+ "rewards/margins": 4.478320121765137,
909
+ "rewards/rejected": -2.476269483566284,
910
  "step": 300
911
  },
912
  {
913
+ "epoch": 1.9551282051282053,
914
+ "grad_norm": 6.84223500138646,
915
+ "learning_rate": 1.3513513513513514e-08,
916
+ "logits/chosen": -2.0062499046325684,
917
+ "logits/rejected": -1.889062523841858,
918
+ "logps/chosen": -267.1031188964844,
919
+ "logps/rejected": -407.0718688964844,
920
+ "loss": 0.2554,
921
+ "rewards/accuracies": 0.90625,
922
+ "rewards/chosen": 1.6201751232147217,
923
+ "rewards/margins": 3.678515672683716,
924
+ "rewards/rejected": -2.054980516433716,
925
  "step": 305
926
  },
927
  {
928
+ "epoch": 1.9871794871794872,
929
+ "grad_norm": 6.084443681112896,
930
+ "learning_rate": 5.067567567567567e-09,
931
+ "logits/chosen": -2.08203125,
932
+ "logits/rejected": -1.8772461414337158,
933
+ "logps/chosen": -248.9656219482422,
934
+ "logps/rejected": -302.21875,
935
+ "loss": 0.2517,
936
+ "rewards/accuracies": 0.8812500238418579,
937
+ "rewards/chosen": 1.859375,
938
+ "rewards/margins": 3.5653076171875,
939
+ "rewards/rejected": -1.707067847251892,
940
  "step": 310
941
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
942
  {
943
  "epoch": 2.0,
944
+ "step": 312,
945
  "total_flos": 0.0,
946
+ "train_loss": 0.2867361557407257,
947
+ "train_runtime": 4280.6612,
948
+ "train_samples_per_second": 2.331,
949
+ "train_steps_per_second": 0.073
950
  }
951
  ],
952
  "logging_steps": 5,
953
+ "max_steps": 312,
954
  "num_input_tokens_seen": 0,
955
  "num_train_epochs": 2,
956
  "save_steps": 50,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f726cd8e3d8c89b2f6e027abd219a0a60b3a1f5446c82bda65ba8183d52ff7e9
3
  size 7800
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:723f7ffc4922b75a0710d0a8e53e8aeebe78b402100ccee8397bbeee25c1df51
3
  size 7800