Muhammed164 commited on
Commit
5efde56
·
verified ·
1 Parent(s): bbb9b53

Training in progress, step 200, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:268aa3d2814a792a1ce12fc0ee5a43e0bc3f4dfbe66bca24ad57492c892f8b91
3
  size 204500912
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:065322e97e075055ae2c6bcbf10fdfffbac7dd29ef45906fca7a9bacc7abec43
3
  size 204500912
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d14bdbb174576769aa6486b61934c2015edc41a72d409074143c0b546c4f989b
3
- size 104062923
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c0e97a95edb102b68426558e6f6306ffd55d9a0ba3fc011ab3b76edaea3a99e3
3
+ size 104062731
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3d6d8fafcd1ee268414be5acf0366296af5b03d60871978712eac1979cb42d65
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7c800b778fa7e115e4c34de8529902de8b61c9a1b4bab3eb8295d06dafff030e
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bf3f9c1ea54f8f95e6812b6b4e99596105233cd3e123554db760e4aba93f83e4
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fb9a36b5dff54d4dc680b7c932dff5afaac16543707cbf68dd86d83d274f369f
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,1518 +2,318 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 2.6540378863409773,
6
  "eval_steps": 500,
7
- "global_step": 1000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 0.026586905948820207,
14
- "grad_norm": 92.03909301757812,
15
- "learning_rate": 9e-08,
16
- "logits/chosen": 1.8763988018035889,
17
- "logits/rejected": 2.256129264831543,
18
- "logps/chosen": -180.8492431640625,
19
- "logps/rejected": -294.6668395996094,
20
- "loss": 16.764971923828124,
21
  "rewards/accuracies": 0.643750011920929,
22
- "rewards/chosen": 50.876712799072266,
23
- "rewards/margins": 20.225709915161133,
24
- "rewards/rejected": 30.651004791259766,
25
  "step": 10
26
  },
27
  {
28
  "epoch": 0.053173811897640415,
29
- "grad_norm": 72.75655364990234,
30
- "learning_rate": 1.8999999999999998e-07,
31
- "logits/chosen": 2.2195005416870117,
32
- "logits/rejected": 2.3702588081359863,
33
- "logps/chosen": -199.29591369628906,
34
- "logps/rejected": -293.90887451171875,
35
- "loss": 14.240003967285157,
36
- "rewards/accuracies": 0.625,
37
- "rewards/chosen": 53.17363357543945,
38
- "rewards/margins": 24.12602996826172,
39
- "rewards/rejected": 29.0476016998291,
40
  "step": 20
41
  },
42
  {
43
  "epoch": 0.07976071784646062,
44
- "grad_norm": 101.80017852783203,
45
- "learning_rate": 2.9e-07,
46
- "logits/chosen": 2.360567569732666,
47
- "logits/rejected": 2.484600305557251,
48
- "logps/chosen": -201.53787231445312,
49
- "logps/rejected": -286.9433898925781,
50
- "loss": 13.708811950683593,
51
- "rewards/accuracies": 0.6875,
52
- "rewards/chosen": 59.95117950439453,
53
- "rewards/margins": 25.14548110961914,
54
- "rewards/rejected": 34.805702209472656,
55
  "step": 30
56
  },
57
  {
58
  "epoch": 0.10634762379528083,
59
- "grad_norm": 85.12960052490234,
60
- "learning_rate": 3.8999999999999997e-07,
61
- "logits/chosen": 1.8842649459838867,
62
- "logits/rejected": 2.0478363037109375,
63
- "logps/chosen": -178.1483917236328,
64
- "logps/rejected": -285.0755920410156,
65
- "loss": 17.000025939941406,
66
- "rewards/accuracies": 0.5874999761581421,
67
- "rewards/chosen": 50.454071044921875,
68
- "rewards/margins": 17.00231170654297,
69
- "rewards/rejected": 33.451759338378906,
70
  "step": 40
71
  },
72
  {
73
  "epoch": 0.13293452974410103,
74
- "grad_norm": 33.85184097290039,
75
- "learning_rate": 4.9e-07,
76
- "logits/chosen": 2.229463577270508,
77
- "logits/rejected": 2.204373836517334,
78
- "logps/chosen": -212.55178833007812,
79
- "logps/rejected": -280.9806213378906,
80
- "loss": 18.418368530273437,
81
- "rewards/accuracies": 0.625,
82
- "rewards/chosen": 57.14277267456055,
83
- "rewards/margins": 21.43265151977539,
84
- "rewards/rejected": 35.710121154785156,
85
  "step": 50
86
  },
87
  {
88
  "epoch": 0.15952143569292124,
89
- "grad_norm": 61.26063537597656,
90
- "learning_rate": 5.9e-07,
91
- "logits/chosen": 2.045487403869629,
92
- "logits/rejected": 2.2564761638641357,
93
- "logps/chosen": -183.6549072265625,
94
- "logps/rejected": -311.9967956542969,
95
- "loss": 12.530684661865234,
96
- "rewards/accuracies": 0.6312500238418579,
97
- "rewards/chosen": 40.03901672363281,
98
- "rewards/margins": 22.57087516784668,
99
- "rewards/rejected": 17.468143463134766,
100
  "step": 60
101
  },
102
  {
103
  "epoch": 0.18610834164174145,
104
- "grad_norm": 65.56060791015625,
105
- "learning_rate": 6.9e-07,
106
- "logits/chosen": 2.3583855628967285,
107
- "logits/rejected": 2.518134593963623,
108
- "logps/chosen": -215.484375,
109
- "logps/rejected": -292.7709045410156,
110
- "loss": 17.06499786376953,
111
- "rewards/accuracies": 0.581250011920929,
112
- "rewards/chosen": 54.328125,
113
- "rewards/margins": 18.9035587310791,
114
- "rewards/rejected": 35.42456817626953,
115
  "step": 70
116
  },
117
  {
118
  "epoch": 0.21269524759056166,
119
- "grad_norm": 78.31404876708984,
120
- "learning_rate": 7.9e-07,
121
- "logits/chosen": 2.389976978302002,
122
- "logits/rejected": 2.5061419010162354,
123
- "logps/chosen": -199.54867553710938,
124
- "logps/rejected": -313.9349670410156,
125
- "loss": 14.476513671875,
126
- "rewards/accuracies": 0.668749988079071,
127
- "rewards/chosen": 58.9052619934082,
128
- "rewards/margins": 28.68975830078125,
129
- "rewards/rejected": 30.215505599975586,
130
  "step": 80
131
  },
132
  {
133
  "epoch": 0.23928215353938184,
134
- "grad_norm": 55.09129333496094,
135
- "learning_rate": 8.9e-07,
136
- "logits/chosen": 2.343313694000244,
137
- "logits/rejected": 2.381267547607422,
138
- "logps/chosen": -195.31007385253906,
139
- "logps/rejected": -315.2503356933594,
140
- "loss": 13.177040100097656,
141
- "rewards/accuracies": 0.6000000238418579,
142
- "rewards/chosen": 62.564231872558594,
143
- "rewards/margins": 32.514305114746094,
144
- "rewards/rejected": 30.049936294555664,
145
  "step": 90
146
  },
147
  {
148
  "epoch": 0.26586905948820205,
149
- "grad_norm": 94.35275268554688,
150
- "learning_rate": 9.9e-07,
151
- "logits/chosen": 2.3925650119781494,
152
- "logits/rejected": 2.607084274291992,
153
- "logps/chosen": -189.29811096191406,
154
- "logps/rejected": -319.96844482421875,
155
- "loss": 18.26203155517578,
156
- "rewards/accuracies": 0.574999988079071,
157
- "rewards/chosen": 50.321632385253906,
158
- "rewards/margins": 22.894689559936523,
159
- "rewards/rejected": 27.426937103271484,
160
  "step": 100
161
  },
162
  {
163
  "epoch": 0.2924559654370223,
164
- "grad_norm": 93.89908599853516,
165
- "learning_rate": 9.9e-07,
166
- "logits/chosen": 2.4763197898864746,
167
- "logits/rejected": 2.6758036613464355,
168
- "logps/chosen": -187.5391082763672,
169
- "logps/rejected": -340.25250244140625,
170
- "loss": 10.57765884399414,
171
- "rewards/accuracies": 0.6875,
172
- "rewards/chosen": 58.184059143066406,
173
- "rewards/margins": 36.878929138183594,
174
- "rewards/rejected": 21.305124282836914,
175
  "step": 110
176
  },
177
  {
178
  "epoch": 0.3190428713858425,
179
- "grad_norm": 91.25633239746094,
180
- "learning_rate": 9.788888888888889e-07,
181
- "logits/chosen": 2.5278210639953613,
182
- "logits/rejected": 2.6886465549468994,
183
- "logps/chosen": -205.1584014892578,
184
- "logps/rejected": -349.90093994140625,
185
- "loss": 13.945356750488282,
186
- "rewards/accuracies": 0.6499999761581421,
187
- "rewards/chosen": 46.65082550048828,
188
- "rewards/margins": 27.151325225830078,
189
- "rewards/rejected": 19.499500274658203,
190
  "step": 120
191
  },
192
  {
193
  "epoch": 0.34562977733466266,
194
- "grad_norm": 96.81977844238281,
195
- "learning_rate": 9.677777777777777e-07,
196
- "logits/chosen": 3.0266711711883545,
197
- "logits/rejected": 3.194408416748047,
198
- "logps/chosen": -198.1504669189453,
199
- "logps/rejected": -356.78485107421875,
200
- "loss": 15.321591186523438,
201
- "rewards/accuracies": 0.6312500238418579,
202
- "rewards/chosen": 58.57115936279297,
203
- "rewards/margins": 34.659385681152344,
204
- "rewards/rejected": 23.911775588989258,
205
  "step": 130
206
  },
207
  {
208
  "epoch": 0.3722166832834829,
209
- "grad_norm": 93.63339233398438,
210
- "learning_rate": 9.566666666666667e-07,
211
- "logits/chosen": 3.055471181869507,
212
- "logits/rejected": 3.145911455154419,
213
- "logps/chosen": -219.0845184326172,
214
- "logps/rejected": -345.9827880859375,
215
- "loss": 13.172528076171876,
216
- "rewards/accuracies": 0.6499999761581421,
217
- "rewards/chosen": 55.67122268676758,
218
- "rewards/margins": 32.06965637207031,
219
- "rewards/rejected": 23.6015682220459,
220
  "step": 140
221
  },
222
  {
223
  "epoch": 0.3988035892323031,
224
- "grad_norm": 73.2032699584961,
225
- "learning_rate": 9.455555555555556e-07,
226
- "logits/chosen": 2.777052640914917,
227
- "logits/rejected": 2.8150634765625,
228
- "logps/chosen": -197.19174194335938,
229
- "logps/rejected": -374.73822021484375,
230
- "loss": 15.409014892578124,
231
- "rewards/accuracies": 0.643750011920929,
232
- "rewards/chosen": 48.60415267944336,
233
- "rewards/margins": 28.52492332458496,
234
- "rewards/rejected": 20.079227447509766,
235
  "step": 150
236
  },
237
  {
238
  "epoch": 0.4253904951811233,
239
- "grad_norm": 66.92320251464844,
240
- "learning_rate": 9.344444444444444e-07,
241
- "logits/chosen": 2.996166467666626,
242
- "logits/rejected": 3.1385650634765625,
243
- "logps/chosen": -212.75375366210938,
244
- "logps/rejected": -371.66522216796875,
245
- "loss": 10.577291870117188,
246
- "rewards/accuracies": 0.6937500238418579,
247
- "rewards/chosen": 63.399871826171875,
248
- "rewards/margins": 42.74666213989258,
249
- "rewards/rejected": 20.653209686279297,
250
  "step": 160
251
  },
252
  {
253
  "epoch": 0.4519774011299435,
254
- "grad_norm": 64.92620849609375,
255
- "learning_rate": 9.233333333333333e-07,
256
- "logits/chosen": 2.832219362258911,
257
- "logits/rejected": 3.1428098678588867,
258
- "logps/chosen": -196.91094970703125,
259
- "logps/rejected": -397.47064208984375,
260
- "loss": 12.637787628173829,
261
- "rewards/accuracies": 0.699999988079071,
262
- "rewards/chosen": 54.22844314575195,
263
- "rewards/margins": 42.87944793701172,
264
- "rewards/rejected": 11.348990440368652,
265
  "step": 170
266
  },
267
  {
268
  "epoch": 0.4785643070787637,
269
- "grad_norm": 88.73342895507812,
270
- "learning_rate": 9.122222222222222e-07,
271
- "logits/chosen": 3.001598358154297,
272
- "logits/rejected": 3.18257737159729,
273
- "logps/chosen": -204.97628784179688,
274
- "logps/rejected": -451.9921875,
275
- "loss": 10.512740325927734,
276
- "rewards/accuracies": 0.699999988079071,
277
- "rewards/chosen": 49.384117126464844,
278
- "rewards/margins": 53.00005340576172,
279
- "rewards/rejected": -3.6159355640411377,
280
  "step": 180
281
  },
282
  {
283
  "epoch": 0.5051512130275839,
284
- "grad_norm": 97.91619110107422,
285
- "learning_rate": 9.01111111111111e-07,
286
- "logits/chosen": 2.735273599624634,
287
- "logits/rejected": 2.9921531677246094,
288
- "logps/chosen": -185.69210815429688,
289
- "logps/rejected": -439.780029296875,
290
- "loss": 7.097893524169922,
291
- "rewards/accuracies": 0.75,
292
- "rewards/chosen": 51.99469757080078,
293
- "rewards/margins": 52.051963806152344,
294
- "rewards/rejected": -0.057262420654296875,
295
  "step": 190
296
  },
297
  {
298
  "epoch": 0.5317381189764041,
299
- "grad_norm": 75.27459716796875,
300
- "learning_rate": 8.9e-07,
301
- "logits/chosen": 3.015864610671997,
302
- "logits/rejected": 3.321819305419922,
303
- "logps/chosen": -192.43789672851562,
304
- "logps/rejected": -471.51055908203125,
305
- "loss": 11.397718048095703,
306
- "rewards/accuracies": 0.7124999761581421,
307
- "rewards/chosen": 49.085548400878906,
308
- "rewards/margins": 58.6345329284668,
309
- "rewards/rejected": -9.54898452758789,
310
- "step": 200
311
- },
312
- {
313
- "epoch": 0.5583250249252243,
314
- "grad_norm": 67.41759490966797,
315
- "learning_rate": 8.788888888888889e-07,
316
- "logits/chosen": 3.120459794998169,
317
- "logits/rejected": 3.3150908946990967,
318
- "logps/chosen": -201.68368530273438,
319
- "logps/rejected": -479.9122619628906,
320
- "loss": 9.525629425048828,
321
- "rewards/accuracies": 0.75,
322
- "rewards/chosen": 49.86951446533203,
323
- "rewards/margins": 59.1157112121582,
324
- "rewards/rejected": -9.246195793151855,
325
- "step": 210
326
- },
327
- {
328
- "epoch": 0.5849119308740446,
329
- "grad_norm": 89.28022003173828,
330
- "learning_rate": 8.677777777777777e-07,
331
- "logits/chosen": 3.2067043781280518,
332
- "logits/rejected": 3.2518234252929688,
333
- "logps/chosen": -198.66351318359375,
334
- "logps/rejected": -459.51287841796875,
335
- "loss": 11.018878936767578,
336
- "rewards/accuracies": 0.6937500238418579,
337
- "rewards/chosen": 52.5185432434082,
338
- "rewards/margins": 51.389251708984375,
339
- "rewards/rejected": 1.129294991493225,
340
- "step": 220
341
- },
342
- {
343
- "epoch": 0.6114988368228648,
344
- "grad_norm": 57.3789176940918,
345
- "learning_rate": 8.566666666666667e-07,
346
- "logits/chosen": 3.428664445877075,
347
- "logits/rejected": 3.6689727306365967,
348
- "logps/chosen": -188.14273071289062,
349
- "logps/rejected": -484.1524963378906,
350
- "loss": 8.581022644042969,
351
- "rewards/accuracies": 0.7124999761581421,
352
- "rewards/chosen": 49.485206604003906,
353
- "rewards/margins": 54.331947326660156,
354
- "rewards/rejected": -4.846745491027832,
355
- "step": 230
356
- },
357
- {
358
- "epoch": 0.638085742771685,
359
- "grad_norm": 78.1611099243164,
360
- "learning_rate": 8.455555555555555e-07,
361
- "logits/chosen": 3.273719072341919,
362
- "logits/rejected": 3.5895423889160156,
363
- "logps/chosen": -198.40890502929688,
364
- "logps/rejected": -568.0107421875,
365
- "loss": 8.444003295898437,
366
- "rewards/accuracies": 0.793749988079071,
367
- "rewards/chosen": 55.26072311401367,
368
- "rewards/margins": 84.45598602294922,
369
- "rewards/rejected": -29.195270538330078,
370
- "step": 240
371
- },
372
- {
373
- "epoch": 0.6646726487205051,
374
- "grad_norm": 87.57330322265625,
375
- "learning_rate": 8.344444444444444e-07,
376
- "logits/chosen": 3.921356201171875,
377
- "logits/rejected": 4.107032775878906,
378
- "logps/chosen": -219.56887817382812,
379
- "logps/rejected": -529.9613647460938,
380
- "loss": 11.188172912597656,
381
- "rewards/accuracies": 0.7437499761581421,
382
- "rewards/chosen": 63.129974365234375,
383
- "rewards/margins": 69.01350402832031,
384
- "rewards/rejected": -5.883524417877197,
385
- "step": 250
386
- },
387
- {
388
- "epoch": 0.6912595546693253,
389
- "grad_norm": 78.89329528808594,
390
- "learning_rate": 8.233333333333333e-07,
391
- "logits/chosen": 4.225001335144043,
392
- "logits/rejected": 4.386289119720459,
393
- "logps/chosen": -241.30734252929688,
394
- "logps/rejected": -510.9679260253906,
395
- "loss": 10.638973236083984,
396
- "rewards/accuracies": 0.7437499761581421,
397
- "rewards/chosen": 55.27728271484375,
398
- "rewards/margins": 60.547119140625,
399
- "rewards/rejected": -5.269834041595459,
400
- "step": 260
401
- },
402
- {
403
- "epoch": 0.7178464606181456,
404
- "grad_norm": 88.03643798828125,
405
- "learning_rate": 8.122222222222221e-07,
406
- "logits/chosen": 3.850262403488159,
407
- "logits/rejected": 4.041484832763672,
408
- "logps/chosen": -241.4730987548828,
409
- "logps/rejected": -527.6182250976562,
410
- "loss": 11.718121337890626,
411
- "rewards/accuracies": 0.699999988079071,
412
- "rewards/chosen": 48.15225601196289,
413
- "rewards/margins": 54.50339889526367,
414
- "rewards/rejected": -6.351143836975098,
415
- "step": 270
416
- },
417
- {
418
- "epoch": 0.7444333665669658,
419
- "grad_norm": 161.95907592773438,
420
- "learning_rate": 8.01111111111111e-07,
421
- "logits/chosen": 3.8429579734802246,
422
- "logits/rejected": 4.090743064880371,
423
- "logps/chosen": -197.0509033203125,
424
- "logps/rejected": -592.664794921875,
425
- "loss": 8.627317810058594,
426
- "rewards/accuracies": 0.762499988079071,
427
- "rewards/chosen": 44.076698303222656,
428
- "rewards/margins": 71.58090209960938,
429
- "rewards/rejected": -27.504215240478516,
430
- "step": 280
431
- },
432
- {
433
- "epoch": 0.771020272515786,
434
- "grad_norm": 59.20638656616211,
435
- "learning_rate": 7.9e-07,
436
- "logits/chosen": 3.9963154792785645,
437
- "logits/rejected": 4.247437477111816,
438
- "logps/chosen": -198.98069763183594,
439
- "logps/rejected": -636.8574829101562,
440
- "loss": 7.916163635253906,
441
- "rewards/accuracies": 0.8062499761581421,
442
- "rewards/chosen": 45.44010543823242,
443
- "rewards/margins": 82.26910400390625,
444
- "rewards/rejected": -36.82899475097656,
445
- "step": 290
446
- },
447
- {
448
- "epoch": 0.7976071784646062,
449
- "grad_norm": 3.873155947076157e-05,
450
- "learning_rate": 7.788888888888889e-07,
451
- "logits/chosen": 3.5579922199249268,
452
- "logits/rejected": 3.9878501892089844,
453
- "logps/chosen": -208.97335815429688,
454
- "logps/rejected": -655.0528564453125,
455
- "loss": 6.305292129516602,
456
- "rewards/accuracies": 0.8187500238418579,
457
- "rewards/chosen": 36.67095184326172,
458
- "rewards/margins": 88.25028991699219,
459
- "rewards/rejected": -51.57932662963867,
460
- "step": 300
461
- },
462
- {
463
- "epoch": 0.8241940844134263,
464
- "grad_norm": 101.79195404052734,
465
- "learning_rate": 7.677777777777778e-07,
466
- "logits/chosen": 4.391497611999512,
467
- "logits/rejected": 4.607339382171631,
468
- "logps/chosen": -249.1584930419922,
469
- "logps/rejected": -639.23974609375,
470
- "loss": 8.43834228515625,
471
- "rewards/accuracies": 0.78125,
472
- "rewards/chosen": 43.91454315185547,
473
- "rewards/margins": 78.60186004638672,
474
- "rewards/rejected": -34.68731689453125,
475
- "step": 310
476
- },
477
- {
478
- "epoch": 0.8507809903622466,
479
- "grad_norm": 91.69438171386719,
480
- "learning_rate": 7.566666666666667e-07,
481
- "logits/chosen": 4.2728271484375,
482
- "logits/rejected": 4.45902156829834,
483
- "logps/chosen": -242.8008575439453,
484
- "logps/rejected": -614.6475830078125,
485
- "loss": 8.622640228271484,
486
- "rewards/accuracies": 0.793749988079071,
487
- "rewards/chosen": 51.20795440673828,
488
- "rewards/margins": 79.56620788574219,
489
- "rewards/rejected": -28.358264923095703,
490
- "step": 320
491
- },
492
- {
493
- "epoch": 0.8773678963110668,
494
- "grad_norm": 64.15619659423828,
495
- "learning_rate": 7.455555555555555e-07,
496
- "logits/chosen": 4.106622219085693,
497
- "logits/rejected": 4.50801944732666,
498
- "logps/chosen": -230.63919067382812,
499
- "logps/rejected": -676.0430908203125,
500
- "loss": 5.273190307617187,
501
- "rewards/accuracies": 0.793749988079071,
502
- "rewards/chosen": 40.392784118652344,
503
- "rewards/margins": 91.14765930175781,
504
- "rewards/rejected": -50.75487518310547,
505
- "step": 330
506
- },
507
- {
508
- "epoch": 0.903954802259887,
509
- "grad_norm": 97.1626205444336,
510
- "learning_rate": 7.344444444444444e-07,
511
- "logits/chosen": 4.293347358703613,
512
- "logits/rejected": 4.595992088317871,
513
- "logps/chosen": -215.70938110351562,
514
- "logps/rejected": -666.0777587890625,
515
- "loss": 5.9850719451904295,
516
- "rewards/accuracies": 0.762499988079071,
517
- "rewards/chosen": 44.4474983215332,
518
- "rewards/margins": 83.35242462158203,
519
- "rewards/rejected": -38.90492630004883,
520
- "step": 340
521
- },
522
- {
523
- "epoch": 0.9305417082087072,
524
- "grad_norm": 85.94694519042969,
525
- "learning_rate": 7.233333333333333e-07,
526
- "logits/chosen": 4.838589191436768,
527
- "logits/rejected": 5.159350872039795,
528
- "logps/chosen": -264.10711669921875,
529
- "logps/rejected": -796.7295532226562,
530
- "loss": 3.4507820129394533,
531
- "rewards/accuracies": 0.856249988079071,
532
- "rewards/chosen": 42.16107940673828,
533
- "rewards/margins": 113.5744857788086,
534
- "rewards/rejected": -71.41340637207031,
535
- "step": 350
536
- },
537
- {
538
- "epoch": 0.9571286141575274,
539
- "grad_norm": 236.06187438964844,
540
- "learning_rate": 7.122222222222221e-07,
541
- "logits/chosen": 4.589522361755371,
542
- "logits/rejected": 4.999955177307129,
543
- "logps/chosen": -258.4150695800781,
544
- "logps/rejected": -727.7996826171875,
545
- "loss": 6.599867248535157,
546
- "rewards/accuracies": 0.8062499761581421,
547
- "rewards/chosen": 35.76854705810547,
548
- "rewards/margins": 92.19223022460938,
549
- "rewards/rejected": -56.42368698120117,
550
- "step": 360
551
- },
552
- {
553
- "epoch": 0.9837155201063477,
554
- "grad_norm": 100.6878662109375,
555
- "learning_rate": 7.01111111111111e-07,
556
- "logits/chosen": 5.191050052642822,
557
- "logits/rejected": 5.309014320373535,
558
- "logps/chosen": -296.0009460449219,
559
- "logps/rejected": -759.0372924804688,
560
- "loss": 3.534566116333008,
561
- "rewards/accuracies": 0.856249988079071,
562
- "rewards/chosen": 38.79043960571289,
563
- "rewards/margins": 100.0891342163086,
564
- "rewards/rejected": -61.29869842529297,
565
- "step": 370
566
- },
567
- {
568
- "epoch": 1.007976071784646,
569
- "grad_norm": 71.57064819335938,
570
- "learning_rate": 6.9e-07,
571
- "logits/chosen": 5.157181262969971,
572
- "logits/rejected": 5.5708699226379395,
573
- "logps/chosen": -288.7073059082031,
574
- "logps/rejected": -775.53173828125,
575
- "loss": 2.5931621551513673,
576
- "rewards/accuracies": 0.8698630332946777,
577
- "rewards/chosen": 35.64141845703125,
578
- "rewards/margins": 96.54639434814453,
579
- "rewards/rejected": -60.904972076416016,
580
- "step": 380
581
- },
582
- {
583
- "epoch": 1.0345629777334662,
584
- "grad_norm": 80.57012939453125,
585
- "learning_rate": 6.788888888888889e-07,
586
- "logits/chosen": 4.908313274383545,
587
- "logits/rejected": 5.281552791595459,
588
- "logps/chosen": -255.53994750976562,
589
- "logps/rejected": -876.7796630859375,
590
- "loss": 3.100166130065918,
591
- "rewards/accuracies": 0.8999999761581421,
592
- "rewards/chosen": 29.122013092041016,
593
- "rewards/margins": 121.50981140136719,
594
- "rewards/rejected": -92.38780212402344,
595
- "step": 390
596
- },
597
- {
598
- "epoch": 1.0611498836822866,
599
- "grad_norm": 62.662662506103516,
600
- "learning_rate": 6.677777777777778e-07,
601
- "logits/chosen": 5.439974308013916,
602
- "logits/rejected": 5.852138519287109,
603
- "logps/chosen": -291.69049072265625,
604
- "logps/rejected": -910.0514526367188,
605
- "loss": 3.5051338195800783,
606
- "rewards/accuracies": 0.887499988079071,
607
- "rewards/chosen": 25.592533111572266,
608
- "rewards/margins": 119.5553207397461,
609
- "rewards/rejected": -93.96280670166016,
610
- "step": 400
611
- },
612
- {
613
- "epoch": 1.0877367896311068,
614
- "grad_norm": 178.87042236328125,
615
- "learning_rate": 6.566666666666666e-07,
616
- "logits/chosen": 5.411637783050537,
617
- "logits/rejected": 5.973018646240234,
618
- "logps/chosen": -308.68487548828125,
619
- "logps/rejected": -894.93359375,
620
- "loss": 1.9858436584472656,
621
- "rewards/accuracies": 0.8687499761581421,
622
- "rewards/chosen": 25.565839767456055,
623
- "rewards/margins": 120.69632720947266,
624
- "rewards/rejected": -95.13047790527344,
625
- "step": 410
626
- },
627
- {
628
- "epoch": 1.114323695579927,
629
- "grad_norm": 5.069334747531684e-07,
630
- "learning_rate": 6.455555555555555e-07,
631
- "logits/chosen": 5.404343605041504,
632
- "logits/rejected": 5.771250247955322,
633
- "logps/chosen": -301.6144104003906,
634
- "logps/rejected": -933.3482666015625,
635
- "loss": 2.1804153442382814,
636
- "rewards/accuracies": 0.893750011920929,
637
- "rewards/chosen": 25.12398910522461,
638
- "rewards/margins": 123.10489654541016,
639
- "rewards/rejected": -97.98091125488281,
640
- "step": 420
641
- },
642
- {
643
- "epoch": 1.1409106015287471,
644
- "grad_norm": 42.221588134765625,
645
- "learning_rate": 6.344444444444444e-07,
646
- "logits/chosen": 5.612006187438965,
647
- "logits/rejected": 6.014307022094727,
648
- "logps/chosen": -324.22314453125,
649
- "logps/rejected": -896.99658203125,
650
- "loss": 2.2177616119384767,
651
- "rewards/accuracies": 0.875,
652
- "rewards/chosen": 22.134746551513672,
653
- "rewards/margins": 109.2309341430664,
654
- "rewards/rejected": -87.09618377685547,
655
- "step": 430
656
- },
657
- {
658
- "epoch": 1.1674975074775673,
659
- "grad_norm": 26.945816040039062,
660
- "learning_rate": 6.233333333333332e-07,
661
- "logits/chosen": 5.816843509674072,
662
- "logits/rejected": 6.254372596740723,
663
- "logps/chosen": -350.730712890625,
664
- "logps/rejected": -915.1998291015625,
665
- "loss": 1.6761651992797852,
666
- "rewards/accuracies": 0.887499988079071,
667
- "rewards/chosen": 27.9744873046875,
668
- "rewards/margins": 122.52757263183594,
669
- "rewards/rejected": -94.55308532714844,
670
- "step": 440
671
- },
672
- {
673
- "epoch": 1.1940844134263875,
674
- "grad_norm": 201.8599853515625,
675
- "learning_rate": 6.122222222222222e-07,
676
- "logits/chosen": 5.693143844604492,
677
- "logits/rejected": 6.162411689758301,
678
- "logps/chosen": -299.4298095703125,
679
- "logps/rejected": -959.1011962890625,
680
- "loss": 2.2222429275512696,
681
- "rewards/accuracies": 0.90625,
682
- "rewards/chosen": 22.612167358398438,
683
- "rewards/margins": 126.55992126464844,
684
- "rewards/rejected": -103.94776916503906,
685
- "step": 450
686
- },
687
- {
688
- "epoch": 1.2206713193752077,
689
- "grad_norm": 139.24583435058594,
690
- "learning_rate": 6.011111111111112e-07,
691
- "logits/chosen": 5.940896034240723,
692
- "logits/rejected": 6.417025566101074,
693
- "logps/chosen": -349.1092834472656,
694
- "logps/rejected": -947.8054809570312,
695
- "loss": 1.6525358200073241,
696
- "rewards/accuracies": 0.9125000238418579,
697
- "rewards/chosen": 20.343765258789062,
698
- "rewards/margins": 119.97611999511719,
699
- "rewards/rejected": -99.63237762451172,
700
- "step": 460
701
- },
702
- {
703
- "epoch": 1.2472582253240279,
704
- "grad_norm": 207.5171661376953,
705
- "learning_rate": 5.9e-07,
706
- "logits/chosen": 6.329422950744629,
707
- "logits/rejected": 6.6328911781311035,
708
- "logps/chosen": -353.35308837890625,
709
- "logps/rejected": -962.7057495117188,
710
- "loss": 2.509499740600586,
711
- "rewards/accuracies": 0.8687499761581421,
712
- "rewards/chosen": 18.188751220703125,
713
- "rewards/margins": 119.3648910522461,
714
- "rewards/rejected": -101.1761474609375,
715
- "step": 470
716
- },
717
- {
718
- "epoch": 1.273845131272848,
719
- "grad_norm": 88.39582824707031,
720
- "learning_rate": 5.788888888888889e-07,
721
- "logits/chosen": 6.743104457855225,
722
- "logits/rejected": 7.0454840660095215,
723
- "logps/chosen": -417.47314453125,
724
- "logps/rejected": -1049.904052734375,
725
- "loss": 1.0497099876403808,
726
- "rewards/accuracies": 0.9375,
727
- "rewards/chosen": 14.974113464355469,
728
- "rewards/margins": 132.9074249267578,
729
- "rewards/rejected": -117.93331146240234,
730
- "step": 480
731
- },
732
- {
733
- "epoch": 1.3004320372216682,
734
- "grad_norm": 22.960351943969727,
735
- "learning_rate": 5.677777777777778e-07,
736
- "logits/chosen": 6.46866512298584,
737
- "logits/rejected": 6.957917213439941,
738
- "logps/chosen": -405.8673400878906,
739
- "logps/rejected": -1068.978759765625,
740
- "loss": 1.990153694152832,
741
- "rewards/accuracies": 0.925000011920929,
742
- "rewards/chosen": 11.08216381072998,
743
- "rewards/margins": 135.7135772705078,
744
- "rewards/rejected": -124.63139343261719,
745
- "step": 490
746
- },
747
- {
748
- "epoch": 1.3270189431704886,
749
- "grad_norm": 2.325967418670416e-09,
750
- "learning_rate": 5.566666666666666e-07,
751
- "logits/chosen": 7.048731327056885,
752
- "logits/rejected": 7.304018497467041,
753
- "logps/chosen": -447.47161865234375,
754
- "logps/rejected": -1018.6476440429688,
755
- "loss": 1.1221290588378907,
756
- "rewards/accuracies": 0.9437500238418579,
757
- "rewards/chosen": 12.060667037963867,
758
- "rewards/margins": 117.6533203125,
759
- "rewards/rejected": -105.5926513671875,
760
- "step": 500
761
- },
762
- {
763
- "epoch": 1.3536058491193088,
764
- "grad_norm": 0.007949860766530037,
765
- "learning_rate": 5.455555555555555e-07,
766
- "logits/chosen": 6.571198463439941,
767
- "logits/rejected": 6.9949541091918945,
768
- "logps/chosen": -425.16229248046875,
769
- "logps/rejected": -1075.010009765625,
770
- "loss": 1.1256651878356934,
771
- "rewards/accuracies": 0.918749988079071,
772
- "rewards/chosen": 6.658470153808594,
773
- "rewards/margins": 131.23922729492188,
774
- "rewards/rejected": -124.58077239990234,
775
- "step": 510
776
- },
777
- {
778
- "epoch": 1.380192755068129,
779
- "grad_norm": 162.58592224121094,
780
- "learning_rate": 5.344444444444445e-07,
781
- "logits/chosen": 6.9003005027771,
782
- "logits/rejected": 7.386146545410156,
783
- "logps/chosen": -386.6710510253906,
784
- "logps/rejected": -1033.316650390625,
785
- "loss": 2.134552001953125,
786
- "rewards/accuracies": 0.925000011920929,
787
- "rewards/chosen": 8.298912048339844,
788
- "rewards/margins": 122.3122329711914,
789
- "rewards/rejected": -114.0133056640625,
790
- "step": 520
791
- },
792
- {
793
- "epoch": 1.4067796610169492,
794
- "grad_norm": 2.28546106484373e-08,
795
- "learning_rate": 5.233333333333333e-07,
796
- "logits/chosen": 6.5765581130981445,
797
- "logits/rejected": 7.117588996887207,
798
- "logps/chosen": -364.48895263671875,
799
- "logps/rejected": -1099.998291015625,
800
- "loss": 1.2829697608947754,
801
  "rewards/accuracies": 0.956250011920929,
802
- "rewards/chosen": 8.331583976745605,
803
- "rewards/margins": 137.1312713623047,
804
- "rewards/rejected": -128.7996826171875,
805
- "step": 530
806
- },
807
- {
808
- "epoch": 1.4333665669657694,
809
- "grad_norm": 0.8024188280105591,
810
- "learning_rate": 5.122222222222222e-07,
811
- "logits/chosen": 6.9486517906188965,
812
- "logits/rejected": 7.226126194000244,
813
- "logps/chosen": -452.52410888671875,
814
- "logps/rejected": -1101.6812744140625,
815
- "loss": 0.855518913269043,
816
- "rewards/accuracies": 0.9624999761581421,
817
- "rewards/chosen": 3.314232587814331,
818
- "rewards/margins": 140.82302856445312,
819
- "rewards/rejected": -137.50875854492188,
820
- "step": 540
821
- },
822
- {
823
- "epoch": 1.4599534729145895,
824
- "grad_norm": 283.451416015625,
825
- "learning_rate": 5.011111111111111e-07,
826
- "logits/chosen": 6.747658729553223,
827
- "logits/rejected": 7.270951271057129,
828
- "logps/chosen": -445.3887634277344,
829
- "logps/rejected": -1164.9342041015625,
830
- "loss": 0.7842754364013672,
831
- "rewards/accuracies": 0.9437500238418579,
832
- "rewards/chosen": 1.7988097667694092,
833
- "rewards/margins": 148.59938049316406,
834
- "rewards/rejected": -146.80056762695312,
835
- "step": 550
836
- },
837
- {
838
- "epoch": 1.4865403788634097,
839
- "grad_norm": 6.495264507836457e-20,
840
- "learning_rate": 4.9e-07,
841
- "logits/chosen": 7.210297584533691,
842
- "logits/rejected": 7.638421535491943,
843
- "logps/chosen": -495.406494140625,
844
- "logps/rejected": -1289.328125,
845
- "loss": 1.998776626586914,
846
- "rewards/accuracies": 0.956250011920929,
847
- "rewards/chosen": 1.3115170001983643,
848
- "rewards/margins": 165.7671661376953,
849
- "rewards/rejected": -164.4556427001953,
850
- "step": 560
851
- },
852
- {
853
- "epoch": 1.5131272848122301,
854
- "grad_norm": 162.8050994873047,
855
- "learning_rate": 4.788888888888889e-07,
856
- "logits/chosen": 7.229719638824463,
857
- "logits/rejected": 7.55483341217041,
858
- "logps/chosen": -467.35791015625,
859
- "logps/rejected": -1125.8382568359375,
860
- "loss": 0.433735990524292,
861
- "rewards/accuracies": 0.9437500238418579,
862
- "rewards/chosen": 1.6890428066253662,
863
- "rewards/margins": 135.47544860839844,
864
- "rewards/rejected": -133.78640747070312,
865
- "step": 570
866
- },
867
- {
868
- "epoch": 1.53971419076105,
869
- "grad_norm": 198.3883514404297,
870
- "learning_rate": 4.677777777777778e-07,
871
- "logits/chosen": 6.802654266357422,
872
- "logits/rejected": 7.2844390869140625,
873
- "logps/chosen": -412.0586853027344,
874
- "logps/rejected": -1095.768798828125,
875
- "loss": 0.20694947242736816,
876
- "rewards/accuracies": 0.949999988079071,
877
- "rewards/chosen": 3.9473280906677246,
878
- "rewards/margins": 135.24732971191406,
879
- "rewards/rejected": -131.3000030517578,
880
- "step": 580
881
- },
882
- {
883
- "epoch": 1.5663010967098705,
884
- "grad_norm": 2.5432399297642405e-08,
885
- "learning_rate": 4.5666666666666665e-07,
886
- "logits/chosen": 7.081494331359863,
887
- "logits/rejected": 7.492499351501465,
888
- "logps/chosen": -423.6611328125,
889
- "logps/rejected": -1167.4932861328125,
890
- "loss": 1.276815414428711,
891
- "rewards/accuracies": 0.949999988079071,
892
- "rewards/chosen": -0.2825419008731842,
893
- "rewards/margins": 142.3505096435547,
894
- "rewards/rejected": -142.63302612304688,
895
- "step": 590
896
- },
897
- {
898
- "epoch": 1.5928880026586905,
899
- "grad_norm": 0.6176006197929382,
900
- "learning_rate": 4.455555555555555e-07,
901
- "logits/chosen": 7.086031913757324,
902
- "logits/rejected": 7.627284049987793,
903
- "logps/chosen": -448.5660705566406,
904
- "logps/rejected": -1207.191650390625,
905
- "loss": 0.29495222568511964,
906
- "rewards/accuracies": 0.9750000238418579,
907
- "rewards/chosen": 0.9667795300483704,
908
- "rewards/margins": 149.69699096679688,
909
- "rewards/rejected": -148.73023986816406,
910
- "step": 600
911
- },
912
- {
913
- "epoch": 1.6194749086075109,
914
- "grad_norm": 66.45056915283203,
915
- "learning_rate": 4.344444444444444e-07,
916
- "logits/chosen": 7.270118713378906,
917
- "logits/rejected": 7.5960588455200195,
918
- "logps/chosen": -503.7193298339844,
919
- "logps/rejected": -1094.6021728515625,
920
- "loss": 1.5946972846984864,
921
- "rewards/accuracies": 0.9437500238418579,
922
- "rewards/chosen": -3.655120372772217,
923
- "rewards/margins": 125.68217468261719,
924
- "rewards/rejected": -129.33731079101562,
925
- "step": 610
926
- },
927
- {
928
- "epoch": 1.646061814556331,
929
- "grad_norm": 280.2427978515625,
930
- "learning_rate": 4.2333333333333334e-07,
931
- "logits/chosen": 7.251768589019775,
932
- "logits/rejected": 7.520864009857178,
933
- "logps/chosen": -517.1514892578125,
934
- "logps/rejected": -1172.587158203125,
935
- "loss": 2.4477691650390625,
936
- "rewards/accuracies": 0.925000011920929,
937
- "rewards/chosen": 0.9268826246261597,
938
- "rewards/margins": 142.18368530273438,
939
- "rewards/rejected": -141.2567901611328,
940
- "step": 620
941
- },
942
- {
943
- "epoch": 1.6726487205051512,
944
- "grad_norm": 1.0393255949020386,
945
- "learning_rate": 4.122222222222222e-07,
946
- "logits/chosen": 7.011075019836426,
947
- "logits/rejected": 7.46621561050415,
948
- "logps/chosen": -447.34124755859375,
949
- "logps/rejected": -1143.3458251953125,
950
- "loss": 1.0738434791564941,
951
- "rewards/accuracies": 0.9437500238418579,
952
- "rewards/chosen": 1.6229969263076782,
953
- "rewards/margins": 142.9796600341797,
954
- "rewards/rejected": -141.35665893554688,
955
- "step": 630
956
- },
957
- {
958
- "epoch": 1.6992356264539714,
959
- "grad_norm": 119.75847625732422,
960
- "learning_rate": 4.0111111111111106e-07,
961
- "logits/chosen": 6.9999237060546875,
962
- "logits/rejected": 7.578449249267578,
963
- "logps/chosen": -469.19012451171875,
964
- "logps/rejected": -1200.680419921875,
965
- "loss": 0.9937694549560547,
966
- "rewards/accuracies": 0.949999988079071,
967
- "rewards/chosen": 1.2244775295257568,
968
- "rewards/margins": 147.14993286132812,
969
- "rewards/rejected": -145.92544555664062,
970
- "step": 640
971
- },
972
- {
973
- "epoch": 1.7258225324027916,
974
- "grad_norm": 27.75540542602539,
975
- "learning_rate": 3.8999999999999997e-07,
976
- "logits/chosen": 6.937554359436035,
977
- "logits/rejected": 7.374237060546875,
978
- "logps/chosen": -463.05438232421875,
979
- "logps/rejected": -1168.0521240234375,
980
- "loss": 0.39649856090545654,
981
- "rewards/accuracies": 0.981249988079071,
982
- "rewards/chosen": 3.329749345779419,
983
- "rewards/margins": 145.75735473632812,
984
- "rewards/rejected": -142.4276123046875,
985
- "step": 650
986
- },
987
- {
988
- "epoch": 1.7524094383516118,
989
- "grad_norm": 31.218721389770508,
990
- "learning_rate": 3.788888888888889e-07,
991
- "logits/chosen": 7.072316646575928,
992
- "logits/rejected": 7.550895690917969,
993
- "logps/chosen": -483.40234375,
994
- "logps/rejected": -1184.9073486328125,
995
- "loss": 0.25033409595489503,
996
- "rewards/accuracies": 0.9624999761581421,
997
- "rewards/chosen": -2.29609751701355,
998
- "rewards/margins": 147.64785766601562,
999
- "rewards/rejected": -149.94395446777344,
1000
- "step": 660
1001
- },
1002
- {
1003
- "epoch": 1.778996344300432,
1004
- "grad_norm": 6.414053359549143e-07,
1005
- "learning_rate": 3.6777777777777774e-07,
1006
- "logits/chosen": 7.303959846496582,
1007
- "logits/rejected": 7.623525142669678,
1008
- "logps/chosen": -508.6453552246094,
1009
- "logps/rejected": -1223.940673828125,
1010
- "loss": 0.31205618381500244,
1011
- "rewards/accuracies": 0.9624999761581421,
1012
- "rewards/chosen": -2.5321922302246094,
1013
- "rewards/margins": 150.60842895507812,
1014
- "rewards/rejected": -153.14060974121094,
1015
- "step": 670
1016
- },
1017
- {
1018
- "epoch": 1.8055832502492524,
1019
- "grad_norm": 1.0985974499902462e-12,
1020
- "learning_rate": 3.5666666666666666e-07,
1021
- "logits/chosen": 7.344334602355957,
1022
- "logits/rejected": 7.8254547119140625,
1023
- "logps/chosen": -532.2833251953125,
1024
- "logps/rejected": -1228.1844482421875,
1025
- "loss": 1.0204992294311523,
1026
- "rewards/accuracies": 0.956250011920929,
1027
- "rewards/chosen": 3.5795791149139404,
1028
- "rewards/margins": 159.29647827148438,
1029
- "rewards/rejected": -155.71688842773438,
1030
- "step": 680
1031
- },
1032
- {
1033
- "epoch": 1.8321701561980723,
1034
- "grad_norm": 66.1689453125,
1035
- "learning_rate": 3.4555555555555557e-07,
1036
- "logits/chosen": 7.0121636390686035,
1037
- "logits/rejected": 7.367627143859863,
1038
- "logps/chosen": -453.792236328125,
1039
- "logps/rejected": -1141.1865234375,
1040
- "loss": 0.38547022342681886,
1041
- "rewards/accuracies": 0.9437500238418579,
1042
- "rewards/chosen": -0.3728172183036804,
1043
- "rewards/margins": 139.9238739013672,
1044
- "rewards/rejected": -140.29669189453125,
1045
- "step": 690
1046
- },
1047
- {
1048
- "epoch": 1.8587570621468927,
1049
- "grad_norm": 1.7826409438004044e-20,
1050
- "learning_rate": 3.3444444444444443e-07,
1051
- "logits/chosen": 6.914497375488281,
1052
- "logits/rejected": 7.344313144683838,
1053
- "logps/chosen": -456.8873596191406,
1054
- "logps/rejected": -1159.482666015625,
1055
- "loss": 0.2864746332168579,
1056
- "rewards/accuracies": 0.9750000238418579,
1057
- "rewards/chosen": -0.41872739791870117,
1058
- "rewards/margins": 139.0840606689453,
1059
- "rewards/rejected": -139.50277709960938,
1060
- "step": 700
1061
- },
1062
- {
1063
- "epoch": 1.8853439680957127,
1064
- "grad_norm": 0.6577161550521851,
1065
- "learning_rate": 3.233333333333333e-07,
1066
- "logits/chosen": 7.24100399017334,
1067
- "logits/rejected": 7.729971408843994,
1068
- "logps/chosen": -453.352783203125,
1069
- "logps/rejected": -1139.1920166015625,
1070
- "loss": 0.40453357696533204,
1071
- "rewards/accuracies": 0.949999988079071,
1072
- "rewards/chosen": -0.5374351739883423,
1073
- "rewards/margins": 134.17910766601562,
1074
- "rewards/rejected": -134.71653747558594,
1075
- "step": 710
1076
- },
1077
- {
1078
- "epoch": 1.911930874044533,
1079
- "grad_norm": 58.065155029296875,
1080
- "learning_rate": 3.122222222222222e-07,
1081
- "logits/chosen": 7.2779541015625,
1082
- "logits/rejected": 7.5862250328063965,
1083
- "logps/chosen": -502.33489990234375,
1084
- "logps/rejected": -1174.3145751953125,
1085
- "loss": 0.2982128143310547,
1086
- "rewards/accuracies": 0.949999988079071,
1087
- "rewards/chosen": 0.528018593788147,
1088
- "rewards/margins": 134.43031311035156,
1089
- "rewards/rejected": -133.9022979736328,
1090
- "step": 720
1091
- },
1092
- {
1093
- "epoch": 1.9385177799933533,
1094
- "grad_norm": 3.057793140411377,
1095
- "learning_rate": 3.011111111111111e-07,
1096
- "logits/chosen": 7.26782751083374,
1097
- "logits/rejected": 7.741539001464844,
1098
- "logps/chosen": -496.8504943847656,
1099
- "logps/rejected": -1235.9169921875,
1100
- "loss": 0.8299455642700195,
1101
- "rewards/accuracies": 0.9750000238418579,
1102
- "rewards/chosen": -0.2519731819629669,
1103
- "rewards/margins": 152.95582580566406,
1104
- "rewards/rejected": -153.20779418945312,
1105
- "step": 730
1106
- },
1107
- {
1108
- "epoch": 1.9651046859421735,
1109
- "grad_norm": 185.18174743652344,
1110
- "learning_rate": 2.9e-07,
1111
- "logits/chosen": 7.201784610748291,
1112
- "logits/rejected": 7.585198402404785,
1113
- "logps/chosen": -474.599853515625,
1114
- "logps/rejected": -1184.7464599609375,
1115
- "loss": 1.5328912734985352,
1116
- "rewards/accuracies": 0.9312499761581421,
1117
- "rewards/chosen": -3.7899742126464844,
1118
- "rewards/margins": 142.3949432373047,
1119
- "rewards/rejected": -146.1849365234375,
1120
- "step": 740
1121
- },
1122
- {
1123
- "epoch": 1.9916915918909937,
1124
- "grad_norm": 173.19436645507812,
1125
- "learning_rate": 2.788888888888889e-07,
1126
- "logits/chosen": 7.347403526306152,
1127
- "logits/rejected": 7.8731865882873535,
1128
- "logps/chosen": -484.5741271972656,
1129
- "logps/rejected": -1241.9910888671875,
1130
- "loss": 0.9414227485656739,
1131
- "rewards/accuracies": 0.9624999761581421,
1132
- "rewards/chosen": -0.26702070236206055,
1133
- "rewards/margins": 149.2693328857422,
1134
- "rewards/rejected": -149.53634643554688,
1135
- "step": 750
1136
- },
1137
- {
1138
- "epoch": 2.015952143569292,
1139
- "grad_norm": 277.53521728515625,
1140
- "learning_rate": 2.6777777777777775e-07,
1141
- "logits/chosen": 7.591332912445068,
1142
- "logits/rejected": 7.759430408477783,
1143
- "logps/chosen": -578.1312866210938,
1144
- "logps/rejected": -1104.3414306640625,
1145
- "loss": 0.3479891538619995,
1146
- "rewards/accuracies": 0.9726027250289917,
1147
- "rewards/chosen": -3.462564468383789,
1148
- "rewards/margins": 117.48858642578125,
1149
- "rewards/rejected": -120.95115661621094,
1150
- "step": 760
1151
- },
1152
- {
1153
- "epoch": 2.0425390495181124,
1154
- "grad_norm": 81.4224624633789,
1155
- "learning_rate": 2.5666666666666666e-07,
1156
- "logits/chosen": 6.820937156677246,
1157
- "logits/rejected": 7.438076972961426,
1158
- "logps/chosen": -445.38592529296875,
1159
- "logps/rejected": -1255.2547607421875,
1160
- "loss": 0.7632743835449218,
1161
- "rewards/accuracies": 0.9624999761581421,
1162
- "rewards/chosen": -1.3056232929229736,
1163
- "rewards/margins": 162.7215576171875,
1164
- "rewards/rejected": -164.02719116210938,
1165
- "step": 770
1166
- },
1167
- {
1168
- "epoch": 2.0691259554669323,
1169
- "grad_norm": 80.839111328125,
1170
- "learning_rate": 2.455555555555555e-07,
1171
- "logits/chosen": 7.089077949523926,
1172
- "logits/rejected": 7.562623500823975,
1173
- "logps/chosen": -479.9771423339844,
1174
- "logps/rejected": -1212.7470703125,
1175
- "loss": 0.37755522727966306,
1176
- "rewards/accuracies": 0.96875,
1177
- "rewards/chosen": -0.4610620439052582,
1178
- "rewards/margins": 147.49462890625,
1179
- "rewards/rejected": -147.9556884765625,
1180
- "step": 780
1181
- },
1182
- {
1183
- "epoch": 2.0957128614157527,
1184
- "grad_norm": 102.21258544921875,
1185
- "learning_rate": 2.3444444444444444e-07,
1186
- "logits/chosen": 7.348860740661621,
1187
- "logits/rejected": 7.812272548675537,
1188
- "logps/chosen": -545.9133911132812,
1189
- "logps/rejected": -1209.3829345703125,
1190
- "loss": 0.8432134628295899,
1191
- "rewards/accuracies": 0.9437500238418579,
1192
- "rewards/chosen": -4.405800819396973,
1193
- "rewards/margins": 136.8076629638672,
1194
- "rewards/rejected": -141.2134552001953,
1195
- "step": 790
1196
- },
1197
- {
1198
- "epoch": 2.122299767364573,
1199
- "grad_norm": 7.490438461303711,
1200
- "learning_rate": 2.2333333333333332e-07,
1201
- "logits/chosen": 6.928166389465332,
1202
- "logits/rejected": 7.589695930480957,
1203
- "logps/chosen": -411.7215881347656,
1204
- "logps/rejected": -1234.5572509765625,
1205
- "loss": 0.08373026251792907,
1206
- "rewards/accuracies": 0.9750000238418579,
1207
- "rewards/chosen": -1.1743253469467163,
1208
- "rewards/margins": 159.31773376464844,
1209
- "rewards/rejected": -160.49208068847656,
1210
- "step": 800
1211
- },
1212
- {
1213
- "epoch": 2.148886673313393,
1214
- "grad_norm": 0.0003583618381526321,
1215
- "learning_rate": 2.122222222222222e-07,
1216
- "logits/chosen": 7.2483720779418945,
1217
- "logits/rejected": 7.788289546966553,
1218
- "logps/chosen": -497.52069091796875,
1219
- "logps/rejected": -1158.642822265625,
1220
- "loss": 0.1160581350326538,
1221
- "rewards/accuracies": 0.96875,
1222
- "rewards/chosen": 2.365230083465576,
1223
- "rewards/margins": 146.15255737304688,
1224
- "rewards/rejected": -143.78732299804688,
1225
- "step": 810
1226
- },
1227
- {
1228
- "epoch": 2.1754735792622135,
1229
- "grad_norm": 6.3310980796813965,
1230
- "learning_rate": 2.011111111111111e-07,
1231
- "logits/chosen": 6.909984588623047,
1232
- "logits/rejected": 7.344359397888184,
1233
- "logps/chosen": -432.39764404296875,
1234
- "logps/rejected": -1212.4569091796875,
1235
- "loss": 0.9685474395751953,
1236
- "rewards/accuracies": 0.9437500238418579,
1237
- "rewards/chosen": -2.124849319458008,
1238
- "rewards/margins": 155.05654907226562,
1239
- "rewards/rejected": -157.18141174316406,
1240
- "step": 820
1241
- },
1242
- {
1243
- "epoch": 2.2020604852110335,
1244
- "grad_norm": 5.151050697094206e-09,
1245
- "learning_rate": 1.8999999999999998e-07,
1246
- "logits/chosen": 7.006634712219238,
1247
- "logits/rejected": 7.5766754150390625,
1248
- "logps/chosen": -431.0802307128906,
1249
- "logps/rejected": -1220.452392578125,
1250
- "loss": 1.1500192642211915,
1251
- "rewards/accuracies": 0.9750000238418579,
1252
- "rewards/chosen": -2.019794464111328,
1253
- "rewards/margins": 152.16506958007812,
1254
- "rewards/rejected": -154.18484497070312,
1255
- "step": 830
1256
- },
1257
- {
1258
- "epoch": 2.228647391159854,
1259
- "grad_norm": 1.951496702049138e-18,
1260
- "learning_rate": 1.7888888888888887e-07,
1261
- "logits/chosen": 6.816000938415527,
1262
- "logits/rejected": 7.375506401062012,
1263
- "logps/chosen": -439.57891845703125,
1264
- "logps/rejected": -1222.27001953125,
1265
- "loss": 0.3972776889801025,
1266
- "rewards/accuracies": 0.987500011920929,
1267
- "rewards/chosen": 1.9101593494415283,
1268
- "rewards/margins": 158.9185333251953,
1269
- "rewards/rejected": -157.0083770751953,
1270
- "step": 840
1271
- },
1272
- {
1273
- "epoch": 2.255234297108674,
1274
- "grad_norm": 2.449645117964328e-15,
1275
- "learning_rate": 1.6777777777777778e-07,
1276
- "logits/chosen": 7.166296482086182,
1277
- "logits/rejected": 7.5857744216918945,
1278
- "logps/chosen": -484.2479553222656,
1279
- "logps/rejected": -1235.645263671875,
1280
- "loss": 0.15833470821380616,
1281
- "rewards/accuracies": 0.9750000238418579,
1282
- "rewards/chosen": -0.6545869708061218,
1283
- "rewards/margins": 155.0919952392578,
1284
- "rewards/rejected": -155.74655151367188,
1285
- "step": 850
1286
- },
1287
- {
1288
- "epoch": 2.2818212030574943,
1289
- "grad_norm": 67.49964141845703,
1290
- "learning_rate": 1.5666666666666667e-07,
1291
- "logits/chosen": 6.9471001625061035,
1292
- "logits/rejected": 7.408398628234863,
1293
- "logps/chosen": -406.9446105957031,
1294
- "logps/rejected": -1206.536376953125,
1295
- "loss": 0.3223508358001709,
1296
- "rewards/accuracies": 0.9437500238418579,
1297
- "rewards/chosen": 4.314828395843506,
1298
- "rewards/margins": 160.91775512695312,
1299
- "rewards/rejected": -156.60293579101562,
1300
- "step": 860
1301
- },
1302
- {
1303
- "epoch": 2.308408109006314,
1304
- "grad_norm": 3.4588420021464117e-06,
1305
- "learning_rate": 1.4555555555555555e-07,
1306
- "logits/chosen": 6.990222930908203,
1307
- "logits/rejected": 7.685202598571777,
1308
- "logps/chosen": -426.66973876953125,
1309
- "logps/rejected": -1176.889404296875,
1310
- "loss": 0.8611475944519043,
1311
- "rewards/accuracies": 0.9624999761581421,
1312
- "rewards/chosen": 1.3860576152801514,
1313
- "rewards/margins": 144.15267944335938,
1314
- "rewards/rejected": -142.76663208007812,
1315
- "step": 870
1316
- },
1317
- {
1318
- "epoch": 2.3349950149551346,
1319
- "grad_norm": 4.1328581182331625e-12,
1320
- "learning_rate": 1.3444444444444444e-07,
1321
- "logits/chosen": 7.256162166595459,
1322
- "logits/rejected": 7.685450553894043,
1323
- "logps/chosen": -462.0904846191406,
1324
- "logps/rejected": -1166.178466796875,
1325
- "loss": 0.024902737140655516,
1326
- "rewards/accuracies": 0.987500011920929,
1327
- "rewards/chosen": 1.212837815284729,
1328
- "rewards/margins": 140.08041381835938,
1329
- "rewards/rejected": -138.8675537109375,
1330
- "step": 880
1331
- },
1332
- {
1333
- "epoch": 2.361581920903955,
1334
- "grad_norm": 22.725154876708984,
1335
- "learning_rate": 1.2333333333333333e-07,
1336
- "logits/chosen": 7.431256294250488,
1337
- "logits/rejected": 7.865132808685303,
1338
- "logps/chosen": -456.8827209472656,
1339
- "logps/rejected": -1153.871337890625,
1340
- "loss": 0.13207526206970216,
1341
- "rewards/accuracies": 0.981249988079071,
1342
- "rewards/chosen": 1.6622031927108765,
1343
- "rewards/margins": 136.24082946777344,
1344
- "rewards/rejected": -134.57862854003906,
1345
- "step": 890
1346
- },
1347
- {
1348
- "epoch": 2.388168826852775,
1349
- "grad_norm": 132.83956909179688,
1350
- "learning_rate": 1.1222222222222221e-07,
1351
- "logits/chosen": 7.010849952697754,
1352
- "logits/rejected": 7.441749572753906,
1353
- "logps/chosen": -502.49371337890625,
1354
- "logps/rejected": -1215.2733154296875,
1355
- "loss": 0.5922121524810791,
1356
- "rewards/accuracies": 0.9750000238418579,
1357
- "rewards/chosen": -2.6824889183044434,
1358
- "rewards/margins": 148.62466430664062,
1359
- "rewards/rejected": -151.30715942382812,
1360
- "step": 900
1361
- },
1362
- {
1363
- "epoch": 2.4147557328015954,
1364
- "grad_norm": 0.005245895590633154,
1365
- "learning_rate": 1.011111111111111e-07,
1366
- "logits/chosen": 6.980523109436035,
1367
- "logits/rejected": 7.430232048034668,
1368
- "logps/chosen": -470.41253662109375,
1369
- "logps/rejected": -1160.4951171875,
1370
- "loss": 1.195225143432617,
1371
- "rewards/accuracies": 0.949999988079071,
1372
- "rewards/chosen": -3.2118802070617676,
1373
- "rewards/margins": 144.29278564453125,
1374
- "rewards/rejected": -147.50466918945312,
1375
- "step": 910
1376
- },
1377
- {
1378
- "epoch": 2.4413426387504154,
1379
- "grad_norm": 194.52578735351562,
1380
- "learning_rate": 9e-08,
1381
- "logits/chosen": 6.884810447692871,
1382
- "logits/rejected": 7.503731727600098,
1383
- "logps/chosen": -440.31976318359375,
1384
- "logps/rejected": -1206.906494140625,
1385
- "loss": 0.44526066780090334,
1386
- "rewards/accuracies": 0.956250011920929,
1387
- "rewards/chosen": 1.156048059463501,
1388
- "rewards/margins": 149.27732849121094,
1389
- "rewards/rejected": -148.1212921142578,
1390
- "step": 920
1391
- },
1392
- {
1393
- "epoch": 2.4679295446992358,
1394
- "grad_norm": 1.5737574004387467e-14,
1395
- "learning_rate": 7.888888888888889e-08,
1396
- "logits/chosen": 7.322862148284912,
1397
- "logits/rejected": 7.748003959655762,
1398
- "logps/chosen": -508.33245849609375,
1399
- "logps/rejected": -1189.603759765625,
1400
- "loss": 0.18692436218261718,
1401
- "rewards/accuracies": 0.9750000238418579,
1402
- "rewards/chosen": 1.148808479309082,
1403
- "rewards/margins": 146.20956420898438,
1404
- "rewards/rejected": -145.06076049804688,
1405
- "step": 930
1406
- },
1407
- {
1408
- "epoch": 2.4945164506480557,
1409
- "grad_norm": 0.22959347069263458,
1410
- "learning_rate": 6.777777777777778e-08,
1411
- "logits/chosen": 7.375940799713135,
1412
- "logits/rejected": 7.710402011871338,
1413
- "logps/chosen": -490.12384033203125,
1414
- "logps/rejected": -1171.1483154296875,
1415
- "loss": 0.27915282249450685,
1416
- "rewards/accuracies": 0.9624999761581421,
1417
- "rewards/chosen": 1.145845651626587,
1418
- "rewards/margins": 140.29800415039062,
1419
- "rewards/rejected": -139.1521453857422,
1420
- "step": 940
1421
- },
1422
- {
1423
- "epoch": 2.521103356596876,
1424
- "grad_norm": 22.964818954467773,
1425
- "learning_rate": 5.666666666666666e-08,
1426
- "logits/chosen": 7.258917331695557,
1427
- "logits/rejected": 7.766401767730713,
1428
- "logps/chosen": -467.205322265625,
1429
- "logps/rejected": -1157.4315185546875,
1430
- "loss": 1.6006925582885743,
1431
- "rewards/accuracies": 0.9437500238418579,
1432
- "rewards/chosen": 0.41268739104270935,
1433
- "rewards/margins": 135.27273559570312,
1434
- "rewards/rejected": -134.86004638671875,
1435
- "step": 950
1436
- },
1437
- {
1438
- "epoch": 2.547690262545696,
1439
- "grad_norm": 3.8648969441501535e-11,
1440
- "learning_rate": 4.555555555555556e-08,
1441
- "logits/chosen": 7.018073081970215,
1442
- "logits/rejected": 7.558196067810059,
1443
- "logps/chosen": -449.532958984375,
1444
- "logps/rejected": -1138.4356689453125,
1445
- "loss": 0.28522279262542727,
1446
- "rewards/accuracies": 0.981249988079071,
1447
- "rewards/chosen": -0.8609614372253418,
1448
- "rewards/margins": 139.2249298095703,
1449
- "rewards/rejected": -140.0858917236328,
1450
- "step": 960
1451
- },
1452
- {
1453
- "epoch": 2.5742771684945165,
1454
- "grad_norm": 84.71375274658203,
1455
- "learning_rate": 3.4444444444444444e-08,
1456
- "logits/chosen": 7.130776405334473,
1457
- "logits/rejected": 7.609295845031738,
1458
- "logps/chosen": -438.42694091796875,
1459
- "logps/rejected": -1249.6336669921875,
1460
- "loss": 0.4750792980194092,
1461
- "rewards/accuracies": 0.96875,
1462
- "rewards/chosen": -1.273829460144043,
1463
- "rewards/margins": 160.90731811523438,
1464
- "rewards/rejected": -162.18113708496094,
1465
- "step": 970
1466
- },
1467
- {
1468
- "epoch": 2.6008640744433364,
1469
- "grad_norm": 85.9113540649414,
1470
- "learning_rate": 2.3333333333333334e-08,
1471
- "logits/chosen": 7.113263130187988,
1472
- "logits/rejected": 7.686596870422363,
1473
- "logps/chosen": -434.325439453125,
1474
- "logps/rejected": -1194.6849365234375,
1475
- "loss": 0.33106160163879395,
1476
- "rewards/accuracies": 0.9437500238418579,
1477
- "rewards/chosen": -1.2038366794586182,
1478
- "rewards/margins": 146.78440856933594,
1479
- "rewards/rejected": -147.98825073242188,
1480
- "step": 980
1481
- },
1482
- {
1483
- "epoch": 2.627450980392157,
1484
- "grad_norm": 2.4605165866986043e-20,
1485
- "learning_rate": 1.2222222222222222e-08,
1486
- "logits/chosen": 7.00030517578125,
1487
- "logits/rejected": 7.477368354797363,
1488
- "logps/chosen": -450.455078125,
1489
- "logps/rejected": -1269.2520751953125,
1490
- "loss": 0.2776132583618164,
1491
- "rewards/accuracies": 0.981249988079071,
1492
- "rewards/chosen": -3.280397891998291,
1493
- "rewards/margins": 163.57626342773438,
1494
- "rewards/rejected": -166.85665893554688,
1495
- "step": 990
1496
- },
1497
- {
1498
- "epoch": 2.6540378863409773,
1499
- "grad_norm": 80.78559112548828,
1500
- "learning_rate": 1.111111111111111e-09,
1501
- "logits/chosen": 7.087013244628906,
1502
- "logits/rejected": 7.507058620452881,
1503
- "logps/chosen": -490.28857421875,
1504
- "logps/rejected": -1223.248046875,
1505
- "loss": 0.2815593719482422,
1506
- "rewards/accuracies": 0.9750000238418579,
1507
- "rewards/chosen": -2.167093276977539,
1508
- "rewards/margins": 151.61813354492188,
1509
- "rewards/rejected": -153.78524780273438,
1510
- "step": 1000
1511
  }
1512
  ],
1513
  "logging_steps": 10,
1514
- "max_steps": 1000,
1515
  "num_input_tokens_seen": 0,
1516
- "num_train_epochs": 3,
1517
  "save_steps": 200,
1518
  "stateful_callbacks": {
1519
  "TrainerControl": {
@@ -1522,7 +322,7 @@
1522
  "should_evaluate": false,
1523
  "should_log": false,
1524
  "should_save": true,
1525
- "should_training_stop": true
1526
  },
1527
  "attributes": {}
1528
  }
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.5317381189764041,
6
  "eval_steps": 500,
7
+ "global_step": 200,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 0.026586905948820207,
14
+ "grad_norm": 229.74172973632812,
15
+ "learning_rate": 1.8000000000000001e-06,
16
+ "logits/chosen": 1.892960548400879,
17
+ "logits/rejected": 2.2739109992980957,
18
+ "logps/chosen": -180.7786102294922,
19
+ "logps/rejected": -296.7843017578125,
20
+ "loss": 41.754217529296874,
21
  "rewards/accuracies": 0.643750011920929,
22
+ "rewards/chosen": 127.22708892822266,
23
+ "rewards/margins": 51.65830612182617,
24
+ "rewards/rejected": 75.56879425048828,
25
  "step": 10
26
  },
27
  {
28
  "epoch": 0.053173811897640415,
29
+ "grad_norm": 127.54531860351562,
30
+ "learning_rate": 3.8000000000000005e-06,
31
+ "logits/chosen": 2.360779285430908,
32
+ "logits/rejected": 2.5129952430725098,
33
+ "logps/chosen": -198.55577087402344,
34
+ "logps/rejected": -311.8116149902344,
35
+ "loss": 34.334066772460936,
36
+ "rewards/accuracies": 0.637499988079071,
37
+ "rewards/chosen": 133.30413818359375,
38
+ "rewards/margins": 69.63650512695312,
39
+ "rewards/rejected": 63.667640686035156,
40
  "step": 20
41
  },
42
  {
43
  "epoch": 0.07976071784646062,
44
+ "grad_norm": 195.1651153564453,
45
+ "learning_rate": 5.8e-06,
46
+ "logits/chosen": 2.7846579551696777,
47
+ "logits/rejected": 2.912212610244751,
48
+ "logps/chosen": -201.8841094970703,
49
+ "logps/rejected": -343.28619384765625,
50
+ "loss": 31.000405883789064,
51
+ "rewards/accuracies": 0.706250011920929,
52
+ "rewards/chosen": 149.70481872558594,
53
+ "rewards/margins": 90.86196899414062,
54
+ "rewards/rejected": 58.842857360839844,
55
  "step": 30
56
  },
57
  {
58
  "epoch": 0.10634762379528083,
59
+ "grad_norm": 217.8875274658203,
60
+ "learning_rate": 7.800000000000002e-06,
61
+ "logits/chosen": 2.687530040740967,
62
+ "logits/rejected": 2.89595365524292,
63
+ "logps/chosen": -180.60574340820312,
64
+ "logps/rejected": -395.2937927246094,
65
+ "loss": 33.74924926757812,
66
+ "rewards/accuracies": 0.675000011920929,
67
+ "rewards/chosen": 124.906494140625,
68
+ "rewards/margins": 96.38619995117188,
69
+ "rewards/rejected": 28.52029037475586,
70
  "step": 40
71
  },
72
  {
73
  "epoch": 0.13293452974410103,
74
+ "grad_norm": 123.57787322998047,
75
+ "learning_rate": 9.800000000000001e-06,
76
+ "logits/chosen": 3.6891541481018066,
77
+ "logits/rejected": 3.7287964820861816,
78
+ "logps/chosen": -233.40304565429688,
79
+ "logps/rejected": -501.949951171875,
80
+ "loss": 33.165060424804686,
81
+ "rewards/accuracies": 0.706250011920929,
82
+ "rewards/chosen": 132.43130493164062,
83
+ "rewards/margins": 153.64065551757812,
84
+ "rewards/rejected": -21.209354400634766,
85
  "step": 50
86
  },
87
  {
88
  "epoch": 0.15952143569292124,
89
+ "grad_norm": 240.26422119140625,
90
+ "learning_rate": 9.742857142857143e-06,
91
+ "logits/chosen": 4.325669288635254,
92
+ "logits/rejected": 4.669508934020996,
93
+ "logps/chosen": -227.3274688720703,
94
+ "logps/rejected": -761.2488403320312,
95
+ "loss": 9.919349670410156,
96
+ "rewards/accuracies": 0.8374999761581421,
97
+ "rewards/chosen": 78.26126861572266,
98
+ "rewards/margins": 259.2169494628906,
99
+ "rewards/rejected": -180.95565795898438,
100
  "step": 60
101
  },
102
  {
103
  "epoch": 0.18610834164174145,
104
+ "grad_norm": 18.67397117614746,
105
+ "learning_rate": 9.457142857142858e-06,
106
+ "logits/chosen": 6.029001235961914,
107
+ "logits/rejected": 6.398881912231445,
108
+ "logps/chosen": -361.8104248046875,
109
+ "logps/rejected": -936.2852783203125,
110
+ "loss": 8.505984497070312,
111
+ "rewards/accuracies": 0.8999999761581421,
112
+ "rewards/chosen": 62.65727615356445,
113
+ "rewards/margins": 295.85296630859375,
114
+ "rewards/rejected": -233.1957550048828,
115
  "step": 70
116
  },
117
  {
118
  "epoch": 0.21269524759056166,
119
+ "grad_norm": 424.631591796875,
120
+ "learning_rate": 9.171428571428572e-06,
121
+ "logits/chosen": 7.3346147537231445,
122
+ "logits/rejected": 7.6371870040893555,
123
+ "logps/chosen": -487.5819396972656,
124
+ "logps/rejected": -1231.750244140625,
125
+ "loss": 5.0393016815185545,
126
+ "rewards/accuracies": 0.925000011920929,
127
+ "rewards/chosen": 3.2465362548828125,
128
+ "rewards/margins": 386.61541748046875,
129
+ "rewards/rejected": -383.368896484375,
130
  "step": 80
131
  },
132
  {
133
  "epoch": 0.23928215353938184,
134
+ "grad_norm": 58.2872200012207,
135
+ "learning_rate": 8.885714285714286e-06,
136
+ "logits/chosen": 7.502760410308838,
137
+ "logits/rejected": 7.7781524658203125,
138
+ "logps/chosen": -515.661376953125,
139
+ "logps/rejected": -1255.3602294921875,
140
+ "loss": 2.7182350158691406,
141
+ "rewards/accuracies": 0.956250011920929,
142
+ "rewards/chosen": -3.7650718688964844,
143
+ "rewards/margins": 391.1650390625,
144
+ "rewards/rejected": -394.9300842285156,
145
  "step": 90
146
  },
147
  {
148
  "epoch": 0.26586905948820205,
149
+ "grad_norm": 0.0009401601273566484,
150
+ "learning_rate": 8.6e-06,
151
+ "logits/chosen": 6.9696502685546875,
152
+ "logits/rejected": 7.490464687347412,
153
+ "logps/chosen": -407.62591552734375,
154
+ "logps/rejected": -1201.7757568359375,
155
+ "loss": 1.852958869934082,
156
+ "rewards/accuracies": 0.9437500238418579,
157
+ "rewards/chosen": 16.64017105102539,
158
+ "rewards/margins": 388.97650146484375,
159
+ "rewards/rejected": -372.3362731933594,
160
  "step": 100
161
  },
162
  {
163
  "epoch": 0.2924559654370223,
164
+ "grad_norm": 776.8017578125,
165
+ "learning_rate": 8.314285714285715e-06,
166
+ "logits/chosen": 7.605508327484131,
167
+ "logits/rejected": 8.056883811950684,
168
+ "logps/chosen": -500.75714111328125,
169
+ "logps/rejected": -1358.50732421875,
170
+ "loss": 0.4602807998657227,
171
+ "rewards/accuracies": 0.9750000238418579,
172
+ "rewards/chosen": -11.148886680603027,
173
+ "rewards/margins": 444.7156677246094,
174
+ "rewards/rejected": -455.8646545410156,
175
  "step": 110
176
  },
177
  {
178
  "epoch": 0.3190428713858425,
179
+ "grad_norm": 9.229455307652179e-14,
180
+ "learning_rate": 8.02857142857143e-06,
181
+ "logits/chosen": 7.707437992095947,
182
+ "logits/rejected": 8.208626747131348,
183
+ "logps/chosen": -493.72503662109375,
184
+ "logps/rejected": -1418.4669189453125,
185
+ "loss": 0.4660654544830322,
186
+ "rewards/accuracies": 0.9750000238418579,
187
+ "rewards/chosen": -27.65629005432129,
188
+ "rewards/margins": 457.8779296875,
189
+ "rewards/rejected": -485.53424072265625,
190
  "step": 120
191
  },
192
  {
193
  "epoch": 0.34562977733466266,
194
+ "grad_norm": 3.0277444440507395e-10,
195
+ "learning_rate": 7.742857142857144e-06,
196
+ "logits/chosen": 7.820192813873291,
197
+ "logits/rejected": 8.277512550354004,
198
+ "logps/chosen": -490.735595703125,
199
+ "logps/rejected": -1337.3505859375,
200
+ "loss": 1.2780420303344726,
201
+ "rewards/accuracies": 0.981249988079071,
202
+ "rewards/chosen": 0.13531294465065002,
203
+ "rewards/margins": 430.638671875,
204
+ "rewards/rejected": -430.50335693359375,
205
  "step": 130
206
  },
207
  {
208
  "epoch": 0.3722166832834829,
209
+ "grad_norm": 1.2154150397236663e-07,
210
+ "learning_rate": 7.457142857142857e-06,
211
+ "logits/chosen": 7.428654670715332,
212
+ "logits/rejected": 7.884097099304199,
213
+ "logps/chosen": -490.9019470214844,
214
+ "logps/rejected": -1213.6722412109375,
215
+ "loss": 0.03439792990684509,
216
+ "rewards/accuracies": 0.9937499761581421,
217
+ "rewards/chosen": 3.2693276405334473,
218
+ "rewards/margins": 378.11016845703125,
219
+ "rewards/rejected": -374.8408203125,
220
  "step": 140
221
  },
222
  {
223
  "epoch": 0.3988035892323031,
224
+ "grad_norm": 0.0008078943355940282,
225
+ "learning_rate": 7.1714285714285725e-06,
226
+ "logits/chosen": 7.338967800140381,
227
+ "logits/rejected": 7.722776889801025,
228
+ "logps/chosen": -444.8753356933594,
229
+ "logps/rejected": -1266.625244140625,
230
+ "loss": 1.42781343460083,
231
+ "rewards/accuracies": 0.96875,
232
+ "rewards/chosen": -2.331421375274658,
233
+ "rewards/margins": 393.41400146484375,
234
+ "rewards/rejected": -395.74542236328125,
235
  "step": 150
236
  },
237
  {
238
  "epoch": 0.4253904951811233,
239
+ "grad_norm": 0.0,
240
+ "learning_rate": 6.885714285714287e-06,
241
+ "logits/chosen": 7.96518087387085,
242
+ "logits/rejected": 8.39413833618164,
243
+ "logps/chosen": -586.011962890625,
244
+ "logps/rejected": -1365.374267578125,
245
+ "loss": 1.9711128234863282,
246
+ "rewards/accuracies": 0.96875,
247
+ "rewards/chosen": -28.12943458557129,
248
+ "rewards/margins": 417.09210205078125,
249
+ "rewards/rejected": -445.22149658203125,
250
  "step": 160
251
  },
252
  {
253
  "epoch": 0.4519774011299435,
254
+ "grad_norm": 5.4764127260797935e-12,
255
+ "learning_rate": 6.600000000000001e-06,
256
+ "logits/chosen": 7.3422675132751465,
257
+ "logits/rejected": 7.857165336608887,
258
+ "logps/chosen": -465.5393981933594,
259
+ "logps/rejected": -1300.1103515625,
260
+ "loss": 0.004332171380519867,
261
+ "rewards/accuracies": 0.9937499761581421,
262
+ "rewards/chosen": 1.256854772567749,
263
+ "rewards/margins": 424.2041931152344,
264
+ "rewards/rejected": -422.9473571777344,
265
  "step": 170
266
  },
267
  {
268
  "epoch": 0.4785643070787637,
269
+ "grad_norm": 422.58892822265625,
270
+ "learning_rate": 6.314285714285715e-06,
271
+ "logits/chosen": 7.199074745178223,
272
+ "logits/rejected": 7.617570400238037,
273
+ "logps/chosen": -446.2982482910156,
274
+ "logps/rejected": -1317.12890625,
275
+ "loss": 0.9593421936035156,
276
+ "rewards/accuracies": 0.9750000238418579,
277
+ "rewards/chosen": 2.7993197441101074,
278
+ "rewards/margins": 444.40753173828125,
279
+ "rewards/rejected": -441.60821533203125,
280
  "step": 180
281
  },
282
  {
283
  "epoch": 0.5051512130275839,
284
+ "grad_norm": 0.0,
285
+ "learning_rate": 6.028571428571429e-06,
286
+ "logits/chosen": 7.070580959320068,
287
+ "logits/rejected": 7.494720458984375,
288
+ "logps/chosen": -430.97760009765625,
289
+ "logps/rejected": -1291.35205078125,
290
+ "loss": 0.09595458507537842,
291
+ "rewards/accuracies": 0.9937499761581421,
292
+ "rewards/chosen": 7.344033241271973,
293
+ "rewards/margins": 433.2731018066406,
294
+ "rewards/rejected": -425.9291076660156,
295
  "step": 190
296
  },
297
  {
298
  "epoch": 0.5317381189764041,
299
+ "grad_norm": 1.3654603958129883,
300
+ "learning_rate": 5.742857142857143e-06,
301
+ "logits/chosen": 7.252472877502441,
302
+ "logits/rejected": 7.820960998535156,
303
+ "logps/chosen": -435.6075134277344,
304
+ "logps/rejected": -1333.8228759765625,
305
+ "loss": 0.7474074840545655,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
306
  "rewards/accuracies": 0.956250011920929,
307
+ "rewards/chosen": 1.129046082496643,
308
+ "rewards/margins": 456.1576232910156,
309
+ "rewards/rejected": -455.028564453125,
310
+ "step": 200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
311
  }
312
  ],
313
  "logging_steps": 10,
314
+ "max_steps": 400,
315
  "num_input_tokens_seen": 0,
316
+ "num_train_epochs": 2,
317
  "save_steps": 200,
318
  "stateful_callbacks": {
319
  "TrainerControl": {
 
322
  "should_evaluate": false,
323
  "should_log": false,
324
  "should_save": true,
325
+ "should_training_stop": false
326
  },
327
  "attributes": {}
328
  }
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:07d1084fbcea73eed4529408d2dd186b09d81c71318b95b1f0d3c71ddb884015
3
  size 6289
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fa5979d784b3be5f03398730b0db9a0aaad24ae1fdea10accf8ecc4f7c831b44
3
  size 6289