besimray commited on
Commit
97afb08
·
verified ·
1 Parent(s): d82a646

Upload task output abc123

Browse files
Files changed (4) hide show
  1. loss.txt +1 -1
  2. model.safetensors +1 -1
  3. trainer_state.json +355 -355
  4. training_args.bin +1 -1
loss.txt CHANGED
@@ -1 +1 @@
1
- 202,0.00023559275723528117
 
1
+ 202,0.00028668390586972237
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b316ceca0fd69a88ca78e6263ebb4eed5c0bc75c3b1d2a7b77cb662432a40310
3
  size 3087467144
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ba1541b31a6b2a77f67aa4a5bd2369f4898ef815907a3883221836f9a7856474
3
  size 3087467144
trainer_state.json CHANGED
@@ -11,634 +11,634 @@
11
  "log_history": [
12
  {
13
  "epoch": 0.04926108374384237,
14
- "grad_norm": 109.0,
15
  "learning_rate": 2.52589e-06,
16
- "logits/chosen": 0.05971701070666313,
17
- "logits/rejected": -0.009489008225500584,
18
- "logps/chosen": -715.102294921875,
19
- "logps/rejected": -992.0411376953125,
20
- "loss": 0.6912,
21
- "rewards/accuracies": 0.3333333432674408,
22
- "rewards/chosen": -0.0028032560367137194,
23
- "rewards/margins": 0.006950830575078726,
24
- "rewards/rejected": -0.009754086844623089,
25
  "step": 5
26
  },
27
  {
28
  "epoch": 0.09852216748768473,
29
- "grad_norm": 75.0,
30
  "learning_rate": 5.6832524999999995e-06,
31
- "logits/chosen": 0.04588876664638519,
32
- "logits/rejected": 0.003985819406807423,
33
- "logps/chosen": -671.5519409179688,
34
- "logps/rejected": -1003.0562744140625,
35
  "loss": 0.5469,
36
- "rewards/accuracies": 0.9083333015441895,
37
- "rewards/chosen": 0.09333531558513641,
38
- "rewards/margins": 0.3377665579319,
39
- "rewards/rejected": -0.24443121254444122,
40
  "step": 10
41
  },
42
  {
43
  "epoch": 0.1477832512315271,
44
- "grad_norm": 29.125,
45
  "learning_rate": 8.840615e-06,
46
- "logits/chosen": -0.02368784323334694,
47
- "logits/rejected": 0.04294583946466446,
48
- "logps/chosen": -677.3331909179688,
49
- "logps/rejected": -989.4844970703125,
50
- "loss": 0.2525,
51
  "rewards/accuracies": 0.98333340883255,
52
- "rewards/chosen": 0.3687068819999695,
53
- "rewards/margins": 1.4707039594650269,
54
- "rewards/rejected": -1.101996898651123,
55
  "step": 15
56
  },
57
  {
58
  "epoch": 0.19704433497536947,
59
- "grad_norm": 3.703125,
60
  "learning_rate": 1.19979775e-05,
61
- "logits/chosen": 0.008908278308808804,
62
- "logits/rejected": 0.017966633662581444,
63
- "logps/chosen": -703.7459716796875,
64
- "logps/rejected": -1041.655029296875,
65
- "loss": 0.0458,
66
- "rewards/accuracies": 1.0,
67
- "rewards/chosen": 0.7780574560165405,
68
- "rewards/margins": 4.076639652252197,
69
- "rewards/rejected": -3.298582077026367,
70
  "step": 20
71
  },
72
  {
73
  "epoch": 0.24630541871921183,
74
- "grad_norm": 12.0,
75
  "learning_rate": 1.5155340000000002e-05,
76
- "logits/chosen": -0.024580026045441628,
77
- "logits/rejected": 0.03568558394908905,
78
- "logps/chosen": -717.9066162109375,
79
- "logps/rejected": -1072.755126953125,
80
- "loss": 0.016,
81
- "rewards/accuracies": 1.0,
82
- "rewards/chosen": 1.2561551332473755,
83
- "rewards/margins": 7.924638271331787,
84
- "rewards/rejected": -6.668482780456543,
85
  "step": 25
86
  },
87
  {
88
  "epoch": 0.2955665024630542,
89
- "grad_norm": 0.2138671875,
90
  "learning_rate": 1.8312702500000002e-05,
91
- "logits/chosen": 0.021068373695015907,
92
- "logits/rejected": -0.008218185044825077,
93
- "logps/chosen": -672.0543212890625,
94
- "logps/rejected": -1118.3712158203125,
95
  "loss": 0.0009,
96
  "rewards/accuracies": 1.0,
97
- "rewards/chosen": 1.809730887413025,
98
- "rewards/margins": 12.14515495300293,
99
- "rewards/rejected": -10.335424423217773,
100
  "step": 30
101
  },
102
  {
103
  "epoch": 0.3448275862068966,
104
- "grad_norm": 7.1875,
105
  "learning_rate": 2.1470065e-05,
106
- "logits/chosen": 0.0051301405765116215,
107
- "logits/rejected": -0.02955557033419609,
108
- "logps/chosen": -665.8123779296875,
109
- "logps/rejected": -1053.738037109375,
110
- "loss": 0.0086,
111
- "rewards/accuracies": 1.0,
112
- "rewards/chosen": 1.6763460636138916,
113
- "rewards/margins": 13.757688522338867,
114
- "rewards/rejected": -12.081342697143555,
115
  "step": 35
116
  },
117
  {
118
  "epoch": 0.39408866995073893,
119
- "grad_norm": 4.1875,
120
  "learning_rate": 2.2092428004483083e-05,
121
- "logits/chosen": 0.07094338536262512,
122
- "logits/rejected": 0.05427859351038933,
123
- "logps/chosen": -715.7424926757812,
124
- "logps/rejected": -1175.1407470703125,
125
- "loss": 0.002,
126
- "rewards/accuracies": 1.0,
127
- "rewards/chosen": 2.44687819480896,
128
- "rewards/margins": 15.842519760131836,
129
- "rewards/rejected": -13.39564037322998,
130
  "step": 40
131
  },
132
  {
133
  "epoch": 0.4433497536945813,
134
- "grad_norm": 0.00194549560546875,
135
  "learning_rate": 2.205545499586072e-05,
136
- "logits/chosen": -0.023238930851221085,
137
- "logits/rejected": -0.05826393887400627,
138
- "logps/chosen": -682.96142578125,
139
- "logps/rejected": -1163.7904052734375,
140
  "loss": 0.0002,
141
  "rewards/accuracies": 1.0,
142
- "rewards/chosen": 2.871985912322998,
143
- "rewards/margins": 16.598369598388672,
144
- "rewards/rejected": -13.7263822555542,
145
  "step": 45
146
  },
147
  {
148
  "epoch": 0.49261083743842365,
149
- "grad_norm": 0.5390625,
150
  "learning_rate": 2.1990176011484834e-05,
151
- "logits/chosen": 0.07197682559490204,
152
- "logits/rejected": -0.03699414059519768,
153
- "logps/chosen": -663.2843627929688,
154
- "logps/rejected": -1101.3746337890625,
155
  "loss": 0.0012,
156
  "rewards/accuracies": 1.0,
157
- "rewards/chosen": 2.7080256938934326,
158
- "rewards/margins": 16.22707176208496,
159
- "rewards/rejected": -13.519048690795898,
160
  "step": 50
161
  },
162
  {
163
  "epoch": 0.541871921182266,
164
- "grad_norm": 1.1484375,
165
  "learning_rate": 2.1896815242896443e-05,
166
- "logits/chosen": -0.07335661351680756,
167
- "logits/rejected": -0.07419757544994354,
168
- "logps/chosen": -613.652099609375,
169
- "logps/rejected": -1092.04052734375,
170
- "loss": 0.0016,
171
- "rewards/accuracies": 1.0,
172
- "rewards/chosen": 2.8754172325134277,
173
- "rewards/margins": 16.448772430419922,
174
- "rewards/rejected": -13.573356628417969,
175
  "step": 55
176
  },
177
  {
178
  "epoch": 0.5911330049261084,
179
- "grad_norm": 0.00213623046875,
180
  "learning_rate": 2.1775693324596527e-05,
181
- "logits/chosen": -0.03252274543046951,
182
- "logits/rejected": -0.03407047688961029,
183
- "logps/chosen": -665.9483642578125,
184
- "logps/rejected": -1142.9947509765625,
185
  "loss": 0.0005,
186
  "rewards/accuracies": 1.0,
187
- "rewards/chosen": 3.2369766235351562,
188
- "rewards/margins": 17.267452239990234,
189
- "rewards/rejected": -14.030476570129395,
190
  "step": 60
191
  },
192
  {
193
  "epoch": 0.6403940886699507,
194
- "grad_norm": 0.00213623046875,
195
  "learning_rate": 2.1627226232871688e-05,
196
- "logits/chosen": -0.02839934267103672,
197
- "logits/rejected": -0.16694848239421844,
198
- "logps/chosen": -728.3703002929688,
199
- "logps/rejected": -1130.7135009765625,
200
  "loss": 0.0001,
201
  "rewards/accuracies": 1.0,
202
- "rewards/chosen": 3.4549667835235596,
203
- "rewards/margins": 17.37836265563965,
204
- "rewards/rejected": -13.9233980178833,
205
  "step": 65
206
  },
207
  {
208
  "epoch": 0.6896551724137931,
209
- "grad_norm": 0.1435546875,
210
  "learning_rate": 2.1451923857181784e-05,
211
- "logits/chosen": -0.0006940827006474137,
212
- "logits/rejected": -0.10031332820653915,
213
- "logps/chosen": -674.8314819335938,
214
- "logps/rejected": -1093.62890625,
215
  "loss": 0.0007,
216
  "rewards/accuracies": 1.0,
217
- "rewards/chosen": 3.454847812652588,
218
- "rewards/margins": 16.464736938476562,
219
- "rewards/rejected": -13.0098876953125,
220
  "step": 70
221
  },
222
  {
223
  "epoch": 0.7389162561576355,
224
- "grad_norm": 0.27734375,
225
  "learning_rate": 2.12503882490159e-05,
226
- "logits/chosen": -0.08651714771986008,
227
- "logits/rejected": -0.1013587936758995,
228
- "logps/chosen": -670.3421020507812,
229
- "logps/rejected": -1148.192626953125,
230
  "loss": 0.0003,
231
  "rewards/accuracies": 1.0,
232
- "rewards/chosen": 3.3947932720184326,
233
- "rewards/margins": 16.52945899963379,
234
- "rewards/rejected": -13.134666442871094,
235
  "step": 75
236
  },
237
  {
238
  "epoch": 0.7881773399014779,
239
- "grad_norm": 0.2890625,
240
  "learning_rate": 2.1023311554230692e-05,
241
- "logits/chosen": 0.04961782321333885,
242
- "logits/rejected": -0.06455739587545395,
243
- "logps/chosen": -684.1675415039062,
244
- "logps/rejected": -1145.384033203125,
245
  "loss": 0.0002,
246
  "rewards/accuracies": 1.0,
247
- "rewards/chosen": 3.4156594276428223,
248
- "rewards/margins": 16.826187133789062,
249
- "rewards/rejected": -13.410530090332031,
250
  "step": 80
251
  },
252
  {
253
  "epoch": 0.8374384236453202,
254
- "grad_norm": 0.11962890625,
255
  "learning_rate": 2.0771473635972312e-05,
256
- "logits/chosen": -0.07051565498113632,
257
- "logits/rejected": -0.06034456938505173,
258
- "logps/chosen": -641.9849853515625,
259
- "logps/rejected": -1189.5823974609375,
260
  "loss": 0.0001,
261
  "rewards/accuracies": 1.0,
262
- "rewards/chosen": 3.3825697898864746,
263
- "rewards/margins": 16.967540740966797,
264
- "rewards/rejected": -13.584970474243164,
265
  "step": 85
266
  },
267
  {
268
  "epoch": 0.8866995073891626,
269
- "grad_norm": 0.051025390625,
270
  "learning_rate": 2.0495739396345457e-05,
271
- "logits/chosen": -0.08308638632297516,
272
- "logits/rejected": -0.15096285939216614,
273
- "logps/chosen": -629.1686401367188,
274
- "logps/rejected": -1102.910888671875,
275
  "loss": 0.0003,
276
  "rewards/accuracies": 1.0,
277
- "rewards/chosen": 3.448202133178711,
278
- "rewards/margins": 17.18029022216797,
279
- "rewards/rejected": -13.732088088989258,
280
  "step": 90
281
  },
282
  {
283
  "epoch": 0.9359605911330049,
284
- "grad_norm": 0.0172119140625,
285
  "learning_rate": 2.01970558060281e-05,
286
- "logits/chosen": -0.025356780737638474,
287
- "logits/rejected": -0.04741012677550316,
288
- "logps/chosen": -725.537353515625,
289
- "logps/rejected": -1180.159912109375,
290
  "loss": 0.0,
291
  "rewards/accuracies": 1.0,
292
- "rewards/chosen": 3.787024974822998,
293
- "rewards/margins": 17.374534606933594,
294
- "rewards/rejected": -13.587509155273438,
295
  "step": 95
296
  },
297
  {
298
  "epoch": 0.9852216748768473,
299
  "grad_norm": 0.004241943359375,
300
  "learning_rate": 1.9876448652033145e-05,
301
- "logits/chosen": -0.12176716327667236,
302
- "logits/rejected": -0.1069510206580162,
303
- "logps/chosen": -615.0023193359375,
304
- "logps/rejected": -1111.8590087890625,
305
  "loss": 0.001,
306
  "rewards/accuracies": 1.0,
307
- "rewards/chosen": 3.566725492477417,
308
- "rewards/margins": 16.64693832397461,
309
- "rewards/rejected": -13.080212593078613,
310
  "step": 100
311
  },
312
  {
313
  "epoch": 0.9950738916256158,
314
- "eval_logits/chosen": -0.10971267521381378,
315
- "eval_logits/rejected": -0.11195576936006546,
316
- "eval_logps/chosen": -621.429443359375,
317
- "eval_logps/rejected": -1077.3065185546875,
318
- "eval_loss": 0.00026110102771781385,
319
  "eval_rewards/accuracies": 1.0,
320
- "eval_rewards/chosen": 3.707453727722168,
321
- "eval_rewards/margins": 16.420101165771484,
322
- "eval_rewards/rejected": -12.71264934539795,
323
- "eval_runtime": 16.3811,
324
- "eval_samples_per_second": 12.209,
325
- "eval_steps_per_second": 12.209,
326
  "step": 101
327
  },
328
  {
329
  "epoch": 1.0295566502463054,
330
- "grad_norm": 0.00640869140625,
331
  "learning_rate": 1.9535019014786414e-05,
332
- "logits/chosen": -0.061172205954790115,
333
- "logits/rejected": -0.10132347047328949,
334
- "logps/chosen": -672.3265380859375,
335
- "logps/rejected": -1173.51171875,
336
  "loss": 0.0,
337
  "rewards/accuracies": 1.0,
338
- "rewards/chosen": 3.708890199661255,
339
- "rewards/margins": 17.31882667541504,
340
- "rewards/rejected": -13.60993766784668,
341
  "step": 105
342
  },
343
  {
344
  "epoch": 1.0788177339901477,
345
- "grad_norm": 0.001983642578125,
346
  "learning_rate": 1.9173939486619933e-05,
347
- "logits/chosen": -0.05034123733639717,
348
- "logits/rejected": -0.10526905208826065,
349
- "logps/chosen": -647.2199096679688,
350
- "logps/rejected": -1188.275634765625,
351
  "loss": 0.0,
352
  "rewards/accuracies": 1.0,
353
- "rewards/chosen": 4.165084362030029,
354
- "rewards/margins": 17.745838165283203,
355
- "rewards/rejected": -13.5807523727417,
356
  "step": 110
357
  },
358
  {
359
  "epoch": 1.1280788177339902,
360
- "grad_norm": 0.0123291015625,
361
  "learning_rate": 1.8794450144667584e-05,
362
- "logits/chosen": -0.037617627531290054,
363
- "logits/rejected": -0.12743313610553741,
364
- "logps/chosen": -771.9480590820312,
365
- "logps/rejected": -1097.7689208984375,
366
  "loss": 0.0,
367
  "rewards/accuracies": 1.0,
368
- "rewards/chosen": 4.783888339996338,
369
- "rewards/margins": 18.24011993408203,
370
- "rewards/rejected": -13.456230163574219,
371
  "step": 115
372
  },
373
  {
374
  "epoch": 1.1773399014778325,
375
- "grad_norm": 0.000865936279296875,
376
  "learning_rate": 1.839785429199364e-05,
377
- "logits/chosen": -0.00499363336712122,
378
- "logits/rejected": -0.14917989075183868,
379
- "logps/chosen": -711.33447265625,
380
- "logps/rejected": -1193.215576171875,
381
  "loss": 0.0,
382
  "rewards/accuracies": 1.0,
383
- "rewards/chosen": 4.206370830535889,
384
- "rewards/margins": 18.47035026550293,
385
- "rewards/rejected": -14.263978958129883,
386
  "step": 120
387
  },
388
  {
389
  "epoch": 1.2266009852216748,
390
- "grad_norm": 0.004608154296875,
391
  "learning_rate": 1.7985513981580707e-05,
392
- "logits/chosen": -0.07984370738267899,
393
- "logits/rejected": -0.14964917302131653,
394
- "logps/chosen": -656.4620361328125,
395
- "logps/rejected": -1094.466064453125,
396
  "loss": 0.0,
397
  "rewards/accuracies": 1.0,
398
- "rewards/chosen": 3.7183139324188232,
399
- "rewards/margins": 17.3912353515625,
400
- "rewards/rejected": -13.67292308807373,
401
  "step": 125
402
  },
403
  {
404
  "epoch": 1.2758620689655173,
405
- "grad_norm": 0.01708984375,
406
  "learning_rate": 1.7558845338549242e-05,
407
- "logits/chosen": -0.07335531711578369,
408
- "logits/rejected": -0.25236526131629944,
409
- "logps/chosen": -668.4403076171875,
410
- "logps/rejected": -1086.022216796875,
411
  "loss": 0.0,
412
  "rewards/accuracies": 1.0,
413
- "rewards/chosen": 4.3127336502075195,
414
- "rewards/margins": 17.868593215942383,
415
- "rewards/rejected": -13.555859565734863,
416
  "step": 130
417
  },
418
  {
419
  "epoch": 1.3251231527093597,
420
- "grad_norm": 0.003265380859375,
421
  "learning_rate": 1.711931369667393e-05,
422
- "logits/chosen": -0.10842283070087433,
423
- "logits/rejected": -0.20305411517620087,
424
- "logps/chosen": -665.7677001953125,
425
- "logps/rejected": -1082.3480224609375,
426
  "loss": 0.0,
427
  "rewards/accuracies": 1.0,
428
- "rewards/chosen": 3.790182590484619,
429
- "rewards/margins": 17.2746639251709,
430
- "rewards/rejected": -13.484481811523438,
431
  "step": 135
432
  },
433
  {
434
  "epoch": 1.374384236453202,
435
- "grad_norm": 0.004364013671875,
436
  "learning_rate": 1.666842856589978e-05,
437
- "logits/chosen": -0.10986592620611191,
438
- "logits/rejected": -0.11519701778888702,
439
- "logps/chosen": -634.3655395507812,
440
- "logps/rejected": -1161.351806640625,
441
  "loss": 0.0,
442
  "rewards/accuracies": 1.0,
443
- "rewards/chosen": 3.9017367362976074,
444
- "rewards/margins": 17.374921798706055,
445
- "rewards/rejected": -13.473180770874023,
446
  "step": 140
447
  },
448
  {
449
  "epoch": 1.4236453201970443,
450
- "grad_norm": 0.000965118408203125,
451
  "learning_rate": 1.6207738448141366e-05,
452
- "logits/chosen": -0.07416743785142899,
453
- "logits/rejected": -0.16824878752231598,
454
- "logps/chosen": -629.0369262695312,
455
- "logps/rejected": -1140.7796630859375,
456
  "loss": 0.0,
457
  "rewards/accuracies": 1.0,
458
- "rewards/chosen": 3.62548828125,
459
- "rewards/margins": 17.625173568725586,
460
- "rewards/rejected": -13.999687194824219,
461
  "step": 145
462
  },
463
  {
464
  "epoch": 1.4729064039408866,
465
- "grad_norm": 0.0030364990234375,
466
  "learning_rate": 1.573882551916961e-05,
467
- "logits/chosen": -0.12653779983520508,
468
- "logits/rejected": -0.12018134444952011,
469
- "logps/chosen": -682.4252319335938,
470
- "logps/rejected": -1164.7567138671875,
471
  "loss": 0.0,
472
  "rewards/accuracies": 1.0,
473
- "rewards/chosen": 3.8734822273254395,
474
- "rewards/margins": 17.484914779663086,
475
- "rewards/rejected": -13.611432075500488,
476
  "step": 150
477
  },
478
  {
479
  "epoch": 1.522167487684729,
480
- "grad_norm": 0.06494140625,
481
  "learning_rate": 1.5263300194850375e-05,
482
- "logits/chosen": -0.13870279490947723,
483
- "logits/rejected": -0.17405085265636444,
484
- "logps/chosen": -562.0906372070312,
485
- "logps/rejected": -1064.6165771484375,
486
  "loss": 0.0,
487
  "rewards/accuracies": 1.0,
488
- "rewards/chosen": 3.8661842346191406,
489
- "rewards/margins": 16.854862213134766,
490
- "rewards/rejected": -12.988679885864258,
491
  "step": 155
492
  },
493
  {
494
  "epoch": 1.5714285714285714,
495
- "grad_norm": 0.0101318359375,
496
  "learning_rate": 1.478279560039646e-05,
497
- "logits/chosen": 0.00353141943924129,
498
- "logits/rejected": -0.052297573536634445,
499
- "logps/chosen": -631.2164916992188,
500
- "logps/rejected": -1117.577392578125,
501
  "loss": 0.0,
502
  "rewards/accuracies": 1.0,
503
- "rewards/chosen": 3.469149112701416,
504
- "rewards/margins": 16.642498016357422,
505
- "rewards/rejected": -13.17335033416748,
506
  "step": 160
507
  },
508
  {
509
  "epoch": 1.6206896551724137,
510
- "grad_norm": 0.003143310546875,
511
  "learning_rate": 1.429896196162745e-05,
512
- "logits/chosen": -0.0989263653755188,
513
- "logits/rejected": -0.08890175819396973,
514
- "logps/chosen": -593.7097778320312,
515
- "logps/rejected": -1195.1968994140625,
516
  "loss": 0.0,
517
  "rewards/accuracies": 1.0,
518
- "rewards/chosen": 4.126656532287598,
519
- "rewards/margins": 17.980117797851562,
520
- "rewards/rejected": -13.853459358215332,
521
  "step": 165
522
  },
523
  {
524
  "epoch": 1.6699507389162562,
525
- "grad_norm": 0.0030975341796875,
526
  "learning_rate": 1.3813460937500001e-05,
527
- "logits/chosen": -0.020123030990362167,
528
- "logits/rejected": -0.1141321063041687,
529
- "logps/chosen": -704.115478515625,
530
- "logps/rejected": -1211.7939453125,
531
  "loss": 0.0,
532
  "rewards/accuracies": 1.0,
533
- "rewards/chosen": 3.987809419631958,
534
- "rewards/margins": 18.1489315032959,
535
- "rewards/rejected": -14.161120414733887,
536
  "step": 170
537
  },
538
  {
539
  "epoch": 1.7192118226600985,
540
- "grad_norm": 0.0244140625,
541
  "learning_rate": 1.332795991337255e-05,
542
- "logits/chosen": -0.07781299948692322,
543
- "logits/rejected": -0.1914922446012497,
544
- "logps/chosen": -714.4182739257812,
545
- "logps/rejected": -1111.824951171875,
546
  "loss": 0.0,
547
  "rewards/accuracies": 1.0,
548
- "rewards/chosen": 4.549325942993164,
549
- "rewards/margins": 18.17251205444336,
550
- "rewards/rejected": -13.623186111450195,
551
  "step": 175
552
  },
553
  {
554
  "epoch": 1.7684729064039408,
555
- "grad_norm": 0.0003814697265625,
556
  "learning_rate": 1.2844126274603544e-05,
557
- "logits/chosen": -0.04165253788232803,
558
- "logits/rejected": -0.11223433166742325,
559
- "logps/chosen": -728.790771484375,
560
- "logps/rejected": -1157.66552734375,
561
  "loss": 0.0,
562
  "rewards/accuracies": 1.0,
563
- "rewards/chosen": 4.202837944030762,
564
- "rewards/margins": 17.818559646606445,
565
- "rewards/rejected": -13.615720748901367,
566
  "step": 180
567
  },
568
  {
569
  "epoch": 1.8177339901477834,
570
- "grad_norm": 0.005523681640625,
571
  "learning_rate": 1.2363621680149627e-05,
572
- "logits/chosen": 0.018531261011958122,
573
- "logits/rejected": -0.08929164707660675,
574
- "logps/chosen": -655.3833618164062,
575
- "logps/rejected": -1119.193115234375,
576
  "loss": 0.0,
577
  "rewards/accuracies": 1.0,
578
- "rewards/chosen": 3.5976531505584717,
579
- "rewards/margins": 17.0244197845459,
580
- "rewards/rejected": -13.426763534545898,
581
  "step": 185
582
  },
583
  {
584
  "epoch": 1.8669950738916257,
585
- "grad_norm": 0.00177764892578125,
586
  "learning_rate": 1.1888096355830394e-05,
587
- "logits/chosen": -0.008384434506297112,
588
- "logits/rejected": -0.078025683760643,
589
- "logps/chosen": -641.1484985351562,
590
- "logps/rejected": -1152.9144287109375,
591
  "loss": 0.0,
592
  "rewards/accuracies": 1.0,
593
- "rewards/chosen": 3.942249298095703,
594
- "rewards/margins": 17.213258743286133,
595
- "rewards/rejected": -13.271008491516113,
596
  "step": 190
597
  },
598
  {
599
  "epoch": 1.916256157635468,
600
- "grad_norm": 0.0306396484375,
601
  "learning_rate": 1.1419183426858638e-05,
602
- "logits/chosen": -0.11397171020507812,
603
- "logits/rejected": -0.14867517352104187,
604
- "logps/chosen": -606.4990844726562,
605
- "logps/rejected": -1104.8248291015625,
606
  "loss": 0.0,
607
  "rewards/accuracies": 1.0,
608
- "rewards/chosen": 4.0419487953186035,
609
- "rewards/margins": 17.517881393432617,
610
- "rewards/rejected": -13.475932121276855,
611
  "step": 195
612
  },
613
  {
614
  "epoch": 1.9655172413793105,
615
- "grad_norm": 0.0322265625,
616
  "learning_rate": 1.0958493309100225e-05,
617
- "logits/chosen": -0.04851642996072769,
618
- "logits/rejected": -0.09745767712593079,
619
- "logps/chosen": -662.3472900390625,
620
- "logps/rejected": -1182.0906982421875,
621
  "loss": 0.0,
622
  "rewards/accuracies": 1.0,
623
- "rewards/chosen": 4.098772048950195,
624
- "rewards/margins": 18.182476043701172,
625
- "rewards/rejected": -14.083703994750977,
626
  "step": 200
627
  },
628
  {
629
  "epoch": 1.9852216748768474,
630
- "eval_logits/chosen": -0.11506563425064087,
631
- "eval_logits/rejected": -0.11980016529560089,
632
- "eval_logps/chosen": -620.9696655273438,
633
- "eval_logps/rejected": -1077.2542724609375,
634
- "eval_loss": 0.00023559275723528117,
635
  "eval_rewards/accuracies": 1.0,
636
- "eval_rewards/chosen": 3.7534308433532715,
637
- "eval_rewards/margins": 16.460861206054688,
638
- "eval_rewards/rejected": -12.70743179321289,
639
- "eval_runtime": 17.0739,
640
- "eval_samples_per_second": 11.714,
641
- "eval_steps_per_second": 11.714,
642
  "step": 202
643
  }
644
  ],
 
11
  "log_history": [
12
  {
13
  "epoch": 0.04926108374384237,
14
+ "grad_norm": 108.0,
15
  "learning_rate": 2.52589e-06,
16
+ "logits/chosen": 0.06021759659051895,
17
+ "logits/rejected": -0.009350213222205639,
18
+ "logps/chosen": -714.9625244140625,
19
+ "logps/rejected": -992.0533447265625,
20
+ "loss": 0.6838,
21
+ "rewards/accuracies": 0.366666704416275,
22
+ "rewards/chosen": 0.011166826821863651,
23
+ "rewards/margins": 0.022149886935949326,
24
+ "rewards/rejected": -0.010983060114085674,
25
  "step": 5
26
  },
27
  {
28
  "epoch": 0.09852216748768473,
29
+ "grad_norm": 73.5,
30
  "learning_rate": 5.6832524999999995e-06,
31
+ "logits/chosen": 0.045873817056417465,
32
+ "logits/rejected": 0.003844240214675665,
33
+ "logps/chosen": -671.487060546875,
34
+ "logps/rejected": -1002.9957885742188,
35
  "loss": 0.5469,
36
+ "rewards/accuracies": 0.8999999761581421,
37
+ "rewards/chosen": 0.0998205691576004,
38
+ "rewards/margins": 0.3382015824317932,
39
+ "rewards/rejected": -0.2383810579776764,
40
  "step": 10
41
  },
42
  {
43
  "epoch": 0.1477832512315271,
44
+ "grad_norm": 30.625,
45
  "learning_rate": 8.840615e-06,
46
+ "logits/chosen": -0.023920465260744095,
47
+ "logits/rejected": 0.042549438774585724,
48
+ "logps/chosen": -677.3431396484375,
49
+ "logps/rejected": -989.5335693359375,
50
+ "loss": 0.2497,
51
  "rewards/accuracies": 0.98333340883255,
52
+ "rewards/chosen": 0.3677126467227936,
53
+ "rewards/margins": 1.4746158123016357,
54
+ "rewards/rejected": -1.106903076171875,
55
  "step": 15
56
  },
57
  {
58
  "epoch": 0.19704433497536947,
59
+ "grad_norm": 3.71875,
60
  "learning_rate": 1.19979775e-05,
61
+ "logits/chosen": 0.008176160976290703,
62
+ "logits/rejected": 0.018394723534584045,
63
+ "logps/chosen": -703.6475830078125,
64
+ "logps/rejected": -1041.759765625,
65
+ "loss": 0.0449,
66
+ "rewards/accuracies": 1.0,
67
+ "rewards/chosen": 0.787887454032898,
68
+ "rewards/margins": 4.096963405609131,
69
+ "rewards/rejected": -3.3090755939483643,
70
  "step": 20
71
  },
72
  {
73
  "epoch": 0.24630541871921183,
74
+ "grad_norm": 12.5,
75
  "learning_rate": 1.5155340000000002e-05,
76
+ "logits/chosen": -0.024350730702280998,
77
+ "logits/rejected": 0.03716661036014557,
78
+ "logps/chosen": -717.7275390625,
79
+ "logps/rejected": -1072.546630859375,
80
+ "loss": 0.0161,
81
+ "rewards/accuracies": 1.0,
82
+ "rewards/chosen": 1.2740657329559326,
83
+ "rewards/margins": 7.921713829040527,
84
+ "rewards/rejected": -6.647648811340332,
85
  "step": 25
86
  },
87
  {
88
  "epoch": 0.2955665024630542,
89
+ "grad_norm": 0.2109375,
90
  "learning_rate": 1.8312702500000002e-05,
91
+ "logits/chosen": 0.02022477611899376,
92
+ "logits/rejected": -0.005769997835159302,
93
+ "logps/chosen": -672.1201171875,
94
+ "logps/rejected": -1118.390380859375,
95
  "loss": 0.0009,
96
  "rewards/accuracies": 1.0,
97
+ "rewards/chosen": 1.8031457662582397,
98
+ "rewards/margins": 12.140481948852539,
99
+ "rewards/rejected": -10.337335586547852,
100
  "step": 30
101
  },
102
  {
103
  "epoch": 0.3448275862068966,
104
+ "grad_norm": 8.6875,
105
  "learning_rate": 2.1470065e-05,
106
+ "logits/chosen": 0.0037163645029067993,
107
+ "logits/rejected": -0.027010012418031693,
108
+ "logps/chosen": -665.9528198242188,
109
+ "logps/rejected": -1054.0537109375,
110
+ "loss": 0.0097,
111
+ "rewards/accuracies": 1.0,
112
+ "rewards/chosen": 1.6622908115386963,
113
+ "rewards/margins": 13.775215148925781,
114
+ "rewards/rejected": -12.112924575805664,
115
  "step": 35
116
  },
117
  {
118
  "epoch": 0.39408866995073893,
119
+ "grad_norm": 4.53125,
120
  "learning_rate": 2.2092428004483083e-05,
121
+ "logits/chosen": 0.0688682422041893,
122
+ "logits/rejected": 0.05512089654803276,
123
+ "logps/chosen": -715.6795654296875,
124
+ "logps/rejected": -1175.619140625,
125
+ "loss": 0.0022,
126
+ "rewards/accuracies": 1.0,
127
+ "rewards/chosen": 2.4531707763671875,
128
+ "rewards/margins": 15.8966703414917,
129
+ "rewards/rejected": -13.443501472473145,
130
  "step": 40
131
  },
132
  {
133
  "epoch": 0.4433497536945813,
134
+ "grad_norm": 0.0022735595703125,
135
  "learning_rate": 2.205545499586072e-05,
136
+ "logits/chosen": -0.025104057043790817,
137
+ "logits/rejected": -0.059289705008268356,
138
+ "logps/chosen": -683.1431884765625,
139
+ "logps/rejected": -1164.0970458984375,
140
  "loss": 0.0002,
141
  "rewards/accuracies": 1.0,
142
+ "rewards/chosen": 2.8538169860839844,
143
+ "rewards/margins": 16.61087989807129,
144
+ "rewards/rejected": -13.757061958312988,
145
  "step": 45
146
  },
147
  {
148
  "epoch": 0.49261083743842365,
149
+ "grad_norm": 0.5078125,
150
  "learning_rate": 2.1990176011484834e-05,
151
+ "logits/chosen": 0.06939034163951874,
152
+ "logits/rejected": -0.03862878680229187,
153
+ "logps/chosen": -663.315185546875,
154
+ "logps/rejected": -1101.8394775390625,
155
  "loss": 0.0012,
156
  "rewards/accuracies": 1.0,
157
+ "rewards/chosen": 2.704948663711548,
158
+ "rewards/margins": 16.27048110961914,
159
+ "rewards/rejected": -13.565534591674805,
160
  "step": 50
161
  },
162
  {
163
  "epoch": 0.541871921182266,
164
+ "grad_norm": 1.09375,
165
  "learning_rate": 2.1896815242896443e-05,
166
+ "logits/chosen": -0.07729315012693405,
167
+ "logits/rejected": -0.07702396810054779,
168
+ "logps/chosen": -613.7337036132812,
169
+ "logps/rejected": -1092.618408203125,
170
+ "loss": 0.0014,
171
+ "rewards/accuracies": 1.0,
172
+ "rewards/chosen": 2.867260694503784,
173
+ "rewards/margins": 16.498395919799805,
174
+ "rewards/rejected": -13.631135940551758,
175
  "step": 55
176
  },
177
  {
178
  "epoch": 0.5911330049261084,
179
+ "grad_norm": 0.002410888671875,
180
  "learning_rate": 2.1775693324596527e-05,
181
+ "logits/chosen": -0.03670992702245712,
182
+ "logits/rejected": -0.035895608365535736,
183
+ "logps/chosen": -666.19970703125,
184
+ "logps/rejected": -1143.6500244140625,
185
  "loss": 0.0005,
186
  "rewards/accuracies": 1.0,
187
+ "rewards/chosen": 3.211841583251953,
188
+ "rewards/margins": 17.307846069335938,
189
+ "rewards/rejected": -14.0960054397583,
190
  "step": 60
191
  },
192
  {
193
  "epoch": 0.6403940886699507,
194
+ "grad_norm": 0.001922607421875,
195
  "learning_rate": 2.1627226232871688e-05,
196
+ "logits/chosen": -0.031517110764980316,
197
+ "logits/rejected": -0.16813872754573822,
198
+ "logps/chosen": -728.2982177734375,
199
+ "logps/rejected": -1131.3756103515625,
200
  "loss": 0.0001,
201
  "rewards/accuracies": 1.0,
202
+ "rewards/chosen": 3.4621803760528564,
203
+ "rewards/margins": 17.45178985595703,
204
+ "rewards/rejected": -13.98961067199707,
205
  "step": 65
206
  },
207
  {
208
  "epoch": 0.6896551724137931,
209
+ "grad_norm": 0.11474609375,
210
  "learning_rate": 2.1451923857181784e-05,
211
+ "logits/chosen": -0.003007475985214114,
212
+ "logits/rejected": -0.1008603423833847,
213
+ "logps/chosen": -675.1759033203125,
214
+ "logps/rejected": -1094.3150634765625,
215
  "loss": 0.0007,
216
  "rewards/accuracies": 1.0,
217
+ "rewards/chosen": 3.4204094409942627,
218
+ "rewards/margins": 16.498920440673828,
219
+ "rewards/rejected": -13.078511238098145,
220
  "step": 70
221
  },
222
  {
223
  "epoch": 0.7389162561576355,
224
+ "grad_norm": 0.302734375,
225
  "learning_rate": 2.12503882490159e-05,
226
+ "logits/chosen": -0.08852293342351913,
227
+ "logits/rejected": -0.10156140476465225,
228
+ "logps/chosen": -670.6781005859375,
229
+ "logps/rejected": -1148.835693359375,
230
  "loss": 0.0003,
231
  "rewards/accuracies": 1.0,
232
+ "rewards/chosen": 3.3611931800842285,
233
+ "rewards/margins": 16.560178756713867,
234
+ "rewards/rejected": -13.198987007141113,
235
  "step": 75
236
  },
237
  {
238
  "epoch": 0.7881773399014779,
239
+ "grad_norm": 0.29296875,
240
  "learning_rate": 2.1023311554230692e-05,
241
+ "logits/chosen": 0.0469479113817215,
242
+ "logits/rejected": -0.06342597305774689,
243
+ "logps/chosen": -684.2789916992188,
244
+ "logps/rejected": -1146.1676025390625,
245
  "loss": 0.0002,
246
  "rewards/accuracies": 1.0,
247
+ "rewards/chosen": 3.4045166969299316,
248
+ "rewards/margins": 16.89340591430664,
249
+ "rewards/rejected": -13.488889694213867,
250
  "step": 80
251
  },
252
  {
253
  "epoch": 0.8374384236453202,
254
+ "grad_norm": 0.1240234375,
255
  "learning_rate": 2.0771473635972312e-05,
256
+ "logits/chosen": -0.07357874512672424,
257
+ "logits/rejected": -0.06026272848248482,
258
+ "logps/chosen": -642.2686767578125,
259
+ "logps/rejected": -1190.205810546875,
260
  "loss": 0.0001,
261
  "rewards/accuracies": 1.0,
262
+ "rewards/chosen": 3.354203701019287,
263
+ "rewards/margins": 17.001527786254883,
264
+ "rewards/rejected": -13.647323608398438,
265
  "step": 85
266
  },
267
  {
268
  "epoch": 0.8866995073891626,
269
+ "grad_norm": 0.060302734375,
270
  "learning_rate": 2.0495739396345457e-05,
271
+ "logits/chosen": -0.08571887761354446,
272
+ "logits/rejected": -0.1507987231016159,
273
+ "logps/chosen": -629.473388671875,
274
+ "logps/rejected": -1103.9293212890625,
275
  "loss": 0.0003,
276
  "rewards/accuracies": 1.0,
277
+ "rewards/chosen": 3.417731761932373,
278
+ "rewards/margins": 17.251663208007812,
279
+ "rewards/rejected": -13.833930969238281,
280
  "step": 90
281
  },
282
  {
283
  "epoch": 0.9359605911330049,
284
+ "grad_norm": 0.0128173828125,
285
  "learning_rate": 2.01970558060281e-05,
286
+ "logits/chosen": -0.027310481294989586,
287
+ "logits/rejected": -0.04605848342180252,
288
+ "logps/chosen": -725.526611328125,
289
+ "logps/rejected": -1180.865478515625,
290
  "loss": 0.0,
291
  "rewards/accuracies": 1.0,
292
+ "rewards/chosen": 3.788097858428955,
293
+ "rewards/margins": 17.4461727142334,
294
+ "rewards/rejected": -13.658073425292969,
295
  "step": 95
296
  },
297
  {
298
  "epoch": 0.9852216748768473,
299
  "grad_norm": 0.004241943359375,
300
  "learning_rate": 1.9876448652033145e-05,
301
+ "logits/chosen": -0.12386312335729599,
302
+ "logits/rejected": -0.10513818264007568,
303
+ "logps/chosen": -615.34033203125,
304
+ "logps/rejected": -1112.7601318359375,
305
  "loss": 0.001,
306
  "rewards/accuracies": 1.0,
307
+ "rewards/chosen": 3.5329208374023438,
308
+ "rewards/margins": 16.703248977661133,
309
+ "rewards/rejected": -13.170324325561523,
310
  "step": 100
311
  },
312
  {
313
  "epoch": 0.9950738916256158,
314
+ "eval_logits/chosen": -0.11170963197946548,
315
+ "eval_logits/rejected": -0.11342550069093704,
316
+ "eval_logps/chosen": -621.7440795898438,
317
+ "eval_logps/rejected": -1078.06591796875,
318
+ "eval_loss": 0.00033413898199796677,
319
  "eval_rewards/accuracies": 1.0,
320
+ "eval_rewards/chosen": 3.6759870052337646,
321
+ "eval_rewards/margins": 16.46457862854004,
322
+ "eval_rewards/rejected": -12.788591384887695,
323
+ "eval_runtime": 16.5627,
324
+ "eval_samples_per_second": 12.075,
325
+ "eval_steps_per_second": 12.075,
326
  "step": 101
327
  },
328
  {
329
  "epoch": 1.0295566502463054,
330
+ "grad_norm": 0.006683349609375,
331
  "learning_rate": 1.9535019014786414e-05,
332
+ "logits/chosen": -0.06293856352567673,
333
+ "logits/rejected": -0.09947305917739868,
334
+ "logps/chosen": -672.5732421875,
335
+ "logps/rejected": -1174.2159423828125,
336
  "loss": 0.0,
337
  "rewards/accuracies": 1.0,
338
+ "rewards/chosen": 3.6842219829559326,
339
+ "rewards/margins": 17.364566802978516,
340
+ "rewards/rejected": -13.680344581604004,
341
  "step": 105
342
  },
343
  {
344
  "epoch": 1.0788177339901477,
345
+ "grad_norm": 0.0015106201171875,
346
  "learning_rate": 1.9173939486619933e-05,
347
+ "logits/chosen": -0.05223611742258072,
348
+ "logits/rejected": -0.10244528949260712,
349
+ "logps/chosen": -647.3909912109375,
350
+ "logps/rejected": -1189.0113525390625,
351
  "loss": 0.0,
352
  "rewards/accuracies": 1.0,
353
+ "rewards/chosen": 4.147972583770752,
354
+ "rewards/margins": 17.80228614807129,
355
+ "rewards/rejected": -13.654312133789062,
356
  "step": 110
357
  },
358
  {
359
  "epoch": 1.1280788177339902,
360
+ "grad_norm": 0.01220703125,
361
  "learning_rate": 1.8794450144667584e-05,
362
+ "logits/chosen": -0.03947216272354126,
363
+ "logits/rejected": -0.12559063732624054,
364
+ "logps/chosen": -772.1909790039062,
365
+ "logps/rejected": -1098.6795654296875,
366
  "loss": 0.0,
367
  "rewards/accuracies": 1.0,
368
+ "rewards/chosen": 4.7596025466918945,
369
+ "rewards/margins": 18.306901931762695,
370
+ "rewards/rejected": -13.547297477722168,
371
  "step": 115
372
  },
373
  {
374
  "epoch": 1.1773399014778325,
375
+ "grad_norm": 0.000743865966796875,
376
  "learning_rate": 1.839785429199364e-05,
377
+ "logits/chosen": -0.0065459804609417915,
378
+ "logits/rejected": -0.1465989649295807,
379
+ "logps/chosen": -711.794189453125,
380
+ "logps/rejected": -1193.7640380859375,
381
  "loss": 0.0,
382
  "rewards/accuracies": 1.0,
383
+ "rewards/chosen": 4.160403251647949,
384
+ "rewards/margins": 18.479259490966797,
385
+ "rewards/rejected": -14.318857192993164,
386
  "step": 120
387
  },
388
  {
389
  "epoch": 1.2266009852216748,
390
+ "grad_norm": 0.003936767578125,
391
  "learning_rate": 1.7985513981580707e-05,
392
+ "logits/chosen": -0.0810680016875267,
393
+ "logits/rejected": -0.14764997363090515,
394
+ "logps/chosen": -656.9730224609375,
395
+ "logps/rejected": -1095.385986328125,
396
  "loss": 0.0,
397
  "rewards/accuracies": 1.0,
398
+ "rewards/chosen": 3.6672158241271973,
399
+ "rewards/margins": 17.432126998901367,
400
+ "rewards/rejected": -13.764910697937012,
401
  "step": 125
402
  },
403
  {
404
  "epoch": 1.2758620689655173,
405
+ "grad_norm": 0.0162353515625,
406
  "learning_rate": 1.7558845338549242e-05,
407
+ "logits/chosen": -0.07523246854543686,
408
+ "logits/rejected": -0.25009721517562866,
409
+ "logps/chosen": -668.7523193359375,
410
+ "logps/rejected": -1086.5867919921875,
411
  "loss": 0.0,
412
  "rewards/accuracies": 1.0,
413
+ "rewards/chosen": 4.281540393829346,
414
+ "rewards/margins": 17.89385986328125,
415
+ "rewards/rejected": -13.61231803894043,
416
  "step": 130
417
  },
418
  {
419
  "epoch": 1.3251231527093597,
420
+ "grad_norm": 0.002838134765625,
421
  "learning_rate": 1.711931369667393e-05,
422
+ "logits/chosen": -0.1100362166762352,
423
+ "logits/rejected": -0.200296089053154,
424
+ "logps/chosen": -666.0404052734375,
425
+ "logps/rejected": -1082.836669921875,
426
  "loss": 0.0,
427
  "rewards/accuracies": 1.0,
428
+ "rewards/chosen": 3.762917995452881,
429
+ "rewards/margins": 17.2962703704834,
430
+ "rewards/rejected": -13.533352851867676,
431
  "step": 135
432
  },
433
  {
434
  "epoch": 1.374384236453202,
435
+ "grad_norm": 0.0068359375,
436
  "learning_rate": 1.666842856589978e-05,
437
+ "logits/chosen": -0.11170603334903717,
438
+ "logits/rejected": -0.11424056440591812,
439
+ "logps/chosen": -634.7129516601562,
440
+ "logps/rejected": -1162.6055908203125,
441
  "loss": 0.0,
442
  "rewards/accuracies": 1.0,
443
+ "rewards/chosen": 3.8669981956481934,
444
+ "rewards/margins": 17.46556854248047,
445
+ "rewards/rejected": -13.59857177734375,
446
  "step": 140
447
  },
448
  {
449
  "epoch": 1.4236453201970443,
450
+ "grad_norm": 0.0010833740234375,
451
  "learning_rate": 1.6207738448141366e-05,
452
+ "logits/chosen": -0.07618793845176697,
453
+ "logits/rejected": -0.1659926474094391,
454
+ "logps/chosen": -629.3489379882812,
455
+ "logps/rejected": -1141.2314453125,
456
  "loss": 0.0,
457
  "rewards/accuracies": 1.0,
458
+ "rewards/chosen": 3.594289779663086,
459
+ "rewards/margins": 17.63915252685547,
460
+ "rewards/rejected": -14.044862747192383,
461
  "step": 145
462
  },
463
  {
464
  "epoch": 1.4729064039408866,
465
+ "grad_norm": 0.0028839111328125,
466
  "learning_rate": 1.573882551916961e-05,
467
+ "logits/chosen": -0.1288968324661255,
468
+ "logits/rejected": -0.11787639558315277,
469
+ "logps/chosen": -682.9420166015625,
470
+ "logps/rejected": -1165.747802734375,
471
  "loss": 0.0,
472
  "rewards/accuracies": 1.0,
473
+ "rewards/chosen": 3.8218026161193848,
474
+ "rewards/margins": 17.532339096069336,
475
+ "rewards/rejected": -13.710535049438477,
476
  "step": 150
477
  },
478
  {
479
  "epoch": 1.522167487684729,
480
+ "grad_norm": 0.0712890625,
481
  "learning_rate": 1.5263300194850375e-05,
482
+ "logits/chosen": -0.14020918309688568,
483
+ "logits/rejected": -0.17282378673553467,
484
+ "logps/chosen": -562.5584716796875,
485
+ "logps/rejected": -1065.2425537109375,
486
  "loss": 0.0,
487
  "rewards/accuracies": 1.0,
488
+ "rewards/chosen": 3.819398880004883,
489
+ "rewards/margins": 16.870691299438477,
490
+ "rewards/rejected": -13.051290512084961,
491
  "step": 155
492
  },
493
  {
494
  "epoch": 1.5714285714285714,
495
+ "grad_norm": 0.01177978515625,
496
  "learning_rate": 1.478279560039646e-05,
497
+ "logits/chosen": 0.0017956334631890059,
498
+ "logits/rejected": -0.04993182793259621,
499
+ "logps/chosen": -631.7723388671875,
500
+ "logps/rejected": -1118.0230712890625,
501
  "loss": 0.0,
502
  "rewards/accuracies": 1.0,
503
+ "rewards/chosen": 3.4135658740997314,
504
+ "rewards/margins": 16.631460189819336,
505
+ "rewards/rejected": -13.217893600463867,
506
  "step": 160
507
  },
508
  {
509
  "epoch": 1.6206896551724137,
510
+ "grad_norm": 0.0032806396484375,
511
  "learning_rate": 1.429896196162745e-05,
512
+ "logits/chosen": -0.10030888020992279,
513
+ "logits/rejected": -0.0873989462852478,
514
+ "logps/chosen": -593.56884765625,
515
+ "logps/rejected": -1196.1925048828125,
516
  "loss": 0.0,
517
  "rewards/accuracies": 1.0,
518
+ "rewards/chosen": 4.140743255615234,
519
+ "rewards/margins": 18.093753814697266,
520
+ "rewards/rejected": -13.953012466430664,
521
  "step": 165
522
  },
523
  {
524
  "epoch": 1.6699507389162562,
525
+ "grad_norm": 0.004180908203125,
526
  "learning_rate": 1.3813460937500001e-05,
527
+ "logits/chosen": -0.022314613685011864,
528
+ "logits/rejected": -0.1113683357834816,
529
+ "logps/chosen": -704.1790771484375,
530
+ "logps/rejected": -1212.56103515625,
531
  "loss": 0.0,
532
  "rewards/accuracies": 1.0,
533
+ "rewards/chosen": 3.9814553260803223,
534
+ "rewards/margins": 18.219303131103516,
535
+ "rewards/rejected": -14.237849235534668,
536
  "step": 170
537
  },
538
  {
539
  "epoch": 1.7192118226600985,
540
+ "grad_norm": 0.0185546875,
541
  "learning_rate": 1.332795991337255e-05,
542
+ "logits/chosen": -0.0795547217130661,
543
+ "logits/rejected": -0.18906092643737793,
544
+ "logps/chosen": -714.6734619140625,
545
+ "logps/rejected": -1112.668212890625,
546
  "loss": 0.0,
547
  "rewards/accuracies": 1.0,
548
+ "rewards/chosen": 4.523798942565918,
549
+ "rewards/margins": 18.23130989074707,
550
+ "rewards/rejected": -13.70750904083252,
551
  "step": 175
552
  },
553
  {
554
  "epoch": 1.7684729064039408,
555
+ "grad_norm": 0.000331878662109375,
556
  "learning_rate": 1.2844126274603544e-05,
557
+ "logits/chosen": -0.04283156991004944,
558
+ "logits/rejected": -0.11096321046352386,
559
+ "logps/chosen": -729.1672973632812,
560
+ "logps/rejected": -1158.585205078125,
561
  "loss": 0.0,
562
  "rewards/accuracies": 1.0,
563
+ "rewards/chosen": 4.16518497467041,
564
+ "rewards/margins": 17.872875213623047,
565
+ "rewards/rejected": -13.707687377929688,
566
  "step": 180
567
  },
568
  {
569
  "epoch": 1.8177339901477834,
570
+ "grad_norm": 0.005584716796875,
571
  "learning_rate": 1.2363621680149627e-05,
572
+ "logits/chosen": 0.017648298293352127,
573
+ "logits/rejected": -0.08744337409734726,
574
+ "logps/chosen": -655.9172973632812,
575
+ "logps/rejected": -1120.0281982421875,
576
  "loss": 0.0,
577
  "rewards/accuracies": 1.0,
578
+ "rewards/chosen": 3.544255495071411,
579
+ "rewards/margins": 17.05453109741211,
580
+ "rewards/rejected": -13.510274887084961,
581
  "step": 185
582
  },
583
  {
584
  "epoch": 1.8669950738916257,
585
+ "grad_norm": 0.00147247314453125,
586
  "learning_rate": 1.1888096355830394e-05,
587
+ "logits/chosen": -0.009853709489107132,
588
+ "logits/rejected": -0.07630355656147003,
589
+ "logps/chosen": -641.5875244140625,
590
+ "logps/rejected": -1153.7431640625,
591
  "loss": 0.0,
592
  "rewards/accuracies": 1.0,
593
+ "rewards/chosen": 3.8983428478240967,
594
+ "rewards/margins": 17.25222396850586,
595
+ "rewards/rejected": -13.353883743286133,
596
  "step": 190
597
  },
598
  {
599
  "epoch": 1.916256157635468,
600
+ "grad_norm": 0.040771484375,
601
  "learning_rate": 1.1419183426858638e-05,
602
+ "logits/chosen": -0.11551934480667114,
603
+ "logits/rejected": -0.1467764675617218,
604
+ "logps/chosen": -606.7501220703125,
605
+ "logps/rejected": -1105.7303466796875,
606
  "loss": 0.0,
607
  "rewards/accuracies": 1.0,
608
+ "rewards/chosen": 4.016844749450684,
609
+ "rewards/margins": 17.583324432373047,
610
+ "rewards/rejected": -13.56648063659668,
611
  "step": 195
612
  },
613
  {
614
  "epoch": 1.9655172413793105,
615
+ "grad_norm": 0.029541015625,
616
  "learning_rate": 1.0958493309100225e-05,
617
+ "logits/chosen": -0.050420112907886505,
618
+ "logits/rejected": -0.09584550559520721,
619
+ "logps/chosen": -662.5157470703125,
620
+ "logps/rejected": -1182.98486328125,
621
  "loss": 0.0,
622
  "rewards/accuracies": 1.0,
623
+ "rewards/chosen": 4.08192253112793,
624
+ "rewards/margins": 18.255043029785156,
625
+ "rewards/rejected": -14.173120498657227,
626
  "step": 200
627
  },
628
  {
629
  "epoch": 1.9852216748768474,
630
+ "eval_logits/chosen": -0.11707988381385803,
631
+ "eval_logits/rejected": -0.12201124429702759,
632
+ "eval_logps/chosen": -621.3673706054688,
633
+ "eval_logps/rejected": -1078.1207275390625,
634
+ "eval_loss": 0.00028668390586972237,
635
  "eval_rewards/accuracies": 1.0,
636
+ "eval_rewards/chosen": 3.7136542797088623,
637
+ "eval_rewards/margins": 16.507728576660156,
638
+ "eval_rewards/rejected": -12.794075012207031,
639
+ "eval_runtime": 17.6249,
640
+ "eval_samples_per_second": 11.348,
641
+ "eval_steps_per_second": 11.348,
642
  "step": 202
643
  }
644
  ],
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1ca5fc5ec268283fdc7c45c09fa4a36b3a361610f08042358654cebbf1cb9274
3
  size 6392
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5292107477bd9a079b607dd783099e2b018d5b0a17a19c5c05ea03394a245fa8
3
  size 6392