derko83 commited on
Commit
f3565a6
·
verified ·
1 Parent(s): 6a86618

Upload folder using huggingface_hub

Browse files
Files changed (6) hide show
  1. model.safetensors +1 -1
  2. optimizer.pt +1 -1
  3. rng_state.pth +1 -1
  4. scheduler.pt +1 -1
  5. trainer_state.json +1167 -267
  6. training_args.bin +1 -1
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:30692ceb235c4429caf85385bb5458694b351ae55100bbfb3f50a74566d5f9f1
3
  size 2384234968
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ccb63f5f7e748555daeeb40663638cb4ee36e3274e2691792da29cde974199b6
3
  size 2384234968
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fd62ddddee6c23a6650a91d1b2d97b6552130af70cc0c03b742eeb84fd96ec01
3
  size 4768663315
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bc572568881d9b759c8f59d75bf1b9cb28ba836994c8f509e1180f5fbe94eb68
3
  size 4768663315
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:250560ab3d528161ab3659b120def6e4a9ab4b457e3399603bbcfa40db3efc90
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2534e434cd5abbb8f7668d3eab0549db0ef95d6a797a3efa86b712e8e32266a7
3
  size 14645
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:60dc60940088350df2476daec612777256353a222718d12ad1d77c0a8edad709
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7eda8fb70e02ca89c72bc8ed5c2b4af18bf6ab7515831776b7b43399b9d94e12
3
  size 1465
trainer_state.json CHANGED
@@ -2,453 +2,1353 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.0,
6
  "eval_steps": 500,
7
- "global_step": 1493,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 0.033489618218352314,
14
- "grad_norm": 90.51985931396484,
15
- "learning_rate": 4.835900870730074e-06,
16
  "logits/chosen": NaN,
17
  "logits/rejected": NaN,
18
- "logps/chosen": -179.42369079589844,
19
- "logps/rejected": -227.34011840820312,
20
- "loss": 0.6272,
21
- "rewards/accuracies": 0.4987500011920929,
22
- "rewards/chosen": -0.41917884349823,
23
- "rewards/margins": 0.4206826388835907,
24
- "rewards/rejected": -0.8398614525794983,
25
  "step": 50
26
  },
27
  {
28
  "epoch": 0.06697923643670463,
29
- "grad_norm": 122.67408752441406,
30
- "learning_rate": 4.668452779638312e-06,
31
  "logits/chosen": NaN,
32
  "logits/rejected": NaN,
33
- "logps/chosen": -189.38446044921875,
34
- "logps/rejected": -243.21636962890625,
35
- "loss": 0.6008,
36
- "rewards/accuracies": 0.5625,
37
- "rewards/chosen": -0.9909499287605286,
38
- "rewards/margins": 0.8664093613624573,
39
- "rewards/rejected": -1.8573591709136963,
40
  "step": 100
41
  },
42
  {
43
  "epoch": 0.10046885465505694,
44
- "grad_norm": 69.09490203857422,
45
- "learning_rate": 4.501004688546551e-06,
46
  "logits/chosen": NaN,
47
  "logits/rejected": NaN,
48
- "logps/chosen": -172.51051330566406,
49
- "logps/rejected": -237.64398193359375,
50
- "loss": 0.5445,
51
- "rewards/accuracies": 0.5912500023841858,
52
- "rewards/chosen": -0.773537814617157,
53
- "rewards/margins": 1.0922305583953857,
54
- "rewards/rejected": -1.8657684326171875,
55
  "step": 150
56
  },
57
  {
58
  "epoch": 0.13395847287340926,
59
- "grad_norm": 92.90730285644531,
60
- "learning_rate": 4.333556597454789e-06,
61
  "logits/chosen": NaN,
62
  "logits/rejected": NaN,
63
- "logps/chosen": -185.27850341796875,
64
- "logps/rejected": -240.15431213378906,
65
- "loss": 0.5958,
66
- "rewards/accuracies": 0.581250011920929,
67
- "rewards/chosen": -0.8680741190910339,
68
- "rewards/margins": 1.0129902362823486,
69
- "rewards/rejected": -1.8810642957687378,
70
  "step": 200
71
  },
72
  {
73
  "epoch": 0.16744809109176156,
74
- "grad_norm": 41.30106735229492,
75
- "learning_rate": 4.166108506363028e-06,
76
  "logits/chosen": NaN,
77
  "logits/rejected": NaN,
78
- "logps/chosen": -185.0808563232422,
79
- "logps/rejected": -243.77664184570312,
80
- "loss": 0.5232,
81
- "rewards/accuracies": 0.5950000286102295,
82
- "rewards/chosen": -0.6241927742958069,
83
- "rewards/margins": 1.5013166666030884,
84
- "rewards/rejected": -2.125509262084961,
85
  "step": 250
86
  },
87
  {
88
  "epoch": 0.20093770931011387,
89
- "grad_norm": 88.7403793334961,
90
- "learning_rate": 3.998660415271266e-06,
91
  "logits/chosen": NaN,
92
  "logits/rejected": NaN,
93
- "logps/chosen": -184.0787353515625,
94
- "logps/rejected": -243.6924285888672,
95
- "loss": 0.5419,
96
- "rewards/accuracies": 0.5950000286102295,
97
- "rewards/chosen": -0.8100302219390869,
98
- "rewards/margins": 1.521090030670166,
99
- "rewards/rejected": -2.331120491027832,
100
  "step": 300
101
  },
102
  {
103
  "epoch": 0.23442732752846618,
104
- "grad_norm": 70.95861053466797,
105
- "learning_rate": 3.831212324179505e-06,
106
- "logits/chosen": NaN,
107
- "logits/rejected": -1.532848834991455,
108
- "logps/chosen": -182.43124389648438,
109
- "logps/rejected": -248.5233917236328,
110
- "loss": 0.5533,
111
- "rewards/accuracies": 0.5824999809265137,
112
- "rewards/chosen": -1.069503903388977,
113
- "rewards/margins": 1.6424156427383423,
114
- "rewards/rejected": -2.7119195461273193,
115
  "step": 350
116
  },
117
  {
118
  "epoch": 0.2679169457468185,
119
- "grad_norm": 71.3283462524414,
120
- "learning_rate": 3.663764233087743e-06,
121
  "logits/chosen": NaN,
122
  "logits/rejected": NaN,
123
- "logps/chosen": -180.90724182128906,
124
- "logps/rejected": -253.06655883789062,
125
- "loss": 0.5131,
126
- "rewards/accuracies": 0.6449999809265137,
127
- "rewards/chosen": -0.9932506084442139,
128
- "rewards/margins": 2.001958131790161,
129
- "rewards/rejected": -2.995208740234375,
130
  "step": 400
131
  },
132
  {
133
  "epoch": 0.3014065639651708,
134
- "grad_norm": 52.41379928588867,
135
- "learning_rate": 3.496316141995982e-06,
136
  "logits/chosen": NaN,
137
  "logits/rejected": NaN,
138
- "logps/chosen": -187.8169708251953,
139
- "logps/rejected": -250.7092742919922,
140
- "loss": 0.574,
141
- "rewards/accuracies": 0.6087499856948853,
142
- "rewards/chosen": -0.9525413513183594,
143
- "rewards/margins": 1.8562077283859253,
144
- "rewards/rejected": -2.808749198913574,
145
  "step": 450
146
  },
147
  {
148
  "epoch": 0.33489618218352313,
149
- "grad_norm": 86.05554962158203,
150
- "learning_rate": 3.32886805090422e-06,
151
  "logits/chosen": NaN,
152
  "logits/rejected": NaN,
153
- "logps/chosen": -184.93011474609375,
154
- "logps/rejected": -245.39427185058594,
155
- "loss": 0.545,
156
- "rewards/accuracies": 0.6200000047683716,
157
- "rewards/chosen": -0.9822418093681335,
158
- "rewards/margins": 1.7062770128250122,
159
- "rewards/rejected": -2.68851900100708,
160
  "step": 500
161
  },
162
  {
163
  "epoch": 0.3683858004018754,
164
- "grad_norm": 24.4642391204834,
165
- "learning_rate": 3.1614199598124583e-06,
166
  "logits/chosen": NaN,
167
  "logits/rejected": NaN,
168
- "logps/chosen": -177.97482299804688,
169
- "logps/rejected": -253.92169189453125,
170
- "loss": 0.5249,
171
- "rewards/accuracies": 0.6150000095367432,
172
- "rewards/chosen": -0.7585690021514893,
173
- "rewards/margins": 2.060235023498535,
174
- "rewards/rejected": -2.8188037872314453,
175
  "step": 550
176
  },
177
  {
178
  "epoch": 0.40187541862022774,
179
- "grad_norm": 28.010169982910156,
180
- "learning_rate": 2.993971868720697e-06,
181
  "logits/chosen": NaN,
182
  "logits/rejected": NaN,
183
- "logps/chosen": -187.2054443359375,
184
- "logps/rejected": -262.4362487792969,
185
- "loss": 0.531,
186
- "rewards/accuracies": 0.6212499737739563,
187
- "rewards/chosen": -1.0288543701171875,
188
- "rewards/margins": 2.0049030780792236,
189
- "rewards/rejected": -3.033757209777832,
190
  "step": 600
191
  },
192
  {
193
  "epoch": 0.43536503683858,
194
- "grad_norm": 88.00039672851562,
195
- "learning_rate": 2.8265237776289352e-06,
196
  "logits/chosen": NaN,
197
  "logits/rejected": NaN,
198
- "logps/chosen": -199.0946044921875,
199
- "logps/rejected": -257.4405212402344,
200
- "loss": 0.506,
201
- "rewards/accuracies": 0.6175000071525574,
202
- "rewards/chosen": -0.9385867118835449,
203
- "rewards/margins": 1.9474469423294067,
204
- "rewards/rejected": -2.8860340118408203,
205
  "step": 650
206
  },
207
  {
208
  "epoch": 0.46885465505693236,
209
- "grad_norm": 76.95013427734375,
210
- "learning_rate": 2.6590756865371737e-06,
211
  "logits/chosen": NaN,
212
  "logits/rejected": NaN,
213
- "logps/chosen": -188.61651611328125,
214
- "logps/rejected": -255.59820556640625,
215
- "loss": 0.4841,
216
- "rewards/accuracies": 0.6287500262260437,
217
- "rewards/chosen": -0.9187755584716797,
218
- "rewards/margins": 2.190307378768921,
219
- "rewards/rejected": -3.1090831756591797,
220
  "step": 700
221
  },
222
  {
223
  "epoch": 0.5023442732752846,
224
- "grad_norm": 37.079219818115234,
225
- "learning_rate": 2.491627595445412e-06,
226
  "logits/chosen": NaN,
227
  "logits/rejected": NaN,
228
- "logps/chosen": -182.82937622070312,
229
- "logps/rejected": -253.3831787109375,
230
- "loss": 0.5061,
231
- "rewards/accuracies": 0.6324999928474426,
232
- "rewards/chosen": -0.8986356854438782,
233
- "rewards/margins": 2.1806037425994873,
234
- "rewards/rejected": -3.0792391300201416,
235
  "step": 750
236
  },
237
  {
238
  "epoch": 0.535833891493637,
239
- "grad_norm": 31.89027976989746,
240
- "learning_rate": 2.3241795043536505e-06,
241
  "logits/chosen": NaN,
242
  "logits/rejected": NaN,
243
- "logps/chosen": -172.42593383789062,
244
- "logps/rejected": -251.63729858398438,
245
- "loss": 0.4358,
246
- "rewards/accuracies": 0.6625000238418579,
247
- "rewards/chosen": -0.7772516012191772,
248
- "rewards/margins": 2.509305953979492,
249
- "rewards/rejected": -3.286557674407959,
250
  "step": 800
251
  },
252
  {
253
  "epoch": 0.5693235097119893,
254
- "grad_norm": 49.636253356933594,
255
- "learning_rate": 2.156731413261889e-06,
256
  "logits/chosen": NaN,
257
  "logits/rejected": NaN,
258
- "logps/chosen": -177.4981231689453,
259
- "logps/rejected": -252.06234741210938,
260
- "loss": 0.4752,
261
- "rewards/accuracies": 0.6262500286102295,
262
- "rewards/chosen": -0.8873915672302246,
263
- "rewards/margins": 2.302338123321533,
264
- "rewards/rejected": -3.1897289752960205,
265
  "step": 850
266
  },
267
  {
268
  "epoch": 0.6028131279303416,
269
- "grad_norm": 42.22234344482422,
270
- "learning_rate": 1.9892833221701274e-06,
271
  "logits/chosen": NaN,
272
  "logits/rejected": NaN,
273
- "logps/chosen": -185.84585571289062,
274
- "logps/rejected": -251.1869354248047,
275
- "loss": 0.5284,
276
- "rewards/accuracies": 0.6200000047683716,
277
- "rewards/chosen": -0.9590955972671509,
278
- "rewards/margins": 2.267340660095215,
279
- "rewards/rejected": -3.2264363765716553,
280
  "step": 900
281
  },
282
  {
283
  "epoch": 0.6363027461486939,
284
- "grad_norm": 60.56090545654297,
285
- "learning_rate": 1.8218352310783657e-06,
286
  "logits/chosen": NaN,
287
  "logits/rejected": NaN,
288
- "logps/chosen": -182.73912048339844,
289
- "logps/rejected": -245.673828125,
290
- "loss": 0.4587,
291
- "rewards/accuracies": 0.6625000238418579,
292
- "rewards/chosen": -0.6209310293197632,
293
- "rewards/margins": 2.47007155418396,
294
- "rewards/rejected": -3.0910024642944336,
295
  "step": 950
296
  },
297
  {
298
  "epoch": 0.6697923643670463,
299
- "grad_norm": 47.82743453979492,
300
- "learning_rate": 1.6543871399866043e-06,
301
  "logits/chosen": NaN,
302
  "logits/rejected": NaN,
303
- "logps/chosen": -170.28599548339844,
304
- "logps/rejected": -247.58837890625,
305
- "loss": 0.4471,
306
- "rewards/accuracies": 0.6424999833106995,
307
- "rewards/chosen": -0.5806804895401001,
308
- "rewards/margins": 2.4890031814575195,
309
- "rewards/rejected": -3.06968355178833,
310
  "step": 1000
311
  },
312
  {
313
  "epoch": 0.7032819825853985,
314
- "grad_norm": 33.236053466796875,
315
- "learning_rate": 1.4869390488948425e-06,
316
  "logits/chosen": NaN,
317
  "logits/rejected": NaN,
318
- "logps/chosen": -173.26913452148438,
319
- "logps/rejected": -257.9516906738281,
320
- "loss": 0.4204,
321
- "rewards/accuracies": 0.6825000047683716,
322
- "rewards/chosen": -0.4792047142982483,
323
- "rewards/margins": 2.651007890701294,
324
- "rewards/rejected": -3.1302123069763184,
325
  "step": 1050
326
  },
327
  {
328
  "epoch": 0.7367716008037508,
329
- "grad_norm": 39.73881530761719,
330
- "learning_rate": 1.3194909578030812e-06,
331
  "logits/chosen": NaN,
332
  "logits/rejected": NaN,
333
- "logps/chosen": -161.0741424560547,
334
- "logps/rejected": -243.2300262451172,
335
- "loss": 0.4058,
336
- "rewards/accuracies": 0.6625000238418579,
337
- "rewards/chosen": -0.5020321607589722,
338
- "rewards/margins": 2.7466940879821777,
339
- "rewards/rejected": -3.2487261295318604,
340
  "step": 1100
341
  },
342
  {
343
  "epoch": 0.7702612190221031,
344
- "grad_norm": 37.8878173828125,
345
- "learning_rate": 1.1520428667113196e-06,
346
  "logits/chosen": NaN,
347
  "logits/rejected": NaN,
348
- "logps/chosen": -177.0689697265625,
349
- "logps/rejected": -248.60035705566406,
350
- "loss": 0.4246,
351
- "rewards/accuracies": 0.6575000286102295,
352
- "rewards/chosen": -0.6515741944313049,
353
- "rewards/margins": 2.5524582862854004,
354
- "rewards/rejected": -3.2040326595306396,
355
  "step": 1150
356
  },
357
  {
358
  "epoch": 0.8037508372404555,
359
- "grad_norm": 54.17752456665039,
360
- "learning_rate": 9.84594775619558e-07,
361
  "logits/chosen": NaN,
362
  "logits/rejected": NaN,
363
- "logps/chosen": -184.8086395263672,
364
- "logps/rejected": -270.40771484375,
365
- "loss": 0.4499,
366
- "rewards/accuracies": 0.65625,
367
- "rewards/chosen": -0.5556185841560364,
368
- "rewards/margins": 2.5394182205200195,
369
- "rewards/rejected": -3.0950369834899902,
370
  "step": 1200
371
  },
372
  {
373
  "epoch": 0.8372404554588078,
374
- "grad_norm": 37.586639404296875,
375
- "learning_rate": 8.171466845277964e-07,
376
  "logits/chosen": NaN,
377
  "logits/rejected": NaN,
378
- "logps/chosen": -182.44801330566406,
379
- "logps/rejected": -255.1699676513672,
380
- "loss": 0.4307,
381
- "rewards/accuracies": 0.6537500023841858,
382
- "rewards/chosen": -0.6889155507087708,
383
- "rewards/margins": 2.603567123413086,
384
- "rewards/rejected": -3.292482852935791,
385
  "step": 1250
386
  },
387
  {
388
  "epoch": 0.87073007367716,
389
- "grad_norm": 52.10836410522461,
390
- "learning_rate": 6.496985934360349e-07,
391
  "logits/chosen": NaN,
392
  "logits/rejected": NaN,
393
- "logps/chosen": -179.65557861328125,
394
- "logps/rejected": -245.46560668945312,
395
- "loss": 0.4125,
396
- "rewards/accuracies": 0.6637499928474426,
397
- "rewards/chosen": -0.5343858003616333,
398
- "rewards/margins": 2.6487746238708496,
399
- "rewards/rejected": -3.1831603050231934,
400
  "step": 1300
401
  },
402
  {
403
  "epoch": 0.9042196918955124,
404
- "grad_norm": 30.635120391845703,
405
- "learning_rate": 4.822505023442733e-07,
406
  "logits/chosen": NaN,
407
  "logits/rejected": NaN,
408
- "logps/chosen": -175.81723022460938,
409
- "logps/rejected": -254.23828125,
410
- "loss": 0.4304,
411
- "rewards/accuracies": 0.6524999737739563,
412
- "rewards/chosen": -0.5225290060043335,
413
- "rewards/margins": 2.57625675201416,
414
- "rewards/rejected": -3.098785638809204,
415
  "step": 1350
416
  },
417
  {
418
  "epoch": 0.9377093101138647,
419
- "grad_norm": 49.38210678100586,
420
- "learning_rate": 3.1480241125251174e-07,
421
  "logits/chosen": NaN,
422
  "logits/rejected": NaN,
423
- "logps/chosen": -177.96661376953125,
424
- "logps/rejected": -240.63694763183594,
425
- "loss": 0.4411,
426
- "rewards/accuracies": 0.65625,
427
- "rewards/chosen": -0.4394516050815582,
428
- "rewards/margins": 2.5781943798065186,
429
- "rewards/rejected": -3.017645835876465,
430
  "step": 1400
431
  },
432
  {
433
  "epoch": 0.971198928332217,
434
- "grad_norm": 57.8912467956543,
435
- "learning_rate": 1.4735432016075018e-07,
436
  "logits/chosen": NaN,
437
  "logits/rejected": NaN,
438
- "logps/chosen": -176.60218811035156,
439
- "logps/rejected": -250.90786743164062,
440
- "loss": 0.4317,
441
- "rewards/accuracies": 0.6474999785423279,
442
- "rewards/chosen": -0.514062225818634,
443
- "rewards/margins": 2.4711577892303467,
444
- "rewards/rejected": -2.985220432281494,
445
  "step": 1450
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
446
  }
447
  ],
448
  "logging_steps": 50,
449
- "max_steps": 1493,
450
  "num_input_tokens_seen": 0,
451
- "num_train_epochs": 1,
452
  "save_steps": 200,
453
  "stateful_callbacks": {
454
  "TrainerControl": {
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 3.0,
6
  "eval_steps": 500,
7
+ "global_step": 4479,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 0.033489618218352314,
14
+ "grad_norm": 78.33113098144531,
15
+ "learning_rate": 2.1875e-07,
16
  "logits/chosen": NaN,
17
  "logits/rejected": NaN,
18
+ "logps/chosen": -175.2264862060547,
19
+ "logps/rejected": -218.9656982421875,
20
+ "loss": 0.6923,
21
+ "rewards/accuracies": 0.4137499928474426,
22
+ "rewards/chosen": 0.0005424434202723205,
23
+ "rewards/margins": 0.0029623538721352816,
24
+ "rewards/rejected": -0.0024199108593165874,
25
  "step": 50
26
  },
27
  {
28
  "epoch": 0.06697923643670463,
29
+ "grad_norm": 106.17163848876953,
30
+ "learning_rate": 4.419642857142857e-07,
31
  "logits/chosen": NaN,
32
  "logits/rejected": NaN,
33
+ "logps/chosen": -179.5259246826172,
34
+ "logps/rejected": -224.7578887939453,
35
+ "loss": 0.6907,
36
+ "rewards/accuracies": 0.42124998569488525,
37
+ "rewards/chosen": -0.005094751715660095,
38
+ "rewards/margins": 0.00641661649569869,
39
+ "rewards/rejected": -0.011511369608342648,
40
  "step": 100
41
  },
42
  {
43
  "epoch": 0.10046885465505694,
44
+ "grad_norm": 86.04861450195312,
45
+ "learning_rate": 6.651785714285713e-07,
46
  "logits/chosen": NaN,
47
  "logits/rejected": NaN,
48
+ "logps/chosen": -165.04095458984375,
49
+ "logps/rejected": -219.6518096923828,
50
+ "loss": 0.6756,
51
+ "rewards/accuracies": 0.5112500190734863,
52
+ "rewards/chosen": -0.026584235951304436,
53
+ "rewards/margins": 0.03996539115905762,
54
+ "rewards/rejected": -0.0665496289730072,
55
  "step": 150
56
  },
57
  {
58
  "epoch": 0.13395847287340926,
59
+ "grad_norm": 82.77224731445312,
60
+ "learning_rate": 8.88392857142857e-07,
61
  "logits/chosen": NaN,
62
  "logits/rejected": NaN,
63
+ "logps/chosen": -177.75872802734375,
64
+ "logps/rejected": -223.51528930664062,
65
+ "loss": 0.6591,
66
+ "rewards/accuracies": 0.5099999904632568,
67
+ "rewards/chosen": -0.11609632521867752,
68
+ "rewards/margins": 0.10106377303600311,
69
+ "rewards/rejected": -0.21716010570526123,
70
  "step": 200
71
  },
72
  {
73
  "epoch": 0.16744809109176156,
74
+ "grad_norm": 135.95346069335938,
75
+ "learning_rate": 1.1116071428571427e-06,
76
  "logits/chosen": NaN,
77
  "logits/rejected": NaN,
78
+ "logps/chosen": -180.67254638671875,
79
+ "logps/rejected": -226.42140197753906,
80
+ "loss": 0.6295,
81
+ "rewards/accuracies": 0.5099999904632568,
82
+ "rewards/chosen": -0.18336135149002075,
83
+ "rewards/margins": 0.20662552118301392,
84
+ "rewards/rejected": -0.38998690247535706,
85
  "step": 250
86
  },
87
  {
88
  "epoch": 0.20093770931011387,
89
+ "grad_norm": 89.77359771728516,
90
+ "learning_rate": 1.3348214285714285e-06,
91
  "logits/chosen": NaN,
92
  "logits/rejected": NaN,
93
+ "logps/chosen": -178.88687133789062,
94
+ "logps/rejected": -226.58355712890625,
95
+ "loss": 0.602,
96
+ "rewards/accuracies": 0.5637500286102295,
97
+ "rewards/chosen": -0.29084426164627075,
98
+ "rewards/margins": 0.32938891649246216,
99
+ "rewards/rejected": -0.6202332377433777,
100
  "step": 300
101
  },
102
  {
103
  "epoch": 0.23442732752846618,
104
+ "grad_norm": 89.93605041503906,
105
+ "learning_rate": 1.558035714285714e-06,
106
+ "logits/chosen": NaN,
107
+ "logits/rejected": -1.608971118927002,
108
+ "logps/chosen": -176.1905059814453,
109
+ "logps/rejected": -231.0211944580078,
110
+ "loss": 0.5782,
111
+ "rewards/accuracies": 0.5562499761581421,
112
+ "rewards/chosen": -0.4454282522201538,
113
+ "rewards/margins": 0.5162708163261414,
114
+ "rewards/rejected": -0.9616988897323608,
115
  "step": 350
116
  },
117
  {
118
  "epoch": 0.2679169457468185,
119
+ "grad_norm": 113.58289337158203,
120
+ "learning_rate": 1.7812499999999999e-06,
121
  "logits/chosen": NaN,
122
  "logits/rejected": NaN,
123
+ "logps/chosen": -176.52401733398438,
124
+ "logps/rejected": -236.76588439941406,
125
+ "loss": 0.5478,
126
+ "rewards/accuracies": 0.6150000095367432,
127
+ "rewards/chosen": -0.5549299120903015,
128
+ "rewards/margins": 0.8102107048034668,
129
+ "rewards/rejected": -1.3651405572891235,
130
  "step": 400
131
  },
132
  {
133
  "epoch": 0.3014065639651708,
134
+ "grad_norm": 100.28213500976562,
135
+ "learning_rate": 1.999999696300462e-06,
136
  "logits/chosen": NaN,
137
  "logits/rejected": NaN,
138
+ "logps/chosen": -183.1260223388672,
139
+ "logps/rejected": -235.15631103515625,
140
+ "loss": 0.5635,
141
+ "rewards/accuracies": 0.5799999833106995,
142
+ "rewards/chosen": -0.48344433307647705,
143
+ "rewards/margins": 0.770007848739624,
144
+ "rewards/rejected": -1.253452181816101,
145
  "step": 450
146
  },
147
  {
148
  "epoch": 0.33489618218352313,
149
+ "grad_norm": 90.32833099365234,
150
+ "learning_rate": 1.999210181452139e-06,
151
  "logits/chosen": NaN,
152
  "logits/rejected": NaN,
153
+ "logps/chosen": -180.36907958984375,
154
+ "logps/rejected": -232.14285278320312,
155
+ "loss": 0.5376,
156
+ "rewards/accuracies": 0.6087499856948853,
157
+ "rewards/chosen": -0.5261387825012207,
158
+ "rewards/margins": 0.8372372984886169,
159
+ "rewards/rejected": -1.3633761405944824,
160
  "step": 500
161
  },
162
  {
163
  "epoch": 0.3683858004018754,
164
+ "grad_norm": 72.57466125488281,
165
+ "learning_rate": 1.996903560165487e-06,
166
  "logits/chosen": NaN,
167
  "logits/rejected": NaN,
168
+ "logps/chosen": -175.88233947753906,
169
+ "logps/rejected": -242.15728759765625,
170
+ "loss": 0.5083,
171
+ "rewards/accuracies": 0.6225000023841858,
172
+ "rewards/chosen": -0.5493210554122925,
173
+ "rewards/margins": 1.0930429697036743,
174
+ "rewards/rejected": -1.6423640251159668,
175
  "step": 550
176
  },
177
  {
178
  "epoch": 0.40187541862022774,
179
+ "grad_norm": 47.55934143066406,
180
+ "learning_rate": 1.993083334596579e-06,
181
  "logits/chosen": NaN,
182
  "logits/rejected": NaN,
183
+ "logps/chosen": -184.1678924560547,
184
+ "logps/rejected": -251.43661499023438,
185
+ "loss": 0.5193,
186
+ "rewards/accuracies": 0.6225000023841858,
187
+ "rewards/chosen": -0.7250985503196716,
188
+ "rewards/margins": 1.2086968421936035,
189
+ "rewards/rejected": -1.9337953329086304,
190
  "step": 600
191
  },
192
  {
193
  "epoch": 0.43536503683858,
194
+ "grad_norm": 90.7481460571289,
195
+ "learning_rate": 1.987755305015383e-06,
196
  "logits/chosen": NaN,
197
  "logits/rejected": NaN,
198
+ "logps/chosen": -196.693359375,
199
+ "logps/rejected": -247.3010711669922,
200
+ "loss": 0.516,
201
+ "rewards/accuracies": 0.6137499809265137,
202
+ "rewards/chosen": -0.6984607577323914,
203
+ "rewards/margins": 1.173628807067871,
204
+ "rewards/rejected": -1.8720895051956177,
205
  "step": 650
206
  },
207
  {
208
  "epoch": 0.46885465505693236,
209
+ "grad_norm": 86.08389282226562,
210
+ "learning_rate": 1.980927560999178e-06,
211
  "logits/chosen": NaN,
212
  "logits/rejected": NaN,
213
+ "logps/chosen": -186.29693603515625,
214
+ "logps/rejected": -245.04824829101562,
215
+ "loss": 0.5057,
216
+ "rewards/accuracies": 0.612500011920929,
217
+ "rewards/chosen": -0.6868166327476501,
218
+ "rewards/margins": 1.367271900177002,
219
+ "rewards/rejected": -2.0540883541107178,
220
  "step": 700
221
  },
222
  {
223
  "epoch": 0.5023442732752846,
224
+ "grad_norm": 40.12553405761719,
225
+ "learning_rate": 1.9726104691501045e-06,
226
  "logits/chosen": NaN,
227
  "logits/rejected": NaN,
228
+ "logps/chosen": -179.41378784179688,
229
+ "logps/rejected": -240.62547302246094,
230
+ "loss": 0.5132,
231
+ "rewards/accuracies": 0.5975000262260437,
232
+ "rewards/chosen": -0.5570769309997559,
233
+ "rewards/margins": 1.2463946342468262,
234
+ "rewards/rejected": -1.803471326828003,
235
  "step": 750
236
  },
237
  {
238
  "epoch": 0.535833891493637,
239
+ "grad_norm": 36.09309005737305,
240
+ "learning_rate": 1.9628166573554945e-06,
241
  "logits/chosen": NaN,
242
  "logits/rejected": NaN,
243
+ "logps/chosen": -170.22169494628906,
244
+ "logps/rejected": -239.9406280517578,
245
+ "loss": 0.4553,
246
+ "rewards/accuracies": 0.6449999809265137,
247
+ "rewards/chosen": -0.5568282604217529,
248
+ "rewards/margins": 1.5600597858428955,
249
+ "rewards/rejected": -2.1168878078460693,
250
  "step": 800
251
  },
252
  {
253
  "epoch": 0.5693235097119893,
254
+ "grad_norm": 88.8606185913086,
255
+ "learning_rate": 1.951560995614879e-06,
256
  "logits/chosen": NaN,
257
  "logits/rejected": NaN,
258
+ "logps/chosen": -175.4136199951172,
259
+ "logps/rejected": -241.44386291503906,
260
+ "loss": 0.4912,
261
+ "rewards/accuracies": 0.6175000071525574,
262
+ "rewards/chosen": -0.6789398193359375,
263
+ "rewards/margins": 1.448940634727478,
264
+ "rewards/rejected": -2.127880573272705,
265
  "step": 850
266
  },
267
  {
268
  "epoch": 0.6028131279303416,
269
+ "grad_norm": 37.501346588134766,
270
+ "learning_rate": 1.9388605734627843e-06,
271
  "logits/chosen": NaN,
272
  "logits/rejected": NaN,
273
+ "logps/chosen": -183.4543914794922,
274
+ "logps/rejected": -241.45433044433594,
275
+ "loss": 0.505,
276
+ "rewards/accuracies": 0.6212499737739563,
277
+ "rewards/chosen": -0.719947338104248,
278
+ "rewards/margins": 1.5332283973693848,
279
+ "rewards/rejected": -2.253175735473633,
280
  "step": 900
281
  },
282
  {
283
  "epoch": 0.6363027461486939,
284
+ "grad_norm": 58.78173065185547,
285
+ "learning_rate": 1.9247346740215936e-06,
286
  "logits/chosen": NaN,
287
  "logits/rejected": NaN,
288
+ "logps/chosen": -182.4608612060547,
289
+ "logps/rejected": -236.8692169189453,
290
+ "loss": 0.4756,
291
+ "rewards/accuracies": 0.6274999976158142,
292
+ "rewards/chosen": -0.5931037068367004,
293
+ "rewards/margins": 1.6174336671829224,
294
+ "rewards/rejected": -2.2105374336242676,
295
  "step": 950
296
  },
297
  {
298
  "epoch": 0.6697923643670463,
299
+ "grad_norm": 53.627410888671875,
300
+ "learning_rate": 1.909204744723877e-06,
301
  "logits/chosen": NaN,
302
  "logits/rejected": NaN,
303
+ "logps/chosen": -169.64356994628906,
304
+ "logps/rejected": -238.07931518554688,
305
+ "loss": 0.4699,
306
+ "rewards/accuracies": 0.625,
307
+ "rewards/chosen": -0.5164381265640259,
308
+ "rewards/margins": 1.6023368835449219,
309
+ "rewards/rejected": -2.1187753677368164,
310
  "step": 1000
311
  },
312
  {
313
  "epoch": 0.7032819825853985,
314
+ "grad_norm": 47.64691162109375,
315
+ "learning_rate": 1.8922943647486314e-06,
316
  "logits/chosen": NaN,
317
  "logits/rejected": NaN,
318
+ "logps/chosen": -174.08212280273438,
319
+ "logps/rejected": -251.6885223388672,
320
+ "loss": 0.4309,
321
+ "rewards/accuracies": 0.668749988079071,
322
+ "rewards/chosen": -0.560505211353302,
323
+ "rewards/margins": 1.9433872699737549,
324
+ "rewards/rejected": -2.503892421722412,
325
  "step": 1050
326
  },
327
  {
328
  "epoch": 0.7367716008037508,
329
+ "grad_norm": 58.94224166870117,
330
+ "learning_rate": 1.8740292092208816e-06,
331
  "logits/chosen": NaN,
332
  "logits/rejected": NaN,
333
+ "logps/chosen": -162.09487915039062,
334
+ "logps/rejected": -236.79824829101562,
335
+ "loss": 0.4293,
336
+ "rewards/accuracies": 0.6524999737739563,
337
+ "rewards/chosen": -0.6041057705879211,
338
+ "rewards/margins": 2.0014426708221436,
339
+ "rewards/rejected": -2.60554838180542,
340
  "step": 1100
341
  },
342
  {
343
  "epoch": 0.7702612190221031,
344
+ "grad_norm": 41.707763671875,
345
+ "learning_rate": 1.8544370102289943e-06,
346
  "logits/chosen": NaN,
347
  "logits/rejected": NaN,
348
+ "logps/chosen": -177.0761260986328,
349
+ "logps/rejected": -240.7725067138672,
350
+ "loss": 0.4419,
351
+ "rewards/accuracies": 0.6612499952316284,
352
+ "rewards/chosen": -0.6522895097732544,
353
+ "rewards/margins": 1.7689578533172607,
354
+ "rewards/rejected": -2.4212474822998047,
355
  "step": 1150
356
  },
357
  {
358
  "epoch": 0.8037508372404555,
359
+ "grad_norm": 45.48369216918945,
360
+ "learning_rate": 1.83354751471889e-06,
361
  "logits/chosen": NaN,
362
  "logits/rejected": NaN,
363
+ "logps/chosen": -184.2169952392578,
364
+ "logps/rejected": -264.9205322265625,
365
+ "loss": 0.4503,
366
+ "rewards/accuracies": 0.6549999713897705,
367
+ "rewards/chosen": -0.49645543098449707,
368
+ "rewards/margins": 2.04986572265625,
369
+ "rewards/rejected": -2.546321392059326,
370
  "step": 1200
371
  },
372
  {
373
  "epoch": 0.8372404554588078,
374
+ "grad_norm": 51.16058349609375,
375
+ "learning_rate": 1.8113924393290904e-06,
376
  "logits/chosen": NaN,
377
  "logits/rejected": NaN,
378
+ "logps/chosen": -182.03074645996094,
379
+ "logps/rejected": -249.8163604736328,
380
+ "loss": 0.4319,
381
+ "rewards/accuracies": 0.6612499952316284,
382
+ "rewards/chosen": -0.6471911072731018,
383
+ "rewards/margins": 2.1099319458007812,
384
+ "rewards/rejected": -2.7571229934692383,
385
  "step": 1250
386
  },
387
  {
388
  "epoch": 0.87073007367716,
389
+ "grad_norm": 64.02259063720703,
390
+ "learning_rate": 1.7880054222351658e-06,
391
  "logits/chosen": NaN,
392
  "logits/rejected": NaN,
393
+ "logps/chosen": -178.18972778320312,
394
+ "logps/rejected": -237.3641815185547,
395
+ "loss": 0.4155,
396
+ "rewards/accuracies": 0.6725000143051147,
397
+ "rewards/chosen": -0.38780125975608826,
398
+ "rewards/margins": 1.9852185249328613,
399
+ "rewards/rejected": -2.3730199337005615,
400
  "step": 1300
401
  },
402
  {
403
  "epoch": 0.9042196918955124,
404
+ "grad_norm": 35.12641525268555,
405
+ "learning_rate": 1.763421972076705e-06,
406
  "logits/chosen": NaN,
407
  "logits/rejected": NaN,
408
+ "logps/chosen": -175.52285766601562,
409
+ "logps/rejected": -247.11244201660156,
410
+ "loss": 0.4359,
411
+ "rewards/accuracies": 0.6512500047683716,
412
+ "rewards/chosen": -0.493091344833374,
413
+ "rewards/margins": 1.8931076526641846,
414
+ "rewards/rejected": -2.3861987590789795,
415
  "step": 1350
416
  },
417
  {
418
  "epoch": 0.9377093101138647,
419
+ "grad_norm": 64.41110229492188,
420
+ "learning_rate": 1.7376794140443474e-06,
421
  "logits/chosen": NaN,
422
  "logits/rejected": NaN,
423
+ "logps/chosen": -178.29629516601562,
424
+ "logps/rejected": -234.5249481201172,
425
+ "loss": 0.4512,
426
+ "rewards/accuracies": 0.6549999713897705,
427
+ "rewards/chosen": -0.4724200367927551,
428
+ "rewards/margins": 1.9340243339538574,
429
+ "rewards/rejected": -2.4064440727233887,
430
  "step": 1400
431
  },
432
  {
433
  "epoch": 0.971198928332217,
434
+ "grad_norm": 26.93653106689453,
435
+ "learning_rate": 1.7108168332087366e-06,
436
  "logits/chosen": NaN,
437
  "logits/rejected": NaN,
438
+ "logps/chosen": -175.42259216308594,
439
+ "logps/rejected": -243.82032775878906,
440
+ "loss": 0.4343,
441
+ "rewards/accuracies": 0.6512500047683716,
442
+ "rewards/chosen": -0.3961036205291748,
443
+ "rewards/margins": 1.8803616762161255,
444
+ "rewards/rejected": -2.27646541595459,
445
  "step": 1450
446
+ },
447
+ {
448
+ "epoch": 1.0046885465505693,
449
+ "grad_norm": 74.74053955078125,
450
+ "learning_rate": 1.682875015177438e-06,
451
+ "logits/chosen": NaN,
452
+ "logits/rejected": NaN,
453
+ "logps/chosen": -174.56732177734375,
454
+ "logps/rejected": -246.36451721191406,
455
+ "loss": 0.3957,
456
+ "rewards/accuracies": 0.6800000071525574,
457
+ "rewards/chosen": -0.34164169430732727,
458
+ "rewards/margins": 2.248396635055542,
459
+ "rewards/rejected": -2.590038537979126,
460
+ "step": 1500
461
+ },
462
+ {
463
+ "epoch": 1.0381781647689217,
464
+ "grad_norm": 58.65504455566406,
465
+ "learning_rate": 1.6538963841699207e-06,
466
+ "logits/chosen": NaN,
467
+ "logits/rejected": NaN,
468
+ "logps/chosen": -176.5469207763672,
469
+ "logps/rejected": -258.92706298828125,
470
+ "loss": 0.2861,
471
+ "rewards/accuracies": 0.7437499761581421,
472
+ "rewards/chosen": -0.2739707827568054,
473
+ "rewards/margins": 3.0113985538482666,
474
+ "rewards/rejected": -3.2853691577911377,
475
+ "step": 1550
476
+ },
477
+ {
478
+ "epoch": 1.0716677829872738,
479
+ "grad_norm": 59.74324417114258,
480
+ "learning_rate": 1.6239249386046274e-06,
481
+ "logits/chosen": NaN,
482
+ "logits/rejected": NaN,
483
+ "logps/chosen": -177.00692749023438,
484
+ "logps/rejected": -255.23556518554688,
485
+ "loss": 0.2914,
486
+ "rewards/accuracies": 0.7549999952316284,
487
+ "rewards/chosen": -0.4652925729751587,
488
+ "rewards/margins": 3.098710298538208,
489
+ "rewards/rejected": -3.564002752304077,
490
+ "step": 1600
491
+ },
492
+ {
493
+ "epoch": 1.1051574012056262,
494
+ "grad_norm": 37.80025863647461,
495
+ "learning_rate": 1.593006184295927e-06,
496
+ "logits/chosen": NaN,
497
+ "logits/rejected": NaN,
498
+ "logps/chosen": -185.12716674804688,
499
+ "logps/rejected": -254.19509887695312,
500
+ "loss": 0.2798,
501
+ "rewards/accuracies": 0.7524999976158142,
502
+ "rewards/chosen": -0.28863173723220825,
503
+ "rewards/margins": 3.227825880050659,
504
+ "rewards/rejected": -3.516458034515381,
505
+ "step": 1650
506
+ },
507
+ {
508
+ "epoch": 1.1386470194239786,
509
+ "grad_norm": 40.97309875488281,
510
+ "learning_rate": 1.5611870653623825e-06,
511
+ "logits/chosen": NaN,
512
+ "logits/rejected": NaN,
513
+ "logps/chosen": -182.1793975830078,
514
+ "logps/rejected": -245.0845184326172,
515
+ "loss": 0.2778,
516
+ "rewards/accuracies": 0.7450000047683716,
517
+ "rewards/chosen": -0.3949226438999176,
518
+ "rewards/margins": 3.3151471614837646,
519
+ "rewards/rejected": -3.7100696563720703,
520
+ "step": 1700
521
+ },
522
+ {
523
+ "epoch": 1.1721366376423308,
524
+ "grad_norm": 61.272247314453125,
525
+ "learning_rate": 1.5285158929512291e-06,
526
+ "logits/chosen": NaN,
527
+ "logits/rejected": NaN,
528
+ "logps/chosen": -174.18487548828125,
529
+ "logps/rejected": -247.96957397460938,
530
+ "loss": 0.3048,
531
+ "rewards/accuracies": 0.71875,
532
+ "rewards/chosen": -0.4471362233161926,
533
+ "rewards/margins": 3.481740951538086,
534
+ "rewards/rejected": -3.928877830505371,
535
+ "step": 1750
536
+ },
537
+ {
538
+ "epoch": 1.2056262558606832,
539
+ "grad_norm": 20.384906768798828,
540
+ "learning_rate": 1.4950422718872916e-06,
541
+ "logits/chosen": NaN,
542
+ "logits/rejected": NaN,
543
+ "logps/chosen": -177.91143798828125,
544
+ "logps/rejected": -264.8081970214844,
545
+ "loss": 0.2738,
546
+ "rewards/accuracies": 0.7574999928474426,
547
+ "rewards/chosen": -0.4734611213207245,
548
+ "rewards/margins": 3.4893076419830322,
549
+ "rewards/rejected": -3.962768793106079,
550
+ "step": 1800
551
+ },
552
+ {
553
+ "epoch": 1.2391158740790356,
554
+ "grad_norm": 46.84432601928711,
555
+ "learning_rate": 1.4608170253576945e-06,
556
+ "logits/chosen": NaN,
557
+ "logits/rejected": NaN,
558
+ "logps/chosen": -171.02236938476562,
559
+ "logps/rejected": -259.7280578613281,
560
+ "loss": 0.2928,
561
+ "rewards/accuracies": 0.7262499928474426,
562
+ "rewards/chosen": -0.6498711109161377,
563
+ "rewards/margins": 3.556124210357666,
564
+ "rewards/rejected": -4.205995082855225,
565
+ "step": 1850
566
+ },
567
+ {
568
+ "epoch": 1.2726054922973877,
569
+ "grad_norm": 40.36602020263672,
570
+ "learning_rate": 1.4258921177467371e-06,
571
+ "logits/chosen": NaN,
572
+ "logits/rejected": NaN,
573
+ "logps/chosen": -176.40257263183594,
574
+ "logps/rejected": -251.6402130126953,
575
+ "loss": 0.301,
576
+ "rewards/accuracies": 0.7325000166893005,
577
+ "rewards/chosen": -0.7374945878982544,
578
+ "rewards/margins": 3.618178606033325,
579
+ "rewards/rejected": -4.355673789978027,
580
+ "step": 1900
581
+ },
582
+ {
583
+ "epoch": 1.3060951105157401,
584
+ "grad_norm": 33.35322952270508,
585
+ "learning_rate": 1.3903205757380715e-06,
586
+ "logits/chosen": NaN,
587
+ "logits/rejected": NaN,
588
+ "logps/chosen": -177.98854064941406,
589
+ "logps/rejected": -259.6983337402344,
590
+ "loss": 0.2985,
591
+ "rewards/accuracies": 0.7275000214576721,
592
+ "rewards/chosen": -0.7513535022735596,
593
+ "rewards/margins": 3.433237314224243,
594
+ "rewards/rejected": -4.184591293334961,
595
+ "step": 1950
596
+ },
597
+ {
598
+ "epoch": 1.3395847287340925,
599
+ "grad_norm": 31.858760833740234,
600
+ "learning_rate": 1.3541564078039942e-06,
601
+ "logits/chosen": NaN,
602
+ "logits/rejected": NaN,
603
+ "logps/chosen": -176.97511291503906,
604
+ "logps/rejected": -267.1122131347656,
605
+ "loss": 0.307,
606
+ "rewards/accuracies": 0.7174999713897705,
607
+ "rewards/chosen": -0.6912581920623779,
608
+ "rewards/margins": 3.4836156368255615,
609
+ "rewards/rejected": -4.1748738288879395,
610
+ "step": 2000
611
+ },
612
+ {
613
+ "epoch": 1.3730743469524447,
614
+ "grad_norm": 40.272186279296875,
615
+ "learning_rate": 1.3174545222040757e-06,
616
+ "logits/chosen": NaN,
617
+ "logits/rejected": NaN,
618
+ "logps/chosen": -181.2541046142578,
619
+ "logps/rejected": -267.8948974609375,
620
+ "loss": 0.2764,
621
+ "rewards/accuracies": 0.7612500190734863,
622
+ "rewards/chosen": -0.5613307356834412,
623
+ "rewards/margins": 3.6199841499328613,
624
+ "rewards/rejected": -4.181314468383789,
625
+ "step": 2050
626
+ },
627
+ {
628
+ "epoch": 1.406563965170797,
629
+ "grad_norm": 20.189088821411133,
630
+ "learning_rate": 1.2802706436176447e-06,
631
+ "logits/chosen": NaN,
632
+ "logits/rejected": NaN,
633
+ "logps/chosen": -186.3399658203125,
634
+ "logps/rejected": -275.252685546875,
635
+ "loss": 0.2673,
636
+ "rewards/accuracies": 0.7512500286102295,
637
+ "rewards/chosen": -0.49821099638938904,
638
+ "rewards/margins": 3.6726813316345215,
639
+ "rewards/rejected": -4.170892238616943,
640
+ "step": 2100
641
+ },
642
+ {
643
+ "epoch": 1.4400535833891492,
644
+ "grad_norm": 28.09309196472168,
645
+ "learning_rate": 1.2426612285366904e-06,
646
+ "logits/chosen": NaN,
647
+ "logits/rejected": NaN,
648
+ "logps/chosen": -180.54571533203125,
649
+ "logps/rejected": -272.14337158203125,
650
+ "loss": 0.2833,
651
+ "rewards/accuracies": 0.7649999856948853,
652
+ "rewards/chosen": -0.5274211168289185,
653
+ "rewards/margins": 3.785543203353882,
654
+ "rewards/rejected": -4.31296443939209,
655
+ "step": 2150
656
+ },
657
+ {
658
+ "epoch": 1.4735432016075016,
659
+ "grad_norm": 5.396151542663574,
660
+ "learning_rate": 1.2046833795476566e-06,
661
+ "logits/chosen": NaN,
662
+ "logits/rejected": NaN,
663
+ "logps/chosen": -178.48960876464844,
664
+ "logps/rejected": -268.61944580078125,
665
+ "loss": 0.2594,
666
+ "rewards/accuracies": 0.768750011920929,
667
+ "rewards/chosen": -0.3929290771484375,
668
+ "rewards/margins": 3.8942084312438965,
669
+ "rewards/rejected": -4.287137508392334,
670
+ "step": 2200
671
+ },
672
+ {
673
+ "epoch": 1.507032819825854,
674
+ "grad_norm": 26.636991500854492,
675
+ "learning_rate": 1.16639475863226e-06,
676
+ "logits/chosen": NaN,
677
+ "logits/rejected": NaN,
678
+ "logps/chosen": -183.34547424316406,
679
+ "logps/rejected": -259.8311462402344,
680
+ "loss": 0.3026,
681
+ "rewards/accuracies": 0.7200000286102295,
682
+ "rewards/chosen": -0.5500699281692505,
683
+ "rewards/margins": 3.565783739089966,
684
+ "rewards/rejected": -4.115853786468506,
685
+ "step": 2250
686
+ },
687
+ {
688
+ "epoch": 1.5405224380442064,
689
+ "grad_norm": 14.03653335571289,
690
+ "learning_rate": 1.1278534996189831e-06,
691
+ "logits/chosen": NaN,
692
+ "logits/rejected": NaN,
693
+ "logps/chosen": -182.8995361328125,
694
+ "logps/rejected": -273.84112548828125,
695
+ "loss": 0.2603,
696
+ "rewards/accuracies": 0.7487499713897705,
697
+ "rewards/chosen": -0.5162584185600281,
698
+ "rewards/margins": 4.0679030418396,
699
+ "rewards/rejected": -4.584161758422852,
700
+ "step": 2300
701
+ },
702
+ {
703
+ "epoch": 1.5740120562625586,
704
+ "grad_norm": 67.45540618896484,
705
+ "learning_rate": 1.0891181199181518e-06,
706
+ "logits/chosen": NaN,
707
+ "logits/rejected": NaN,
708
+ "logps/chosen": -176.06849670410156,
709
+ "logps/rejected": -265.9678649902344,
710
+ "loss": 0.272,
711
+ "rewards/accuracies": 0.7475000023841858,
712
+ "rewards/chosen": -0.5778465867042542,
713
+ "rewards/margins": 3.9320404529571533,
714
+ "rewards/rejected": -4.509886264801025,
715
+ "step": 2350
716
+ },
717
+ {
718
+ "epoch": 1.607501674480911,
719
+ "grad_norm": 21.127580642700195,
720
+ "learning_rate": 1.0502474316746242e-06,
721
+ "logits/chosen": NaN,
722
+ "logits/rejected": NaN,
723
+ "logps/chosen": -178.6305694580078,
724
+ "logps/rejected": -265.5202331542969,
725
+ "loss": 0.2839,
726
+ "rewards/accuracies": 0.7462499737739563,
727
+ "rewards/chosen": -0.5587973594665527,
728
+ "rewards/margins": 3.9246935844421387,
729
+ "rewards/rejected": -4.48349142074585,
730
+ "step": 2400
731
+ },
732
+ {
733
+ "epoch": 1.6409912926992631,
734
+ "grad_norm": 47.24773025512695,
735
+ "learning_rate": 1.0113004524729797e-06,
736
+ "logits/chosen": NaN,
737
+ "logits/rejected": NaN,
738
+ "logps/chosen": -196.45948791503906,
739
+ "logps/rejected": -272.1256408691406,
740
+ "loss": 0.2791,
741
+ "rewards/accuracies": 0.7587500214576721,
742
+ "rewards/chosen": -0.5817875862121582,
743
+ "rewards/margins": 3.766108989715576,
744
+ "rewards/rejected": -4.347896099090576,
745
+ "step": 2450
746
+ },
747
+ {
748
+ "epoch": 1.6744809109176155,
749
+ "grad_norm": 20.178668975830078,
750
+ "learning_rate": 9.723363157307888e-07,
751
+ "logits/chosen": NaN,
752
+ "logits/rejected": NaN,
753
+ "logps/chosen": -183.7681427001953,
754
+ "logps/rejected": -268.5182800292969,
755
+ "loss": 0.2744,
756
+ "rewards/accuracies": 0.7475000023841858,
757
+ "rewards/chosen": -0.5075680017471313,
758
+ "rewards/margins": 3.9134867191314697,
759
+ "rewards/rejected": -4.421054840087891,
760
+ "step": 2500
761
+ },
762
+ {
763
+ "epoch": 1.707970529135968,
764
+ "grad_norm": 31.073015213012695,
765
+ "learning_rate": 9.334141809160118e-07,
766
+ "logits/chosen": NaN,
767
+ "logits/rejected": NaN,
768
+ "logps/chosen": -178.35658264160156,
769
+ "logps/rejected": -265.6587829589844,
770
+ "loss": 0.2405,
771
+ "rewards/accuracies": 0.7712500095367432,
772
+ "rewards/chosen": -0.6600850820541382,
773
+ "rewards/margins": 4.134018421173096,
774
+ "rewards/rejected": -4.794103622436523,
775
+ "step": 2550
776
+ },
777
+ {
778
+ "epoch": 1.7414601473543203,
779
+ "grad_norm": 36.3228759765625,
780
+ "learning_rate": 8.945931437248468e-07,
781
+ "logits/chosen": NaN,
782
+ "logits/rejected": NaN,
783
+ "logps/chosen": -178.47000122070312,
784
+ "logps/rejected": -270.1788635253906,
785
+ "loss": 0.2674,
786
+ "rewards/accuracies": 0.7524999976158142,
787
+ "rewards/chosen": -0.6616349816322327,
788
+ "rewards/margins": 4.066000461578369,
789
+ "rewards/rejected": -4.727634906768799,
790
+ "step": 2600
791
+ },
792
+ {
793
+ "epoch": 1.7749497655726725,
794
+ "grad_norm": 27.108051300048828,
795
+ "learning_rate": 8.559321463564014e-07,
796
+ "logits/chosen": NaN,
797
+ "logits/rejected": NaN,
798
+ "logps/chosen": -175.67808532714844,
799
+ "logps/rejected": -261.2061767578125,
800
+ "loss": 0.2494,
801
+ "rewards/accuracies": 0.7549999952316284,
802
+ "rewards/chosen": -0.5604009032249451,
803
+ "rewards/margins": 4.31578254699707,
804
+ "rewards/rejected": -4.876183032989502,
805
+ "step": 2650
806
+ },
807
+ {
808
+ "epoch": 1.8084393837910246,
809
+ "grad_norm": 54.821876525878906,
810
+ "learning_rate": 8.174898880204195e-07,
811
+ "logits/chosen": NaN,
812
+ "logits/rejected": NaN,
813
+ "logps/chosen": -178.19236755371094,
814
+ "logps/rejected": -269.1416015625,
815
+ "loss": 0.2817,
816
+ "rewards/accuracies": 0.7400000095367432,
817
+ "rewards/chosen": -0.5425779223442078,
818
+ "rewards/margins": 3.9950203895568848,
819
+ "rewards/rejected": -4.537598133087158,
820
+ "step": 2700
821
+ },
822
+ {
823
+ "epoch": 1.841929002009377,
824
+ "grad_norm": 36.13364791870117,
825
+ "learning_rate": 7.793247358139428e-07,
826
+ "logits/chosen": NaN,
827
+ "logits/rejected": NaN,
828
+ "logps/chosen": -179.92677307128906,
829
+ "logps/rejected": -266.75799560546875,
830
+ "loss": 0.2885,
831
+ "rewards/accuracies": 0.7387499809265137,
832
+ "rewards/chosen": -0.5648588538169861,
833
+ "rewards/margins": 3.864666700363159,
834
+ "rewards/rejected": -4.429525852203369,
835
+ "step": 2750
836
+ },
837
+ {
838
+ "epoch": 1.8754186202277294,
839
+ "grad_norm": 24.641510009765625,
840
+ "learning_rate": 7.414946361022179e-07,
841
+ "logits/chosen": NaN,
842
+ "logits/rejected": NaN,
843
+ "logps/chosen": -171.00909423828125,
844
+ "logps/rejected": -273.5279541015625,
845
+ "loss": 0.2695,
846
+ "rewards/accuracies": 0.7400000095367432,
847
+ "rewards/chosen": -0.4850202798843384,
848
+ "rewards/margins": 4.063894271850586,
849
+ "rewards/rejected": -4.548914432525635,
850
+ "step": 2800
851
+ },
852
+ {
853
+ "epoch": 1.9089082384460818,
854
+ "grad_norm": 25.44546127319336,
855
+ "learning_rate": 7.040570265384029e-07,
856
+ "logits/chosen": NaN,
857
+ "logits/rejected": NaN,
858
+ "logps/chosen": -186.17147827148438,
859
+ "logps/rejected": -272.64111328125,
860
+ "loss": 0.2881,
861
+ "rewards/accuracies": 0.7512500286102295,
862
+ "rewards/chosen": -0.5362930297851562,
863
+ "rewards/margins": 4.026025295257568,
864
+ "rewards/rejected": -4.562318325042725,
865
+ "step": 2850
866
+ },
867
+ {
868
+ "epoch": 1.942397856664434,
869
+ "grad_norm": 62.34092330932617,
870
+ "learning_rate": 6.670687488556586e-07,
871
+ "logits/chosen": NaN,
872
+ "logits/rejected": NaN,
873
+ "logps/chosen": -188.8939208984375,
874
+ "logps/rejected": -270.8504943847656,
875
+ "loss": 0.2685,
876
+ "rewards/accuracies": 0.7337499856948853,
877
+ "rewards/chosen": -0.3625078499317169,
878
+ "rewards/margins": 4.072076797485352,
879
+ "rewards/rejected": -4.434584617614746,
880
+ "step": 2900
881
+ },
882
+ {
883
+ "epoch": 1.9758874748827864,
884
+ "grad_norm": 16.188819885253906,
885
+ "learning_rate": 6.305859625640224e-07,
886
+ "logits/chosen": NaN,
887
+ "logits/rejected": NaN,
888
+ "logps/chosen": -177.49630737304688,
889
+ "logps/rejected": -280.4139404296875,
890
+ "loss": 0.2755,
891
+ "rewards/accuracies": 0.7475000023841858,
892
+ "rewards/chosen": -0.6155076026916504,
893
+ "rewards/margins": 4.242664337158203,
894
+ "rewards/rejected": -4.8581719398498535,
895
+ "step": 2950
896
+ },
897
+ {
898
+ "epoch": 2.0093770931011385,
899
+ "grad_norm": 35.435707092285156,
900
+ "learning_rate": 5.946640596831101e-07,
901
+ "logits/chosen": NaN,
902
+ "logits/rejected": NaN,
903
+ "logps/chosen": -166.32289123535156,
904
+ "logps/rejected": -263.216552734375,
905
+ "loss": 0.2391,
906
+ "rewards/accuracies": 0.7712500095367432,
907
+ "rewards/chosen": -0.6572730541229248,
908
+ "rewards/margins": 4.339555740356445,
909
+ "rewards/rejected": -4.996828556060791,
910
+ "step": 3000
911
+ },
912
+ {
913
+ "epoch": 2.042866711319491,
914
+ "grad_norm": 42.23343276977539,
915
+ "learning_rate": 5.59357580640101e-07,
916
+ "logits/chosen": NaN,
917
+ "logits/rejected": NaN,
918
+ "logps/chosen": -179.9312744140625,
919
+ "logps/rejected": -277.5908508300781,
920
+ "loss": 0.213,
921
+ "rewards/accuracies": 0.7850000262260437,
922
+ "rewards/chosen": -0.35315731167793274,
923
+ "rewards/margins": 4.545411586761475,
924
+ "rewards/rejected": -4.898569107055664,
925
+ "step": 3050
926
+ },
927
+ {
928
+ "epoch": 2.0763563295378433,
929
+ "grad_norm": 2.853132486343384,
930
+ "learning_rate": 5.247201314606984e-07,
931
+ "logits/chosen": NaN,
932
+ "logits/rejected": NaN,
933
+ "logps/chosen": -180.59486389160156,
934
+ "logps/rejected": -276.373291015625,
935
+ "loss": 0.2047,
936
+ "rewards/accuracies": 0.7950000166893005,
937
+ "rewards/chosen": -0.3648325800895691,
938
+ "rewards/margins": 4.745596885681152,
939
+ "rewards/rejected": -5.110429763793945,
940
+ "step": 3100
941
+ },
942
+ {
943
+ "epoch": 2.1098459477561957,
944
+ "grad_norm": 22.07088851928711,
945
+ "learning_rate": 4.90804302378802e-07,
946
+ "logits/chosen": NaN,
947
+ "logits/rejected": NaN,
948
+ "logps/chosen": -177.32708740234375,
949
+ "logps/rejected": -260.5697021484375,
950
+ "loss": 0.2054,
951
+ "rewards/accuracies": 0.7925000190734863,
952
+ "rewards/chosen": -0.48022788763046265,
953
+ "rewards/margins": 4.517958641052246,
954
+ "rewards/rejected": -4.998186111450195,
955
+ "step": 3150
956
+ },
957
+ {
958
+ "epoch": 2.1433355659745477,
959
+ "grad_norm": 50.728519439697266,
960
+ "learning_rate": 4.57661587988459e-07,
961
+ "logits/chosen": NaN,
962
+ "logits/rejected": NaN,
963
+ "logps/chosen": -177.0932159423828,
964
+ "logps/rejected": -270.6129150390625,
965
+ "loss": 0.236,
966
+ "rewards/accuracies": 0.7574999928474426,
967
+ "rewards/chosen": -0.4882276654243469,
968
+ "rewards/margins": 4.606672286987305,
969
+ "rewards/rejected": -5.094900131225586,
970
+ "step": 3200
971
+ },
972
+ {
973
+ "epoch": 2.1768251841929,
974
+ "grad_norm": 19.410276412963867,
975
+ "learning_rate": 4.253423090593318e-07,
976
+ "logits/chosen": NaN,
977
+ "logits/rejected": NaN,
978
+ "logps/chosen": -185.2410125732422,
979
+ "logps/rejected": -282.7039794921875,
980
+ "loss": 0.2242,
981
+ "rewards/accuracies": 0.7612500190734863,
982
+ "rewards/chosen": -0.5257070064544678,
983
+ "rewards/margins": 4.692570209503174,
984
+ "rewards/rejected": -5.218277454376221,
985
+ "step": 3250
986
+ },
987
+ {
988
+ "epoch": 2.2103148024112524,
989
+ "grad_norm": 45.68756103515625,
990
+ "learning_rate": 3.938955361343912e-07,
991
+ "logits/chosen": NaN,
992
+ "logits/rejected": NaN,
993
+ "logps/chosen": -175.8925018310547,
994
+ "logps/rejected": -284.1990966796875,
995
+ "loss": 0.2259,
996
+ "rewards/accuracies": 0.7699999809265137,
997
+ "rewards/chosen": -0.605311930179596,
998
+ "rewards/margins": 4.8395843505859375,
999
+ "rewards/rejected": -5.444896221160889,
1000
+ "step": 3300
1001
+ },
1002
+ {
1003
+ "epoch": 2.243804420629605,
1004
+ "grad_norm": 51.53227996826172,
1005
+ "learning_rate": 3.6336901502583364e-07,
1006
+ "logits/chosen": NaN,
1007
+ "logits/rejected": NaN,
1008
+ "logps/chosen": -177.85601806640625,
1009
+ "logps/rejected": -275.8158874511719,
1010
+ "loss": 0.2048,
1011
+ "rewards/accuracies": 0.7875000238418579,
1012
+ "rewards/chosen": -0.6794506907463074,
1013
+ "rewards/margins": 4.734764575958252,
1014
+ "rewards/rejected": -5.414215087890625,
1015
+ "step": 3350
1016
+ },
1017
+ {
1018
+ "epoch": 2.2772940388479572,
1019
+ "grad_norm": 3.569408893585205,
1020
+ "learning_rate": 3.3380909432234807e-07,
1021
+ "logits/chosen": NaN,
1022
+ "logits/rejected": NaN,
1023
+ "logps/chosen": -182.00836181640625,
1024
+ "logps/rejected": -280.286376953125,
1025
+ "loss": 0.1999,
1026
+ "rewards/accuracies": 0.7950000166893005,
1027
+ "rewards/chosen": -0.6098263263702393,
1028
+ "rewards/margins": 4.961060047149658,
1029
+ "rewards/rejected": -5.570886611938477,
1030
+ "step": 3400
1031
+ },
1032
+ {
1033
+ "epoch": 2.3107836570663096,
1034
+ "grad_norm": 27.362163543701172,
1035
+ "learning_rate": 3.0526065501779184e-07,
1036
+ "logits/chosen": NaN,
1037
+ "logits/rejected": NaN,
1038
+ "logps/chosen": -172.97593688964844,
1039
+ "logps/rejected": -275.5477600097656,
1040
+ "loss": 0.2184,
1041
+ "rewards/accuracies": 0.7749999761581421,
1042
+ "rewards/chosen": -0.6930285096168518,
1043
+ "rewards/margins": 4.821885585784912,
1044
+ "rewards/rejected": -5.514913558959961,
1045
+ "step": 3450
1046
+ },
1047
+ {
1048
+ "epoch": 2.3442732752846616,
1049
+ "grad_norm": 28.243000030517578,
1050
+ "learning_rate": 2.7776704236812454e-07,
1051
+ "logits/chosen": NaN,
1052
+ "logits/rejected": NaN,
1053
+ "logps/chosen": -182.44705200195312,
1054
+ "logps/rejected": -277.888427734375,
1055
+ "loss": 0.2128,
1056
+ "rewards/accuracies": 0.7649999856948853,
1057
+ "rewards/chosen": -0.6010170578956604,
1058
+ "rewards/margins": 5.026294708251953,
1059
+ "rewards/rejected": -5.6273112297058105,
1060
+ "step": 3500
1061
+ },
1062
+ {
1063
+ "epoch": 2.377762893503014,
1064
+ "grad_norm": 14.03532886505127,
1065
+ "learning_rate": 2.5137000008006437e-07,
1066
+ "logits/chosen": NaN,
1067
+ "logits/rejected": NaN,
1068
+ "logps/chosen": -182.77134704589844,
1069
+ "logps/rejected": -279.57769775390625,
1070
+ "loss": 0.21,
1071
+ "rewards/accuracies": 0.7799999713897705,
1072
+ "rewards/chosen": -0.7788973450660706,
1073
+ "rewards/margins": 5.022655010223389,
1074
+ "rewards/rejected": -5.801552772521973,
1075
+ "step": 3550
1076
+ },
1077
+ {
1078
+ "epoch": 2.4112525117213663,
1079
+ "grad_norm": 35.019554138183594,
1080
+ "learning_rate": 2.261096069313816e-07,
1081
+ "logits/chosen": NaN,
1082
+ "logits/rejected": NaN,
1083
+ "logps/chosen": -187.45738220214844,
1084
+ "logps/rejected": -281.279541015625,
1085
+ "loss": 0.1887,
1086
+ "rewards/accuracies": 0.8075000047683716,
1087
+ "rewards/chosen": -0.7265406847000122,
1088
+ "rewards/margins": 5.097284317016602,
1089
+ "rewards/rejected": -5.823824882507324,
1090
+ "step": 3600
1091
+ },
1092
+ {
1093
+ "epoch": 2.4447421299397187,
1094
+ "grad_norm": 25.041046142578125,
1095
+ "learning_rate": 2.020242159190646e-07,
1096
+ "logits/chosen": NaN,
1097
+ "logits/rejected": NaN,
1098
+ "logps/chosen": -176.86915588378906,
1099
+ "logps/rejected": -277.746826171875,
1100
+ "loss": 0.2311,
1101
+ "rewards/accuracies": 0.7587500214576721,
1102
+ "rewards/chosen": -0.786669135093689,
1103
+ "rewards/margins": 4.789151191711426,
1104
+ "rewards/rejected": -5.575820446014404,
1105
+ "step": 3650
1106
+ },
1107
+ {
1108
+ "epoch": 2.478231748158071,
1109
+ "grad_norm": 20.99360466003418,
1110
+ "learning_rate": 1.7915039602775062e-07,
1111
+ "logits/chosen": NaN,
1112
+ "logits/rejected": NaN,
1113
+ "logps/chosen": -182.3199462890625,
1114
+ "logps/rejected": -273.0755920410156,
1115
+ "loss": 0.2429,
1116
+ "rewards/accuracies": 0.7737500071525574,
1117
+ "rewards/chosen": -0.8147923946380615,
1118
+ "rewards/margins": 4.847590446472168,
1119
+ "rewards/rejected": -5.66238260269165,
1120
+ "step": 3700
1121
+ },
1122
+ {
1123
+ "epoch": 2.511721366376423,
1124
+ "grad_norm": 18.44826889038086,
1125
+ "learning_rate": 1.5752287670682861e-07,
1126
+ "logits/chosen": NaN,
1127
+ "logits/rejected": NaN,
1128
+ "logps/chosen": -170.71795654296875,
1129
+ "logps/rejected": -276.1592102050781,
1130
+ "loss": 0.2043,
1131
+ "rewards/accuracies": 0.7862499952316284,
1132
+ "rewards/chosen": -0.638399064540863,
1133
+ "rewards/margins": 5.212125301361084,
1134
+ "rewards/rejected": -5.850524425506592,
1135
+ "step": 3750
1136
+ },
1137
+ {
1138
+ "epoch": 2.5452109845947755,
1139
+ "grad_norm": 40.779659271240234,
1140
+ "learning_rate": 1.3717449514052314e-07,
1141
+ "logits/chosen": NaN,
1142
+ "logits/rejected": NaN,
1143
+ "logps/chosen": -180.7264404296875,
1144
+ "logps/rejected": -284.6885986328125,
1145
+ "loss": 0.2033,
1146
+ "rewards/accuracies": 0.7962499856948853,
1147
+ "rewards/chosen": -0.882935106754303,
1148
+ "rewards/margins": 5.128498554229736,
1149
+ "rewards/rejected": -6.0114336013793945,
1150
+ "step": 3800
1151
+ },
1152
+ {
1153
+ "epoch": 2.578700602813128,
1154
+ "grad_norm": 44.556678771972656,
1155
+ "learning_rate": 1.1813614639101088e-07,
1156
+ "logits/chosen": NaN,
1157
+ "logits/rejected": NaN,
1158
+ "logps/chosen": -183.99533081054688,
1159
+ "logps/rejected": -275.25518798828125,
1160
+ "loss": 0.2274,
1161
+ "rewards/accuracies": 0.7774999737739563,
1162
+ "rewards/chosen": -0.703125,
1163
+ "rewards/margins": 5.014428615570068,
1164
+ "rewards/rejected": -5.717553615570068,
1165
+ "step": 3850
1166
+ },
1167
+ {
1168
+ "epoch": 2.6121902210314802,
1169
+ "grad_norm": 61.39085388183594,
1170
+ "learning_rate": 1.0043673649027517e-07,
1171
+ "logits/chosen": NaN,
1172
+ "logits/rejected": NaN,
1173
+ "logps/chosen": -178.3540802001953,
1174
+ "logps/rejected": -282.1649475097656,
1175
+ "loss": 0.2097,
1176
+ "rewards/accuracies": 0.7662500143051147,
1177
+ "rewards/chosen": -0.683403730392456,
1178
+ "rewards/margins": 5.063638687133789,
1179
+ "rewards/rejected": -5.747043609619141,
1180
+ "step": 3900
1181
+ },
1182
+ {
1183
+ "epoch": 2.6456798392498326,
1184
+ "grad_norm": 58.0173454284668,
1185
+ "learning_rate": 8.410313855191464e-08,
1186
+ "logits/chosen": NaN,
1187
+ "logits/rejected": NaN,
1188
+ "logps/chosen": -178.94400024414062,
1189
+ "logps/rejected": -286.5594177246094,
1190
+ "loss": 0.2042,
1191
+ "rewards/accuracies": 0.7862499952316284,
1192
+ "rewards/chosen": -0.8088821172714233,
1193
+ "rewards/margins": 5.067000865936279,
1194
+ "rewards/rejected": -5.875882625579834,
1195
+ "step": 3950
1196
+ },
1197
+ {
1198
+ "epoch": 2.679169457468185,
1199
+ "grad_norm": 16.31562042236328,
1200
+ "learning_rate": 6.916015196954383e-08,
1201
+ "logits/chosen": NaN,
1202
+ "logits/rejected": NaN,
1203
+ "logps/chosen": -185.46673583984375,
1204
+ "logps/rejected": -288.2527770996094,
1205
+ "loss": 0.217,
1206
+ "rewards/accuracies": 0.7875000238418579,
1207
+ "rewards/chosen": -0.7252050638198853,
1208
+ "rewards/margins": 5.204960823059082,
1209
+ "rewards/rejected": -5.930166244506836,
1210
+ "step": 4000
1211
+ },
1212
+ {
1213
+ "epoch": 2.7126590756865374,
1214
+ "grad_norm": 20.799222946166992,
1215
+ "learning_rate": 5.5630464763733787e-08,
1216
+ "logits/chosen": NaN,
1217
+ "logits/rejected": NaN,
1218
+ "logps/chosen": -188.50820922851562,
1219
+ "logps/rejected": -288.9837646484375,
1220
+ "loss": 0.2258,
1221
+ "rewards/accuracies": 0.7724999785423279,
1222
+ "rewards/chosen": -0.7981621026992798,
1223
+ "rewards/margins": 5.062735557556152,
1224
+ "rewards/rejected": -5.860898017883301,
1225
+ "step": 4050
1226
+ },
1227
+ {
1228
+ "epoch": 2.7461486939048894,
1229
+ "grad_norm": 18.682947158813477,
1230
+ "learning_rate": 4.353461913466405e-08,
1231
+ "logits/chosen": NaN,
1232
+ "logits/rejected": NaN,
1233
+ "logps/chosen": -178.44317626953125,
1234
+ "logps/rejected": -266.35333251953125,
1235
+ "loss": 0.2426,
1236
+ "rewards/accuracies": 0.7524999976158142,
1237
+ "rewards/chosen": -0.6803594827651978,
1238
+ "rewards/margins": 4.8590497970581055,
1239
+ "rewards/rejected": -5.539409160614014,
1240
+ "step": 4100
1241
+ },
1242
+ {
1243
+ "epoch": 2.7796383121232418,
1244
+ "grad_norm": 54.06953048706055,
1245
+ "learning_rate": 3.2890980272783255e-08,
1246
+ "logits/chosen": NaN,
1247
+ "logits/rejected": NaN,
1248
+ "logps/chosen": -180.65658569335938,
1249
+ "logps/rejected": -280.3162536621094,
1250
+ "loss": 0.2086,
1251
+ "rewards/accuracies": 0.7875000238418579,
1252
+ "rewards/chosen": -0.812857449054718,
1253
+ "rewards/margins": 5.271449565887451,
1254
+ "rewards/rejected": -6.0843071937561035,
1255
+ "step": 4150
1256
+ },
1257
+ {
1258
+ "epoch": 2.813127930341594,
1259
+ "grad_norm": 12.436116218566895,
1260
+ "learning_rate": 2.371570847483839e-08,
1261
+ "logits/chosen": NaN,
1262
+ "logits/rejected": NaN,
1263
+ "logps/chosen": -180.7625732421875,
1264
+ "logps/rejected": -277.9272766113281,
1265
+ "loss": 0.2046,
1266
+ "rewards/accuracies": 0.78125,
1267
+ "rewards/chosen": -0.6954517364501953,
1268
+ "rewards/margins": 5.145771026611328,
1269
+ "rewards/rejected": -5.841222763061523,
1270
+ "step": 4200
1271
+ },
1272
+ {
1273
+ "epoch": 2.8466175485599465,
1274
+ "grad_norm": 66.9225845336914,
1275
+ "learning_rate": 1.6022734607604393e-08,
1276
+ "logits/chosen": NaN,
1277
+ "logits/rejected": NaN,
1278
+ "logps/chosen": -187.79019165039062,
1279
+ "logps/rejected": -282.13323974609375,
1280
+ "loss": 0.2096,
1281
+ "rewards/accuracies": 0.7925000190734863,
1282
+ "rewards/chosen": -0.8357629179954529,
1283
+ "rewards/margins": 5.103863716125488,
1284
+ "rewards/rejected": -5.939626693725586,
1285
+ "step": 4250
1286
+ },
1287
+ {
1288
+ "epoch": 2.8801071667782985,
1289
+ "grad_norm": 15.983145713806152,
1290
+ "learning_rate": 9.823738956571182e-09,
1291
+ "logits/chosen": NaN,
1292
+ "logits/rejected": NaN,
1293
+ "logps/chosen": -191.03807067871094,
1294
+ "logps/rejected": -292.3773193359375,
1295
+ "loss": 0.206,
1296
+ "rewards/accuracies": 0.7912499904632568,
1297
+ "rewards/chosen": -0.6932557821273804,
1298
+ "rewards/margins": 5.146268367767334,
1299
+ "rewards/rejected": -5.839523792266846,
1300
+ "step": 4300
1301
+ },
1302
+ {
1303
+ "epoch": 2.913596784996651,
1304
+ "grad_norm": 33.383487701416016,
1305
+ "learning_rate": 5.128133491700715e-09,
1306
+ "logits/chosen": NaN,
1307
+ "logits/rejected": NaN,
1308
+ "logps/chosen": -186.7404327392578,
1309
+ "logps/rejected": -289.3056945800781,
1310
+ "loss": 0.1936,
1311
+ "rewards/accuracies": 0.7975000143051147,
1312
+ "rewards/chosen": -0.7487243413925171,
1313
+ "rewards/margins": 5.300227642059326,
1314
+ "rewards/rejected": -6.048952579498291,
1315
+ "step": 4350
1316
+ },
1317
+ {
1318
+ "epoch": 2.9470864032150033,
1319
+ "grad_norm": 3.542743682861328,
1320
+ "learning_rate": 1.9430475771796684e-09,
1321
+ "logits/chosen": NaN,
1322
+ "logits/rejected": NaN,
1323
+ "logps/chosen": -190.31752014160156,
1324
+ "logps/rejected": -268.015380859375,
1325
+ "loss": 0.2124,
1326
+ "rewards/accuracies": 0.7862499952316284,
1327
+ "rewards/chosen": -0.6255255937576294,
1328
+ "rewards/margins": 4.9648332595825195,
1329
+ "rewards/rejected": -5.590358257293701,
1330
+ "step": 4400
1331
+ },
1332
+ {
1333
+ "epoch": 2.9805760214333556,
1334
+ "grad_norm": 19.205642700195312,
1335
+ "learning_rate": 2.733171468656259e-10,
1336
+ "logits/chosen": NaN,
1337
+ "logits/rejected": NaN,
1338
+ "logps/chosen": -177.03684997558594,
1339
+ "logps/rejected": -277.01495361328125,
1340
+ "loss": 0.2059,
1341
+ "rewards/accuracies": 0.7837499976158142,
1342
+ "rewards/chosen": -0.7730162739753723,
1343
+ "rewards/margins": 5.190572738647461,
1344
+ "rewards/rejected": -5.963588714599609,
1345
+ "step": 4450
1346
  }
1347
  ],
1348
  "logging_steps": 50,
1349
+ "max_steps": 4479,
1350
  "num_input_tokens_seen": 0,
1351
+ "num_train_epochs": 3,
1352
  "save_steps": 200,
1353
  "stateful_callbacks": {
1354
  "TrainerControl": {
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:83253cf573c71383f710bb1441ffc24338aa5407acd305912e4964de7e52bda0
3
  size 6545
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1b84867b7d993885f385b0892996ffe7c2611dc21555a94b8af64b6116d9bbd7
3
  size 6545