File size: 15,684 Bytes
d42ac9a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 636,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.12,
      "learning_rate": 7.8125e-06,
      "logps/chosen": -122.16297149658203,
      "logps/rejected": -71.43323516845703,
      "loss": 0.4952,
      "losses/dpo": 0.4956728219985962,
      "losses/sft": 0.7316558957099915,
      "losses/total": 0.4956728219985962,
      "ref_logps/chosen": -127.74378204345703,
      "ref_logps/rejected": -70.59587860107422,
      "rewards/accuracies": 0.8070000410079956,
      "rewards/chosen": 0.5580801367759705,
      "rewards/margins": 0.6418154239654541,
      "rewards/rejected": -0.08373536914587021,
      "step": 25
    },
    {
      "epoch": 0.24,
      "learning_rate": 1.5625e-05,
      "logps/chosen": -94.26205444335938,
      "logps/rejected": -85.60575866699219,
      "loss": 0.0691,
      "losses/dpo": 0.07388682663440704,
      "losses/sft": 0.5650071501731873,
      "losses/total": 0.07388682663440704,
      "ref_logps/chosen": -128.54661560058594,
      "ref_logps/rejected": -72.49893951416016,
      "rewards/accuracies": 0.9929999709129333,
      "rewards/chosen": 3.428455114364624,
      "rewards/margins": 4.739137649536133,
      "rewards/rejected": -1.310682773590088,
      "step": 50
    },
    {
      "epoch": 0.35,
      "learning_rate": 1.9615384615384617e-05,
      "logps/chosen": -85.07345581054688,
      "logps/rejected": -101.97691345214844,
      "loss": 0.0179,
      "losses/dpo": 0.014726839028298855,
      "losses/sft": 0.5030468106269836,
      "losses/total": 0.014726839028298855,
      "ref_logps/chosen": -129.9876708984375,
      "ref_logps/rejected": -72.3249282836914,
      "rewards/accuracies": 0.9989999532699585,
      "rewards/chosen": 4.491419792175293,
      "rewards/margins": 7.45661735534668,
      "rewards/rejected": -2.9651970863342285,
      "step": 75
    },
    {
      "epoch": 0.47,
      "learning_rate": 1.8741258741258744e-05,
      "logps/chosen": -85.64691162109375,
      "logps/rejected": -110.90087890625,
      "loss": 0.0096,
      "losses/dpo": 0.012412017211318016,
      "losses/sft": 0.5199429988861084,
      "losses/total": 0.012412017211318016,
      "ref_logps/chosen": -130.2884979248047,
      "ref_logps/rejected": -71.44290924072266,
      "rewards/accuracies": 0.9984999299049377,
      "rewards/chosen": 4.464157581329346,
      "rewards/margins": 8.409955024719238,
      "rewards/rejected": -3.9457967281341553,
      "step": 100
    },
    {
      "epoch": 0.59,
      "learning_rate": 1.7867132867132868e-05,
      "logps/chosen": -82.34768676757812,
      "logps/rejected": -116.94005584716797,
      "loss": 0.0061,
      "losses/dpo": 0.008614934980869293,
      "losses/sft": 0.49562689661979675,
      "losses/total": 0.008614934980869293,
      "ref_logps/chosen": -128.71200561523438,
      "ref_logps/rejected": -71.86701202392578,
      "rewards/accuracies": 0.9994999766349792,
      "rewards/chosen": 4.636431694030762,
      "rewards/margins": 9.143735885620117,
      "rewards/rejected": -4.507305145263672,
      "step": 125
    },
    {
      "epoch": 0.71,
      "learning_rate": 1.6993006993006995e-05,
      "logps/chosen": -85.28910064697266,
      "logps/rejected": -123.17980194091797,
      "loss": 0.0053,
      "losses/dpo": 0.004700234159827232,
      "losses/sft": 0.5220319032669067,
      "losses/total": 0.004700234159827232,
      "ref_logps/chosen": -129.39625549316406,
      "ref_logps/rejected": -70.16360473632812,
      "rewards/accuracies": 0.9994999766349792,
      "rewards/chosen": 4.410714149475098,
      "rewards/margins": 9.712334632873535,
      "rewards/rejected": -5.301620006561279,
      "step": 150
    },
    {
      "epoch": 0.83,
      "learning_rate": 1.611888111888112e-05,
      "logps/chosen": -84.59432983398438,
      "logps/rejected": -131.1455535888672,
      "loss": 0.0051,
      "losses/dpo": 0.003301014890894294,
      "losses/sft": 0.5115602016448975,
      "losses/total": 0.003301014890894294,
      "ref_logps/chosen": -127.61747741699219,
      "ref_logps/rejected": -71.97355651855469,
      "rewards/accuracies": 0.9994999766349792,
      "rewards/chosen": 4.302317142486572,
      "rewards/margins": 10.219517707824707,
      "rewards/rejected": -5.917200088500977,
      "step": 175
    },
    {
      "epoch": 0.94,
      "learning_rate": 1.5244755244755244e-05,
      "logps/chosen": -88.2319564819336,
      "logps/rejected": -141.83016967773438,
      "loss": 0.0028,
      "losses/dpo": 0.002293643541634083,
      "losses/sft": 0.5383260846138,
      "losses/total": 0.002293643541634083,
      "ref_logps/chosen": -129.1661376953125,
      "ref_logps/rejected": -71.8288803100586,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 4.093417167663574,
      "rewards/margins": 11.093545913696289,
      "rewards/rejected": -7.000128746032715,
      "step": 200
    },
    {
      "epoch": 1.06,
      "learning_rate": 1.4370629370629371e-05,
      "logps/chosen": -89.62419891357422,
      "logps/rejected": -153.9451904296875,
      "loss": 0.0033,
      "losses/dpo": 0.0030320805963128805,
      "losses/sft": 0.532666027545929,
      "losses/total": 0.0030320805963128805,
      "ref_logps/chosen": -128.41148376464844,
      "ref_logps/rejected": -71.97950744628906,
      "rewards/accuracies": 0.9989999532699585,
      "rewards/chosen": 3.878729820251465,
      "rewards/margins": 12.075300216674805,
      "rewards/rejected": -8.196569442749023,
      "step": 225
    },
    {
      "epoch": 1.18,
      "learning_rate": 1.3496503496503497e-05,
      "logps/chosen": -86.68380737304688,
      "logps/rejected": -156.0954132080078,
      "loss": 0.0013,
      "losses/dpo": 0.0008313562138937414,
      "losses/sft": 0.518293559551239,
      "losses/total": 0.0008313562138937414,
      "ref_logps/chosen": -128.29469299316406,
      "ref_logps/rejected": -71.77529907226562,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 4.161087989807129,
      "rewards/margins": 12.593099594116211,
      "rewards/rejected": -8.432010650634766,
      "step": 250
    },
    {
      "epoch": 1.3,
      "learning_rate": 1.2622377622377624e-05,
      "logps/chosen": -88.83470916748047,
      "logps/rejected": -160.05836486816406,
      "loss": 0.0015,
      "losses/dpo": 0.0008700879407115281,
      "losses/sft": 0.5323516726493835,
      "losses/total": 0.0008700879407115281,
      "ref_logps/chosen": -128.95960998535156,
      "ref_logps/rejected": -70.340576171875,
      "rewards/accuracies": 0.9994999766349792,
      "rewards/chosen": 4.012491703033447,
      "rewards/margins": 12.984270095825195,
      "rewards/rejected": -8.971778869628906,
      "step": 275
    },
    {
      "epoch": 1.42,
      "learning_rate": 1.1748251748251748e-05,
      "logps/chosen": -90.86172485351562,
      "logps/rejected": -162.376953125,
      "loss": 0.0011,
      "losses/dpo": 0.00106943363789469,
      "losses/sft": 0.5586134195327759,
      "losses/total": 0.00106943363789469,
      "ref_logps/chosen": -129.39662170410156,
      "ref_logps/rejected": -71.82042694091797,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 3.853489398956299,
      "rewards/margins": 12.909143447875977,
      "rewards/rejected": -9.055652618408203,
      "step": 300
    },
    {
      "epoch": 1.53,
      "learning_rate": 1.0874125874125875e-05,
      "logps/chosen": -93.8604507446289,
      "logps/rejected": -171.50653076171875,
      "loss": 0.0013,
      "losses/dpo": 0.0009554739226587117,
      "losses/sft": 0.5667473077774048,
      "losses/total": 0.0009554739226587117,
      "ref_logps/chosen": -128.62173461914062,
      "ref_logps/rejected": -72.07390594482422,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 3.476128339767456,
      "rewards/margins": 13.419390678405762,
      "rewards/rejected": -9.943263053894043,
      "step": 325
    },
    {
      "epoch": 1.65,
      "learning_rate": 1e-05,
      "logps/chosen": -92.96914672851562,
      "logps/rejected": -173.7914276123047,
      "loss": 0.0009,
      "losses/dpo": 0.00044292627717368305,
      "losses/sft": 0.561470627784729,
      "losses/total": 0.00044292627717368305,
      "ref_logps/chosen": -128.7430877685547,
      "ref_logps/rejected": -72.57361602783203,
      "rewards/accuracies": 0.9994999766349792,
      "rewards/chosen": 3.577392578125,
      "rewards/margins": 13.699174880981445,
      "rewards/rejected": -10.121781349182129,
      "step": 350
    },
    {
      "epoch": 1.77,
      "learning_rate": 9.125874125874126e-06,
      "logps/chosen": -94.47010040283203,
      "logps/rejected": -175.3832550048828,
      "loss": 0.0006,
      "losses/dpo": 0.00038047495763748884,
      "losses/sft": 0.567641019821167,
      "losses/total": 0.00038047495763748884,
      "ref_logps/chosen": -128.09613037109375,
      "ref_logps/rejected": -71.36651611328125,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 3.3626015186309814,
      "rewards/margins": 13.764276504516602,
      "rewards/rejected": -10.401673316955566,
      "step": 375
    },
    {
      "epoch": 1.89,
      "learning_rate": 8.251748251748254e-06,
      "logps/chosen": -96.90827941894531,
      "logps/rejected": -179.17970275878906,
      "loss": 0.0004,
      "losses/dpo": 0.0003745325666386634,
      "losses/sft": 0.5794407725334167,
      "losses/total": 0.0003745325666386634,
      "ref_logps/chosen": -129.79989624023438,
      "ref_logps/rejected": -71.4466323852539,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 3.2891619205474854,
      "rewards/margins": 14.062468528747559,
      "rewards/rejected": -10.773306846618652,
      "step": 400
    },
    {
      "epoch": 2.0,
      "learning_rate": 7.377622377622379e-06,
      "logps/chosen": -96.15299987792969,
      "logps/rejected": -181.71612548828125,
      "loss": 0.001,
      "losses/dpo": 0.0020445636473596096,
      "losses/sft": 0.5634098052978516,
      "losses/total": 0.0020445636473596096,
      "ref_logps/chosen": -130.4124298095703,
      "ref_logps/rejected": -71.5147933959961,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 3.4259424209594727,
      "rewards/margins": 14.446078300476074,
      "rewards/rejected": -11.020133972167969,
      "step": 425
    },
    {
      "epoch": 2.12,
      "learning_rate": 6.503496503496504e-06,
      "logps/chosen": -93.45982360839844,
      "logps/rejected": -177.76284790039062,
      "loss": 0.0003,
      "losses/dpo": 0.0001847467792686075,
      "losses/sft": 0.5676508545875549,
      "losses/total": 0.0001847467792686075,
      "ref_logps/chosen": -128.48521423339844,
      "ref_logps/rejected": -71.86593627929688,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 3.5025393962860107,
      "rewards/margins": 14.092233657836914,
      "rewards/rejected": -10.589694023132324,
      "step": 450
    },
    {
      "epoch": 2.24,
      "learning_rate": 5.629370629370629e-06,
      "logps/chosen": -95.26840209960938,
      "logps/rejected": -181.77322387695312,
      "loss": 0.0003,
      "losses/dpo": 0.00029167634784244,
      "losses/sft": 0.5723408460617065,
      "losses/total": 0.00029167634784244,
      "ref_logps/chosen": -129.8194580078125,
      "ref_logps/rejected": -71.22503662109375,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 3.455106496810913,
      "rewards/margins": 14.509923934936523,
      "rewards/rejected": -11.054819107055664,
      "step": 475
    },
    {
      "epoch": 2.36,
      "learning_rate": 4.755244755244756e-06,
      "logps/chosen": -96.92023468017578,
      "logps/rejected": -186.9994354248047,
      "loss": 0.0002,
      "losses/dpo": 0.00031228098669089377,
      "losses/sft": 0.5785849690437317,
      "losses/total": 0.00031228098669089377,
      "ref_logps/chosen": -129.39373779296875,
      "ref_logps/rejected": -72.0064468383789,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 3.2473514080047607,
      "rewards/margins": 14.746650695800781,
      "rewards/rejected": -11.499299049377441,
      "step": 500
    },
    {
      "epoch": 2.48,
      "learning_rate": 3.881118881118881e-06,
      "logps/chosen": -96.56753540039062,
      "logps/rejected": -188.7633819580078,
      "loss": 0.0003,
      "losses/dpo": 0.00019269342010375112,
      "losses/sft": 0.5762569904327393,
      "losses/total": 0.00019269342010375112,
      "ref_logps/chosen": -128.0542449951172,
      "ref_logps/rejected": -71.33090209960938,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 3.148669481277466,
      "rewards/margins": 14.891918182373047,
      "rewards/rejected": -11.743247985839844,
      "step": 525
    },
    {
      "epoch": 2.59,
      "learning_rate": 3.006993006993007e-06,
      "logps/chosen": -97.47007751464844,
      "logps/rejected": -189.46685791015625,
      "loss": 0.0002,
      "losses/dpo": 0.00025137903867289424,
      "losses/sft": 0.5845997333526611,
      "losses/total": 0.00025137903867289424,
      "ref_logps/chosen": -128.27481079101562,
      "ref_logps/rejected": -71.0475082397461,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 3.0804734230041504,
      "rewards/margins": 14.922408103942871,
      "rewards/rejected": -11.841936111450195,
      "step": 550
    },
    {
      "epoch": 2.71,
      "learning_rate": 2.132867132867133e-06,
      "logps/chosen": -97.53392791748047,
      "logps/rejected": -190.3184356689453,
      "loss": 0.0003,
      "losses/dpo": 0.00022768642520532012,
      "losses/sft": 0.5818451642990112,
      "losses/total": 0.00022768642520532012,
      "ref_logps/chosen": -127.79641723632812,
      "ref_logps/rejected": -71.30667877197266,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 3.0262484550476074,
      "rewards/margins": 14.927424430847168,
      "rewards/rejected": -11.901176452636719,
      "step": 575
    },
    {
      "epoch": 2.83,
      "learning_rate": 1.258741258741259e-06,
      "logps/chosen": -98.7781753540039,
      "logps/rejected": -194.68101501464844,
      "loss": 0.0002,
      "losses/dpo": 0.00027412467170506716,
      "losses/sft": 0.5883935689926147,
      "losses/total": 0.00027412467170506716,
      "ref_logps/chosen": -129.11810302734375,
      "ref_logps/rejected": -72.69854736328125,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 3.0339913368225098,
      "rewards/margins": 15.232237815856934,
      "rewards/rejected": -12.198246955871582,
      "step": 600
    },
    {
      "epoch": 2.95,
      "learning_rate": 3.846153846153847e-07,
      "logps/chosen": -99.09100341796875,
      "logps/rejected": -193.7891387939453,
      "loss": 0.0002,
      "losses/dpo": 0.0002839878143277019,
      "losses/sft": 0.5941969752311707,
      "losses/total": 0.0002839878143277019,
      "ref_logps/chosen": -129.47601318359375,
      "ref_logps/rejected": -71.78772735595703,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 3.0384998321533203,
      "rewards/margins": 15.238642692565918,
      "rewards/rejected": -12.200141906738281,
      "step": 625
    },
    {
      "epoch": 3.0,
      "step": 636,
      "total_flos": 0.0,
      "train_loss": 0.02455043116600528,
      "train_runtime": 18942.3632,
      "train_samples_per_second": 2.686,
      "train_steps_per_second": 0.034
    }
  ],
  "logging_steps": 25,
  "max_steps": 636,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "total_flos": 0.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}