File size: 24,220 Bytes
0ee3340
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.008502949460594144,
  "eval_steps": 500,
  "global_step": 40,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio": 0.0,
      "completion_length": 216.796875,
      "epoch": 0.00021257373651485358,
      "grad_norm": 0.4854881763458252,
      "kl": 9.614229202270508e-05,
      "learning_rate": 9.997874149659865e-07,
      "loss": 0.0,
      "reward": 2.732285737991333,
      "reward_std": 0.02619727296405472,
      "rewards/format_reward_hoi_key": 0.9139583259820938,
      "rewards/format_reward_hoi_object_label": 0.8222222253680229,
      "rewards/format_reward_hoi_verb_label": 0.3161458373069763,
      "rewards/hoi_iou_reward": 0.6799592822790146,
      "step": 1
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 173.3125,
      "epoch": 0.00042514747302970716,
      "grad_norm": 0.6831408739089966,
      "kl": 1.3329088687896729e-05,
      "learning_rate": 9.995748299319728e-07,
      "loss": 0.0,
      "reward": 2.8274163007736206,
      "reward_std": 0.03815040903282352,
      "rewards/format_reward_hoi_key": 0.8166666775941849,
      "rewards/format_reward_hoi_object_label": 0.7916666567325592,
      "rewards/format_reward_hoi_verb_label": 0.5974702388048172,
      "rewards/hoi_iou_reward": 0.6216127127408981,
      "step": 2
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 166.03125,
      "epoch": 0.0006377212095445608,
      "grad_norm": 0.8407193422317505,
      "kl": 0.00014454126358032227,
      "learning_rate": 9.99362244897959e-07,
      "loss": 0.0,
      "reward": 2.986231029033661,
      "reward_std": 0.0052611194987548515,
      "rewards/format_reward_hoi_key": 0.8208333402872086,
      "rewards/format_reward_hoi_object_label": 0.84375,
      "rewards/format_reward_hoi_verb_label": 0.6927083432674408,
      "rewards/hoi_iou_reward": 0.6289393231272697,
      "step": 3
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 185.703125,
      "epoch": 0.0008502949460594143,
      "grad_norm": 2.3168516159057617,
      "kl": 0.00014531612396240234,
      "learning_rate": 9.991496598639456e-07,
      "loss": 0.0,
      "reward": 2.3956105709075928,
      "reward_std": 0.045728508091997355,
      "rewards/format_reward_hoi_key": 0.7395220696926117,
      "rewards/format_reward_hoi_object_label": 0.59375,
      "rewards/format_reward_hoi_verb_label": 0.5073784738779068,
      "rewards/hoi_iou_reward": 0.5549599975347519,
      "step": 4
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 180.421875,
      "epoch": 0.001062868682574268,
      "grad_norm": 0.5881515741348267,
      "kl": 0.00014841556549072266,
      "learning_rate": 9.989370748299319e-07,
      "loss": 0.0,
      "reward": 2.2462641298770905,
      "reward_std": 0.14320564700756222,
      "rewards/format_reward_hoi_key": 0.7350446432828903,
      "rewards/format_reward_hoi_object_label": 0.4899553433060646,
      "rewards/format_reward_hoi_verb_label": 0.5563345961272717,
      "rewards/hoi_iou_reward": 0.46492957696318626,
      "step": 5
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 208.28125,
      "epoch": 0.0012754424190891216,
      "grad_norm": 0.29585161805152893,
      "kl": 0.0001379847526550293,
      "learning_rate": 9.987244897959182e-07,
      "loss": 0.0,
      "reward": 2.1843446791172028,
      "reward_std": 0.005820542646688409,
      "rewards/format_reward_hoi_key": 0.8457291722297668,
      "rewards/format_reward_hoi_object_label": 0.6000000089406967,
      "rewards/format_reward_hoi_verb_label": 0.1180555634200573,
      "rewards/hoi_iou_reward": 0.6205599009990692,
      "step": 6
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 164.84375,
      "epoch": 0.0014880161556039752,
      "grad_norm": 0.5830075144767761,
      "kl": 0.00010955333709716797,
      "learning_rate": 9.985119047619047e-07,
      "loss": -0.0,
      "reward": 2.5442887246608734,
      "reward_std": 0.11149050580570474,
      "rewards/format_reward_hoi_key": 0.7979166656732559,
      "rewards/format_reward_hoi_object_label": 0.7083333358168602,
      "rewards/format_reward_hoi_verb_label": 0.4583333358168602,
      "rewards/hoi_iou_reward": 0.5797053650021553,
      "step": 7
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 167.234375,
      "epoch": 0.0017005898921188286,
      "grad_norm": 0.35756170749664307,
      "kl": 8.910894393920898e-05,
      "learning_rate": 9.982993197278912e-07,
      "loss": 0.0,
      "reward": 2.5064347982406616,
      "reward_std": 0.0026310062530683354,
      "rewards/format_reward_hoi_key": 0.7702381014823914,
      "rewards/format_reward_hoi_object_label": 0.595362103311345,
      "rewards/format_reward_hoi_verb_label": 0.5941220238455571,
      "rewards/hoi_iou_reward": 0.546712551265955,
      "step": 8
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 188.484375,
      "epoch": 0.0019131636286336823,
      "grad_norm": 1.1468232870101929,
      "kl": 0.00023877620697021484,
      "learning_rate": 9.980867346938775e-07,
      "loss": 0.0,
      "reward": 2.93448406457901,
      "reward_std": 0.07516021025367081,
      "rewards/format_reward_hoi_key": 0.90625,
      "rewards/format_reward_hoi_object_label": 0.79296875,
      "rewards/format_reward_hoi_verb_label": 0.447916679084301,
      "rewards/hoi_iou_reward": 0.7873486280441284,
      "step": 9
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 188.921875,
      "epoch": 0.002125737365148536,
      "grad_norm": 0.608834445476532,
      "kl": 0.0003757476806640625,
      "learning_rate": 9.97874149659864e-07,
      "loss": -0.0,
      "reward": 2.309541165828705,
      "reward_std": 0.04332686646375805,
      "rewards/format_reward_hoi_key": 0.7756249904632568,
      "rewards/format_reward_hoi_object_label": 0.5166666656732559,
      "rewards/format_reward_hoi_verb_label": 0.46510415710508823,
      "rewards/hoi_iou_reward": 0.5521453768014908,
      "step": 10
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 156.34375,
      "epoch": 0.0023383111016633895,
      "grad_norm": 1.079801321029663,
      "kl": 0.0002917051315307617,
      "learning_rate": 9.976615646258503e-07,
      "loss": -0.0,
      "reward": 2.9021179378032684,
      "reward_std": 0.06573383091017604,
      "rewards/format_reward_hoi_key": 0.9125000089406967,
      "rewards/format_reward_hoi_object_label": 0.75,
      "rewards/format_reward_hoi_verb_label": 0.5,
      "rewards/hoi_iou_reward": 0.7396180182695389,
      "step": 11
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 174.46875,
      "epoch": 0.002550884838178243,
      "grad_norm": 0.6156663298606873,
      "kl": 0.0005776882171630859,
      "learning_rate": 9.974489795918366e-07,
      "loss": 0.0,
      "reward": 2.3791774213314056,
      "reward_std": 0.0850577435339801,
      "rewards/format_reward_hoi_key": 0.7312500178813934,
      "rewards/format_reward_hoi_object_label": 0.5208333358168602,
      "rewards/format_reward_hoi_verb_label": 0.5911458358168602,
      "rewards/hoi_iou_reward": 0.535948283970356,
      "step": 12
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 167.578125,
      "epoch": 0.002763458574693097,
      "grad_norm": 3.2466673851013184,
      "kl": 0.0002658367156982422,
      "learning_rate": 9.972363945578231e-07,
      "loss": 0.0,
      "reward": 3.0418315529823303,
      "reward_std": 0.013055827002972364,
      "rewards/format_reward_hoi_key": 0.9000000059604645,
      "rewards/format_reward_hoi_object_label": 0.8125,
      "rewards/format_reward_hoi_verb_label": 0.625,
      "rewards/hoi_iou_reward": 0.704331636428833,
      "step": 13
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 167.4375,
      "epoch": 0.0029760323112079505,
      "grad_norm": 0.5669279098510742,
      "kl": 0.0004019737243652344,
      "learning_rate": 9.970238095238094e-07,
      "loss": 0.0,
      "reward": 2.4804917573928833,
      "reward_std": 0.08349880830792245,
      "rewards/format_reward_hoi_key": 0.7427083253860474,
      "rewards/format_reward_hoi_object_label": 0.697916679084301,
      "rewards/format_reward_hoi_verb_label": 0.483333345502615,
      "rewards/hoi_iou_reward": 0.5565334260463715,
      "step": 14
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 198.625,
      "epoch": 0.0031886060477228037,
      "grad_norm": 0.27379515767097473,
      "kl": 0.00033855438232421875,
      "learning_rate": 9.968112244897957e-07,
      "loss": 0.0,
      "reward": 2.1904609203338623,
      "reward_std": 0.06255148959462531,
      "rewards/format_reward_hoi_key": 0.7820312678813934,
      "rewards/format_reward_hoi_object_label": 0.6083984375,
      "rewards/format_reward_hoi_verb_label": 0.3639322891831398,
      "rewards/hoi_iou_reward": 0.4360988959670067,
      "step": 15
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 242.234375,
      "epoch": 0.0034011797842376573,
      "grad_norm": 0.2515924870967865,
      "kl": 0.0006353855133056641,
      "learning_rate": 9.965986394557822e-07,
      "loss": 0.0,
      "reward": 2.716467797756195,
      "reward_std": 0.07810639549279585,
      "rewards/format_reward_hoi_key": 0.7664583474397659,
      "rewards/format_reward_hoi_object_label": 0.6187500059604645,
      "rewards/format_reward_hoi_verb_label": 0.6677083224058151,
      "rewards/hoi_iou_reward": 0.663551077246666,
      "step": 16
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 226.859375,
      "epoch": 0.003613753520752511,
      "grad_norm": 0.5136963725090027,
      "kl": 0.0003933906555175781,
      "learning_rate": 9.963860544217688e-07,
      "loss": 0.0,
      "reward": 2.071069449186325,
      "reward_std": 0.06459418445592746,
      "rewards/format_reward_hoi_key": 0.6252120807766914,
      "rewards/format_reward_hoi_object_label": 0.5837053582072258,
      "rewards/format_reward_hoi_verb_label": 0.4394965320825577,
      "rewards/hoi_iou_reward": 0.4226554408669472,
      "step": 17
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 277.625,
      "epoch": 0.0038263272572673646,
      "grad_norm": 0.6188202500343323,
      "kl": 0.0002378225326538086,
      "learning_rate": 9.96173469387755e-07,
      "loss": 0.0,
      "reward": 3.0354496240615845,
      "reward_std": 0.30482952669262886,
      "rewards/format_reward_hoi_key": 0.8430059552192688,
      "rewards/format_reward_hoi_object_label": 0.8227306753396988,
      "rewards/format_reward_hoi_verb_label": 0.5986328125,
      "rewards/hoi_iou_reward": 0.7710802108049393,
      "step": 18
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 151.75,
      "epoch": 0.004038900993782219,
      "grad_norm": 0.27549490332603455,
      "kl": 0.0006313323974609375,
      "learning_rate": 9.959608843537416e-07,
      "loss": -0.0,
      "reward": 2.0183950662612915,
      "reward_std": 0.015180108457570896,
      "rewards/format_reward_hoi_key": 0.6604166775941849,
      "rewards/format_reward_hoi_object_label": 0.5416666716337204,
      "rewards/format_reward_hoi_verb_label": 0.3524305671453476,
      "rewards/hoi_iou_reward": 0.4638812467455864,
      "step": 19
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 224.078125,
      "epoch": 0.004251474730297072,
      "grad_norm": 0.56353759765625,
      "kl": 0.0008664131164550781,
      "learning_rate": 9.957482993197279e-07,
      "loss": 0.0,
      "reward": 2.617310881614685,
      "reward_std": 0.185114907566458,
      "rewards/format_reward_hoi_key": 0.7604167088866234,
      "rewards/format_reward_hoi_object_label": 0.6744791641831398,
      "rewards/format_reward_hoi_verb_label": 0.5677083283662796,
      "rewards/hoi_iou_reward": 0.6147066801786423,
      "step": 20
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 174.09375,
      "epoch": 0.004464048466811925,
      "grad_norm": 0.31322988867759705,
      "kl": 0.0007352828979492188,
      "learning_rate": 9.955357142857142e-07,
      "loss": 0.0,
      "reward": 2.9305796921253204,
      "reward_std": 0.01013911364134401,
      "rewards/format_reward_hoi_key": 0.8696428686380386,
      "rewards/format_reward_hoi_object_label": 0.7857142835855484,
      "rewards/format_reward_hoi_verb_label": 0.552300363779068,
      "rewards/hoi_iou_reward": 0.7229221612215042,
      "step": 21
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 162.0,
      "epoch": 0.004676622203326779,
      "grad_norm": 0.4496309161186218,
      "kl": 0.001049041748046875,
      "learning_rate": 9.953231292517007e-07,
      "loss": 0.0,
      "reward": 2.2096868455410004,
      "reward_std": 0.0113821976701729,
      "rewards/format_reward_hoi_key": 0.7333928644657135,
      "rewards/format_reward_hoi_object_label": 0.6169642880558968,
      "rewards/format_reward_hoi_verb_label": 0.2777777761220932,
      "rewards/hoi_iou_reward": 0.5815519690513611,
      "step": 22
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 154.609375,
      "epoch": 0.004889195939841632,
      "grad_norm": 0.8822689652442932,
      "kl": 0.0013833045959472656,
      "learning_rate": 9.95110544217687e-07,
      "loss": 0.0,
      "reward": 3.247895896434784,
      "reward_std": 0.04373934442992322,
      "rewards/format_reward_hoi_key": 0.9250000268220901,
      "rewards/format_reward_hoi_object_label": 0.9583333283662796,
      "rewards/format_reward_hoi_verb_label": 0.6562499850988388,
      "rewards/hoi_iou_reward": 0.708312600851059,
      "step": 23
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 410.328125,
      "epoch": 0.005101769676356486,
      "grad_norm": 0.7035483121871948,
      "kl": 0.0004572868347167969,
      "learning_rate": 9.948979591836735e-07,
      "loss": 0.0,
      "reward": 2.264761805534363,
      "reward_std": 0.28715356811881065,
      "rewards/format_reward_hoi_key": 0.6794504672288895,
      "rewards/format_reward_hoi_object_label": 0.5326923131942749,
      "rewards/format_reward_hoi_verb_label": 0.6280448734760284,
      "rewards/hoi_iou_reward": 0.42457417771220207,
      "step": 24
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 227.59375,
      "epoch": 0.0053143434128713396,
      "grad_norm": 0.31094542145729065,
      "kl": 0.0009341239929199219,
      "learning_rate": 9.946853741496598e-07,
      "loss": 0.0,
      "reward": 2.356251895427704,
      "reward_std": 0.003799198704655282,
      "rewards/format_reward_hoi_key": 0.767708346247673,
      "rewards/format_reward_hoi_object_label": 0.4895833432674408,
      "rewards/format_reward_hoi_verb_label": 0.5043560639023781,
      "rewards/hoi_iou_reward": 0.5946041345596313,
      "step": 25
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 181.375,
      "epoch": 0.005526917149386194,
      "grad_norm": 0.5995525121688843,
      "kl": 0.00150299072265625,
      "learning_rate": 9.944727891156463e-07,
      "loss": 0.0001,
      "reward": 2.6978970766067505,
      "reward_std": 0.13544296027976088,
      "rewards/format_reward_hoi_key": 0.8333333432674408,
      "rewards/format_reward_hoi_object_label": 0.6158854141831398,
      "rewards/format_reward_hoi_verb_label": 0.5898437350988388,
      "rewards/hoi_iou_reward": 0.6588345021009445,
      "step": 26
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 189.03125,
      "epoch": 0.005739490885901047,
      "grad_norm": 0.5540001392364502,
      "kl": 0.000858306884765625,
      "learning_rate": 9.942602040816326e-07,
      "loss": 0.0001,
      "reward": 3.355882227420807,
      "reward_std": 0.005877207615412772,
      "rewards/format_reward_hoi_key": 0.9535714238882065,
      "rewards/format_reward_hoi_object_label": 0.9017857313156128,
      "rewards/format_reward_hoi_verb_label": 0.7232142835855484,
      "rewards/hoi_iou_reward": 0.7773108184337616,
      "step": 27
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 211.90625,
      "epoch": 0.005952064622415901,
      "grad_norm": 2.0975677967071533,
      "kl": 0.001495361328125,
      "learning_rate": 9.940476190476191e-07,
      "loss": 0.0001,
      "reward": 2.007324628531933,
      "reward_std": 0.03993106237612665,
      "rewards/format_reward_hoi_key": 0.5873221457004547,
      "rewards/format_reward_hoi_object_label": 0.44114159047603607,
      "rewards/format_reward_hoi_verb_label": 0.5036415904760361,
      "rewards/hoi_iou_reward": 0.47521928139030933,
      "step": 28
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 165.1875,
      "epoch": 0.006164638358930754,
      "grad_norm": 0.37749311327934265,
      "kl": 0.0019092559814453125,
      "learning_rate": 9.938350340136054e-07,
      "loss": 0.0001,
      "reward": 2.2582033574581146,
      "reward_std": 0.08065436300239526,
      "rewards/format_reward_hoi_key": 0.6932291835546494,
      "rewards/format_reward_hoi_object_label": 0.59375,
      "rewards/format_reward_hoi_verb_label": 0.3541666641831398,
      "rewards/hoi_iou_reward": 0.6170575618743896,
      "step": 29
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 184.59375,
      "epoch": 0.006377212095445607,
      "grad_norm": 0.3330162763595581,
      "kl": 0.0014448165893554688,
      "learning_rate": 9.936224489795917e-07,
      "loss": 0.0,
      "reward": 2.6335054636001587,
      "reward_std": 0.0012341497422312386,
      "rewards/format_reward_hoi_key": 0.8750000149011612,
      "rewards/format_reward_hoi_object_label": 0.6875,
      "rewards/format_reward_hoi_verb_label": 0.3880208358168602,
      "rewards/hoi_iou_reward": 0.6829846650362015,
      "step": 30
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 172.125,
      "epoch": 0.006589785831960461,
      "grad_norm": 0.8369670510292053,
      "kl": 0.0013284683227539062,
      "learning_rate": 9.934098639455782e-07,
      "loss": 0.0001,
      "reward": 2.4850784838199615,
      "reward_std": 0.02788396377582103,
      "rewards/format_reward_hoi_key": 0.8687500208616257,
      "rewards/format_reward_hoi_object_label": 0.4687500074505806,
      "rewards/format_reward_hoi_verb_label": 0.5,
      "rewards/hoi_iou_reward": 0.6475784331560135,
      "step": 31
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 216.046875,
      "epoch": 0.006802359568475315,
      "grad_norm": 0.9224941730499268,
      "kl": 0.00140380859375,
      "learning_rate": 9.931972789115645e-07,
      "loss": 0.0,
      "reward": 2.751905083656311,
      "reward_std": 0.08277821098454297,
      "rewards/format_reward_hoi_key": 0.809895858168602,
      "rewards/format_reward_hoi_object_label": 0.5078125,
      "rewards/format_reward_hoi_verb_label": 0.6927083283662796,
      "rewards/hoi_iou_reward": 0.7414884492754936,
      "step": 32
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 210.71875,
      "epoch": 0.007014933304990169,
      "grad_norm": 0.39392945170402527,
      "kl": 0.002300262451171875,
      "learning_rate": 9.92984693877551e-07,
      "loss": 0.0001,
      "reward": 2.1440170407295227,
      "reward_std": 0.02253561234101653,
      "rewards/format_reward_hoi_key": 0.9121875166893005,
      "rewards/format_reward_hoi_object_label": 0.3125,
      "rewards/format_reward_hoi_verb_label": 0.3333333358168602,
      "rewards/hoi_iou_reward": 0.5859961807727814,
      "step": 33
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 194.265625,
      "epoch": 0.007227507041505022,
      "grad_norm": 0.5018682479858398,
      "kl": 0.0017986297607421875,
      "learning_rate": 9.927721088435373e-07,
      "loss": 0.0001,
      "reward": 2.5592292845249176,
      "reward_std": 0.00986732606543228,
      "rewards/format_reward_hoi_key": 0.7691666930913925,
      "rewards/format_reward_hoi_object_label": 0.6583333313465118,
      "rewards/format_reward_hoi_verb_label": 0.5562499985098839,
      "rewards/hoi_iou_reward": 0.5754793435335159,
      "step": 34
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 205.03125,
      "epoch": 0.007440080778019876,
      "grad_norm": 0.6149845719337463,
      "kl": 0.0016880035400390625,
      "learning_rate": 9.925595238095238e-07,
      "loss": 0.0001,
      "reward": 2.778216004371643,
      "reward_std": 0.11917518911650404,
      "rewards/format_reward_hoi_key": 0.8614583313465118,
      "rewards/format_reward_hoi_object_label": 0.7333333194255829,
      "rewards/format_reward_hoi_verb_label": 0.5011574029922485,
      "rewards/hoi_iou_reward": 0.6822669506072998,
      "step": 35
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 222.671875,
      "epoch": 0.007652654514534729,
      "grad_norm": 0.5013077259063721,
      "kl": 0.0017242431640625,
      "learning_rate": 9.923469387755101e-07,
      "loss": 0.0001,
      "reward": 2.724997416138649,
      "reward_std": 0.007125564094167203,
      "rewards/format_reward_hoi_key": 0.8181547522544861,
      "rewards/format_reward_hoi_object_label": 0.6875,
      "rewards/format_reward_hoi_verb_label": 0.625,
      "rewards/hoi_iou_reward": 0.5943426117300987,
      "step": 36
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 249.65625,
      "epoch": 0.007865228251049582,
      "grad_norm": 0.4427626430988312,
      "kl": 0.00135040283203125,
      "learning_rate": 9.921343537414967e-07,
      "loss": 0.0001,
      "reward": 2.5683979988098145,
      "reward_std": 0.05630575024406426,
      "rewards/format_reward_hoi_key": 0.810416653752327,
      "rewards/format_reward_hoi_object_label": 0.625,
      "rewards/format_reward_hoi_verb_label": 0.3906250111758709,
      "rewards/hoi_iou_reward": 0.7423563152551651,
      "step": 37
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 270.921875,
      "epoch": 0.008077801987564437,
      "grad_norm": 2.8067455291748047,
      "kl": 0.0019435882568359375,
      "learning_rate": 9.91921768707483e-07,
      "loss": 0.0001,
      "reward": 2.22263365983963,
      "reward_std": 0.20146464882418513,
      "rewards/format_reward_hoi_key": 0.6945772171020508,
      "rewards/format_reward_hoi_object_label": 0.3977022171020508,
      "rewards/format_reward_hoi_verb_label": 0.5460824370384216,
      "rewards/hoi_iou_reward": 0.5842718333005905,
      "step": 38
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 211.40625,
      "epoch": 0.00829037572407929,
      "grad_norm": 0.5841067433357239,
      "kl": 0.00464630126953125,
      "learning_rate": 9.917091836734693e-07,
      "loss": 0.0002,
      "reward": 2.7285755276679993,
      "reward_std": 0.15019595221383497,
      "rewards/format_reward_hoi_key": 0.931383952498436,
      "rewards/format_reward_hoi_object_label": 0.5188244059681892,
      "rewards/format_reward_hoi_verb_label": 0.5774181559681892,
      "rewards/hoi_iou_reward": 0.7009490430355072,
      "step": 39
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 235.15625,
      "epoch": 0.008502949460594144,
      "grad_norm": 0.3903954327106476,
      "kl": 0.0014748573303222656,
      "learning_rate": 9.914965986394558e-07,
      "loss": 0.0001,
      "reward": 2.41436231136322,
      "reward_std": 0.031614198378520086,
      "rewards/format_reward_hoi_key": 0.7393315136432648,
      "rewards/format_reward_hoi_object_label": 0.6360462605953217,
      "rewards/format_reward_hoi_verb_label": 0.5051649361848831,
      "rewards/hoi_iou_reward": 0.5338196456432343,
      "step": 40
    }
  ],
  "logging_steps": 1.0,
  "max_steps": 4704,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 20,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}