File size: 25,521 Bytes
fc96ee6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.39603960396039606,
  "eval_steps": 500,
  "global_step": 40,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio": 0.0,
      "completion_length": 9300.9375,
      "epoch": 0.009900990099009901,
      "grad_norm": 0.2749840021133423,
      "kl": 0.0,
      "learning_rate": 3.999032564583976e-06,
      "loss": 0.009333692491054535,
      "max_completion_length": 14084.125,
      "min_completion_length": 5729.875,
      "num_updates": 1,
      "rewards": 1.173762883991003,
      "rewards/cosine_scaled_reward": 0.27115931920707226,
      "rewards/format_reward2": 0.8515625,
      "rewards/len_reward": 0.051041055703535676,
      "rewards_std": 0.5518537946045399,
      "step": 1
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 9752.296875,
      "epoch": 0.019801980198019802,
      "grad_norm": 0.23711691796779633,
      "kl": 0.0007762908935546875,
      "learning_rate": 3.996131194267188e-06,
      "loss": 0.016636773943901062,
      "max_completion_length": 14506.25,
      "min_completion_length": 3615.875,
      "num_updates": 2,
      "rewards": 1.011244721710682,
      "rewards/cosine_scaled_reward": 0.1618131911382079,
      "rewards/format_reward2": 0.8203125,
      "rewards/len_reward": 0.02911903988569975,
      "rewards_std": 0.6834513954818249,
      "step": 2
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 10513.359375,
      "epoch": 0.0297029702970297,
      "grad_norm": 0.26973867416381836,
      "kl": 0.0009961128234863281,
      "learning_rate": 3.9912986959380376e-06,
      "loss": -0.002310425043106079,
      "max_completion_length": 14084.875,
      "min_completion_length": 5952.75,
      "num_updates": 3,
      "rewards": 0.8836403228342533,
      "rewards/cosine_scaled_reward": 0.06623293040320277,
      "rewards/format_reward2": 0.84375,
      "rewards/len_reward": -0.026342609897255898,
      "rewards_std": 0.590987540781498,
      "step": 3
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 10638.6484375,
      "epoch": 0.039603960396039604,
      "grad_norm": 0.3450890779495239,
      "kl": 0.0011034011840820312,
      "learning_rate": 3.9845397447265526e-06,
      "loss": 2.3186206817626953e-05,
      "max_completion_length": 15636.125,
      "min_completion_length": 6815.75,
      "num_updates": 4,
      "rewards": 0.8896834207698703,
      "rewards/cosine_scaled_reward": 0.17461357091087848,
      "rewards/format_reward2": 0.7109375,
      "rewards/len_reward": 0.004132358357310295,
      "rewards_std": 0.6534126400947571,
      "step": 4
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 9484.03125,
      "epoch": 0.04950495049504951,
      "grad_norm": 0.245803564786911,
      "kl": 0.0010962486267089844,
      "learning_rate": 3.975860879481513e-06,
      "loss": -0.025934472680091858,
      "max_completion_length": 14890.25,
      "min_completion_length": 5349.375,
      "num_updates": 5,
      "rewards": 0.9708382207900286,
      "rewards/cosine_scaled_reward": 0.10498641454614699,
      "rewards/format_reward2": 0.859375,
      "rewards/len_reward": 0.0064767999574542046,
      "rewards_std": 0.655558355152607,
      "step": 5
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 9448.5859375,
      "epoch": 0.0594059405940594,
      "grad_norm": 0.24996301531791687,
      "kl": 0.0013685226440429688,
      "learning_rate": 3.965270496444528e-06,
      "loss": 0.005861759185791016,
      "max_completion_length": 15323.625,
      "min_completion_length": 3290.125,
      "num_updates": 6,
      "rewards": 0.9613782716915011,
      "rewards/cosine_scaled_reward": 0.2207801272161305,
      "rewards/format_reward2": 0.7734375,
      "rewards/len_reward": -0.03283937182277441,
      "rewards_std": 0.8270582258701324,
      "step": 6
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 9011.703125,
      "epoch": 0.06930693069306931,
      "grad_norm": 0.24825839698314667,
      "kl": 0.0017271041870117188,
      "learning_rate": 3.952778841127214e-06,
      "loss": -0.010295629501342773,
      "max_completion_length": 12310.875,
      "min_completion_length": 4909.25,
      "num_updates": 7,
      "rewards": 1.1428990792483091,
      "rewards/cosine_scaled_reward": 0.24160153639968485,
      "rewards/format_reward2": 0.8515625,
      "rewards/len_reward": 0.04973505577072501,
      "rewards_std": 0.5509255714714527,
      "step": 7
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 11187.453125,
      "epoch": 0.07920792079207921,
      "grad_norm": 0.2318185567855835,
      "kl": 0.0022907257080078125,
      "learning_rate": 3.938397998399332e-06,
      "loss": 0.007296696305274963,
      "max_completion_length": 14553.875,
      "min_completion_length": 4561.375,
      "num_updates": 8,
      "rewards": 0.8727323254570365,
      "rewards/cosine_scaled_reward": 0.09352816140744835,
      "rewards/format_reward2": 0.796875,
      "rewards/len_reward": -0.01767082791775465,
      "rewards_std": 0.6038715615868568,
      "step": 8
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 10907.1484375,
      "epoch": 0.0891089108910891,
      "grad_norm": 0.23230598866939545,
      "kl": 0.002445220947265625,
      "learning_rate": 3.922141880797449e-06,
      "loss": 0.016454651951789856,
      "max_completion_length": 15823.125,
      "min_completion_length": 4670.375,
      "num_updates": 9,
      "rewards": 0.8584917988628149,
      "rewards/cosine_scaled_reward": 0.12224693153984845,
      "rewards/format_reward2": 0.7421875,
      "rewards/len_reward": -0.0059426589868962765,
      "rewards_std": 0.7407274544239044,
      "step": 9
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 10617.234375,
      "epoch": 0.09900990099009901,
      "grad_norm": 0.3195263147354126,
      "kl": 0.0034656524658203125,
      "learning_rate": 3.90402621506546e-06,
      "loss": 0.022236675024032593,
      "max_completion_length": 14234.125,
      "min_completion_length": 6616.625,
      "num_updates": 10,
      "rewards": 0.9227555003017187,
      "rewards/cosine_scaled_reward": 0.16829395852982998,
      "rewards/format_reward2": 0.765625,
      "rewards/len_reward": -0.011163473129272461,
      "rewards_std": 0.5515045262873173,
      "step": 10
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 11003.5703125,
      "epoch": 0.10891089108910891,
      "grad_norm": 0.22265592217445374,
      "kl": 0.0044879913330078125,
      "learning_rate": 3.884068526939978e-06,
      "loss": -0.013431079685688019,
      "max_completion_length": 14716.25,
      "min_completion_length": 5951.375,
      "num_updates": 11,
      "rewards": 0.8861873494461179,
      "rewards/cosine_scaled_reward": 0.16878368379548192,
      "rewards/format_reward2": 0.765625,
      "rewards/len_reward": -0.04822135902941227,
      "rewards_std": 0.5935308411717415,
      "step": 11
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 7835.3984375,
      "epoch": 0.1188118811881188,
      "grad_norm": 0.3398093581199646,
      "kl": 0.0051021575927734375,
      "learning_rate": 3.862288124195319e-06,
      "loss": -0.013615414500236511,
      "max_completion_length": 13709.5,
      "min_completion_length": 2604.5,
      "num_updates": 12,
      "rewards": 1.274961642920971,
      "rewards/cosine_scaled_reward": 0.32682749163359404,
      "rewards/format_reward2": 0.890625,
      "rewards/len_reward": 0.0575091321952641,
      "rewards_std": 0.7341729030013084,
      "step": 12
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 9435.609375,
      "epoch": 0.12871287128712872,
      "grad_norm": 0.2820684313774109,
      "kl": 0.006267547607421875,
      "learning_rate": 3.8387060779644725e-06,
      "loss": 0.015070796012878418,
      "max_completion_length": 13926.75,
      "min_completion_length": 3093.375,
      "num_updates": 13,
      "rewards": 0.9852710571140051,
      "rewards/cosine_scaled_reward": 0.22535304143093526,
      "rewards/format_reward2": 0.7578125,
      "rewards/len_reward": 0.0021055126562714577,
      "rewards_std": 0.7060699462890625,
      "step": 13
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 6023.7421875,
      "epoch": 0.13861386138613863,
      "grad_norm": 0.3811286985874176,
      "kl": 0.006252288818359375,
      "learning_rate": 3.8133452023541447e-06,
      "loss": 0.032392144203186035,
      "max_completion_length": 15730.75,
      "min_completion_length": 1983.375,
      "num_updates": 14,
      "rewards": 1.5124556943774223,
      "rewards/cosine_scaled_reward": 0.5594989098608494,
      "rewards/format_reward2": 0.875,
      "rewards/len_reward": 0.07795678498223424,
      "rewards_std": 0.7117869555950165,
      "step": 14
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 9897.4609375,
      "epoch": 0.1485148514851485,
      "grad_norm": 0.2323319911956787,
      "kl": 0.006938934326171875,
      "learning_rate": 3.786230032373583e-06,
      "loss": -0.02542346715927124,
      "max_completion_length": 14731.125,
      "min_completion_length": 4343.625,
      "num_updates": 15,
      "rewards": 1.046268306672573,
      "rewards/cosine_scaled_reward": 0.24649553978815675,
      "rewards/format_reward2": 0.7890625,
      "rewards/len_reward": 0.010710292495787144,
      "rewards_std": 0.6413916498422623,
      "step": 15
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 8641.328125,
      "epoch": 0.15841584158415842,
      "grad_norm": 0.2797869145870209,
      "kl": 0.009868621826171875,
      "learning_rate": 3.7573868001985375e-06,
      "loss": 0.00245087593793869,
      "max_completion_length": 14046.0,
      "min_completion_length": 2590.0,
      "num_updates": 16,
      "rewards": 1.0253378190100193,
      "rewards/cosine_scaled_reward": 0.18167185690253973,
      "rewards/format_reward2": 0.8515625,
      "rewards/len_reward": -0.007896540686488152,
      "rewards_std": 0.7439497336745262,
      "step": 16
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 9286.7734375,
      "epoch": 0.16831683168316833,
      "grad_norm": 0.2538006007671356,
      "kl": 0.009235382080078125,
      "learning_rate": 3.7268434097933267e-06,
      "loss": 0.012023478746414185,
      "max_completion_length": 14357.0,
      "min_completion_length": 4187.625,
      "num_updates": 17,
      "rewards": 1.116831500083208,
      "rewards/cosine_scaled_reward": 0.26869785273447633,
      "rewards/format_reward2": 0.8046875,
      "rewards/len_reward": 0.04344612918794155,
      "rewards_std": 0.6232936978340149,
      "step": 17
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 7877.734375,
      "epoch": 0.1782178217821782,
      "grad_norm": 0.27215102314949036,
      "kl": 0.01183319091796875,
      "learning_rate": 3.6946294099155545e-06,
      "loss": 0.00474470853805542,
      "max_completion_length": 14090.375,
      "min_completion_length": 2752.125,
      "num_updates": 18,
      "rewards": 1.222687341272831,
      "rewards/cosine_scaled_reward": 0.3153993431478739,
      "rewards/format_reward2": 0.875,
      "rewards/len_reward": 0.032287961803376675,
      "rewards_std": 0.7518719509243965,
      "step": 18
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 7099.703125,
      "epoch": 0.18811881188118812,
      "grad_norm": 0.3579545319080353,
      "kl": 0.013427734375,
      "learning_rate": 3.6607759655295948e-06,
      "loss": 0.01689109206199646,
      "max_completion_length": 14201.875,
      "min_completion_length": 1985.625,
      "num_updates": 19,
      "rewards": 1.2932276129722595,
      "rewards/cosine_scaled_reward": 0.3619839735329151,
      "rewards/format_reward2": 0.8515625,
      "rewards/len_reward": 0.07968113431707025,
      "rewards_std": 0.7946057394146919,
      "step": 19
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 7809.1328125,
      "epoch": 0.19801980198019803,
      "grad_norm": 0.3970443904399872,
      "kl": 0.0154876708984375,
      "learning_rate": 3.6253158276565003e-06,
      "loss": 0.013616234064102173,
      "max_completion_length": 13511.875,
      "min_completion_length": 1854.125,
      "num_updates": 20,
      "rewards": 1.3438544012606144,
      "rewards/cosine_scaled_reward": 0.41305189533159137,
      "rewards/format_reward2": 0.875,
      "rewards/len_reward": 0.055802563671022654,
      "rewards_std": 0.5804904215037823,
      "step": 20
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 9812.9296875,
      "epoch": 0.2079207920792079,
      "grad_norm": 0.5228389501571655,
      "kl": 0.01609039306640625,
      "learning_rate": 3.5882833016895067e-06,
      "loss": -0.00042431801557540894,
      "max_completion_length": 12778.25,
      "min_completion_length": 4778.0,
      "num_updates": 21,
      "rewards": 1.136468593031168,
      "rewards/cosine_scaled_reward": 0.18093573104124516,
      "rewards/format_reward2": 0.875,
      "rewards/len_reward": 0.08053285209462047,
      "rewards_std": 0.5555343925952911,
      "step": 21
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 10106.0,
      "epoch": 0.21782178217821782,
      "grad_norm": 0.3004865050315857,
      "kl": 0.01863861083984375,
      "learning_rate": 3.5497142142057796e-06,
      "loss": 0.0011682212352752686,
      "max_completion_length": 13495.75,
      "min_completion_length": 5747.125,
      "num_updates": 22,
      "rewards": 1.1095520546659827,
      "rewards/cosine_scaled_reward": 0.20947218214860186,
      "rewards/format_reward2": 0.875,
      "rewards/len_reward": 0.025079891085624695,
      "rewards_std": 0.4830879457294941,
      "step": 22
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 8428.5078125,
      "epoch": 0.22772277227722773,
      "grad_norm": 0.36966538429260254,
      "kl": 0.01552581787109375,
      "learning_rate": 3.509645878306514e-06,
      "loss": 0.0047097280621528625,
      "max_completion_length": 14159.0,
      "min_completion_length": 1964.5,
      "num_updates": 23,
      "rewards": 1.1670421473681927,
      "rewards/cosine_scaled_reward": 0.29000907950103283,
      "rewards/format_reward2": 0.8515625,
      "rewards/len_reward": 0.025470565538853407,
      "rewards_std": 0.615565050393343,
      "step": 23
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 5892.2109375,
      "epoch": 0.2376237623762376,
      "grad_norm": 125.0189208984375,
      "kl": 0.43308258056640625,
      "learning_rate": 3.4681170575189206e-06,
      "loss": 0.00223734974861145,
      "max_completion_length": 11146.375,
      "min_completion_length": 1771.125,
      "num_updates": 24,
      "rewards": 1.4564557410776615,
      "rewards/cosine_scaled_reward": 0.44786818977445364,
      "rewards/format_reward2": 0.90625,
      "rewards/len_reward": 0.10233754548244178,
      "rewards_std": 0.6475037336349487,
      "step": 24
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 6293.140625,
      "epoch": 0.24752475247524752,
      "grad_norm": 0.4965859651565552,
      "kl": 0.0188446044921875,
      "learning_rate": 3.425167928295014e-06,
      "loss": 0.019756004214286804,
      "max_completion_length": 11885.375,
      "min_completion_length": 2043.5,
      "num_updates": 25,
      "rewards": 1.2420116439461708,
      "rewards/cosine_scaled_reward": 0.24037119653075933,
      "rewards/format_reward2": 0.921875,
      "rewards/len_reward": 0.07976543391123414,
      "rewards_std": 0.768707849085331,
      "step": 25
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 7320.9765625,
      "epoch": 0.25742574257425743,
      "grad_norm": 0.32264769077301025,
      "kl": 0.0201263427734375,
      "learning_rate": 3.3808400411434935e-06,
      "loss": 0.007990241050720215,
      "max_completion_length": 14976.25,
      "min_completion_length": 1978.0,
      "num_updates": 26,
      "rewards": 1.2049608379602432,
      "rewards/cosine_scaled_reward": 0.328420914709568,
      "rewards/format_reward2": 0.8671875,
      "rewards/len_reward": 0.009352410677820444,
      "rewards_std": 0.7654620930552483,
      "step": 26
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 5956.828125,
      "epoch": 0.26732673267326734,
      "grad_norm": 0.3196789026260376,
      "kl": 0.0207061767578125,
      "learning_rate": 3.335176280432307e-06,
      "loss": -0.00398920476436615,
      "max_completion_length": 10882.125,
      "min_completion_length": 2554.375,
      "num_updates": 27,
      "rewards": 1.3529352433979511,
      "rewards/cosine_scaled_reward": 0.30559817608445883,
      "rewards/format_reward2": 0.9609375,
      "rewards/len_reward": 0.08639959944412112,
      "rewards_std": 0.7427709549665451,
      "step": 27
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 6028.84375,
      "epoch": 0.27722772277227725,
      "grad_norm": 0.3432248532772064,
      "kl": 0.02164459228515625,
      "learning_rate": 3.2882208229007955e-06,
      "loss": -0.015418417751789093,
      "max_completion_length": 11848.0,
      "min_completion_length": 2110.75,
      "num_updates": 28,
      "rewards": 1.3594568185508251,
      "rewards/cosine_scaled_reward": 0.35286577604711056,
      "rewards/format_reward2": 0.90625,
      "rewards/len_reward": 0.10034103039652109,
      "rewards_std": 0.7156434431672096,
      "step": 28
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 8682.453125,
      "epoch": 0.2871287128712871,
      "grad_norm": 0.2364882528781891,
      "kl": 0.024139404296875,
      "learning_rate": 3.24001909492155e-06,
      "loss": 0.0035642534494400024,
      "max_completion_length": 13773.375,
      "min_completion_length": 3882.25,
      "num_updates": 29,
      "rewards": 1.13150573708117,
      "rewards/cosine_scaled_reward": 0.195555618731305,
      "rewards/format_reward2": 0.9140625,
      "rewards/len_reward": 0.021887621260248125,
      "rewards_std": 0.6196031682193279,
      "step": 29
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 9472.0859375,
      "epoch": 0.297029702970297,
      "grad_norm": 0.23454353213310242,
      "kl": 0.029876708984375,
      "learning_rate": 3.190617728553332e-06,
      "loss": 0.0017639100551605225,
      "max_completion_length": 14098.75,
      "min_completion_length": 4757.875,
      "num_updates": 30,
      "rewards": 1.0655029881745577,
      "rewards/cosine_scaled_reward": 0.137826404068619,
      "rewards/format_reward2": 0.8984375,
      "rewards/len_reward": 0.029239090159535408,
      "rewards_std": 0.5960428677499294,
      "step": 30
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 6721.03125,
      "epoch": 0.3069306930693069,
      "grad_norm": 0.36463433504104614,
      "kl": 0.02364349365234375,
      "learning_rate": 3.140064516427565e-06,
      "loss": 0.02541273832321167,
      "max_completion_length": 11538.875,
      "min_completion_length": 3436.5,
      "num_updates": 31,
      "rewards": 1.2696323096752167,
      "rewards/cosine_scaled_reward": 0.264737417222932,
      "rewards/format_reward2": 0.921875,
      "rewards/len_reward": 0.08301988849416375,
      "rewards_std": 0.7188399098813534,
      "step": 31
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 7162.3828125,
      "epoch": 0.31683168316831684,
      "grad_norm": 0.5464898347854614,
      "kl": 0.02471923828125,
      "learning_rate": 3.0884083655120544e-06,
      "loss": 0.01196742057800293,
      "max_completion_length": 10615.375,
      "min_completion_length": 4111.875,
      "num_updates": 32,
      "rewards": 1.05987061932683,
      "rewards/cosine_scaled_reward": 0.07295701105613261,
      "rewards/format_reward2": 0.96875,
      "rewards/len_reward": 0.018163591157644987,
      "rewards_std": 0.6865731440484524,
      "step": 32
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 8008.421875,
      "epoch": 0.32673267326732675,
      "grad_norm": 67.73995208740234,
      "kl": 0.0318756103515625,
      "learning_rate": 3.0356992497966503e-06,
      "loss": -0.01266103982925415,
      "max_completion_length": 12517.5,
      "min_completion_length": 2955.875,
      "num_updates": 33,
      "rewards": 1.1564012691378593,
      "rewards/cosine_scaled_reward": 0.18674227688461542,
      "rewards/format_reward2": 0.9453125,
      "rewards/len_reward": 0.024346530437469482,
      "rewards_std": 0.5944486074149609,
      "step": 33
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 5801.1328125,
      "epoch": 0.33663366336633666,
      "grad_norm": 0.44804856181144714,
      "kl": 0.0241241455078125,
      "learning_rate": 2.981988161946644e-06,
      "loss": -0.0008684098720550537,
      "max_completion_length": 13239.625,
      "min_completion_length": 1896.5,
      "num_updates": 34,
      "rewards": 1.5373370498418808,
      "rewards/cosine_scaled_reward": 0.4724404886364937,
      "rewards/format_reward2": 0.9609375,
      "rewards/len_reward": 0.10395912081003189,
      "rewards_std": 0.5325119644403458,
      "step": 34
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 7891.390625,
      "epoch": 0.3465346534653465,
      "grad_norm": 0.395713746547699,
      "kl": 0.0246734619140625,
      "learning_rate": 2.9273270639706544e-06,
      "loss": 0.009494274854660034,
      "max_completion_length": 13065.75,
      "min_completion_length": 3577.25,
      "num_updates": 35,
      "rewards": 1.2487693056464195,
      "rewards/cosine_scaled_reward": 0.21439548954367638,
      "rewards/format_reward2": 0.9609375,
      "rewards/len_reward": 0.0734363030642271,
      "rewards_std": 0.6400604620575905,
      "step": 35
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 8915.25,
      "epoch": 0.3564356435643564,
      "grad_norm": 0.33517077565193176,
      "kl": 0.0334320068359375,
      "learning_rate": 2.871768836950742e-06,
      "loss": -0.018658161163330078,
      "max_completion_length": 12554.875,
      "min_completion_length": 5071.375,
      "num_updates": 36,
      "rewards": 1.1192209478467703,
      "rewards/cosine_scaled_reward": 0.15578080737031996,
      "rewards/format_reward2": 0.9296875,
      "rewards/len_reward": 0.03375265281647444,
      "rewards_std": 0.5714416801929474,
      "step": 36
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 6871.40625,
      "epoch": 0.36633663366336633,
      "grad_norm": 0.3194531202316284,
      "kl": 0.0283966064453125,
      "learning_rate": 2.8153672298833772e-06,
      "loss": 0.027765318751335144,
      "max_completion_length": 11280.625,
      "min_completion_length": 2181.75,
      "num_updates": 37,
      "rewards": 1.2906805723905563,
      "rewards/cosine_scaled_reward": 0.289469544775784,
      "rewards/format_reward2": 0.9609375,
      "rewards/len_reward": 0.04027354822028428,
      "rewards_std": 0.6187824495136738,
      "step": 37
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 7555.359375,
      "epoch": 0.37623762376237624,
      "grad_norm": 0.329673707485199,
      "kl": 0.03289794921875,
      "learning_rate": 2.7581768076807586e-06,
      "loss": 0.00029387325048446655,
      "max_completion_length": 13576.25,
      "min_completion_length": 3269.75,
      "num_updates": 38,
      "rewards": 0.9393773451447487,
      "rewards/cosine_scaled_reward": 0.02606131136417389,
      "rewards/format_reward2": 0.9375,
      "rewards/len_reward": -0.024183956440538168,
      "rewards_std": 0.7055792585015297,
      "step": 38
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 7003.8125,
      "epoch": 0.38613861386138615,
      "grad_norm": 0.3149705231189728,
      "kl": 0.03338623046875,
      "learning_rate": 2.700252898382781e-06,
      "loss": 0.00039067864418029785,
      "max_completion_length": 12233.875,
      "min_completion_length": 2225.0,
      "num_updates": 39,
      "rewards": 1.323565311729908,
      "rewards/cosine_scaled_reward": 0.23708410863764584,
      "rewards/format_reward2": 0.9609375,
      "rewards/len_reward": 0.12554369773715734,
      "rewards_std": 0.639260545372963,
      "step": 39
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 5583.1171875,
      "epoch": 0.39603960396039606,
      "grad_norm": 0.4048033356666565,
      "kl": 0.027130126953125,
      "learning_rate": 2.641651539630735e-06,
      "loss": 0.015950188040733337,
      "max_completion_length": 9941.625,
      "min_completion_length": 1826.0,
      "num_updates": 40,
      "rewards": 1.4943003356456757,
      "rewards/cosine_scaled_reward": 0.4848987963050604,
      "rewards/format_reward2": 0.921875,
      "rewards/len_reward": 0.08752657752484083,
      "rewards_std": 0.636099562048912,
      "step": 40
    }
  ],
  "logging_steps": 1,
  "max_steps": 101,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 5,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": true,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0,
  "train_batch_size": null,
  "trial_name": null,
  "trial_params": null
}