File size: 29,564 Bytes
b66e341
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.6006006006006006,
  "eval_steps": 50,
  "global_step": 400,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.02002002002002002,
      "grad_norm": 0.05460292845964432,
      "grpo_mean_advantage": -1.3560057254835556e-07,
      "grpo_mean_group_score": 0.5922331809997559,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 3.0318567496578908e-06,
      "learning_rate": 8.000000000000001e-07,
      "loss": 0.007,
      "step": 5
    },
    {
      "epoch": 0.04004004004004004,
      "grad_norm": 0.0679207444190979,
      "grpo_mean_advantage": 3.6619603633880615e-06,
      "grpo_mean_group_score": 0.5561589002609253,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 1.6246918676188216e-05,
      "learning_rate": 1.8000000000000001e-06,
      "loss": 0.0107,
      "step": 10
    },
    {
      "epoch": 0.06006006006006006,
      "grad_norm": 0.05788416787981987,
      "grpo_mean_advantage": -1.0654330395709621e-07,
      "grpo_mean_group_score": 0.5759152173995972,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 5.399440965447866e-07,
      "learning_rate": 2.8000000000000003e-06,
      "loss": 0.007,
      "step": 15
    },
    {
      "epoch": 0.08008008008008008,
      "grad_norm": 0.0746568813920021,
      "grpo_mean_advantage": -5.871057737749652e-07,
      "grpo_mean_group_score": 0.5127314329147339,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 2.6951597646984737e-06,
      "learning_rate": 3.8000000000000005e-06,
      "loss": 0.0246,
      "step": 20
    },
    {
      "epoch": 0.1001001001001001,
      "grad_norm": 0.11442846059799194,
      "grpo_mean_advantage": 6.370246410369873e-07,
      "grpo_mean_group_score": 0.539706826210022,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 2.8908377771585947e-06,
      "learning_rate": 4.800000000000001e-06,
      "loss": 0.0337,
      "step": 25
    },
    {
      "epoch": 0.12012012012012012,
      "grad_norm": 0.05778791010379791,
      "grpo_mean_advantage": 6.705522359595761e-09,
      "grpo_mean_group_score": 0.5812538862228394,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 6.189450800775376e-07,
      "learning_rate": 4.999125183044924e-06,
      "loss": 0.0171,
      "step": 30
    },
    {
      "epoch": 0.14014014014014015,
      "grad_norm": 0.05819695070385933,
      "grpo_mean_advantage": 3.859400692363124e-07,
      "grpo_mean_group_score": 0.5909844636917114,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 1.6833292875162442e-06,
      "learning_rate": 4.995572288443412e-06,
      "loss": 0.0145,
      "step": 35
    },
    {
      "epoch": 0.16016016016016016,
      "grad_norm": 0.07968433201313019,
      "grpo_mean_advantage": 2.600252742013254e-07,
      "grpo_mean_group_score": 0.5630953907966614,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 1.4095899132371414e-06,
      "learning_rate": 4.98929052218411e-06,
      "loss": 0.0196,
      "step": 40
    },
    {
      "epoch": 0.18018018018018017,
      "grad_norm": 0.0733402892947197,
      "grpo_mean_advantage": -1.2591480924584175e-07,
      "grpo_mean_group_score": 0.5604403614997864,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 1.0309080380466185e-06,
      "learning_rate": 4.980286753286196e-06,
      "loss": 0.0186,
      "step": 45
    },
    {
      "epoch": 0.2002002002002002,
      "grad_norm": 0.07136482000350952,
      "grpo_mean_advantage": -2.808868941883702e-07,
      "grpo_mean_group_score": 0.5971035957336426,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 1.5696078889959608e-06,
      "learning_rate": 4.9685708272387645e-06,
      "loss": 0.0286,
      "step": 50
    },
    {
      "epoch": 0.22022022022022023,
      "grad_norm": 0.08851475268602371,
      "grpo_mean_advantage": 2.6822089438383045e-08,
      "grpo_mean_group_score": 0.5892971754074097,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 3.7878271541558206e-07,
      "learning_rate": 4.9541555552349404e-06,
      "loss": 0.0054,
      "step": 55
    },
    {
      "epoch": 0.24024024024024024,
      "grad_norm": 0.07778509706258774,
      "grpo_mean_advantage": -5.662441182607836e-08,
      "grpo_mean_group_score": 0.564322292804718,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 6.128998393251095e-07,
      "learning_rate": 4.9370567001630155e-06,
      "loss": -0.0074,
      "step": 60
    },
    {
      "epoch": 0.2602602602602603,
      "grad_norm": 0.08740051090717316,
      "grpo_mean_advantage": -1.5944242193199898e-07,
      "grpo_mean_group_score": 0.562497615814209,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 1.6374274309782777e-06,
      "learning_rate": 4.917292959369968e-06,
      "loss": 0.0145,
      "step": 65
    },
    {
      "epoch": 0.2802802802802803,
      "grad_norm": 0.19070060551166534,
      "grpo_mean_advantage": 1.6838312433264946e-07,
      "grpo_mean_group_score": 0.5904761552810669,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 8.536571272088622e-07,
      "learning_rate": 4.8948859442161876e-06,
      "loss": 0.0257,
      "step": 70
    },
    {
      "epoch": 0.3003003003003003,
      "grad_norm": 0.07321271300315857,
      "grpo_mean_advantage": 1.1175870895385742e-07,
      "grpo_mean_group_score": 0.5765624046325684,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 6.451961667153228e-07,
      "learning_rate": 4.869860156443768e-06,
      "loss": 0.0024,
      "step": 75
    },
    {
      "epoch": 0.3203203203203203,
      "grad_norm": 0.07126748561859131,
      "grpo_mean_advantage": -1.4603138254187797e-07,
      "grpo_mean_group_score": 0.5858271718025208,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 1.1309343790344428e-06,
      "learning_rate": 4.842242961384211e-06,
      "loss": 0.0277,
      "step": 80
    },
    {
      "epoch": 0.34034034034034033,
      "grad_norm": 0.08629189431667328,
      "grpo_mean_advantage": -1.817941665649414e-06,
      "grpo_mean_group_score": 0.5871662497520447,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 1.1141768482048064e-05,
      "learning_rate": 4.812064558034847e-06,
      "loss": 0.0246,
      "step": 85
    },
    {
      "epoch": 0.36036036036036034,
      "grad_norm": 0.0998779758810997,
      "grpo_mean_advantage": 1.8179416372277046e-07,
      "grpo_mean_group_score": 0.5330992937088013,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 6.210335072864837e-07,
      "learning_rate": 4.779357946036662e-06,
      "loss": 0.0056,
      "step": 90
    },
    {
      "epoch": 0.38038038038038036,
      "grad_norm": 0.10614689439535141,
      "grpo_mean_advantage": -2.972781771859445e-07,
      "grpo_mean_group_score": 0.5265295505523682,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 3.1582342217006953e-06,
      "learning_rate": 4.74415888958968e-06,
      "loss": 0.0053,
      "step": 95
    },
    {
      "epoch": 0.4004004004004004,
      "grad_norm": 0.10345634073019028,
      "grpo_mean_advantage": -7.033348197182931e-07,
      "grpo_mean_group_score": 0.5660771131515503,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 4.245831405569334e-06,
      "learning_rate": 4.706505878345343e-06,
      "loss": 0.0134,
      "step": 100
    },
    {
      "epoch": 0.42042042042042044,
      "grad_norm": 0.10077933222055435,
      "grpo_mean_advantage": 1.1920928955078125e-07,
      "grpo_mean_group_score": 0.57631915807724,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 3.2809634831210133e-07,
      "learning_rate": 4.666440085318626e-06,
      "loss": 0.0004,
      "step": 105
    },
    {
      "epoch": 0.44044044044044045,
      "grad_norm": 0.09548182785511017,
      "grpo_mean_advantage": -4.0978193283081055e-07,
      "grpo_mean_group_score": 0.546563982963562,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 6.0397578636184335e-06,
      "learning_rate": 4.624005321865968e-06,
      "loss": 0.0033,
      "step": 110
    },
    {
      "epoch": 0.46046046046046046,
      "grad_norm": 0.09417816251516342,
      "grpo_mean_advantage": -1.467764434437413e-07,
      "grpo_mean_group_score": 0.5519219636917114,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 2.2689375782647403e-06,
      "learning_rate": 4.57924798977818e-06,
      "loss": 0.0095,
      "step": 115
    },
    {
      "epoch": 0.4804804804804805,
      "grad_norm": 0.10022275149822235,
      "grpo_mean_advantage": -5.215406329028838e-09,
      "grpo_mean_group_score": 0.5490407943725586,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 7.929010621410271e-07,
      "learning_rate": 4.532217030540781e-06,
      "loss": 0.0006,
      "step": 120
    },
    {
      "epoch": 0.5005005005005005,
      "grad_norm": 0.14057794213294983,
      "grpo_mean_advantage": -5.7369469175228005e-08,
      "grpo_mean_group_score": 0.5646580457687378,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 1.2823379620385822e-06,
      "learning_rate": 4.482963871817195e-06,
      "loss": -0.0046,
      "step": 125
    },
    {
      "epoch": 0.5205205205205206,
      "grad_norm": 0.12420658767223358,
      "grpo_mean_advantage": 2.9876827056796174e-07,
      "grpo_mean_group_score": 0.6111599802970886,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 1.0496698905626545e-06,
      "learning_rate": 4.4315423712133595e-06,
      "loss": -0.003,
      "step": 130
    },
    {
      "epoch": 0.5405405405405406,
      "grad_norm": 0.14342808723449707,
      "grpo_mean_advantage": 1.5869736103013565e-07,
      "grpo_mean_group_score": 0.5619662404060364,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 1.2748531617035042e-06,
      "learning_rate": 4.378008757385222e-06,
      "loss": 0.0154,
      "step": 135
    },
    {
      "epoch": 0.5605605605605606,
      "grad_norm": 0.14729444682598114,
      "grpo_mean_advantage": 3.0100346748440643e-07,
      "grpo_mean_group_score": 0.5795454978942871,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 2.4499684059264837e-06,
      "learning_rate": 4.322421568553529e-06,
      "loss": -0.0262,
      "step": 140
    },
    {
      "epoch": 0.5805805805805806,
      "grad_norm": 0.15249410271644592,
      "grpo_mean_advantage": -3.233552092751779e-07,
      "grpo_mean_group_score": 0.5804953575134277,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 1.248456669600273e-06,
      "learning_rate": 4.2648415884931476e-06,
      "loss": 0.0018,
      "step": 145
    },
    {
      "epoch": 0.6006006006006006,
      "grad_norm": 0.1841023564338684,
      "grpo_mean_advantage": 3.2261013416245987e-07,
      "grpo_mean_group_score": 0.5628539323806763,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 1.4773489738217904e-06,
      "learning_rate": 4.205331780066892e-06,
      "loss": -0.017,
      "step": 150
    },
    {
      "epoch": 0.6206206206206206,
      "grad_norm": 0.18597163259983063,
      "grpo_mean_advantage": -2.5331974029541016e-07,
      "grpo_mean_group_score": 0.5727725625038147,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 1.5092309695319273e-06,
      "learning_rate": 4.1439572163765615e-06,
      "loss": 0.0044,
      "step": 155
    },
    {
      "epoch": 0.6406406406406406,
      "grad_norm": 0.18310388922691345,
      "grpo_mean_advantage": -6.780028627417778e-08,
      "grpo_mean_group_score": 0.5833909511566162,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 8.550978805033083e-07,
      "learning_rate": 4.0807850096064605e-06,
      "loss": -0.005,
      "step": 160
    },
    {
      "epoch": 0.6606606606606606,
      "grad_norm": 0.2192923128604889,
      "grpo_mean_advantage": -5.587935447692871e-08,
      "grpo_mean_group_score": 0.5742615461349487,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 3.564579174053506e-07,
      "learning_rate": 4.015884237637206e-06,
      "loss": -0.015,
      "step": 165
    },
    {
      "epoch": 0.6806806806806807,
      "grad_norm": 0.16708803176879883,
      "grpo_mean_advantage": -5.327165126800537e-07,
      "grpo_mean_group_score": 0.5758188962936401,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 2.309018327650847e-06,
      "learning_rate": 3.949325868510083e-06,
      "loss": -0.0314,
      "step": 170
    },
    {
      "epoch": 0.7007007007007007,
      "grad_norm": 0.3401262164115906,
      "grpo_mean_advantage": 5.863606702405377e-07,
      "grpo_mean_group_score": 0.5767683982849121,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 2.4449204829579685e-06,
      "learning_rate": 3.881182682824534e-06,
      "loss": -0.0441,
      "step": 175
    },
    {
      "epoch": 0.7207207207207207,
      "grad_norm": 0.1931898146867752,
      "grpo_mean_advantage": 3.2186508747145126e-07,
      "grpo_mean_group_score": 0.586772084236145,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 2.293551688126172e-06,
      "learning_rate": 3.811529194153635e-06,
      "loss": -0.0162,
      "step": 180
    },
    {
      "epoch": 0.7407407407407407,
      "grad_norm": 0.2537969648838043,
      "grpo_mean_advantage": -4.470348358154297e-08,
      "grpo_mean_group_score": 0.549396276473999,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 3.7067667335577426e-07,
      "learning_rate": 3.7404415675646054e-06,
      "loss": -0.0386,
      "step": 185
    },
    {
      "epoch": 0.7607607607607607,
      "grad_norm": 0.20326584577560425,
      "grpo_mean_advantage": -2.1010637851759384e-07,
      "grpo_mean_group_score": 0.5798425078392029,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 1.1695076409523608e-06,
      "learning_rate": 3.667997536333424e-06,
      "loss": -0.037,
      "step": 190
    },
    {
      "epoch": 0.7807807807807807,
      "grad_norm": 0.25048357248306274,
      "grpo_mean_advantage": 1.765787658314366e-07,
      "grpo_mean_group_score": 0.5584167838096619,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 2.429934738756856e-06,
      "learning_rate": 3.59427631694463e-06,
      "loss": -0.0292,
      "step": 195
    },
    {
      "epoch": 0.8008008008008008,
      "grad_norm": 0.2687569260597229,
      "grpo_mean_advantage": 1.6540289493605087e-07,
      "grpo_mean_group_score": 0.5676193237304688,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 2.6342788714828203e-06,
      "learning_rate": 3.5193585224692595e-06,
      "loss": -0.0454,
      "step": 200
    },
    {
      "epoch": 0.8208208208208209,
      "grad_norm": 0.22301620244979858,
      "grpo_mean_advantage": -1.0944902442133753e-06,
      "grpo_mean_group_score": 0.5669739842414856,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 5.346942998585291e-06,
      "learning_rate": 3.44332607441564e-06,
      "loss": -0.0423,
      "step": 205
    },
    {
      "epoch": 0.8408408408408409,
      "grad_norm": 0.3040211498737335,
      "grpo_mean_advantage": 2.4065374759629776e-07,
      "grpo_mean_group_score": 0.5922158360481262,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 1.6327536513927043e-06,
      "learning_rate": 3.3662621131494204e-06,
      "loss": -0.0857,
      "step": 210
    },
    {
      "epoch": 0.8608608608608609,
      "grad_norm": 0.27231141924858093,
      "grpo_mean_advantage": -5.21540641784668e-08,
      "grpo_mean_group_score": 0.5473950505256653,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 5.847922466273303e-07,
      "learning_rate": 3.2882509069808044e-06,
      "loss": -0.0278,
      "step": 215
    },
    {
      "epoch": 0.8808808808808809,
      "grad_norm": 0.3571636378765106,
      "grpo_mean_advantage": 6.541609991472797e-07,
      "grpo_mean_group_score": 0.5880032777786255,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 4.072162937518442e-06,
      "learning_rate": 3.2093777600183873e-06,
      "loss": -0.0727,
      "step": 220
    },
    {
      "epoch": 0.9009009009009009,
      "grad_norm": 0.306273490190506,
      "grpo_mean_advantage": -1.2218951894737984e-07,
      "grpo_mean_group_score": 0.5835092663764954,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 4.386006935419573e-07,
      "learning_rate": 3.1297289188903705e-06,
      "loss": -0.0464,
      "step": 225
    },
    {
      "epoch": 0.9209209209209209,
      "grad_norm": 0.2700377106666565,
      "grpo_mean_advantage": 1.7605722177904681e-06,
      "grpo_mean_group_score": 0.5394966006278992,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 8.007580618141219e-06,
      "learning_rate": 3.049391478435133e-06,
      "loss": -0.0295,
      "step": 230
    },
    {
      "epoch": 0.9409409409409409,
      "grad_norm": 0.39531761407852173,
      "grpo_mean_advantage": -3.3080578987210174e-07,
      "grpo_mean_group_score": 0.5687432289123535,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 1.551636614749441e-06,
      "learning_rate": 2.9684532864643123e-06,
      "loss": -0.031,
      "step": 235
    },
    {
      "epoch": 0.960960960960961,
      "grad_norm": 0.5987040996551514,
      "grpo_mean_advantage": 2.712011450967111e-07,
      "grpo_mean_group_score": 0.5550583600997925,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 1.4400844747797237e-06,
      "learning_rate": 2.887002847702504e-06,
      "loss": -0.0789,
      "step": 240
    },
    {
      "epoch": 0.980980980980981,
      "grad_norm": 0.5680716037750244,
      "grpo_mean_advantage": -3.2857059295565705e-07,
      "grpo_mean_group_score": 0.558111310005188,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 2.105091425619321e-06,
      "learning_rate": 2.8051292270086506e-06,
      "loss": -0.1131,
      "step": 245
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.6204046010971069,
      "grpo_mean_advantage": 4.470348358154297e-08,
      "grpo_mean_group_score": 0.6196198463439941,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 5.315724820320611e-07,
      "learning_rate": 2.722921951984927e-06,
      "loss": -0.2232,
      "step": 250
    },
    {
      "epoch": 1.02002002002002,
      "grad_norm": 0.8389026522636414,
      "grpo_mean_advantage": 9.290873776990338e-07,
      "grpo_mean_group_score": 0.582168459892273,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 4.219644324621186e-06,
      "learning_rate": 2.640470915079614e-06,
      "loss": -0.1363,
      "step": 255
    },
    {
      "epoch": 1.04004004004004,
      "grad_norm": 0.9067686796188354,
      "grpo_mean_advantage": 2.533197474008375e-08,
      "grpo_mean_group_score": 0.5551307797431946,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 1.6600588992332632e-07,
      "learning_rate": 2.557866275291035e-06,
      "loss": -0.1868,
      "step": 260
    },
    {
      "epoch": 1.06006006006006,
      "grad_norm": 0.9277902841567993,
      "grpo_mean_advantage": -5.662441182607836e-08,
      "grpo_mean_group_score": 0.535040020942688,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 1.0909400316450046e-06,
      "learning_rate": 2.4751983595800093e-06,
      "loss": -0.1792,
      "step": 265
    },
    {
      "epoch": 1.08008008008008,
      "grad_norm": 1.0715463161468506,
      "grpo_mean_advantage": -9.536743306171047e-08,
      "grpo_mean_group_score": 0.5673571825027466,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 5.838213610331877e-07,
      "learning_rate": 2.392557564098649e-06,
      "loss": -0.1691,
      "step": 270
    },
    {
      "epoch": 1.1001001001001,
      "grad_norm": 0.7759184837341309,
      "grpo_mean_advantage": 3.278255533700758e-08,
      "grpo_mean_group_score": 0.5874732732772827,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 9.317170679423725e-07,
      "learning_rate": 2.3100342553434924e-06,
      "loss": -0.1655,
      "step": 275
    },
    {
      "epoch": 1.12012012012012,
      "grad_norm": 0.9387398958206177,
      "grpo_mean_advantage": -1.206994113545079e-07,
      "grpo_mean_group_score": 0.5569106340408325,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 6.201085511747806e-07,
      "learning_rate": 2.2277186713410688e-06,
      "loss": -0.1821,
      "step": 280
    },
    {
      "epoch": 1.14014014014014,
      "grad_norm": 1.6132302284240723,
      "grpo_mean_advantage": 4.470348358154297e-08,
      "grpo_mean_group_score": 0.5578873157501221,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 6.115651558502577e-07,
      "learning_rate": 2.1457008229739395e-06,
      "loss": -0.2102,
      "step": 285
    },
    {
      "epoch": 1.16016016016016,
      "grad_norm": 0.8679026961326599,
      "grpo_mean_advantage": -3.3453108017056365e-07,
      "grpo_mean_group_score": 0.5735999345779419,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 3.5326345368957845e-06,
      "learning_rate": 2.0640703955551214e-06,
      "loss": -0.2937,
      "step": 290
    },
    {
      "epoch": 1.1801801801801801,
      "grad_norm": 1.0550166368484497,
      "grpo_mean_advantage": -1.110136480519941e-07,
      "grpo_mean_group_score": 0.5626259446144104,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 4.731904823529476e-07,
      "learning_rate": 1.9829166507585084e-06,
      "loss": -0.2598,
      "step": 295
    },
    {
      "epoch": 1.2002002002002001,
      "grad_norm": 1.2819372415542603,
      "grpo_mean_advantage": -5.08874677507265e-07,
      "grpo_mean_group_score": 0.5463050603866577,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 1.840126174101897e-06,
      "learning_rate": 1.90232832901255e-06,
      "loss": -0.2546,
      "step": 300
    },
    {
      "epoch": 1.2202202202202201,
      "grad_norm": 1.0188143253326416,
      "grpo_mean_advantage": 1.01327898960335e-07,
      "grpo_mean_group_score": 0.5352144241333008,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 7.798533943059738e-07,
      "learning_rate": 1.82239355246389e-06,
      "loss": -0.1809,
      "step": 305
    },
    {
      "epoch": 1.2402402402402402,
      "grad_norm": 2.0709052085876465,
      "grpo_mean_advantage": 1.341104507446289e-07,
      "grpo_mean_group_score": 0.5547868013381958,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 7.821902840987605e-07,
      "learning_rate": 1.7431997286170923e-06,
      "loss": -0.3559,
      "step": 310
    },
    {
      "epoch": 1.2602602602602602,
      "grad_norm": 1.8516215085983276,
      "grpo_mean_advantage": 9.015202806494926e-08,
      "grpo_mean_group_score": 0.5859472751617432,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 1.0693488547985908e-06,
      "learning_rate": 1.6648334547558227e-06,
      "loss": -0.3874,
      "step": 315
    },
    {
      "epoch": 1.2802802802802802,
      "grad_norm": 1.283104419708252,
      "grpo_mean_advantage": -2.443790378947597e-07,
      "grpo_mean_group_score": 0.5751550793647766,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 1.183122208203713e-06,
      "learning_rate": 1.5873804232499862e-06,
      "loss": -0.3467,
      "step": 320
    },
    {
      "epoch": 1.3003003003003002,
      "grad_norm": 1.4108576774597168,
      "grpo_mean_advantage": -6.705522537231445e-08,
      "grpo_mean_group_score": 0.5497723817825317,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 6.109748937888071e-07,
      "learning_rate": 1.51092532785238e-06,
      "loss": -0.1703,
      "step": 325
    },
    {
      "epoch": 1.3203203203203202,
      "grad_norm": 1.0421361923217773,
      "grpo_mean_advantage": -1.639127766850379e-08,
      "grpo_mean_group_score": 0.55989670753479,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 5.529495297196263e-07,
      "learning_rate": 1.4355517710873184e-06,
      "loss": -0.2918,
      "step": 330
    },
    {
      "epoch": 1.3403403403403402,
      "grad_norm": 1.3465828895568848,
      "grpo_mean_advantage": 4.418194237132411e-07,
      "grpo_mean_group_score": 0.5809233784675598,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 2.9275292945385445e-06,
      "learning_rate": 1.361342172832502e-06,
      "loss": -0.3069,
      "step": 335
    },
    {
      "epoch": 1.3603603603603602,
      "grad_norm": 1.1959459781646729,
      "grpo_mean_advantage": 9.685754776000977e-08,
      "grpo_mean_group_score": 0.5568087100982666,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 3.754235251562932e-07,
      "learning_rate": 1.2883776801940884e-06,
      "loss": -0.5594,
      "step": 340
    },
    {
      "epoch": 1.3803803803803802,
      "grad_norm": 1.8967422246932983,
      "grpo_mean_advantage": -2.384185791015625e-07,
      "grpo_mean_group_score": 0.5655568838119507,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 6.821086913078034e-07,
      "learning_rate": 1.216738078773522e-06,
      "loss": -0.4102,
      "step": 345
    },
    {
      "epoch": 1.4004004004004005,
      "grad_norm": 2.221132755279541,
      "grpo_mean_advantage": -8.717179156292332e-08,
      "grpo_mean_group_score": 0.6089578866958618,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 2.500940354366321e-06,
      "learning_rate": 1.146501705423155e-06,
      "loss": -0.338,
      "step": 350
    },
    {
      "epoch": 1.4204204204204205,
      "grad_norm": 2.3640377521514893,
      "grpo_mean_advantage": 2.1606683731079102e-07,
      "grpo_mean_group_score": 0.6129671335220337,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 1.4568390724889468e-06,
      "learning_rate": 1.0777453625860474e-06,
      "loss": -0.4985,
      "step": 355
    },
    {
      "epoch": 1.4404404404404405,
      "grad_norm": 1.9084734916687012,
      "grpo_mean_advantage": -3.725290298461914e-09,
      "grpo_mean_group_score": 0.5562310814857483,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 2.965894054796081e-06,
      "learning_rate": 1.0105442343136184e-06,
      "loss": -0.4347,
      "step": 360
    },
    {
      "epoch": 1.4604604604604605,
      "grad_norm": 1.6063904762268066,
      "grpo_mean_advantage": 4.313886279305734e-07,
      "grpo_mean_group_score": 0.5884170532226562,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 1.9621948013082147e-06,
      "learning_rate": 9.449718040529987e-07,
      "loss": -0.6217,
      "step": 365
    },
    {
      "epoch": 1.4804804804804805,
      "grad_norm": 2.114664077758789,
      "grpo_mean_advantage": 2.0489096641540527e-07,
      "grpo_mean_group_score": 0.5795440673828125,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 1.0235522722723545e-06,
      "learning_rate": 8.810997742939531e-07,
      "loss": -0.5364,
      "step": 370
    },
    {
      "epoch": 1.5005005005005005,
      "grad_norm": 1.8450465202331543,
      "grpo_mean_advantage": -1.4185905001795618e-06,
      "grpo_mean_group_score": 0.5607603788375854,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 1.0947338523692451e-05,
      "learning_rate": 8.189979881632634e-07,
      "loss": -0.4798,
      "step": 375
    },
    {
      "epoch": 1.5205205205205206,
      "grad_norm": 2.673438787460327,
      "grpo_mean_advantage": -1.758337049295733e-07,
      "grpo_mean_group_score": 0.5381432771682739,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 9.663675655247062e-07,
      "learning_rate": 7.587343530522945e-07,
      "loss": -0.4805,
      "step": 380
    },
    {
      "epoch": 1.5405405405405406,
      "grad_norm": 2.2263550758361816,
      "grpo_mean_advantage": -6.973743325033865e-07,
      "grpo_mean_group_score": 0.5528443455696106,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 4.341973180999048e-06,
      "learning_rate": 7.003747663612581e-07,
      "loss": -0.433,
      "step": 385
    },
    {
      "epoch": 1.5605605605605606,
      "grad_norm": 2.3657093048095703,
      "grpo_mean_advantage": 1.7881394143159923e-08,
      "grpo_mean_group_score": 0.6091476678848267,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 1.3004198251564958e-07,
      "learning_rate": 6.439830434413754e-07,
      "loss": -0.6021,
      "step": 390
    },
    {
      "epoch": 1.5805805805805806,
      "grad_norm": 1.9847129583358765,
      "grpo_mean_advantage": 3.4868716625169327e-07,
      "grpo_mean_group_score": 0.5397372245788574,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 2.059372718576924e-06,
      "learning_rate": 5.896208478137222e-07,
      "loss": -0.5595,
      "step": 395
    },
    {
      "epoch": 1.6006006006006006,
      "grad_norm": 2.922114133834839,
      "grpo_mean_advantage": -2.1636485598719446e-06,
      "grpo_mean_group_score": 0.5873125195503235,
      "grpo_mean_kl_div": 0.0,
      "grpo_std_advantage": 9.725940799398813e-06,
      "learning_rate": 5.373476237410808e-07,
      "loss": -0.5592,
      "step": 400
    }
  ],
  "logging_steps": 5,
  "max_steps": 500,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}