Dishaaa25 commited on
Commit
463023c
·
verified ·
1 Parent(s): 5289413

Update latest checkpoint for run 15940d1d-7d8c-4253-8810-2ea934bedee4 at step 950

Browse files
Files changed (5) hide show
  1. adapter_model.safetensors +1 -1
  2. optimizer.pt +1 -1
  3. rng_state.pth +1 -1
  4. scheduler.pt +1 -1
  5. trainer_state.json +1304 -4
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:906b506f4c0383c1f1ba52d17f82532596abb69efc02191ab16329eeac17f7dd
3
  size 29529752
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:48b94cd8e622ebe3562c1ea9fa75f6468afc735c44a6e72504298526587d48b2
3
  size 29529752
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a3790eb8fbfb29004b2d9709c773c5045988aab8ea3d2f4e0602b72066aa8d13
3
  size 59228491
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:251f0a4a035e02d9e4cf95f08d80e5e54486d715ef25d979334aba0c383c50fc
3
  size 59228491
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c3769e5d1e21d9cba6aa0fcd44f4362dd224662b41dfa88a576d9f532290b714
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:67cc8782dca6a2b07ec2b12d381573b2d3e7495fb1ce4a789f64f1147416fa5b
3
  size 14645
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e0072f2ff918bd7bece9f1d5c143aaa2e59d66888d4f44a87dd988c2a5835002
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d8f5a26ca81b45073ff90f001b3afe6c706ea419b3b9905df91151ee7ee1fc8a
3
  size 1465
trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.7578125,
6
  "eval_steps": 500,
7
- "global_step": 900,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -23408,11 +23408,1311 @@
23408
  "rewards/reward_func/mean": 0.7445999979972839,
23409
  "rewards/reward_func/std": 0.45967379212379456,
23410
  "step": 900
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23411
  }
23412
  ],
23413
  "logging_steps": 1,
23414
  "max_steps": 950,
23415
- "num_input_tokens_seen": 4238976,
23416
  "num_train_epochs": 2,
23417
  "save_steps": 50,
23418
  "stateful_callbacks": {
@@ -23422,7 +24722,7 @@
23422
  "should_evaluate": false,
23423
  "should_log": false,
23424
  "should_save": true,
23425
- "should_training_stop": false
23426
  },
23427
  "attributes": {}
23428
  }
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.85546875,
6
  "eval_steps": 500,
7
+ "global_step": 950,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
23408
  "rewards/reward_func/mean": 0.7445999979972839,
23409
  "rewards/reward_func/std": 0.45967379212379456,
23410
  "step": 900
23411
+ },
23412
+ {
23413
+ "clip_ratio/high_max": 0.0,
23414
+ "clip_ratio/high_mean": 0.0,
23415
+ "clip_ratio/low_mean": 0.0,
23416
+ "clip_ratio/low_min": 0.0,
23417
+ "clip_ratio/region_mean": 0.0,
23418
+ "completions/clipped_ratio": 1.0,
23419
+ "completions/max_length": 384.0,
23420
+ "completions/max_terminated_length": 0.0,
23421
+ "completions/mean_length": 384.0,
23422
+ "completions/mean_terminated_length": 0.0,
23423
+ "completions/min_length": 384.0,
23424
+ "completions/min_terminated_length": 0.0,
23425
+ "entropy": 0.8848395217210054,
23426
+ "epoch": 1.759765625,
23427
+ "frac_reward_zero_std": 0.0,
23428
+ "grad_norm": 0.2501536011695862,
23429
+ "learning_rate": 2.6315789473684213e-07,
23430
+ "loss": 0.0,
23431
+ "num_tokens": 4243728.0,
23432
+ "reward": 0.73808753490448,
23433
+ "reward_std": 0.4636053442955017,
23434
+ "rewards/reward_func/mean": 0.7380874752998352,
23435
+ "rewards/reward_func/std": 0.4300731420516968,
23436
+ "step": 901
23437
+ },
23438
+ {
23439
+ "clip_ratio/high_max": 0.0,
23440
+ "clip_ratio/high_mean": 0.0,
23441
+ "clip_ratio/low_mean": 0.0,
23442
+ "clip_ratio/low_min": 0.0,
23443
+ "clip_ratio/region_mean": 0.0,
23444
+ "completions/clipped_ratio": 1.0,
23445
+ "completions/max_length": 384.0,
23446
+ "completions/max_terminated_length": 0.0,
23447
+ "completions/mean_length": 384.0,
23448
+ "completions/mean_terminated_length": 0.0,
23449
+ "completions/min_length": 384.0,
23450
+ "completions/min_terminated_length": 0.0,
23451
+ "entropy": 0.5995431952178478,
23452
+ "epoch": 1.76171875,
23453
+ "frac_reward_zero_std": 0.5,
23454
+ "grad_norm": 0.1700550615787506,
23455
+ "learning_rate": 2.578947368421053e-07,
23456
+ "loss": -0.0,
23457
+ "num_tokens": 4248720.0,
23458
+ "reward": 0.2184000015258789,
23459
+ "reward_std": 0.2522551119327545,
23460
+ "rewards/reward_func/mean": 0.2184000015258789,
23461
+ "rewards/reward_func/std": 0.4044714868068695,
23462
+ "step": 902
23463
+ },
23464
+ {
23465
+ "clip_ratio/high_max": 0.0,
23466
+ "clip_ratio/high_mean": 0.0,
23467
+ "clip_ratio/low_mean": 0.0,
23468
+ "clip_ratio/low_min": 0.0,
23469
+ "clip_ratio/region_mean": 0.0,
23470
+ "completions/clipped_ratio": 1.0,
23471
+ "completions/max_length": 384.0,
23472
+ "completions/max_terminated_length": 0.0,
23473
+ "completions/mean_length": 384.0,
23474
+ "completions/mean_terminated_length": 0.0,
23475
+ "completions/min_length": 384.0,
23476
+ "completions/min_terminated_length": 0.0,
23477
+ "entropy": 0.8698443882167339,
23478
+ "epoch": 1.763671875,
23479
+ "frac_reward_zero_std": 0.0,
23480
+ "grad_norm": 0.2777104675769806,
23481
+ "learning_rate": 2.5263157894736846e-07,
23482
+ "loss": -0.0,
23483
+ "num_tokens": 4253256.0,
23484
+ "reward": 0.504687488079071,
23485
+ "reward_std": 0.5720410346984863,
23486
+ "rewards/reward_func/mean": 0.504687488079071,
23487
+ "rewards/reward_func/std": 0.5296536087989807,
23488
+ "step": 903
23489
+ },
23490
+ {
23491
+ "clip_ratio/high_max": 0.0,
23492
+ "clip_ratio/high_mean": 0.0,
23493
+ "clip_ratio/low_mean": 0.0,
23494
+ "clip_ratio/low_min": 0.0,
23495
+ "clip_ratio/region_mean": 0.0,
23496
+ "completions/clipped_ratio": 1.0,
23497
+ "completions/max_length": 384.0,
23498
+ "completions/max_terminated_length": 0.0,
23499
+ "completions/mean_length": 384.0,
23500
+ "completions/mean_terminated_length": 0.0,
23501
+ "completions/min_length": 384.0,
23502
+ "completions/min_terminated_length": 0.0,
23503
+ "entropy": 0.7046304792165756,
23504
+ "epoch": 1.765625,
23505
+ "frac_reward_zero_std": 0.0,
23506
+ "grad_norm": 0.23841030895709991,
23507
+ "learning_rate": 2.473684210526316e-07,
23508
+ "loss": 0.0,
23509
+ "num_tokens": 4258024.0,
23510
+ "reward": 0.6187999844551086,
23511
+ "reward_std": 0.5329432487487793,
23512
+ "rewards/reward_func/mean": 0.6187999844551086,
23513
+ "rewards/reward_func/std": 0.5125207901000977,
23514
+ "step": 904
23515
+ },
23516
+ {
23517
+ "clip_ratio/high_max": 0.0,
23518
+ "clip_ratio/high_mean": 0.0,
23519
+ "clip_ratio/low_mean": 0.0,
23520
+ "clip_ratio/low_min": 0.0,
23521
+ "clip_ratio/region_mean": 0.0,
23522
+ "completions/clipped_ratio": 1.0,
23523
+ "completions/max_length": 384.0,
23524
+ "completions/max_terminated_length": 0.0,
23525
+ "completions/mean_length": 384.0,
23526
+ "completions/mean_terminated_length": 0.0,
23527
+ "completions/min_length": 384.0,
23528
+ "completions/min_terminated_length": 0.0,
23529
+ "entropy": 0.7159217670559883,
23530
+ "epoch": 1.767578125,
23531
+ "frac_reward_zero_std": 0.0,
23532
+ "grad_norm": 0.2175939530134201,
23533
+ "learning_rate": 2.4210526315789473e-07,
23534
+ "loss": 0.0,
23535
+ "num_tokens": 4263024.0,
23536
+ "reward": 0.3587000072002411,
23537
+ "reward_std": 0.5176094770431519,
23538
+ "rewards/reward_func/mean": 0.3587000072002411,
23539
+ "rewards/reward_func/std": 0.4951927065849304,
23540
+ "step": 905
23541
+ },
23542
+ {
23543
+ "clip_ratio/high_max": 0.0,
23544
+ "clip_ratio/high_mean": 0.0,
23545
+ "clip_ratio/low_mean": 0.0,
23546
+ "clip_ratio/low_min": 0.0,
23547
+ "clip_ratio/region_mean": 0.0,
23548
+ "completions/clipped_ratio": 1.0,
23549
+ "completions/max_length": 384.0,
23550
+ "completions/max_terminated_length": 0.0,
23551
+ "completions/mean_length": 384.0,
23552
+ "completions/mean_terminated_length": 0.0,
23553
+ "completions/min_length": 384.0,
23554
+ "completions/min_terminated_length": 0.0,
23555
+ "entropy": 0.4197417329996824,
23556
+ "epoch": 1.76953125,
23557
+ "frac_reward_zero_std": 0.0,
23558
+ "grad_norm": 0.18832392990589142,
23559
+ "learning_rate": 2.3684210526315792e-07,
23560
+ "loss": 0.0,
23561
+ "num_tokens": 4267968.0,
23562
+ "reward": 0.012500000186264515,
23563
+ "reward_std": 0.014433757402002811,
23564
+ "rewards/reward_func/mean": 0.012500000186264515,
23565
+ "rewards/reward_func/std": 0.013363063335418701,
23566
+ "step": 906
23567
+ },
23568
+ {
23569
+ "clip_ratio/high_max": 0.0,
23570
+ "clip_ratio/high_mean": 0.0,
23571
+ "clip_ratio/low_mean": 0.0,
23572
+ "clip_ratio/low_min": 0.0,
23573
+ "clip_ratio/region_mean": 0.0,
23574
+ "completions/clipped_ratio": 1.0,
23575
+ "completions/max_length": 384.0,
23576
+ "completions/max_terminated_length": 0.0,
23577
+ "completions/mean_length": 384.0,
23578
+ "completions/mean_terminated_length": 0.0,
23579
+ "completions/min_length": 384.0,
23580
+ "completions/min_terminated_length": 0.0,
23581
+ "entropy": 0.7613006345927715,
23582
+ "epoch": 1.771484375,
23583
+ "frac_reward_zero_std": 0.5,
23584
+ "grad_norm": 0.17875666916370392,
23585
+ "learning_rate": 2.315789473684211e-07,
23586
+ "loss": -0.0,
23587
+ "num_tokens": 4272328.0,
23588
+ "reward": 0.9947500228881836,
23589
+ "reward_std": 0.010500003583729267,
23590
+ "rewards/reward_func/mean": 0.9947500228881836,
23591
+ "rewards/reward_func/std": 0.014849240891635418,
23592
+ "step": 907
23593
+ },
23594
+ {
23595
+ "clip_ratio/high_max": 0.0,
23596
+ "clip_ratio/high_mean": 0.0,
23597
+ "clip_ratio/low_mean": 0.0,
23598
+ "clip_ratio/low_min": 0.0,
23599
+ "clip_ratio/region_mean": 0.0,
23600
+ "completions/clipped_ratio": 1.0,
23601
+ "completions/max_length": 384.0,
23602
+ "completions/max_terminated_length": 0.0,
23603
+ "completions/mean_length": 384.0,
23604
+ "completions/mean_terminated_length": 0.0,
23605
+ "completions/min_length": 384.0,
23606
+ "completions/min_terminated_length": 0.0,
23607
+ "entropy": 0.7562070265412331,
23608
+ "epoch": 1.7734375,
23609
+ "frac_reward_zero_std": 0.0,
23610
+ "grad_norm": 0.2620117664337158,
23611
+ "learning_rate": 2.2631578947368425e-07,
23612
+ "loss": 0.0,
23613
+ "num_tokens": 4276768.0,
23614
+ "reward": 0.6265624761581421,
23615
+ "reward_std": 0.5368822813034058,
23616
+ "rewards/reward_func/mean": 0.6265624761581421,
23617
+ "rewards/reward_func/std": 0.515407145023346,
23618
+ "step": 908
23619
+ },
23620
+ {
23621
+ "clip_ratio/high_max": 0.0,
23622
+ "clip_ratio/high_mean": 0.0,
23623
+ "clip_ratio/low_mean": 0.0,
23624
+ "clip_ratio/low_min": 0.0,
23625
+ "clip_ratio/region_mean": 0.0,
23626
+ "completions/clipped_ratio": 1.0,
23627
+ "completions/max_length": 384.0,
23628
+ "completions/max_terminated_length": 0.0,
23629
+ "completions/mean_length": 384.0,
23630
+ "completions/mean_terminated_length": 0.0,
23631
+ "completions/min_length": 384.0,
23632
+ "completions/min_terminated_length": 0.0,
23633
+ "entropy": 0.735223077237606,
23634
+ "epoch": 1.775390625,
23635
+ "frac_reward_zero_std": 0.0,
23636
+ "grad_norm": 0.24371092021465302,
23637
+ "learning_rate": 2.2105263157894736e-07,
23638
+ "loss": -0.0,
23639
+ "num_tokens": 4281480.0,
23640
+ "reward": 0.9846999645233154,
23641
+ "reward_std": 0.03060000017285347,
23642
+ "rewards/reward_func/mean": 0.9846999645233154,
23643
+ "rewards/reward_func/std": 0.030131708830595016,
23644
+ "step": 909
23645
+ },
23646
+ {
23647
+ "clip_ratio/high_max": 0.0,
23648
+ "clip_ratio/high_mean": 0.0,
23649
+ "clip_ratio/low_mean": 0.0,
23650
+ "clip_ratio/low_min": 0.0,
23651
+ "clip_ratio/region_mean": 0.0,
23652
+ "completions/clipped_ratio": 1.0,
23653
+ "completions/max_length": 384.0,
23654
+ "completions/max_terminated_length": 0.0,
23655
+ "completions/mean_length": 384.0,
23656
+ "completions/mean_terminated_length": 0.0,
23657
+ "completions/min_length": 384.0,
23658
+ "completions/min_terminated_length": 0.0,
23659
+ "entropy": 0.888181284070015,
23660
+ "epoch": 1.77734375,
23661
+ "frac_reward_zero_std": 0.0,
23662
+ "grad_norm": 0.2600753605365753,
23663
+ "learning_rate": 2.1578947368421053e-07,
23664
+ "loss": 0.0,
23665
+ "num_tokens": 4285840.0,
23666
+ "reward": 0.5,
23667
+ "reward_std": 0.5773502588272095,
23668
+ "rewards/reward_func/mean": 0.5,
23669
+ "rewards/reward_func/std": 0.5345224738121033,
23670
+ "step": 910
23671
+ },
23672
+ {
23673
+ "clip_ratio/high_max": 0.0,
23674
+ "clip_ratio/high_mean": 0.0,
23675
+ "clip_ratio/low_mean": 0.0,
23676
+ "clip_ratio/low_min": 0.0,
23677
+ "clip_ratio/region_mean": 0.0,
23678
+ "completions/clipped_ratio": 1.0,
23679
+ "completions/max_length": 384.0,
23680
+ "completions/max_terminated_length": 0.0,
23681
+ "completions/mean_length": 384.0,
23682
+ "completions/mean_terminated_length": 0.0,
23683
+ "completions/min_length": 384.0,
23684
+ "completions/min_terminated_length": 0.0,
23685
+ "entropy": 0.6173073314130306,
23686
+ "epoch": 1.779296875,
23687
+ "frac_reward_zero_std": 0.0,
23688
+ "grad_norm": 0.22857871651649475,
23689
+ "learning_rate": 2.105263157894737e-07,
23690
+ "loss": -0.0,
23691
+ "num_tokens": 4290496.0,
23692
+ "reward": 0.37229999899864197,
23693
+ "reward_std": 0.5355914831161499,
23694
+ "rewards/reward_func/mean": 0.37229999899864197,
23695
+ "rewards/reward_func/std": 0.5138660669326782,
23696
+ "step": 911
23697
+ },
23698
+ {
23699
+ "clip_ratio/high_max": 0.0,
23700
+ "clip_ratio/high_mean": 0.0,
23701
+ "clip_ratio/low_mean": 0.0,
23702
+ "clip_ratio/low_min": 0.0,
23703
+ "clip_ratio/region_mean": 0.0,
23704
+ "completions/clipped_ratio": 1.0,
23705
+ "completions/max_length": 384.0,
23706
+ "completions/max_terminated_length": 0.0,
23707
+ "completions/mean_length": 384.0,
23708
+ "completions/mean_terminated_length": 0.0,
23709
+ "completions/min_length": 384.0,
23710
+ "completions/min_terminated_length": 0.0,
23711
+ "entropy": 0.5189631078392267,
23712
+ "epoch": 1.78125,
23713
+ "frac_reward_zero_std": 0.5,
23714
+ "grad_norm": 0.14599941670894623,
23715
+ "learning_rate": 2.0526315789473685e-07,
23716
+ "loss": -0.0,
23717
+ "num_tokens": 4295008.0,
23718
+ "reward": 0.8134000301361084,
23719
+ "reward_std": 0.23240000009536743,
23720
+ "rewards/reward_func/mean": 0.8134000301361084,
23721
+ "rewards/reward_func/std": 0.3286632299423218,
23722
+ "step": 912
23723
+ },
23724
+ {
23725
+ "clip_ratio/high_max": 0.0,
23726
+ "clip_ratio/high_mean": 0.0,
23727
+ "clip_ratio/low_mean": 0.0,
23728
+ "clip_ratio/low_min": 0.0,
23729
+ "clip_ratio/region_mean": 0.0,
23730
+ "completions/clipped_ratio": 1.0,
23731
+ "completions/max_length": 384.0,
23732
+ "completions/max_terminated_length": 0.0,
23733
+ "completions/mean_length": 384.0,
23734
+ "completions/mean_terminated_length": 0.0,
23735
+ "completions/min_length": 384.0,
23736
+ "completions/min_terminated_length": 0.0,
23737
+ "entropy": 0.8033823296427727,
23738
+ "epoch": 1.783203125,
23739
+ "frac_reward_zero_std": 0.5,
23740
+ "grad_norm": 0.18053551018238068,
23741
+ "learning_rate": 2.0000000000000002e-07,
23742
+ "loss": 0.0,
23743
+ "num_tokens": 4299768.0,
23744
+ "reward": 0.8362500071525574,
23745
+ "reward_std": 0.22793914377689362,
23746
+ "rewards/reward_func/mean": 0.8362500071525574,
23747
+ "rewards/reward_func/std": 0.3183859884738922,
23748
+ "step": 913
23749
+ },
23750
+ {
23751
+ "clip_ratio/high_max": 0.0,
23752
+ "clip_ratio/high_mean": 0.0,
23753
+ "clip_ratio/low_mean": 0.0,
23754
+ "clip_ratio/low_min": 0.0,
23755
+ "clip_ratio/region_mean": 0.0,
23756
+ "completions/clipped_ratio": 1.0,
23757
+ "completions/max_length": 384.0,
23758
+ "completions/max_terminated_length": 0.0,
23759
+ "completions/mean_length": 384.0,
23760
+ "completions/mean_terminated_length": 0.0,
23761
+ "completions/min_length": 384.0,
23762
+ "completions/min_terminated_length": 0.0,
23763
+ "entropy": 0.6970736794173717,
23764
+ "epoch": 1.78515625,
23765
+ "frac_reward_zero_std": 0.5,
23766
+ "grad_norm": 0.16842643916606903,
23767
+ "learning_rate": 1.9473684210526318e-07,
23768
+ "loss": -0.0,
23769
+ "num_tokens": 4304520.0,
23770
+ "reward": 0.8224999904632568,
23771
+ "reward_std": 0.23499999940395355,
23772
+ "rewards/reward_func/mean": 0.8224999904632568,
23773
+ "rewards/reward_func/std": 0.33234018087387085,
23774
+ "step": 914
23775
+ },
23776
+ {
23777
+ "clip_ratio/high_max": 0.0,
23778
+ "clip_ratio/high_mean": 0.0,
23779
+ "clip_ratio/low_mean": 0.0,
23780
+ "clip_ratio/low_min": 0.0,
23781
+ "clip_ratio/region_mean": 0.0,
23782
+ "completions/clipped_ratio": 1.0,
23783
+ "completions/max_length": 384.0,
23784
+ "completions/max_terminated_length": 0.0,
23785
+ "completions/mean_length": 384.0,
23786
+ "completions/mean_terminated_length": 0.0,
23787
+ "completions/min_length": 384.0,
23788
+ "completions/min_terminated_length": 0.0,
23789
+ "entropy": 0.6607658788561821,
23790
+ "epoch": 1.787109375,
23791
+ "frac_reward_zero_std": 0.0,
23792
+ "grad_norm": 0.22088974714279175,
23793
+ "learning_rate": 1.8947368421052634e-07,
23794
+ "loss": 0.0,
23795
+ "num_tokens": 4309296.0,
23796
+ "reward": 0.7371000051498413,
23797
+ "reward_std": 0.28240811824798584,
23798
+ "rewards/reward_func/mean": 0.7371000051498413,
23799
+ "rewards/reward_func/std": 0.4553808271884918,
23800
+ "step": 915
23801
+ },
23802
+ {
23803
+ "clip_ratio/high_max": 0.0,
23804
+ "clip_ratio/high_mean": 0.0,
23805
+ "clip_ratio/low_mean": 0.0,
23806
+ "clip_ratio/low_min": 0.0,
23807
+ "clip_ratio/region_mean": 0.0,
23808
+ "completions/clipped_ratio": 1.0,
23809
+ "completions/max_length": 384.0,
23810
+ "completions/max_terminated_length": 0.0,
23811
+ "completions/mean_length": 384.0,
23812
+ "completions/mean_terminated_length": 0.0,
23813
+ "completions/min_length": 384.0,
23814
+ "completions/min_terminated_length": 0.0,
23815
+ "entropy": 0.6025678031146526,
23816
+ "epoch": 1.7890625,
23817
+ "frac_reward_zero_std": 0.0,
23818
+ "grad_norm": 0.2665260136127472,
23819
+ "learning_rate": 1.8421052631578948e-07,
23820
+ "loss": 0.0,
23821
+ "num_tokens": 4314088.0,
23822
+ "reward": 0.8602499961853027,
23823
+ "reward_std": 0.24355539679527283,
23824
+ "rewards/reward_func/mean": 0.8602499961853027,
23825
+ "rewards/reward_func/std": 0.32780489325523376,
23826
+ "step": 916
23827
+ },
23828
+ {
23829
+ "clip_ratio/high_max": 0.0,
23830
+ "clip_ratio/high_mean": 0.0,
23831
+ "clip_ratio/low_mean": 0.0,
23832
+ "clip_ratio/low_min": 0.0,
23833
+ "clip_ratio/region_mean": 0.0,
23834
+ "completions/clipped_ratio": 1.0,
23835
+ "completions/max_length": 384.0,
23836
+ "completions/max_terminated_length": 0.0,
23837
+ "completions/mean_length": 384.0,
23838
+ "completions/mean_terminated_length": 0.0,
23839
+ "completions/min_length": 384.0,
23840
+ "completions/min_terminated_length": 0.0,
23841
+ "entropy": 0.421773798763752,
23842
+ "epoch": 1.791015625,
23843
+ "frac_reward_zero_std": 0.5,
23844
+ "grad_norm": 0.12458179891109467,
23845
+ "learning_rate": 1.7894736842105265e-07,
23846
+ "loss": 0.0,
23847
+ "num_tokens": 4319040.0,
23848
+ "reward": 0.0031250000465661287,
23849
+ "reward_std": 0.0062500000931322575,
23850
+ "rewards/reward_func/mean": 0.0031250000465661287,
23851
+ "rewards/reward_func/std": 0.008838835172355175,
23852
+ "step": 917
23853
+ },
23854
+ {
23855
+ "clip_ratio/high_max": 0.0,
23856
+ "clip_ratio/high_mean": 0.0,
23857
+ "clip_ratio/low_mean": 0.0,
23858
+ "clip_ratio/low_min": 0.0,
23859
+ "clip_ratio/region_mean": 0.0,
23860
+ "completions/clipped_ratio": 1.0,
23861
+ "completions/max_length": 384.0,
23862
+ "completions/max_terminated_length": 0.0,
23863
+ "completions/mean_length": 384.0,
23864
+ "completions/mean_terminated_length": 0.0,
23865
+ "completions/min_length": 384.0,
23866
+ "completions/min_terminated_length": 0.0,
23867
+ "entropy": 0.8600736558437347,
23868
+ "epoch": 1.79296875,
23869
+ "frac_reward_zero_std": 0.0,
23870
+ "grad_norm": 0.2437726855278015,
23871
+ "learning_rate": 1.7368421052631578e-07,
23872
+ "loss": -0.0,
23873
+ "num_tokens": 4323544.0,
23874
+ "reward": 0.5066750049591064,
23875
+ "reward_std": 0.4920022189617157,
23876
+ "rewards/reward_func/mean": 0.5066750049591064,
23877
+ "rewards/reward_func/std": 0.5217258930206299,
23878
+ "step": 918
23879
+ },
23880
+ {
23881
+ "clip_ratio/high_max": 0.0,
23882
+ "clip_ratio/high_mean": 0.0,
23883
+ "clip_ratio/low_mean": 0.0,
23884
+ "clip_ratio/low_min": 0.0,
23885
+ "clip_ratio/region_mean": 0.0,
23886
+ "completions/clipped_ratio": 1.0,
23887
+ "completions/max_length": 384.0,
23888
+ "completions/max_terminated_length": 0.0,
23889
+ "completions/mean_length": 384.0,
23890
+ "completions/mean_terminated_length": 0.0,
23891
+ "completions/min_length": 384.0,
23892
+ "completions/min_terminated_length": 0.0,
23893
+ "entropy": 0.7552273385226727,
23894
+ "epoch": 1.794921875,
23895
+ "frac_reward_zero_std": 0.0,
23896
+ "grad_norm": 0.23656296730041504,
23897
+ "learning_rate": 1.6842105263157895e-07,
23898
+ "loss": -0.0,
23899
+ "num_tokens": 4328408.0,
23900
+ "reward": 0.8660625219345093,
23901
+ "reward_std": 0.2540762424468994,
23902
+ "rewards/reward_func/mean": 0.8660625219345093,
23903
+ "rewards/reward_func/std": 0.34541285037994385,
23904
+ "step": 919
23905
+ },
23906
+ {
23907
+ "clip_ratio/high_max": 0.0,
23908
+ "clip_ratio/high_mean": 0.0,
23909
+ "clip_ratio/low_mean": 0.0,
23910
+ "clip_ratio/low_min": 0.0,
23911
+ "clip_ratio/region_mean": 0.0,
23912
+ "completions/clipped_ratio": 1.0,
23913
+ "completions/max_length": 384.0,
23914
+ "completions/max_terminated_length": 0.0,
23915
+ "completions/mean_length": 384.0,
23916
+ "completions/mean_terminated_length": 0.0,
23917
+ "completions/min_length": 384.0,
23918
+ "completions/min_terminated_length": 0.0,
23919
+ "entropy": 0.6802506037056446,
23920
+ "epoch": 1.796875,
23921
+ "frac_reward_zero_std": 0.5,
23922
+ "grad_norm": 0.1381627321243286,
23923
+ "learning_rate": 1.631578947368421e-07,
23924
+ "loss": 0.0,
23925
+ "num_tokens": 4333080.0,
23926
+ "reward": 0.875,
23927
+ "reward_std": 0.25,
23928
+ "rewards/reward_func/mean": 0.875,
23929
+ "rewards/reward_func/std": 0.3535533845424652,
23930
+ "step": 920
23931
+ },
23932
+ {
23933
+ "clip_ratio/high_max": 0.0,
23934
+ "clip_ratio/high_mean": 0.0,
23935
+ "clip_ratio/low_mean": 0.0,
23936
+ "clip_ratio/low_min": 0.0,
23937
+ "clip_ratio/region_mean": 0.0,
23938
+ "completions/clipped_ratio": 1.0,
23939
+ "completions/max_length": 384.0,
23940
+ "completions/max_terminated_length": 0.0,
23941
+ "completions/mean_length": 384.0,
23942
+ "completions/mean_terminated_length": 0.0,
23943
+ "completions/min_length": 384.0,
23944
+ "completions/min_terminated_length": 0.0,
23945
+ "entropy": 0.19835966220125556,
23946
+ "epoch": 1.798828125,
23947
+ "frac_reward_zero_std": 1.0,
23948
+ "grad_norm": 0.0,
23949
+ "learning_rate": 1.5789473684210527e-07,
23950
+ "loss": 0.0,
23951
+ "num_tokens": 4337592.0,
23952
+ "reward": 0.9296000003814697,
23953
+ "reward_std": 0.0,
23954
+ "rewards/reward_func/mean": 0.9296000003814697,
23955
+ "rewards/reward_func/std": 0.0,
23956
+ "step": 921
23957
+ },
23958
+ {
23959
+ "clip_ratio/high_max": 0.0,
23960
+ "clip_ratio/high_mean": 0.0,
23961
+ "clip_ratio/low_mean": 0.0,
23962
+ "clip_ratio/low_min": 0.0,
23963
+ "clip_ratio/region_mean": 0.0,
23964
+ "completions/clipped_ratio": 1.0,
23965
+ "completions/max_length": 384.0,
23966
+ "completions/max_terminated_length": 0.0,
23967
+ "completions/mean_length": 384.0,
23968
+ "completions/mean_terminated_length": 0.0,
23969
+ "completions/min_length": 384.0,
23970
+ "completions/min_terminated_length": 0.0,
23971
+ "entropy": 0.8462485000491142,
23972
+ "epoch": 1.80078125,
23973
+ "frac_reward_zero_std": 0.0,
23974
+ "grad_norm": 0.24740563333034515,
23975
+ "learning_rate": 1.5263157894736844e-07,
23976
+ "loss": 0.0,
23977
+ "num_tokens": 4342608.0,
23978
+ "reward": 0.47290000319480896,
23979
+ "reward_std": 0.5461318492889404,
23980
+ "rewards/reward_func/mean": 0.47290000319480896,
23981
+ "rewards/reward_func/std": 0.5058882236480713,
23982
+ "step": 922
23983
+ },
23984
+ {
23985
+ "clip_ratio/high_max": 0.0,
23986
+ "clip_ratio/high_mean": 0.0,
23987
+ "clip_ratio/low_mean": 0.0,
23988
+ "clip_ratio/low_min": 0.0,
23989
+ "clip_ratio/region_mean": 0.0,
23990
+ "completions/clipped_ratio": 1.0,
23991
+ "completions/max_length": 384.0,
23992
+ "completions/max_terminated_length": 0.0,
23993
+ "completions/mean_length": 384.0,
23994
+ "completions/mean_terminated_length": 0.0,
23995
+ "completions/min_length": 384.0,
23996
+ "completions/min_terminated_length": 0.0,
23997
+ "entropy": 0.4742927271872759,
23998
+ "epoch": 1.802734375,
23999
+ "frac_reward_zero_std": 0.0,
24000
+ "grad_norm": 1.2252196073532104,
24001
+ "learning_rate": 1.4736842105263158e-07,
24002
+ "loss": 0.0,
24003
+ "num_tokens": 4347248.0,
24004
+ "reward": 0.3668999969959259,
24005
+ "reward_std": 0.5270397663116455,
24006
+ "rewards/reward_func/mean": 0.3668999969959259,
24007
+ "rewards/reward_func/std": 0.5063701272010803,
24008
+ "step": 923
24009
+ },
24010
+ {
24011
+ "clip_ratio/high_max": 0.0,
24012
+ "clip_ratio/high_mean": 0.0,
24013
+ "clip_ratio/low_mean": 0.0,
24014
+ "clip_ratio/low_min": 0.0,
24015
+ "clip_ratio/region_mean": 0.0,
24016
+ "completions/clipped_ratio": 1.0,
24017
+ "completions/max_length": 384.0,
24018
+ "completions/max_terminated_length": 0.0,
24019
+ "completions/mean_length": 384.0,
24020
+ "completions/mean_terminated_length": 0.0,
24021
+ "completions/min_length": 384.0,
24022
+ "completions/min_terminated_length": 0.0,
24023
+ "entropy": 0.6519879586994648,
24024
+ "epoch": 1.8046875,
24025
+ "frac_reward_zero_std": 0.0,
24026
+ "grad_norm": 0.25893595814704895,
24027
+ "learning_rate": 1.4210526315789474e-07,
24028
+ "loss": -0.0,
24029
+ "num_tokens": 4352264.0,
24030
+ "reward": 0.23256249725818634,
24031
+ "reward_std": 0.4609765410423279,
24032
+ "rewards/reward_func/mean": 0.23256249725818634,
24033
+ "rewards/reward_func/std": 0.4271462559700012,
24034
+ "step": 924
24035
+ },
24036
+ {
24037
+ "clip_ratio/high_max": 0.0,
24038
+ "clip_ratio/high_mean": 0.0,
24039
+ "clip_ratio/low_mean": 0.0,
24040
+ "clip_ratio/low_min": 0.0,
24041
+ "clip_ratio/region_mean": 0.0,
24042
+ "completions/clipped_ratio": 1.0,
24043
+ "completions/max_length": 384.0,
24044
+ "completions/max_terminated_length": 0.0,
24045
+ "completions/mean_length": 384.0,
24046
+ "completions/mean_terminated_length": 0.0,
24047
+ "completions/min_length": 384.0,
24048
+ "completions/min_terminated_length": 0.0,
24049
+ "entropy": 0.8685908392071724,
24050
+ "epoch": 1.806640625,
24051
+ "frac_reward_zero_std": 0.0,
24052
+ "grad_norm": 0.24174730479717255,
24053
+ "learning_rate": 1.368421052631579e-07,
24054
+ "loss": -0.0,
24055
+ "num_tokens": 4356792.0,
24056
+ "reward": 0.6343749761581421,
24057
+ "reward_std": 0.5239908695220947,
24058
+ "rewards/reward_func/mean": 0.6343749761581421,
24059
+ "rewards/reward_func/std": 0.5047431588172913,
24060
+ "step": 925
24061
+ },
24062
+ {
24063
+ "clip_ratio/high_max": 0.0,
24064
+ "clip_ratio/high_mean": 0.0,
24065
+ "clip_ratio/low_mean": 0.0,
24066
+ "clip_ratio/low_min": 0.0,
24067
+ "clip_ratio/region_mean": 0.0,
24068
+ "completions/clipped_ratio": 1.0,
24069
+ "completions/max_length": 384.0,
24070
+ "completions/max_terminated_length": 0.0,
24071
+ "completions/mean_length": 384.0,
24072
+ "completions/mean_terminated_length": 0.0,
24073
+ "completions/min_length": 384.0,
24074
+ "completions/min_terminated_length": 0.0,
24075
+ "entropy": 0.584613062441349,
24076
+ "epoch": 1.80859375,
24077
+ "frac_reward_zero_std": 0.5,
24078
+ "grad_norm": 0.15689092874526978,
24079
+ "learning_rate": 1.3157894736842107e-07,
24080
+ "loss": 0.0,
24081
+ "num_tokens": 4361448.0,
24082
+ "reward": 0.760937511920929,
24083
+ "reward_std": 0.27662280201911926,
24084
+ "rewards/reward_func/mean": 0.760937511920929,
24085
+ "rewards/reward_func/std": 0.4432750344276428,
24086
+ "step": 926
24087
+ },
24088
+ {
24089
+ "clip_ratio/high_max": 0.0,
24090
+ "clip_ratio/high_mean": 0.0,
24091
+ "clip_ratio/low_mean": 0.0,
24092
+ "clip_ratio/low_min": 0.0,
24093
+ "clip_ratio/region_mean": 0.0,
24094
+ "completions/clipped_ratio": 1.0,
24095
+ "completions/max_length": 384.0,
24096
+ "completions/max_terminated_length": 0.0,
24097
+ "completions/mean_length": 384.0,
24098
+ "completions/mean_terminated_length": 0.0,
24099
+ "completions/min_length": 384.0,
24100
+ "completions/min_terminated_length": 0.0,
24101
+ "entropy": 0.6101187616586685,
24102
+ "epoch": 1.810546875,
24103
+ "frac_reward_zero_std": 0.5,
24104
+ "grad_norm": 0.17513932287693024,
24105
+ "learning_rate": 1.2631578947368423e-07,
24106
+ "loss": 0.0,
24107
+ "num_tokens": 4366136.0,
24108
+ "reward": 0.75,
24109
+ "reward_std": 0.28867512941360474,
24110
+ "rewards/reward_func/mean": 0.75,
24111
+ "rewards/reward_func/std": 0.4629100561141968,
24112
+ "step": 927
24113
+ },
24114
+ {
24115
+ "clip_ratio/high_max": 0.0,
24116
+ "clip_ratio/high_mean": 0.0,
24117
+ "clip_ratio/low_mean": 0.0,
24118
+ "clip_ratio/low_min": 0.0,
24119
+ "clip_ratio/region_mean": 0.0,
24120
+ "completions/clipped_ratio": 1.0,
24121
+ "completions/max_length": 384.0,
24122
+ "completions/max_terminated_length": 0.0,
24123
+ "completions/mean_length": 384.0,
24124
+ "completions/mean_terminated_length": 0.0,
24125
+ "completions/min_length": 384.0,
24126
+ "completions/min_terminated_length": 0.0,
24127
+ "entropy": 0.6578428782522678,
24128
+ "epoch": 1.8125,
24129
+ "frac_reward_zero_std": 0.0,
24130
+ "grad_norm": 0.19013863801956177,
24131
+ "learning_rate": 1.2105263157894737e-07,
24132
+ "loss": 0.0,
24133
+ "num_tokens": 4371112.0,
24134
+ "reward": 0.21320000290870667,
24135
+ "reward_std": 0.42640000581741333,
24136
+ "rewards/reward_func/mean": 0.21320000290870667,
24137
+ "rewards/reward_func/std": 0.3952178359031677,
24138
+ "step": 928
24139
+ },
24140
+ {
24141
+ "clip_ratio/high_max": 0.0,
24142
+ "clip_ratio/high_mean": 0.0,
24143
+ "clip_ratio/low_mean": 0.0,
24144
+ "clip_ratio/low_min": 0.0,
24145
+ "clip_ratio/region_mean": 0.0,
24146
+ "completions/clipped_ratio": 1.0,
24147
+ "completions/max_length": 384.0,
24148
+ "completions/max_terminated_length": 0.0,
24149
+ "completions/mean_length": 384.0,
24150
+ "completions/mean_terminated_length": 0.0,
24151
+ "completions/min_length": 384.0,
24152
+ "completions/min_terminated_length": 0.0,
24153
+ "entropy": 0.505165308713913,
24154
+ "epoch": 1.814453125,
24155
+ "frac_reward_zero_std": 0.0,
24156
+ "grad_norm": 0.9573839902877808,
24157
+ "learning_rate": 1.1578947368421054e-07,
24158
+ "loss": -0.0,
24159
+ "num_tokens": 4375760.0,
24160
+ "reward": 0.6168624758720398,
24161
+ "reward_std": 0.5261497497558594,
24162
+ "rewards/reward_func/mean": 0.6168625354766846,
24163
+ "rewards/reward_func/std": 0.5074918866157532,
24164
+ "step": 929
24165
+ },
24166
+ {
24167
+ "clip_ratio/high_max": 0.0,
24168
+ "clip_ratio/high_mean": 0.0,
24169
+ "clip_ratio/low_mean": 0.0,
24170
+ "clip_ratio/low_min": 0.0,
24171
+ "clip_ratio/region_mean": 0.0,
24172
+ "completions/clipped_ratio": 1.0,
24173
+ "completions/max_length": 384.0,
24174
+ "completions/max_terminated_length": 0.0,
24175
+ "completions/mean_length": 384.0,
24176
+ "completions/mean_terminated_length": 0.0,
24177
+ "completions/min_length": 384.0,
24178
+ "completions/min_terminated_length": 0.0,
24179
+ "entropy": 0.874191090464592,
24180
+ "epoch": 1.81640625,
24181
+ "frac_reward_zero_std": 0.0,
24182
+ "grad_norm": 0.2154332995414734,
24183
+ "learning_rate": 1.1052631578947368e-07,
24184
+ "loss": 0.0,
24185
+ "num_tokens": 4380264.0,
24186
+ "reward": 0.6285499930381775,
24187
+ "reward_std": 0.2513039708137512,
24188
+ "rewards/reward_func/mean": 0.6285499930381775,
24189
+ "rewards/reward_func/std": 0.5067988634109497,
24190
+ "step": 930
24191
+ },
24192
+ {
24193
+ "clip_ratio/high_max": 0.0,
24194
+ "clip_ratio/high_mean": 0.0,
24195
+ "clip_ratio/low_mean": 0.0,
24196
+ "clip_ratio/low_min": 0.0,
24197
+ "clip_ratio/region_mean": 0.0,
24198
+ "completions/clipped_ratio": 1.0,
24199
+ "completions/max_length": 384.0,
24200
+ "completions/max_terminated_length": 0.0,
24201
+ "completions/mean_length": 384.0,
24202
+ "completions/mean_terminated_length": 0.0,
24203
+ "completions/min_length": 384.0,
24204
+ "completions/min_terminated_length": 0.0,
24205
+ "entropy": 0.9102950617671013,
24206
+ "epoch": 1.818359375,
24207
+ "frac_reward_zero_std": 0.5,
24208
+ "grad_norm": 0.1660175621509552,
24209
+ "learning_rate": 1.0526315789473685e-07,
24210
+ "loss": 0.0,
24211
+ "num_tokens": 4384872.0,
24212
+ "reward": 0.8723000288009644,
24213
+ "reward_std": 0.24825221300125122,
24214
+ "rewards/reward_func/mean": 0.8723000288009644,
24215
+ "rewards/reward_func/std": 0.3525434732437134,
24216
+ "step": 931
24217
+ },
24218
+ {
24219
+ "clip_ratio/high_max": 0.0,
24220
+ "clip_ratio/high_mean": 0.0,
24221
+ "clip_ratio/low_mean": 0.0,
24222
+ "clip_ratio/low_min": 0.0,
24223
+ "clip_ratio/region_mean": 0.0,
24224
+ "completions/clipped_ratio": 1.0,
24225
+ "completions/max_length": 384.0,
24226
+ "completions/max_terminated_length": 0.0,
24227
+ "completions/mean_length": 384.0,
24228
+ "completions/mean_terminated_length": 0.0,
24229
+ "completions/min_length": 384.0,
24230
+ "completions/min_terminated_length": 0.0,
24231
+ "entropy": 0.564951941370964,
24232
+ "epoch": 1.8203125,
24233
+ "frac_reward_zero_std": 0.0,
24234
+ "grad_norm": 0.23926304280757904,
24235
+ "learning_rate": 1.0000000000000001e-07,
24236
+ "loss": -0.0,
24237
+ "num_tokens": 4389696.0,
24238
+ "reward": 0.1210624948143959,
24239
+ "reward_std": 0.2421249896287918,
24240
+ "rewards/reward_func/mean": 0.1210624948143959,
24241
+ "rewards/reward_func/std": 0.33739402890205383,
24242
+ "step": 932
24243
+ },
24244
+ {
24245
+ "clip_ratio/high_max": 0.0,
24246
+ "clip_ratio/high_mean": 0.0,
24247
+ "clip_ratio/low_mean": 0.0,
24248
+ "clip_ratio/low_min": 0.0,
24249
+ "clip_ratio/region_mean": 0.0,
24250
+ "completions/clipped_ratio": 1.0,
24251
+ "completions/max_length": 384.0,
24252
+ "completions/max_terminated_length": 0.0,
24253
+ "completions/mean_length": 384.0,
24254
+ "completions/mean_terminated_length": 0.0,
24255
+ "completions/min_length": 384.0,
24256
+ "completions/min_terminated_length": 0.0,
24257
+ "entropy": 0.4994359500706196,
24258
+ "epoch": 1.822265625,
24259
+ "frac_reward_zero_std": 0.0,
24260
+ "grad_norm": 0.25481706857681274,
24261
+ "learning_rate": 9.473684210526317e-08,
24262
+ "loss": 0.0,
24263
+ "num_tokens": 4394352.0,
24264
+ "reward": 0.629212498664856,
24265
+ "reward_std": 0.49713975191116333,
24266
+ "rewards/reward_func/mean": 0.629212498664856,
24267
+ "rewards/reward_func/std": 0.47718602418899536,
24268
+ "step": 933
24269
+ },
24270
+ {
24271
+ "clip_ratio/high_max": 0.0,
24272
+ "clip_ratio/high_mean": 0.0,
24273
+ "clip_ratio/low_mean": 0.0,
24274
+ "clip_ratio/low_min": 0.0,
24275
+ "clip_ratio/region_mean": 0.0,
24276
+ "completions/clipped_ratio": 1.0,
24277
+ "completions/max_length": 384.0,
24278
+ "completions/max_terminated_length": 0.0,
24279
+ "completions/mean_length": 384.0,
24280
+ "completions/mean_terminated_length": 0.0,
24281
+ "completions/min_length": 384.0,
24282
+ "completions/min_terminated_length": 0.0,
24283
+ "entropy": 0.4711499195545912,
24284
+ "epoch": 1.82421875,
24285
+ "frac_reward_zero_std": 0.0,
24286
+ "grad_norm": 0.19426177442073822,
24287
+ "learning_rate": 8.947368421052632e-08,
24288
+ "loss": -0.0,
24289
+ "num_tokens": 4399000.0,
24290
+ "reward": 0.629349946975708,
24291
+ "reward_std": 0.5124804973602295,
24292
+ "rewards/reward_func/mean": 0.6293500065803528,
24293
+ "rewards/reward_func/std": 0.4940544664859772,
24294
+ "step": 934
24295
+ },
24296
+ {
24297
+ "clip_ratio/high_max": 0.0,
24298
+ "clip_ratio/high_mean": 0.0,
24299
+ "clip_ratio/low_mean": 0.0,
24300
+ "clip_ratio/low_min": 0.0,
24301
+ "clip_ratio/region_mean": 0.0,
24302
+ "completions/clipped_ratio": 1.0,
24303
+ "completions/max_length": 384.0,
24304
+ "completions/max_terminated_length": 0.0,
24305
+ "completions/mean_length": 384.0,
24306
+ "completions/mean_terminated_length": 0.0,
24307
+ "completions/min_length": 384.0,
24308
+ "completions/min_terminated_length": 0.0,
24309
+ "entropy": 0.8083347305655479,
24310
+ "epoch": 1.826171875,
24311
+ "frac_reward_zero_std": 0.5,
24312
+ "grad_norm": 0.17153169214725494,
24313
+ "learning_rate": 8.421052631578947e-08,
24314
+ "loss": 0.0,
24315
+ "num_tokens": 4403608.0,
24316
+ "reward": 0.7593749761581421,
24317
+ "reward_std": 0.27827125787734985,
24318
+ "rewards/reward_func/mean": 0.7593749761581421,
24319
+ "rewards/reward_func/std": 0.4460016191005707,
24320
+ "step": 935
24321
+ },
24322
+ {
24323
+ "clip_ratio/high_max": 0.0,
24324
+ "clip_ratio/high_mean": 0.0,
24325
+ "clip_ratio/low_mean": 0.0,
24326
+ "clip_ratio/low_min": 0.0,
24327
+ "clip_ratio/region_mean": 0.0,
24328
+ "completions/clipped_ratio": 1.0,
24329
+ "completions/max_length": 384.0,
24330
+ "completions/max_terminated_length": 0.0,
24331
+ "completions/mean_length": 384.0,
24332
+ "completions/mean_terminated_length": 0.0,
24333
+ "completions/min_length": 384.0,
24334
+ "completions/min_terminated_length": 0.0,
24335
+ "entropy": 0.6882787756621838,
24336
+ "epoch": 1.828125,
24337
+ "frac_reward_zero_std": 0.0,
24338
+ "grad_norm": 0.26333948969841003,
24339
+ "learning_rate": 7.894736842105264e-08,
24340
+ "loss": 0.0,
24341
+ "num_tokens": 4408400.0,
24342
+ "reward": 0.5950000286102295,
24343
+ "reward_std": 0.5117709040641785,
24344
+ "rewards/reward_func/mean": 0.5950000286102295,
24345
+ "rewards/reward_func/std": 0.4931241571903229,
24346
+ "step": 936
24347
+ },
24348
+ {
24349
+ "clip_ratio/high_max": 0.0,
24350
+ "clip_ratio/high_mean": 0.0,
24351
+ "clip_ratio/low_mean": 0.0,
24352
+ "clip_ratio/low_min": 0.0,
24353
+ "clip_ratio/region_mean": 0.0,
24354
+ "completions/clipped_ratio": 1.0,
24355
+ "completions/max_length": 384.0,
24356
+ "completions/max_terminated_length": 0.0,
24357
+ "completions/mean_length": 384.0,
24358
+ "completions/mean_terminated_length": 0.0,
24359
+ "completions/min_length": 384.0,
24360
+ "completions/min_terminated_length": 0.0,
24361
+ "entropy": 0.8129791170358658,
24362
+ "epoch": 1.830078125,
24363
+ "frac_reward_zero_std": 1.0,
24364
+ "grad_norm": 0.0,
24365
+ "learning_rate": 7.368421052631579e-08,
24366
+ "loss": 0.0,
24367
+ "num_tokens": 4412896.0,
24368
+ "reward": 1.0,
24369
+ "reward_std": 0.0,
24370
+ "rewards/reward_func/mean": 1.0,
24371
+ "rewards/reward_func/std": 0.0,
24372
+ "step": 937
24373
+ },
24374
+ {
24375
+ "clip_ratio/high_max": 0.0,
24376
+ "clip_ratio/high_mean": 0.0,
24377
+ "clip_ratio/low_mean": 0.0,
24378
+ "clip_ratio/low_min": 0.0,
24379
+ "clip_ratio/region_mean": 0.0,
24380
+ "completions/clipped_ratio": 1.0,
24381
+ "completions/max_length": 384.0,
24382
+ "completions/max_terminated_length": 0.0,
24383
+ "completions/mean_length": 384.0,
24384
+ "completions/mean_terminated_length": 0.0,
24385
+ "completions/min_length": 384.0,
24386
+ "completions/min_terminated_length": 0.0,
24387
+ "entropy": 0.45138827711343765,
24388
+ "epoch": 1.83203125,
24389
+ "frac_reward_zero_std": 0.0,
24390
+ "grad_norm": 0.18060848116874695,
24391
+ "learning_rate": 6.842105263157895e-08,
24392
+ "loss": 0.0,
24393
+ "num_tokens": 4417680.0,
24394
+ "reward": 0.3540624976158142,
24395
+ "reward_std": 0.5053315162658691,
24396
+ "rewards/reward_func/mean": 0.3540624976158142,
24397
+ "rewards/reward_func/std": 0.4852207601070404,
24398
+ "step": 938
24399
+ },
24400
+ {
24401
+ "clip_ratio/high_max": 0.0,
24402
+ "clip_ratio/high_mean": 0.0,
24403
+ "clip_ratio/low_mean": 0.0,
24404
+ "clip_ratio/low_min": 0.0,
24405
+ "clip_ratio/region_mean": 0.0,
24406
+ "completions/clipped_ratio": 1.0,
24407
+ "completions/max_length": 384.0,
24408
+ "completions/max_terminated_length": 0.0,
24409
+ "completions/mean_length": 384.0,
24410
+ "completions/mean_terminated_length": 0.0,
24411
+ "completions/min_length": 384.0,
24412
+ "completions/min_terminated_length": 0.0,
24413
+ "entropy": 0.6235154792666435,
24414
+ "epoch": 1.833984375,
24415
+ "frac_reward_zero_std": 0.0,
24416
+ "grad_norm": 0.16618505120277405,
24417
+ "learning_rate": 6.315789473684211e-08,
24418
+ "loss": -0.0,
24419
+ "num_tokens": 4422320.0,
24420
+ "reward": 0.7527874708175659,
24421
+ "reward_std": 0.4729631543159485,
24422
+ "rewards/reward_func/mean": 0.7527874708175659,
24423
+ "rewards/reward_func/std": 0.4385221302509308,
24424
+ "step": 939
24425
+ },
24426
+ {
24427
+ "clip_ratio/high_max": 0.0,
24428
+ "clip_ratio/high_mean": 0.0,
24429
+ "clip_ratio/low_mean": 0.0,
24430
+ "clip_ratio/low_min": 0.0,
24431
+ "clip_ratio/region_mean": 0.0,
24432
+ "completions/clipped_ratio": 1.0,
24433
+ "completions/max_length": 384.0,
24434
+ "completions/max_terminated_length": 0.0,
24435
+ "completions/mean_length": 384.0,
24436
+ "completions/mean_terminated_length": 0.0,
24437
+ "completions/min_length": 384.0,
24438
+ "completions/min_terminated_length": 0.0,
24439
+ "entropy": 0.32566132955253124,
24440
+ "epoch": 1.8359375,
24441
+ "frac_reward_zero_std": 0.0,
24442
+ "grad_norm": 0.1724817156791687,
24443
+ "learning_rate": 5.789473684210527e-08,
24444
+ "loss": 0.0,
24445
+ "num_tokens": 4426960.0,
24446
+ "reward": 0.38749998807907104,
24447
+ "reward_std": 0.5304675102233887,
24448
+ "rewards/reward_func/mean": 0.38750001788139343,
24449
+ "rewards/reward_func/std": 0.5074885487556458,
24450
+ "step": 940
24451
+ },
24452
+ {
24453
+ "clip_ratio/high_max": 0.0,
24454
+ "clip_ratio/high_mean": 0.0,
24455
+ "clip_ratio/low_mean": 0.0,
24456
+ "clip_ratio/low_min": 0.0,
24457
+ "clip_ratio/region_mean": 0.0,
24458
+ "completions/clipped_ratio": 1.0,
24459
+ "completions/max_length": 384.0,
24460
+ "completions/max_terminated_length": 0.0,
24461
+ "completions/mean_length": 384.0,
24462
+ "completions/mean_terminated_length": 0.0,
24463
+ "completions/min_length": 384.0,
24464
+ "completions/min_terminated_length": 0.0,
24465
+ "entropy": 0.3569133039563894,
24466
+ "epoch": 1.837890625,
24467
+ "frac_reward_zero_std": 0.0,
24468
+ "grad_norm": 0.17535947263240814,
24469
+ "learning_rate": 5.263157894736842e-08,
24470
+ "loss": 0.0,
24471
+ "num_tokens": 4431448.0,
24472
+ "reward": 0.6987625360488892,
24473
+ "reward_std": 0.4616749882698059,
24474
+ "rewards/reward_func/mean": 0.6987625360488892,
24475
+ "rewards/reward_func/std": 0.4274410605430603,
24476
+ "step": 941
24477
+ },
24478
+ {
24479
+ "clip_ratio/high_max": 0.0,
24480
+ "clip_ratio/high_mean": 0.0,
24481
+ "clip_ratio/low_mean": 0.0,
24482
+ "clip_ratio/low_min": 0.0,
24483
+ "clip_ratio/region_mean": 0.0,
24484
+ "completions/clipped_ratio": 1.0,
24485
+ "completions/max_length": 384.0,
24486
+ "completions/max_terminated_length": 0.0,
24487
+ "completions/mean_length": 384.0,
24488
+ "completions/mean_terminated_length": 0.0,
24489
+ "completions/min_length": 384.0,
24490
+ "completions/min_terminated_length": 0.0,
24491
+ "entropy": 0.5731943100690842,
24492
+ "epoch": 1.83984375,
24493
+ "frac_reward_zero_std": 0.0,
24494
+ "grad_norm": 0.19585168361663818,
24495
+ "learning_rate": 4.7368421052631586e-08,
24496
+ "loss": 0.0,
24497
+ "num_tokens": 4436104.0,
24498
+ "reward": 0.3721874952316284,
24499
+ "reward_std": 0.5249817371368408,
24500
+ "rewards/reward_func/mean": 0.3721874952316284,
24501
+ "rewards/reward_func/std": 0.5038166046142578,
24502
+ "step": 942
24503
+ },
24504
+ {
24505
+ "clip_ratio/high_max": 0.0,
24506
+ "clip_ratio/high_mean": 0.0,
24507
+ "clip_ratio/low_mean": 0.0,
24508
+ "clip_ratio/low_min": 0.0,
24509
+ "clip_ratio/region_mean": 0.0,
24510
+ "completions/clipped_ratio": 1.0,
24511
+ "completions/max_length": 384.0,
24512
+ "completions/max_terminated_length": 0.0,
24513
+ "completions/mean_length": 384.0,
24514
+ "completions/mean_terminated_length": 0.0,
24515
+ "completions/min_length": 384.0,
24516
+ "completions/min_terminated_length": 0.0,
24517
+ "entropy": 0.5804979503154755,
24518
+ "epoch": 1.841796875,
24519
+ "frac_reward_zero_std": 0.5,
24520
+ "grad_norm": 0.15701980888843536,
24521
+ "learning_rate": 4.2105263157894737e-08,
24522
+ "loss": 0.0,
24523
+ "num_tokens": 4441096.0,
24524
+ "reward": 0.22850000858306885,
24525
+ "reward_std": 0.26406246423721313,
24526
+ "rewards/reward_func/mean": 0.22850000858306885,
24527
+ "rewards/reward_func/std": 0.4233279824256897,
24528
+ "step": 943
24529
+ },
24530
+ {
24531
+ "clip_ratio/high_max": 0.0,
24532
+ "clip_ratio/high_mean": 0.0,
24533
+ "clip_ratio/low_mean": 0.0,
24534
+ "clip_ratio/low_min": 0.0,
24535
+ "clip_ratio/region_mean": 0.0,
24536
+ "completions/clipped_ratio": 1.0,
24537
+ "completions/max_length": 384.0,
24538
+ "completions/max_terminated_length": 0.0,
24539
+ "completions/mean_length": 384.0,
24540
+ "completions/mean_terminated_length": 0.0,
24541
+ "completions/min_length": 384.0,
24542
+ "completions/min_terminated_length": 0.0,
24543
+ "entropy": 0.8702232874929905,
24544
+ "epoch": 1.84375,
24545
+ "frac_reward_zero_std": 0.0,
24546
+ "grad_norm": 0.30385154485702515,
24547
+ "learning_rate": 3.6842105263157894e-08,
24548
+ "loss": 0.0,
24549
+ "num_tokens": 4445896.0,
24550
+ "reward": 0.5129749774932861,
24551
+ "reward_std": 0.47286224365234375,
24552
+ "rewards/reward_func/mean": 0.5129749774932861,
24553
+ "rewards/reward_func/std": 0.5019313097000122,
24554
+ "step": 944
24555
+ },
24556
+ {
24557
+ "clip_ratio/high_max": 0.0,
24558
+ "clip_ratio/high_mean": 0.0,
24559
+ "clip_ratio/low_mean": 0.0,
24560
+ "clip_ratio/low_min": 0.0,
24561
+ "clip_ratio/region_mean": 0.0,
24562
+ "completions/clipped_ratio": 1.0,
24563
+ "completions/max_length": 384.0,
24564
+ "completions/max_terminated_length": 0.0,
24565
+ "completions/mean_length": 384.0,
24566
+ "completions/mean_terminated_length": 0.0,
24567
+ "completions/min_length": 384.0,
24568
+ "completions/min_terminated_length": 0.0,
24569
+ "entropy": 0.6899589747190475,
24570
+ "epoch": 1.845703125,
24571
+ "frac_reward_zero_std": 0.0,
24572
+ "grad_norm": 0.23425976932048798,
24573
+ "learning_rate": 3.157894736842106e-08,
24574
+ "loss": 0.0,
24575
+ "num_tokens": 4450384.0,
24576
+ "reward": 0.8695999979972839,
24577
+ "reward_std": 0.25623539090156555,
24578
+ "rewards/reward_func/mean": 0.8695999979972839,
24579
+ "rewards/reward_func/std": 0.35150691866874695,
24580
+ "step": 945
24581
+ },
24582
+ {
24583
+ "clip_ratio/high_max": 0.0,
24584
+ "clip_ratio/high_mean": 0.0,
24585
+ "clip_ratio/low_mean": 0.0,
24586
+ "clip_ratio/low_min": 0.0,
24587
+ "clip_ratio/region_mean": 0.0,
24588
+ "completions/clipped_ratio": 1.0,
24589
+ "completions/max_length": 384.0,
24590
+ "completions/max_terminated_length": 0.0,
24591
+ "completions/mean_length": 384.0,
24592
+ "completions/mean_terminated_length": 0.0,
24593
+ "completions/min_length": 384.0,
24594
+ "completions/min_terminated_length": 0.0,
24595
+ "entropy": 0.7242371030151844,
24596
+ "epoch": 1.84765625,
24597
+ "frac_reward_zero_std": 0.5,
24598
+ "grad_norm": 0.18696576356887817,
24599
+ "learning_rate": 2.631578947368421e-08,
24600
+ "loss": 0.0,
24601
+ "num_tokens": 4454728.0,
24602
+ "reward": 0.8765624761581421,
24603
+ "reward_std": 0.24687500298023224,
24604
+ "rewards/reward_func/mean": 0.8765624761581421,
24605
+ "rewards/reward_func/std": 0.34913399815559387,
24606
+ "step": 946
24607
+ },
24608
+ {
24609
+ "clip_ratio/high_max": 0.0,
24610
+ "clip_ratio/high_mean": 0.0,
24611
+ "clip_ratio/low_mean": 0.0,
24612
+ "clip_ratio/low_min": 0.0,
24613
+ "clip_ratio/region_mean": 0.0,
24614
+ "completions/clipped_ratio": 1.0,
24615
+ "completions/max_length": 384.0,
24616
+ "completions/max_terminated_length": 0.0,
24617
+ "completions/mean_length": 384.0,
24618
+ "completions/mean_terminated_length": 0.0,
24619
+ "completions/min_length": 384.0,
24620
+ "completions/min_terminated_length": 0.0,
24621
+ "entropy": 0.6600115075707436,
24622
+ "epoch": 1.849609375,
24623
+ "frac_reward_zero_std": 0.0,
24624
+ "grad_norm": 0.26843783259391785,
24625
+ "learning_rate": 2.1052631578947368e-08,
24626
+ "loss": -0.0,
24627
+ "num_tokens": 4459744.0,
24628
+ "reward": 0.350600004196167,
24629
+ "reward_std": 0.4967568516731262,
24630
+ "rewards/reward_func/mean": 0.350600004196167,
24631
+ "rewards/reward_func/std": 0.4848525822162628,
24632
+ "step": 947
24633
+ },
24634
+ {
24635
+ "clip_ratio/high_max": 0.0,
24636
+ "clip_ratio/high_mean": 0.0,
24637
+ "clip_ratio/low_mean": 0.0,
24638
+ "clip_ratio/low_min": 0.0,
24639
+ "clip_ratio/region_mean": 0.0,
24640
+ "completions/clipped_ratio": 1.0,
24641
+ "completions/max_length": 384.0,
24642
+ "completions/max_terminated_length": 0.0,
24643
+ "completions/mean_length": 384.0,
24644
+ "completions/mean_terminated_length": 0.0,
24645
+ "completions/min_length": 384.0,
24646
+ "completions/min_terminated_length": 0.0,
24647
+ "entropy": 0.6009881878271699,
24648
+ "epoch": 1.8515625,
24649
+ "frac_reward_zero_std": 0.0,
24650
+ "grad_norm": 0.2861350476741791,
24651
+ "learning_rate": 1.578947368421053e-08,
24652
+ "loss": -0.0,
24653
+ "num_tokens": 4464248.0,
24654
+ "reward": 0.5809999704360962,
24655
+ "reward_std": 0.5007524490356445,
24656
+ "rewards/reward_func/mean": 0.5809999704360962,
24657
+ "rewards/reward_func/std": 0.4811137020587921,
24658
+ "step": 948
24659
+ },
24660
+ {
24661
+ "clip_ratio/high_max": 0.0,
24662
+ "clip_ratio/high_mean": 0.0,
24663
+ "clip_ratio/low_mean": 0.0,
24664
+ "clip_ratio/low_min": 0.0,
24665
+ "clip_ratio/region_mean": 0.0,
24666
+ "completions/clipped_ratio": 1.0,
24667
+ "completions/max_length": 384.0,
24668
+ "completions/max_terminated_length": 0.0,
24669
+ "completions/mean_length": 384.0,
24670
+ "completions/mean_terminated_length": 0.0,
24671
+ "completions/min_length": 384.0,
24672
+ "completions/min_terminated_length": 0.0,
24673
+ "entropy": 0.4016810180619359,
24674
+ "epoch": 1.853515625,
24675
+ "frac_reward_zero_std": 0.0,
24676
+ "grad_norm": 0.19072452187538147,
24677
+ "learning_rate": 1.0526315789473684e-08,
24678
+ "loss": -0.0,
24679
+ "num_tokens": 4468744.0,
24680
+ "reward": 0.5809999704360962,
24681
+ "reward_std": 0.5007524490356445,
24682
+ "rewards/reward_func/mean": 0.5809999704360962,
24683
+ "rewards/reward_func/std": 0.4811137020587921,
24684
+ "step": 949
24685
+ },
24686
+ {
24687
+ "clip_ratio/high_max": 0.0,
24688
+ "clip_ratio/high_mean": 0.0,
24689
+ "clip_ratio/low_mean": 0.0,
24690
+ "clip_ratio/low_min": 0.0,
24691
+ "clip_ratio/region_mean": 0.0,
24692
+ "completions/clipped_ratio": 1.0,
24693
+ "completions/max_length": 384.0,
24694
+ "completions/max_terminated_length": 0.0,
24695
+ "completions/mean_length": 384.0,
24696
+ "completions/mean_terminated_length": 0.0,
24697
+ "completions/min_length": 384.0,
24698
+ "completions/min_terminated_length": 0.0,
24699
+ "entropy": 0.92277492582798,
24700
+ "epoch": 1.85546875,
24701
+ "frac_reward_zero_std": 0.0,
24702
+ "grad_norm": 0.2709886431694031,
24703
+ "learning_rate": 5.263157894736842e-09,
24704
+ "loss": 0.0,
24705
+ "num_tokens": 4473248.0,
24706
+ "reward": 0.7473000288009644,
24707
+ "reward_std": 0.4982522130012512,
24708
+ "rewards/reward_func/mean": 0.7473000288009644,
24709
+ "rewards/reward_func/std": 0.4613037705421448,
24710
+ "step": 950
24711
  }
24712
  ],
24713
  "logging_steps": 1,
24714
  "max_steps": 950,
24715
+ "num_input_tokens_seen": 4473248,
24716
  "num_train_epochs": 2,
24717
  "save_steps": 50,
24718
  "stateful_callbacks": {
 
24722
  "should_evaluate": false,
24723
  "should_log": false,
24724
  "should_save": true,
24725
+ "should_training_stop": true
24726
  },
24727
  "attributes": {}
24728
  }