usr256864 commited on
Commit
90d856d
·
verified ·
1 Parent(s): 05f842a

Upload folder using huggingface_hub

Browse files
adapter_config.json CHANGED
@@ -25,8 +25,8 @@
25
  "rank_pattern": {},
26
  "revision": null,
27
  "target_modules": [
28
- "v_proj",
29
- "q_proj"
30
  ],
31
  "target_parameters": null,
32
  "task_type": "CAUSAL_LM",
 
25
  "rank_pattern": {},
26
  "revision": null,
27
  "target_modules": [
28
+ "q_proj",
29
+ "v_proj"
30
  ],
31
  "target_parameters": null,
32
  "task_type": "CAUSAL_LM",
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dc5586e71765b19dc2e397d877b4f0ac65f9dc420556df4f1c68c2198aae70d4
3
  size 16794200
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:17a1eaa0c2de60b9844d1cc03aff521bbfb4581f7ab3366f9a89e9fae89535ce
3
  size 16794200
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fdebb1e819dab78d9eb52c802a9445fd32d5b815237b15e2302b31bcf54ba668
3
  size 33664331
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6320d8db318aaf99c6b8796d5ef702c67d497a785359999a3379e6800c2346f1
3
  size 33664331
rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4e385d556bbaca5472b5cbae1aa6dc96bb3425eaabfdad7a2028faec6c78498c
3
  size 16325
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a71878d32742b6bb1d7e6f320668e9057eb306b367d448e89261acf1e34b5969
3
  size 16325
rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:94c6d9d64f0b87344c7fda0e5cb9c9e6eeed252d3b0a7cc477fd01be22f2eb4c
3
  size 16389
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d75b95472c217f8113f1cc9de80ebc17fac2b25d38a92fe08d69870899adb2f6
3
  size 16389
rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:154bda22c80b9bc23d3a88787f9d9a5b30244b7a72a3924dc013c4c0e4aafd36
3
  size 16389
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5451043be89709e3b8e5cde8a1bcc9ff167dd30ef2ebab491d7f2ee7bc224b36
3
  size 16389
rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4690cb2b16ebb3e5458522aeeeca869a287cb497252af09417640f3a750c62b1
3
  size 16389
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b32717ec55a8a59baafd0f43933b1acddf22eef248d2231fd3347b42465fa607
3
  size 16389
rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5ed8b9fa5cd7191952886161b63192416187b67e954348173079eeffebb01639
3
  size 16389
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0b613070a232b30a4f8e0d72fb403479556a5ed6861d080a034f21d7ee69c003
3
  size 16389
rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2e24e61221f86c5ee154f36d8d8fd1089783e3d3277fdfbea8eaffa3f87e7f28
3
  size 16389
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1035ef0576c9b6d7d98a15586cc2c3549a5dfa7fa16332c1ee1fb668547c0fee
3
  size 16389
rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0ab84383c0343c605cecb006a4f626c8653228a8d0d3c3c23f336ab523fa0da0
3
  size 16389
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:683799cf324fe7fb2fc880b968b1ca888155d8bb2c08cf2a0760ed3802a8e5fe
3
  size 16389
rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b9a4dc69b63580800bda2f46f5997e3f0341c4ad9592602717d3cd6de8ede9c5
3
  size 16325
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d212ffe5a372e10e4ef7f41fb63a3acf5e7faa0d500fe86883a93de2e93e8f25
3
  size 16325
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c1eb96a17ced1a30656568ce97c308d6b8d89567ef0d53a6f7c4c66dab25646a
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f275d9fe3bf7acf1cd62850a289b5eeef9d0373b5a2b0c248e6f9ebb8c2512e6
3
  size 1465
trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 2.9154518950437316,
6
  "eval_steps": 500,
7
- "global_step": 3000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -4688,941 +4688,11 @@
4688
  "rewards/format_reward_fn/mean": 0.9545312523841858,
4689
  "rewards/format_reward_fn/std": 0.17224382143467665,
4690
  "step": 2496
4691
- },
4692
- {
4693
- "clip_ratio/high_max": 0.0,
4694
- "clip_ratio/high_mean": 0.0,
4695
- "clip_ratio/low_mean": 0.0,
4696
- "clip_ratio/low_min": 0.0,
4697
- "clip_ratio/region_mean": 0.0,
4698
- "completions/clipped_ratio": 0.029947916666666668,
4699
- "completions/max_length": 253.5,
4700
- "completions/max_terminated_length": 249.08333333333334,
4701
- "completions/mean_length": 210.64192708333334,
4702
- "completions/mean_terminated_length": 209.20124689737955,
4703
- "completions/min_length": 172.41666666666666,
4704
- "completions/min_terminated_length": 172.41666666666666,
4705
- "entropy": 0.08086393773555756,
4706
- "epoch": 2.441205053449951,
4707
- "frac_reward_zero_std": 0.3802083333333333,
4708
- "grad_norm": 0.10650806128978729,
4709
- "learning_rate": 5e-05,
4710
- "loss": 0.0013,
4711
- "num_tokens": 204152564.0,
4712
- "reward": 11.6739342212677,
4713
- "reward_std": 0.807344543437163,
4714
- "rewards/bm25_retrieval_reward_fn/mean": 0.9262488782405853,
4715
- "rewards/bm25_retrieval_reward_fn/std": 0.19760222919285297,
4716
- "rewards/event_reward_fn/mean": 9.798177083333334,
4717
- "rewards/event_reward_fn/std": 5.788699746131897,
4718
- "rewards/format_reward_fn/mean": 0.9495081007480621,
4719
- "rewards/format_reward_fn/std": 0.18829844643672308,
4720
- "step": 2512
4721
- },
4722
- {
4723
- "clip_ratio/high_max": 0.0,
4724
- "clip_ratio/high_mean": 0.0,
4725
- "clip_ratio/low_mean": 0.0,
4726
- "clip_ratio/low_min": 0.0,
4727
- "clip_ratio/region_mean": 0.0,
4728
- "completions/clipped_ratio": 0.0712890625,
4729
- "completions/max_length": 254.8125,
4730
- "completions/max_terminated_length": 251.625,
4731
- "completions/mean_length": 213.2236328125,
4732
- "completions/mean_terminated_length": 209.86785411834717,
4733
- "completions/min_length": 171.125,
4734
- "completions/min_terminated_length": 171.125,
4735
- "entropy": 0.08136322861537337,
4736
- "epoch": 2.456754130223518,
4737
- "frac_reward_zero_std": 0.328125,
4738
- "grad_norm": 0.2153819352388382,
4739
- "learning_rate": 5e-05,
4740
- "loss": 0.0006,
4741
- "num_tokens": 205500469.0,
4742
- "reward": 12.446550607681274,
4743
- "reward_std": 1.034587848931551,
4744
- "rewards/bm25_retrieval_reward_fn/mean": 0.8715435974299908,
4745
- "rewards/bm25_retrieval_reward_fn/std": 0.28421050729230046,
4746
- "rewards/event_reward_fn/mean": 10.6826171875,
4747
- "rewards/event_reward_fn/std": 6.132740959525108,
4748
- "rewards/format_reward_fn/mean": 0.892389789223671,
4749
- "rewards/format_reward_fn/std": 0.2872252073138952,
4750
- "step": 2528
4751
- },
4752
- {
4753
- "clip_ratio/high_max": 0.0,
4754
- "clip_ratio/high_mean": 0.0,
4755
- "clip_ratio/low_mean": 0.0,
4756
- "clip_ratio/low_min": 0.0,
4757
- "clip_ratio/region_mean": 0.0,
4758
- "completions/clipped_ratio": 0.052734375,
4759
- "completions/max_length": 255.5,
4760
- "completions/max_terminated_length": 251.9375,
4761
- "completions/mean_length": 216.9306640625,
4762
- "completions/mean_terminated_length": 214.75403022766113,
4763
- "completions/min_length": 176.1875,
4764
- "completions/min_terminated_length": 176.1875,
4765
- "entropy": 0.08582799974828959,
4766
- "epoch": 2.4723032069970845,
4767
- "frac_reward_zero_std": 0.26953125,
4768
- "grad_norm": 0.1978168785572052,
4769
- "learning_rate": 5e-05,
4770
- "loss": 0.0028,
4771
- "num_tokens": 206796278.0,
4772
- "reward": 11.944559633731842,
4773
- "reward_std": 0.9803863354027271,
4774
- "rewards/bm25_retrieval_reward_fn/mean": 0.9094629287719727,
4775
- "rewards/bm25_retrieval_reward_fn/std": 0.2257095631211996,
4776
- "rewards/event_reward_fn/mean": 10.103515625,
4777
- "rewards/event_reward_fn/std": 5.498953863978386,
4778
- "rewards/format_reward_fn/mean": 0.9315809458494186,
4779
- "rewards/format_reward_fn/std": 0.228111170232296,
4780
- "step": 2544
4781
- },
4782
- {
4783
- "clip_ratio/high_max": 0.0,
4784
- "clip_ratio/high_mean": 0.0,
4785
- "clip_ratio/low_mean": 0.0,
4786
- "clip_ratio/low_min": 0.0,
4787
- "clip_ratio/region_mean": 0.0,
4788
- "completions/clipped_ratio": 0.099609375,
4789
- "completions/max_length": 255.625,
4790
- "completions/max_terminated_length": 250.5,
4791
- "completions/mean_length": 217.259765625,
4792
- "completions/mean_terminated_length": 213.05884075164795,
4793
- "completions/min_length": 170.0,
4794
- "completions/min_terminated_length": 170.0,
4795
- "entropy": 0.08552914392203093,
4796
- "epoch": 2.487852283770651,
4797
- "frac_reward_zero_std": 0.28515625,
4798
- "grad_norm": 0.16805820167064667,
4799
- "learning_rate": 5e-05,
4800
- "loss": 0.007,
4801
- "num_tokens": 208168132.0,
4802
- "reward": 11.79398000240326,
4803
- "reward_std": 0.9481483921408653,
4804
- "rewards/bm25_retrieval_reward_fn/mean": 0.868280190974474,
4805
- "rewards/bm25_retrieval_reward_fn/std": 0.28656442323699594,
4806
- "rewards/event_reward_fn/mean": 10.037109375,
4807
- "rewards/event_reward_fn/std": 6.124383822083473,
4808
- "rewards/format_reward_fn/mean": 0.8885904960334301,
4809
- "rewards/format_reward_fn/std": 0.28199191950261593,
4810
- "step": 2560
4811
- },
4812
- {
4813
- "clip_ratio/high_max": 0.0,
4814
- "clip_ratio/high_mean": 0.0,
4815
- "clip_ratio/low_mean": 0.0,
4816
- "clip_ratio/low_min": 0.0,
4817
- "clip_ratio/region_mean": 0.0,
4818
- "completions/clipped_ratio": 0.0458984375,
4819
- "completions/max_length": 253.1875,
4820
- "completions/max_terminated_length": 248.0,
4821
- "completions/mean_length": 210.638671875,
4822
- "completions/mean_terminated_length": 208.40313148498535,
4823
- "completions/min_length": 173.625,
4824
- "completions/min_terminated_length": 173.625,
4825
- "entropy": 0.08260456612333655,
4826
- "epoch": 2.503401360544218,
4827
- "frac_reward_zero_std": 0.33203125,
4828
- "grad_norm": 0.09216822683811188,
4829
- "learning_rate": 5e-05,
4830
- "loss": -0.001,
4831
- "num_tokens": 209501810.0,
4832
- "reward": 11.016095101833344,
4833
- "reward_std": 0.860798167064786,
4834
- "rewards/bm25_retrieval_reward_fn/mean": 0.9306312911212444,
4835
- "rewards/bm25_retrieval_reward_fn/std": 0.19113765214569867,
4836
- "rewards/event_reward_fn/mean": 9.1396484375,
4837
- "rewards/event_reward_fn/std": 5.75250081717968,
4838
- "rewards/format_reward_fn/mean": 0.9458155073225498,
4839
- "rewards/format_reward_fn/std": 0.19293752522207797,
4840
- "step": 2576
4841
- },
4842
- {
4843
- "clip_ratio/high_max": 0.0,
4844
- "clip_ratio/high_mean": 0.0,
4845
- "clip_ratio/low_mean": 0.0,
4846
- "clip_ratio/low_min": 0.0,
4847
- "clip_ratio/region_mean": 0.0,
4848
- "completions/clipped_ratio": 0.01953125,
4849
- "completions/max_length": 247.375,
4850
- "completions/max_terminated_length": 242.875,
4851
- "completions/mean_length": 203.1708984375,
4852
- "completions/mean_terminated_length": 202.13115978240967,
4853
- "completions/min_length": 163.9375,
4854
- "completions/min_terminated_length": 163.9375,
4855
- "entropy": 0.08376244455575943,
4856
- "epoch": 2.518950437317784,
4857
- "frac_reward_zero_std": 0.35546875,
4858
- "grad_norm": 0.21585437655448914,
4859
- "learning_rate": 5e-05,
4860
- "loss": -0.0033,
4861
- "num_tokens": 210795597.0,
4862
- "reward": 10.72483429312706,
4863
- "reward_std": 0.7541004437953234,
4864
- "rewards/bm25_retrieval_reward_fn/mean": 0.9423710107803345,
4865
- "rewards/bm25_retrieval_reward_fn/std": 0.14322513493243605,
4866
- "rewards/event_reward_fn/mean": 8.8203125,
4867
- "rewards/event_reward_fn/std": 5.188908696174622,
4868
- "rewards/format_reward_fn/mean": 0.9621507674455643,
4869
- "rewards/format_reward_fn/std": 0.13946166937239468,
4870
- "step": 2592
4871
- },
4872
- {
4873
- "clip_ratio/high_max": 0.0,
4874
- "clip_ratio/high_mean": 0.0,
4875
- "clip_ratio/low_mean": 0.0,
4876
- "clip_ratio/low_min": 0.0,
4877
- "clip_ratio/region_mean": 0.0,
4878
- "completions/clipped_ratio": 0.0595703125,
4879
- "completions/max_length": 253.9375,
4880
- "completions/max_terminated_length": 249.0,
4881
- "completions/mean_length": 210.505859375,
4882
- "completions/mean_terminated_length": 207.62176704406738,
4883
- "completions/min_length": 173.25,
4884
- "completions/min_terminated_length": 173.25,
4885
- "entropy": 0.08947332156822085,
4886
- "epoch": 2.534499514091351,
4887
- "frac_reward_zero_std": 0.3359375,
4888
- "grad_norm": 0.14945168793201447,
4889
- "learning_rate": 5e-05,
4890
- "loss": -0.0063,
4891
- "num_tokens": 212112795.0,
4892
- "reward": 11.506966352462769,
4893
- "reward_std": 0.794132512062788,
4894
- "rewards/bm25_retrieval_reward_fn/mean": 0.9178526736795902,
4895
- "rewards/bm25_retrieval_reward_fn/std": 0.22195658483542502,
4896
- "rewards/event_reward_fn/mean": 9.6572265625,
4897
- "rewards/event_reward_fn/std": 5.744891852140427,
4898
- "rewards/format_reward_fn/mean": 0.9318870939314365,
4899
- "rewards/format_reward_fn/std": 0.2204800380859524,
4900
- "step": 2608
4901
- },
4902
- {
4903
- "clip_ratio/high_max": 0.0,
4904
- "clip_ratio/high_mean": 0.0,
4905
- "clip_ratio/low_mean": 0.0,
4906
- "clip_ratio/low_min": 0.0,
4907
- "clip_ratio/region_mean": 0.0,
4908
- "completions/clipped_ratio": 0.0888671875,
4909
- "completions/max_length": 256.0,
4910
- "completions/max_terminated_length": 250.4375,
4911
- "completions/mean_length": 216.943359375,
4912
- "completions/mean_terminated_length": 213.17963314056396,
4913
- "completions/min_length": 173.0625,
4914
- "completions/min_terminated_length": 173.0625,
4915
- "entropy": 0.08383294614031911,
4916
- "epoch": 2.5500485908649173,
4917
- "frac_reward_zero_std": 0.3203125,
4918
- "grad_norm": 0.1446155160665512,
4919
- "learning_rate": 5e-05,
4920
- "loss": 0.0028,
4921
- "num_tokens": 213441821.0,
4922
- "reward": 11.958227455615997,
4923
- "reward_std": 0.8823277465999126,
4924
- "rewards/bm25_retrieval_reward_fn/mean": 0.8775606565177441,
4925
- "rewards/bm25_retrieval_reward_fn/std": 0.2921582367271185,
4926
- "rewards/event_reward_fn/mean": 10.185546875,
4927
- "rewards/event_reward_fn/std": 5.809098601341248,
4928
- "rewards/format_reward_fn/mean": 0.8951199762523174,
4929
- "rewards/format_reward_fn/std": 0.29349780175834894,
4930
- "step": 2624
4931
- },
4932
- {
4933
- "clip_ratio/high_max": 0.0,
4934
- "clip_ratio/high_mean": 0.0,
4935
- "clip_ratio/low_mean": 0.0,
4936
- "clip_ratio/low_min": 0.0,
4937
- "clip_ratio/region_mean": 0.0,
4938
- "completions/clipped_ratio": 0.052734375,
4939
- "completions/max_length": 253.0,
4940
- "completions/max_terminated_length": 251.0625,
4941
- "completions/mean_length": 213.916015625,
4942
- "completions/mean_terminated_length": 211.6506052017212,
4943
- "completions/min_length": 178.5,
4944
- "completions/min_terminated_length": 178.5,
4945
- "entropy": 0.08359973039478064,
4946
- "epoch": 2.565597667638484,
4947
- "frac_reward_zero_std": 0.28125,
4948
- "grad_norm": 0.16694338619709015,
4949
- "learning_rate": 5e-05,
4950
- "loss": -0.0005,
4951
- "num_tokens": 214813067.0,
4952
- "reward": 11.792637586593628,
4953
- "reward_std": 0.8656186051666737,
4954
- "rewards/bm25_retrieval_reward_fn/mean": 0.8819389827549458,
4955
- "rewards/bm25_retrieval_reward_fn/std": 0.24387728050351143,
4956
- "rewards/event_reward_fn/mean": 9.998046875,
4957
- "rewards/event_reward_fn/std": 5.933807298541069,
4958
- "rewards/format_reward_fn/mean": 0.9126519113779068,
4959
- "rewards/format_reward_fn/std": 0.2305635418742895,
4960
- "step": 2640
4961
- },
4962
- {
4963
- "clip_ratio/high_max": 0.0,
4964
- "clip_ratio/high_mean": 0.0,
4965
- "clip_ratio/low_mean": 0.0,
4966
- "clip_ratio/low_min": 0.0,
4967
- "clip_ratio/region_mean": 0.0,
4968
- "completions/clipped_ratio": 0.0771484375,
4969
- "completions/max_length": 255.4375,
4970
- "completions/max_terminated_length": 249.875,
4971
- "completions/mean_length": 215.8505859375,
4972
- "completions/mean_terminated_length": 212.57020092010498,
4973
- "completions/min_length": 178.0,
4974
- "completions/min_terminated_length": 178.0,
4975
- "entropy": 0.08900781767442822,
4976
- "epoch": 2.5811467444120506,
4977
- "frac_reward_zero_std": 0.29296875,
4978
- "grad_norm": 0.1385410875082016,
4979
- "learning_rate": 5e-05,
4980
- "loss": 0.0008,
4981
- "num_tokens": 216172750.0,
4982
- "reward": 11.35420310497284,
4983
- "reward_std": 0.8416576944291592,
4984
- "rewards/bm25_retrieval_reward_fn/mean": 0.8935875110328197,
4985
- "rewards/bm25_retrieval_reward_fn/std": 0.26079373457469046,
4986
- "rewards/event_reward_fn/mean": 9.5556640625,
4987
- "rewards/event_reward_fn/std": 5.99031862616539,
4988
- "rewards/format_reward_fn/mean": 0.904951486736536,
4989
- "rewards/format_reward_fn/std": 0.2644943995401263,
4990
- "step": 2656
4991
- },
4992
- {
4993
- "clip_ratio/high_max": 0.0,
4994
- "clip_ratio/high_mean": 0.0,
4995
- "clip_ratio/low_mean": 0.0,
4996
- "clip_ratio/low_min": 0.0,
4997
- "clip_ratio/region_mean": 0.0,
4998
- "completions/clipped_ratio": 0.0361328125,
4999
- "completions/max_length": 254.625,
5000
- "completions/max_terminated_length": 249.75,
5001
- "completions/mean_length": 212.0947265625,
5002
- "completions/mean_terminated_length": 210.445143699646,
5003
- "completions/min_length": 177.1875,
5004
- "completions/min_terminated_length": 177.1875,
5005
- "entropy": 0.08960987254977226,
5006
- "epoch": 2.5966958211856173,
5007
- "frac_reward_zero_std": 0.35546875,
5008
- "grad_norm": 0.15648344159126282,
5009
- "learning_rate": 5e-05,
5010
- "loss": 0.0023,
5011
- "num_tokens": 217499543.0,
5012
- "reward": 11.51008290052414,
5013
- "reward_std": 0.7766602244228125,
5014
- "rewards/bm25_retrieval_reward_fn/mean": 0.9093844145536423,
5015
- "rewards/bm25_retrieval_reward_fn/std": 0.23443537193816155,
5016
- "rewards/event_reward_fn/mean": 9.6806640625,
5017
- "rewards/event_reward_fn/std": 5.529717803001404,
5018
- "rewards/format_reward_fn/mean": 0.9200344160199165,
5019
- "rewards/format_reward_fn/std": 0.23509666486643255,
5020
- "step": 2672
5021
- },
5022
- {
5023
- "clip_ratio/high_max": 0.0,
5024
- "clip_ratio/high_mean": 0.0,
5025
- "clip_ratio/low_mean": 0.0,
5026
- "clip_ratio/low_min": 0.0,
5027
- "clip_ratio/region_mean": 0.0,
5028
- "completions/clipped_ratio": 0.037109375,
5029
- "completions/max_length": 251.6875,
5030
- "completions/max_terminated_length": 247.0,
5031
- "completions/mean_length": 208.2880859375,
5032
- "completions/mean_terminated_length": 206.4787950515747,
5033
- "completions/min_length": 165.625,
5034
- "completions/min_terminated_length": 165.625,
5035
- "entropy": 0.09177634166553617,
5036
- "epoch": 2.612244897959184,
5037
- "frac_reward_zero_std": 0.30078125,
5038
- "grad_norm": 0.14749974012374878,
5039
- "learning_rate": 5e-05,
5040
- "loss": 0.0004,
5041
- "num_tokens": 218822370.0,
5042
- "reward": 11.045877933502197,
5043
- "reward_std": 0.9622980132699013,
5044
- "rewards/bm25_retrieval_reward_fn/mean": 0.9265813454985619,
5045
- "rewards/bm25_retrieval_reward_fn/std": 0.20256941742263734,
5046
- "rewards/event_reward_fn/mean": 9.1845703125,
5047
- "rewards/event_reward_fn/std": 5.212202668190002,
5048
- "rewards/format_reward_fn/mean": 0.9347261041402817,
5049
- "rewards/format_reward_fn/std": 0.2086858821567148,
5050
- "step": 2688
5051
- },
5052
- {
5053
- "clip_ratio/high_max": 0.0,
5054
- "clip_ratio/high_mean": 0.0,
5055
- "clip_ratio/low_mean": 0.0,
5056
- "clip_ratio/low_min": 0.0,
5057
- "clip_ratio/region_mean": 0.0,
5058
- "completions/clipped_ratio": 0.01953125,
5059
- "completions/max_length": 247.8125,
5060
- "completions/max_terminated_length": 243.4375,
5061
- "completions/mean_length": 200.87890625,
5062
- "completions/mean_terminated_length": 199.8343276977539,
5063
- "completions/min_length": 167.4375,
5064
- "completions/min_terminated_length": 167.4375,
5065
- "entropy": 0.09632771136239171,
5066
- "epoch": 2.62779397473275,
5067
- "frac_reward_zero_std": 0.28515625,
5068
- "grad_norm": 0.1771780103445053,
5069
- "learning_rate": 5e-05,
5070
- "loss": -0.008,
5071
- "num_tokens": 220135962.0,
5072
- "reward": 12.231472432613373,
5073
- "reward_std": 0.8915320560336113,
5074
- "rewards/bm25_retrieval_reward_fn/mean": 0.9349792711436749,
5075
- "rewards/bm25_retrieval_reward_fn/std": 0.16558197524864227,
5076
- "rewards/event_reward_fn/mean": 10.35546875,
5077
- "rewards/event_reward_fn/std": 5.747018381953239,
5078
- "rewards/format_reward_fn/mean": 0.9410244673490524,
5079
- "rewards/format_reward_fn/std": 0.17926215915940702,
5080
- "step": 2704
5081
- },
5082
- {
5083
- "clip_ratio/high_max": 0.0,
5084
- "clip_ratio/high_mean": 0.0,
5085
- "clip_ratio/low_mean": 0.0,
5086
- "clip_ratio/low_min": 0.0,
5087
- "clip_ratio/region_mean": 0.0,
5088
- "completions/clipped_ratio": 0.044921875,
5089
- "completions/max_length": 252.8125,
5090
- "completions/max_terminated_length": 246.3125,
5091
- "completions/mean_length": 209.3701171875,
5092
- "completions/mean_terminated_length": 207.07564544677734,
5093
- "completions/min_length": 170.625,
5094
- "completions/min_terminated_length": 170.625,
5095
- "entropy": 0.10564424749463797,
5096
- "epoch": 2.6433430515063168,
5097
- "frac_reward_zero_std": 0.22265625,
5098
- "grad_norm": 0.10102769732475281,
5099
- "learning_rate": 5e-05,
5100
- "loss": -0.0013,
5101
- "num_tokens": 221542285.0,
5102
- "reward": 12.009974837303162,
5103
- "reward_std": 0.9928734712302685,
5104
- "rewards/bm25_retrieval_reward_fn/mean": 0.9175033271312714,
5105
- "rewards/bm25_retrieval_reward_fn/std": 0.1976611790014431,
5106
- "rewards/event_reward_fn/mean": 10.1767578125,
5107
- "rewards/event_reward_fn/std": 6.012263968586922,
5108
- "rewards/format_reward_fn/mean": 0.9157139807939529,
5109
- "rewards/format_reward_fn/std": 0.21656434168107808,
5110
- "step": 2720
5111
- },
5112
- {
5113
- "clip_ratio/high_max": 0.0,
5114
- "clip_ratio/high_mean": 0.0,
5115
- "clip_ratio/low_mean": 0.0,
5116
- "clip_ratio/low_min": 0.0,
5117
- "clip_ratio/region_mean": 0.0,
5118
- "completions/clipped_ratio": 0.03515625,
5119
- "completions/max_length": 255.0625,
5120
- "completions/max_terminated_length": 253.375,
5121
- "completions/mean_length": 213.7919921875,
5122
- "completions/mean_terminated_length": 212.31354141235352,
5123
- "completions/min_length": 174.5,
5124
- "completions/min_terminated_length": 174.5,
5125
- "entropy": 0.10326679470017552,
5126
- "epoch": 2.6588921282798834,
5127
- "frac_reward_zero_std": 0.23828125,
5128
- "grad_norm": 0.15221992135047913,
5129
- "learning_rate": 5e-05,
5130
- "loss": 0.0011,
5131
- "num_tokens": 222797308.0,
5132
- "reward": 11.387903690338135,
5133
- "reward_std": 0.911373607814312,
5134
- "rewards/bm25_retrieval_reward_fn/mean": 0.9273334704339504,
5135
- "rewards/bm25_retrieval_reward_fn/std": 0.18763835495337844,
5136
- "rewards/event_reward_fn/mean": 9.5263671875,
5137
- "rewards/event_reward_fn/std": 5.706304341554642,
5138
- "rewards/format_reward_fn/mean": 0.9342031031847,
5139
- "rewards/format_reward_fn/std": 0.2074666447006166,
5140
- "step": 2736
5141
- },
5142
- {
5143
- "clip_ratio/high_max": 0.0,
5144
- "clip_ratio/high_mean": 0.0,
5145
- "clip_ratio/low_mean": 0.0,
5146
- "clip_ratio/low_min": 0.0,
5147
- "clip_ratio/region_mean": 0.0,
5148
- "completions/clipped_ratio": 0.0224609375,
5149
- "completions/max_length": 250.375,
5150
- "completions/max_terminated_length": 247.4375,
5151
- "completions/mean_length": 207.4228515625,
5152
- "completions/mean_terminated_length": 206.33260917663574,
5153
- "completions/min_length": 170.25,
5154
- "completions/min_terminated_length": 170.25,
5155
- "entropy": 0.09272929606959224,
5156
- "epoch": 2.67444120505345,
5157
- "frac_reward_zero_std": 0.30859375,
5158
- "grad_norm": 0.21459202468395233,
5159
- "learning_rate": 5e-05,
5160
- "loss": -0.0035,
5161
- "num_tokens": 224091517.0,
5162
- "reward": 11.830156862735748,
5163
- "reward_std": 0.8045283071696758,
5164
- "rewards/bm25_retrieval_reward_fn/mean": 0.9445540346205235,
5165
- "rewards/bm25_retrieval_reward_fn/std": 0.15531712002120912,
5166
- "rewards/event_reward_fn/mean": 9.9248046875,
5167
- "rewards/event_reward_fn/std": 5.416922226548195,
5168
- "rewards/format_reward_fn/mean": 0.9607979953289032,
5169
- "rewards/format_reward_fn/std": 0.1495908577926457,
5170
- "step": 2752
5171
- },
5172
- {
5173
- "clip_ratio/high_max": 0.0,
5174
- "clip_ratio/high_mean": 0.0,
5175
- "clip_ratio/low_mean": 0.0,
5176
- "clip_ratio/low_min": 0.0,
5177
- "clip_ratio/region_mean": 0.0,
5178
- "completions/clipped_ratio": 0.03125,
5179
- "completions/max_length": 250.4375,
5180
- "completions/max_terminated_length": 247.125,
5181
- "completions/mean_length": 213.107421875,
5182
- "completions/mean_terminated_length": 211.65931701660156,
5183
- "completions/min_length": 173.0625,
5184
- "completions/min_terminated_length": 173.0625,
5185
- "entropy": 0.08341792924329638,
5186
- "epoch": 2.6899902818270167,
5187
- "frac_reward_zero_std": 0.3515625,
5188
- "grad_norm": 0.06316018104553223,
5189
- "learning_rate": 5e-05,
5190
- "loss": -0.0016,
5191
- "num_tokens": 225405591.0,
5192
- "reward": 11.755984246730804,
5193
- "reward_std": 0.8010260127484798,
5194
- "rewards/bm25_retrieval_reward_fn/mean": 0.9238302148878574,
5195
- "rewards/bm25_retrieval_reward_fn/std": 0.17838482139632106,
5196
- "rewards/event_reward_fn/mean": 9.8876953125,
5197
- "rewards/event_reward_fn/std": 5.315806642174721,
5198
- "rewards/format_reward_fn/mean": 0.9444587081670761,
5199
- "rewards/format_reward_fn/std": 0.16798695269972086,
5200
- "step": 2768
5201
- },
5202
- {
5203
- "clip_ratio/high_max": 0.0,
5204
- "clip_ratio/high_mean": 0.0,
5205
- "clip_ratio/low_mean": 0.0,
5206
- "clip_ratio/low_min": 0.0,
5207
- "clip_ratio/region_mean": 0.0,
5208
- "completions/clipped_ratio": 0.0849609375,
5209
- "completions/max_length": 255.375,
5210
- "completions/max_terminated_length": 251.875,
5211
- "completions/mean_length": 216.7353515625,
5212
- "completions/mean_terminated_length": 213.09019565582275,
5213
- "completions/min_length": 174.0625,
5214
- "completions/min_terminated_length": 174.0625,
5215
- "entropy": 0.08323041070252657,
5216
- "epoch": 2.705539358600583,
5217
- "frac_reward_zero_std": 0.3515625,
5218
- "grad_norm": 0.2660459578037262,
5219
- "learning_rate": 5e-05,
5220
- "loss": 0.0082,
5221
- "num_tokens": 226754744.0,
5222
- "reward": 11.574803471565247,
5223
- "reward_std": 0.8111933209002018,
5224
- "rewards/bm25_retrieval_reward_fn/mean": 0.8713752776384354,
5225
- "rewards/bm25_retrieval_reward_fn/std": 0.28300391032826155,
5226
- "rewards/event_reward_fn/mean": 9.8173828125,
5227
- "rewards/event_reward_fn/std": 5.8233465403318405,
5228
- "rewards/format_reward_fn/mean": 0.8860453926026821,
5229
- "rewards/format_reward_fn/std": 0.28504633717238903,
5230
- "step": 2784
5231
- },
5232
- {
5233
- "clip_ratio/high_max": 0.0,
5234
- "clip_ratio/high_mean": 0.0,
5235
- "clip_ratio/low_mean": 0.0,
5236
- "clip_ratio/low_min": 0.0,
5237
- "clip_ratio/region_mean": 0.0,
5238
- "completions/clipped_ratio": 0.0537109375,
5239
- "completions/max_length": 254.125,
5240
- "completions/max_terminated_length": 250.625,
5241
- "completions/mean_length": 212.8642578125,
5242
- "completions/mean_terminated_length": 210.45547103881836,
5243
- "completions/min_length": 169.0,
5244
- "completions/min_terminated_length": 169.0,
5245
- "entropy": 0.08199881995096803,
5246
- "epoch": 2.7210884353741496,
5247
- "frac_reward_zero_std": 0.296875,
5248
- "grad_norm": 0.08463400602340698,
5249
- "learning_rate": 5e-05,
5250
- "loss": 0.0013,
5251
- "num_tokens": 228098621.0,
5252
- "reward": 11.454033315181732,
5253
- "reward_std": 0.836145743727684,
5254
- "rewards/bm25_retrieval_reward_fn/mean": 0.9158100821077824,
5255
- "rewards/bm25_retrieval_reward_fn/std": 0.19612007169052958,
5256
- "rewards/event_reward_fn/mean": 9.603515625,
5257
- "rewards/event_reward_fn/std": 5.212661325931549,
5258
- "rewards/format_reward_fn/mean": 0.9347075000405312,
5259
- "rewards/format_reward_fn/std": 0.1920458609238267,
5260
- "step": 2800
5261
- },
5262
- {
5263
- "clip_ratio/high_max": 0.0,
5264
- "clip_ratio/high_mean": 0.0,
5265
- "clip_ratio/low_mean": 0.0,
5266
- "clip_ratio/low_min": 0.0,
5267
- "clip_ratio/region_mean": 0.0,
5268
- "completions/clipped_ratio": 0.0380859375,
5269
- "completions/max_length": 254.125,
5270
- "completions/max_terminated_length": 249.0625,
5271
- "completions/mean_length": 209.9765625,
5272
- "completions/mean_terminated_length": 208.1564769744873,
5273
- "completions/min_length": 168.875,
5274
- "completions/min_terminated_length": 168.875,
5275
- "entropy": 0.0759361800737679,
5276
- "epoch": 2.7366375121477162,
5277
- "frac_reward_zero_std": 0.328125,
5278
- "grad_norm": 0.14759230613708496,
5279
- "learning_rate": 5e-05,
5280
- "loss": -0.0053,
5281
- "num_tokens": 229429565.0,
5282
- "reward": 12.068866312503815,
5283
- "reward_std": 0.8678888715803623,
5284
- "rewards/bm25_retrieval_reward_fn/mean": 0.9354286342859268,
5285
- "rewards/bm25_retrieval_reward_fn/std": 0.19999602530151606,
5286
- "rewards/event_reward_fn/mean": 10.185546875,
5287
- "rewards/event_reward_fn/std": 6.09708933532238,
5288
- "rewards/format_reward_fn/mean": 0.9478906244039536,
5289
- "rewards/format_reward_fn/std": 0.2007538639008999,
5290
- "step": 2816
5291
- },
5292
- {
5293
- "clip_ratio/high_max": 0.0,
5294
- "clip_ratio/high_mean": 0.0,
5295
- "clip_ratio/low_mean": 0.0,
5296
- "clip_ratio/low_min": 0.0,
5297
- "clip_ratio/region_mean": 0.0,
5298
- "completions/clipped_ratio": 0.07421875,
5299
- "completions/max_length": 255.0,
5300
- "completions/max_terminated_length": 251.4375,
5301
- "completions/mean_length": 217.4892578125,
5302
- "completions/mean_terminated_length": 214.5166711807251,
5303
- "completions/min_length": 171.125,
5304
- "completions/min_terminated_length": 171.125,
5305
- "entropy": 0.07404683344066143,
5306
- "epoch": 2.752186588921283,
5307
- "frac_reward_zero_std": 0.33203125,
5308
- "grad_norm": 0.18848936259746552,
5309
- "learning_rate": 5e-05,
5310
- "loss": 0.0016,
5311
- "num_tokens": 230773106.0,
5312
- "reward": 12.326741218566895,
5313
- "reward_std": 0.9671976566314697,
5314
- "rewards/bm25_retrieval_reward_fn/mean": 0.89621976390481,
5315
- "rewards/bm25_retrieval_reward_fn/std": 0.23690359899774194,
5316
- "rewards/event_reward_fn/mean": 10.515625,
5317
- "rewards/event_reward_fn/std": 5.634042501449585,
5318
- "rewards/format_reward_fn/mean": 0.9148964546620846,
5319
- "rewards/format_reward_fn/std": 0.2338833932299167,
5320
- "step": 2832
5321
- },
5322
- {
5323
- "clip_ratio/high_max": 0.0,
5324
- "clip_ratio/high_mean": 0.0,
5325
- "clip_ratio/low_mean": 0.0,
5326
- "clip_ratio/low_min": 0.0,
5327
- "clip_ratio/region_mean": 0.0,
5328
- "completions/clipped_ratio": 0.060546875,
5329
- "completions/max_length": 255.875,
5330
- "completions/max_terminated_length": 252.625,
5331
- "completions/mean_length": 217.236328125,
5332
- "completions/mean_terminated_length": 214.71324062347412,
5333
- "completions/min_length": 172.9375,
5334
- "completions/min_terminated_length": 172.9375,
5335
- "entropy": 0.08189457282423973,
5336
- "epoch": 2.7677356656948495,
5337
- "frac_reward_zero_std": 0.3046875,
5338
- "grad_norm": 0.20657600462436676,
5339
- "learning_rate": 5e-05,
5340
- "loss": 0.0028,
5341
- "num_tokens": 232070576.0,
5342
- "reward": 11.5172398686409,
5343
- "reward_std": 0.8970336727797985,
5344
- "rewards/bm25_retrieval_reward_fn/mean": 0.9108828380703926,
5345
- "rewards/bm25_retrieval_reward_fn/std": 0.22826198721304536,
5346
- "rewards/event_reward_fn/mean": 9.673828125,
5347
- "rewards/event_reward_fn/std": 5.733745768666267,
5348
- "rewards/format_reward_fn/mean": 0.9325288347899914,
5349
- "rewards/format_reward_fn/std": 0.226588967256248,
5350
- "step": 2848
5351
- },
5352
- {
5353
- "clip_ratio/high_max": 0.0,
5354
- "clip_ratio/high_mean": 0.0,
5355
- "clip_ratio/low_mean": 0.0,
5356
- "clip_ratio/low_min": 0.0,
5357
- "clip_ratio/region_mean": 0.0,
5358
- "completions/clipped_ratio": 0.056640625,
5359
- "completions/max_length": 252.5625,
5360
- "completions/max_terminated_length": 249.4375,
5361
- "completions/mean_length": 213.166015625,
5362
- "completions/mean_terminated_length": 210.57254600524902,
5363
- "completions/min_length": 167.75,
5364
- "completions/min_terminated_length": 167.75,
5365
- "entropy": 0.08049681456759572,
5366
- "epoch": 2.7832847424684157,
5367
- "frac_reward_zero_std": 0.33203125,
5368
- "grad_norm": 0.05886400490999222,
5369
- "learning_rate": 5e-05,
5370
- "loss": 0.0027,
5371
- "num_tokens": 233415010.0,
5372
- "reward": 11.385997593402863,
5373
- "reward_std": 0.7553573679178953,
5374
- "rewards/bm25_retrieval_reward_fn/mean": 0.9164049662649632,
5375
- "rewards/bm25_retrieval_reward_fn/std": 0.2016591742867604,
5376
- "rewards/event_reward_fn/mean": 9.53515625,
5377
- "rewards/event_reward_fn/std": 5.440419033169746,
5378
- "rewards/format_reward_fn/mean": 0.93443638458848,
5379
- "rewards/format_reward_fn/std": 0.19143922347575426,
5380
- "step": 2864
5381
- },
5382
- {
5383
- "clip_ratio/high_max": 0.0,
5384
- "clip_ratio/high_mean": 0.0,
5385
- "clip_ratio/low_mean": 0.0,
5386
- "clip_ratio/low_min": 0.0,
5387
- "clip_ratio/region_mean": 0.0,
5388
- "completions/clipped_ratio": 0.0498046875,
5389
- "completions/max_length": 254.3125,
5390
- "completions/max_terminated_length": 250.375,
5391
- "completions/mean_length": 214.185546875,
5392
- "completions/mean_terminated_length": 212.15652561187744,
5393
- "completions/min_length": 172.1875,
5394
- "completions/min_terminated_length": 172.1875,
5395
- "entropy": 0.0816779644228518,
5396
- "epoch": 2.7988338192419824,
5397
- "frac_reward_zero_std": 0.35546875,
5398
- "grad_norm": 0.0916222557425499,
5399
- "learning_rate": 5e-05,
5400
- "loss": -0.0032,
5401
- "num_tokens": 234744436.0,
5402
- "reward": 11.663362562656403,
5403
- "reward_std": 0.9015852566808462,
5404
- "rewards/bm25_retrieval_reward_fn/mean": 0.9179877303540707,
5405
- "rewards/bm25_retrieval_reward_fn/std": 0.2022923786425963,
5406
- "rewards/event_reward_fn/mean": 9.8125,
5407
- "rewards/event_reward_fn/std": 5.319433629512787,
5408
- "rewards/format_reward_fn/mean": 0.9328748136758804,
5409
- "rewards/format_reward_fn/std": 0.20254582911729813,
5410
- "step": 2880
5411
- },
5412
- {
5413
- "clip_ratio/high_max": 0.0,
5414
- "clip_ratio/high_mean": 0.0,
5415
- "clip_ratio/low_mean": 0.0,
5416
- "clip_ratio/low_min": 0.0,
5417
- "clip_ratio/region_mean": 0.0,
5418
- "completions/clipped_ratio": 0.0966796875,
5419
- "completions/max_length": 256.0,
5420
- "completions/max_terminated_length": 254.5625,
5421
- "completions/mean_length": 220.1044921875,
5422
- "completions/mean_terminated_length": 216.32228183746338,
5423
- "completions/min_length": 171.5,
5424
- "completions/min_terminated_length": 171.5,
5425
- "entropy": 0.08011228078976274,
5426
- "epoch": 2.814382896015549,
5427
- "frac_reward_zero_std": 0.3515625,
5428
- "grad_norm": 0.08450505882501602,
5429
- "learning_rate": 5e-05,
5430
- "loss": 0.0032,
5431
- "num_tokens": 236075035.0,
5432
- "reward": 11.988969624042511,
5433
- "reward_std": 0.7974276356399059,
5434
- "rewards/bm25_retrieval_reward_fn/mean": 0.8782762736082077,
5435
- "rewards/bm25_retrieval_reward_fn/std": 0.2924462389200926,
5436
- "rewards/event_reward_fn/mean": 10.2158203125,
5437
- "rewards/event_reward_fn/std": 5.5798052698373795,
5438
- "rewards/format_reward_fn/mean": 0.8948730453848839,
5439
- "rewards/format_reward_fn/std": 0.2953194109722972,
5440
- "step": 2896
5441
- },
5442
- {
5443
- "clip_ratio/high_max": 0.0,
5444
- "clip_ratio/high_mean": 0.0,
5445
- "clip_ratio/low_mean": 0.0,
5446
- "clip_ratio/low_min": 0.0,
5447
- "clip_ratio/region_mean": 0.0,
5448
- "completions/clipped_ratio": 0.0849609375,
5449
- "completions/max_length": 255.875,
5450
- "completions/max_terminated_length": 254.3125,
5451
- "completions/mean_length": 219.5263671875,
5452
- "completions/mean_terminated_length": 216.11603832244873,
5453
- "completions/min_length": 175.8125,
5454
- "completions/min_terminated_length": 175.8125,
5455
- "entropy": 0.08505099918693304,
5456
- "epoch": 2.8299319727891157,
5457
- "frac_reward_zero_std": 0.3359375,
5458
- "grad_norm": 0.12416191399097443,
5459
- "learning_rate": 5e-05,
5460
- "loss": 0.0027,
5461
- "num_tokens": 237421442.0,
5462
- "reward": 11.693400919437408,
5463
- "reward_std": 0.8284243606030941,
5464
- "rewards/bm25_retrieval_reward_fn/mean": 0.8833912238478661,
5465
- "rewards/bm25_retrieval_reward_fn/std": 0.267708154162392,
5466
- "rewards/event_reward_fn/mean": 9.90625,
5467
- "rewards/event_reward_fn/std": 5.605697572231293,
5468
- "rewards/format_reward_fn/mean": 0.9037597663700581,
5469
- "rewards/format_reward_fn/std": 0.2683409294113517,
5470
- "step": 2912
5471
- },
5472
- {
5473
- "clip_ratio/high_max": 0.0,
5474
- "clip_ratio/high_mean": 0.0,
5475
- "clip_ratio/low_mean": 0.0,
5476
- "clip_ratio/low_min": 0.0,
5477
- "clip_ratio/region_mean": 0.0,
5478
- "completions/clipped_ratio": 0.033203125,
5479
- "completions/max_length": 254.5625,
5480
- "completions/max_terminated_length": 252.0,
5481
- "completions/mean_length": 213.083984375,
5482
- "completions/mean_terminated_length": 211.6382074356079,
5483
- "completions/min_length": 169.9375,
5484
- "completions/min_terminated_length": 169.9375,
5485
- "entropy": 0.08473130548372865,
5486
- "epoch": 2.8454810495626823,
5487
- "frac_reward_zero_std": 0.34375,
5488
- "grad_norm": 0.06745623797178268,
5489
- "learning_rate": 5e-05,
5490
- "loss": 0.0006,
5491
- "num_tokens": 238715276.0,
5492
- "reward": 11.77674776315689,
5493
- "reward_std": 0.685878150165081,
5494
- "rewards/bm25_retrieval_reward_fn/mean": 0.9190886318683624,
5495
- "rewards/bm25_retrieval_reward_fn/std": 0.19958114624023438,
5496
- "rewards/event_reward_fn/mean": 9.9150390625,
5497
- "rewards/event_reward_fn/std": 5.207145616412163,
5498
- "rewards/format_reward_fn/mean": 0.9426199793815613,
5499
- "rewards/format_reward_fn/std": 0.1927571757696569,
5500
- "step": 2928
5501
- },
5502
- {
5503
- "clip_ratio/high_max": 0.0,
5504
- "clip_ratio/high_mean": 0.0,
5505
- "clip_ratio/low_mean": 0.0,
5506
- "clip_ratio/low_min": 0.0,
5507
- "clip_ratio/region_mean": 0.0,
5508
- "completions/clipped_ratio": 0.05078125,
5509
- "completions/max_length": 254.25,
5510
- "completions/max_terminated_length": 250.6875,
5511
- "completions/mean_length": 214.3232421875,
5512
- "completions/mean_terminated_length": 212.1032657623291,
5513
- "completions/min_length": 169.9375,
5514
- "completions/min_terminated_length": 169.9375,
5515
- "entropy": 0.08542184252291918,
5516
- "epoch": 2.8610301263362485,
5517
- "frac_reward_zero_std": 0.3203125,
5518
- "grad_norm": 0.10482887178659439,
5519
- "learning_rate": 5e-05,
5520
- "loss": -0.0002,
5521
- "num_tokens": 240057215.0,
5522
- "reward": 11.639575242996216,
5523
- "reward_std": 0.9158763885498047,
5524
- "rewards/bm25_retrieval_reward_fn/mean": 0.8979970328509808,
5525
- "rewards/bm25_retrieval_reward_fn/std": 0.23725404776632786,
5526
- "rewards/event_reward_fn/mean": 9.826171875,
5527
- "rewards/event_reward_fn/std": 5.315482467412949,
5528
- "rewards/format_reward_fn/mean": 0.9154064357280731,
5529
- "rewards/format_reward_fn/std": 0.2300750371068716,
5530
- "step": 2944
5531
- },
5532
- {
5533
- "clip_ratio/high_max": 0.0,
5534
- "clip_ratio/high_mean": 0.0,
5535
- "clip_ratio/low_mean": 0.0,
5536
- "clip_ratio/low_min": 0.0,
5537
- "clip_ratio/region_mean": 0.0,
5538
- "completions/clipped_ratio": 0.103515625,
5539
- "completions/max_length": 255.5,
5540
- "completions/max_terminated_length": 253.1875,
5541
- "completions/mean_length": 220.142578125,
5542
- "completions/mean_terminated_length": 216.11438083648682,
5543
- "completions/min_length": 169.6875,
5544
- "completions/min_terminated_length": 169.6875,
5545
- "entropy": 0.0917358947917819,
5546
- "epoch": 2.8765792031098156,
5547
- "frac_reward_zero_std": 0.34375,
5548
- "grad_norm": 0.29249680042266846,
5549
- "learning_rate": 5e-05,
5550
- "loss": -0.0023,
5551
- "num_tokens": 241437397.0,
5552
- "reward": 12.01733946800232,
5553
- "reward_std": 0.8955757319927216,
5554
- "rewards/bm25_retrieval_reward_fn/mean": 0.8503146581351757,
5555
- "rewards/bm25_retrieval_reward_fn/std": 0.3139411583542824,
5556
- "rewards/event_reward_fn/mean": 10.298828125,
5557
- "rewards/event_reward_fn/std": 5.663209050893784,
5558
- "rewards/format_reward_fn/mean": 0.868196614086628,
5559
- "rewards/format_reward_fn/std": 0.318668226711452,
5560
- "step": 2960
5561
- },
5562
- {
5563
- "clip_ratio/high_max": 0.0,
5564
- "clip_ratio/high_mean": 0.0,
5565
- "clip_ratio/low_mean": 0.0,
5566
- "clip_ratio/low_min": 0.0,
5567
- "clip_ratio/region_mean": 0.0,
5568
- "completions/clipped_ratio": 0.0791015625,
5569
- "completions/max_length": 254.0,
5570
- "completions/max_terminated_length": 250.0,
5571
- "completions/mean_length": 215.6435546875,
5572
- "completions/mean_terminated_length": 212.3185043334961,
5573
- "completions/min_length": 165.4375,
5574
- "completions/min_terminated_length": 165.4375,
5575
- "entropy": 0.09182127751410007,
5576
- "epoch": 2.892128279883382,
5577
- "frac_reward_zero_std": 0.3046875,
5578
- "grad_norm": 0.17700594663619995,
5579
- "learning_rate": 5e-05,
5580
- "loss": -0.0004,
5581
- "num_tokens": 242749992.0,
5582
- "reward": 11.478153705596924,
5583
- "reward_std": 0.8844601437449455,
5584
- "rewards/bm25_retrieval_reward_fn/mean": 0.880519162863493,
5585
- "rewards/bm25_retrieval_reward_fn/std": 0.25536160822957754,
5586
- "rewards/event_reward_fn/mean": 9.697265625,
5587
- "rewards/event_reward_fn/std": 5.846217334270477,
5588
- "rewards/format_reward_fn/mean": 0.9003689214587212,
5589
- "rewards/format_reward_fn/std": 0.24951867014169693,
5590
- "step": 2976
5591
- },
5592
- {
5593
- "clip_ratio/high_max": 0.0,
5594
- "clip_ratio/high_mean": 0.0,
5595
- "clip_ratio/low_mean": 0.0,
5596
- "clip_ratio/low_min": 0.0,
5597
- "clip_ratio/region_mean": 0.0,
5598
- "completions/clipped_ratio": 0.0615234375,
5599
- "completions/max_length": 255.5625,
5600
- "completions/max_terminated_length": 251.8125,
5601
- "completions/mean_length": 214.3974609375,
5602
- "completions/mean_terminated_length": 211.83877277374268,
5603
- "completions/min_length": 166.3125,
5604
- "completions/min_terminated_length": 166.3125,
5605
- "entropy": 0.0888472800143063,
5606
- "epoch": 2.9076773566569485,
5607
- "frac_reward_zero_std": 0.359375,
5608
- "grad_norm": 0.18143412470817566,
5609
- "learning_rate": 5e-05,
5610
- "loss": -0.0024,
5611
- "num_tokens": 244089207.0,
5612
- "reward": 11.602717459201813,
5613
- "reward_std": 0.7762532383203506,
5614
- "rewards/bm25_retrieval_reward_fn/mean": 0.8992017544806004,
5615
- "rewards/bm25_retrieval_reward_fn/std": 0.23547889525070786,
5616
- "rewards/event_reward_fn/mean": 9.78125,
5617
- "rewards/event_reward_fn/std": 5.857791095972061,
5618
- "rewards/format_reward_fn/mean": 0.9222656264901161,
5619
- "rewards/format_reward_fn/std": 0.2304223021492362,
5620
- "step": 2992
5621
  }
5622
  ],
5623
  "logging_steps": 16,
5624
  "max_steps": 10290,
5625
- "num_input_tokens_seen": 244713360,
5626
  "num_train_epochs": 10,
5627
  "save_steps": 500,
5628
  "stateful_callbacks": {
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 2.4295432458697763,
6
  "eval_steps": 500,
7
+ "global_step": 2500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
4688
  "rewards/format_reward_fn/mean": 0.9545312523841858,
4689
  "rewards/format_reward_fn/std": 0.17224382143467665,
4690
  "step": 2496
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4691
  }
4692
  ],
4693
  "logging_steps": 16,
4694
  "max_steps": 10290,
4695
+ "num_input_tokens_seen": 203156199,
4696
  "num_train_epochs": 10,
4697
  "save_steps": 500,
4698
  "stateful_callbacks": {
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6080a1f8848f28921ef4ecb2b5afdd4a9c278c4ed3b854c9142c34b8d4a47201
3
  size 7313
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:98f983793f01e6ffd7237085f7a954c79829cc980c0aed4a2a0a2870848c9386
3
  size 7313