usr256864 commited on
Commit
b2973ba
·
verified ·
1 Parent(s): a50cd46

Upload folder using huggingface_hub

Browse files
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f57e4dcb3f5a711acbcc3c7e53ddd0c18fbb2b4ef4b23b449e5c904e498bc3dc
3
  size 16794200
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f49333bf8dc54c673cdb5d086454c1c2e11545428fdeab99e17f54d26dfd68b5
3
  size 16794200
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b1f901deb4dfbc90e0f17d381200df60a829d4d4758a8c79a836ad21e7f7dc65
3
  size 33664331
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:847f243c3b0717de9d7d84a818a5cca2780f187c631f0830647f9a4f32a025e5
3
  size 33664331
rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1054442ffa3c487011bb5ffe943ab64c59e9c897d796de64033c6b88c3eaa3ae
3
  size 16325
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:64809719cba422cef96de4920261da9523f70c17a2b1212c597d67f1f7a800be
3
  size 16325
rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f66b4bfdd61eb727ab7daf786d19f5abbb048715af877262ed90c06b46f52ca8
3
  size 16389
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eebd48be5f45b09ad66e61b713e78075a6ed9758b0743b2ba14d0149a9d32663
3
  size 16389
rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b1413ba4d11754afc68f246446acb9890c1d18acc37fc00f32a6d46c26cce616
3
  size 16389
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ce6b5432b3dd7ce01917cc36abf764e4593e128309e145991b4505505b34ea51
3
  size 16389
rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:93c6534ad05b1cafddd02600c598b0b0d3c6093f5caa987ccd55226851603e9a
3
  size 16389
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ccd4238203b96310504fffc3e0300303148399ae449f5a3c1223bd19096fe34f
3
  size 16389
rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:20029f71d91e48ecebbc254932e22147cad9394de6f6fbf0dd6e10938583be80
3
  size 16389
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e7aad592eab42dab56e49c09184b6b56297b879407f0b6182f333c4c43d53d1a
3
  size 16389
rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2630ad25730bd50bda773ff2f680e37431047865906cce6b7f5a9e51bb6ab06b
3
  size 16389
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:02690e58d4566e8b28fca4807f93c1b9ed552cda61c2c7266f83fc01375f9221
3
  size 16389
rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:09c8c6db7e29e809e136621644eac06a29e16345bf0a1c208e400e09dd883638
3
  size 16325
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3f847170fd131c7017195f78afedddbaa655b33607982525c46cb880859458f
3
  size 16325
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bab67597abea6433f221727b7999d5c1bd467cfb427135df9c63356ac44a538b
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ad77e042001d2e933aab4e1dccf5c99a07800e969ec3c1619623eadd806ef6c
3
  size 1465
trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.4577259475218658,
6
  "eval_steps": 500,
7
- "global_step": 1500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -2798,11 +2798,971 @@
2798
  "rewards/format_reward_fn/mean": 0.9190057702362537,
2799
  "rewards/format_reward_fn/std": 0.23810118879191577,
2800
  "step": 1488
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2801
  }
2802
  ],
2803
  "logging_steps": 16,
2804
  "max_steps": 10290,
2805
- "num_input_tokens_seen": 120551388,
2806
  "num_train_epochs": 10,
2807
  "save_steps": 500,
2808
  "stateful_callbacks": {
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.943634596695821,
6
  "eval_steps": 500,
7
+ "global_step": 2000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
2798
  "rewards/format_reward_fn/mean": 0.9190057702362537,
2799
  "rewards/format_reward_fn/std": 0.23810118879191577,
2800
  "step": 1488
2801
+ },
2802
+ {
2803
+ "clip_ratio/high_max": 0.0,
2804
+ "clip_ratio/high_mean": 0.0,
2805
+ "clip_ratio/low_mean": 0.0,
2806
+ "clip_ratio/low_min": 0.0,
2807
+ "clip_ratio/region_mean": 0.0,
2808
+ "completions/clipped_ratio": 0.07421875,
2809
+ "completions/max_length": 253.5,
2810
+ "completions/max_terminated_length": 244.625,
2811
+ "completions/mean_length": 190.771484375,
2812
+ "completions/mean_terminated_length": 185.8874397277832,
2813
+ "completions/min_length": 140.1875,
2814
+ "completions/min_terminated_length": 140.1875,
2815
+ "entropy": 0.0769493873231113,
2816
+ "epoch": 1.4616132167152576,
2817
+ "frac_reward_zero_std": 0.27734375,
2818
+ "grad_norm": 0.13221606612205505,
2819
+ "learning_rate": 5e-05,
2820
+ "loss": -0.0052,
2821
+ "num_tokens": 120878670.0,
2822
+ "reward": 11.72208970785141,
2823
+ "reward_std": 0.9891778491437435,
2824
+ "rewards/bm25_retrieval_reward_fn/mean": 0.865969829261303,
2825
+ "rewards/bm25_retrieval_reward_fn/std": 0.3025930265430361,
2826
+ "rewards/event_reward_fn/mean": 9.978515625,
2827
+ "rewards/event_reward_fn/std": 6.088510304689407,
2828
+ "rewards/format_reward_fn/mean": 0.8776041679084301,
2829
+ "rewards/format_reward_fn/std": 0.30370487459003925,
2830
+ "step": 1504
2831
+ },
2832
+ {
2833
+ "clip_ratio/high_max": 0.0,
2834
+ "clip_ratio/high_mean": 0.0,
2835
+ "clip_ratio/low_mean": 0.0,
2836
+ "clip_ratio/low_min": 0.0,
2837
+ "clip_ratio/region_mean": 0.0,
2838
+ "completions/clipped_ratio": 0.09375,
2839
+ "completions/max_length": 256.0,
2840
+ "completions/max_terminated_length": 250.875,
2841
+ "completions/mean_length": 210.828125,
2842
+ "completions/mean_terminated_length": 206.00187873840332,
2843
+ "completions/min_length": 159.9375,
2844
+ "completions/min_terminated_length": 159.9375,
2845
+ "entropy": 0.09037951100617647,
2846
+ "epoch": 1.4771622934888242,
2847
+ "frac_reward_zero_std": 0.31640625,
2848
+ "grad_norm": 0.303564190864563,
2849
+ "learning_rate": 5e-05,
2850
+ "loss": -0.0015,
2851
+ "num_tokens": 122164070.0,
2852
+ "reward": 11.119612038135529,
2853
+ "reward_std": 0.99767005443573,
2854
+ "rewards/bm25_retrieval_reward_fn/mean": 0.8756668232381344,
2855
+ "rewards/bm25_retrieval_reward_fn/std": 0.29590372927486897,
2856
+ "rewards/event_reward_fn/mean": 9.3515625,
2857
+ "rewards/event_reward_fn/std": 5.329805389046669,
2858
+ "rewards/format_reward_fn/mean": 0.8923828117549419,
2859
+ "rewards/format_reward_fn/std": 0.30003819055855274,
2860
+ "step": 1520
2861
+ },
2862
+ {
2863
+ "clip_ratio/high_max": 0.0,
2864
+ "clip_ratio/high_mean": 0.0,
2865
+ "clip_ratio/low_mean": 0.0,
2866
+ "clip_ratio/low_min": 0.0,
2867
+ "clip_ratio/region_mean": 0.0,
2868
+ "completions/clipped_ratio": 0.1669921875,
2869
+ "completions/max_length": 256.0,
2870
+ "completions/max_terminated_length": 253.0,
2871
+ "completions/mean_length": 215.0888671875,
2872
+ "completions/mean_terminated_length": 206.6857042312622,
2873
+ "completions/min_length": 164.4375,
2874
+ "completions/min_terminated_length": 164.4375,
2875
+ "entropy": 0.09090339438989758,
2876
+ "epoch": 1.4927113702623906,
2877
+ "frac_reward_zero_std": 0.3046875,
2878
+ "grad_norm": 0.16249267756938934,
2879
+ "learning_rate": 5e-05,
2880
+ "loss": 0.0016,
2881
+ "num_tokens": 123527081.0,
2882
+ "reward": 10.766064465045929,
2883
+ "reward_std": 0.8386576101183891,
2884
+ "rewards/bm25_retrieval_reward_fn/mean": 0.7968913167715073,
2885
+ "rewards/bm25_retrieval_reward_fn/std": 0.3705411199480295,
2886
+ "rewards/event_reward_fn/mean": 9.1552734375,
2887
+ "rewards/event_reward_fn/std": 5.637863516807556,
2888
+ "rewards/format_reward_fn/mean": 0.8138997405767441,
2889
+ "rewards/format_reward_fn/std": 0.3759169615805149,
2890
+ "step": 1536
2891
+ },
2892
+ {
2893
+ "clip_ratio/high_max": 0.0,
2894
+ "clip_ratio/high_mean": 0.0,
2895
+ "clip_ratio/low_mean": 0.0,
2896
+ "clip_ratio/low_min": 0.0,
2897
+ "clip_ratio/region_mean": 0.0,
2898
+ "completions/clipped_ratio": 0.1435546875,
2899
+ "completions/max_length": 255.375,
2900
+ "completions/max_terminated_length": 250.375,
2901
+ "completions/mean_length": 212.8251953125,
2902
+ "completions/mean_terminated_length": 205.47227001190186,
2903
+ "completions/min_length": 157.5625,
2904
+ "completions/min_terminated_length": 157.5625,
2905
+ "entropy": 0.10008962173014879,
2906
+ "epoch": 1.508260447035957,
2907
+ "frac_reward_zero_std": 0.265625,
2908
+ "grad_norm": 0.23113620281219482,
2909
+ "learning_rate": 5e-05,
2910
+ "loss": 0.004,
2911
+ "num_tokens": 124865830.0,
2912
+ "reward": 10.332128584384918,
2913
+ "reward_std": 1.082621719688177,
2914
+ "rewards/bm25_retrieval_reward_fn/mean": 0.8261714465916157,
2915
+ "rewards/bm25_retrieval_reward_fn/std": 0.3391446927562356,
2916
+ "rewards/event_reward_fn/mean": 8.6630859375,
2917
+ "rewards/event_reward_fn/std": 5.3445031344890594,
2918
+ "rewards/format_reward_fn/mean": 0.8428710959851742,
2919
+ "rewards/format_reward_fn/std": 0.34356776159256697,
2920
+ "step": 1552
2921
+ },
2922
+ {
2923
+ "clip_ratio/high_max": 0.0,
2924
+ "clip_ratio/high_mean": 0.0,
2925
+ "clip_ratio/low_mean": 0.0,
2926
+ "clip_ratio/low_min": 0.0,
2927
+ "clip_ratio/region_mean": 0.0,
2928
+ "completions/clipped_ratio": 0.1103515625,
2929
+ "completions/max_length": 255.125,
2930
+ "completions/max_terminated_length": 247.875,
2931
+ "completions/mean_length": 204.4716796875,
2932
+ "completions/mean_terminated_length": 198.4902868270874,
2933
+ "completions/min_length": 149.9375,
2934
+ "completions/min_terminated_length": 149.9375,
2935
+ "entropy": 0.09716548025608063,
2936
+ "epoch": 1.5238095238095237,
2937
+ "frac_reward_zero_std": 0.30859375,
2938
+ "grad_norm": 0.13532325625419617,
2939
+ "learning_rate": 5e-05,
2940
+ "loss": 0.0014,
2941
+ "num_tokens": 126156049.0,
2942
+ "reward": 9.934103816747665,
2943
+ "reward_std": 0.9690110310912132,
2944
+ "rewards/bm25_retrieval_reward_fn/mean": 0.8615453615784645,
2945
+ "rewards/bm25_retrieval_reward_fn/std": 0.2830730821006,
2946
+ "rewards/event_reward_fn/mean": 8.1953125,
2947
+ "rewards/event_reward_fn/std": 4.997192412614822,
2948
+ "rewards/format_reward_fn/mean": 0.8772460930049419,
2949
+ "rewards/format_reward_fn/std": 0.27953232545405626,
2950
+ "step": 1568
2951
+ },
2952
+ {
2953
+ "clip_ratio/high_max": 0.0,
2954
+ "clip_ratio/high_mean": 0.0,
2955
+ "clip_ratio/low_mean": 0.0,
2956
+ "clip_ratio/low_min": 0.0,
2957
+ "clip_ratio/region_mean": 0.0,
2958
+ "completions/clipped_ratio": 0.0634765625,
2959
+ "completions/max_length": 253.625,
2960
+ "completions/max_terminated_length": 241.8125,
2961
+ "completions/mean_length": 191.8193359375,
2962
+ "completions/mean_terminated_length": 187.4044713973999,
2963
+ "completions/min_length": 134.4375,
2964
+ "completions/min_terminated_length": 134.4375,
2965
+ "entropy": 0.08724062331020832,
2966
+ "epoch": 1.5393586005830904,
2967
+ "frac_reward_zero_std": 0.30078125,
2968
+ "grad_norm": 0.13813965022563934,
2969
+ "learning_rate": 5e-05,
2970
+ "loss": -0.0036,
2971
+ "num_tokens": 127483244.0,
2972
+ "reward": 11.109964549541473,
2973
+ "reward_std": 0.9232164584100246,
2974
+ "rewards/bm25_retrieval_reward_fn/mean": 0.9082068763673306,
2975
+ "rewards/bm25_retrieval_reward_fn/std": 0.25597723573446274,
2976
+ "rewards/event_reward_fn/mean": 9.279296875,
2977
+ "rewards/event_reward_fn/std": 5.3837059289216995,
2978
+ "rewards/format_reward_fn/mean": 0.9224609360098839,
2979
+ "rewards/format_reward_fn/std": 0.2576202508062124,
2980
+ "step": 1584
2981
+ },
2982
+ {
2983
+ "clip_ratio/high_max": 0.0,
2984
+ "clip_ratio/high_mean": 0.0,
2985
+ "clip_ratio/low_mean": 0.0,
2986
+ "clip_ratio/low_min": 0.0,
2987
+ "clip_ratio/region_mean": 0.0,
2988
+ "completions/clipped_ratio": 0.0458984375,
2989
+ "completions/max_length": 251.0,
2990
+ "completions/max_terminated_length": 242.375,
2991
+ "completions/mean_length": 193.9111328125,
2992
+ "completions/mean_terminated_length": 190.90945529937744,
2993
+ "completions/min_length": 148.625,
2994
+ "completions/min_terminated_length": 148.625,
2995
+ "entropy": 0.08152232086285949,
2996
+ "epoch": 1.554907677356657,
2997
+ "frac_reward_zero_std": 0.34375,
2998
+ "grad_norm": 0.35102641582489014,
2999
+ "learning_rate": 5e-05,
3000
+ "loss": -0.0013,
3001
+ "num_tokens": 128764293.0,
3002
+ "reward": 11.371211469173431,
3003
+ "reward_std": 0.8595849685370922,
3004
+ "rewards/bm25_retrieval_reward_fn/mean": 0.9204303659498692,
3005
+ "rewards/bm25_retrieval_reward_fn/std": 0.21888624806888402,
3006
+ "rewards/event_reward_fn/mean": 9.513671875,
3007
+ "rewards/event_reward_fn/std": 5.4597727209329605,
3008
+ "rewards/format_reward_fn/mean": 0.9371093735098839,
3009
+ "rewards/format_reward_fn/std": 0.2127007795497775,
3010
+ "step": 1600
3011
+ },
3012
+ {
3013
+ "clip_ratio/high_max": 0.0,
3014
+ "clip_ratio/high_mean": 0.0,
3015
+ "clip_ratio/low_mean": 0.0,
3016
+ "clip_ratio/low_min": 0.0,
3017
+ "clip_ratio/region_mean": 0.0,
3018
+ "completions/clipped_ratio": 0.091796875,
3019
+ "completions/max_length": 252.375,
3020
+ "completions/max_terminated_length": 245.375,
3021
+ "completions/mean_length": 200.150390625,
3022
+ "completions/mean_terminated_length": 194.58474922180176,
3023
+ "completions/min_length": 145.125,
3024
+ "completions/min_terminated_length": 145.125,
3025
+ "entropy": 0.08945442596450448,
3026
+ "epoch": 1.5704567541302237,
3027
+ "frac_reward_zero_std": 0.28125,
3028
+ "grad_norm": 0.11586015671491623,
3029
+ "learning_rate": 5e-05,
3030
+ "loss": -0.0005,
3031
+ "num_tokens": 130147811.0,
3032
+ "reward": 10.688360095024109,
3033
+ "reward_std": 0.8784848563373089,
3034
+ "rewards/bm25_retrieval_reward_fn/mean": 0.8703912869095802,
3035
+ "rewards/bm25_retrieval_reward_fn/std": 0.26367771509103477,
3036
+ "rewards/event_reward_fn/mean": 8.9267578125,
3037
+ "rewards/event_reward_fn/std": 5.635714888572693,
3038
+ "rewards/format_reward_fn/mean": 0.8912109360098839,
3039
+ "rewards/format_reward_fn/std": 0.2533010635524988,
3040
+ "step": 1616
3041
+ },
3042
+ {
3043
+ "clip_ratio/high_max": 0.0,
3044
+ "clip_ratio/high_mean": 0.0,
3045
+ "clip_ratio/low_mean": 0.0,
3046
+ "clip_ratio/low_min": 0.0,
3047
+ "clip_ratio/region_mean": 0.0,
3048
+ "completions/clipped_ratio": 0.07421875,
3049
+ "completions/max_length": 254.3125,
3050
+ "completions/max_terminated_length": 239.75,
3051
+ "completions/mean_length": 195.1494140625,
3052
+ "completions/mean_terminated_length": 190.32571697235107,
3053
+ "completions/min_length": 143.625,
3054
+ "completions/min_terminated_length": 143.625,
3055
+ "entropy": 0.08721820963546634,
3056
+ "epoch": 1.58600583090379,
3057
+ "frac_reward_zero_std": 0.328125,
3058
+ "grad_norm": 0.1575620472431183,
3059
+ "learning_rate": 5e-05,
3060
+ "loss": -0.0,
3061
+ "num_tokens": 131479512.0,
3062
+ "reward": 10.922975957393646,
3063
+ "reward_std": 0.7370323836803436,
3064
+ "rewards/bm25_retrieval_reward_fn/mean": 0.8828910291194916,
3065
+ "rewards/bm25_retrieval_reward_fn/std": 0.2897054869681597,
3066
+ "rewards/event_reward_fn/mean": 9.146484375,
3067
+ "rewards/event_reward_fn/std": 5.057717680931091,
3068
+ "rewards/format_reward_fn/mean": 0.8936002627015114,
3069
+ "rewards/format_reward_fn/std": 0.28906678687781096,
3070
+ "step": 1632
3071
+ },
3072
+ {
3073
+ "clip_ratio/high_max": 0.0,
3074
+ "clip_ratio/high_mean": 0.0,
3075
+ "clip_ratio/low_mean": 0.0,
3076
+ "clip_ratio/low_min": 0.0,
3077
+ "clip_ratio/region_mean": 0.0,
3078
+ "completions/clipped_ratio": 0.0703125,
3079
+ "completions/max_length": 254.0,
3080
+ "completions/max_terminated_length": 241.6875,
3081
+ "completions/mean_length": 198.5185546875,
3082
+ "completions/mean_terminated_length": 194.11609935760498,
3083
+ "completions/min_length": 149.1875,
3084
+ "completions/min_terminated_length": 149.1875,
3085
+ "entropy": 0.08794478559866548,
3086
+ "epoch": 1.6015549076773565,
3087
+ "frac_reward_zero_std": 0.34375,
3088
+ "grad_norm": 0.16397124528884888,
3089
+ "learning_rate": 5e-05,
3090
+ "loss": 0.001,
3091
+ "num_tokens": 132797395.0,
3092
+ "reward": 10.608273446559906,
3093
+ "reward_std": 0.8345479369163513,
3094
+ "rewards/bm25_retrieval_reward_fn/mean": 0.8834850341081619,
3095
+ "rewards/bm25_retrieval_reward_fn/std": 0.27482672582846135,
3096
+ "rewards/event_reward_fn/mean": 8.8251953125,
3097
+ "rewards/event_reward_fn/std": 5.233703002333641,
3098
+ "rewards/format_reward_fn/mean": 0.8995930962264538,
3099
+ "rewards/format_reward_fn/std": 0.27503635361790657,
3100
+ "step": 1648
3101
+ },
3102
+ {
3103
+ "clip_ratio/high_max": 0.0,
3104
+ "clip_ratio/high_mean": 0.0,
3105
+ "clip_ratio/low_mean": 0.0,
3106
+ "clip_ratio/low_min": 0.0,
3107
+ "clip_ratio/region_mean": 0.0,
3108
+ "completions/clipped_ratio": 0.064453125,
3109
+ "completions/max_length": 254.6875,
3110
+ "completions/max_terminated_length": 241.25,
3111
+ "completions/mean_length": 192.1123046875,
3112
+ "completions/mean_terminated_length": 187.6938066482544,
3113
+ "completions/min_length": 137.3125,
3114
+ "completions/min_terminated_length": 137.3125,
3115
+ "entropy": 0.08001765748485923,
3116
+ "epoch": 1.6171039844509232,
3117
+ "frac_reward_zero_std": 0.33203125,
3118
+ "grad_norm": 0.16833443939685822,
3119
+ "learning_rate": 5e-05,
3120
+ "loss": 0.0008,
3121
+ "num_tokens": 134069966.0,
3122
+ "reward": 11.113677322864532,
3123
+ "reward_std": 0.9350622501224279,
3124
+ "rewards/bm25_retrieval_reward_fn/mean": 0.9033686555922031,
3125
+ "rewards/bm25_retrieval_reward_fn/std": 0.2602922786027193,
3126
+ "rewards/event_reward_fn/mean": 9.2939453125,
3127
+ "rewards/event_reward_fn/std": 5.6752976179122925,
3128
+ "rewards/format_reward_fn/mean": 0.9163634702563286,
3129
+ "rewards/format_reward_fn/std": 0.2618194241076708,
3130
+ "step": 1664
3131
+ },
3132
+ {
3133
+ "clip_ratio/high_max": 0.0,
3134
+ "clip_ratio/high_mean": 0.0,
3135
+ "clip_ratio/low_mean": 0.0,
3136
+ "clip_ratio/low_min": 0.0,
3137
+ "clip_ratio/region_mean": 0.0,
3138
+ "completions/clipped_ratio": 0.06640625,
3139
+ "completions/max_length": 249.3125,
3140
+ "completions/max_terminated_length": 241.75,
3141
+ "completions/mean_length": 196.7158203125,
3142
+ "completions/mean_terminated_length": 192.56449699401855,
3143
+ "completions/min_length": 129.6875,
3144
+ "completions/min_terminated_length": 129.6875,
3145
+ "entropy": 0.08379031391814351,
3146
+ "epoch": 1.6326530612244898,
3147
+ "frac_reward_zero_std": 0.3125,
3148
+ "grad_norm": 0.14574581384658813,
3149
+ "learning_rate": 5e-05,
3150
+ "loss": -0.0047,
3151
+ "num_tokens": 135435439.0,
3152
+ "reward": 11.539310336112976,
3153
+ "reward_std": 0.9443789459764957,
3154
+ "rewards/bm25_retrieval_reward_fn/mean": 0.8967322260141373,
3155
+ "rewards/bm25_retrieval_reward_fn/std": 0.22321847162675112,
3156
+ "rewards/event_reward_fn/mean": 9.7314453125,
3157
+ "rewards/event_reward_fn/std": 5.278485506772995,
3158
+ "rewards/format_reward_fn/mean": 0.9111328125,
3159
+ "rewards/format_reward_fn/std": 0.21248832251876593,
3160
+ "step": 1680
3161
+ },
3162
+ {
3163
+ "clip_ratio/high_max": 0.0,
3164
+ "clip_ratio/high_mean": 0.0,
3165
+ "clip_ratio/low_mean": 0.0,
3166
+ "clip_ratio/low_min": 0.0,
3167
+ "clip_ratio/region_mean": 0.0,
3168
+ "completions/clipped_ratio": 0.0791015625,
3169
+ "completions/max_length": 254.8125,
3170
+ "completions/max_terminated_length": 244.8125,
3171
+ "completions/mean_length": 202.83203125,
3172
+ "completions/mean_terminated_length": 198.2507667541504,
3173
+ "completions/min_length": 150.625,
3174
+ "completions/min_terminated_length": 150.625,
3175
+ "entropy": 0.08855495927855372,
3176
+ "epoch": 1.6482021379980565,
3177
+ "frac_reward_zero_std": 0.3046875,
3178
+ "grad_norm": 0.17940281331539154,
3179
+ "learning_rate": 5e-05,
3180
+ "loss": -0.0056,
3181
+ "num_tokens": 136778675.0,
3182
+ "reward": 11.134308993816376,
3183
+ "reward_std": 0.9293302595615387,
3184
+ "rewards/bm25_retrieval_reward_fn/mean": 0.8743740394711494,
3185
+ "rewards/bm25_retrieval_reward_fn/std": 0.26636734034400433,
3186
+ "rewards/event_reward_fn/mean": 9.3662109375,
3187
+ "rewards/event_reward_fn/std": 5.85838320851326,
3188
+ "rewards/format_reward_fn/mean": 0.8937239646911621,
3189
+ "rewards/format_reward_fn/std": 0.2652863524854183,
3190
+ "step": 1696
3191
+ },
3192
+ {
3193
+ "clip_ratio/high_max": 0.0,
3194
+ "clip_ratio/high_mean": 0.0,
3195
+ "clip_ratio/low_mean": 0.0,
3196
+ "clip_ratio/low_min": 0.0,
3197
+ "clip_ratio/region_mean": 0.0,
3198
+ "completions/clipped_ratio": 0.138671875,
3199
+ "completions/max_length": 256.0,
3200
+ "completions/max_terminated_length": 251.6875,
3201
+ "completions/mean_length": 213.9638671875,
3202
+ "completions/mean_terminated_length": 207.2107219696045,
3203
+ "completions/min_length": 162.375,
3204
+ "completions/min_terminated_length": 162.375,
3205
+ "entropy": 0.09064092021435499,
3206
+ "epoch": 1.663751214771623,
3207
+ "frac_reward_zero_std": 0.3046875,
3208
+ "grad_norm": 0.15384909510612488,
3209
+ "learning_rate": 5e-05,
3210
+ "loss": 0.0029,
3211
+ "num_tokens": 138044578.0,
3212
+ "reward": 11.337530732154846,
3213
+ "reward_std": 0.9400022551417351,
3214
+ "rewards/bm25_retrieval_reward_fn/mean": 0.824542474001646,
3215
+ "rewards/bm25_retrieval_reward_fn/std": 0.33465168718248606,
3216
+ "rewards/event_reward_fn/mean": 9.6669921875,
3217
+ "rewards/event_reward_fn/std": 5.503222852945328,
3218
+ "rewards/format_reward_fn/mean": 0.8459960930049419,
3219
+ "rewards/format_reward_fn/std": 0.3366972776129842,
3220
+ "step": 1712
3221
+ },
3222
+ {
3223
+ "clip_ratio/high_max": 0.0,
3224
+ "clip_ratio/high_mean": 0.0,
3225
+ "clip_ratio/low_mean": 0.0,
3226
+ "clip_ratio/low_min": 0.0,
3227
+ "clip_ratio/region_mean": 0.0,
3228
+ "completions/clipped_ratio": 0.046875,
3229
+ "completions/max_length": 251.5625,
3230
+ "completions/max_terminated_length": 245.5625,
3231
+ "completions/mean_length": 203.677734375,
3232
+ "completions/mean_terminated_length": 201.15838241577148,
3233
+ "completions/min_length": 159.1875,
3234
+ "completions/min_terminated_length": 159.1875,
3235
+ "entropy": 0.08698790520429611,
3236
+ "epoch": 1.6793002915451893,
3237
+ "frac_reward_zero_std": 0.296875,
3238
+ "grad_norm": 0.11867301166057587,
3239
+ "learning_rate": 5e-05,
3240
+ "loss": 0.0003,
3241
+ "num_tokens": 139288124.0,
3242
+ "reward": 11.192306399345398,
3243
+ "reward_std": 0.9463471882045269,
3244
+ "rewards/bm25_retrieval_reward_fn/mean": 0.9286414235830307,
3245
+ "rewards/bm25_retrieval_reward_fn/std": 0.20035810582339764,
3246
+ "rewards/event_reward_fn/mean": 9.3193359375,
3247
+ "rewards/event_reward_fn/std": 5.200570702552795,
3248
+ "rewards/format_reward_fn/mean": 0.9443289637565613,
3249
+ "rewards/format_reward_fn/std": 0.19202105328440666,
3250
+ "step": 1728
3251
+ },
3252
+ {
3253
+ "clip_ratio/high_max": 0.0,
3254
+ "clip_ratio/high_mean": 0.0,
3255
+ "clip_ratio/low_mean": 0.0,
3256
+ "clip_ratio/low_min": 0.0,
3257
+ "clip_ratio/region_mean": 0.0,
3258
+ "completions/clipped_ratio": 0.0556640625,
3259
+ "completions/max_length": 252.125,
3260
+ "completions/max_terminated_length": 244.875,
3261
+ "completions/mean_length": 193.6708984375,
3262
+ "completions/mean_terminated_length": 189.9529905319214,
3263
+ "completions/min_length": 136.625,
3264
+ "completions/min_terminated_length": 136.625,
3265
+ "entropy": 0.08362232241779566,
3266
+ "epoch": 1.694849368318756,
3267
+ "frac_reward_zero_std": 0.33203125,
3268
+ "grad_norm": 0.11613152176141739,
3269
+ "learning_rate": 5e-05,
3270
+ "loss": -0.0019,
3271
+ "num_tokens": 140615163.0,
3272
+ "reward": 11.211718916893005,
3273
+ "reward_std": 0.8285622540861368,
3274
+ "rewards/bm25_retrieval_reward_fn/mean": 0.9016408734023571,
3275
+ "rewards/bm25_retrieval_reward_fn/std": 0.25260637141764164,
3276
+ "rewards/event_reward_fn/mean": 9.390625,
3277
+ "rewards/event_reward_fn/std": 5.310590535402298,
3278
+ "rewards/format_reward_fn/mean": 0.9194531291723251,
3279
+ "rewards/format_reward_fn/std": 0.251515906304121,
3280
+ "step": 1744
3281
+ },
3282
+ {
3283
+ "clip_ratio/high_max": 0.0,
3284
+ "clip_ratio/high_mean": 0.0,
3285
+ "clip_ratio/low_mean": 0.0,
3286
+ "clip_ratio/low_min": 0.0,
3287
+ "clip_ratio/region_mean": 0.0,
3288
+ "completions/clipped_ratio": 0.0751953125,
3289
+ "completions/max_length": 248.6875,
3290
+ "completions/max_terminated_length": 240.3125,
3291
+ "completions/mean_length": 194.3291015625,
3292
+ "completions/mean_terminated_length": 189.4942626953125,
3293
+ "completions/min_length": 139.25,
3294
+ "completions/min_terminated_length": 139.25,
3295
+ "entropy": 0.08920921664685011,
3296
+ "epoch": 1.7103984450923226,
3297
+ "frac_reward_zero_std": 0.265625,
3298
+ "grad_norm": 0.1495039016008377,
3299
+ "learning_rate": 5e-05,
3300
+ "loss": -0.0026,
3301
+ "num_tokens": 141995908.0,
3302
+ "reward": 11.331986844539642,
3303
+ "reward_std": 0.9946209099143744,
3304
+ "rewards/bm25_retrieval_reward_fn/mean": 0.8805676624178886,
3305
+ "rewards/bm25_retrieval_reward_fn/std": 0.2605485112289898,
3306
+ "rewards/event_reward_fn/mean": 9.5615234375,
3307
+ "rewards/event_reward_fn/std": 5.626507669687271,
3308
+ "rewards/format_reward_fn/mean": 0.8898958377540112,
3309
+ "rewards/format_reward_fn/std": 0.25742682348936796,
3310
+ "step": 1760
3311
+ },
3312
+ {
3313
+ "clip_ratio/high_max": 0.0,
3314
+ "clip_ratio/high_mean": 0.0,
3315
+ "clip_ratio/low_mean": 0.0,
3316
+ "clip_ratio/low_min": 0.0,
3317
+ "clip_ratio/region_mean": 0.0,
3318
+ "completions/clipped_ratio": 0.0556640625,
3319
+ "completions/max_length": 250.625,
3320
+ "completions/max_terminated_length": 242.25,
3321
+ "completions/mean_length": 194.5751953125,
3322
+ "completions/mean_terminated_length": 190.86159992218018,
3323
+ "completions/min_length": 141.5625,
3324
+ "completions/min_terminated_length": 141.5625,
3325
+ "entropy": 0.09618484182283282,
3326
+ "epoch": 1.7259475218658893,
3327
+ "frac_reward_zero_std": 0.26953125,
3328
+ "grad_norm": 0.20417290925979614,
3329
+ "learning_rate": 5e-05,
3330
+ "loss": -0.0055,
3331
+ "num_tokens": 143301673.0,
3332
+ "reward": 10.538148939609528,
3333
+ "reward_std": 0.9361699968576431,
3334
+ "rewards/bm25_retrieval_reward_fn/mean": 0.9052551127970219,
3335
+ "rewards/bm25_retrieval_reward_fn/std": 0.2257093784864992,
3336
+ "rewards/event_reward_fn/mean": 8.7158203125,
3337
+ "rewards/event_reward_fn/std": 4.607826009392738,
3338
+ "rewards/format_reward_fn/mean": 0.9170735664665699,
3339
+ "rewards/format_reward_fn/std": 0.22840105323120952,
3340
+ "step": 1776
3341
+ },
3342
+ {
3343
+ "clip_ratio/high_max": 0.0,
3344
+ "clip_ratio/high_mean": 0.0,
3345
+ "clip_ratio/low_mean": 0.0,
3346
+ "clip_ratio/low_min": 0.0,
3347
+ "clip_ratio/region_mean": 0.0,
3348
+ "completions/clipped_ratio": 0.0576171875,
3349
+ "completions/max_length": 252.8125,
3350
+ "completions/max_terminated_length": 244.4375,
3351
+ "completions/mean_length": 200.8662109375,
3352
+ "completions/mean_terminated_length": 197.39703178405762,
3353
+ "completions/min_length": 149.9375,
3354
+ "completions/min_terminated_length": 149.9375,
3355
+ "entropy": 0.08653424866497517,
3356
+ "epoch": 1.741496598639456,
3357
+ "frac_reward_zero_std": 0.3359375,
3358
+ "grad_norm": 0.14243784546852112,
3359
+ "learning_rate": 5e-05,
3360
+ "loss": 0.0011,
3361
+ "num_tokens": 144603412.0,
3362
+ "reward": 11.493825078010559,
3363
+ "reward_std": 0.8755283299833536,
3364
+ "rewards/bm25_retrieval_reward_fn/mean": 0.9135190099477768,
3365
+ "rewards/bm25_retrieval_reward_fn/std": 0.24690337451465894,
3366
+ "rewards/event_reward_fn/mean": 9.658203125,
3367
+ "rewards/event_reward_fn/std": 5.445283606648445,
3368
+ "rewards/format_reward_fn/mean": 0.922102864831686,
3369
+ "rewards/format_reward_fn/std": 0.2481938637793064,
3370
+ "step": 1792
3371
+ },
3372
+ {
3373
+ "clip_ratio/high_max": 0.0,
3374
+ "clip_ratio/high_mean": 0.0,
3375
+ "clip_ratio/low_mean": 0.0,
3376
+ "clip_ratio/low_min": 0.0,
3377
+ "clip_ratio/region_mean": 0.0,
3378
+ "completions/clipped_ratio": 0.0556640625,
3379
+ "completions/max_length": 252.0625,
3380
+ "completions/max_terminated_length": 246.75,
3381
+ "completions/mean_length": 196.5146484375,
3382
+ "completions/mean_terminated_length": 193.12859344482422,
3383
+ "completions/min_length": 140.8125,
3384
+ "completions/min_terminated_length": 140.8125,
3385
+ "entropy": 0.08316960139200091,
3386
+ "epoch": 1.7570456754130224,
3387
+ "frac_reward_zero_std": 0.3125,
3388
+ "grad_norm": 0.109793521463871,
3389
+ "learning_rate": 5e-05,
3390
+ "loss": -0.0022,
3391
+ "num_tokens": 145919099.0,
3392
+ "reward": 11.740033328533173,
3393
+ "reward_std": 0.9224549978971481,
3394
+ "rewards/bm25_retrieval_reward_fn/mean": 0.9150333367288113,
3395
+ "rewards/bm25_retrieval_reward_fn/std": 0.24498367216438055,
3396
+ "rewards/event_reward_fn/mean": 9.90234375,
3397
+ "rewards/event_reward_fn/std": 5.425331294536591,
3398
+ "rewards/format_reward_fn/mean": 0.9226562492549419,
3399
+ "rewards/format_reward_fn/std": 0.24409929476678371,
3400
+ "step": 1808
3401
+ },
3402
+ {
3403
+ "clip_ratio/high_max": 0.0,
3404
+ "clip_ratio/high_mean": 0.0,
3405
+ "clip_ratio/low_mean": 0.0,
3406
+ "clip_ratio/low_min": 0.0,
3407
+ "clip_ratio/region_mean": 0.0,
3408
+ "completions/clipped_ratio": 0.072265625,
3409
+ "completions/max_length": 254.75,
3410
+ "completions/max_terminated_length": 249.3125,
3411
+ "completions/mean_length": 208.4755859375,
3412
+ "completions/mean_terminated_length": 204.96116065979004,
3413
+ "completions/min_length": 160.5625,
3414
+ "completions/min_terminated_length": 160.5625,
3415
+ "entropy": 0.08932856796309352,
3416
+ "epoch": 1.7725947521865888,
3417
+ "frac_reward_zero_std": 0.3359375,
3418
+ "grad_norm": 0.24029314517974854,
3419
+ "learning_rate": 5e-05,
3420
+ "loss": 0.0045,
3421
+ "num_tokens": 147275350.0,
3422
+ "reward": 11.133660674095154,
3423
+ "reward_std": 1.0420608818531036,
3424
+ "rewards/bm25_retrieval_reward_fn/mean": 0.8887974470853806,
3425
+ "rewards/bm25_retrieval_reward_fn/std": 0.2559172356268391,
3426
+ "rewards/event_reward_fn/mean": 9.34375,
3427
+ "rewards/event_reward_fn/std": 5.51551166176796,
3428
+ "rewards/format_reward_fn/mean": 0.9011132828891277,
3429
+ "rewards/format_reward_fn/std": 0.2507179146632552,
3430
+ "step": 1824
3431
+ },
3432
+ {
3433
+ "clip_ratio/high_max": 0.0,
3434
+ "clip_ratio/high_mean": 0.0,
3435
+ "clip_ratio/low_mean": 0.0,
3436
+ "clip_ratio/low_min": 0.0,
3437
+ "clip_ratio/region_mean": 0.0,
3438
+ "completions/clipped_ratio": 0.046875,
3439
+ "completions/max_length": 252.375,
3440
+ "completions/max_terminated_length": 240.875,
3441
+ "completions/mean_length": 198.576171875,
3442
+ "completions/mean_terminated_length": 195.7430601119995,
3443
+ "completions/min_length": 151.25,
3444
+ "completions/min_terminated_length": 151.25,
3445
+ "entropy": 0.08435806119814515,
3446
+ "epoch": 1.7881438289601554,
3447
+ "frac_reward_zero_std": 0.36328125,
3448
+ "grad_norm": 0.13869501650333405,
3449
+ "learning_rate": 5e-05,
3450
+ "loss": -0.0026,
3451
+ "num_tokens": 148539184.0,
3452
+ "reward": 11.031599402427673,
3453
+ "reward_std": 0.7965468689799309,
3454
+ "rewards/bm25_retrieval_reward_fn/mean": 0.9208246804773808,
3455
+ "rewards/bm25_retrieval_reward_fn/std": 0.2361440734239295,
3456
+ "rewards/event_reward_fn/mean": 9.1796875,
3457
+ "rewards/event_reward_fn/std": 4.907300844788551,
3458
+ "rewards/format_reward_fn/mean": 0.9310872405767441,
3459
+ "rewards/format_reward_fn/std": 0.2394925099797547,
3460
+ "step": 1840
3461
+ },
3462
+ {
3463
+ "clip_ratio/high_max": 0.0,
3464
+ "clip_ratio/high_mean": 0.0,
3465
+ "clip_ratio/low_mean": 0.0,
3466
+ "clip_ratio/low_min": 0.0,
3467
+ "clip_ratio/region_mean": 0.0,
3468
+ "completions/clipped_ratio": 0.0380859375,
3469
+ "completions/max_length": 252.5,
3470
+ "completions/max_terminated_length": 250.125,
3471
+ "completions/mean_length": 202.90234375,
3472
+ "completions/mean_terminated_length": 200.88229370117188,
3473
+ "completions/min_length": 155.875,
3474
+ "completions/min_terminated_length": 155.875,
3475
+ "entropy": 0.08052209811285138,
3476
+ "epoch": 1.803692905733722,
3477
+ "frac_reward_zero_std": 0.35546875,
3478
+ "grad_norm": 0.1878909021615982,
3479
+ "learning_rate": 5e-05,
3480
+ "loss": 0.001,
3481
+ "num_tokens": 149840940.0,
3482
+ "reward": 10.959127485752106,
3483
+ "reward_std": 0.9578492008149624,
3484
+ "rewards/bm25_retrieval_reward_fn/mean": 0.9300258904695511,
3485
+ "rewards/bm25_retrieval_reward_fn/std": 0.18775073438882828,
3486
+ "rewards/event_reward_fn/mean": 9.0859375,
3487
+ "rewards/event_reward_fn/std": 5.149698540568352,
3488
+ "rewards/format_reward_fn/mean": 0.9431640617549419,
3489
+ "rewards/format_reward_fn/std": 0.17401384096592665,
3490
+ "step": 1856
3491
+ },
3492
+ {
3493
+ "clip_ratio/high_max": 0.0,
3494
+ "clip_ratio/high_mean": 0.0,
3495
+ "clip_ratio/low_mean": 0.0,
3496
+ "clip_ratio/low_min": 0.0,
3497
+ "clip_ratio/region_mean": 0.0,
3498
+ "completions/clipped_ratio": 0.09765625,
3499
+ "completions/max_length": 255.9375,
3500
+ "completions/max_terminated_length": 248.875,
3501
+ "completions/mean_length": 211.8564453125,
3502
+ "completions/mean_terminated_length": 207.1767454147339,
3503
+ "completions/min_length": 164.5625,
3504
+ "completions/min_terminated_length": 164.5625,
3505
+ "entropy": 0.08094025542959571,
3506
+ "epoch": 1.8192419825072887,
3507
+ "frac_reward_zero_std": 0.34765625,
3508
+ "grad_norm": 0.14807139337062836,
3509
+ "learning_rate": 5e-05,
3510
+ "loss": 0.0023,
3511
+ "num_tokens": 151221749.0,
3512
+ "reward": 11.752990126609802,
3513
+ "reward_std": 0.9537594802677631,
3514
+ "rewards/bm25_retrieval_reward_fn/mean": 0.8724758252501488,
3515
+ "rewards/bm25_retrieval_reward_fn/std": 0.29270493309013546,
3516
+ "rewards/event_reward_fn/mean": 9.9970703125,
3517
+ "rewards/event_reward_fn/std": 5.857491314411163,
3518
+ "rewards/format_reward_fn/mean": 0.8834440112113953,
3519
+ "rewards/format_reward_fn/std": 0.29244135320186615,
3520
+ "step": 1872
3521
+ },
3522
+ {
3523
+ "clip_ratio/high_max": 0.0,
3524
+ "clip_ratio/high_mean": 0.0,
3525
+ "clip_ratio/low_mean": 0.0,
3526
+ "clip_ratio/low_min": 0.0,
3527
+ "clip_ratio/region_mean": 0.0,
3528
+ "completions/clipped_ratio": 0.0546875,
3529
+ "completions/max_length": 254.3125,
3530
+ "completions/max_terminated_length": 250.25,
3531
+ "completions/mean_length": 208.7001953125,
3532
+ "completions/mean_terminated_length": 206.05935287475586,
3533
+ "completions/min_length": 159.125,
3534
+ "completions/min_terminated_length": 159.125,
3535
+ "entropy": 0.08766834484413266,
3536
+ "epoch": 1.8347910592808552,
3537
+ "frac_reward_zero_std": 0.34375,
3538
+ "grad_norm": 0.17317424714565277,
3539
+ "learning_rate": 5e-05,
3540
+ "loss": -0.0004,
3541
+ "num_tokens": 152521294.0,
3542
+ "reward": 11.364756107330322,
3543
+ "reward_std": 0.9098326228559017,
3544
+ "rewards/bm25_retrieval_reward_fn/mean": 0.8889735676348209,
3545
+ "rewards/bm25_retrieval_reward_fn/std": 0.2667266938369721,
3546
+ "rewards/event_reward_fn/mean": 9.568359375,
3547
+ "rewards/event_reward_fn/std": 5.424193903803825,
3548
+ "rewards/format_reward_fn/mean": 0.9074231162667274,
3549
+ "rewards/format_reward_fn/std": 0.2601332040503621,
3550
+ "step": 1888
3551
+ },
3552
+ {
3553
+ "clip_ratio/high_max": 0.0,
3554
+ "clip_ratio/high_mean": 0.0,
3555
+ "clip_ratio/low_mean": 0.0,
3556
+ "clip_ratio/low_min": 0.0,
3557
+ "clip_ratio/region_mean": 0.0,
3558
+ "completions/clipped_ratio": 0.0478515625,
3559
+ "completions/max_length": 253.1875,
3560
+ "completions/max_terminated_length": 248.625,
3561
+ "completions/mean_length": 205.2333984375,
3562
+ "completions/mean_terminated_length": 202.61692428588867,
3563
+ "completions/min_length": 158.0625,
3564
+ "completions/min_terminated_length": 158.0625,
3565
+ "entropy": 0.0873062857426703,
3566
+ "epoch": 1.8503401360544216,
3567
+ "frac_reward_zero_std": 0.3359375,
3568
+ "grad_norm": 0.16510100662708282,
3569
+ "learning_rate": 5e-05,
3570
+ "loss": 0.0008,
3571
+ "num_tokens": 153814765.0,
3572
+ "reward": 10.840591430664062,
3573
+ "reward_std": 0.806601133197546,
3574
+ "rewards/bm25_retrieval_reward_fn/mean": 0.8800444230437279,
3575
+ "rewards/bm25_retrieval_reward_fn/std": 0.2592724412679672,
3576
+ "rewards/event_reward_fn/mean": 9.05078125,
3577
+ "rewards/event_reward_fn/std": 5.017846331000328,
3578
+ "rewards/format_reward_fn/mean": 0.9097656235098839,
3579
+ "rewards/format_reward_fn/std": 0.2350642140954733,
3580
+ "step": 1904
3581
+ },
3582
+ {
3583
+ "clip_ratio/high_max": 0.0,
3584
+ "clip_ratio/high_mean": 0.0,
3585
+ "clip_ratio/low_mean": 0.0,
3586
+ "clip_ratio/low_min": 0.0,
3587
+ "clip_ratio/region_mean": 0.0,
3588
+ "completions/clipped_ratio": 0.0478515625,
3589
+ "completions/max_length": 250.6875,
3590
+ "completions/max_terminated_length": 246.625,
3591
+ "completions/mean_length": 199.6025390625,
3592
+ "completions/mean_terminated_length": 196.8941469192505,
3593
+ "completions/min_length": 146.75,
3594
+ "completions/min_terminated_length": 146.75,
3595
+ "entropy": 0.08616631478071213,
3596
+ "epoch": 1.8658892128279883,
3597
+ "frac_reward_zero_std": 0.328125,
3598
+ "grad_norm": 0.10711020976305008,
3599
+ "learning_rate": 5e-05,
3600
+ "loss": 0.003,
3601
+ "num_tokens": 155159530.0,
3602
+ "reward": 11.430678129196167,
3603
+ "reward_std": 0.7845460455864668,
3604
+ "rewards/bm25_retrieval_reward_fn/mean": 0.9038552716374397,
3605
+ "rewards/bm25_retrieval_reward_fn/std": 0.21465440141037107,
3606
+ "rewards/event_reward_fn/mean": 9.5986328125,
3607
+ "rewards/event_reward_fn/std": 5.15682627260685,
3608
+ "rewards/format_reward_fn/mean": 0.928190104663372,
3609
+ "rewards/format_reward_fn/std": 0.19767758785746992,
3610
+ "step": 1920
3611
+ },
3612
+ {
3613
+ "clip_ratio/high_max": 0.0,
3614
+ "clip_ratio/high_mean": 0.0,
3615
+ "clip_ratio/low_mean": 0.0,
3616
+ "clip_ratio/low_min": 0.0,
3617
+ "clip_ratio/region_mean": 0.0,
3618
+ "completions/clipped_ratio": 0.0625,
3619
+ "completions/max_length": 253.1875,
3620
+ "completions/max_terminated_length": 246.6875,
3621
+ "completions/mean_length": 200.734375,
3622
+ "completions/mean_terminated_length": 197.25225925445557,
3623
+ "completions/min_length": 150.1875,
3624
+ "completions/min_terminated_length": 150.1875,
3625
+ "entropy": 0.08771243086084723,
3626
+ "epoch": 1.881438289601555,
3627
+ "frac_reward_zero_std": 0.27734375,
3628
+ "grad_norm": 0.119595006108284,
3629
+ "learning_rate": 5e-05,
3630
+ "loss": -0.001,
3631
+ "num_tokens": 156525614.0,
3632
+ "reward": 11.453014373779297,
3633
+ "reward_std": 1.111331295222044,
3634
+ "rewards/bm25_retrieval_reward_fn/mean": 0.8604980707168579,
3635
+ "rewards/bm25_retrieval_reward_fn/std": 0.2871107269311324,
3636
+ "rewards/event_reward_fn/mean": 9.708984375,
3637
+ "rewards/event_reward_fn/std": 5.215842500329018,
3638
+ "rewards/format_reward_fn/mean": 0.8835319019854069,
3639
+ "rewards/format_reward_fn/std": 0.2829501121304929,
3640
+ "step": 1936
3641
+ },
3642
+ {
3643
+ "clip_ratio/high_max": 0.0,
3644
+ "clip_ratio/high_mean": 0.0,
3645
+ "clip_ratio/low_mean": 0.0,
3646
+ "clip_ratio/low_min": 0.0,
3647
+ "clip_ratio/region_mean": 0.0,
3648
+ "completions/clipped_ratio": 0.021484375,
3649
+ "completions/max_length": 250.25,
3650
+ "completions/max_terminated_length": 245.8125,
3651
+ "completions/mean_length": 194.017578125,
3652
+ "completions/mean_terminated_length": 192.65657711029053,
3653
+ "completions/min_length": 140.6875,
3654
+ "completions/min_terminated_length": 140.6875,
3655
+ "entropy": 0.08764936728402972,
3656
+ "epoch": 1.8969873663751216,
3657
+ "frac_reward_zero_std": 0.28515625,
3658
+ "grad_norm": 0.1511303335428238,
3659
+ "learning_rate": 5e-05,
3660
+ "loss": 0.0032,
3661
+ "num_tokens": 157786400.0,
3662
+ "reward": 10.632731199264526,
3663
+ "reward_std": 0.9243863355368376,
3664
+ "rewards/bm25_retrieval_reward_fn/mean": 0.9324383623898029,
3665
+ "rewards/bm25_retrieval_reward_fn/std": 0.18536719167605042,
3666
+ "rewards/event_reward_fn/mean": 8.7451171875,
3667
+ "rewards/event_reward_fn/std": 5.235057607293129,
3668
+ "rewards/format_reward_fn/mean": 0.9551757834851742,
3669
+ "rewards/format_reward_fn/std": 0.16272677155211568,
3670
+ "step": 1952
3671
+ },
3672
+ {
3673
+ "clip_ratio/high_max": 0.0,
3674
+ "clip_ratio/high_mean": 0.0,
3675
+ "clip_ratio/low_mean": 0.0,
3676
+ "clip_ratio/low_min": 0.0,
3677
+ "clip_ratio/region_mean": 0.0,
3678
+ "completions/clipped_ratio": 0.046875,
3679
+ "completions/max_length": 254.125,
3680
+ "completions/max_terminated_length": 250.6875,
3681
+ "completions/mean_length": 200.95703125,
3682
+ "completions/mean_terminated_length": 198.30670166015625,
3683
+ "completions/min_length": 154.6875,
3684
+ "completions/min_terminated_length": 154.6875,
3685
+ "entropy": 0.08874167408794165,
3686
+ "epoch": 1.9125364431486882,
3687
+ "frac_reward_zero_std": 0.33984375,
3688
+ "grad_norm": 0.17749741673469543,
3689
+ "learning_rate": 5e-05,
3690
+ "loss": 0.0037,
3691
+ "num_tokens": 159093888.0,
3692
+ "reward": 11.275705397129059,
3693
+ "reward_std": 0.856599148362875,
3694
+ "rewards/bm25_retrieval_reward_fn/mean": 0.9131891131401062,
3695
+ "rewards/bm25_retrieval_reward_fn/std": 0.2150915495294612,
3696
+ "rewards/event_reward_fn/mean": 9.4345703125,
3697
+ "rewards/event_reward_fn/std": 5.729633465409279,
3698
+ "rewards/format_reward_fn/mean": 0.927945964038372,
3699
+ "rewards/format_reward_fn/std": 0.2096583191305399,
3700
+ "step": 1968
3701
+ },
3702
+ {
3703
+ "clip_ratio/high_max": 0.0,
3704
+ "clip_ratio/high_mean": 0.0,
3705
+ "clip_ratio/low_mean": 0.0,
3706
+ "clip_ratio/low_min": 0.0,
3707
+ "clip_ratio/region_mean": 0.0,
3708
+ "completions/clipped_ratio": 0.041015625,
3709
+ "completions/max_length": 253.875,
3710
+ "completions/max_terminated_length": 246.5625,
3711
+ "completions/mean_length": 200.9677734375,
3712
+ "completions/mean_terminated_length": 198.57855701446533,
3713
+ "completions/min_length": 142.8125,
3714
+ "completions/min_terminated_length": 142.8125,
3715
+ "entropy": 0.0850910097360611,
3716
+ "epoch": 1.9280855199222546,
3717
+ "frac_reward_zero_std": 0.375,
3718
+ "grad_norm": 0.12046821415424347,
3719
+ "learning_rate": 5e-05,
3720
+ "loss": 0.0007,
3721
+ "num_tokens": 160340479.0,
3722
+ "reward": 10.719317555427551,
3723
+ "reward_std": 0.8128865994513035,
3724
+ "rewards/bm25_retrieval_reward_fn/mean": 0.9061989188194275,
3725
+ "rewards/bm25_retrieval_reward_fn/std": 0.24043723253998905,
3726
+ "rewards/event_reward_fn/mean": 8.892578125,
3727
+ "rewards/event_reward_fn/std": 5.485840782523155,
3728
+ "rewards/format_reward_fn/mean": 0.9205403625965118,
3729
+ "rewards/format_reward_fn/std": 0.2409290496725589,
3730
+ "step": 1984
3731
+ },
3732
+ {
3733
+ "clip_ratio/high_max": 0.0,
3734
+ "clip_ratio/high_mean": 0.0,
3735
+ "clip_ratio/low_mean": 0.0,
3736
+ "clip_ratio/low_min": 0.0,
3737
+ "clip_ratio/region_mean": 0.0,
3738
+ "completions/clipped_ratio": 0.12109375,
3739
+ "completions/max_length": 256.0,
3740
+ "completions/max_terminated_length": 251.125,
3741
+ "completions/mean_length": 213.712890625,
3742
+ "completions/mean_terminated_length": 207.85150337219238,
3743
+ "completions/min_length": 160.9375,
3744
+ "completions/min_terminated_length": 160.9375,
3745
+ "entropy": 0.08267078269273043,
3746
+ "epoch": 1.943634596695821,
3747
+ "frac_reward_zero_std": 0.32421875,
3748
+ "grad_norm": 0.09311431646347046,
3749
+ "learning_rate": 5e-05,
3750
+ "loss": 0.0044,
3751
+ "num_tokens": 161744217.0,
3752
+ "reward": 10.99679410457611,
3753
+ "reward_std": 0.9773008767515421,
3754
+ "rewards/bm25_retrieval_reward_fn/mean": 0.856908455491066,
3755
+ "rewards/bm25_retrieval_reward_fn/std": 0.3204036271199584,
3756
+ "rewards/event_reward_fn/mean": 9.2744140625,
3757
+ "rewards/event_reward_fn/std": 5.77374792098999,
3758
+ "rewards/format_reward_fn/mean": 0.8654715418815613,
3759
+ "rewards/format_reward_fn/std": 0.3263047467917204,
3760
+ "step": 2000
3761
  }
3762
  ],
3763
  "logging_steps": 16,
3764
  "max_steps": 10290,
3765
+ "num_input_tokens_seen": 161744217,
3766
  "num_train_epochs": 10,
3767
  "save_steps": 500,
3768
  "stateful_callbacks": {